4 |
4 |
use warnings;
|
5 |
5 |
use utf8;
|
6 |
6 |
|
7 |
|
use CAM::PDF;
|
|
7 |
use PDF::API2;
|
8 |
8 |
use Data::Dumper;
|
9 |
9 |
use List::Util qw(first);
|
10 |
10 |
use XML::LibXML;
|
... | ... | |
47 |
47 |
my $doc = shift;
|
48 |
48 |
my %res_fail;
|
49 |
49 |
|
50 |
|
$res_fail{'result'} = RES_ERR_NO_ATTACHMENT();
|
51 |
|
$res_fail{'message'} = "PDF does not have a Names dictionary.";
|
52 |
|
my $names_dict = $doc->getValue($doc->getRootDict->{Names}) or return \%res_fail;
|
53 |
|
|
54 |
|
$res_fail{'message'} = "PDF does not have a EmbeddedFiles tree.";
|
55 |
|
my $files_tree = $names_dict->{EmbeddedFiles} or return \%res_fail;
|
|
50 |
# Unfortunately PDF::API2 has no public-facing API to access the actual PDF name dictionaries,
|
|
51 |
# so we need to use the internal data, just like with CAM::PDF before
|
|
52 |
#
|
|
53 |
# PDF::API2 will internally read $doc->{pdf}{Root}{Names} for us, but after that every entry
|
|
54 |
# in the tree may still be an indirect object (Objind) that has to be realised first.
|
|
55 |
#
|
|
56 |
# The actual embedded files will be located at $doc->{pdf}{Root}{Names}{EmbeddedFiles}
|
|
57 |
#
|
|
58 |
|
|
59 |
my $node = $doc->{pdf};
|
|
60 |
for (qw(Root Names EmbeddedFiles)) {
|
|
61 |
$node = $node->{$_};
|
|
62 |
if (!ref $node) {
|
|
63 |
return {
|
|
64 |
result => RES_ERR_NO_ATTACHMENT(),
|
|
65 |
message => "unexpected unbless node while trying to access $_ node",
|
|
66 |
}
|
|
67 |
}
|
|
68 |
if ('PDF::API2::Basic::PDF::Objind' eq ref $node) {
|
|
69 |
$node->realise;
|
|
70 |
}
|
|
71 |
# after realising it should be a Dict
|
|
72 |
if ('PDF::API2::Basic::PDF::Dict' ne ref $node) {
|
|
73 |
return {
|
|
74 |
result => RES_ERR_NO_ATTACHMENT(),
|
|
75 |
message => "unexpected node type [@{[ref($node)]}] after realising $_ node",
|
|
76 |
}
|
|
77 |
}
|
|
78 |
}
|
56 |
79 |
|
57 |
|
my @agenda = $files_tree;
|
|
80 |
# $node is now the EmbeddedFiles dictionary; use it as the first item of the traversal agenda
|
|
81 |
my @agenda = $node;
|
58 |
82 |
|
59 |
83 |
my $parser; # SL::XMLInvoice object used as return value
|
60 |
84 |
my @res; # Temporary storage for error messages encountered during
|
... | ... | |
63 |
87 |
# Hardly ever more than single leaf, but...
|
64 |
88 |
|
65 |
89 |
while (@agenda) {
|
66 |
|
my $item = $doc->getValue(shift @agenda);
|
|
90 |
my $item = shift @agenda;
|
67 |
91 |
|
68 |
92 |
if ($item->{Kids}) {
|
69 |
|
my $kids = $doc->getValue($item->{Kids});
|
70 |
|
push @agenda, @$kids
|
|
93 |
my @kids = $item->{Kids}->realise->elements;
|
|
94 |
push @agenda, @kids;
|
71 |
95 |
|
72 |
96 |
} else {
|
73 |
|
my $nodes = $doc->getValue($item->{Names});
|
74 |
|
my @names = map { $doc->getValue($_)} @$nodes;
|
|
97 |
my @names = $item->{Names}->realise->elements;
|
75 |
98 |
|
|
99 |
TRY_NEXT:
|
76 |
100 |
while (@names) {
|
77 |
101 |
my ($k, $v) = splice @names, 0, 2;
|
78 |
|
my $ef_node = $v->{EF};
|
79 |
|
my $ef_dict = $doc->getValue($ef_node);
|
80 |
|
my $fnode = (values %$ef_dict)[0];
|
81 |
|
my $any_num = $fnode->{value};
|
82 |
|
my $obj_node = $doc->dereference($any_num);
|
83 |
|
my $content = $doc->decodeOne($obj_node->{value}, 0) // '';
|
|
102 |
my $fnode = $v->realise->{EF}->realise->{F}->realise;
|
|
103 |
|
|
104 |
$fnode->read_stream(1);
|
|
105 |
|
|
106 |
my $content = $fnode->{' stream'};
|
84 |
107 |
|
85 |
|
$parser = $parser = SL::XMLInvoice->new($content);
|
|
108 |
$parser = SL::XMLInvoice->new($content);
|
86 |
109 |
|
87 |
110 |
# Caveat: this will only ever catch the first attachment looking like
|
88 |
111 |
# an XML invoice.
|
... | ... | |
114 |
137 |
sub _get_xmp_metadata {
|
115 |
138 |
my ($doc) = @_;
|
116 |
139 |
|
117 |
|
my $node = $doc->getValue($doc->getRootDict->{Metadata});
|
118 |
|
if ($node && $node->{StreamData} && defined($node->{StreamData}->{value})) {
|
119 |
|
return $node->{StreamData}->{value};
|
120 |
|
}
|
121 |
|
return undef;
|
|
140 |
$doc->xmpMetadata;
|
122 |
141 |
}
|
123 |
142 |
|
124 |
143 |
sub extract_from_pdf {
|
125 |
144 |
my ($self, $file_name) = @_;
|
126 |
145 |
my @warnings;
|
127 |
146 |
|
128 |
|
my $pdf_doc = CAM::PDF->new($file_name);
|
|
147 |
my $pdf_doc = PDF::API2->openScalar($file_name);
|
129 |
148 |
|
130 |
149 |
if (!$pdf_doc) {
|
131 |
150 |
return {
|
... | ... | |
200 |
219 |
my %res;
|
201 |
220 |
|
202 |
221 |
my $invoice_xml = SL::XMLInvoice->new($data);
|
203 |
|
|
|
222 |
|
204 |
223 |
%res = (
|
205 |
224 |
result => $invoice_xml->{result},
|
206 |
225 |
message => $invoice_xml->{message},
|
ZUGFeRD: CAM::PDF durch PDF::API2 ersetzt
CAM::PDF wird seit 2014 nicht mehr gepflegt und kann PDF 1.7+ nicht
richtig öffnen. PDF::API2 ist aktiv maintained, hat aber nicht ganz so
schöne Zugriffsmethoden.
Die Version hier ist mit einer Rechnung von kivitendo getestet (PDF/A-1,
PDF 1.5) und einer externen (PDF/A-3, PDF 1.7).