Projekt

Allgemein

Profil

« Zurück | Weiter » 

Revision a827a37d

Von Moritz Bunkus vor fast 5 Jahren hinzugefügt

  • ID a827a37dbc9fda429719d199613a930200820ee6
  • Vorgänger dc78c225
  • Nachfolger 80cda5e6

SL::ZUGFeRD: Funktionen zum Extrahieren von ZUGFeRD-XML-Daten aus PDFs

Unterschiede anzeigen:

SL/InstallationCheck.pm
19 19
  { name => "parent",                              url => "http://search.cpan.org/~corion/",    debian => 'libparent-perl' },
20 20
  { name => "Algorithm::CheckDigits",              url => "http://search.cpan.org/~mamawe/",    debian => 'libalgorithm-checkdigits-perl' },
21 21
  { name => "Archive::Zip",    version => '1.16',  url => "http://search.cpan.org/~phred/",     debian => 'libarchive-zip-perl' },
22
  { name => "CAM::PDF",                            url => "https://metacpan.org/pod/CAM::PDF",  debian => 'libcap-pdf-perl' },
22 23
  { name => "CGI",             version => '3.43',  url => "http://search.cpan.org/~leejo/",     debian => 'libcgi-pm-perl' }, # 4.09 is not core anymore (perl 5.20)
23 24
  { name => "Clone",                               url => "http://search.cpan.org/~rdf/",       debian => 'libclone-perl' },
24 25
  { name => "Config::Std",                         url => "http://search.cpan.org/~dconway/",   debian => 'libconfig-std-perl' },
......
62 63
  { name => "Text::Iconv",     version => '1.2',   url => "http://search.cpan.org/~mpiotr/",    debian => 'libtext-iconv-perl' },
63 64
  { name => "Text::Unidecode",                     url => "http://search.cpan.org/~sburke/",    debian => 'libtext-unidecode-perl' },
64 65
  { name => "URI",             version => '1.35',  url => "http://search.cpan.org/~gaas/",      debian => 'liburi-perl' },
66
  { name => "XML::LibXML",                         url => "https://metacpan.org/pod/XML::LibXML", debian => 'libxml-libxml-perl' },
65 67
  { name => "XML::Writer",     version => '0.602', url => "http://search.cpan.org/~josephw/",   debian => 'libxml-writer-perl' },
66 68
  { name => "YAML",            version => '0.62',  url => "http://search.cpan.org/~ingy/",      debian => 'libyaml-perl' },
67 69
);
SL/ZUGFeRD.pm
1
package SL::ZUGFeRD;
2

  
3
use strict;
4
use warnings;
5
use utf8;
6

  
7
use CAM::PDF;
8
use Data::Dumper;
9
use List::Util qw(first);
10
use XML::LibXML;
11

  
12
use constant RES_OK                              => 0;
13
use constant RES_ERR_FILE_OPEN                   => 1;
14
use constant RES_ERR_NO_XMP_METADATA             => 2;
15
use constant RES_ERR_NO_XML_INVOICE              => 3;
16
use constant RES_ERR_NOT_ZUGFERD                 => 4;
17
use constant RES_ERR_UNSUPPORTED_ZUGFERD_VERSION => 5;
18

  
19
sub _extract_zugferd_invoice_xml {
20
  my $doc        = shift;
21
  my $names_dict = $doc->getValue($doc->getRootDict->{Names}) or return {};
22
  my $files_tree = $names_dict->{EmbeddedFiles}               or return {};
23
  my @agenda     = $files_tree;
24
  my $ret        = {};
25

  
26
  # Hardly ever more than single leaf, but...
27

  
28
  while (@agenda) {
29
    my $item = $doc->getValue(shift @agenda);
30

  
31
    if ($item->{Kids}) {
32
      my $kids = $doc->getValue($item->{Kids});
33
      push @agenda, @$kids
34

  
35
    } else {
36
      my $nodes = $doc->getValue($item->{Names});
37
      my @names = map { $doc->getValue($_)} @$nodes;
38

  
39
      while (@names) {
40
        my ($k, $v)  = splice @names, 0, 2;
41
        my $ef_node  = $v->{EF};
42
        my $ef_dict  = $doc->getValue($ef_node);
43
        my $fnode    = (values %$ef_dict)[0];
44
        my $any_num  = $fnode->{value};
45
        my $obj_node = $doc->dereference($any_num);
46
        my $content  = $doc->decodeOne($obj_node->{value}, 0) // '';
47

  
48
        print "1\n";
49

  
50
        next if $content !~ m{<rsm:CrossIndustryInvoice};
51
        print "2\n";
52

  
53
        my $dom = eval { XML::LibXML->load_xml(string => $content) };
54
        return $content if $dom && ($dom->documentElement->nodeName eq 'rsm:CrossIndustryInvoice');
55
      }
56
    }
57
  }
58

  
59
  return undef;
60
}
61

  
62
sub _get_xmp_metadata {
63
  my ($doc) = @_;
64

  
65
  my $node = $doc->getValue($doc->getRootDict->{Metadata});
66
  if ($node && $node->{StreamData} && defined($node->{StreamData}->{value})) {
67
    return $node->{StreamData}->{value};
68
  }
69

  
70
  return undef;
71
}
72

  
73
sub extract_from_pdf {
74
  my ($self, $file_name) = @_;
75

  
76
  my $pdf_doc = CAM::PDF->new($file_name);
77

  
78
  if (!$pdf_doc) {
79
    return {
80
      result  => RES_ERR_FILE_OPEN(),
81
      message => $::locale->text('The file \'#1\' could not be opened for reading.', $file_name),
82
    };
83
  }
84

  
85
  my $xmp = _get_xmp_metadata($pdf_doc);
86
  if (!defined $xmp) {
87
    return {
88
      result  => RES_ERR_NO_XMP_METADATA(),
89
      message => $::locale->text('The file \'#1\' does not contain the required XMP meta data.', $file_name),
90
    };
91
  }
92

  
93
  my $bad = {
94
    result  => RES_ERR_NO_XMP_METADATA(),
95
    message => $::locale->text('Parsing the XMP metadata failed.'),
96
  };
97

  
98
  my $dom = eval { XML::LibXML->load_xml(string => $xmp) };
99

  
100
  return $bad if !$dom;
101

  
102
  my $xpc = XML::LibXML::XPathContext->new($dom);
103
  $xpc->registerNs('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');
104

  
105
  my $zugferd_version;
106

  
107
  foreach my $node ($xpc->findnodes('/x:xmpmeta/rdf:RDF/rdf:Description')) {
108
    my $ns = first { ref($_) eq 'XML::LibXML::Namespace' } $node->attributes;
109
    next unless $ns;
110

  
111
    if ($ns->getData =~ m{urn:zugferd:pdfa:CrossIndustryDocument:invoice:2p0}) {
112
      $zugferd_version = '2p0';
113
      last;
114
    }
115

  
116
    if ($ns->getData =~ m{zugferd}i) {
117
      $zugferd_version = 'unsupported';
118
      last;
119
    }
120
  }
121

  
122
  if (!$zugferd_version) {
123
    return {
124
      result  => RES_ERR_NOT_ZUGFERD(),
125
      message => $::locale->text('The XMP metadata does not declare the ZUGFeRD data.'),
126
    };
127
  }
128

  
129
  if ($zugferd_version !~ m{^2p}) {
130
    return {
131
      result  => RES_ERR_UNSUPPORTED_ZUGFERD_VERSION(),
132
      message => $::locale->text('The ZUGFeRD version used is not supported.'),
133
    };
134
  }
135

  
136
  my $invoice_xml = _extract_zugferd_invoice_xml($pdf_doc);
137

  
138
  if (!defined $invoice_xml) {
139
    return {
140
      result  => RES_ERR_NO_XML_INVOICE(),
141
      message => $::locale->text('The ZUGFeRD XML invoice was not found.'),
142
    };
143
  }
144

  
145
  return {
146
    result       => RES_OK(),
147
    metadata_xmp => $xmp,
148
    invoice_xml  => $invoice_xml,
149
  };
150
}
151

  
152
1;
153

  
154
__END__
155

  
156
=pod
157

  
158
=encoding utf8
159

  
160
=head1 NAME
161

  
162
SL::ZUGFeRD - Helper functions for dealing with PDFs containing ZUGFeRD invoice data
163

  
164
=head1 SYNOPSIS
165

  
166
    my $pdf  = '/path/to/my.pdf';
167
    my $info = SL::ZUGFeRD->extract_from_pdf($pdf);
168

  
169
    if ($info->{result} != SL::ZUGFeRD::RES_OK()) {
170
      # An error occurred; log message from parser:
171
      $::lxdebug->message(LXDebug::DEBUG1(), "Could not extract ZUGFeRD data from $pdf: " . $info->{message});
172
      return;
173
    }
174

  
175
    # Parse & handle invoice XML:
176
    my $dom = XML::LibXML->load_xml(string => $info->{invoice_xml});
177

  
178

  
179
=head1 FUNCTIONS
180

  
181
=over 4
182

  
183
=item C<extract_from_pdf> C<$file_name>
184

  
185
Opens an existing PDF in the file system and tries to extract ZUGFeRD
186
invoice data from it. First it'll parse the XMP metadata and look for
187
the ZUGFeRD declaration inside. If the declaration isn't found or the
188
declared version isn't 2p0, an error is returned.
189

  
190
Otherwise it'll continue to look through all embedded files in the
191
PDF. The first embedded XML file with a root node of
192
C<rsm:CrossCountryInvoice> will be returnd.
193

  
194
Always returns a hash ref containing the key C<result>, a number that
195
can be one of the following constants:
196

  
197
=over 4
198

  
199
=item C<RES_OK> (0): parsing was OK; the returned hash will also
200
contain the keys C<xmp_metadata> and C<invoice_xml> which will contain
201
the XML text of the metadata & the ZUGFeRD invoice.
202

  
203
=item C<RES_ERR_…> (all values E<gt> 0): parsing failed; the hash will
204
also contain a key C<message> which contains a human-readable
205
information about what exactly failed.
206

  
207
=back
208

  
209
=back
210

  
211
=head1 BUGS
212

  
213
Nothing here yet.
214

  
215
=head1 AUTHOR
216

  
217
Moritz Bunkus E<lt>m.bunkus@linet-services.deE<gt>
218

  
219
=cut

Auch abrufbar als: Unified diff