kivitendo/SL/Helper/Csv.pm @ 202844a1
2f6ebd89 | Sven Schöling | package SL::Helper::Csv;
|
||
use strict;
|
||||
use warnings;
|
||||
5c4833d7 | Sven Schöling | use version 0.77;
|
||
2f6ebd89 | Sven Schöling | use Carp;
|
||
use IO::File;
|
||||
use Params::Validate qw(:all);
|
||||
202844a1 | Bernd Bleßmann | use List::MoreUtils qw(all);
|
||
dcd6ce29 | Moritz Bunkus | use Text::CSV_XS;
|
||
2f6ebd89 | Sven Schöling | use Rose::Object::MakeMethods::Generic scalar => [ qw(
|
||
c8473408 | Bernd Bleßmann | file encoding sep_char quote_char escape_char header profile
|
||
af205393 | Bernd Bleßmann | numberformat dateformat ignore_unknown_columns strict_profile is_multiplexed
|
||
_io _csv _objects _parsed _data _errors all_cvar_configs case_insensitive_header
|
||||
2f6ebd89 | Sven Schöling | ) ];
|
||
17d58914 | Sven Schöling | use SL::Helper::Csv::Dispatcher;
|
||
c46898c7 | Sven Schöling | use SL::Helper::Csv::Error;
|
||
2f6ebd89 | Sven Schöling | |||
# public interface
|
||||
sub new {
|
||||
my $class = shift;
|
||||
my %params = validate(@_, {
|
||||
1dcc096b | Sven Schöling | sep_char => { default => ';' },
|
||
quote_char => { default => '"' },
|
||||
escape_char => { default => '"' },
|
||||
header => { type => ARRAYREF, optional => 1 },
|
||||
af205393 | Bernd Bleßmann | profile => { type => ARRAYREF, optional => 1 },
|
||
1dcc096b | Sven Schöling | file => 1,
|
||
encoding => 0,
|
||||
numberformat => 0,
|
||||
dateformat => 0,
|
||||
ignore_unknown_columns => 0,
|
||||
09294068 | Sven Schöling | strict_profile => 0,
|
||
a54fc392 | Sven Schöling | case_insensitive_header => 0,
|
||
2f6ebd89 | Sven Schöling | });
|
||
my $self = bless {}, $class;
|
||||
$self->$_($params{$_}) for keys %params;
|
||||
$self->_io(IO::File->new);
|
||||
dcd6ce29 | Moritz Bunkus | $self->_csv(Text::CSV_XS->new({
|
||
2f6ebd89 | Sven Schöling | binary => 1,
|
||
8fba112b | Sven Schöling | sep_char => $self->sep_char,
|
||
quote_char => $self->quote_char,
|
||||
escape_char => $self->escape_char,
|
||||
2f6ebd89 | Sven Schöling | |||
}));
|
||||
8fba112b | Sven Schöling | $self->_errors([]);
|
||
2f6ebd89 | Sven Schöling | |||
return $self;
|
||||
}
|
||||
sub parse {
|
||||
my ($self, %params) = @_;
|
||||
$self->_open_file;
|
||||
af205393 | Bernd Bleßmann | return if ! $self->_check_multiplexed;
|
||
1dcc096b | Sven Schöling | return if ! $self->_check_header;
|
||
17d58914 | Sven Schöling | return if ! $self->dispatcher->parse_profile;
|
||
1dcc096b | Sven Schöling | return if ! $self->_parse_data;
|
||
2f6ebd89 | Sven Schöling | |||
$self->_parsed(1);
|
||||
return $self;
|
||||
}
|
||||
sub get_data {
|
||||
$_[0]->_data;
|
||||
}
|
||||
sub get_objects {
|
||||
my ($self, %params) = @_;
|
||||
croak 'must parse first' unless $self->_parsed;
|
||||
$self->_make_objects unless $self->_objects;
|
||||
return wantarray ? @{ $self->_objects } : $self->_objects;
|
||||
}
|
||||
8fba112b | Sven Schöling | sub errors {
|
||
@{ $_[0]->_errors }
|
||||
}
|
||||
f9f7b56e | Sven Schöling | sub check_header {
|
||
$_[0]->_check_header;
|
||||
}
|
||||
2f6ebd89 | Sven Schöling | # private stuff
|
||
sub _open_file {
|
||||
my ($self, %params) = @_;
|
||||
$self->encoding($self->_guess_encoding) if !$self->encoding;
|
||||
$self->_io->open($self->file, '<' . $self->_encode_layer)
|
||||
or die "could not open file " . $self->file;
|
||||
return $self->_io;
|
||||
}
|
||||
af205393 | Bernd Bleßmann | # check, if data is multiplexed and if all nessesary infos are given
|
||
sub _check_multiplexed {
|
||||
my ($self, %params) = @_;
|
||||
$self->is_multiplexed(0);
|
||||
# If more than one profile is given, it is multiplexed.
|
||||
if ($self->profile) {
|
||||
my @profile = @{ $self->profile };
|
||||
if (scalar @profile > 1) {
|
||||
# Each profile needs a class and a row_ident
|
||||
202844a1 | Bernd Bleßmann | my $info_ok = all { defined $_->{class} && defined $_->{row_ident} } @profile;
|
||
af205393 | Bernd Bleßmann | |||
# If header is given, there need to be a header for each profile
|
||||
202844a1 | Bernd Bleßmann | # and no empty headers.
|
||
af205393 | Bernd Bleßmann | if ($info_ok && $self->header) {
|
||
my @header = @{ $self->header };
|
||||
202844a1 | Bernd Bleßmann | $info_ok = $info_ok && scalar @profile == scalar @header;
|
||
$info_ok = $info_ok && all { scalar @$_ > 0} @header;
|
||||
af205393 | Bernd Bleßmann | }
|
||
$self->is_multiplexed($info_ok);
|
||||
return $info_ok;
|
||||
}
|
||||
}
|
||||
# ok, if not multiplexed
|
||||
return 1;
|
||||
}
|
||||
2f6ebd89 | Sven Schöling | sub _check_header {
|
||
my ($self, %params) = @_;
|
||||
af205393 | Bernd Bleßmann | my $header;
|
||
2f6ebd89 | Sven Schöling | |||
af205393 | Bernd Bleßmann | $header = $self->header;
|
||
if (!$header) {
|
||||
my $n_header = ($self->is_multiplexed)? scalar @{ $self->profile } : 1;
|
||||
foreach my $p_num (0..$n_header - 1) {
|
||||
my $h = $self->_csv->getline($self->_io);
|
||||
2f6ebd89 | Sven Schöling | |||
af205393 | Bernd Bleßmann | $self->_push_error([
|
||
$self->_csv->error_input,
|
||||
$self->_csv->error_diag,
|
||||
0,
|
||||
]) unless $h;
|
||||
push @{ $header }, $h;
|
||||
}
|
||||
7d9888e3 | Sven Schöling | }
|
||
62add698 | Sven Schöling | |||
07a38b9f | Sven Schöling | # Special case: utf8 BOM.
|
||
# certain software (namely MS Office and notepad.exe insist on prefixing
|
||||
# data with a discouraged but valid byte order mark
|
||||
# if not removed, the first header field will not be recognized
|
||||
af205393 | Bernd Bleßmann | if ($header) {
|
||
202844a1 | Bernd Bleßmann | my $h = $header->[0];
|
||
if ($h && $h->[0] && $self->encoding =~ /utf-?8/i) {
|
||||
$h->[0] =~ s/^\x{FEFF}//;
|
||||
af205393 | Bernd Bleßmann | }
|
||
07a38b9f | Sven Schöling | }
|
||
af205393 | Bernd Bleßmann | # check, if all header fields are parsed well
|
||
202844a1 | Bernd Bleßmann | return unless $header && all { $_ } @$header;
|
||
a54fc392 | Sven Schöling | |||
# Special case: human stupidity
|
||||
# people insist that case sensitivity doesn't exist and try to enter all
|
||||
# sorts of stuff. at this point we've got a profile (with keys that represent
|
||||
# valid methods), and a header full of strings. if two of them match, the user
|
||||
# mopst likely meant that field, so rewrite the header
|
||||
if ($self->case_insensitive_header) {
|
||||
die 'case_insensitive_header is only possible with profile' unless $self->profile;
|
||||
my @names = (
|
||||
keys %{ $self->profile || {} },
|
||||
);
|
||||
for my $name (@names) {
|
||||
for my $i (0..$#$header) {
|
||||
$header->[$i] = $name if lc $header->[$i] eq lc $name;
|
||||
}
|
||||
}
|
||||
}
|
||||
return $self->header($header);
|
||||
2f6ebd89 | Sven Schöling | }
|
||
sub _parse_data {
|
||||
my ($self, %params) = @_;
|
||||
8fba112b | Sven Schöling | my (@data, @errors);
|
||
2f6ebd89 | Sven Schöling | |||
8fba112b | Sven Schöling | while (1) {
|
||
my $row = $self->_csv->getline($self->_io);
|
||||
if ($row) {
|
||||
af205393 | Bernd Bleßmann | my $header = $self->_header_by_row($row);
|
||
8fba112b | Sven Schöling | my %hr;
|
||
af205393 | Bernd Bleßmann | @hr{@{ $header }} = @$row;
|
||
8fba112b | Sven Schöling | push @data, \%hr;
|
||
} else {
|
||||
61a56da0 | Sven Schöling | last if $self->_csv->eof;
|
||
5c4833d7 | Sven Schöling | # Text::CSV_XS 0.89 added record number to error_diag
|
||
if (qv(Text::CSV_XS->VERSION) >= qv('0.89')) {
|
||||
push @errors, [
|
||||
$self->_csv->error_input,
|
||||
$self->_csv->error_diag,
|
||||
];
|
||||
} else {
|
||||
push @errors, [
|
||||
$self->_csv->error_input,
|
||||
$self->_csv->error_diag,
|
||||
$self->_io->input_line_number,
|
||||
];
|
||||
}
|
||||
8fba112b | Sven Schöling | }
|
||
61a56da0 | Sven Schöling | last if $self->_csv->eof;
|
||
8fba112b | Sven Schöling | }
|
||
2f6ebd89 | Sven Schöling | |||
$self->_data(\@data);
|
||||
f9f7b56e | Sven Schöling | $self->_push_error(@errors);
|
||
8fba112b | Sven Schöling | |||
f9f7b56e | Sven Schöling | return ! @errors;
|
||
2f6ebd89 | Sven Schöling | }
|
||
af205393 | Bernd Bleßmann | sub _header_by_row {
|
||
my ($self, $row) = @_;
|
||||
my @header = @{ $self->header };
|
||||
if ($self->is_multiplexed) {
|
||||
my $i = 0;
|
||||
foreach my $profile (@{ $self->profile }) {
|
||||
202844a1 | Bernd Bleßmann | if ($row->[0] eq $profile->{row_ident}) {
|
||
af205393 | Bernd Bleßmann | return $header[$i];
|
||
}
|
||||
$i++;
|
||||
}
|
||||
} else {
|
||||
return $header[0];
|
||||
}
|
||||
}
|
||||
2f6ebd89 | Sven Schöling | sub _encode_layer {
|
||
':encoding(' . $_[0]->encoding . ')';
|
||||
}
|
||||
sub _make_objects {
|
||||
my ($self, %params) = @_;
|
||||
my @objs;
|
||||
local $::myconfig{numberformat} = $self->numberformat if $self->numberformat;
|
||||
local $::myconfig{dateformat} = $self->dateformat if $self->dateformat;
|
||||
for my $line (@{ $self->_data }) {
|
||||
c8473408 | Bernd Bleßmann | my $tmp_obj = $self->dispatcher->dispatch($line);
|
||
17d58914 | Sven Schöling | push @objs, $tmp_obj;
|
||
2f6ebd89 | Sven Schöling | }
|
||
$self->_objects(\@objs);
|
||||
}
|
||||
17d58914 | Sven Schöling | sub dispatcher {
|
||
my ($self, %params) = @_;
|
||||
$self->{_dispatcher} ||= $self->_make_dispatcher;
|
||||
}
|
||||
sub _make_dispatcher {
|
||||
my ($self, %params) = @_;
|
||||
die 'need a header to make a dispatcher' unless $self->header;
|
||||
return SL::Helper::Csv::Dispatcher->new($self);
|
||||
f9f7b56e | Sven Schöling | }
|
||
2f6ebd89 | Sven Schöling | sub _guess_encoding {
|
||
# won't fix
|
||||
'utf-8';
|
||||
}
|
||||
f9f7b56e | Sven Schöling | sub _push_error {
|
||
my ($self, @errors) = @_;
|
||||
c46898c7 | Sven Schöling | my @new_errors = ($self->errors, map { SL::Helper::Csv::Error->new(@$_) } @errors);
|
||
f9f7b56e | Sven Schöling | $self->_errors(\@new_errors);
|
||
}
|
||||
2f6ebd89 | Sven Schöling | |||
1;
|
||||
__END__
|
||||
8fba112b | Sven Schöling | =encoding utf-8
|
||
2f6ebd89 | Sven Schöling | =head1 NAME
|
||
SL::Helper::Csv - take care of csv file uploads
|
||||
=head1 SYNOPSIS
|
||||
use SL::Helper::Csv;
|
||||
my $csv = SL::Helper::Csv->new(
|
||||
file => \$::form->{upload_file},
|
||||
encoding => 'utf-8', # undef means utf8
|
||||
sep_char => ',', # default ';'
|
||||
efb48636 | Moritz Bunkus | quote_char => '\'', # default '"'
|
||
escape_char => '"', # default '"'
|
||||
af205393 | Bernd Bleßmann | header => [ [qw(id text sellprice word)] ], # see later
|
||
profile => [ { profile => { sellprice => 'sellprice_as_number'},
|
||||
class => 'SL::DB::Part' } ],
|
||||
efb48636 | Moritz Bunkus | );
|
||
2f6ebd89 | Sven Schöling | |||
my $status = $csv->parse;
|
||||
8fba112b | Sven Schöling | my $hrefs = $csv->get_data;
|
||
bfb0d001 | Sven Schöling | my @objects = $csv->get_objects;
|
||
my @errors = $csv->errors;
|
||||
2f6ebd89 | Sven Schöling | |||
=head1 DESCRIPTION
|
||||
See Synopsis.
|
||||
Text::CSV offeres already good functions to get lines out of a csv file, but in
|
||||
most cases you will want those line to be parsed into hashes or even objects,
|
||||
so this model just skips ahead and gives you objects.
|
||||
bfb0d001 | Sven Schöling | Its basic assumptions are:
|
||
=over 4
|
||||
=item You do know what you expect to be in that csv file.
|
||||
This means first and foremost you have knowledge about encoding, number and
|
||||
date format, csv parameters such as quoting and separation characters. You also
|
||||
know what content will be in that csv and what L<Rose::DB> is responsible for
|
||||
it. You provide valid header columns and their mapping to the objects.
|
||||
=item You do NOT know if the csv provider yields to your expectations.
|
||||
Stuff that does not work with what you expect should not crash anything, but
|
||||
46e31af5 | Bernd Bleßmann | give you a hint what went wrong. As a result, if you remember to check for
|
||
bfb0d001 | Sven Schöling | errors after each step, you should be fine.
|
||
=item Data does not make sense. It's just data.
|
||||
Almost all data imports have some type of constraints. Some data needs to be
|
||||
unique, other data needs to be connected to existing data sets. This will not
|
||||
happen here. You will receive a plain mapping of the data into the class tree,
|
||||
nothing more.
|
||||
af205393 | Bernd Bleßmann | =item Multiplex data
|
||
This module can handle multiplexed data of different class types. In that case
|
||||
multiple profiles with classes and row identifiers must be given. Multiple
|
||||
headers may also be given or read from csv data. Data must contain the row
|
||||
identifier in the first column and it's field name must be 'datatype'.
|
||||
bfb0d001 | Sven Schöling | =back
|
||
2f6ebd89 | Sven Schöling | |||
=head1 METHODS
|
||||
=over 4
|
||||
=item C<new> PARAMS
|
||||
Standard constructor. You can use this to set most of the data.
|
||||
=item C<parse>
|
||||
Do the actual work. Will return true ($self actually) if success, undef if not.
|
||||
=item C<get_objects>
|
||||
Parse the data into objects and return those.
|
||||
8fba112b | Sven Schöling | This method will return list or arrayref depending on context.
|
||
2f6ebd89 | Sven Schöling | =item C<get_data>
|
||
Returns an arrayref of the raw lines as hashrefs.
|
||||
8fba112b | Sven Schöling | =item C<errors>
|
||
bfb0d001 | Sven Schöling | Return all errors that came up during parsing. See error handling for detailed
|
||
8fba112b | Sven Schöling | information.
|
||
=back
|
||||
=head1 PARAMS
|
||||
=over 4
|
||||
2f6ebd89 | Sven Schöling | =item C<file>
|
||
The file which contents are to be read. Can be a name of a physical file or a
|
||||
scalar ref for memory data.
|
||||
=item C<encoding>
|
||||
f9f7b56e | Sven Schöling | Encoding of the CSV file. Note that this module does not do any encoding
|
||
efb48636 | Moritz Bunkus | guessing. Know what your data is. Defaults to utf-8.
|
||
2f6ebd89 | Sven Schöling | |||
=item C<sep_char>
|
||||
=item C<quote_char>
|
||||
8fba112b | Sven Schöling | =item C<escape_char>
|
||
2f6ebd89 | Sven Schöling | Same as in L<Text::CSV>
|
||
af205393 | Bernd Bleßmann | =item C<header> \@HEADERS
|
||
If given, it contains an ARRAYREF for each different class type (i.e. one
|
||||
ARRAYREF if the data is only of one class type). These ARRAYREFS are the header
|
||||
fields which are an array of columns. In this case the first lines are not used
|
||||
as a header. Empty header fields will be ignored in objects.
|
||||
If not given, headers are taken from the first n lines of data, where n is the
|
||||
number of different class types.
|
||||
2f6ebd89 | Sven Schöling | |||
af205393 | Bernd Bleßmann | Examples:
|
||
2f6ebd89 | Sven Schöling | |||
af205393 | Bernd Bleßmann | classic data of one type:
|
||
[ [ 'name', 'street', 'zipcode', 'city' ] ]
|
||||
2f6ebd89 | Sven Schöling | |||
af205393 | Bernd Bleßmann | multiplexed data with two different types
|
||
[ [ 'ordernumber', 'customer', 'transdate' ], [ 'partnumber', 'qty', 'sellprice' ] ]
|
||||
=item C<profile> [{profile => \%ACCESSORS, class => class, row_ident => ri},]
|
||||
This is an ARRAYREF to HASHREFs which may contain the keys C<profile>, C<class>
|
||||
and C<row_ident>.
|
||||
2f6ebd89 | Sven Schöling | |||
c8473408 | Bernd Bleßmann | The C<profile> is a HASHREF which may be used to map header fields to custom
|
||
accessors. Example:
|
||||
af205393 | Bernd Bleßmann | [ {profile => { listprice => listprice_as_number }} ]
|
||
2f6ebd89 | Sven Schöling | |||
In this case C<listprice_as_number> will be used to read in values from the
|
||||
C<listprice> column.
|
||||
8a635325 | Sven Schöling | In case of a One-To-One relationsship these can also be set over
|
||
relationsships by sparating the steps with a dot (C<.>). This will work:
|
||||
af205393 | Bernd Bleßmann | [ {profile => { customer => 'customer.name' }} ]
|
||
8a635325 | Sven Schöling | |||
And will result in something like this:
|
||||
$obj->customer($obj->meta->relationship('customer')->class->new);
|
||||
$obj->customer->name($csv_line->{customer})
|
||||
But beware, this will not try to look up anything in the database. You will
|
||||
simply receive objects that represent what the profile defined. If some of
|
||||
these information are unique, and should be connected to preexisting data, you
|
||||
will have to do that for yourself. Since you provided the profile, it is
|
||||
assumed you know what to do in this case.
|
||||
c8473408 | Bernd Bleßmann | If C<class> is present, the line will be handed to the new sub of this class,
|
||
2f6ebd89 | Sven Schöling | and the return value used instead of the line itself.
|
||
af205393 | Bernd Bleßmann | C<row_ident> is a string to recognize the right profile and class for each data
|
||
line in multiplexed data.
|
||||
In case of multiplexed data, C<class> and C<row_ident> must be given.
|
||||
Example:
|
||||
[ {
|
||||
class => 'SL::DB::Order',
|
||||
row_ident => 'O'
|
||||
},
|
||||
{
|
||||
class => 'SL::DB::OrderItem',
|
||||
row_ident => 'I',
|
||||
profile => {sellprice => sellprice_as_number}
|
||||
} ]
|
||||
1dcc096b | Sven Schöling | =item C<ignore_unknown_columns>
|
||
If set, the import will ignore unkown header columns. Useful for lazy imports,
|
||||
but deactivated by default.
|
||||
a54fc392 | Sven Schöling | =item C<case_insensitive_header>
|
||
If set, header columns will be matched against profile entries case
|
||||
insensitive, and on match the profile name will be taken.
|
||||
Only works if a profile is given, will die otherwise.
|
||||
If both C<case_insensitive_header> and C<strict_profile> is set, matched header
|
||||
columns will be accepted.
|
||||
09294068 | Sven Schöling | =item C<strict_profile>
|
||
If set, all columns to be parsed must be specified in C<profile>. Every header
|
||||
field not listed there will be treated like an unknown column.
|
||||
a54fc392 | Sven Schöling | If both C<case_insensitive_header> and C<strict_profile> is set, matched header
|
||
columns will be accepted.
|
||||
2f6ebd89 | Sven Schöling | =back
|
||
8fba112b | Sven Schöling | =head1 ERROR HANDLING
|
||
After parsing a file all errors will be accumulated into C<errors>.
|
||||
c46898c7 | Sven Schöling | Each entry is an object with the following attributes:
|
||
8fba112b | Sven Schöling | |||
c46898c7 | Sven Schöling | raw_input: offending raw input,
|
||
code: Text::CSV error code if Text:CSV signalled an error, 0 else,
|
||||
diag: error diagnostics,
|
||||
line: position in line,
|
||||
col: estimated line in file,
|
||||
8fba112b | Sven Schöling | |||
Note that the last entry can be off, but will give an estimate.
|
||||
=head1 CAVEATS
|
||||
=over 4
|
||||
=item *
|
||||
sep_char, quote_char, and escape_char are passed to Text::CSV on creation.
|
||||
Changing them later has no effect currently.
|
||||
=item *
|
||||
Encoding errors are not dealt with properly.
|
||||
=back
|
||||
2f6ebd89 | Sven Schöling | |||
f9f7b56e | Sven Schöling | =head1 TODO
|
||
Dispatch to child objects, like this:
|
||||
$csv = SL::Helper::Csv->new(
|
||||
c8473408 | Bernd Bleßmann | file => ...
|
||
af205393 | Bernd Bleßmann | profile => [ {
|
||
c8473408 | Bernd Bleßmann | profile => [
|
||
makemodel => {
|
||||
make_1 => make,
|
||||
model_1 => model,
|
||||
},
|
||||
makemodel => {
|
||||
make_2 => make,
|
||||
model_2 => model,
|
||||
},
|
||||
],
|
||||
class => SL::DB::Part,
|
||||
af205393 | Bernd Bleßmann | } ]
|
||
f9f7b56e | Sven Schöling | );
|
||
2f6ebd89 | Sven Schöling | =head1 AUTHOR
|
||
8fba112b | Sven Schöling | Sven Schöling E<lt>s.schoeling@linet-services.deE<gt>
|
||
2f6ebd89 | Sven Schöling | =cut
|