Revision 0532e721
Von Bernd Bleßmann vor mehr als 2 Jahren hinzugefügt
SL/BackgroundJob/CreateOrUpdateFileFullTexts.pm | ||
---|---|---|
1 |
package SL::BackgroundJob::CreateOrUpdateFileFullTexts; |
|
2 |
|
|
3 |
use strict; |
|
4 |
|
|
5 |
use parent qw(SL::BackgroundJob::Base); |
|
6 |
|
|
7 |
use Encode qw(decode); |
|
8 |
use English qw( -no_match_vars ); |
|
9 |
use File::Slurp qw(read_file); |
|
10 |
use List::MoreUtils qw(uniq); |
|
11 |
use IPC::Run qw(); |
|
12 |
use Unicode::Normalize qw(); |
|
13 |
|
|
14 |
use SL::DB::File; |
|
15 |
use SL::DB::FileFullText; |
|
16 |
use SL::HTML::Util; |
|
17 |
|
|
18 |
my %extractor_by_mime_type = ( |
|
19 |
'application/pdf' => \&_pdf_to_strings, |
|
20 |
'text/html' => \&_html_to_strings, |
|
21 |
'text/plain' => \&_text_to_strings, |
|
22 |
); |
|
23 |
|
|
24 |
sub create_job { |
|
25 |
$_[0]->create_standard_job('7 * * * *'); # seven minutes after every hour |
|
26 |
} |
|
27 |
|
|
28 |
# |
|
29 |
# If job does not throw an error, |
|
30 |
# success in background_job_histories is 'success'. |
|
31 |
# It is 'failure' otherwise. |
|
32 |
# |
|
33 |
# return value goes to result in background_job_histories |
|
34 |
# |
|
35 |
sub run { |
|
36 |
my $self = shift; |
|
37 |
my $db_obj = shift; |
|
38 |
|
|
39 |
my $all_dbfiles = SL::DB::Manager::File->get_all; |
|
40 |
|
|
41 |
foreach my $dbfile (@$all_dbfiles) { |
|
42 |
next if $dbfile->full_text && (($dbfile->mtime || $dbfile->itime) <= ($dbfile->full_text->mtime || $dbfile->full_text->itime)); |
|
43 |
next if !defined $extractor_by_mime_type{$dbfile->mime_type}; |
|
44 |
|
|
45 |
my $file_name; |
|
46 |
if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) { |
|
47 |
$::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR); |
|
48 |
next; |
|
49 |
} |
|
50 |
|
|
51 |
my $text = $extractor_by_mime_type{$dbfile->mime_type}->($file_name); |
|
52 |
|
|
53 |
if ($dbfile->full_text) { |
|
54 |
$dbfile->full_text->update_attributes(full_text => $text); |
|
55 |
} else { |
|
56 |
SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save; |
|
57 |
} |
|
58 |
} |
|
59 |
|
|
60 |
return 'ok'; |
|
61 |
} |
|
62 |
|
|
63 |
sub _pdf_to_strings { |
|
64 |
my ($file_name) = @_; |
|
65 |
|
|
66 |
my @cmd = qw(pdftotext -enc UTF-8); |
|
67 |
push @cmd, $file_name; |
|
68 |
push @cmd, '-'; |
|
69 |
|
|
70 |
my ($txt, $err); |
|
71 |
|
|
72 |
IPC::Run::run \@cmd, \undef, \$txt, \$err; |
|
73 |
|
|
74 |
if ($CHILD_ERROR) { |
|
75 |
$::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err); |
|
76 |
return ''; |
|
77 |
} |
|
78 |
|
|
79 |
$txt = Encode::decode('utf-8-strict', $txt); |
|
80 |
$txt =~ s{\r}{ }g; |
|
81 |
$txt =~ s{\p{WSpace}+}{ }g; |
|
82 |
$txt = Unicode::Normalize::normalize('C', $txt); |
|
83 |
$txt = join ' ' , uniq(split(' ', $txt)); |
|
84 |
|
|
85 |
return $txt; |
|
86 |
} |
|
87 |
|
|
88 |
sub _html_to_strings { |
|
89 |
my ($file_name) = @_; |
|
90 |
|
|
91 |
my $txt = read_file($file_name); |
|
92 |
|
|
93 |
$txt = Encode::decode('utf-8-strict', $txt); |
|
94 |
$txt = SL::HTML::Util::strip($txt); |
|
95 |
$txt =~ s{\r}{ }g; |
|
96 |
$txt =~ s{\p{WSpace}+}{ }g; |
|
97 |
$txt = Unicode::Normalize::normalize('C', $txt); |
|
98 |
$txt = join ' ' , uniq(split(' ', $txt)); |
|
99 |
|
|
100 |
return $txt; |
|
101 |
} |
|
102 |
|
|
103 |
sub _text_to_strings { |
|
104 |
my ($file_name) = @_; |
|
105 |
|
|
106 |
my $txt = read_file($file_name); |
|
107 |
|
|
108 |
$txt = Encode::decode('utf-8-strict', $txt); |
|
109 |
$txt =~ s{\r}{ }g; |
|
110 |
$txt =~ s{\p{WSpace}+}{ }g; |
|
111 |
$txt = Unicode::Normalize::normalize('C', $txt); |
|
112 |
$txt = join ' ' , uniq(split(' ', $txt)); |
|
113 |
|
|
114 |
return $txt; |
|
115 |
} |
|
116 |
|
|
117 |
1; |
Auch abrufbar als: Unified diff
Volltext-Suche: Hintergrund-Job zum Extrahieren von Texten aus Dokumenten