Projekt

Allgemein

Profil

« Zurück | Weiter » 

Revision 0532e721

Von Bernd Bleßmann vor mehr als 2 Jahren hinzugefügt

  • ID 0532e721f1427c5d3a545975b6d7fe5b4fcc9873
  • Vorgänger 7718459c
  • Nachfolger af8f0a39

Volltext-Suche: Hintergrund-Job zum Extrahieren von Texten aus Dokumenten

Unterschiede anzeigen:

SL/BackgroundJob/CreateOrUpdateFileFullTexts.pm
1
package SL::BackgroundJob::CreateOrUpdateFileFullTexts;
2

  
3
use strict;
4

  
5
use parent qw(SL::BackgroundJob::Base);
6

  
7
use Encode qw(decode);
8
use English qw( -no_match_vars );
9
use File::Slurp qw(read_file);
10
use List::MoreUtils qw(uniq);
11
use IPC::Run qw();
12
use Unicode::Normalize qw();
13

  
14
use SL::DB::File;
15
use SL::DB::FileFullText;
16
use SL::HTML::Util;
17

  
18
my %extractor_by_mime_type = (
19
  'application/pdf' => \&_pdf_to_strings,
20
  'text/html'       => \&_html_to_strings,
21
  'text/plain'      => \&_text_to_strings,
22
);
23

  
24
sub create_job {
25
  $_[0]->create_standard_job('7 * * * *'); # seven minutes after every hour
26
}
27

  
28
#
29
# If job does not throw an error,
30
# success in background_job_histories is 'success'.
31
# It is 'failure' otherwise.
32
#
33
# return value goes to result in background_job_histories
34
#
35
sub run {
36
  my $self    = shift;
37
  my $db_obj  = shift;
38

  
39
  my $all_dbfiles = SL::DB::Manager::File->get_all;
40

  
41
  foreach my $dbfile (@$all_dbfiles) {
42
    next if $dbfile->full_text && (($dbfile->mtime || $dbfile->itime) <= ($dbfile->full_text->mtime || $dbfile->full_text->itime));
43
    next if !defined $extractor_by_mime_type{$dbfile->mime_type};
44

  
45
    my $file_name;
46
    if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) {
47
      $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR);
48
      next;
49
    }
50

  
51
    my $text = $extractor_by_mime_type{$dbfile->mime_type}->($file_name);
52

  
53
    if ($dbfile->full_text) {
54
      $dbfile->full_text->update_attributes(full_text => $text);
55
    } else {
56
      SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save;
57
    }
58
  }
59

  
60
  return 'ok';
61
}
62

  
63
sub _pdf_to_strings {
64
  my ($file_name) = @_;
65

  
66
  my   @cmd = qw(pdftotext -enc UTF-8);
67
  push @cmd,  $file_name;
68
  push @cmd,  '-';
69

  
70
  my ($txt, $err);
71

  
72
  IPC::Run::run \@cmd, \undef, \$txt, \$err;
73

  
74
  if ($CHILD_ERROR) {
75
    $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err);
76
    return '';
77
  }
78

  
79
  $txt = Encode::decode('utf-8-strict', $txt);
80
  $txt =~ s{\r}{ }g;
81
  $txt =~ s{\p{WSpace}+}{ }g;
82
  $txt = Unicode::Normalize::normalize('C', $txt);
83
  $txt = join ' ' , uniq(split(' ', $txt));
84

  
85
  return $txt;
86
}
87

  
88
sub _html_to_strings {
89
  my ($file_name) = @_;
90

  
91
  my $txt = read_file($file_name);
92

  
93
  $txt = Encode::decode('utf-8-strict', $txt);
94
  $txt = SL::HTML::Util::strip($txt);
95
  $txt =~ s{\r}{ }g;
96
  $txt =~ s{\p{WSpace}+}{ }g;
97
  $txt = Unicode::Normalize::normalize('C', $txt);
98
  $txt = join ' ' , uniq(split(' ', $txt));
99

  
100
  return $txt;
101
}
102

  
103
sub _text_to_strings {
104
  my ($file_name) = @_;
105

  
106
  my $txt = read_file($file_name);
107

  
108
  $txt = Encode::decode('utf-8-strict', $txt);
109
  $txt =~ s{\r}{ }g;
110
  $txt =~ s{\p{WSpace}+}{ }g;
111
  $txt = Unicode::Normalize::normalize('C', $txt);
112
  $txt = join ' ' , uniq(split(' ', $txt));
113

  
114
  return $txt;
115
}
116

  
117
1;

Auch abrufbar als: Unified diff