package SL::BackgroundJob::CreateOrUpdateFileFullTexts;

use strict;

use parent qw(SL::BackgroundJob::Base);

use Encode qw(decode);
use English qw( -no_match_vars );
use File::Slurp qw(read_file);
use List::MoreUtils qw(uniq);
use IPC::Run qw();
use Unicode::Normalize qw();

use SL::DB::File;
use SL::DB::FileFullText;
use SL::HTML::Util;

# Dispatch table: maps each supported MIME type to the routine that extracts
# the searchable text from a file of that type. run() skips every file whose
# MIME type has no entry here.
my %extractor_by_mime_type = (
  'application/pdf' => \&_pdf_to_strings,
  'text/html'       => \&_html_to_strings,
  'text/plain'      => \&_text_to_strings,
);

# Creates the job entry by delegating to the invocant's create_standard_job()
# with a fixed daily schedule.
#
# Returns whatever create_standard_job() returns.
sub create_job {
  my ($self) = @_;

  # Cron spec: minute 20, hour 3, every day -> daily at 3:20 am.
  return $self->create_standard_job('20 3 * * *');
}

# Creates or refreshes the stored full text of every file in the files table.
#
# If the job does not throw an error, success in background_job_histories is
# 'success'. It is 'failure' otherwise.
# The return value goes to result in background_job_histories.
#
# Parameters:
#   $self   - the job instance
#   $db_obj - second argument passed by the caller; not used here
#
# Returns the string 'ok'. Exceptions raised by an extractor or by the
# database calls are not caught and propagate to the caller.
sub run {
  my ($self, $db_obj) = @_;

  my $all_dbfiles = SL::DB::Manager::File->get_all;

  foreach my $dbfile (@$all_dbfiles) {
    my $full_text = $dbfile->full_text;

    # Nothing to do if the stored text is at least as new as the file itself
    # (modification time, falling back to insertion time, on both sides).
    next if $full_text && (($dbfile->mtime || $dbfile->itime) <= ($full_text->mtime || $full_text->itime));

    # Only MIME types present in the dispatch table can be processed.
    my $extractor = $extractor_by_mime_type{$dbfile->mime_type};
    next if !defined $extractor;

    # NOTE(review): SL::File is not among this file's "use" lines; presumably
    # it is loaded elsewhere -- confirm.
    my $file_name;
    if (!eval { $file_name = SL::File->get(dbfile => $dbfile)->get_file(); 1; }) {
      $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::run: get_file failed: " . $EVAL_ERROR);
      # Without a file there is nothing to extract; continue with the next
      # file instead of calling the extractor with an undefined name.
      next;
    }

    my $text = $extractor->($file_name);

    if ($full_text) {
      $full_text->update_attributes(full_text => $text);
    } else {
      SL::DB::FileFullText->new(file => $dbfile, full_text => $text)->save;
    }
  }

  return 'ok';
}

# Extracts the distinct words of a PDF file by running the external
# "pdftotext" program.
#
# Parameters:
#   $file_name - path of the PDF file to read
#
# Returns the words of the document as one string, separated by single
# spaces, NFC-normalized and with duplicate words removed. Returns the empty
# string (after logging a warning) if pdftotext exits with a non-zero status.
sub _pdf_to_strings {
  my ($file_name) = @_;

  # The trailing '-' is passed as the output file argument; the converted
  # text is captured from the child's stdout into $txt below.
  my @cmd = (qw(pdftotext -enc UTF-8), $file_name, '-');

  my ($txt, $err);

  IPC::Run::run(\@cmd, \undef, \$txt, \$err);

  # Only a failed conversion is logged and answered with an empty text;
  # a successful run must fall through to the normalization below.
  if ($CHILD_ERROR) {
    $::lxdebug->message(LXDebug::WARN(), "CreateOrUpdateFileFullTexts::_pdf_to_text failed for '$file_name': " . ($CHILD_ERROR >> 8) . ": " . $err);
    return '';
  }

  $txt = Encode::decode('utf-8-strict', $txt);
  $txt =~ s{\r}{ }g;
  $txt =~ s{\p{WSpace}+}{ }g;                      # collapse whitespace runs
  $txt = Unicode::Normalize::normalize('C', $txt); # canonical composition (NFC)
  $txt = join ' ', uniq(split(' ', $txt));         # keep each word once

  return $txt;
}

# Extracts the distinct words of an HTML file.
#
# Parameters:
#   $file_name - path of the HTML file to read
#
# The file content is decoded as UTF-8 and passed through
# SL::HTML::Util::strip before the words are collected.
#
# Returns the words as one string, separated by single spaces,
# NFC-normalized and with duplicate words removed.
sub _html_to_strings {
  my ($file_name) = @_;

  my $txt = read_file($file_name);

  $txt = Encode::decode('utf-8-strict', $txt);
  $txt = SL::HTML::Util::strip($txt);
  $txt =~ s{\r}{ }g;
  $txt =~ s{\p{WSpace}+}{ }g;                      # collapse whitespace runs
  $txt = Unicode::Normalize::normalize('C', $txt); # canonical composition (NFC)
  $txt = join ' ', uniq(split(' ', $txt));         # keep each word once

  return $txt;
}

# Extracts the distinct words of a plain text file.
#
# Parameters:
#   $file_name - path of the text file to read
#
# The file content is decoded as UTF-8.
#
# Returns the words as one string, separated by single spaces,
# NFC-normalized and with duplicate words removed.
sub _text_to_strings {
  my ($file_name) = @_;

  my $txt = read_file($file_name);

  $txt = Encode::decode('utf-8-strict', $txt);
  $txt =~ s{\r}{ }g;
  $txt =~ s{\p{WSpace}+}{ }g;                      # collapse whitespace runs
  $txt = Unicode::Normalize::normalize('C', $txt); # canonical composition (NFC)
  $txt = join ' ', uniq(split(' ', $txt));         # keep each word once

  return $txt;
}




=encoding utf8

=head1 NAME

SL::BackgroundJob::CreateOrUpdateFileFullTexts - Extract text strings/words from
files in the DMS for full text search.


=head1 DESCRIPTION

Searches all documents in the files table, tries to extract strings from them,
and stores the extracted strings in the database.

Duplicate strings/words in one text are removed.

Strings are updated if the change time (or, if there is none, the creation
time) of the document is newer than that of the stored full text entry.

=head1 AUTHOR

Bernd Bleßmann E<lt>bernd@kivitendo-premium.deE<gt>
