}
// Script: re-extract the text of every stored PDF and rewrite its row in the
// full_text table. (This view ends inside the innermost block, so the commit
// and error reporting that presumably follow are not visible here.)
// Line-break sequences that get flattened to a single space further down.
$order = array("\r\n", "\n", "\r");
echo '<div style="padding:1em 2em">';
echo '<h3>Batch PDF re-indexing.</h3>';
echo '<h4>Error log:</h4>';
// Iterate all PDF files: every *.pdf inside a two-digit subfolder of IL_PDF_PATH.
$glob = new GlobIterator(IL_PDF_PATH . DIRECTORY_SEPARATOR . '[0-9][0-9]' . DIRECTORY_SEPARATOR . '*.pdf');
foreach ($glob as $pdf) {
    $answer = array();
    $file_path = $pdf->getPathname();
    $file_name = $pdf->getFilename();
    // The record id is the integer value of the file name without its ".pdf" suffix.
    $file_id = intval(basename($pdf->getFilename(), '.pdf'));
    // Extract text from PDF.
    if (is_readable($file_path)) {
        // pdftotext writes "<file name>.txt" into IL_TEMP_PATH; $ret receives the
        // exit status but is not inspected in the lines visible here.
        system(select_pdftotext() . ' -enc UTF-8 "' . $file_path . '" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . '.txt"', $ret);
        if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . ".txt")) {
            // Read the extracted text, then delete the scratch file straight away.
            $string = trim(file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . ".txt"));
            unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . ".txt");
            // Replace every run of characters outside tab, LF, CR, U+0020-U+D7FF
            // and U+E000-U+FFFD with one space.
            $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
            $string = trim($string);
            if (!empty($string)) {
                // Flatten line breaks, then collapse whitespace runs to one space.
                $string = str_replace($order, ' ', $string);
                $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                $output = false;
                // $dbHandle is not assigned in this view; presumably
                // database_connect() provides it - confirm.
                database_connect(IL_DATABASE_PATH, 'fulltext');
                $file_query = $dbHandle->quote($file_id);
                $fulltext_query = $dbHandle->quote($string);
                // Replace the stored text inside one transaction: delete the old
                // row for this file id, then insert the fresh text.
                $dbHandle->beginTransaction();
                $dbHandle->exec("DELETE FROM full_text WHERE fileID={$file_query}");
                $output = $dbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES ({$file_query},{$fulltext_query})");
Esempio n. 2
0
     // A DOI that ends in ')' or ']' may have swallowed the closing bracket of a
     // surrounding "(doi:...)" / "[doi:...]" wrapper. The bracket is stripped only
     // when the character in front of the first DOI-like match is the matching
     // opening bracket.
     if (substr($doi, -1) == ')' || substr($doi, -1) == ']') {
         preg_match_all('/(.)(doi:\\s?)?(10\\.\\d{4}\\/\\S+)/ui', $string, $doi2, PREG_PATTERN_ORDER);
         // Group 1 has no entry when nothing matched (for example when the DOI is
         // the very first thing in $string, so no character precedes it). Reading
         // $doi2[1][0] unguarded would raise an undefined-offset notice, so fall
         // back to an empty string that equals neither bracket.
         $doi_opener = isset($doi2[1][0]) ? $doi2[1][0] : '';
         if (substr($doi, -1) == ')' && $doi_opener == '(') {
             $doi = substr($doi, 0, -1);
         }
         if (substr($doi, -1) == ']' && $doi_opener == '[') {
             $doi = substr($doi, 0, -1);
         }
     }
     // Publish the cleaned DOI through $_POST['doi'].
     $_POST['doi'] = $doi;
     // Take the run of non-space characters that follows "arXiv:"; current()
     // yields false when preg_match() found nothing.
     preg_match('/(?<=arXiv:)\\S+/ui', $string, $arxiv_id);
     $arxiv_id = current($arxiv_id);
 }
 // Fallback when neither a DOI nor an arXiv id has been found so far.
 if (empty($doi) && empty($arxiv_id)) {
     ########## try to find DOI in the whole PDF ##########
     // NOTE(review): there is no separator between the command and the first
     // quoted path here; presumably select_pdftotext() returns a string that
     // ends in a space - confirm.
     system(select_pdftotext() . '"' . $temp_dir . DIRECTORY_SEPARATOR . $rand . '.pdf" "' . $temp_dir . DIRECTORY_SEPARATOR . $rand . '.txt"', $ret);
     if (is_file($temp_dir . DIRECTORY_SEPARATOR . $rand . ".txt")) {
         // Read the extracted text and remove the scratch file.
         $string = file_get_contents($temp_dir . DIRECTORY_SEPARATOR . $rand . ".txt");
         unlink($temp_dir . DIRECTORY_SEPARATOR . $rand . ".txt");
     }
     // When no text file was produced, $string keeps whatever value it had
     // before this block.
     if (!empty($string)) {
         // Replace every run of characters outside tab, LF, CR, U+0020-U+D7FF
         // and U+E000-U+FFFD with one space.
         $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
         // Flatten line breaks to spaces.
         $order = array("\r\n", "\n", "\r");
         $replace = ' ';
         $string = str_replace($order, $replace, $string);
         // Normalise en and em dashes to a plain hyphen before matching.
         $order = array("–", "—");
         $replace = '-';
         $string = str_replace($order, $replace, $string);
         // First "10.dddd/..." token in the text; current() yields false when
         // nothing matched.
         preg_match('/10\\.\\d{4}\\/\\S+/ui', $string, $doi);
         $doi = current($doi);
         if (substr($doi, -1) == '.') {
             $result = null;
         }
         $file_to_copy = '';
         if (isset($file_to_copy_match[0])) {
             $description = strtoupper(strrchr($file_to_copy_match[0], ':'));
             if ($description == ':PDF') {
                 $file_to_copy = substr($file_to_copy_match[0], strpos($file_to_copy_match[0], ':') + 1, strrpos($file_to_copy_match[0], ':') - strlen($file_to_copy_match[0]));
             }
             $file_to_copy = str_replace("\\", "/", $file_to_copy);
         }
         if (is_file($file_to_copy) && is_readable($file_to_copy)) {
             $result = $dbHandle->query("SELECT file FROM library WHERE id=" . $last_id);
             $pdf_filename = $result->fetchColumn();
             $result = null;
             copy($file_to_copy, IL_PDF_PATH . DIRECTORY_SEPARATOR . $pdf_filename);
             system(select_pdftotext() . ' -enc UTF-8 "' . $file_to_copy . '" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . '.txt"');
             if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . ".txt")) {
                 $string = file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
                 unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
                 if (!empty($string)) {
                     $order = array("\r\n", "\n", "\r");
                     $string = str_replace($order, ' ', $string);
                     $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                     $fulltext_query = $fdbHandle->quote($string);
                     $fdbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES (" . $last_id . ",{$fulltext_query})");
                 }
             }
         }
     }
 }
 $dbHandle->commit();
 } else {
     $error = "This is not a PDF.";
 }
 // Record the file hash on the matching library row (used for duplicate detection).
 if (!empty($hash)) {
     database_connect(IL_DATABASE_PATH, 'library');
     // Both values are escaped by the connection before they enter the SQL text.
     $hash = $dbHandle->quote($hash);
     $file = $dbHandle->quote($_POST['filename']);
     $update_sql = "UPDATE library SET filehash={$hash} WHERE file={$file}";
     $dbHandle->exec($update_sql);
     // Release the handle once the row is updated.
     $dbHandle = null;
 }
 ########## extract text from pdf ##########
 // Runs only when no earlier step has set $error. (This view ends inside the
 // innermost block; the rest of the transaction is not visible here.)
 if (!isset($error)) {
     $filename = $_POST['filename'];
     // The PDF is looked up under IL_PDF_PATH/<get_subfolder($filename)>/<filename>.
     if (is_file(IL_PDF_PATH . DIRECTORY_SEPARATOR . get_subfolder($filename) . DIRECTORY_SEPARATOR . $filename)) {
         // NOTE(review): $filename comes from $_POST and is placed inside a
         // double-quoted shell argument without escaping; only the is_file()
         // check above stands in front of it - confirm this is acceptable.
         // Output goes to IL_TEMP_PATH/lib_<session id>/<filename>.txt.
         system(select_pdftotext() . ' -enc UTF-8 "' . IL_PDF_PATH . DIRECTORY_SEPARATOR . get_subfolder($filename) . DIRECTORY_SEPARATOR . $filename . '" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . '.txt"', $ret);
         if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . ".txt")) {
             // Read the extracted text and delete the scratch file.
             $string = file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . ".txt");
             unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . ".txt");
             if (!empty($string)) {
                 // Replace every run of characters outside tab, LF, CR,
                 // U+0020-U+D7FF and U+E000-U+FFFD with one space.
                 $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
                 // Flatten line breaks, then collapse whitespace runs to one space.
                 $order = array("\r\n", "\n", "\r");
                 $string = str_replace($order, ' ', $string);
                 $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                 $output = null;
                 // $dbHandle is not assigned in this view; presumably
                 // database_connect() provides it - confirm.
                 database_connect(IL_DATABASE_PATH, 'fulltext');
                 // The record id is taken from $_POST['file'] and forced to an integer.
                 $file_query = $dbHandle->quote(intval($_POST['file']));
                 $fulltext_query = $dbHandle->quote($string);
                 // Replace the stored text inside one transaction: delete the
                 // old row for this id, then insert the fresh text.
                 $dbHandle->beginTransaction();
                 $dbHandle->exec("DELETE FROM full_text WHERE fileID={$file_query}");
                 $output = $dbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES ({$file_query},{$fulltext_query})");
Esempio n. 5
0
 } else {
     $error = "This is not a PDF.";
 }
 // Record the file hash on the matching library row (used for duplicate detection).
 if (!empty($hash)) {
     database_connect($database_path, 'library');
     // Both values are escaped by the connection before they enter the SQL text.
     $hash = $dbHandle->quote($hash);
     $file = $dbHandle->quote($_POST['filename']);
     $update_sql = "UPDATE library SET filehash={$hash} WHERE file={$file}";
     $dbHandle->exec($update_sql);
     // Release the handle once the row is updated.
     $dbHandle = null;
 }
 ##########	extract text from pdf	##########
 if (!isset($error)) {
     $filename = $_POST['filename'];
     if (is_file($library_path . DIRECTORY_SEPARATOR . $filename)) {
         system(select_pdftotext() . '"' . $library_path . DIRECTORY_SEPARATOR . $filename . '" "' . $temp_dir . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . '.txt"', $ret);
         if (is_file($temp_dir . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . ".txt")) {
             $stopwords = "a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, currently, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, during, each, edu, eg, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, far, few, followed, following, follows, for, former, formerly, from, further, furthermore, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i'd, i'll, i'm, i've, ie, if, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, just, keep, keeps, kept, know, knows, known, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, mainly, many, may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, 
myself, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, obviously, of, off, often, oh, ok, okay, old, on, once, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, particular, particularly, per, perhaps, placed, please, possible, presumably, probably, provides, que, quite, qv, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, said, same, saw, say, saying, says, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, several, shall, she, should, shouldn't, since, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they'd, they'll, they're, they've, think, this, thorough, thoroughly, those, though, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, value, various, very, via, viz, vs, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, would, wouldn't, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves";
             // Turn the comma-separated stop-word list into an array of words.
             // NOTE(review): as shown here the list literal contains a line break
             // in front of "myself"; if that is real, that entry starts with a
             // newline and cannot match once line breaks are replaced below - confirm.
             $stopwords = explode(', ', $stopwords);
             // Read the extracted text and delete the scratch file.
             $string = file_get_contents($temp_dir . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . ".txt");
             unlink($temp_dir . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . ".txt");
             if (!empty($string)) {
                 // Replace every run of characters outside tab, LF, CR,
                 // U+0020-U+D7FF and U+E000-U+FFFD with one space.
                 $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
                 // Build one "/\bword\b/ui" pattern per stop word: join the words
                 // with "\b/ui /\b", wrap the ends, then split on the spaces that
                 // the joiner inserted. This relies on no stop word containing a space.
                 $patterns = join("\\b/ui /\\b", $stopwords);
                 $patterns = "/\\b{$patterns}\\b/ui";
                 $patterns = explode(" ", $patterns);
                 // Flatten line breaks to spaces before the stop words are removed.
                 $order = array("\r\n", "\n", "\r");
                 $string = str_replace($order, ' ', $string);
                 // Delete every stop-word occurrence, then collapse the whitespace
                 // runs left behind.
                 $string = preg_replace($patterns, '', $string);
                 $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                 $fulltext_array = array();
Esempio n. 6
0
     // A DOI that ends in ')' or ']' may have swallowed the closing bracket of a
     // surrounding "(doi:...)" / "[doi:...]" wrapper. The bracket is stripped only
     // when the character in front of the first DOI-like match is the matching
     // opening bracket.
     if (substr($doi, -1) == ')' || substr($doi, -1) == ']') {
         preg_match_all('/(.)(doi:\\s?)?(10\\.\\d{4}\\/\\S+)/ui', $string, $doi2, PREG_PATTERN_ORDER);
         // Group 1 has no entry when nothing matched (for example when the DOI is
         // the very first thing in $string, so no character precedes it). Reading
         // $doi2[1][0] unguarded would raise an undefined-offset notice, so fall
         // back to an empty string that equals neither bracket.
         $doi_opener = isset($doi2[1][0]) ? $doi2[1][0] : '';
         if (substr($doi, -1) == ')' && $doi_opener == '(') {
             $doi = substr($doi, 0, -1);
         }
         if (substr($doi, -1) == ']' && $doi_opener == '[') {
             $doi = substr($doi, 0, -1);
         }
     }
     // Publish the cleaned DOI through $_POST['doi'].
     $_POST['doi'] = $doi;
     // Take the run of non-space characters that follows "arXiv:"; current()
     // yields false when preg_match() found nothing.
     preg_match('/(?<=arXiv:)\\S+/ui', $string, $arxiv_id);
     $arxiv_id = current($arxiv_id);
 }
 // Fallback when neither a DOI nor an arXiv id has been found so far.
 if (empty($doi) && empty($arxiv_id)) {
     ########## try to find DOI in the whole PDF ##########
     // Convert IL_TEMP_PATH/<$rand>.pdf to IL_TEMP_PATH/<$rand>.txt; $ret receives
     // the exit status but is not inspected in the lines visible here.
     system(select_pdftotext() . ' -enc UTF-8 "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . '.pdf" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . '.txt"', $ret);
     if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . ".txt")) {
         // Read the extracted text and remove the scratch file.
         $string = file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . ".txt");
         unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . ".txt");
     }
     // When no text file was produced, $string keeps whatever value it had
     // before this block.
     if (!empty($string)) {
         // Replace every run of characters outside tab, LF, CR, U+0020-U+D7FF
         // and U+E000-U+FFFD with one space.
         $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
         // Flatten line breaks to spaces.
         $order = array("\r\n", "\n", "\r");
         $replace = ' ';
         $string = str_replace($order, $replace, $string);
         // Normalise en and em dashes to a plain hyphen before matching.
         $order = array("–", "—");
         $replace = '-';
         $string = str_replace($order, $replace, $string);
         // First "10.dddd/..." token in the text; current() yields false when
         // nothing matched.
         preg_match('/10\\.\\d{4}\\/\\S+/ui', $string, $doi);
         $doi = current($doi);
         if (substr($doi, -1) == '.') {
Esempio n. 7
0
    if (!file_exists($file)) {
        die('{"Error":"PDF does not exist!"}');
    }
} else {
    die('{"Error":"No PDF provided!"}');
}
// A search term is mandatory: stop with a JSON error when it is missing or empty.
if (empty($_GET['search_term'])) {
    die('{"Error":"No search term provided!"}');
}
// Backslash-escape regex metacharacters. In an addcslashes() character list
// "(..+" is a range covering ( ) * +, so the escaped set is
// $ ( ) * + . / < > [ \ ^ { | and the question mark.
$search_term = addcslashes($_GET['search_term'], "\$(..+./<>?[\\^{|");
// Re-enable the two wildcard tokens, which arrive here in escaped form:
// "<", "?", ">" becomes "any single character" and "<", "*", ">" becomes
// "any run of characters".
$search_term = str_replace('\\<\\?\\>', '.', $search_term);
$search_term = str_replace('\\<\\*\\>', '.*', $search_term);
// Text extracted from the PDF is kept next to the other temp files as "<name>.txt".
$temp_file = implode(DIRECTORY_SEPARATOR, array($temp_dir, $file_name . '.txt'));
// Convert again only when no text file exists yet or the PDF is newer than it.
$text_is_stale = !file_exists($temp_file) || filemtime($temp_file) < filemtime($file);
if ($text_is_stale) {
    $convert_cmd = select_pdftotext() . '-layout -enc UTF-8 "' . $file . '" "' . $temp_file . '"';
    system($convert_cmd, $ret);
}
// An unreadable or empty text file means the conversion did not work.
$string = file_get_contents($temp_file);
if (empty($string)) {
    die('{"Error":"PDF to text conversion failed!"}');
}
// Split the extracted text on form feeds; each chunk is treated as one page.
$pages = explode("\f", $string);
// Collect the zero-based indexes of the pages on which the search term occurs.
// foreach replaces the former while/list/each() loop: each() was deprecated in
// PHP 7.2 and removed in PHP 8.0, and foreach visits the same key/value pairs
// in the same order.
$output_pages = array();
foreach ($pages as $page_num => $page_str) {
    // Case-insensitive, UTF-8 aware match; a failed match (0 or false) is skipped.
    if (preg_match("/{$search_term}/ui", $page_str) > 0) {
        $output_pages[] = $page_num;
    }
}
$final_pages = array();
foreach ($output_pages as $output_page) {
Esempio n. 8
0
 // Reset the metadata fields to their defaults before the text is extracted.
 $uid = array();
 $editor = '';
 $reference_type = 'article';
 $publisher = '';
 $place_published = '';
 $doi = '';
 $authors_ascii = '';
 $title_ascii = '';
 $abstract_ascii = '';
 $unpacked_files = array();
 $response = array();
 // Scratch file that pdftotext writes for the file with index $i.
 $librarian_temp_txt = $temp_dir . DIRECTORY_SEPARATOR . 'librarian_temp' . $i . '.txt';
 // Remove a leftover so that a file found after the conversion was written by it.
 if (file_exists($librarian_temp_txt)) {
     unlink($librarian_temp_txt);
 }
 ########## extract text from pdf ##########
 system(select_pdftotext() . '"' . $file . '" "' . $librarian_temp_txt . '"', $ret);
 // Start from an empty string: when the conversion writes no file, the
 // empty($string) check that follows must not see text left in $string by
 // earlier code.
 $string = '';
 if (file_exists($librarian_temp_txt)) {
     $string = file_get_contents($librarian_temp_txt);
 }
 if (empty($string)) {
     if (isset($_GET['failed']) && $_GET['failed'] == '1') {
         database_connect($database_path, 'library');
         record_unknown($dbHandle, $orig_filename, $string, $file, $userID);
         $put = basename($orig_filename) . ": Recorded as unknown. Full text not indexed (copying disallowed).<br>";
     } else {
         $put = basename($orig_filename) . ": copying disallowed.<br>";
     }
 } else {
     $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
     $string = str_replace($order, ' ', $string);
     $order = array("–", "—");
 // Reset the metadata fields to their defaults before the text is extracted.
 $uid = array();
 $editor = '';
 $reference_type = 'article';
 $publisher = '';
 $place_published = '';
 $doi = '';
 $authors_ascii = '';
 $title_ascii = '';
 $abstract_ascii = '';
 $unpacked_files = array();
 $response = array();
 // Scratch file that pdftotext writes for the file with index $i.
 $librarian_temp_txt = IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'librarian_temp' . $i . '.txt';
 // Remove a leftover so that a file found after the conversion was written by it.
 if (file_exists($librarian_temp_txt)) {
     unlink($librarian_temp_txt);
 }
 ########## extract text from pdf ##########
 system(select_pdftotext() . ' -enc UTF-8 "' . $file . '" "' . $librarian_temp_txt . '"', $ret);
 // Start from an empty string: when the conversion writes no file, the
 // empty($string) check that follows must not see text left in $string by
 // earlier code.
 $string = '';
 if (file_exists($librarian_temp_txt)) {
     $string = file_get_contents($librarian_temp_txt);
 }
 if (empty($string)) {
     if (isset($_GET['failed']) && $_GET['failed'] == '1') {
         database_connect(IL_DATABASE_PATH, 'library');
         record_unknown($dbHandle, $orig_filename, $string, $file, $userID);
         $put = basename($orig_filename) . ": Recorded as unknown. Full text not indexed (copying disallowed).<br>";
     } else {
         $put = basename($orig_filename) . ": copying disallowed.<br>";
     }
 } else {
     $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
     $string = str_replace($order, ' ', $string);
     $order = array("–", "—");
Esempio n. 10
0
<?php

include_once 'data.php';
include_once 'functions.php';
// Self-test for the external binaries: $_GET['binary'] selects which tool is
// run against test.pdf. The script answers "OK" when the expected result
// appears and ends with no output otherwise. (This view ends inside the chain.)
session_write_close();
if ($_GET['binary'] == 'pdftotext') {
    // Success means pdftotext produced test.txt in $temp_dir; the file is removed again.
    exec(select_pdftotext() . ' test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.txt"');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.txt')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.txt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfinfo') {
    // Success means pdfinfo printed at least one line of output.
    exec(select_pdfinfo() . ' test.pdf', $output);
    if (!empty($output)) {
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftohtml') {
    // Success means pdftohtml produced test.xml in $temp_dir; the file is removed again.
    exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test"');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.xml')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.xml');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'ghostscript') {
    // Render only the first page as a low-resolution PNG and check that
    // test.png appears in $temp_dir.
    exec(select_ghostscript() . ' -sDEVICE=png16m -r15 -dTextAlphaBits=1 -dGraphicsAlphaBits=1 -dFirstPage=1 -dLastPage=1 -o "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.png" test.pdf');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.png')) {
<?php

include_once 'data.php';
include_once 'functions.php';
// Self-test for the external binaries: $_GET['binary'] selects which tool is
// run against test.pdf. The script answers "OK" when the expected result
// appears and ends with no output otherwise. (This view ends inside the chain.)
session_write_close();
if ($_GET['binary'] == 'pdftotext') {
    // Success means pdftotext produced test.txt in IL_TEMP_PATH; the file is removed again.
    exec(select_pdftotext() . ' -enc UTF-8 test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt"');
    if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt')) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfinfo') {
    // Success means pdfinfo printed at least one line of output.
    exec(select_pdfinfo() . ' test.pdf', $output);
    if (!empty($output)) {
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftohtml') {
    // Success means pdftohtml produced test.xml in IL_TEMP_PATH; the file is removed again.
    exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test"');
    if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml')) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfdetach') {
    // Save all attachments of test.pdf into IL_TEMP_PATH and check that a
    // readable, non-empty test.odt appears there.
    exec(select_pdfdetach() . ' -saveall -o "' . IL_TEMP_PATH . '" test.pdf');
    if (is_readable(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') && filesize(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') > 0) {
Esempio n. 12
0
 // Reset the metadata fields to their defaults before the text is extracted.
 $uid = '';
 $editor = '';
 $reference_type = 'article';
 $publisher = '';
 $place_published = '';
 $doi = '';
 $authors_ascii = '';
 $title_ascii = '';
 $abstract_ascii = '';
 $unpacked_files = array();
 // Scratch text file named after the requesting user and the file index $i.
 // NOTE(review): $_GET['user'] is placed in a file-system path as-is - confirm
 // that it is validated before this point.
 $temp_file = $temp_dir . DIRECTORY_SEPARATOR . $_GET['user'] . "_librarian_temp" . $i . ".txt";
 // Remove a leftover so that a file found after the conversion was written by it.
 if (file_exists($temp_file)) {
     unlink($temp_file);
 }
 ########## extract text from pdf ##########
 system(select_pdftotext() . '"' . $file . '" "' . $temp_file . '"', $ret);
 // Start from an empty string: when the conversion writes no file, the
 // empty($string) check that follows must not see text left in $string by
 // earlier code.
 $string = '';
 if (file_exists($temp_file)) {
     $string = file_get_contents($temp_file);
 }
 if (empty($string)) {
     if (isset($_GET['failed']) && $_GET['failed'] == '1') {
         database_connect($database_path, 'library');
         record_unknown($dbHandle, '', $string, $file, $userID);
         $put = " ({$i}) " . basename($file) . ": Recorded into category !unknown. Full text not indexed (copying disallowed).<br>";
         file_put_contents($log, $put, FILE_APPEND);
     } else {
         $put = " ({$i}) " . basename($file) . ": copying disallowed.<br>";
         file_put_contents($log, $put, FILE_APPEND);
     }
 } else {
     $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
Esempio n. 13
0
     $result = null;
 }
 $file_to_copy = '';
 if (isset($file_to_copy_match[0])) {
     $description = strtoupper(strrchr($file_to_copy_match[0], ':'));
     if ($description == ':PDF') {
         $file_to_copy = substr($file_to_copy_match[0], strpos($file_to_copy_match[0], ':') + 1, strrpos($file_to_copy_match[0], ':') - strlen($file_to_copy_match[0]));
     }
 }
 if (is_file($file_to_copy) && is_readable($file_to_copy)) {
     $result = $dbHandle->query("SELECT file FROM library WHERE id=" . $last_id);
     $pdf_filename = $result->fetchColumn();
     $pdf_filename = 'temp-' . $pdf_filename;
     $result = null;
     copy($file_to_copy, dirname(__FILE__) . DIRECTORY_SEPARATOR . 'library' . DIRECTORY_SEPARATOR . $pdf_filename);
     system(select_pdftotext() . '"' . $file_to_copy . '" "' . $temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . '.txt"');
     if (is_file($temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . ".txt")) {
         $stopwords = "a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, currently, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, during, each, edu, eg, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, far, few, followed, following, follows, for, former, formerly, from, further, furthermore, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i'd, i'll, i'm, i've, ie, if, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, just, keep, keeps, kept, know, knows, known, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, mainly, many, may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, 
myself, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, obviously, of, off, often, oh, ok, okay, old, on, once, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, particular, particularly, per, perhaps, placed, please, possible, presumably, probably, provides, que, quite, qv, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, said, same, saw, say, saying, says, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, several, shall, she, should, shouldn't, since, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they'd, they'll, they're, they've, think, this, thorough, thoroughly, those, though, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, value, various, very, via, viz, vs, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, would, wouldn't, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves";
         // Turn the comma-separated stop-word list into an array of words.
         // NOTE(review): as shown here the list literal contains a line break in
         // front of "myself"; if that is real, that entry starts with a newline
         // and cannot match once line breaks are replaced below - confirm.
         $stopwords = explode(', ', $stopwords);
         // Read the extracted text and delete the scratch file.
         $string = file_get_contents($temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
         unlink($temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
         if (!empty($string)) {
             // Build one "/\bword\b/ui" pattern per stop word: join the words with
             // "\b/ui /\b", wrap the ends, then split on the spaces that the joiner
             // inserted. This relies on no stop word containing a space.
             $patterns = join("\\b/ui /\\b", $stopwords);
             $patterns = "/\\b{$patterns}\\b/ui";
             $patterns = explode(" ", $patterns);
             // Flatten line breaks to spaces before the stop words are removed.
             $order = array("\r\n", "\n", "\r");
             $string = str_replace($order, ' ', $string);
             // Delete every stop-word occurrence, then collapse the whitespace runs
             // left behind.
             $string = preg_replace($patterns, '', $string);
             $string = preg_replace('/\\s{2,}/ui', ' ', $string);
             $fulltext_array = array();
             $fulltext_unique = array();