} // Closes a block that was opened before this excerpt.
// Script.
// Batch re-indexing: for every stored PDF, run pdftotext, normalise the
// extracted text and replace that file's row in the full_text table.

// Line-break sequences that are later replaced with a single space.
$order = array("\r\n", "\n", "\r");
echo '<div style="padding:1em 2em">';
echo '<h3>Batch PDF re-indexing.</h3>';
echo '<h4>Error log:</h4>';
// Iterate all PDF files.
// Only files matching <IL_PDF_PATH>/<two digits>/*.pdf are visited.
$glob = new GlobIterator(IL_PDF_PATH . DIRECTORY_SEPARATOR . '[0-9][0-9]' . DIRECTORY_SEPARATOR . '*.pdf');
foreach ($glob as $pdf) {
    $answer = array();
    $file_path = $pdf->getPathname();
    $file_name = $pdf->getFilename();
    // The numeric record id is the PDF's base name with ".pdf" removed.
    $file_id = intval(basename($pdf->getFilename(), '.pdf'));
    // Extract text from PDF.
    if (is_readable($file_path)) {
        // Output goes to <IL_TEMP_PATH>/<file name>.txt; the exit status lands in $ret.
        system(select_pdftotext() . ' -enc UTF-8 "' . $file_path . '" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . '.txt"', $ret);
        if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . ".txt")) {
            // Read the text, then remove the temporary file straight away.
            $string = trim(file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . ".txt"));
            unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $file_name . ".txt");
            // Replace every run of characters other than TAB, LF, CR,
            // U+0020-U+D7FF and U+E000-U+FFFD with one space.
            $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
            $string = trim($string);
            if (!empty($string)) {
                // Flatten line breaks, then collapse runs of whitespace.
                $string = str_replace($order, ' ', $string);
                $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                $output = false;
                // NOTE(review): $dbHandle is presumably set up by database_connect(); confirm.
                database_connect(IL_DATABASE_PATH, 'fulltext');
                $file_query = $dbHandle->quote($file_id);
                $fulltext_query = $dbHandle->quote($string);
                // Delete-then-insert inside one transaction replaces the stored text for this file.
                $dbHandle->beginTransaction();
                $dbHandle->exec("DELETE FROM full_text WHERE fileID={$file_query}");
                $output = $dbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES ({$file_query},{$fulltext_query})");
// A DOI candidate that ends in ")" or "]" may have swallowed a closing
// bracket from the surrounding text. Strip it only when the character in
// front of the first DOI-like match in $string is the matching opener.
if (substr($doi, -1) == ')' || substr($doi, -1) == ']') {
    // $doi2[1] holds, per match, the single character preceding the
    // optional "doi:" prefix and the DOI itself.
    preg_match_all('/(.)(doi:\\s?)?(10\\.\\d{4}\\/\\S+)/ui', $string, $doi2, PREG_PATTERN_ORDER);
    if (substr($doi, -1) == ')' && $doi2[1][0] == '(') {
        $doi = substr($doi, 0, -1);
    }
    if (substr($doi, -1) == ']' && $doi2[1][0] == '[') {
        $doi = substr($doi, 0, -1);
    }
}
$_POST['doi'] = $doi;
// The arXiv id is whatever non-whitespace run follows "arXiv:".
preg_match('/(?<=arXiv:)\\S+/ui', $string, $arxiv_id);
// current() yields false when nothing matched.
$arxiv_id = current($arxiv_id);
} // Closes a block that was opened before this excerpt.
if (empty($doi) && empty($arxiv_id)) {
    ########## try to find DOI in the whole PDF ##########
    // NOTE(review): there is no space between select_pdftotext() and the
    // first quoted argument; presumably the helper returns a trailing space. Confirm.
    system(select_pdftotext() . '"' . $temp_dir . DIRECTORY_SEPARATOR . $rand . '.pdf" "' . $temp_dir . DIRECTORY_SEPARATOR . $rand . '.txt"', $ret);
    if (is_file($temp_dir . DIRECTORY_SEPARATOR . $rand . ".txt")) {
        $string = file_get_contents($temp_dir . DIRECTORY_SEPARATOR . $rand . ".txt");
        unlink($temp_dir . DIRECTORY_SEPARATOR . $rand . ".txt");
    }
    // NOTE(review): if the conversion produced no file, $string still holds
    // its earlier value here and is scanned again.
    if (!empty($string)) {
        // Replace every run of characters other than TAB, LF, CR,
        // U+0020-U+D7FF and U+E000-U+FFFD with one space.
        $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
        // Line breaks become spaces.
        $order = array("\r\n", "\n", "\r");
        $replace = ' ';
        $string = str_replace($order, $replace, $string);
        // En and em dashes become plain hyphens before the DOI pattern is applied.
        $order = array("–", "—");
        $replace = '-';
        $string = str_replace($order, $replace, $string);
        // First "10.<four digits>/<non-whitespace>" occurrence; false when absent.
        preg_match('/10\\.\\d{4}\\/\\S+/ui', $string, $doi);
        $doi = current($doi);
        if (substr($doi, -1) == '.') {
$result = null;
} // Closes a block that was opened before this excerpt.
// Work out whether the imported record links to a local PDF and, if so,
// copy it into the library and index its text.
$file_to_copy = '';
if (isset($file_to_copy_match[0])) {
    // Everything from the last ":" onward, upper-cased, is treated as the type tag.
    $description = strtoupper(strrchr($file_to_copy_match[0], ':'));
    if ($description == ':PDF') {
        // Keep the text after the first ":" and before the last ":".
        // The negative length (last-colon position minus string length)
        // drops the trailing ":PDF" part.
        $file_to_copy = substr($file_to_copy_match[0], strpos($file_to_copy_match[0], ':') + 1, strrpos($file_to_copy_match[0], ':') - strlen($file_to_copy_match[0]));
    }
    // Backslashes are converted to forward slashes.
    $file_to_copy = str_replace("\\", "/", $file_to_copy);
}
if (is_file($file_to_copy) && is_readable($file_to_copy)) {
    // Look up the library file name assigned to the record with id $last_id.
    $result = $dbHandle->query("SELECT file FROM library WHERE id=" . $last_id);
    $pdf_filename = $result->fetchColumn();
    $result = null;
    copy($file_to_copy, IL_PDF_PATH . DIRECTORY_SEPARATOR . $pdf_filename);
    // NOTE(review): $file_to_copy is placed inside a double-quoted shell
    // argument; if it can come from imported data, consider escapeshellarg().
    system(select_pdftotext() . ' -enc UTF-8 "' . $file_to_copy . '" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . '.txt"');
    if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . ".txt")) {
        // Read the extracted text, then delete the temporary file.
        $string = file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
        if (!empty($string)) {
            // Flatten line breaks, then collapse runs of whitespace.
            $order = array("\r\n", "\n", "\r");
            $string = str_replace($order, ' ', $string);
            $string = preg_replace('/\\s{2,}/ui', ' ', $string);
            // The full text is written through $fdbHandle, a different
            // handle from the $dbHandle committed at the end.
            $fulltext_query = $fdbHandle->quote($string);
            $fdbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES (" . $last_id . ",{$fulltext_query})");
        }
    }
}
// The next two braces close blocks that were opened before this excerpt.
}
}
$dbHandle->commit();
} else {
    $error = "This is not a PDF.";
}
//RECORD FILE HASH FOR DUPLICATE DETECTION
if (!empty($hash)) {
    database_connect(IL_DATABASE_PATH, 'library');
    $hash = $dbHandle->quote($hash);
    $file = $dbHandle->quote($_POST['filename']);
    $dbHandle->exec('UPDATE library SET filehash=' . $hash . ' WHERE file=' . $file);
    $dbHandle = null;
}
########## extract text from pdf ##########
// Convert the stored PDF to text, normalise it and replace the record's
// row in the full_text table.
if (!isset($error)) {
    $filename = $_POST['filename'];
    // Source PDF and temporary text file. Both paths contain the
    // request-supplied file name, so they are built once and reused.
    $fulltext_pdf = IL_PDF_PATH . DIRECTORY_SEPARATOR . get_subfolder($filename) . DIRECTORY_SEPARATOR . $filename;
    $fulltext_txt = IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . '.txt';
    if (is_file($fulltext_pdf)) {
        // escapeshellarg() instead of hand-written double quotes: inside "..."
        // the shell still expands $(...) and backticks, and a literal " in
        // the name would end the argument.
        // NOTE(review): escapeshellarg() is locale-sensitive; LC_CTYPE must be
        // UTF-8 capable if library paths may contain non-ASCII characters.
        system(select_pdftotext() . ' -enc UTF-8 ' . escapeshellarg($fulltext_pdf) . ' ' . escapeshellarg($fulltext_txt), $ret);
        if (is_file($fulltext_txt)) {
            // Read the extracted text, then delete the temporary file.
            $string = file_get_contents($fulltext_txt);
            unlink($fulltext_txt);
            if (!empty($string)) {
                // Replace every run of characters other than TAB, LF, CR,
                // U+0020-U+D7FF and U+E000-U+FFFD with one space.
                $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
                // Flatten line breaks, then collapse runs of whitespace.
                $order = array("\r\n", "\n", "\r");
                $string = str_replace($order, ' ', $string);
                $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                $output = null;
                database_connect(IL_DATABASE_PATH, 'fulltext');
                $file_query = $dbHandle->quote(intval($_POST['file']));
                $fulltext_query = $dbHandle->quote($string);
                // Delete-then-insert inside one transaction replaces the stored text.
                $dbHandle->beginTransaction();
                $dbHandle->exec("DELETE FROM full_text WHERE fileID={$file_query}");
                $output = $dbHandle->exec("INSERT INTO full_text (fileID,full_text) VALUES ({$file_query},{$fulltext_query})");
} else {
    $error = "This is not a PDF.";
}
//RECORD FILE HASH FOR DUPLICATE DETECTION
if (!empty($hash)) {
    database_connect($database_path, 'library');
    $hash = $dbHandle->quote($hash);
    $file = $dbHandle->quote($_POST['filename']);
    $dbHandle->exec('UPDATE library SET filehash=' . $hash . ' WHERE file=' . $file);
    $dbHandle = null;
}
########## extract text from pdf ##########
// Convert the stored PDF to text, strip stop-words and normalise whitespace
// before the text is indexed further down.
if (!isset($error)) {
    $filename = $_POST['filename'];
    // Source PDF and temporary text file. Both paths contain the
    // request-supplied file name, so they are built once and reused.
    $fulltext_pdf = $library_path . DIRECTORY_SEPARATOR . $filename;
    $fulltext_txt = $temp_dir . DIRECTORY_SEPARATOR . 'lib_' . session_id() . DIRECTORY_SEPARATOR . $filename . '.txt';
    if (is_file($fulltext_pdf)) {
        // escapeshellarg() instead of hand-written double quotes: inside "..."
        // the shell still expands $(...) and backticks, and a literal " in
        // the name would end the argument. As before, nothing is inserted
        // between select_pdftotext() and the first argument.
        // NOTE(review): escapeshellarg() is locale-sensitive; LC_CTYPE must be
        // UTF-8 capable if library paths may contain non-ASCII characters.
        system(select_pdftotext() . escapeshellarg($fulltext_pdf) . ' ' . escapeshellarg($fulltext_txt), $ret);
        if (is_file($fulltext_txt)) {
            // Comma-separated stop-word list. The literal is wrapped across
            // lines, so entries are trimmed after splitting (see below).
            $stopwords = "a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, currently, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, during, each, edu, eg, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, far, few, followed, following, follows, for, former, 
formerly, from, further, furthermore, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i'd, i'll, i'm, i've, ie, if, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, just, keep, keeps, kept, know, knows, known, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, mainly, many, may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, myself, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, obviously, of, off, often, oh, ok, okay, old, on, once, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, particular, particularly, per, perhaps, placed, please, possible, presumably, probably, provides, que, quite, qv, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, said, same, saw, say, saying, says, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, several, shall, she, should, shouldn't, since, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they'd, they'll, they're, they've, think, 
this, thorough, thoroughly, those, though, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, value, various, very, via, viz, vs, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, would, wouldn't, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves";
            // Split on "," and trim each entry. Splitting on ", " left a
            // leading newline on the first word of every wrapped line, so
            // those words could never match as stop-words.
            $stopwords = array_map('trim', explode(',', $stopwords));
            // Read the extracted text, then delete the temporary file.
            $string = file_get_contents($fulltext_txt);
            unlink($fulltext_txt);
            if (!empty($string)) {
                // Replace every run of characters other than TAB, LF, CR,
                // U+0020-U+D7FF and U+E000-U+FFFD with one space.
                $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
                // Build one case-insensitive whole-word pattern per stop-word:
                // join with "\b/ui /\b", wrap the ends, then split on spaces.
                $patterns = join("\\b/ui /\\b", $stopwords);
                $patterns = "/\\b{$patterns}\\b/ui";
                $patterns = explode(" ", $patterns);
                // Flatten line breaks, remove stop-words, collapse whitespace.
                $order = array("\r\n", "\n", "\r");
                $string = str_replace($order, ' ', $string);
                $string = preg_replace($patterns, '', $string);
                $string = preg_replace('/\\s{2,}/ui', ' ', $string);
                $fulltext_array = array();
// A DOI candidate that ends in ")" or "]" may have swallowed a closing
// bracket from the surrounding text. Strip it only when the character in
// front of the first DOI-like match in $string is the matching opener.
if (substr($doi, -1) == ')' || substr($doi, -1) == ']') {
    // $doi2[1] holds, per match, the single character preceding the
    // optional "doi:" prefix and the DOI itself.
    preg_match_all('/(.)(doi:\\s?)?(10\\.\\d{4}\\/\\S+)/ui', $string, $doi2, PREG_PATTERN_ORDER);
    if (substr($doi, -1) == ')' && $doi2[1][0] == '(') {
        $doi = substr($doi, 0, -1);
    }
    if (substr($doi, -1) == ']' && $doi2[1][0] == '[') {
        $doi = substr($doi, 0, -1);
    }
}
$_POST['doi'] = $doi;
// The arXiv id is whatever non-whitespace run follows "arXiv:".
preg_match('/(?<=arXiv:)\\S+/ui', $string, $arxiv_id);
// current() yields false when nothing matched.
$arxiv_id = current($arxiv_id);
} // Closes a block that was opened before this excerpt.
if (empty($doi) && empty($arxiv_id)) {
    ########## try to find DOI in the whole PDF ##########
    // Converts <IL_TEMP_PATH>/<$rand>.pdf to <IL_TEMP_PATH>/<$rand>.txt as UTF-8.
    system(select_pdftotext() . ' -enc UTF-8 "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . '.pdf" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . '.txt"', $ret);
    if (is_file(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . ".txt")) {
        $string = file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . ".txt");
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . $rand . ".txt");
    }
    // NOTE(review): if the conversion produced no file, $string still holds
    // its earlier value here and is scanned again.
    if (!empty($string)) {
        // Replace every run of characters other than TAB, LF, CR,
        // U+0020-U+D7FF and U+E000-U+FFFD with one space.
        $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
        // Line breaks become spaces.
        $order = array("\r\n", "\n", "\r");
        $replace = ' ';
        $string = str_replace($order, $replace, $string);
        // En and em dashes become plain hyphens before the DOI pattern is applied.
        $order = array("–", "—");
        $replace = '-';
        $string = str_replace($order, $replace, $string);
        // First "10.<four digits>/<non-whitespace>" occurrence; false when absent.
        preg_match('/10\\.\\d{4}\\/\\S+/ui', $string, $doi);
        $doi = current($doi);
        if (substr($doi, -1) == '.') {
if (!file_exists($file)) {
    die('{"Error":"PDF does not exist!"}');
}
} else { // Pairs with an "if" that was opened before this excerpt.
    die('{"Error":"No PDF provided!"}');
}
// Turn the user's search term into a regular expression. addcslashes()
// backslash-escapes $ ( ) * + . / < > ? [ \ ^ { | (the "(..+" part of the
// character list is a range covering "(" through "+"). Afterwards the
// now-escaped placeholders "<?>" and "<*>" are rewritten to "." and ".*".
if (!empty($_GET['search_term'])) {
    $search_term = addcslashes($_GET['search_term'], "\$(..+./<>?[\\^{|");
    $search_term = str_replace('\\<\\?\\>', '.', $search_term);
    $search_term = str_replace('\\<\\*\\>', '.*', $search_term);
} else {
    die('{"Error":"No search term provided!"}');
}
// The text version is cached next to other temp files and regenerated only
// when it is missing or older than the PDF.
$temp_file = $temp_dir . DIRECTORY_SEPARATOR . $file_name . '.txt';
if (!file_exists($temp_file) || filemtime($temp_file) < filemtime($file)) {
    // NOTE(review): there is no space between select_pdftotext() and
    // "-layout"; presumably the helper returns a trailing space. Confirm.
    // NOTE(review): $file is placed inside a double-quoted shell argument;
    // if it can come from the request, consider escapeshellarg().
    system(select_pdftotext() . '-layout -enc UTF-8 "' . $file . '" "' . $temp_file . '"', $ret);
}
$string = file_get_contents($temp_file);
if (empty($string)) {
    die('{"Error":"PDF to text conversion failed!"}');
}
// The text is split into pages on the form-feed character.
$pages = array();
$pages = explode("\f", $string);
// Collect the zero-based indexes of pages that match the search pattern.
// A foreach replaces the former while/list()/each() loop: each() was
// deprecated in PHP 7.2 and removed in PHP 8.0. The iteration order and
// the key/value pairs are the same.
$output_pages = array();
foreach ($pages as $page_num => $page_str) {
    if (preg_match("/{$search_term}/ui", $page_str) > 0) {
        $output_pages[] = $page_num;
    }
}
$final_pages = array();
foreach ($output_pages as $output_page) {
// Reset the per-file metadata fields before processing the next PDF.
$uid = array();
$editor = '';
$reference_type = 'article';
$publisher = '';
$place_published = '';
$doi = '';
$authors_ascii = '';
$title_ascii = '';
$abstract_ascii = '';
$unpacked_files = array();
$response = array();
// Remove a leftover text file for index $i so that the existence check
// after the conversion only sees fresh output.
if (file_exists($temp_dir . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt")) {
    unlink($temp_dir . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt");
}
########## extract text from pdf ##########
// NOTE(review): there is no space between select_pdftotext() and the first
// quoted argument; presumably the helper returns a trailing space. Confirm.
system(select_pdftotext() . '"' . $file . '" "' . $temp_dir . DIRECTORY_SEPARATOR . 'librarian_temp' . $i . '.txt"', $ret);
// NOTE(review): $string is not reset above; if this runs in a loop and the
// conversion produces no file, a value from a previous pass would be reused. Confirm.
if (file_exists($temp_dir . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt")) {
    $string = file_get_contents($temp_dir . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt");
}
if (empty($string)) {
    // No text could be extracted. With ?failed=1 the file is still recorded
    // through record_unknown(); otherwise only a message is prepared.
    if (isset($_GET['failed']) && $_GET['failed'] == '1') {
        database_connect($database_path, 'library');
        record_unknown($dbHandle, $orig_filename, $string, $file, $userID);
        $put = basename($orig_filename) . ": Recorded as unknown. Full text not indexed (copying disallowed).<br>";
    } else {
        $put = basename($orig_filename) . ": copying disallowed.<br>";
    }
} else {
    // Replace every run of characters other than TAB, LF, CR,
    // U+0020-U+D7FF and U+E000-U+FFFD with one space.
    $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
    // NOTE(review): $order is read here and overwritten on the next line;
    // its value at this point is set outside this excerpt. Confirm that it
    // is the line-break list on every pass.
    $string = str_replace($order, ' ', $string);
    $order = array("–", "—");
// Reset the per-file metadata fields before processing the next PDF.
$uid = array();
$editor = '';
$reference_type = 'article';
$publisher = '';
$place_published = '';
$doi = '';
$authors_ascii = '';
$title_ascii = '';
$abstract_ascii = '';
$unpacked_files = array();
$response = array();
// Remove a leftover text file for index $i so that the existence check
// after the conversion only sees fresh output.
if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt")) {
    unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt");
}
########## extract text from pdf ##########
// Writes UTF-8 text to <IL_TEMP_PATH>/librarian_temp<$i>.txt; exit status in $ret.
system(select_pdftotext() . ' -enc UTF-8 "' . $file . '" "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'librarian_temp' . $i . '.txt"', $ret);
// NOTE(review): $string is not reset above; if this runs in a loop and the
// conversion produces no file, a value from a previous pass would be reused. Confirm.
if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt")) {
    $string = file_get_contents(IL_TEMP_PATH . DIRECTORY_SEPARATOR . "librarian_temp" . $i . ".txt");
}
if (empty($string)) {
    // No text could be extracted. With ?failed=1 the file is still recorded
    // through record_unknown(); otherwise only a message is prepared.
    if (isset($_GET['failed']) && $_GET['failed'] == '1') {
        database_connect(IL_DATABASE_PATH, 'library');
        record_unknown($dbHandle, $orig_filename, $string, $file, $userID);
        $put = basename($orig_filename) . ": Recorded as unknown. Full text not indexed (copying disallowed).<br>";
    } else {
        $put = basename($orig_filename) . ": copying disallowed.<br>";
    }
} else {
    // Replace every run of characters other than TAB, LF, CR,
    // U+0020-U+D7FF and U+E000-U+FFFD with one space.
    $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
    // NOTE(review): $order is read here and overwritten on the next line;
    // its value at this point is set outside this excerpt. Confirm that it
    // is the line-break list on every pass.
    $string = str_replace($order, ' ', $string);
    $order = array("–", "—");
<?php
// Self-test endpoint for the external binaries. The "binary" query
// parameter selects which tool to run against test.pdf; the script prints
// "OK" when the tool produced the expected output and ends with an empty
// response otherwise.
include_once 'data.php';
include_once 'functions.php';
session_write_close();
// NOTE(review): $_GET['binary'] is read without an isset() check.
if ($_GET['binary'] == 'pdftotext') {
    // Success means the converter created test.txt in the temp directory.
    exec(select_pdftotext() . ' test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.txt"');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.txt')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.txt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfinfo') {
    // Success means the command printed at least one line.
    exec(select_pdfinfo() . ' test.pdf', $output);
    if (!empty($output)) {
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftohtml') {
    // Success means test.xml appeared in the temp directory.
    exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . $temp_dir . DIRECTORY_SEPARATOR . 'test"');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.xml')) {
        unlink($temp_dir . DIRECTORY_SEPARATOR . 'test.xml');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'ghostscript') {
    // Renders the first page only to test.png in the temp directory.
    exec(select_ghostscript() . ' -sDEVICE=png16m -r15 -dTextAlphaBits=1 -dGraphicsAlphaBits=1 -dFirstPage=1 -dLastPage=1 -o "' . $temp_dir . DIRECTORY_SEPARATOR . 'test.png" test.pdf');
    if (file_exists($temp_dir . DIRECTORY_SEPARATOR . 'test.png')) {
<?php
// Self-test endpoint for the external binaries. The "binary" query
// parameter selects which tool to run against test.pdf; the script prints
// "OK" when the tool produced the expected output and ends with an empty
// response otherwise.
include_once 'data.php';
include_once 'functions.php';
session_write_close();
// NOTE(review): $_GET['binary'] is read without an isset() check.
if ($_GET['binary'] == 'pdftotext') {
    // Success means the converter created test.txt in IL_TEMP_PATH.
    exec(select_pdftotext() . ' -enc UTF-8 test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt"');
    if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt')) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.txt');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfinfo') {
    // Success means the command printed at least one line.
    exec(select_pdfinfo() . ' test.pdf', $output);
    if (!empty($output)) {
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdftohtml') {
    // Success means test.xml appeared in IL_TEMP_PATH.
    exec(select_pdftohtml() . ' -q -noframes -enc UTF-8 -nomerge -c -xml test.pdf "' . IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test"');
    if (file_exists(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml')) {
        unlink(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.xml');
        die('OK');
    } else {
        die;
    }
} elseif ($_GET['binary'] == 'pdfdetach') {
    // Saves all attachments of test.pdf into IL_TEMP_PATH; the check below
    // looks for a readable, non-empty test.odt there.
    exec(select_pdfdetach() . ' -saveall -o "' . IL_TEMP_PATH . '" test.pdf');
    if (is_readable(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') && filesize(IL_TEMP_PATH . DIRECTORY_SEPARATOR . 'test.odt') > 0) {
// Reset the per-file metadata fields before processing the next PDF.
$uid = '';
$editor = '';
$reference_type = 'article';
$publisher = '';
$place_published = '';
$doi = '';
$authors_ascii = '';
$title_ascii = '';
$abstract_ascii = '';
$unpacked_files = array();
// Per-user, per-index temporary text file. The name contains the "user"
// request parameter.
$temp_file = $temp_dir . DIRECTORY_SEPARATOR . $_GET['user'] . "_librarian_temp" . $i . ".txt";
// Remove a leftover file so that the existence check after the conversion
// only sees fresh output.
if (file_exists($temp_file)) {
    unlink($temp_file);
}
########## extract text from pdf ##########
// escapeshellarg() instead of hand-written double quotes: $temp_file embeds
// request data, and inside "..." the shell still expands $(...) and
// backticks, while a literal " would end the argument. As before, nothing
// is inserted between select_pdftotext() and the first argument.
// NOTE(review): escapeshellarg() is locale-sensitive; LC_CTYPE must be
// UTF-8 capable if paths may contain non-ASCII characters.
system(select_pdftotext() . escapeshellarg($file) . ' ' . escapeshellarg($temp_file), $ret);
// NOTE(review): $string is not reset above; if this runs in a loop and the
// conversion produces no file, a value from a previous pass would be reused. Confirm.
if (file_exists($temp_file)) {
    $string = file_get_contents($temp_file);
}
if (empty($string)) {
    // No text could be extracted. With ?failed=1 the file is still recorded
    // through record_unknown(); either way a line is appended to the log.
    if (isset($_GET['failed']) && $_GET['failed'] == '1') {
        database_connect($database_path, 'library');
        record_unknown($dbHandle, '', $string, $file, $userID);
        $put = " ({$i}) " . basename($file) . ": Recorded into category !unknown. Full text not indexed (copying disallowed).<br>";
        file_put_contents($log, $put, FILE_APPEND);
    } else {
        $put = " ({$i}) " . basename($file) . ": copying disallowed.<br>";
        file_put_contents($log, $put, FILE_APPEND);
    }
} else {
    // Replace every run of characters other than TAB, LF, CR,
    // U+0020-U+D7FF and U+E000-U+FFFD with one space.
    $string = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $string);
$result = null;
} // Closes a block that was opened before this excerpt.
// Work out whether the imported record links to a local PDF and, if so,
// copy it into the library folder and prepare its text for indexing.
$file_to_copy = '';
if (isset($file_to_copy_match[0])) {
    // Everything from the last ":" onward, upper-cased, is treated as the type tag.
    $description = strtoupper(strrchr($file_to_copy_match[0], ':'));
    if ($description == ':PDF') {
        // Keep the text after the first ":" and before the last ":".
        // The negative length (last-colon position minus string length)
        // drops the trailing ":PDF" part.
        $file_to_copy = substr($file_to_copy_match[0], strpos($file_to_copy_match[0], ':') + 1, strrpos($file_to_copy_match[0], ':') - strlen($file_to_copy_match[0]));
    }
}
if (is_file($file_to_copy) && is_readable($file_to_copy)) {
    // Look up the library file name of record $last_id; the copy is stored
    // under that name with a "temp-" prefix.
    $result = $dbHandle->query("SELECT file FROM library WHERE id=" . $last_id);
    $pdf_filename = $result->fetchColumn();
    $pdf_filename = 'temp-' . $pdf_filename;
    $result = null;
    copy($file_to_copy, dirname(__FILE__) . DIRECTORY_SEPARATOR . 'library' . DIRECTORY_SEPARATOR . $pdf_filename);
    // NOTE(review): $file_to_copy is placed inside a double-quoted shell
    // argument; if it can come from imported data, consider escapeshellarg().
    // There is also no space after select_pdftotext(); presumably the helper
    // returns a trailing space. Confirm.
    system(select_pdftotext() . '"' . $file_to_copy . '" "' . $temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . '.txt"');
    if (is_file($temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . ".txt")) {
        // Comma-separated stop-word list. The literal is wrapped across
        // lines, so entries are trimmed after splitting (see below).
        $stopwords = "a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, currently, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, during, each, edu, eg, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, 
far, few, followed, following, follows, for, former, formerly, from, further, furthermore, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i'd, i'll, i'm, i've, ie, if, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, just, keep, keeps, kept, know, knows, known, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, mainly, many, may, maybe, me, mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, myself, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, obviously, of, off, often, oh, ok, okay, old, on, once, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, particular, particularly, per, perhaps, placed, please, possible, presumably, probably, provides, que, quite, qv, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, said, same, saw, say, saying, says, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, several, shall, she, should, shouldn't, since, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, 
these, they, they'd, they'll, they're, they've, think, this, thorough, thoroughly, those, though, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, value, various, very, via, viz, vs, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, would, wouldn't, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves";
        // Split on "," and trim each entry. Splitting on ", " left a leading
        // newline on the first word of every wrapped line, so those words
        // could never match as stop-words.
        $stopwords = array_map('trim', explode(',', $stopwords));
        // Read the extracted text, then delete the temporary file.
        $string = file_get_contents($temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
        unlink($temp_dir . DIRECTORY_SEPARATOR . $pdf_filename . ".txt");
        if (!empty($string)) {
            // Build one case-insensitive whole-word pattern per stop-word:
            // join with "\b/ui /\b", wrap the ends, then split on spaces.
            $patterns = join("\\b/ui /\\b", $stopwords);
            $patterns = "/\\b{$patterns}\\b/ui";
            $patterns = explode(" ", $patterns);
            // Flatten line breaks, remove stop-words, collapse whitespace.
            $order = array("\r\n", "\n", "\r");
            $string = str_replace($order, ' ', $string);
            $string = preg_replace($patterns, '', $string);
            $string = preg_replace('/\\s{2,}/ui', ' ', $string);
            $fulltext_array = array();
            $fulltext_unique = array();