//first we have to figure out how many pages //are in the file. this is a rough method. //we have gs kick up an error after it opens //the file and sees how many pages there are $numpages = `{$gs} -dNODISPLAY "{$filepath}" -c quit`; $pos1 = strpos($numpages, 'through'); $numpages = substr($numpages, $pos1); $pos2 = strpos($numpages, "."); $numpages = trim(substr($numpages, 8, $pos2 - 8)); for ($page = 1; $page <= $numpages; $page++) { //gs the page and return as a string $tempstring = `{$gs} -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -dFirstPage={$page} -dLastPage={$page} -c save -f ps2ascii.ps "{$filepath}" -c quit`; //strip out all the trash from the string //$tempstring = string_clean($tempstring,$preventIndex,$keepIndex); $filetext = strtolower($tempstring) . "\n" . $filename; doindexfile($db, $filetext, $rfiles->fields['id'], $rindextable->fields['associated_table'], $rfiles->fields['ftableid'], $page); } $db->Execute("UPDATE files SET indexed=1 WHERE id=" . $rfiles->fields['id']); $pdffilecounter++; } } } echo 'Indexed file: ' . $rfiles->fields['filename'] . '.<br>'; $rfiles->MoveNext(); } // The rest just serves to report some statistics.. if (!$textfilecounter) { $textfilecounter = 0; } if (!$pdffilecounter) { $pdffilecounter = 0;
/**
 * Indexes the content of the HTML rendition of a file.
 *
 * Intended flow (currently disabled, see the note at the top of the body):
 * the HTML rendition identified by $htmlfileid is read, NUL bytes and
 * HTML/PHP tags are stripped, the text is lowercased and then handed to
 * doindexfile() together with the index table and record id.
 * This function only deals with the HTML rendition; it contains no PDF
 * handling of its own.
 *
 * @param mixed $db         database handle, passed through to file_path() and doindexfile()
 * @param mixed $tableinfo  not used by this function
 * @param mixed $indextable index table handed to doindexfile(); a falsy value means "nothing to index"
 * @param mixed $recordid   id of the record the file belongs to, handed to doindexfile()
 * @param mixed $fileid     not used by this function
 * @param mixed $htmlfileid id of the HTML rendition; a falsy value means there is nothing to read
 *
 * @return false|null false when indexing is skipped (which is currently always
 *                    the case), no explicit value otherwise
 */
function indexfile($db, $tableinfo, $indextable, $recordid, $fileid, $htmlfileid)
{
    // NOTE(review): this unconditional return switches indexing off entirely;
    // everything below is unreachable. It is kept on purpose so callers see no
    // behaviour change - confirm whether indexing should be re-enabled before
    // removing it. The code below has been repaired so that it is safe to
    // re-enable.
    return false;

    if (!$indextable) {
        return false;
    }

    // if the html file exists, we'll work with that one
    if ($htmlfileid) {
        $filetext = '';

        // fgetss() was removed in PHP 8.0. Reading the whole document and
        // stripping it in one pass also removes tags that span several lines.
        $contents = file_get_contents(file_path($db, $htmlfileid));
        if ($contents !== false) {
            $filetext = strip_tags($contents);
        }
        $filetext = strtolower($filetext);

        // The original passed an undefined $pagenr here, i.e. null plus a
        // warning. Pass the same null explicitly.
        $pagenr = null;
        doindexfile($db, $filetext, $htmlfileid, $indextable, $recordid, $pagenr);
    }
}