/**
 * Index a CSV data file into a Xapian database, one document per CSV row.
 *
 * For every row the TITLE and DESCRIPTION fields are indexed as free text
 * (both with a field prefix and without one), each entry of the
 * semicolon-separated MATERIALS field becomes a boolean filter term, and the
 * whole row is stored JSON-encoded as the document data.
 *
 * Relies on the helpers open_file(), get_csv_headers() and parse_csv_row(),
 * which are defined outside this function.
 *
 * @param string $datapath Path of the CSV file to read.
 * @param string $dbpath   Path of the Xapian database to create or open.
 */
function index($datapath, $dbpath) {
    // Create or open the database we're going to be writing to.
    $db = new XapianWritableDatabase($dbpath, Xapian::DB_CREATE_OR_OPEN);

    // Set up a TermGenerator that we'll use in indexing.
    $termgenerator = new XapianTermGenerator();
    $termgenerator->set_stemmer(new XapianStem('en'));

    // Open the file.
    $fH = open_file($datapath);

    // Read the header row in.
    $headers = get_csv_headers($fH);

    while (($row = parse_csv_row($fH, $headers)) !== false) {
        // $row maps field name to value; the names come from the header row.
        // Only the fields that are actually indexed are pulled out here.
        $description = $row['DESCRIPTION'];
        $title = $row['TITLE'];
        $identifier = $row['id_NUMBER'];

        // We make a document and tell the term generator to use this.
        $doc = new XapianDocument();
        $termgenerator->set_document($doc);

        // Index each field with a suitable prefix.
        $termgenerator->index_text($title, 1, 'S');
        $termgenerator->index_text($description, 1, 'XD');

        // Index fields without prefixes for general search. The term position
        // is bumped between the two fields so that a phrase cannot match
        // across the title/description boundary.
        $termgenerator->index_text($title);
        $termgenerator->increase_termpos();
        $termgenerator->index_text($description);

        ### Start of new indexing code.

        // Index the MATERIALS field, splitting on semicolons. A row without
        // that column is treated as having no materials.
        $materials_field = isset($row['MATERIALS']) ? $row['MATERIALS'] : '';
        foreach (explode(";", $materials_field) as $material) {
            $material = strtolower(trim($material));
            if ($material !== '') {
                $doc->add_boolean_term('XM' . $material);
            }
        }

        ### End of new indexing code.

        // Store all the fields for display purposes.
        $doc->set_data(json_encode($row));

        // We use the identifier to ensure each object ends up in the
        // database only once no matter how many times we run the indexer.
        // The ID term is only a lookup key, so it is added as a boolean term
        // (wdf 0) and does not contribute to the document length.
        $idterm = "Q" . $identifier;
        $doc->add_boolean_term($idterm);
        $db->replace_document($idterm, $doc);
    }

    // Release the input file if open_file() handed back a stream that is
    // still open.
    if (is_resource($fH)) {
        fclose($fH);
    }
}
/**
 * Index file contents, adding one Xapian document per paragraph.
 *
 * A paragraph is a run of consecutive non-blank lines; its lines are
 * right-trimmed and joined with single spaces. The final paragraph is indexed
 * even when the input does not end with a blank line.
 *
 * @param array  $lines     The array of the file contents, each entry corresponds to a new line (included)
 * @param string $file_path Path of the source file, attached to every paragraph document via add_value()
 *
 * @return false|null false when $lines is empty; otherwise no value is returned
 */
protected function _index($lines, $file_path) {
    if (empty($lines)) {
        return false;
    }

    // Open the database for update, creating a new database if necessary.
    $database = new XapianWritableDatabase(self::$_database_path, Xapian::DB_CREATE_OR_OPEN);

    $indexer = new XapianTermGenerator();
    $stemmer = new XapianStem("english");
    $indexer->set_stemmer($stemmer);

    // Append a sentinel blank line so that a trailing paragraph which is not
    // followed by a blank line is flushed by the same code path as the rest.
    $lines[] = '';

    $para = '';
    foreach ($lines as $line) {
        $line = rtrim($line);

        if ($line == "" && $para != "") {
            // We've reached the end of a paragraph, so index it.
            $doc = new XapianDocument();
            $doc->set_data($para);

            // Add meta-information to the entry.
            // NOTE(review): Xapian documents add_value() as taking a numeric
            // value slot; confirm the bindings handle the string 'file' as
            // intended.
            $doc->add_value('file', $file_path);

            $indexer->set_document($doc);
            $indexer->index_text($para);

            // Add the document to the database.
            $database->add_document($doc);

            $para = "";
        } else {
            // Accumulate the line into the current paragraph; blank lines
            // between paragraphs append nothing.
            if ($para != "") {
                $para .= " ";
            }
            $para .= $line;
        }
    }

    // Set the database handle to null to ensure that it gets closed
    // down cleanly or uncommitted changes may be lost.
    $database = null;
}
/**
 * Build (or update) a Xapian database from a CSV data file.
 *
 * Each CSV row becomes one document: TITLE and DESCRIPTION are indexed both
 * under a field prefix and unprefixed, the full row is stored JSON-encoded as
 * the document data, and a "Q"-prefixed id term keeps one document per
 * id_NUMBER across repeated runs.
 *
 * @param string $datapath Path of the CSV file to read.
 * @param string $dbpath   Path of the Xapian database to create or open.
 */
function index($datapath, $dbpath) {
    // Writable database handle; the database is created when missing.
    $database = new XapianWritableDatabase($dbpath, Xapian::DB_CREATE_OR_OPEN);

    // One term generator, with an English stemmer, is reused for all rows.
    $english = new XapianStem('english');
    $generator = new XapianTermGenerator();
    $generator->set_stemmer($english);

    // Open the input and consume its header row.
    $fH = open_file($datapath);
    $headers = get_csv_headers($fH);

    for (;;) {
        // Each record maps header name => value for one CSV row.
        $record = parse_csv_row($fH, $headers);
        if ($record === false) {
            break;
        }

        // Only these three fields take part in indexing.
        $objectId = $record['id_NUMBER'];
        $heading = $record['TITLE'];
        $body = $record['DESCRIPTION'];

        // Fresh document for this row, wired into the term generator.
        $document = new XapianDocument();
        $generator->set_document($document);

        // Field-specific terms, each under its own prefix.
        $generator->index_text($heading, 1, 'S');
        $generator->index_text($body, 1, 'XD');

        // Unprefixed terms for general search, with a position gap between
        // the two fields.
        $generator->index_text($heading);
        $generator->increase_termpos();
        $generator->index_text($body);

        // Keep every field of the row available for display.
        $document->set_data(json_encode($record));

        // Replace by unique id term so that re-running the indexer never
        // duplicates an object.
        $uniqueTerm = "Q" . $objectId;
        $document->add_boolean_term($uniqueTerm);
        $database->replace_document($uniqueTerm, $document);
    }
}
print "the PHP interpreter, but you're using the '".php_sapi_name()."' version\n"; exit(1); } include "php5/xapian.php"; if ($argc != 2) { print "Usage: {$argv[0]} PATH_TO_DATABASE\n"; exit(1); } try { // Open the database for update, creating a new database if necessary. $database = new XapianWritableDatabase($argv[1], Xapian::DB_CREATE_OR_OPEN); $indexer = new XapianTermGenerator(); $stemmer = new XapianStem("english"); $indexer->set_stemmer($stemmer); $para = ''; $lines = file("php://stdin"); foreach ($lines as $line) { $line = rtrim($line); if ($line == "" && $para != "") { // We've reached the end of a paragraph, so index it. $doc = new XapianDocument(); $doc->set_data($para); $indexer->set_document($doc); $indexer->index_text($para);
$enquire->get_mset(0, 10); $values = array(); foreach ($matchspy->values_begin() as $k => $term) { $values[$term] = $k->get_termfreq(); } $expected = array("ABB" => 1, "ABC" => 1, "ABC" => 1, "ABCD" => 1, "ABCÿ" => 1); if ($values != $expected) { print "Unexpected matchspy values():\n"; var_dump($values); var_dump($expected); print "\n"; exit(1); } # Regression test for SWIG bug - it was generating "return $r;" in wrapper # functions which didn't set $r. $indexer = new XapianTermGenerator(); $doc = new XapianDocument(); $indexer->set_document($doc); $indexer->index_text("I ask nothing in return"); $indexer->index_text_without_positions("Tea time"); $indexer->index_text("Return in time"); $s = ''; foreach ($doc->termlist_begin() as $term) { $s .= $term . ' '; } if ($s !== 'ask i in nothing return tea time ') { print "PHP Iterator wrapping of TermIterator doesn't work ({$s})\n"; exit(1); } $s = ''; foreach ($doc->termlist_begin() as $k => $term) {
<?php /* * Simple indexer to stick all the Dr Who subtitles into Xapian * Oh, how I do like Xapian * * Matthew Somerville, http://www.dracos.co.uk/ * Version 1.5, now I have all modern series */ $colors = array('#fefe00' => 'yellow', '#ffff00' => 'yellow', '#00ffff' => 'cyan', '#00fffd' => 'cyan', '#00fefc' => 'cyan', '#ededed' => 'white', '#ececec' => 'white', '#00ff00' => 'green', '#00fe00' => 'green', '#ffffff' => 'white'); include '/usr/share/php/xapian.php'; include './config.php'; $db = new XapianWritableDatabase(XAPIAN_DIR . 'write', Xapian::DB_CREATE_OR_OPEN); $indexer = new XapianTermGenerator(); $stemmer = new XapianStem("english"); $indexer->set_flags(128); $indexer->set_database($db); # For spelling $indexer->set_stemmer($stemmer); # Read in files array_shift($argv); # Script name foreach ($argv as $file) { print "Processing {$file}...\n"; preg_match('#/(\\d+)-(\\d+?)\\.xml$#', $file, $m); $series = $m[1]; $epid = $m[2] + 0; $file = file_get_contents($file); # <p begin = "00:01:13.555" dur="00:00:05.00"><span tts:color="#fefe00" tts:textAlign="center"> I'm happy right now, thanks. </span><br/></p> # <p begin="00:00:54.578" end="00:00:57.198"><span tts:color="#ececec" tts:textAlign="left"> BIG BEN STRIKES </span><br/></p> # <p begin="00:02:22.008" end="00:02:27.628"><span tts:color="#ececec" tts:textAlign="center"> Dear Santa, thank you for the dolls </span><br/><span tts:color="#ececec" tts:textAlign="center"> and pencils and the fish. </span></p>
rmr($lastcfile); // XXX: usually I don't find myself saying this, but this would be a great time to have a goto instruction if(!file_exists($indexfile)) rmr($indexfile); } } if(!file_exists($lastcfile)) { // if we've never left off before, just start at 1. it'll figure itself out. $lastc = 0; } // ok, here we go try { // set up the Xapian environment $database = new XapianWritableDatabase($indexfile, Xapian::DB_CREATE_OR_OPEN); $indexer = new XapianTermGenerator(); $stemmer = new XapianStem("english"); $indexer->set_stemmer($stemmer); $comments = 0; while(true) { // get a batch of comments $sql = "SELECT * FROM offensive_comments WHERE id != $lastc AND id > $lastc AND id <= ".($lastc + $stepsize) ." ORDER BY id ASC"; $res = tmbo_query($sql); while($row = mysql_fetch_assoc($res)) { // update the last comment we've seen $lastc = $row["id"]; // skip comments with no comment (votes)