コード例 #1
0
/**
 * Index a CSV data file into a Xapian database, one document per CSV row.
 *
 * For every row the TITLE and DESCRIPTION columns are indexed as free text,
 * both with a field prefix ('S' and 'XD') and without a prefix for general
 * search. Each semicolon-separated entry of the MATERIALS column is added as
 * a lower-cased boolean filter term prefixed with 'XM'. The whole row is
 * stored JSON-encoded as the document data.
 *
 * The id_NUMBER column, prefixed with 'Q', is used as the unique term passed
 * to replace_document(), so re-running the indexer replaces documents rather
 * than duplicating them.
 *
 * @param mixed $datapath Passed unchanged to open_file(); the CSV source.
 * @param mixed $dbpath   Passed unchanged to XapianWritableDatabase; the
 *                        database is created if it does not exist yet.
 */
function index($datapath, $dbpath)
{
    // Create or open the database we're going to be writing to.
    $db = new XapianWritableDatabase($dbpath, Xapian::DB_CREATE_OR_OPEN);
    // Set up a TermGenerator that we'll use in indexing.
    $termgenerator = new XapianTermGenerator();
    $termgenerator->set_stemmer(new XapianStem('en'));
    // Open the file.
    $fH = open_file($datapath);
    // Read the header row in.
    $headers = get_csv_headers($fH);
    while (($row = parse_csv_row($fH, $headers)) !== false) {
        // '$row' maps field name to value, using the first-row headers.
        // Only DESCRIPTION, TITLE, id_NUMBER and MATERIALS are used for
        // indexing; every column is still stored via set_data() below.
        $description = $row['DESCRIPTION'];
        $title = $row['TITLE'];
        $identifier = $row['id_NUMBER'];
        // We make a document and tell the term generator to use this.
        $doc = new XapianDocument();
        $termgenerator->set_document($doc);
        // Index each field with a suitable prefix.
        $termgenerator->index_text($title, 1, 'S');
        $termgenerator->index_text($description, 1, 'XD');
        // Index fields without prefixes for general search.
        $termgenerator->index_text($title);
        // Leave a gap in term positions between the two fields.
        $termgenerator->increase_termpos();
        $termgenerator->index_text($description);
        // Index the MATERIALS field, splitting on semicolons. Entries are
        // trimmed and lower-cased; empty entries are ignored.
        $materials = explode(";", $row['MATERIALS']);
        foreach ($materials as $material) {
            $material = strtolower(trim($material));
            if ($material !== '') {
                $doc->add_boolean_term('XM' . $material);
            }
        }
        // Store all the fields for display purposes.
        $doc->set_data(json_encode($row));
        // We use the identifier to ensure each object ends up in the
        // database only once no matter how many times we run the indexer.
        // It is added as a boolean term so that the id does not contribute
        // to the document's length.
        $idterm = "Q" . $identifier;
        $doc->add_boolean_term($idterm);
        $db->replace_document($idterm, $doc);
    }
}
コード例 #2
0
ファイル: xapian.class.php プロジェクト: bqq1986/efront
 /**
  * Index file contents, creating one Xapian document per paragraph.
  *
  * Consecutive non-blank lines are right-trimmed and joined with single
  * spaces into a paragraph; a blank line ends the paragraph. The final
  * paragraph is indexed even when the input does not end with a blank line.
  *
  * @param array $lines     The array of the file contents, each entry corresponds to a new line (included)
  * @param mixed $file_path Stored on every created document through add_value()
  *
  * @return false|null False when $lines is empty; no value is returned otherwise
  */
 protected function _index($lines, $file_path)
 {
     if (empty($lines)) {
         return false;
     }
     // Open the database for update, creating a new database if necessary.
     $database = new XapianWritableDatabase(self::$_database_path, Xapian::DB_CREATE_OR_OPEN);
     $indexer = new XapianTermGenerator();
     $stemmer = new XapianStem("english");
     $indexer->set_stemmer($stemmer);
     // Append a blank sentinel line so that a trailing paragraph without a
     // terminating blank line is flushed as well. $lines is a by-value
     // copy, so the caller's array is left untouched.
     $lines[] = '';
     $para = '';
     foreach ($lines as $line) {
         $line = rtrim($line);
         if ($line !== '') {
             // Still inside a paragraph: append, space-separated.
             $para = ($para === '') ? $line : $para . ' ' . $line;
             continue;
         }
         if ($para === '') {
             // Blank line outside of any paragraph: nothing to flush.
             continue;
         }
         // We've reached the end of a paragraph, so index it.
         $doc = new XapianDocument();
         $doc->set_data($para);
         // Add meta-information to the entry.
         // NOTE(review): Xapian value slots are numeric; 'file' is passed
         // as the slot here. Confirm which slot this actually resolves to.
         $doc->add_value('file', $file_path);
         $indexer->set_document($doc);
         $indexer->index_text($para);
         // Add the document to the database.
         $database->add_document($doc);
         $para = '';
     }
     // Set the database handle to null to ensure that it gets closed
     // down cleanly or uncommitted changes may be lost.
     $database = null;
 }
コード例 #3
0
/**
 * Build or refresh a Xapian index from a CSV file, one document per row.
 *
 * TITLE and DESCRIPTION are indexed as text, first under the 'S' and 'XD'
 * prefixes and then unprefixed. The complete row is stored JSON-encoded as
 * the document data, and 'Q' followed by id_NUMBER serves as the unique
 * term handed to replace_document().
 *
 * @param mixed $datapath Handed to open_file() to obtain the CSV source.
 * @param mixed $dbpath   Handed to XapianWritableDatabase (create-or-open).
 */
function index($datapath, $dbpath)
{
    // Writable database handle; the database is created when missing.
    $database = new XapianWritableDatabase($dbpath, Xapian::DB_CREATE_OR_OPEN);

    // A single term generator with an 'english' stemmer serves every row.
    $indexer = new XapianTermGenerator();
    $englishStemmer = new XapianStem('english');
    $indexer->set_stemmer($englishStemmer);

    // The first CSV row supplies the column names for all later rows.
    $handle = open_file($datapath);
    $columns = get_csv_headers($handle);

    for (;;) {
        $record = parse_csv_row($handle, $columns);
        if ($record === false) {
            // No more rows to read.
            break;
        }

        // Only these three columns drive the indexing itself.
        $body = $record['DESCRIPTION'];
        $heading = $record['TITLE'];
        $objectId = $record['id_NUMBER'];

        // Fresh document for this row; the generator writes into it.
        $document = new XapianDocument();
        $indexer->set_document($document);

        // Prefixed passes first: title under 'S', description under 'XD'.
        foreach (array('S' => $heading, 'XD' => $body) as $prefix => $text) {
            $indexer->index_text($text, 1, $prefix);
        }

        // Unprefixed passes for general search, with a position gap
        // between the two fields.
        $indexer->index_text($heading);
        $indexer->increase_termpos();
        $indexer->index_text($body);

        // Keep every column available for display.
        $document->set_data(json_encode($record));

        // Replacing by the unique id term keeps repeated runs from
        // producing duplicate documents.
        $uniqueTerm = "Q{$objectId}";
        $document->add_boolean_term($uniqueTerm);
        $database->replace_document($uniqueTerm, $document);
    }
}
コード例 #4
0
ファイル: simpleindex.php5 プロジェクト: nsmetanin/xapian
    print "the PHP interpreter, but you're using the '".php_sapi_name()."' version\n";
    exit(1);
}

include "php5/xapian.php";

// Require exactly one command-line argument. $argc also counts the script
// name held in $argv[0], so a value of 2 means one user-supplied argument.
if ($argc != 2) {
    // Print a usage line naming the script, then exit with status 1.
    print "Usage: {$argv[0]} PATH_TO_DATABASE\n";
    exit(1);
}

try {
    // Open the database for update, creating a new database if necessary.
    $database = new XapianWritableDatabase($argv[1], Xapian::DB_CREATE_OR_OPEN);

    $indexer = new XapianTermGenerator();
    $stemmer = new XapianStem("english");
    $indexer->set_stemmer($stemmer);

    $para = '';
    $lines = file("php://stdin");
    foreach ($lines as $line) {
	$line = rtrim($line);
	if ($line == "" && $para != "") {
	    // We've reached the end of a paragraph, so index it.
	    $doc = new XapianDocument();
	    $doc->set_data($para);

	    $indexer->set_document($doc);
	    $indexer->index_text($para);
コード例 #5
0
// Request the first 10 matches; the returned match set is discarded.
// NOTE(review): presumably this is run only so that $matchspy, set up
// outside this excerpt, gets populated - confirm against the full script.
$enquire->get_mset(0, 10);
// Collect every value reported by the match spy together with its frequency.
$values = array();
// The iteration value is used as the array key; the iteration key is the
// object that exposes get_termfreq().
foreach ($matchspy->values_begin() as $k => $term) {
    $values[$term] = $k->get_termfreq();
}
// NOTE(review): the key "ABC" appears twice below, so the second entry just
// overwrites the first. The keys may originally have contained escape
// sequences that were lost in transcription - verify against the upstream test.
$expected = array("ABB" => 1, "ABC" => 1, "ABC" => 1, "ABCD" => 1, "ABCÿ" => 1);
// Loose array comparison; on mismatch dump both arrays and exit with status 1.
if ($values != $expected) {
    print "Unexpected matchspy values():\n";
    var_dump($values);
    var_dump($expected);
    print "\n";
    exit(1);
}
# Regression test for SWIG bug - it was generating "return $r;" in wrapper
# functions which didn't set $r.
# Three pieces of text are indexed into one document (no stemmer is set on
# this term generator), then the document's term list is iterated.
$indexer = new XapianTermGenerator();
$doc = new XapianDocument();
$indexer->set_document($doc);
$indexer->index_text("I ask nothing in return");
$indexer->index_text_without_positions("Tea time");
$indexer->index_text("Return in time");
# Concatenate every term yielded by the iterator, each followed by a space.
$s = '';
foreach ($doc->termlist_begin() as $term) {
    $s .= $term . ' ';
}
# The expected string lists each distinct term once, lower-cased and in
# sorted order; any other result exits with status 1.
if ($s !== 'ask i in nothing return tea time ') {
    print "PHP Iterator wrapping of TermIterator doesn't work ({$s})\n";
    exit(1);
}
$s = '';
foreach ($doc->termlist_begin() as $k => $term) {
コード例 #6
0
ファイル: indexer.php プロジェクト: ratan203/who-said
<?php

/*
 * Simple indexer to stick all the Dr Who subtitles into Xapian
 * Oh, how I do like Xapian
 *
 * Matthew Somerville, http://www.dracos.co.uk/
 * Version 1.5, now I have all modern series
 */
# Map from hex colour codes to colour names; several near-identical codes
# collapse onto the same name.
# NOTE(review): presumably applied to the tts:color attribute of the subtitle
# spans shown in the sample markup further down - confirm in the full script.
$colors = array('#fefe00' => 'yellow', '#ffff00' => 'yellow', '#00ffff' => 'cyan', '#00fffd' => 'cyan', '#00fefc' => 'cyan', '#ededed' => 'white', '#ececec' => 'white', '#00ff00' => 'green', '#00fe00' => 'green', '#ffffff' => 'white');
# Load the Xapian PHP bindings and the local configuration.
# NOTE(review): XAPIAN_DIR is presumably defined in config.php - confirm.
include '/usr/share/php/xapian.php';
include './config.php';
# Open (creating if necessary) the writable database at XAPIAN_DIR . 'write'.
$db = new XapianWritableDatabase(XAPIAN_DIR . 'write', Xapian::DB_CREATE_OR_OPEN);
$indexer = new XapianTermGenerator();
$stemmer = new XapianStem("english");
# NOTE(review): 128 is a raw flag value; together with set_database() below
# it looks like the spelling-data setup ("For spelling" in the original
# comments) - confirm that 128 is the binding's spelling flag.
$indexer->set_flags(128);
$indexer->set_database($db);
# Stem indexed text with the "english" stemmer.
$indexer->set_stemmer($stemmer);
# Drop the first $argv entry (the script name) so that only the files to
# read in remain.
array_shift($argv);
# The loop that follows processes each remaining argument as a file.
foreach ($argv as $file) {
    print "Processing {$file}...\n";
    preg_match('#/(\\d+)-(\\d+?)\\.xml$#', $file, $m);
    $series = $m[1];
    $epid = $m[2] + 0;
    $file = file_get_contents($file);
    # <p begin = "00:01:13.555" dur="00:00:05.00"><span tts:color="#fefe00" tts:textAlign="center"> I'm happy right now, thanks. </span><br/></p>
    # <p begin="00:00:54.578" end="00:00:57.198"><span tts:color="#ececec" tts:textAlign="left"> BIG BEN STRIKES </span><br/></p>
    # <p begin="00:02:22.008" end="00:02:27.628"><span tts:color="#ececec" tts:textAlign="center"> Dear Santa, thank you for the dolls </span><br/><span tts:color="#ececec" tts:textAlign="center"> and pencils and the fish. </span></p>
コード例 #7
0
		rmr($lastcfile);
		// XXX: usually I don't find myself saying this, but this would be a great time to have a goto instruction
		if(!file_exists($indexfile))
			rmr($indexfile);
	}
}
if(!file_exists($lastcfile)) {
	// if we've never left off before, start from 0: the batch query further
	// down selects ids strictly greater than $lastc, so indexing begins with
	// the lowest existing id and figures itself out from there.
	$lastc = 0;
}

// ok, here we go
try {
	// set up the Xapian environment
    $database = new XapianWritableDatabase($indexfile, Xapian::DB_CREATE_OR_OPEN);
    $indexer = new XapianTermGenerator();
    $stemmer = new XapianStem("english");
    $indexer->set_stemmer($stemmer);
	$comments = 0;
	
	while(true) {
		// get a batch of comments
		$sql = "SELECT * FROM offensive_comments WHERE id != $lastc AND id > $lastc AND id <= ".($lastc + $stepsize)
		       ." ORDER BY id ASC";
		$res = tmbo_query($sql);
		
		while($row = mysql_fetch_assoc($res)) {
			// update the last comment we've seen
			$lastc = $row["id"];
			
			// skip comments with no comment (votes)