/**
 * Indexes the fixture site object while sys_word already holds some words.
 *
 * Six words are pre-inserted into sys_word with object_count = 1, the
 * fixture object is handed to indexer::add(), and the test then looks up a
 * sys_word row for 'test' with object_count = 3. The fetched row must not
 * compare equal to an empty array.
 */
function test_index_object_with_same_words_in_db()
{
    // Seed the word table in the same order the words are listed here.
    $seeded_words = array('this', 'is', 'test', 'title', 'content', 'a');
    foreach ($seeded_words as $seeded_word) {
        $this->db->sql_insert('sys_word', array('word' => $seeded_word, 'object_count' => 1));
    }

    indexer::add($this->site_object);

    $this->db->sql_select('sys_word', '*', array('word' => 'test', 'object_count' => 3));
    $matched_row = $this->db->fetch_row();

    $this->assertNotEqual($matched_row, array());
}
<?php session_start(); if (isset($_GET['function'])) { $i = new indexer(); require_once "mysql.class.php"; $mysql = new mySQL(); if ($_GET['function'] == 'init') { $i->init(); } if ($_GET['function'] == 'indexIt') { $result = mysql_query("select count(*) as c from `test`.`crawler_tree`"); $row = mysql_fetch_array($result); $no_of_links = $row['c']; //echo $no_of_links; $result = mysql_query("select id,url from `test`.`crawler` where ftch=1"); if ($row = mysql_fetch_array($result)) { $id = $row['id']; $url = $row['url']; $xml = simplexml_load_file("indexData/" . $id . ".xml"); $title = preg_replace('/^( )*$/', "No title", $xml->title); $linksCount = (int) $xml->out->attributes() + (int) $xml->links->attributes(); $output = mysql_fetch_array(mysql_query("select count(*) as count from test.crawler_tree where child_id={$id}")); $inLinks = $output['count']; $outXML = simplexml_load_file("../data/links/" . $_SESSION['domainID'] . ".xml"); if (!($link = $outXML->XPath("/domain/link[url = '{$url}']"))) { $link = $outXML->addChild('link'); $_SESSION['linkID'] = count($outXML); $link->addAttribute('id', count($outXML)); $link->url = $url; $link->title = $title;
Vosbox is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Vosbox. If not, see <http://www.gnu.org/licenses/>. Vosbox copyright Callan Bryant 2011-2012 <*****@*****.**> http://callanbryant.co.uk/ */

// Search endpoint: reads the 'keywords' request parameter, runs it through
// the indexer and prints the result as JSON. Any failure is reported as a
// JSON document of the form {"error":"<message>"}.

// Bound by reference so a missing 'keywords' parameter yields null instead of
// an undefined-index notice.
$keywords =& $_REQUEST['keywords'];

require_once __DIR__ . '/../../VSE/indexer.class.php';
// include original class to reconstruct each item
require_once __DIR__ . '/../../audioFile.class.php';

try {
    if (!extension_loaded('json')) {
        throw new Exception('json extension not loaded');
    }
    if (!$keywords) {
        throw new Exception('Erm, please search for something!');
    }

    $i = indexer::getInstance();
    $response = $i->search($keywords);

    header('Content-Type:application/json');
    echo json_encode($response);
} catch (Exception $e) {
    // manually throw the error, as the json ext may not be loaded
    //
    // Because json_encode() cannot be relied on here, the message is escaped
    // by hand. Without this, a message containing a double quote, a backslash
    // or a control character would produce an invalid JSON document.
    $message = strtr(
        $e->getMessage(),
        array(
            '\\' => '\\\\',
            '"'  => '\\"',
            "\n" => '\\n',
            "\r" => '\\r',
            "\t" => '\\t',
        )
    );
    // Control characters without a short escape become \u00XX sequences.
    $message = preg_replace_callback(
        '/[\x00-\x1f]/',
        function ($match) {
            return sprintf('\\u%04x', ord($match[0]));
        },
        $message
    );

    header('Content-Type:application/json');
    echo '{"error":"' . $message . '"}';
}
/**
 * Removes the index entries of a site object.
 *
 * Steps, in order:
 *  1. read the word ids linked to the object from sys_word_link;
 *  2. if there are none, do nothing further;
 *  3. decrement object_count on each of those sys_word rows;
 *  4. delete every sys_word row whose object_count is '0';
 *  5. delete the object's rows from sys_word_link.
 *
 * As a side effect the shared indexer instance gets its db property set to
 * the db_factory instance.
 *
 * @param object $site_object object providing get_id()
 */
function remove(&$site_object)
{
    $indexer =& indexer::instance();
    $indexer->db =& db_factory::instance();

    $object_id = $site_object->get_id();
    // Same filter is used for the lookup and for the final link cleanup.
    $link_filter = "WHERE object_id='{$object_id}'";

    $indexer->db->sql_exec("SELECT word_id FROM sys_word_link {$link_filter}");
    $linked_rows =& $indexer->db->get_array();
    $word_ids_array = complex_array::get_column_values('word_id', $linked_rows);

    // Nothing indexed for this object: leave both tables untouched.
    if (count($word_ids_array) < 1) {
        return;
    }

    $word_id_string = implode(',', $word_ids_array);

    $indexer->db->sql_exec("UPDATE sys_word SET object_count=( object_count - 1 ) WHERE id in ( {$word_id_string} )");
    $indexer->db->sql_exec("DELETE FROM sys_word WHERE object_count='0'");
    $indexer->db->sql_exec("DELETE FROM sys_word_link {$link_filter}");
}
<?php error_reporting(E_ALL); //error_reporting(0); ini_set("display_errors", 1); set_time_limit(0); define('INDEXLOCATION', dirname(__FILE__) . '/index/'); define('DOCUMENTLOCATION', dirname(__FILE__) . '/documents/'); include_once './classes/ranker.class.php'; include_once './classes/indexer.class.php'; include_once './classes/multifolderindex.class.php'; include_once './classes/multifolderdocumentstore.class.php'; $ranker = new ranker(); $index = new multifolderindex(); $docstore = new multifolderdocumentstore(); $indexer = new indexer($index, $docstore, $ranker); function html2txt($document) { $search = array('@<script[^>]*?>.*?</script>@si', '@<[\\/\\!]*?[^<>]*?>@si', '@<style[^>]*?>.*?</style>@siU', '@<![\\s\\S]*?--[ \\t\\n\\r]*>@', '@<style[^>]*?>.*?</style>@si', '@\\W+@si'); $text = preg_replace($search, ' ', $document); return $text; } $toindex = array(); $count = 0; foreach (new RecursiveIteratorIterator(new RecursiveDirectoryIterator('./crawler/documents/')) as $x) { $filename = $x->getPathname(); if (is_file($filename)) { $handle = fopen($filename, 'r'); $contents = fread($handle, filesize($filename)); fclose($handle); $unserialized = unserialize($contents);
<?php include_once 'connect.php'; include_once 'classes/storage.class.php'; include_once 'classes/index.class.php'; include_once 'classes/indexer.class.php'; define("SOURCE_DIR", "source/"); $storage = new storage(); $index = new index(); $indexer = new indexer($index, $storage); $file_names = scandir(SOURCE_DIR, 1); if (!$file_names) { die; } $documents = array(); foreach ($file_names as $file_name) { if ($file_name[0] != '.') { //copy file from source to storage $fp = fopen(SOURCE_DIR . $file_name, "r"); //scan file if (!$fp) { continue; } $document = array(); $offset = 0; $counter = 0; while (($line = fgets($fp)) !== false) { //if the line start with # it is document data if ($line[0] == '#') { //add document to db and get the docID $arr = explode(" = ", substr($line, 1));