/**
 * Runs a full-text search over the documents of the current collection.
 *
 * Every document's 'content' node is ranked against the query with ts_rank
 * and excerpted with ts_headline; rows whose rank is above 0.005 are turned
 * into Result objects, in the order returned by the query (rank descending).
 *
 * The query text and the collection id are passed as bound parameters via
 * pg_query_params() rather than spliced into the SQL text, so quotes in the
 * input cannot alter the statement.
 *
 * @param string $query Search expression, handed to to_tsquery('english', ...).
 * @return array Zero-based list of Result objects; empty when $query is empty
 *               or the database call fails.
 */
public function search($query)
{
    $results = array();
    if ($query == "") {
        return $results;
    }

    // $1 = search expression, $2 = collection id (see the params array below).
    $SQL = "SELECT ts_rank( to_tsvector('english', n.content), to_tsquery('english', \$1)) as rank ";
    $SQL .= ", d.document_id , n1.content as title, ts_headline('english', n.content, to_tsquery('english', \$1)) as content, d.url, d.content_type ";
    $SQL .= " from document d, node n, node n1 where n1.document_id=d.document_id and d.collection_id = \$2 ";
    $SQL .= " and n1.name='title' and n.document_id = d.document_id and n.name='content' order by rank desc";

    $res = pg_query_params($SQL, array($query, $this->collectionId));
    if ($res === false) {
        // Query failed (e.g. malformed tsquery expression): nothing to iterate.
        return $results;
    }

    while ($row = pg_fetch_array($res)) {
        $rank = $row['rank'];
        if (!($rank > 0.005)) {
            // Below the relevance threshold: skip the row.
            continue;
        }

        // Keep the first 400 bytes of the headline; starting at offset 0 so
        // the leading character of the excerpt is not lost.
        $excerpt = substr($row['content'], 0, 400);

        $result = new Result();
        $result->id = count($results) + 1; // 1-based position in the result list
        $result->documentId = $row['document_id'];
        $result->url = urldecode($row['url']);
        $result->rank = $rank;
        $result->title = HTMLRobot::clean(html_entity_decode($row['title']));
        $result->fragment = HTMLRobot::clean(html_entity_decode($excerpt));
        $results[] = $result;
    }

    return $results;
}
/**
 * Prepares a document's title and content, then stores its hash and nodes.
 *
 * Unless the content type is "application/pdf", the content is entity-decoded,
 * the title is taken from HTMLRobot::findTitle() and re-encoded with
 * htmlentities(), and the content is passed through HTMLRobot::clean().
 * An empty title falls back to the document URL. The md5 of the resulting
 * content is handed to setMD5() and the document to saveNodes().
 *
 * Any Exception raised along the way is written to the collection log and
 * not rethrown.
 *
 * @param object $document Document exposing id, url, title, content and contentType.
 * @return void
 */
protected function analyze($document)
{
    try {
        $isPdf = ($document->contentType == "application/pdf");

        if (!$isPdf) {
            // Work on decoded markup so the title can be located, then
            // re-encode the title and strip the content down.
            $document->content = html_entity_decode($document->content, ENT_QUOTES);
            $document->title = HTMLRobot::findTitle($document->content);
            $document->title = htmlentities($document->title, ENT_QUOTES);
            $document->content = HTMLRobot::clean($document->content);
        }

        // No usable title: show the URL instead.
        if ($document->title == "") {
            $document->title = $document->url;
        }

        $contentHash = md5($document->content);
        $this->setMD5($document->id, $contentHash);
        $this->saveNodes($document);
    } catch (Exception $error) {
        $this->collection->log("failed adding {$document->url} " . $error->getMessage());
    }
}
/**
 * Adds a document to the index of the current account.
 *
 * The document is skipped (returns false) when its URL is already known,
 * when the URL matches the "indexerfilter" filter, or when a document with
 * the same content hash exists. Otherwise the title and content are
 * normalised, the index info is updated with the content md5, and one
 * 'title' and one 'content' row are inserted into the facet table.
 *
 * Values placed in the INSERT statements are escaped with
 * mysql_real_escape_string() so quotes in titles or content cannot break or
 * alter the statement.
 *
 * A failing INSERT terminates the script via die(); any Exception is reported
 * with print and not rethrown.
 *
 * @param object $document Document exposing id, url, title, content and contenttype.
 * @return false|null false when the document was skipped; no explicit value otherwise.
 */
public function add($document)
{
    try {
        if (URL::hasDuplicate($this->accountId, $document->url)) {
            return false;
        }
        if (URL::filter($this->accountId, $document->url, "indexerfilter")) {
            return false;
        }

        // NOTE(review): the property is spelled `contenttype` here but
        // `contentType` in analyze(); confirm which one the document class
        // declares. The same applies to the findTitle() argument list below.
        if ($document->contenttype != "application/pdf") {
            // default to HTML
            $document->content = html_entity_decode($document->content, ENT_QUOTES);
            $document->title = HTMLRobot::findTitle($this->accountId, $document->content);
            $document->title = htmlentities($document->title, ENT_QUOTES);
            $document->content = HTMLRobot::clean($document->content);
        }

        // default title
        if ($document->title == "") {
            $document->title = $document->url;
        }

        $md5 = md5($document->content);
        // Static call on the Document class with this indexer's account id.
        if (Document::hasDuplicateContent($this->accountId, $md5)) {
            return false;
        }
        $this->update_index_info($document->id, $md5);

        if (strlen($document->content) > 0 && strlen($document->url) > 0) {
            // One facet row per field, title first, then content.
            $facets = array(
                'title' => $document->title,
                'content' => $document->content,
            );
            foreach ($facets as $name => $value) {
                $SQL = "INSERT INTO facet(account_id,document_id,name,content) values('"
                    . mysql_real_escape_string($this->accountId) . "','"
                    . mysql_real_escape_string($document->id) . "','"
                    . $name . "','"
                    . mysql_real_escape_string($value) . "');";
                mysql_query($SQL) or die(mysql_error());
            }
        } else {
            print $document->url . " empty doc <br />\r\n";
        }
    } catch (Exception $e) {
        print "failed adding {$document->url}\r\n";
    }
}