Beispiel #1
0
 /**
  * Full-text search over the documents of the current collection.
  *
  * Runs a PostgreSQL text-search query (ts_rank / ts_headline with the
  * 'english' configuration) against the 'content' node of each document
  * whose collection_id equals $this->collectionId, ordered by rank
  * descending, and keeps the rows whose rank is above 0.005.
  *
  * @param string $query Search expression handed to to_tsquery().
  *
  * @return array Zero-indexed list of Result objects with id, documentId,
  *               url, rank, title and fragment set. Empty when $query is
  *               empty or the query could not be executed.
  */
 public function search($query)
 {
     $results = array();
     if ($query == "") {
         return $results;
     }

     // $1 = search expression, $2 = collection id. Both are bound as
     // parameters so user input never becomes part of the SQL text.
     $SQL = "SELECT ts_rank( to_tsvector('english', n.content), to_tsquery('english', \$1)) as rank ";
     $SQL .= ", d.document_id , n1.content as title, ts_headline('english', n.content, to_tsquery('english', \$1)) as content, d.url, d.content_type ";
     $SQL .= " from document d, node n, node n1 where n1.document_id=d.document_id  and d.collection_id = \$2 ";
     $SQL .= " and n1.name='title' and n.document_id = d.document_id and n.name='content' order by rank desc";

     $res = pg_query_params($SQL, array($query, $this->collectionId));
     if ($res === false) {
         // Query failed (e.g. malformed tsquery expression): report no hits
         // instead of passing false to pg_fetch_array().
         return $results;
     }

     $pos = 0;
     while ($row = pg_fetch_array($res)) {
         $rank = $row['rank'];
         if ($rank > 0.005) {
             // Cap the headline at 400 bytes, starting at offset 0 so the
             // first character of the fragment is kept.
             $content = substr($row['content'], 0, 400);

             $result = new Result();
             $result->id = $pos + 1;
             $result->documentId = $row['document_id'];
             $result->url = urldecode($row['url']);
             $result->rank = $rank;
             $result->title = HTMLRobot::clean(html_entity_decode($row['title']));
             $result->fragment = HTMLRobot::clean(html_entity_decode($content));
             $results[$pos] = $result;
             $pos++;
         }
     }
     return $results;
 }
Beispiel #2
0
 /**
  * Prepares a document's title and content, records its content hash and
  * saves its nodes.
  *
  * For any content type other than "application/pdf" the content is
  * entity-decoded, the title is taken from HTMLRobot::findTitle() and
  * entity-encoded, and the content is passed through HTMLRobot::clean().
  * An empty title falls back to the document URL. The MD5 of the resulting
  * content is passed to setMD5() and the document to saveNodes().
  *
  * Exceptions are caught and written to the collection log.
  *
  * @param object $document Document with contentType, content, title, url
  *                         and id properties; modified in place.
  *
  * @return void
  */
 protected function analyze($document)
 {
     try {
         $isHtml = $document->contentType != "application/pdf";
         if ($isHtml) {
             $document->content = html_entity_decode($document->content, ENT_QUOTES);
             $foundTitle = HTMLRobot::findTitle($document->content);
             $document->title = htmlentities($foundTitle, ENT_QUOTES);
             $document->content = HTMLRobot::clean($document->content);
         }

         // Fall back to the URL when no title is available.
         if ($document->title == "") {
             $document->title = $document->url;
         }

         $this->setMD5($document->id, md5($document->content));
         $this->saveNodes($document);
     } catch (Exception $failure) {
         $message = "failed adding {$document->url} " . $failure->getMessage();
         $this->collection->log($message);
     }
 }
Beispiel #3
0
 /**
  * Adds a document's title and content to the facet table.
  *
  * The document is skipped (false is returned) when URL::hasDuplicate() or
  * URL::filter() with the "indexerfilter" key returns true for its URL, or
  * when Document::hasDuplicateContent() returns true for the MD5 of its
  * content. Otherwise update_index_info() is called with the hash and, if
  * both content and URL are non-empty, one 'title' and one 'content' row
  * are inserted into facet.
  *
  * Exceptions are caught and reported with a printed message. A failing
  * INSERT terminates the script via die(), as before.
  *
  * @param object $document Document with url, content, title, id and
  *                         contenttype properties; modified in place.
  *
  * @return bool|null false when the document is skipped, null otherwise.
  */
 public function add($document)
 {
     try {
         if (URL::hasDuplicate($this->accountId, $document->url)) {
             return false;
         }
         if (URL::filter($this->accountId, $document->url, "indexerfilter")) {
             return false;
         }
         // NOTE(review): the property is spelled "contenttype" here; PHP
         // property names are case-sensitive, so confirm this matches the
         // document object.
         if ($document->contenttype != "application/pdf") {
             // Default to HTML.
             $document->content = html_entity_decode($document->content, ENT_QUOTES);
             $document->title = HTMLRobot::findTitle($this->accountId, $document->content);
             $document->title = htmlentities($document->title, ENT_QUOTES);
             $document->content = HTMLRobot::clean($document->content);
         }
         // Default title.
         if ($document->title == "") {
             $document->title = $document->url;
         }
         $md5 = md5($document->content);
         // Call the class statically and use this indexer's account id; the
         // previous "$Document" and "$accountId" were undefined variables.
         if (Document::hasDuplicateContent($this->accountId, $md5)) {
             return false;
         }
         $this->update_index_info($document->id, $md5);
         if (strlen($document->content) > 0 && strlen($document->url) > 0) {
             // Escape every value so quotes in the title or content cannot
             // break out of the SQL string literals.
             $accountId = mysql_real_escape_string($this->accountId);
             $documentId = mysql_real_escape_string($document->id);
             $facets = array(
                 'title' => $document->title,
                 'content' => $document->content,
             );
             foreach ($facets as $name => $value) {
                 $SQL = "INSERT INTO facet(account_id,document_id,name,content) values('" . $accountId . "','" . $documentId . "','" . $name . "','" . mysql_real_escape_string($value) . "');";
                 mysql_query($SQL) or die(mysql_error());
             }
         } else {
             print $document->url . " empty doc <br />\r\n";
         }
     } catch (Exception $e) {
         print "failed adding {$document->url}\r\n";
     }
 }