/**
 * Prepares a document and passes it on for storage.
 *
 * Unless the content type is "application/pdf", the content is
 * entity-decoded, a title is extracted from it with HTMLRobot::findTitle()
 * and entity-encoded, and the content is then run through
 * HTMLRobot::clean(). A document whose title is empty gets its URL as title.
 * The MD5 of the final content is recorded through setMD5() and the document
 * is handed to saveNodes().
 *
 * Any Exception raised along the way is caught and written to the
 * collection log instead of propagating.
 *
 * @param object $document Document exposing id, url, contentType, content
 *                         and title properties; it is modified in place.
 * @return void
 */
protected function analyze($document)
{
    try {
        $isPdf = ($document->contentType == "application/pdf");

        if (!$isPdf) {
            // Decode first so the title lookup and the cleaner both see plain markup.
            $document->content = html_entity_decode($document->content, ENT_QUOTES);

            $rawTitle = HTMLRobot::findTitle($document->content);
            $document->title = htmlentities($rawTitle, ENT_QUOTES);

            $document->content = HTMLRobot::clean($document->content);
        }

        // Loose comparison on purpose: a missing (null) title also falls back to the URL.
        if ($document->title == "") {
            $document->title = $document->url;
        }

        $this->setMD5($document->id, md5($document->content));
        $this->saveNodes($document);
    } catch (Exception $e) {
        $message = "failed adding {$document->url} " . $e->getMessage();
        $this->collection->log($message);
    }
}
/**
 * Indexes a document by storing its title and content as rows in `facet`.
 *
 * The document is skipped (false is returned) when its URL is already known,
 * when the URL matches the account's "indexerfilter", or when a document with
 * the same content MD5 already exists. Otherwise the index info is updated
 * and, if both the content and the URL are non-empty, one `title` row and one
 * `content` row are inserted.
 *
 * Unless the content type is "application/pdf", the document is treated as
 * HTML: content is entity-decoded, a title is extracted and entity-encoded,
 * and the content is cleaned. An empty title falls back to the URL.
 *
 * @param object $document Document exposing id, url, content, title and a
 *                         content-type property; it is modified in place.
 * @return false|null false when the document was skipped; no explicit value
 *                    otherwise (callers must not rely on a truthy result).
 */
public function add($document)
{
    try {
        if (URL::hasDuplicate($this->accountId, $document->url)) {
            return false;
        }
        if (URL::filter($this->accountId, $document->url, "indexerfilter")) {
            return false;
        }

        // Property names are case-sensitive. analyze() reads `contentType`
        // while this method historically read `contenttype`; accept either
        // so a PDF is not silently processed as HTML.
        $contentType = null;
        if (isset($document->contentType)) {
            $contentType = $document->contentType;
        } elseif (isset($document->contenttype)) {
            $contentType = $document->contenttype;
        }

        if ($contentType != "application/pdf") {
            // Default to HTML.
            $document->content = html_entity_decode($document->content, ENT_QUOTES);
            // NOTE(review): analyze() calls HTMLRobot::findTitle() with the
            // content only; this call also passes the account id first.
            // Confirm which form matches the helper's signature.
            $document->title = HTMLRobot::findTitle($this->accountId, $document->content);
            $document->title = htmlentities($document->title, ENT_QUOTES);
            $document->content = HTMLRobot::clean($document->content);
        }

        // Default title. Loose comparison so a missing (null) title also
        // falls back to the URL.
        if ($document->title == "") {
            $document->title = $document->url;
        }

        $md5 = md5($document->content);
        // Was `$Document::hasDuplicateContent($accountId, $md5)`: both
        // variables were undefined, which is a fatal error at runtime.
        if (Document::hasDuplicateContent($this->accountId, $md5)) {
            return false;
        }
        $this->update_index_info($document->id, $md5);

        if (strlen($document->content) > 0 && strlen($document->url) > 0) {
            $facets = array(
                'title' => $document->title,
                'content' => $document->content,
            );
            foreach ($facets as $name => $value) {
                // Crawled text is untrusted: escape every value before it is
                // placed inside the quoted SQL literals.
                $SQL = "INSERT INTO facet(account_id,document_id,name,content) values('"
                    . mysql_real_escape_string($this->accountId) . "','"
                    . mysql_real_escape_string($document->id) . "','"
                    . mysql_real_escape_string($name) . "','"
                    . mysql_real_escape_string($value) . "');";
                mysql_query($SQL) or die(mysql_error());
            }
        } else {
            print $document->url . " empty doc <br />\r\n";
        }
    } catch (Exception $e) {
        print "failed adding {$document->url}\r\n";
    }
}