/**
 * Rewrites the archive holding the urls that are waiting to be crawled.
 *
 * Archive offsets are integers, so even when the queue itself stays fairly
 * small the url archive has to be rebuilt from time to time. Every queue
 * position from 1 to the current count is read with peekQueue(), its url is
 * appended to a fresh archive, and the hash table entry for that url is
 * re-inserted with the new offset, the queue position and the stored flag.
 * Urls that cannot be written to the new archive are logged and skipped.
 *
 * NOTE(review): only the filename property of the new archive is switched to
 * the url_archive path; no rename of the file on disk is visible in this
 * method -- confirm that this is handled elsewhere.
 */
function rebuildUrlTable()
{
    crawlLog("Rebuilding URL table");

    $base_dir = $this->dir_name;
    $num_queued = $this->to_crawl_queue->count;

    $tmp_archive_name = $base_dir . "/tmp_archive" .
        NonCompressor::fileExtension();
    $url_archive_name = $base_dir . "/url_archive" .
        NonCompressor::fileExtension();

    $rebuilt_archive = new WebArchive(
        $tmp_archive_name,
        new NonCompressor(),
        false,
        true
    );

    for ($position = 1; $position <= $num_queued; $position++) {
        list($url, $weight, $flag, $probe) = $this->peekQueue($position);

        // Kept in a variable (not passed inline) in case addObjects()
        // takes its second argument by reference.
        $to_store = array(array($url));
        $stored = $rebuilt_archive->addObjects("offset", $to_store);

        if (!isset($stored[0]['offset'])) {
            crawlLog("Error inserting {$url} into rebuild url archive file");
            continue;
        }
        $new_offset = $stored[0]['offset'];

        $hash_url = crawlHash($url, true);
        $entry = packInt($new_offset) . packInt($position) . packInt($flag);
        $this->insertHashTable($hash_url, $entry, $probe);
    }

    // Drop the reference to the old archive and collect cycles before the
    // rebuilt archive takes its place.
    $this->to_crawl_archive = null;
    gc_collect_cycles();

    $rebuilt_archive->filename = $url_archive_name;
    $this->to_crawl_archive = $rebuilt_archive;
}
}

    /**
     * Reads one value from the `settings` table.
     *
     * Assumes $this->dbh is a PDO-style handle (prepare/execute/fetch) --
     * TODO confirm against the class constructor, which is not visible here.
     *
     * @param mixed $id value bound to the `id` column of the WHERE clause
     * @return mixed the `value` column of the first fetched row, or null when
     *      the statement fails to execute or no row with a value is found
     */
    protected function getSettingValue($id)
    {
        $sql = "SELECT `value` FROM `settings` WHERE `id`= :id;";
        // The query is prepared and stored in $prep
        $prep = $this->dbh->prepare($sql);
        $ar_val = array('id' => $id);

        if (!$prep->execute($ar_val)) {
            return null;
        }

        // fetch() gives a non-array (false with PDO) when nothing matched;
        // indexing that directly raises a warning on PHP 7.4+, so guard it.
        $row = $prep->fetch();

        return (is_array($row) && isset($row['value'])) ? $row['value'] : null;
    }

    /**
     * Inserts a url into the `urls` table with type '2', storing the current
     * $this->parameterCounter alongside it.
     *
     * The result of execute() is not inspected.
     *
     * @param mixed $url value bound to the `value` column
     */
    protected function saveUrl($url)
    {
        $sql = "INSERT INTO urls (type,value,parameter_counter) VALUES ('2',:url, :parameter);";
        // The query is prepared and stored in $prep
        $prep = $this->dbh->prepare($sql);
        $ar_val = array('url' => $url, 'parameter' => $this->parameterCounter);
        $prep->execute($ar_val);
    }

    /**
     * Writes a counter into the `settings` row whose id is '1'.
     *
     * The result of execute() is not inspected.
     *
     * @param mixed $counter value bound to the `value` column
     */
    protected function saveCounter($counter)
    {
        $sql = "UPDATE settings SET value=:counter WHERE id='1';";
        // The query is prepared and stored in $prep
        $prep = $this->dbh->prepare($sql);
        $ar_val = array('counter' => $counter);
        $prep->execute($ar_val);
    }
}

// Script entry: create the object and call checkPage() on it.
$urlSaver = new WebArchive();
$urlSaver->checkPage();