/**
 * Rebuilds the on-disk archive used to store the URLs of the to-crawl queue.
 *
 * Since offsets into the URL archive are integers, even if the queue is kept
 * relatively small, periodically we will need to rebuild the archive: each
 * queued URL is re-inserted into a fresh archive and the hash table entry for
 * that URL is rewritten with its new archive offset.
 */
function rebuildUrlTable()
{
    crawlLog("Rebuilding URL table");
    $dir_name = $this->dir_name;
    $count = $this->to_crawl_queue->count;
    $tmp_archive_name = $dir_name . "/tmp_archive" .
        NonCompressor::fileExtension();
    $url_archive_name = $dir_name . "/url_archive" .
        NonCompressor::fileExtension();
    $tmp_archive =
        new WebArchive($tmp_archive_name, new NonCompressor(), false, true);
    for ($i = 1; $i <= $count; $i++) {
        // NOTE(review): $weight is unused during a rebuild; only the url,
        // queue position, flag, and hash-table probe are needed.
        list($url, $weight, $flag, $probe) = $this->peekQueue($i);
        $url_container = array(array($url));
        $objects = $tmp_archive->addObjects("offset", $url_container);
        if (isset($objects[0]['offset'])) {
            $offset = $objects[0]['offset'];
        } else {
            // Best-effort: log and skip a URL that could not be written,
            // rather than aborting the whole rebuild.
            crawlLog("Error inserting {$url} into rebuild url archive file");
            continue;
        }
        $hash_url = crawlHash($url, true);
        // Hash table record layout: new archive offset, queue position, flag.
        $data = packInt($offset) . packInt($i) . packInt($flag);
        $this->insertHashTable($hash_url, $data, $probe);
    }
    // Drop the reference to the old archive before swapping in the new one so
    // its resources can be reclaimed.
    $this->to_crawl_archive = null;
    gc_collect_cycles();
    // Bug fix: previously only the in-memory filename property was updated,
    // leaving the rebuilt data stranded on disk under the tmp_archive name
    // (and leaking one tmp file per rebuild). Move the freshly written file
    // into place so the object's filename and the on-disk location agree.
    // (Assumes WebArchive does not rename the file itself — confirm against
    // the WebArchive class.)
    if (file_exists($url_archive_name)) {
        unlink($url_archive_name);
    }
    rename($tmp_archive_name, $url_archive_name);
    $tmp_archive->filename = $url_archive_name;
    $this->to_crawl_archive = $tmp_archive;
}