Example #1 gc_collect_cycles() example
 <?php
 /**
  * Since offsets are integers, even if the queue is kept relatively small,
  * we periodically need to rebuild the archive that stores URLs.
  */
 function rebuildUrlTable()
 {
     crawlLog("Rebuilding URL table");
     $dir_name = $this->dir_name;
     $count = $this->to_crawl_queue->count;
     $tmp_archive_name = $dir_name . "/tmp_archive" . NonCompressor::fileExtension();
     $url_archive_name = $dir_name . "/url_archive" . NonCompressor::fileExtension();
     $tmp_archive = new WebArchive($tmp_archive_name, new NonCompressor(), false, true);
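     // Copy each queued URL into the fresh archive, asking addObjects()
     // to report the (now much smaller) offset at which it was stored.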
     for ($i = 1; $i <= $count; $i++) {
         list($url, $weight, $flag, $probe) = $this->peekQueue($i);
         $url_container = array(array($url));
         $objects = $tmp_archive->addObjects("offset", $url_container);
         if (isset($objects[0]['offset'])) {
             $offset = $objects[0]['offset'];
         } else {
             crawlLog("Error inserting {$url} into rebuild url archive file");
             continue;
         }
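         // Update the hash table entry for this URL with its new offset,
         // its queue position, and its flag.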
         $hash_url = crawlHash($url, true);
         $data = packInt($offset) . packInt($i) . packInt($flag);
         $this->insertHashTable($hash_url, $data, $probe);
     }
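     // Drop the reference to the old archive, then force the cycle collector
     // to run so any memory tied up in reference cycles is reclaimed now
     // rather than whenever an automatic collection happens to trigger.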
     $this->to_crawl_archive = NULL;
     gc_collect_cycles();
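     // Repoint the rebuilt archive at the permanent file name and install it
     // as the active to-crawl archive.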
     $tmp_archive->filename = $url_archive_name;
     $this->to_crawl_archive = $tmp_archive;
 }
 ?>
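
The explicit gc_collect_cycles() call is the point of the example: assigning NULL to $this->to_crawl_archive drops one reference, but if the old archive object sits in a reference cycle, reference counting alone will never free it, and the automatic collector only runs once its root buffer fills. Forcing a collection reclaims that memory before the rebuilt archive is installed. The following minimal, self-contained sketch shows the same mechanism in isolation; the Node class and its fields are invented for illustration and are not part of the code above:

 <?php
 // Two objects that point at each other form a reference cycle. After
 // unset(), each object is still referenced by the other, so plain
 // reference counting never frees them; only the cycle collector can.
 class Node
 {
     public $partner;
     public function __destruct()
     {
         echo "Node destroyed\n";
     }
 }

 $a = new Node();
 $b = new Node();
 $a->partner = $b;
 $b->partner = $a; // cycle: $a -> $b -> $a

 unset($a, $b);    // no destructor runs: the cycle keeps both objects alive
 echo "before gc_collect_cycles()\n";
 $collected = gc_collect_cycles(); // both destructors fire during this call
 echo "after gc_collect_cycles(), collected: $collected\n";
 ?>

Running the sketch prints "before", then the two "Node destroyed" lines, then "after": the cyclic pair survives unset() and is only torn down when the collector is forced to run, which is exactly what rebuildUrlTable() relies on before swapping in the rebuilt archive.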