Example #1
 /**
  * Makes a WebQueueBundle with the provided parameters
  *
  * @param string $dir_name folder name used by this WebQueueBundle
 * @param int $filter_size size of each partition in the page-exists
 *     BloomFilterBundle (used to tell whether a url has already been seen)
  * @param int $num_urls_ram number of entries in ram for the priority queue
 * @param string $min_or_max whether the priority queue maintains the heap
 *     property with respect to the least or the largest weight
  */
 function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
 {
     $this->dir_name = $dir_name;
     $this->filter_size = $filter_size;
     $this->num_urls_ram = $num_urls_ram;
     $this->min_or_max = $min_or_max;
     if (!file_exists($this->dir_name)) {
         mkdir($this->dir_name);
     }
     /*
         If we are resuming a crawl, we discard the old priority queue and
         its associated hash table and archive; new queue data will be
         read in from any existing schedule.
     */
     // set up the priority queue... stores (hash(url), weight) pairs.
     $this->to_crawl_queue = new PriorityQueue($dir_name . "/queue.dat",
         $num_urls_ram, self::HASH_KEY_SIZE, $min_or_max, $this, 0);
     /* set up the hash table... stores (hash(url), offset into url archive,
         index in priority queue) triples.
      */
     /* To ensure we can always insert into the table: because of how
         deletions work we will periodically want to rebuild the table,
         and we also want to allow a little more slack than the usual
         twice the number of entries we intend to insert.
      */
     $this->to_crawl_table = $this->constructHashTable(
         $dir_name . "/hash_table.dat", 8 * $num_urls_ram);
     /* set up url archive, used to store the full text of the urls which
         are on the priority queue
      */
     $url_archive_name = $dir_name . "/url_archive" .
         NonCompressor::fileExtension();
     if (file_exists($url_archive_name)) {
         unlink($url_archive_name);
     }
     $this->to_crawl_archive = new WebArchive($url_archive_name,
         new NonCompressor(), false, true);
     //timestamp for url filters (so we can delete them if they get too old)
     if (!file_exists($dir_name . "/url_timestamp.txt")) {
         file_put_contents($dir_name . "/url_timestamp.txt", time());
     }
     //filter bundle to check if we have already visited a URL
     $this->url_exists_filter_bundle = new BloomFilterBundle(
         $dir_name . "/UrlExistsFilterBundle", $filter_size);
     //timestamp for robot filters (so we can delete them if they get too old)
     if (!file_exists($dir_name . "/robot_timestamp.txt")) {
         file_put_contents($dir_name . "/robot_timestamp.txt", time());
     }
     //filter to check if we already have a copy of a robots.txt file
     if (file_exists($dir_name . "/got_robottxt.ftr")) {
         $this->got_robottxt_filter = BloomFilterFile::load(
             $dir_name . "/got_robottxt.ftr");
     } else {
         $this->got_robottxt_filter = new BloomFilterFile(
             $dir_name . "/got_robottxt.ftr", $filter_size);
     }
     /* Hash table containing the DNS cache; this is cleared whenever the
         robot filters are cleared
      */
     if (file_exists($dir_name . "/dns_table.dat")) {
         $this->dns_table = HashTable::load($dir_name . "/dns_table.dat");
     } else {
         $this->dns_table = new HashTable($dir_name . "/dns_table.dat",
             4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
     }
     //set up storage for robots.txt info
     $robot_archive_name = $dir_name . "/robot_archive" .
         NonCompressor::fileExtension();
     $this->robot_archive = new WebArchive($robot_archive_name,
         new NonCompressor(), false, true);
     if (file_exists($dir_name . "/robot.dat")) {
         $this->robot_table = HashTable::load($dir_name . "/robot.dat");
     } else {
         $this->robot_table = new HashTable($dir_name . "/robot.dat",
             16 * $num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE);
     }
     //filter to check for and determine crawl delay
     if (file_exists($dir_name . "/crawl_delay.ftr")) {
         $this->crawl_delay_filter = BloomFilterFile::load($dir_name . "/crawl_delay.ftr");
     } else {
         $this->crawl_delay_filter = new BloomFilterFile($dir_name . "/crawl_delay.ftr", $filter_size);
     }
     //Initialize B-Tree for storing cache page validation data
     $this->etag_btree = new BTree($dir_name . '/EtagExpiresTree');
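     /* buffer used to batch notifications of priority queue index changes
         (the queue above was constructed with $this as its notifier)
         before they are written back to the hash table
      */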
     $this->notify_buffer = array();
 }
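
Example #2
A minimal usage sketch of the constructor above. The directory name and the
values passed for $filter_size and $num_urls_ram below are illustrative
only, and passing "min" assumes the queue should keep the url with the
least weight at the top of its heap.

 // queue bundle stored under ./my_queue_bundle, with each Bloom filter
 // partition sized for 1000000 entries and up to 10000 queue urls in RAM
 $web_queue = new WebQueueBundle("./my_queue_bundle", 1000000, 10000, "min");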