/**
 * Makes a WebQueueBundle with the provided parameters
 *
 * @param string $dir_name folder name used by this WebQueueBundle
 * @param int $filter_size size of each partition in the page-exists
 *     BloomFilterBundle
 * @param int $num_urls_ram number of entries in ram for the priority queue
 * @param string $min_or_max whether the priority queue maintains the heap
 *     property with respect to the least or the largest weight
 */
function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
{
    $this->dir_name = $dir_name;
    $this->filter_size = $filter_size;
    $this->num_urls_ram = $num_urls_ram;
    $this->min_or_max = $min_or_max;
    if (!file_exists($this->dir_name)) {
        mkdir($this->dir_name);
    }
    /* If we are resuming a crawl, we discard the old priority queue and
       its associated hash table and archive; new queue data will be read
       in from any existing schedule.
     */
    // set up the priority queue... stores (hash(url), weight) pairs.
    $this->to_crawl_queue = new PriorityQueue($dir_name . "/queue.dat",
        $num_urls_ram, self::HASH_KEY_SIZE, $min_or_max, $this, 0);
    /* set up the hash table... stores (hash(url), offset into url archive,
       index in priority queue) triples.
     */
    /* To ensure we can always insert into the table, and because of how
       deletions work, we will periodically want to rebuild it. So we give
       it a little more than the usual factor-of-two slack over the number
       of entries we want to insert.
     */
    $this->to_crawl_table = $this->constructHashTable(
        $dir_name . "/hash_table.dat", 8 * $num_urls_ram);
    /* set up url archive, used to store the full text of the urls which
       are on the priority queue
     */
    $url_archive_name = $dir_name . "/url_archive" .
        NonCompressor::fileExtension();
    if (file_exists($url_archive_name)) {
        unlink($url_archive_name);
    }
    $this->to_crawl_archive = new WebArchive($url_archive_name,
        new NonCompressor(), false, true);
    //timestamp for url filters (so we can delete them if they get too old)
    if (!file_exists($dir_name . "/url_timestamp.txt")) {
        file_put_contents($dir_name . "/url_timestamp.txt", time());
    }
    //filter bundle to check if we have already visited a URL
    $this->url_exists_filter_bundle = new BloomFilterBundle(
        $dir_name . "/UrlExistsFilterBundle", $filter_size);
    //timestamp for robot filters (so we can delete them if they get too old)
    if (!file_exists($dir_name . "/robot_timestamp.txt")) {
        file_put_contents($dir_name . "/robot_timestamp.txt", time());
    }
    //filter to check if we already have a copy of a robots.txt file
    if (file_exists($dir_name . "/got_robottxt.ftr")) {
        $this->got_robottxt_filter = BloomFilterFile::load(
            $dir_name . "/got_robottxt.ftr");
    } else {
        $this->got_robottxt_filter = new BloomFilterFile(
            $dir_name . "/got_robottxt.ftr", $filter_size);
    }
    /* Hash table containing the DNS cache; this is cleared whenever the
       robot filters are cleared
     */
    if (file_exists($dir_name . "/dns_table.dat")) {
        $this->dns_table = HashTable::load($dir_name . "/dns_table.dat");
    } else {
        $this->dns_table = new HashTable($dir_name . "/dns_table.dat",
            4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
    }
    //set up storage for robots.txt info
    $robot_archive_name = $dir_name . "/robot_archive" .
        NonCompressor::fileExtension();
    $this->robot_archive = new WebArchive($robot_archive_name,
        new NonCompressor(), false, true);
    if (file_exists($dir_name . "/robot.dat")) {
        $this->robot_table = HashTable::load($dir_name . "/robot.dat");
    } else {
        $this->robot_table = new HashTable($dir_name . "/robot.dat",
            16 * $num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE);
    }
    //filter to check for and determine crawl delay
    if (file_exists($dir_name . "/crawl_delay.ftr")) {
        $this->crawl_delay_filter = BloomFilterFile::load(
            $dir_name . "/crawl_delay.ftr");
    } else {
        $this->crawl_delay_filter = new BloomFilterFile(
            $dir_name . "/crawl_delay.ftr", $filter_size);
    }
    //Initialize B-Tree for storing cache page validation data
    $this->etag_btree = new BTree($dir_name . '/EtagExpiresTree');
    $this->notify_buffer = array();
}