/**
 * Removes from the passed array those elements $elt who either are in
 * the filter bundle or whose $elt[$field_name] is in the bundle.
 *
 * Each filter of the bundle is applied in turn to the slots
 * 0 .. count($arr) - 1. A value that is exactly false is never looked
 * up in a filter: it is the cue to skip an element (such as a link
 * document) which will almost always be unique and so does not need to be
 * de-duplicated. Slots already removed by an earlier filter, and fields
 * an element does not have, are left untouched rather than being
 * re-created as NULL entries.
 *
 * @param array& $arr the array to remove elements from
 * @param array $field_names if not NULL an array of field names of $arr
 *      to use to do filtering
 */
function differenceFilter(&$arr, $field_names = NULL)
{
    $incremental_time = microtime();
    $num_filters = $this->num_filters;
    $count = count($arr);
    for ($i = 0; $i < $num_filters; $i++) {
        // the last filter is the one held in memory, the rest are on disk
        if ($i == $num_filters - 1) {
            $tmp_filter = $this->current_filter;
        } else {
            $tmp_filter = BloomFilterFile::load(
                $this->dir_name . "/filter_{$i}.ftr");
        }
        for ($j = 0; $j < $count; $j++) {
            /* Values are read by copy and only from slots that exist.
               Binding a reference to a missing slot would make PHP create
               it as NULL, which would resurrect elements that a previous
               filter pass already removed.
             */
            if (array_key_exists($j, $arr)) {
                if ($field_names === NULL) {
                    $value = $arr[$j];
                    if ($value !== false && $tmp_filter->contains($value)) {
                        unset($arr[$j]);
                    }
                } else {
                    //now do the same strategy for the array of fields case
                    $element = $arr[$j];
                    foreach ($field_names as $field_name) {
                        if (!is_array($element) ||
                            !array_key_exists($field_name, $element)) {
                            continue;
                        }
                        $value = $element[$field_name];
                        if ($value !== false &&
                            $tmp_filter->contains($value)) {
                            unset($arr[$j]);
                            break;
                        }
                    }
                }
            }
            // emit a progress message at most once every 30 seconds
            if (changeInMicrotime($incremental_time) > 30) {
                crawlLog("..Processing item {$j} of {$count} from filter " .
                    "number {$i} of {$num_filters}.");
                $incremental_time = microtime();
            }
        }
    }
}
/**
 * Adds a value to the first Bloom filter file, forces it to be saved,
 * drops it, and then reloads the file from disk as a second object. The
 * reloaded filter should report the value that was added and should not
 * report a value that was never added.
 */
function saveLoadTestCase()
{
    $filter_path = WORK_DIRECTORY . "/test.ftr";

    // write 77 into the first filter, persist it, then let go of it
    $this->test_objects['FILE1']->add(77);
    $this->test_objects['FILE1']->save();
    $this->test_objects['FILE1'] = NULL;

    // read the saved filter back in as a fresh object
    $reloaded = BloomFilterFile::load($filter_path);
    $this->test_objects['FILE2'] = $reloaded;

    $has_added_value = $reloaded->contains(77);
    $this->assertTrue($has_added_value, "File 2 contains 77");
    $has_missing_value = $reloaded->contains(66);
    $this->assertFalse($has_missing_value, "File 2 contains 66");
}
/**
 * Builds a WebQueueBundle stored in the given folder, creating the folder
 * if it does not exist yet.
 *
 * The priority queue, its hash table and the url archive are always
 * created afresh (an existing url archive file is deleted first). The
 * robots.txt filter, DNS table, robot table and crawl delay filter are
 * loaded from disk when their files exist and created otherwise.
 *
 * @param string $dir_name folder name used by this WebQueueBundle
 * @param int $filter_size size of each partition in the page exists
 *      BloomFilterBundle; also used as the size of the robots.txt and
 *      crawl delay filters when those are newly created
 * @param int $num_urls_ram number of entries in ram for the priority queue
 * @param string $min_or_max whether the priority queue maintains the heap
 *      property with respect to the least or the largest weight
 */
function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
{
    $this->dir_name = $dir_name;
    $this->filter_size = $filter_size;
    $this->num_urls_ram = $num_urls_ram;
    $this->min_or_max = $min_or_max;
    if (!file_exists($this->dir_name)) {
        mkdir($this->dir_name);
    }

    /* When resuming a crawl the old priority queue, hash table and url
       archive are discarded; new queue data is read in from any existing
       schedule.
     */
    // priority queue of (hash(url), weight) pairs
    $queue_path = $dir_name . "/queue.dat";
    $this->to_crawl_queue = new PriorityQueue($queue_path, $num_urls_ram,
        self::HASH_KEY_SIZE, $min_or_max, $this, 0);

    /* Hash table of (hash(url), offset into url archive, index in
       priority queue) triples. Because of how deletions work the table
       is periodically rebuilt, so it is given more slack than the usual
       twice the number of entries to insert.
     */
    $table_path = $dir_name . "/hash_table.dat";
    $table_slots = 8 * $num_urls_ram;
    $this->to_crawl_table = $this->constructHashTable($table_path,
        $table_slots);

    // archive holding the full text of the urls on the priority queue
    $url_archive_path = $dir_name . "/url_archive" .
        NonCompressor::fileExtension();
    if (file_exists($url_archive_path)) {
        unlink($url_archive_path);
    }
    $this->to_crawl_archive = new WebArchive($url_archive_path,
        new NonCompressor(), false, true);

    // timestamp for url filters (so they can be deleted if too old)
    $url_stamp_path = $dir_name . "/url_timestamp.txt";
    if (!file_exists($url_stamp_path)) {
        file_put_contents($url_stamp_path, time());
    }

    // filter bundle to check if we have already visited a URL
    $this->url_exists_filter_bundle = new BloomFilterBundle(
        $dir_name . "/UrlExistsFilterBundle", $filter_size);

    // timestamp for robot filters (so they can be deleted if too old)
    $robot_stamp_path = $dir_name . "/robot_timestamp.txt";
    if (!file_exists($robot_stamp_path)) {
        file_put_contents($robot_stamp_path, time());
    }

    // filter to check if we already have a copy of a robots.txt file
    $robottxt_path = $dir_name . "/got_robottxt.ftr";
    $this->got_robottxt_filter = file_exists($robottxt_path)
        ? BloomFilterFile::load($robottxt_path)
        : new BloomFilterFile($robottxt_path, $filter_size);

    /* Hash table containing the DNS cache; this is cleared whenever the
       robot filters are cleared.
     */
    $dns_path = $dir_name . "/dns_table.dat";
    $this->dns_table = file_exists($dns_path)
        ? HashTable::load($dns_path)
        : new HashTable($dns_path, 4 * $num_urls_ram,
            self::HASH_KEY_SIZE, self::IP_SIZE);

    // storage for robots.txt info
    $robot_archive_path = $dir_name . "/robot_archive" .
        NonCompressor::fileExtension();
    $this->robot_archive = new WebArchive($robot_archive_path,
        new NonCompressor(), false, true);
    $robot_table_path = $dir_name . "/robot.dat";
    $this->robot_table = file_exists($robot_table_path)
        ? HashTable::load($robot_table_path)
        : new HashTable($robot_table_path, 16 * $num_urls_ram,
            self::HASH_KEY_SIZE, self::INT_SIZE);

    // filter to check for and determine crawl delay
    $delay_path = $dir_name . "/crawl_delay.ftr";
    $this->crawl_delay_filter = file_exists($delay_path)
        ? BloomFilterFile::load($delay_path)
        : new BloomFilterFile($delay_path, $filter_size);

    // B-Tree for storing cache page validation data
    $this->etag_btree = new BTree($dir_name . '/EtagExpiresTree');
    $this->notify_buffer = array();
}
/**
 * Says whether or not phrase exists in the N word gram Bloom Filter
 * for the given language.
 *
 * The filter file is looked for at
 * LOCALE_DIR/$lang/resources/$filter_prefix . FILTER_SUFFIX and, once
 * loaded, is kept in self::$ngrams keyed first by language and then by
 * filter prefix, so that filters of different languages sharing the same
 * prefix are never confused with one another. The phrase is lower-cased
 * with mb_strtolower before it is looked up.
 *
 * @param string $phrase what to check if is an n-gram
 * @param string $lang language of the n-grams file
 * @param mixed $filter_prefix either the word "segment", "all", or
 *      number n of the number of words in an ngram in filter.
 * @return bool true if the filter for $lang reports the phrase; false if
 *      it does not or if no filter file exists for $lang and
 *      $filter_prefix
 */
static function ngramsContains($phrase, $lang, $filter_prefix = 2)
{
    // isset() is also false while self::$ngrams is still NULL
    if (!isset(self::$ngrams[$lang][$filter_prefix])) {
        $filter_path = LOCALE_DIR . "/{$lang}/resources/" .
            "{$filter_prefix}" . self::FILTER_SUFFIX;
        if (!file_exists($filter_path)) {
            return false;
        }
        self::$ngrams[$lang][$filter_prefix] =
            BloomFilterFile::load($filter_path);
    }
    return self::$ngrams[$lang][$filter_prefix]->contains(
        mb_strtolower($phrase));
}