Example #1
 /**
  * Removes from the passed array those elements $elt that either are in
  * the filter bundle or whose $elt[$field_name] is in the bundle.
  *
  * @param array& $arr the array to remove elements from
  * @param array $field_names if not NULL, an array of field names of $arr
  *     to use for filtering
  */
 function differenceFilter(&$arr, $field_names = NULL)
 {
     $incremental_time = microtime();
     $num_filters = $this->num_filters;
     $count = count($arr);
     // check $arr against every filter partition in the bundle; the last
     // partition is the in-memory current filter, the rest live on disk
     for ($i = 0; $i < $num_filters; $i++) {
         if ($i == $num_filters - 1) {
             $tmp_filter = $this->current_filter;
         } else {
             $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr");
         }
         for ($j = 0; $j < $count; $j++) {
             if (!isset($arr[$j])) {
                 /* skip elements removed by an earlier partition; without
                    this check the reference assignments below would
                    silently re-create them as NULL entries */
                 continue;
             }
             if ($field_names === NULL) {
                 $tmp =& $arr[$j];
                 if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                        We deliberately don't try to add anything that has
                        the hash field set to false. This is our cue to
                        skip an element such as a link document which we
                        know will almost always be unique and so be unnecessary
                        to de-duplicate
                     */
                     unset($arr[$j]);
                 }
             } else {
                 //now do the same strategy for the array of fields case
                 foreach ($field_names as $field_name) {
                     $tmp =& $arr[$j][$field_name];
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                         unset($arr[$j]);
                         break;
                     }
                 }
             }
             if (changeInMicrotime($incremental_time) > 30) {
                 crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}.");
                 $incremental_time = microtime();
             }
         }
     }
 }
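A usage sketch follows; the BloomFilterBundle constructor signature is inferred from Example #3 below, and the directory, field name, and sample rows are illustrative assumptions rather than Yioop defaults.
 // Sketch: drop rows whose "url" field is already in the filter bundle
 $bundle = new BloomFilterBundle(WORK_DIRECTORY . "/UrlExistsFilterBundle",
     1000000);
 $bundle->add("http://example.com/seen");
 $rows = [
     ["url" => "http://example.com/seen"], // in the bundle, so removed
     ["url" => "http://example.com/new"],  // not in the bundle, so kept
 ];
 $bundle->differenceFilter($rows, ["url"]);
 // $rows now holds only the "new" entry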
Example #2
 /**
  * Checks that if we force-save the Bloom filter file and then reload it,
  * it has the same contents
  *
  */
 function saveLoadTestCase()
 {
     $this->test_objects['FILE1']->add(77);
     $this->test_objects['FILE1']->save();
     $this->test_objects['FILE1'] = NULL;
     $this->test_objects['FILE2'] = BloomFilterFile::load(WORK_DIRECTORY . "/test.ftr");
     $this->assertTrue($this->test_objects['FILE2']->contains(77), "File 2 contains 77");
     $this->assertFalse($this->test_objects['FILE2']->contains(66), "File 2 does not contain 66");
 }
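This test presumes a setUp that created FILE1 as a BloomFilterFile at WORK_DIRECTORY . "/test.ftr". A minimal sketch, with the constructor signature taken from Example #3 and an arbitrary capacity:
 function setUp()
 {
     // fresh on-disk filter per test; 1000 is an arbitrary size for the sketch
     $this->test_objects['FILE1'] = new BloomFilterFile(
         WORK_DIRECTORY . "/test.ftr", 1000);
 }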
Example #3
 /**
  * Makes a WebQueueBundle with the provided parameters
  *
  * @param string $dir_name folder name used by this WebQueueBundle
  * @param int $filter_size size of each partition in the page exists
  *     BloomFilterBundle
  * @param int $num_urls_ram number of entries in ram for the priority queue
  * @param string $min_or_max whether the priority queue maintains the heap
  *     property with respect to the least or the largest weight
  */
 function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
 {
     $this->dir_name = $dir_name;
     $this->filter_size = $filter_size;
     $this->num_urls_ram = $num_urls_ram;
     $this->min_or_max = $min_or_max;
     if (!file_exists($this->dir_name)) {
         mkdir($this->dir_name);
     }
     /*
         If we are resuming a crawl, we discard the old priority queue and
         its associated hash table and archive; new queue data will be read
         in from any existing schedule.
     */
     // set up the priority queue... stores (hash(url), weight) pairs.
     $this->to_crawl_queue = new PriorityQueue($dir_name . "/queue.dat", $num_urls_ram, self::HASH_KEY_SIZE, $min_or_max, $this, 0);
     /* set up the hash table... stores (hash(url), offset into url archive,
         index in priority queue) triples.
        */
     /* To ensure we can always insert into the table: because of how
        deletions work we will periodically want to rebuild our table, so
        we also give a little more slack than the usual twice the number
        of items we want to insert.
      */
     $this->to_crawl_table = $this->constructHashTable($dir_name . "/hash_table.dat", 8 * $num_urls_ram);
     /* set up url archive, used to store the full text of the urls which
          are on the priority queue
        */
     $url_archive_name = $dir_name . "/url_archive" . NonCompressor::fileExtension();
     if (file_exists($url_archive_name)) {
         unlink($url_archive_name);
     }
     $this->to_crawl_archive = new WebArchive($url_archive_name, new NonCompressor(), false, true);
     //timestamp for url filters (so we can delete them if they get too old)
     if (!file_exists($dir_name . "/url_timestamp.txt")) {
         file_put_contents($dir_name . "/url_timestamp.txt", time());
     }
     //filter bundle to check if we have already visited a URL
     $this->url_exists_filter_bundle = new BloomFilterBundle($dir_name . "/UrlExistsFilterBundle", $filter_size);
     //timestamp for robot filters (so we can delete them if they get too old)
     if (!file_exists($dir_name . "/robot_timestamp.txt")) {
         file_put_contents($dir_name . "/robot_timestamp.txt", time());
     }
     //filter to check if we already have a copy of a robots.txt file
     if (file_exists($dir_name . "/got_robottxt.ftr")) {
         $this->got_robottxt_filter = BloomFilterFile::load($dir_name . "/got_robottxt.ftr");
     } else {
         $this->got_robottxt_filter = new BloomFilterFile($dir_name . "/got_robottxt.ftr", $filter_size);
     }
     /* Hash table containing the DNS cache; this is cleared whenever the
        robot filters are cleared
      */
     if (file_exists($dir_name . "/dns_table.dat")) {
         $this->dns_table = HashTable::load($dir_name . "/dns_table.dat");
     } else {
         $this->dns_table = new HashTable($dir_name . "/dns_table.dat", 4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
     }
     //set up storage for robots.txt info
     $robot_archive_name = $dir_name . "/robot_archive" . NonCompressor::fileExtension();
     $this->robot_archive = new WebArchive($robot_archive_name, new NonCompressor(), false, true);
     if (file_exists($dir_name . "/robot.dat")) {
         $this->robot_table = HashTable::load($dir_name . "/robot.dat");
     } else {
         $this->robot_table = new HashTable($dir_name . "/robot.dat", 16 * $num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE);
     }
     //filter to check for and determine crawl delay
     if (file_exists($dir_name . "/crawl_delay.ftr")) {
         $this->crawl_delay_filter = BloomFilterFile::load($dir_name . "/crawl_delay.ftr");
     } else {
         $this->crawl_delay_filter = new BloomFilterFile($dir_name . "/crawl_delay.ftr", $filter_size);
     }
     //Initialize B-Tree for storing cache page validation data
     $this->etag_btree = new BTree($dir_name . '/EtagExpiresTree');
     $this->notify_buffer = array();
 }
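A construction sketch follows; the folder and sizes are illustrative, and "min" is assumed to be a valid $min_or_max flag (check the PriorityQueue constants for the real values).
 // Sketch only: none of these values are Yioop defaults
 $web_queue = new WebQueueBundle(
     WORK_DIRECTORY . "/queue_server", // folder for all the bundle's files
     1000000,  // $filter_size of each Bloom filter partition
     100000,   // $num_urls_ram entries the priority queue keeps in RAM
     "min"     // assumed flag: pop lowest-weight URLs first
 );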
Example #4
 /**
  * Says whether or not the phrase exists in the n word gram Bloom filter
  *
  * @param string $phrase phrase to check for in the filter
  * @param string $lang language of the n-grams file
  * @param mixed $filter_prefix either the word "segment", "all", or the
  *     number n of words in an n-gram in the filter
  * @return bool true if the phrase is in the filter; false otherwise
  */
 static function ngramsContains($phrase, $lang, $filter_prefix = 2)
 {
     if (self::$ngrams == NULL || !isset(self::$ngrams[$filter_prefix])) {
         $filter_path = LOCALE_DIR . "/{$lang}/resources/" . "{$filter_prefix}" . self::FILTER_SUFFIX;
         if (file_exists($filter_path)) {
             self::$ngrams[$filter_prefix] = BloomFilterFile::load($filter_path);
         } else {
             return false;
         }
     }
     return self::$ngrams[$filter_prefix]->contains(mb_strtolower($phrase));
 }
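A call sketch; in Yioop this method is defined on a class such as NWordGrams, but that class name, the locale tag, and the presence of a 2.ftr filter file under LOCALE_DIR are assumptions here.
 // Sketch: is "new york" in the English bigram filter?
 // Assumes LOCALE_DIR . "/en-US/resources/2.ftr" exists
 if (NWordGrams::ngramsContains("new york", "en-US", 2)) {
     echo "treat as a single search unit\n";
 }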