Example #1
 /**
  * Removes from the passed array those elements $elt who either are in
  * the filter bundle or whose $elt[$field_name] is in the bundle.
  *
  * @param array& $arr the array to remove elements from (modified in place;
  *     matching indices are unset, so the result may be a sparse array)
  * @param array $field_names if not NULL an array of field names of $arr
  *     to use to do filtering
  */
 function differenceFilter(&$arr, $field_names = NULL)
 {
     $incremental_time = microtime();
     $num_filters = $this->num_filters;
     $count = count($arr);
     for ($i = 0; $i < $num_filters; $i++) {
         // the newest filter partition is already in memory; older
         // partitions are loaded from disk on demand
         if ($i == $num_filters - 1) {
             $tmp_filter = $this->current_filter;
         } else {
             $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr");
         }
         for ($j = 0; $j < $count; $j++) {
             /* Skip entries removed by an earlier filter pass. The old
                code took references ($tmp =& $arr[$j]), which silently
                re-created unset elements as NULL, so later passes grew
                $arr back with junk NULL entries and probed the filter
                with NULL values. Reading by value after an isset check
                avoids that autovivification. */
             if (!isset($arr[$j])) {
                 continue;
             }
             if ($field_names === NULL) {
                 $tmp = $arr[$j];
                 if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                        We deliberately don't try to add anything that has
                        the hash field set to false. This is our cue to
                        skip an element such as a link document which we
                        know will almost always be unique and so be unnecessary
                        to de-duplicate
                     */
                     unset($arr[$j]);
                 }
             } else {
                 //now do the same strategy for the array of fields case
                 foreach ($field_names as $field_name) {
                     // a missing field is treated like the hash-field-false
                     // case: skip it rather than autovivify a NULL entry
                     $tmp = isset($arr[$j][$field_name]) ? $arr[$j][$field_name] : false;
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                         unset($arr[$j]);
                         break;
                     }
                 }
             }
             // log progress roughly every 30 seconds on long runs
             if (changeInMicrotime($incremental_time) > 30) {
                 crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}.");
                 $incremental_time = microtime();
             }
         }
     }
 }
Example #2
 /**
  * Check that if we force save the bloom filter file and then we reload it
  * back in that it has the same Contents
  *
  */
 function saveLoadTestCase()
 {
     // insert a known value, flush the filter to disk, then drop the
     // in-memory copy so the reload below cannot be satisfied from RAM
     $original = $this->test_objects['FILE1'];
     $original->add(77);
     $original->save();
     $this->test_objects['FILE1'] = NULL;
     $reloaded = BloomFilterFile::load(WORK_DIRECTORY . "/test.ftr");
     $this->test_objects['FILE2'] = $reloaded;
     // the saved value must survive the round trip; an absent value must not
     $this->assertTrue($reloaded->contains(77), "File 2 contains 77");
     $this->assertFalse($reloaded->contains(66), "File 2 contains 66");
 }
Example #3
 /**
  * Makes a WebQueueBundle with the provided parameters
  *
  * Sets up, in order: the priority queue of urls to crawl, its companion
  * hash table, the url text archive (always rebuilt), the url-seen
  * Bloom filter bundle, robots.txt storage (filter, archive, table),
  * the crawl-delay filter, a DNS cache table, and the etag B-Tree.
  *
  * @param string $dir_name folder name used by this WebQueueBundle
  * @param int $filter_size size of each partition in the page exists
  *     BloomFilterBundle
  * @param int $num_urls_ram number of entries in ram for the priority queue
  * @param string $min_or_max when the priority queue maintain the heap
  *     property with respect to the least or the largest weight
  */
 function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
 {
     $this->dir_name = $dir_name;
     $this->filter_size = $filter_size;
     $this->num_urls_ram = $num_urls_ram;
     $this->min_or_max = $min_or_max;
     if (!file_exists($this->dir_name)) {
         mkdir($this->dir_name);
     }
     /*
         if we are resuming a crawl we discard the old priority queue and
         associated hash table and archive new queue data will be read in
         from any existing schedule
     */
     // set up the priority queue... stores (hash(url), weight) pairs.
     $this->to_crawl_queue = new PriorityQueue($dir_name . "/queue.dat", $num_urls_ram, self::HASH_KEY_SIZE, $min_or_max, $this, 0);
     /* set up the hash table... stores (hash(url), offset into url archive,
         index in priority queue) triples.
        */
     /*to ensure we can always insert into table, because of how deletions
         work we will periodically want to
         rebuild our table we will also want to give a little more than the
         usual twice the number we want to insert slack
       */
     // hence the 8x (rather than 2x) sizing factor below
     $this->to_crawl_table = $this->constructHashTable($dir_name . "/hash_table.dat", 8 * $num_urls_ram);
     /* set up url archive, used to store the full text of the urls which
          are on the priority queue
        */
     $url_archive_name = $dir_name . "/url_archive" . NonCompressor::fileExtension();
     // any pre-existing url archive is discarded (see resume note above);
     // note the robot archive further down is NOT unlinked like this
     if (file_exists($url_archive_name)) {
         unlink($url_archive_name);
     }
     $this->to_crawl_archive = new WebArchive($url_archive_name, new NonCompressor(), false, true);
     //timestamp for url filters (so can delete if get too old)
     if (!file_exists($dir_name . "/url_timestamp.txt")) {
         file_put_contents($dir_name . "/url_timestamp.txt", time());
     }
     //filter bundle to check if we have already visited a URL
     $this->url_exists_filter_bundle = new BloomFilterBundle($dir_name . "/UrlExistsFilterBundle", $filter_size);
     //timestamp for robot filters (so can delete if get too old)
     if (!file_exists($dir_name . "/robot_timestamp.txt")) {
         file_put_contents($dir_name . "/robot_timestamp.txt", time());
     }
     //filter to check if we have already have a copy of a robot.txt file
     // (reloaded from disk when resuming, freshly created otherwise)
     if (file_exists($dir_name . "/got_robottxt.ftr")) {
         $this->got_robottxt_filter = BloomFilterFile::load($dir_name . "/got_robottxt.ftr");
     } else {
         $this->got_robottxt_filter = new BloomFilterFile($dir_name . "/got_robottxt.ftr", $filter_size);
     }
     /* Hash table containing DNS cache this is cleared whenever robot
          filters cleared
        */
     if (file_exists($dir_name . "/dns_table.dat")) {
         $this->dns_table = HashTable::load($dir_name . "/dns_table.dat");
     } else {
         $this->dns_table = new HashTable($dir_name . "/dns_table.dat", 4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
     }
     //set up storage for robots.txt info
     $robot_archive_name = $dir_name . "/robot_archive" . NonCompressor::fileExtension();
     $this->robot_archive = new WebArchive($robot_archive_name, new NonCompressor(), false, true);
     // robot table maps hash(host) to offsets into the robot archive
     // NOTE(review): sized 16x num_urls_ram vs 4x for dns — presumably
     // intentional slack for many hosts; confirm against rebuild logic
     if (file_exists($dir_name . "/robot.dat")) {
         $this->robot_table = HashTable::load($dir_name . "/robot.dat");
     } else {
         $this->robot_table = new HashTable($dir_name . "/robot.dat", 16 * $num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE);
     }
     //filter to check for and determine crawl delay
     if (file_exists($dir_name . "/crawl_delay.ftr")) {
         $this->crawl_delay_filter = BloomFilterFile::load($dir_name . "/crawl_delay.ftr");
     } else {
         $this->crawl_delay_filter = new BloomFilterFile($dir_name . "/crawl_delay.ftr", $filter_size);
     }
     //Initialize B-Tree for storing cache page validation data
     $this->etag_btree = new BTree($dir_name . '/EtagExpiresTree');
     // buffer of pending queue-change notifications, flushed elsewhere
     $this->notify_buffer = array();
 }
Example #4
 /**
  * Used to create a filter file suitable for use in word segmentation
  * (splitting text like "thiscontainsnospaces" into
  * "this contains no spaces"). Used by @see token_tool.php
  *
  * For each dictionary word it adds the lowercased word itself plus every
  * proper suffix of length >= 2 prefixed with "*" (used to detect that a
  * partial match could still extend to a dictionary word).
  *
  * @param string $dict_file file to use as a dictionary to make filter from
  * @param string $lang locale tag of locale we are building the filter for
  */
 static function makeSegmentFilterFile($dict_file, $lang)
 {
     $filter_path = LOCALE_DIR . "/{$lang}/resources/" . "segment" . self::FILTER_SUFFIX;
     if (file_exists($filter_path)) {
         unlink($filter_path);
         //build again from scratch
     }
     $words = file($dict_file);
     if ($words === false) {
         // unreadable dictionary: build an empty filter rather than
         // passing false to count() (a fatal TypeError on PHP 8)
         $words = array();
     }
     $filter = new BloomFilterFile($filter_path, count($words));
     foreach ($words as $word) {
         $tmp = trim($word);
         // use an explicit UTF-8 encoding consistently for all mb_*
         // calls (the original only passed it to mb_substr)
         $len = mb_strlen($tmp, "UTF-8");
         $filter->add(mb_strtolower($tmp, "UTF-8"));
         if ($len >= 3) {
             for ($i = 1; $i < $len - 1; $i++) {
                 $tmp2 = "*" . mb_substr($tmp, $i, $len - $i, "UTF-8");
                 // strict comparison (was ==); defensively skip a bare "*"
                 if ($tmp2 === "*") {
                     continue;
                 }
                 $filter->add(mb_strtolower($tmp2, "UTF-8"));
             }
         }
     }
     // NOTE(review): max_gram_len is forced to 1 even though multi-char
     // entries were added above — confirm the segmenter's intended use
     $filter->max_gram_len = 1;
     $filter->save();
 }