Example #1
 /**
  * Removes from the passed array those elements $elt who either are in
  * the filter bundle or whose $elt[$field_name] is in the bundle.
  *
  * @param array& $arr the array to remove elements from (modified in place;
  *     matching indices are unset, so the result may be a sparse array)
  * @param array $field_names if not NULL an array of field names of $arr
  *     to use to do filtering
  */
 function differenceFilter(&$arr, $field_names = NULL)
 {
     $incremental_time = microtime();
     $num_filters = $this->num_filters;
     $count = count($arr);
     for ($i = 0; $i < $num_filters; $i++) {
         // the newest filter partition is already in memory; older
         // partitions are loaded from disk on demand
         if ($i == $num_filters - 1) {
             $tmp_filter = $this->current_filter;
         } else {
             $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr");
         }
         for ($j = 0; $j < $count; $j++) {
             /* Skip entries removed by an earlier filter pass. The old
                code took references ($tmp =& $arr[$j]), which silently
                re-created unset elements as NULL, so later passes grew
                $arr back with junk NULL entries and probed the filter
                with NULL values. Reading by value after an isset check
                avoids that autovivification. */
             if (!isset($arr[$j])) {
                 continue;
             }
             if ($field_names === NULL) {
                 $tmp = $arr[$j];
                 if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                        We deliberately don't try to add anything that has
                        the hash field set to false. This is our cue to
                        skip an element such as a link document which we
                        know will almost always be unique and so be unnecessary
                        to de-duplicate
                     */
                     unset($arr[$j]);
                 }
             } else {
                 //now do the same strategy for the array of fields case
                 foreach ($field_names as $field_name) {
                     // a missing field is treated like the hash-field-false
                     // case: skip it rather than autovivify a NULL entry
                     $tmp = isset($arr[$j][$field_name]) ? $arr[$j][$field_name] : false;
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                         unset($arr[$j]);
                         break;
                     }
                 }
             }
             // log progress roughly every 30 seconds on long runs
             if (changeInMicrotime($incremental_time) > 30) {
                 crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}.");
                 $incremental_time = microtime();
             }
         }
     }
 }
Example #2
 /**
  * Check that if we force save the bloom filter file and then we reload it
  * back in that it has the same Contents
  *
  */
 function saveLoadTestCase()
 {
     // insert a known value, flush the filter to disk, then drop the
     // in-memory copy so the reload below cannot be satisfied from RAM
     $original = $this->test_objects['FILE1'];
     $original->add(77);
     $original->save();
     $this->test_objects['FILE1'] = NULL;
     $reloaded = BloomFilterFile::load(WORK_DIRECTORY . "/test.ftr");
     $this->test_objects['FILE2'] = $reloaded;
     // the saved value must survive the round trip; an absent value must not
     $this->assertTrue($reloaded->contains(77), "File 2 contains 77");
     $this->assertFalse($reloaded->contains(66), "File 2 contains 66");
 }
Example #3
 /**
  * Makes a WebQueueBundle with the provided parameters
  *
  * Sets up, in order: the priority queue of urls to crawl, its companion
  * hash table, the url text archive (always rebuilt), the url-seen
  * Bloom filter bundle, robots.txt storage (filter, archive, table),
  * the crawl-delay filter, a DNS cache table, and the etag B-Tree.
  *
  * @param string $dir_name folder name used by this WebQueueBundle
  * @param int $filter_size size of each partition in the page exists
  *     BloomFilterBundle
  * @param int $num_urls_ram number of entries in ram for the priority queue
  * @param string $min_or_max when the priority queue maintain the heap
  *     property with respect to the least or the largest weight
  */
 function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max)
 {
     $this->dir_name = $dir_name;
     $this->filter_size = $filter_size;
     $this->num_urls_ram = $num_urls_ram;
     $this->min_or_max = $min_or_max;
     if (!file_exists($this->dir_name)) {
         mkdir($this->dir_name);
     }
     /*
         if we are resuming a crawl we discard the old priority queue and
         associated hash table and archive new queue data will be read in
         from any existing schedule
     */
     // set up the priority queue... stores (hash(url), weight) pairs.
     $this->to_crawl_queue = new PriorityQueue($dir_name . "/queue.dat", $num_urls_ram, self::HASH_KEY_SIZE, $min_or_max, $this, 0);
     /* set up the hash table... stores (hash(url), offset into url archive,
         index in priority queue) triples.
        */
     /*to ensure we can always insert into table, because of how deletions
         work we will periodically want to
         rebuild our table we will also want to give a little more than the
         usual twice the number we want to insert slack
       */
     // hence the 8x (rather than 2x) sizing factor below
     $this->to_crawl_table = $this->constructHashTable($dir_name . "/hash_table.dat", 8 * $num_urls_ram);
     /* set up url archive, used to store the full text of the urls which
          are on the priority queue
        */
     $url_archive_name = $dir_name . "/url_archive" . NonCompressor::fileExtension();
     // any pre-existing url archive is discarded (see resume note above);
     // note the robot archive further down is NOT unlinked like this
     if (file_exists($url_archive_name)) {
         unlink($url_archive_name);
     }
     $this->to_crawl_archive = new WebArchive($url_archive_name, new NonCompressor(), false, true);
     //timestamp for url filters (so can delete if get too old)
     if (!file_exists($dir_name . "/url_timestamp.txt")) {
         file_put_contents($dir_name . "/url_timestamp.txt", time());
     }
     //filter bundle to check if we have already visited a URL
     $this->url_exists_filter_bundle = new BloomFilterBundle($dir_name . "/UrlExistsFilterBundle", $filter_size);
     //timestamp for robot filters (so can delete if get too old)
     if (!file_exists($dir_name . "/robot_timestamp.txt")) {
         file_put_contents($dir_name . "/robot_timestamp.txt", time());
     }
     //filter to check if we have already have a copy of a robot.txt file
     // (reloaded from disk when resuming, freshly created otherwise)
     if (file_exists($dir_name . "/got_robottxt.ftr")) {
         $this->got_robottxt_filter = BloomFilterFile::load($dir_name . "/got_robottxt.ftr");
     } else {
         $this->got_robottxt_filter = new BloomFilterFile($dir_name . "/got_robottxt.ftr", $filter_size);
     }
     /* Hash table containing DNS cache this is cleared whenever robot
          filters cleared
        */
     if (file_exists($dir_name . "/dns_table.dat")) {
         $this->dns_table = HashTable::load($dir_name . "/dns_table.dat");
     } else {
         $this->dns_table = new HashTable($dir_name . "/dns_table.dat", 4 * $num_urls_ram, self::HASH_KEY_SIZE, self::IP_SIZE);
     }
     //set up storage for robots.txt info
     $robot_archive_name = $dir_name . "/robot_archive" . NonCompressor::fileExtension();
     $this->robot_archive = new WebArchive($robot_archive_name, new NonCompressor(), false, true);
     // robot table maps hash(host) to offsets into the robot archive
     // NOTE(review): sized 16x num_urls_ram vs 4x for dns — presumably
     // intentional slack for many hosts; confirm against rebuild logic
     if (file_exists($dir_name . "/robot.dat")) {
         $this->robot_table = HashTable::load($dir_name . "/robot.dat");
     } else {
         $this->robot_table = new HashTable($dir_name . "/robot.dat", 16 * $num_urls_ram, self::HASH_KEY_SIZE, self::INT_SIZE);
     }
     //filter to check for and determine crawl delay
     if (file_exists($dir_name . "/crawl_delay.ftr")) {
         $this->crawl_delay_filter = BloomFilterFile::load($dir_name . "/crawl_delay.ftr");
     } else {
         $this->crawl_delay_filter = new BloomFilterFile($dir_name . "/crawl_delay.ftr", $filter_size);
     }
     //Initialize B-Tree for storing cache page validation data
     $this->etag_btree = new BTree($dir_name . '/EtagExpiresTree');
     // buffer of pending queue-change notifications, flushed elsewhere
     $this->notify_buffer = array();
 }
Example #4
 /**
  * Used to create a filter file suitable for use in word segmentation
  * (splitting text like "thiscontainsnospaces" into
  * "this contains no spaces"). Used by @see token_tool.php
  *
  * For each dictionary word it adds the lowercased word itself plus every
  * proper suffix of length >= 2 prefixed with "*" (used to detect that a
  * partial match could still extend to a dictionary word).
  *
  * @param string $dict_file file to use as a dictionary to make filter from
  * @param string $lang locale tag of locale we are building the filter for
  */
 static function makeSegmentFilterFile($dict_file, $lang)
 {
     $filter_path = LOCALE_DIR . "/{$lang}/resources/" . "segment" . self::FILTER_SUFFIX;
     if (file_exists($filter_path)) {
         unlink($filter_path);
         //build again from scratch
     }
     $words = file($dict_file);
     if ($words === false) {
         // unreadable dictionary: build an empty filter rather than
         // passing false to count() (a fatal TypeError on PHP 8)
         $words = array();
     }
     $filter = new BloomFilterFile($filter_path, count($words));
     foreach ($words as $word) {
         $tmp = trim($word);
         // use an explicit UTF-8 encoding consistently for all mb_*
         // calls (the original only passed it to mb_substr)
         $len = mb_strlen($tmp, "UTF-8");
         $filter->add(mb_strtolower($tmp, "UTF-8"));
         if ($len >= 3) {
             for ($i = 1; $i < $len - 1; $i++) {
                 $tmp2 = "*" . mb_substr($tmp, $i, $len - $i, "UTF-8");
                 // strict comparison (was ==); defensively skip a bare "*"
                 if ($tmp2 === "*") {
                     continue;
                 }
                 $filter->add(mb_strtolower($tmp2, "UTF-8"));
             }
         }
     }
     // NOTE(review): max_gram_len is forced to 1 even though multi-char
     // entries were added above — confirm the segmenter's intended use
     $filter->max_gram_len = 1;
     $filter->save();
 }