PHP IndexArchiveBundle примеры использования

Язык программирования: PHP

Класс/Тип: IndexArchiveBundle

Примеров на hotexamples.com: 3

Найдено 3 примера использования PHP-класса IndexArchiveBundle. Это лучшие примеры PHP-кода для IndexArchiveBundle, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

getArchiveInfo(2)

addIndexData(1)

addPages(1)

getParamModifiedTime(1)

initGenerationToAdd(1)

saveAndAddCurrentShardDictionary(1)

setArchiveInfo(1)

Пример #1

Показать файл

Файл: queue_server.php Проект: yakar/yioop

 /**
  * Records the progress of the current crawl in the crawl status file so
  * that the webserver app can display it, and logs a summary of that
  * progress.
  *
  * A status already on disk is only updated when its CRAWL_TIME matches
  * the current crawl; otherwise it is discarded and rebuilt. The list of
  * (timestamp, visited url count) samples kept in $this->hourly_crawl_data
  * is trimmed from its older end and the newest sample is put at the front.
  *
  * @param array $sites contains the most recently crawled sites; only the
  *     entry stored under self::RECENT_URLS is read
  */
 function writeCrawlStatus(&$sites)
 {
     $status_path = CRAWL_DIR . "/schedules/crawl_status.txt";
     $status = array();
     if (file_exists($status_path)) {
         $status = unserialize(file_get_contents($status_path));
         $from_this_crawl = isset($status['CRAWL_TIME']) && $status['CRAWL_TIME'] == $this->crawl_time;
         if (!$from_this_crawl) {
             // the file holds the status of some other crawl, start afresh
             $status = array();
         }
     }
     $status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
     if (isset($sites[self::RECENT_URLS])) {
         $status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
     }
     $status['CRAWL_TIME'] = $this->crawl_time;
     $bundle_dir = CRAWL_DIR . '/cache/' . self::index_data_base_name . $this->crawl_time;
     $info_bundle = IndexArchiveBundle::getArchiveInfo($bundle_dir);
     $index_archive_info = unserialize($info_bundle['DESCRIPTION']);
     $status['COUNT'] = $info_bundle['COUNT'];
     $now = time();
     // Pop samples off the old end of the history until one that is at most
     // an hour old turns up (or nothing is left); that sample is kept.
     $elapsed = ONE_HOUR + 1;
     while ($elapsed > ONE_HOUR && count($this->hourly_crawl_data) > 0) {
         $oldest_sample = array_pop($this->hourly_crawl_data);
         $elapsed = $now - $oldest_sample[0];
     }
     if ($elapsed <= ONE_HOUR) {
         array_push($this->hourly_crawl_data, $oldest_sample);
     }
     // the newest sample always goes to the front of the history
     array_unshift($this->hourly_crawl_data, array($now, $info_bundle['VISITED_URLS_COUNT']));
     $status['VISITED_COUNT_HISTORY'] = $this->hourly_crawl_data;
     $status['VISITED_URLS_COUNT'] = $info_bundle['VISITED_URLS_COUNT'];
     $status['DESCRIPTION'] = $index_archive_info['DESCRIPTION'];
     $status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
     file_put_contents($status_path, serialize($status), LOCK_EX);
     chmod($status_path, 0777);
     crawlLog("End checking for new URLs data memory usage" . memory_get_usage());
     crawlLog("The current crawl description is: {$index_archive_info['DESCRIPTION']}");
     crawlLog("Number of unique pages so far: {$info_bundle['VISITED_URLS_COUNT']}");
     crawlLog("Total urls extracted so far: {$info_bundle['COUNT']}");
     if (!isset($sites[self::RECENT_URLS])) {
         return;
     }
     crawlLog("Of these, the most recent urls are:");
     foreach ($sites[self::RECENT_URLS] as $recent_url) {
         // characters without an ISO-8859-1 equivalent are dropped in the log
         crawlLog("URL: " . iconv("UTF-8", "ISO-8859-1//IGNORE", $recent_url));
     }
 }

Пример #2

Показать файл

Файл: recipe_plugin.php Проект: yakar/yioop

 /**
  * Implements post processing of recipes. Documents tagged as recipes are
  * fetched from the index, their ingredient lists are scrubbed, and the
  * recipes are clustered. The clustered recipes are added back to the
  * index as a new shard whose documents carry an "ingredient:..." meta word.
  *
  * Outline:
  *  1. Page through the results of the query "recipe:all i:$index_name",
  *     100 documents at a time, until every save point returned is -1 or
  *     no save points are returned.
  *  2. Split each description on "||" into ingredients and reduce every
  *     ingredient to its name with getIngredientName(); empty entries and
  *     entries ending in ":" are dropped.
  *  3. Compute a weight for every pair of recipes and hand the weights to
  *     kruskalClustering().
  *  4. Build an IndexShard from the clusters and add it, together with the
  *     recipe summaries, to the crawl's IndexArchiveBundle.
  *
  * Nothing is clustered unless count(recipes found) * CLUSTER_RATIO > 1.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         // Reset for every batch: a response without "PAGES" must neither
         // read an undefined variable nor reuse the previous batch's count.
         $num_results = 0;
         if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             // keep going while at least one save point is not -1
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
     // only cluster if would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         // Each usable recipe becomes array(title, ingredients, url hash,
         // original summary); recipes with at most one "||" piece are skipped.
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         // Scrub the ingredient lists: replace each ingredient by the name
         // getIngredientName() extracts, dropping empty entries, entries
         // ending in ":" and entries for which no name is found.
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         // ingredients in this list are never chosen as a recipe's main ingredient
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         // Initialized up front so they are defined (and empty) even when
         // fewer than two usable recipes survived the scrubbing above.
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         // NOTE(review): these three maps are keyed by recipe title, so
         // recipes sharing a title overwrite each other -- confirm intended.
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 // 0/1 vectors over the union of both ingredient lists
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     // main ingredient: a non-basic ingredient occurring in
                     // the lowercased title (the last such one wins)
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 // $edge_weight accumulates the squared differences of the
                 // two vectors; $matches ends up as 1 plus the number of
                 // ingredients present in only one of the two recipes.
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 // differing main ingredients scale the weight by 1000
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             // A cluster is read as recipe titles at indices 0..count-2 plus
             // an "ingredient" entry, hence the "- 1" below.
             // NOTE(review): shape inferred from the accesses here; confirm
             // against kruskalClustering().
             $count = count($cluster);
             for ($i = 0; $i < $count - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 // document key: url hash . page hash . "r"-prefixed host hash
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         // round-trip the shard through its saved string form
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
         // $recipe_sites is handed over as a variable and its
         // self::SUMMARY_OFFSET entries are read right below.
         // NOTE(review): presumably addPages() fills them in by reference; verify.
         $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         // Map every document key (rebuilt exactly as above) to its summary
         // offset; starts empty so it is defined when there are no sites.
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }

Пример #3

Показать файл

Файл: crawl_model.php Проект: yakar/yioop

 /**
  * Gets a list of all index archives of crawls that have been conducted
  *
  * When $machine_urls names something other than a single localhost, the
  * list is requested from those machines and aggregated; with $cache set,
  * a cached copy younger than 300 seconds is returned instead and a fresh
  * result is written back to the cache file. Otherwise the list is built
  * from the directories under CRAWL_DIR/cache (and, if requested,
  * CRAWL_DIR/archives).
  *
  * @param bool $return_arc_bundles whether index bundles used for indexing
  *     arc or other archive bundles should be included in the list
  * @param bool $return_recrawls whether index archive bundles generated as
  *     a result of recrawling should be included in the result
  * @param array $machine_urls an array of urls of yioop queue servers
  * @param bool $cache whether to try to get/set the data to a cache file
  *
  * @return array available IndexArchiveBundle directories and
  *     their meta information this meta information includes the time of
  *     the crawl, its description, the number of pages downloaded, and the
  *     number of partitions used in storing the inverted index. Each entry
  *     has the keys CRAWL_TIME, DESCRIPTION, VISITED_URLS_COUNT, COUNT,
  *     NUM_DOCS_PER_PARTITION and WRITE_PARTITION; index bundles also have
  *     RESUMABLE, arc bundles also have ARC_DIR and ARC_TYPE
  */
 function getCrawlList($return_arc_bundles = false, $return_recrawls = false, $machine_urls = NULL, $cache = false)
 {
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
         // encode the two flags as 0..3; used as the remote argument and in
         // the cache file name
         $arg = $return_arc_bundles && $return_recrawls ? 3 : ($return_recrawls ? 2 : ($return_arc_bundles ? 1 : 0));
         $cache_file = CRAWL_DIR . "/cache/" . self::network_crawllist_base_name . "{$arg}.txt";
         if ($cache && file_exists($cache_file) && filemtime($cache_file) + 300 > time()) {
             return unserialize(file_get_contents($cache_file));
         }
         $list_strings = $this->execMachines("getCrawlList", $machine_urls, $arg);
         $list = $this->aggregateCrawlList($list_strings);
         if ($cache) {
             file_put_contents($cache_file, serialize($list));
         }
         return $list;
     }
     $list = array();
     $dirs = glob(CRAWL_DIR . '/cache/' . self::index_data_base_name . '*', GLOB_ONLYDIR);
     if (!is_array($dirs)) {
         // glob() yields false on error (on some systems also for no match)
         $dirs = array();
     }
     foreach ($dirs as $dir) {
         $crawl = array();
         // the crawl timestamp is whatever follows the base name in the path
         $pre_timestamp = strstr($dir, self::index_data_base_name);
         $crawl['CRAWL_TIME'] = substr($pre_timestamp, strlen(self::index_data_base_name));
         $info = IndexArchiveBundle::getArchiveInfo($dir);
         $index_info = @unserialize($info['DESCRIPTION']);
         $crawl['DESCRIPTION'] = "";
         if (!$return_recrawls && isset($index_info[self::CRAWL_TYPE]) && $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
             continue;
         } else {
             if ($return_recrawls && isset($index_info[self::CRAWL_TYPE]) && $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
                 $crawl['DESCRIPTION'] = "RECRAWL::";
             }
         }
         $sched_path = CRAWL_DIR . '/schedules/' . self::schedule_data_base_name . $crawl['CRAWL_TIME'];
         // A crawl is marked resumable when some sub-folder of its schedule
         // folder contains an entry whose name starts with "At".
         $crawl['RESUMABLE'] = false;
         if (is_dir($sched_path) && ($sched_dir = opendir($sched_path)) !== false) {
             // The loop stops as soon as a match is found; both directory
             // handles are closed on every path out of the loops.
             while (!$crawl['RESUMABLE'] && ($name = readdir($sched_dir)) !== false) {
                 $sub_path = "{$sched_path}/{$name}";
                 if (!is_dir($sub_path) || $name == '.' || $name == '..') {
                     continue;
                 }
                 $sub_dir = opendir($sub_path);
                 if ($sub_dir === false) {
                     // unreadable sub-folder: nothing to inspect
                     continue;
                 }
                 while (($sub_name = readdir($sub_dir)) !== false) {
                     // prefix test that is safe for one-character names
                     if (strncmp($sub_name, 'At', 2) === 0) {
                         $crawl['RESUMABLE'] = true;
                         break;
                     }
                 }
                 closedir($sub_dir);
             }
             closedir($sched_dir);
         }
         if (isset($index_info['DESCRIPTION'])) {
             $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
         }
         $crawl['VISITED_URLS_COUNT'] = isset($info['VISITED_URLS_COUNT']) ? $info['VISITED_URLS_COUNT'] : 0;
         $crawl['COUNT'] = isset($info['COUNT']) ? $info['COUNT'] : 0;
         $crawl['NUM_DOCS_PER_PARTITION'] = isset($info['NUM_DOCS_PER_PARTITION']) ? $info['NUM_DOCS_PER_PARTITION'] : 0;
         $crawl['WRITE_PARTITION'] = isset($info['WRITE_PARTITION']) ? $info['WRITE_PARTITION'] : 0;
         $list[] = $crawl;
     }
     if ($return_arc_bundles) {
         $dirs = glob(CRAWL_DIR . '/archives/*', GLOB_ONLYDIR);
         if (!is_array($dirs)) {
             $dirs = array();
         }
         foreach ($dirs as $dir) {
             $crawl = array();
             // arc bundles have no crawl time; the path's crc32 stands in for it
             $crawl['CRAWL_TIME'] = crc32($dir);
             $crawl['DESCRIPTION'] = "ARCFILE::";
             $crawl['ARC_DIR'] = $dir;
             $ini_file = "{$dir}/arc_description.ini";
             if (!file_exists($ini_file)) {
                 // folders without a description file are not listed
                 continue;
             } else {
                 $ini = parse_ini_with_fallback($ini_file);
                 $crawl['ARC_TYPE'] = $ini['arc_type'];
                 $crawl['DESCRIPTION'] .= $ini['description'];
             }
             $crawl['VISITED_URLS_COUNT'] = 0;
             $crawl['COUNT'] = 0;
             $crawl['NUM_DOCS_PER_PARTITION'] = 0;
             $crawl['WRITE_PARTITION'] = 0;
             $list[] = $crawl;
         }
     }
     return $list;
 }