Пример #1
0
 /**
  * Writes status information about the current crawl so that the webserver
  * app can use it for its display.
  *
  * @param array $sites contains the most recently crawled sites
  */
 function writeCrawlStatus(&$sites)
 {
     $stat_file = CRAWL_DIR . "/schedules/crawl_status.txt";
     $crawl_status = array();
     if (file_exists($stat_file)) {
         $crawl_status = unserialize(file_get_contents($stat_file));
         // Start from an empty status when the file could not be decoded
         // into an array or when it holds the status of some other crawl.
         if (!is_array($crawl_status) || !isset($crawl_status['CRAWL_TIME']) || $crawl_status['CRAWL_TIME'] != $this->crawl_time) {
             $crawl_status = array();
         }
     }
     $crawl_status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
     if (isset($sites[self::RECENT_URLS])) {
         $crawl_status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
     }
     $crawl_status['CRAWL_TIME'] = $this->crawl_time;

     $info_bundle = IndexArchiveBundle::getArchiveInfo(CRAWL_DIR . '/cache/' . self::index_data_base_name . $this->crawl_time);
     // The archive info may lack any of these keys (getCrawlList() guards
     // the same reads), so fall back to neutral defaults rather than
     // reading undefined indexes.
     $total_count = isset($info_bundle['COUNT']) ? $info_bundle['COUNT'] : 0;
     $visited_urls_count = isset($info_bundle['VISITED_URLS_COUNT']) ? $info_bundle['VISITED_URLS_COUNT'] : 0;
     $index_archive_info = isset($info_bundle['DESCRIPTION']) ? unserialize($info_bundle['DESCRIPTION']) : false;
     $description = "";
     if (is_array($index_archive_info) && isset($index_archive_info['DESCRIPTION'])) {
         $description = $index_archive_info['DESCRIPTION'];
     }
     $crawl_status['COUNT'] = $total_count;

     // hourly_crawl_data is kept newest-first. Drop entries from the tail
     // until one that is at most an hour old is found, and put that one back.
     $now = time();
     $change_in_time = ONE_HOUR + 1;
     while (count($this->hourly_crawl_data) > 0 && $change_in_time > ONE_HOUR) {
         $least_recent_hourly_pair = array_pop($this->hourly_crawl_data);
         $change_in_time = $now - $least_recent_hourly_pair[0];
     }
     if ($change_in_time <= ONE_HOUR) {
         $this->hourly_crawl_data[] = $least_recent_hourly_pair;
     }
     array_unshift($this->hourly_crawl_data, array($now, $visited_urls_count));

     $crawl_status['VISITED_COUNT_HISTORY'] = $this->hourly_crawl_data;
     $crawl_status['VISITED_URLS_COUNT'] = $visited_urls_count;
     $crawl_status['DESCRIPTION'] = $description;
     $crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
     // Exclusive lock while writing the status file.
     file_put_contents($stat_file, serialize($crawl_status), LOCK_EX);
     chmod($stat_file, 0777);

     crawlLog("End checking for new URLs data memory usage" . memory_get_usage());
     crawlLog("The current crawl description is: " . $description);
     crawlLog("Number of unique pages so far: " . $visited_urls_count);
     crawlLog("Total urls extracted so far: " . $total_count);
     if (isset($sites[self::RECENT_URLS])) {
         crawlLog("Of these, the most recent urls are:");
         foreach ($sites[self::RECENT_URLS] as $url) {
             // Log in ISO-8859-1, dropping characters that cannot be converted.
             crawlLog("URL: " . iconv("UTF-8", "ISO-8859-1//IGNORE", $url));
         }
     }
 }
Пример #2
0
 /**
  * Implements post processing of recipes. Recipes are extracted,
  * ingredients are scrubbed, and recipes are clustered. The clustered
  * recipes are added back to the index.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");

     // Phase 1: page through the index collecting every doc tagged as a recipe.
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         // Reset for every batch: a batch without "PAGES" must count as zero
         // results rather than reuse the previous batch's count (or be
         // undefined on the first pass), otherwise $limit drifts.
         $num_results = 0;
         if (isset($results["PAGES"])) {
             $num_results = count($results["PAGES"]);
             if ($num_results > 0) {
                 $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
             }
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             // Keep going only while at least one save point is not -1.
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
     // only cluster if would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         // Phase 2: keep docs whose description splits into more than one
         // "||"-separated ingredient. Each entry is
         // array(title, ingredients, url hash, raw summary).
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         // Phase 3: reduce each ingredient line to its ingredient name;
         // drop empty lines, lines ending in ":" and lines for which no
         // name is returned.
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         // Ingredients that are never picked as a recipe's main ingredient.
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         // Initialised up front so that they are defined (as empty arrays)
         // even when no recipe survived the filtering above.
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         // Phase 4: compute an edge weight for every unordered pair of recipes.
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 // 0/1 membership vectors over the union of both ingredient lists.
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     // A non-basic ingredient that appears in the title is
                     // taken as the main ingredient (the last such one wins).
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 // Sum of squared differences; $matches is 1 plus the number
                 // of ingredients present in exactly one of the two recipes.
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += pow($diff, 2);
                 }
                 // Differing main ingredients multiply the weight by 1000.
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         // Phase 5: build a shard whose docs are the clustered recipes, each
         // tagged with the meta word "ingredient:<cluster ingredient>".
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             // The loop stops one short of count($cluster); presumably the
             // extra element is the "ingredient" entry read below.
             $cluster_size = count($cluster);
             for ($i = 0; $i < $cluster_size - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 // Doc key = url hash . page hash . "r"-prefixed host hash.
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         // Save the shard to a string and load it back before adding it
         // to the bundle.
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
         $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         // Map every doc key to the summary offset found on its site entry
         // (read after addPages, which received $recipe_sites).
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
Пример #3
0
 /**
  * Gets a list of all index archives of crawls that have been conducted
  *
  * @param bool $return_arc_bundles whether index bundles used for indexing
  *     arc or other archive bundles should be included in the list
  * @param bool $return_recrawls whether index archive bundles generated as
  *     a result of recrawling should be included in the result
  * @param array $machine_urls an array of urls of yioop queue servers
  * @param bool $cache whether to try to get/set the data to a cache file
  *
  * @return array available IndexArchiveBundle directories and
  *     their meta information. This meta information includes the time of
  *     the crawl, its description, the number of pages downloaded, and the
  *     number of partitions used in storing the inverted index
  */
 function getCrawlList($return_arc_bundles = false, $return_recrawls = false, $machine_urls = NULL, $cache = false)
 {
     // Network case: ask the queue servers and optionally cache the
     // aggregated answer for 300 seconds.
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
         // Encode the two flags as 0..3 (bit 0: arc bundles, bit 1: recrawls).
         $arg = $return_arc_bundles && $return_recrawls ? 3 : ($return_recrawls ? 2 : ($return_arc_bundles ? 1 : 0));
         $cache_file = CRAWL_DIR . "/cache/" . self::network_crawllist_base_name . "{$arg}.txt";
         if ($cache && file_exists($cache_file) && filemtime($cache_file) + 300 > time()) {
             return unserialize(file_get_contents($cache_file));
         }
         $list_strings = $this->execMachines("getCrawlList", $machine_urls, $arg);
         $list = $this->aggregateCrawlList($list_strings);
         if ($cache) {
             file_put_contents($cache_file, serialize($list));
         }
         return $list;
     }

     // Local case: one entry per index bundle directory in the cache folder.
     $list = array();
     $dirs = glob(CRAWL_DIR . '/cache/' . self::index_data_base_name . '*', GLOB_ONLYDIR);
     if (!is_array($dirs)) {
         // glob() returns false on error
         $dirs = array();
     }
     foreach ($dirs as $dir) {
         $crawl = array();
         // The crawl time is whatever follows the base name in the path.
         $pre_timestamp = strstr($dir, self::index_data_base_name);
         $crawl['CRAWL_TIME'] = substr($pre_timestamp, strlen(self::index_data_base_name));
         $info = IndexArchiveBundle::getArchiveInfo($dir);
         $index_info = @unserialize($info['DESCRIPTION']);
         $crawl['DESCRIPTION'] = "";
         $is_archive_crawl = isset($index_info[self::CRAWL_TYPE]) && $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL;
         if ($is_archive_crawl) {
             if (!$return_recrawls) {
                 continue;
             }
             $crawl['DESCRIPTION'] = "RECRAWL::";
         }

         // A crawl is resumable when some sub-folder of its schedule
         // directory contains an entry whose name starts with "At".
         $sched_path = CRAWL_DIR . '/schedules/' . self::schedule_data_base_name . $crawl['CRAWL_TIME'];
         $crawl['RESUMABLE'] = false;
         if (is_dir($sched_path)) {
             $sched_dir = opendir($sched_path);
             if ($sched_dir !== false) {
                 // The loops end through their conditions / a single-level
                 // break so that both directory handles are always closed
                 // (the former "break 2" skipped both closedir() calls).
                 while (!$crawl['RESUMABLE'] && ($name = readdir($sched_dir)) !== false) {
                     $sub_path = "{$sched_path}/{$name}";
                     if (!is_dir($sub_path) || $name == '.' || $name == '..') {
                         continue;
                     }
                     $sub_dir = opendir($sub_path);
                     if ($sub_dir === false) {
                         continue;
                     }
                     // NOTE(review): the original loop also tested "$i < 5"
                     // but never incremented $i, so every entry was scanned;
                     // that effective behavior is kept here.
                     while (($sub_name = readdir($sub_dir)) !== false) {
                         // strncmp is safe for names shorter than two bytes.
                         if (strncmp($sub_name, 'At', 2) === 0) {
                             $crawl['RESUMABLE'] = true;
                             break;
                         }
                     }
                     closedir($sub_dir);
                 }
                 closedir($sched_dir);
             }
         }

         if (isset($index_info['DESCRIPTION'])) {
             $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
         }
         $crawl['VISITED_URLS_COUNT'] = isset($info['VISITED_URLS_COUNT']) ? $info['VISITED_URLS_COUNT'] : 0;
         $crawl['COUNT'] = isset($info['COUNT']) ? $info['COUNT'] : 0;
         $crawl['NUM_DOCS_PER_PARTITION'] = isset($info['NUM_DOCS_PER_PARTITION']) ? $info['NUM_DOCS_PER_PARTITION'] : 0;
         $crawl['WRITE_PARTITION'] = isset($info['WRITE_PARTITION']) ? $info['WRITE_PARTITION'] : 0;
         $list[] = $crawl;
     }

     // Optionally append archive folders that carry an arc_description.ini.
     if ($return_arc_bundles) {
         $dirs = glob(CRAWL_DIR . '/archives/*', GLOB_ONLYDIR);
         if (!is_array($dirs)) {
             $dirs = array();
         }
         foreach ($dirs as $dir) {
             $ini_file = "{$dir}/arc_description.ini";
             if (!file_exists($ini_file)) {
                 continue;
             }
             $crawl = array();
             // Archive folders have no timestamp; a checksum of the path
             // is used in the CRAWL_TIME slot instead.
             $crawl['CRAWL_TIME'] = crc32($dir);
             $crawl['DESCRIPTION'] = "ARCFILE::";
             $crawl['ARC_DIR'] = $dir;
             $ini = parse_ini_with_fallback($ini_file);
             $crawl['ARC_TYPE'] = $ini['arc_type'];
             $crawl['DESCRIPTION'] .= $ini['description'];
             $crawl['VISITED_URLS_COUNT'] = 0;
             $crawl['COUNT'] = 0;
             $crawl['NUM_DOCS_PER_PARTITION'] = 0;
             $crawl['WRITE_PARTITION'] = 0;
             $list[] = $crawl;
         }
     }
     return $list;
 }