/** * Builds an inverted index shard (word --> {docs it appears in}) * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. * This inverted index shard is then merged by a queue_server * into the inverted index of the current generation of the crawl. * The complete inverted index for the whole crawl is built out of these * inverted indexes for generations. The point of computing a partial * inverted index on the fetcher is to reduce some of the computational * burden on the queue server. The resulting mini index computed by * buildMiniInvertedIndex() is stored in * $this->found_sites[self::INVERTED_INDEX] * */ function buildMiniInvertedIndex() { $start_time = microtime(); $keypad = ""; crawlLog(" Start building mini inverted index ... Current Memory:" . memory_get_usage()); $num_seen = count($this->found_sites[self::SEEN_URLS]); $this->num_seen_sites += $num_seen; /* for the fetcher we are not saving the index shards so name doesn't matter. */ if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) { $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}"); } for ($i = 0; $i < $num_seen; $i++) { $interim_time = microtime(); $site = $this->found_sites[self::SEEN_URLS][$i]; if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) { continue; } $doc_rank = false; if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) { $doc_rank = $this->archive_iterator->weight($site); } if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") { $is_link = true; $doc_keys = $site[self::HTTP_CODE]; $site_url = $site[self::TITLE]; $host = UrlParser::getHost($site_url); $link_parts = explode('|', $site[self::HASH]); if (isset($link_parts[5])) { $link_origin = $link_parts[5]; } else { $link_origin = $site_url; } $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin); } else { $is_link = false; $site_url = str_replace('|', "%7C", $site[self::URL]); $host = UrlParser::getHost($site_url); $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1); $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources); } $word_lists = array(); /* self::JUST_METAS check to avoid getting sitemaps in results for popular words */ $lang = NULL; if (!isset($site[self::JUST_METAS])) { $host_words = UrlParser::getWordsIfHostUrl($site_url); $path_words = UrlParser::getWordsLastPathPartUrl($site_url); if ($is_link) { $phrase_string = $site[self::DESCRIPTION]; } else { if (isset($site[self::LANG])) { if (isset($this->programming_language_extension[$site[self::LANG]])) { $phrase_string = $site[self::DESCRIPTION]; } else { $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION]; } } else { $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . 
$site[self::DESCRIPTION]; } } if (isset($site[self::LANG])) { $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]); } $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); $len = strlen($phrase_string); if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) { $meta_ids[] = "safe:true"; $safe = true; } else { $meta_ids[] = "safe:false"; $safe = false; } } if (!$is_link) { //store inlinks so they can be searched by $num_links = count($site[self::LINKS]); if ($num_links > 0) { $link_rank = false; if ($doc_rank !== false) { $link_rank = max($doc_rank - 1, 1); } } else { $link_rank = false; } } $num_queue_servers = count($this->queue_servers); if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) { $score_keys = ""; foreach ($site[self::USER_RANKS] as $label => $score) { $score_keys .= packInt($score); } if (strlen($score_keys) % 8 != 0) { $score_keys .= $keypad; } $doc_keys .= $score_keys; } $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank); /* $this->no_process_links is set when doing things like mix recrawls. In this case links likely already will appear in what indexing, so don't index again. $site[self::JUST_META] is set when have a sitemap or robots.txt (this case set later). In this case link info is not particularly useful for indexing and can greatly slow building inverted index. */ if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) { foreach ($site[self::LINKS] as $url => $link_text) { /* this mysterious check means won't index links from robots.txt. Sitemap will still be in TO_CRAWL, but that's done elsewhere */ if (strlen($url) == 0 || is_numeric($url)) { continue; } $link_host = UrlParser::getHost($url); if (strlen($link_host) == 0) { continue; } $part_num = calculatePartition($link_host, $num_queue_servers); $summary = array(); if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) { $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array(); } $elink_flag = $link_host != $host ? true : false; $link_text = strip_tags($link_text); $ref = $elink_flag ? "eref" : "iref"; $url = str_replace('|', "%7C", $url); $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url; $elink_flag_string = $elink_flag ? "e" : "i"; $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . 
"/", true), 1); $summary[self::URL] = $link_id; $summary[self::TITLE] = $url; // stripping html to be on the safe side $summary[self::DESCRIPTION] = $link_text; $summary[self::TIMESTAMP] = $site[self::TIMESTAMP]; $summary[self::ENCODING] = $site[self::ENCODING]; $summary[self::HASH] = $link_id; $summary[self::TYPE] = "link"; $summary[self::HTTP_CODE] = $link_keys; $summary[self::LANG] = $lang; $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary; $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang); $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url); if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) { $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}"); } $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank); } } $iterim_elapse = changeInMicrotime($interim_time); if ($iterim_elapse > 5) { crawlLog("..Inverting " . $site[self::URL] . "...took > 5s."); } crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]); } if ($this->crawl_type == self::ARCHIVE_CRAWL) { $this->recrawl_check_scheduler = true; } crawlLog(" Build mini inverted index time " . changeInMicrotime($start_time)); }
/**
 * Gets $num many objects out of the web archive starting at byte $offset
 *
 * If the $next_flag is true the archive iterator is advanced and if $fh
 * is not NULL then it is assumed to be an open resource pointing to the
 * archive (saving the time to open it).
 *
 * @param int $offset a valid byte offset into a web archive
 * @param int $num number of objects to return
 * @param bool $next_flag whether to advance the archive iterator
 * @param resource $fh either NULL or a file resource to the archive
 * @return array the $num objects beginning at $offset
 */
function getObjects($offset, $num, $next_flag = true, $fh = NULL)
{
    $open_flag = false;
    if ($fh == NULL) {
        $fh = $this->open();
        $open_flag = true;
    }
    $is_string = $this->is_string;
    $objects = array();
    $compressed_int_len = $this->compressor->compressedIntLen();
    if ($is_string) {
        $storage_len = strlen($this->storage);
    }
    if ((!$is_string && fseek($fh, $offset) == 0) ||
        ($is_string && $offset < $storage_len)) {
        for ($i = 0; $i < $num; $i++) {
            if (!$is_string && feof($fh)) {
                break;
            }
            if ($is_string && $offset >= $storage_len) {
                break;
            }
            $object = NULL;
            $compressed_len = $is_string ?
                substr($this->storage, $offset, $compressed_int_len) :
                fread($fh, $compressed_int_len);
            $len = $this->compressor->uncompressInt($compressed_len);
            if ($len > 0 && $len < MAX_ARCHIVE_OBJECT_SIZE) {
                $compressed_file = $is_string ?
                    substr($this->storage, $offset + $compressed_int_len,
                    $len) : fread($fh, $len);
                $file = $this->compressor->uncompress($compressed_file);
                $object = @unserialize($file);
                $offset += $compressed_int_len + $len;
                $objects[] = array($offset, $object);
            } else {
                crawlLog("Web archive saw blank line " .
                    "when looked for offset {$offset}");
            }
        }
        if ($next_flag) {
            $this->iterator_pos = $offset;
        }
    }
    if ($open_flag) {
        $this->close($fh);
    }
    return $objects;
}
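A usage sketch (assumptions: $web_archive is an already constructed WebArchive object and $offset was recorded earlier when the objects were written to the archive):

// Read up to 5 objects starting at a known byte offset without moving
// the iterator; each returned entry is array(offset_after_read, object).
$results = $web_archive->getObjects($offset, 5, false);
foreach ($results as $entry) {
    list($next_offset, $object) = $entry;
    // $object is the unserialized array that was stored in the archive
}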
/** * Deletes file associated with given node from disk * @param int $id is the id of the node whose file is to be deleted */ function deleteNodeFile($id) { $node_file = $this->dir . "/{$id}.txt"; if (file_exists($node_file)) { unlink($node_file); } else { crawlLog("Could not delete node {$id} from disk"); } }
/**
 * Inserts the provided $key - $value pair into the hash table
 *
 * @param string $key the key to use for the insert (will be needed for
 *     lookup)
 * @param string $value the value associated with $key
 * @param int $probe if the location in the hash table is already known
 *     to be $probe then this variable can be used to save a lookup
 * @return bool whether the insert was successful or not
 */
function insert($key, $value, $probe = false)
{
    $null = $this->null;
    $deleted = $this->deleted;
    if ($probe === false) {
        $probe = $this->lookup($key, self::ALWAYS_RETURN_PROBE);
    }
    if ($probe === false) {
        /* this is a little slow; the idea is we can't use deleted slots
           until we are sure $key isn't in the table
         */
        $probe = $this->lookupArray($key, array($null, $deleted),
            self::ALWAYS_RETURN_PROBE);
        if ($probe === false) {
            crawlLog("No space in hash table");
            return false;
        }
    }
    //there was a free slot so write entry...
    $data = pack("x" . ($this->key_size + $this->value_size));
    if (strlen($value) < $this->value_size) {
        /* this case should not happen; rather than give an error
           we null pad the string to the desired length
         */
        $value = str_pad($value, $this->value_size, "\0");
    }
    //first the key
    for ($i = 0; $i < $this->key_size; $i++) {
        $data[$i] = $key[$i];
    }
    //then the value
    for ($i = 0; $i < $this->value_size; $i++) {
        $data[$i + $this->key_size] = $value[$i];
    }
    $this->put($probe, $data);
    $this->count++;
    $this->checkSave();
    return true;
}
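A usage sketch under assumed sizes (an 8-byte key and an 8-byte value; construction of the HashTable itself is omitted, and crawlHash()/packInt() are used with the signatures seen elsewhere in this codebase):

// Hypothetical example: store a 4-byte count and a 4-byte flag under a
// hashed url key.
$key = crawlHash("http://www.example.com/", true);   // 8 raw hash bytes
$value = packInt(42) . packInt(0);                    // exactly value_size bytes
if (!$hash_table->insert($key, $value)) {
    crawlLog("Hash table is full; insert failed");
}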
/** * Given the results of a getPage call, check whether or not the response * had the words NOTICE, WARNING, FATAL which might indicate an error on * the server. If it does, then the $response string is sent to the * crawlLog * * @param string $response getPage response in which to check for errors */ static function checkResponseForErrors($response) { if (preg_match("/NOTICE|WARNING|FATAL/i", $response)) { crawlLog("There appears to have been an error in the server " . "response. Response was:"); crawlLog(wordwrap($response)); } }
/**
 * Used to stop a daemon that is running in the background
 *
 * @param string $name the main name of this daemon such as queue_server
 *     or fetcher.
 * @param string $subname the instance name if it is possible for more
 *     than one copy of the daemon to be running at the same time
 * @param bool $exit whether this method should just return (false) or
 *     call exit() (true)
 */
static function stop($name, $subname = "", $exit = true)
{
    $name_string = CrawlDaemon::getNameString($name, $subname);
    $lock_file = CrawlDaemon::getLockFileName($name, $subname);
    $not_web_setting = php_sapi_name() == 'cli';
    if (file_exists($lock_file)) {
        unlink($lock_file);
        if ($not_web_setting) {
            crawlLog("Sending stop signal to {$name_string}...");
        }
    } else {
        if ($not_web_setting) {
            crawlLog("{$name_string} does not appear to be running...");
        }
    }
    if ($exit) {
        exit;
    }
}
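For illustration, a controlling script that wants to shut down a fetcher instance but keep running itself might call the method like this:

// Remove the lock file for the "fetcher" daemon (no subname) and return
// instead of exiting, so the calling script can continue.
CrawlDaemon::stop("fetcher", "", false);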
/** * Implements post processing of recipes. recipes are extracted * ingredients are scrubbed and recipes are clustered. The clustered * recipes are added back to the index. * * @param string $index_name index name of the current crawl. */ function postProcessing($index_name) { global $INDEXING_PLUGINS; if (!class_exists("SplHeap")) { crawlLog("...Recipe Plugin Requires SPLHeap for clustering!"); crawlLog("...Aborting plugin"); return; } $locale_tag = guessLocale(); setLocaleObject($locale_tag); $search_controller = new SearchController($INDEXING_PLUGINS); $query = "recipe:all i:{$index_name}"; crawlLog("...Running Recipe Plugin!"); crawlLog("...Finding docs tagged as recipes."); $more_docs = true; $raw_recipes = array(); $limit = 0; $num = 100; while ($more_docs) { $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name); if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) { $raw_recipes = array_merge($raw_recipes, $results["PAGES"]); } crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . "."); $limit += $num_results; if (isset($results["SAVE_POINT"])) { $end = true; foreach ($results["SAVE_POINT"] as $save_point) { if ($save_point != -1) { $end = false; } } if ($end) { $more_docs = false; } } else { $more_docs = false; } } crawlLog("...Clustering."); // only cluster if would make more than one cluster if (count($raw_recipes) * CLUSTER_RATIO > 1) { $recipes = array(); $i = 0; foreach ($raw_recipes as $raw_recipe) { $description = $raw_recipe[self::DESCRIPTION]; $ingredients = explode("||", $description); if (is_array($ingredients) && count($ingredients) > 1) { $recipes[$i][0] = $raw_recipe[self::TITLE]; $recipes[$i][1] = $ingredients; $recipes[$i][2] = crawlHash($raw_recipe[self::URL]); $recipes[$i][3] = $raw_recipe; $i++; } } $recipes_ingredients = array(); $count = count($recipes); foreach ($recipes as $key => $recipe) { foreach ($recipe[1] as $index => $ingredient) { if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") { $mainIngredient = $this->getIngredientName((string) $ingredient); if (strlen($mainIngredient) != 0) { $recipe[1][$index] = $mainIngredient; } else { unset($recipe[1][$index]); } } else { unset($recipe[1][$index]); } } $recipes[$key] = $recipe; } $count = count($recipes); $k = 0; $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray'); for ($i = 0; $i < $count; $i++) { $recipe1_main_ingredient = ""; $recipe1 = $recipes[$i][1]; $recipe_name = $recipes[$i][0]; $recipe1_title = strtolower($recipes[$i][0]); $distinct_ingredients[$recipe_name] = $recipes[$i][1]; $doc_keys[$recipe_name] = $recipes[$i][2]; $recipes_summary[$recipe_name] = $recipes[$i][3]; for ($j = $i + 1; $j < $count; $j++) { $recipe2_main_ingredient = ""; $recipe2 = $recipes[$j][1]; $recipe2_title = strtolower($recipes[$j][0]); $weights[$k][0] = $recipes[$i][0]; $weights[$k][1] = $recipes[$j][0]; $merge_array = array_merge($recipe1, $recipe2); $vector_array = 
array_unique($merge_array); sort($vector_array); $recipe1_vector = array_fill_keys($vector_array, 0); $recipe2_vector = array_fill_keys($vector_array, 0); foreach ($recipe1 as $ingredient) { if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) { if (strstr($recipe1_title, $ingredient)) { $recipe1_main_ingredient = $ingredient; } } $recipe1_vector[$ingredient] = 1; } foreach ($recipe2 as $ingredient) { if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) { if (strstr($recipe2_title, $ingredient)) { $recipe2_main_ingredient = $ingredient; } } $recipe2_vector[$ingredient] = 1; } $edge_weight = 0; $matches = 1; foreach ($vector_array as $vector) { $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector]; $vector_diff[$vector] = pow($diff, 2); if (abs($diff) == 1) { $matches += 1; } $edge_weight += $vector_diff[$vector]; } $main_ingredient_match = 1; if ($recipe1_main_ingredient != $recipe2_main_ingredient) { $main_ingredient_match = 1000; } $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match; $weights[$k][2] = $edge_weight; $k++; } } crawlLog("...Making new shard with clustered recipes as docs."); $clusters = kruskalClustering($weights, $count, $distinct_ingredients); $index_shard = new IndexShard("cluster_shard"); $word_lists = array(); $recipe_sites = array(); foreach ($clusters as $cluster) { $count = count($cluster); for ($i = 0; $i < $count - 1; $i++) { $meta_ids = array(); $summary = array(); $recipe = $cluster[$i]; $summary[self::URL] = $recipes_summary[$recipe][self::URL]; $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE]; $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION]; $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP]; $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING]; $summary[self::HASH] = $recipes_summary[$recipe][self::HASH]; $doc_keys[$recipe] = crawlHash($summary[self::URL], true); $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1); $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost; $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE]; $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE]; $recipe_sites[] = $summary; $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]); crawlLog("ingredient:" . $cluster["ingredient"]); if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) { crawlLog("Problem inserting recipe: " . $summary[self::TITLE]); } } } $shard_string = $index_shard->save(true); $index_shard = IndexShard::load("cluster_shard", $shard_string); unset($shard_string); crawlLog("...Adding recipe shard to index archive bundle"); $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name; $index_archive = new IndexArchiveBundle($dir, false); if ($index_shard->word_docs_packed) { $index_shard->unpackWordDocs(); } $generation = $index_archive->initGenerationToAdd($index_shard); if (isset($recipe_sites)) { crawlLog("... Adding " . count($recipe_sites) . " recipe docs."); $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0); } $k = 0; foreach ($recipe_sites as $site) { $recipe = $site[self::TITLE]; $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . 
"/", true), 1); $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET]; } $index_shard->changeDocumentOffsets($summary_offsets); $index_archive->addIndexData($index_shard); $index_archive->saveAndAddCurrentShardDictionary(); $index_archive->dictionary->mergeAllTiers(); $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name); crawlLog("...Recipe plugin finished."); } }
/**
 * Adds the array of $pages to the WebArchiveBundle. Pages are stored in
 * the current write partition, and the byte offset at which each page was
 * written is recorded in the page field named by $offset_field.
 *
 * @param string $offset_field field used to record offsets after storing
 * @param array& $pages data to store
 * @return int the write_partition the pages were stored in
 */
function addPages($offset_field, &$pages)
{
    $num_pages = count($pages);
    if ($this->num_docs_per_partition > 0 &&
        $num_pages > $this->num_docs_per_partition) {
        crawlLog("ERROR! At most " . $this->num_docs_per_partition .
            " many pages can be added in one go!");
        exit;
    }
    $partition = $this->getPartition($this->write_partition);
    $part_count = $partition->count;
    if ($this->num_docs_per_partition > 0 &&
        $num_pages + $part_count > $this->num_docs_per_partition) {
        $this->setWritePartition($this->write_partition + 1);
        $partition = $this->getPartition($this->write_partition);
    }
    $this->addCount($num_pages); //only adds to count on disk
    $this->count += $num_pages;
    $partition->addObjects($offset_field, $pages, NULL, NULL, false);
    return $this->write_partition;
}
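A hedged usage sketch; the use of CrawlConstants constants for the offset field and page keys, and the shape of the page arrays, are assumptions based on how addPages() is called elsewhere in this section:

// Store two page summaries; after the call each element of $pages carries
// the byte offset it was written at in its SUMMARY_OFFSET field.
$pages = array(
    array(CrawlConstants::URL => "http://www.example.com/a"),
    array(CrawlConstants::URL => "http://www.example.com/b"),
);
$write_partition = $web_archive_bundle->addPages(
    CrawlConstants::SUMMARY_OFFSET, $pages);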
/** * Used to flush changes of hash_url indexes caused by adjusting weights * in the bundle's priority queue to its hash table. */ function notifyFlush() { foreach ($this->notify_buffer as $hash_url => $index) { $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH); if ($both !== false) { list($probe, $value) = $both; $packed_offset = substr($value, 0, 4); $packed_flag = substr($value, 8, 4); $new_data = $packed_offset . packInt($index) . $packed_flag; $this->insertHashTable($hash_url, $new_data, $probe); } else { crawlLog("NOTIFY LOOKUP FAILED. INDEX WAS {$index}. DATA WAS " . bin2hex($hash_url)); } } $this->notify_buffer = array(); }
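The packed record this method rewrites appears to consist of three 4-byte fields; a sketch of that assumed layout (field values are illustrative):

// Assumed 12-byte hash table value for a queued url:
//   bytes 0-3   packed offset into the url web archive
//   bytes 4-7   packed index into the priority queue
//   bytes 8-11  packed flag
$archive_offset = 1024;
$queue_index = 7;
$flag = 0;
$value = packInt($archive_offset) . packInt($queue_index) . packInt($flag);
// notifyFlush() keeps bytes 0-3 and 8-11 of the stored value and overwrites
// only the middle field with the queue index recorded in notify_buffer.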
/** * Used to recompute both the index shards and the dictionary * of an index archive. The first step involves re-extracting the * word into an inverted index from the summaries' web_archives. * Then a reindex is done. * * @param string $archive_path file path to a IndexArchiveBundle */ function rebuildIndexArchive($archive_path) { $archive_type = $this->getArchiveKind($archive_path); if ($archive_type != "IndexArchiveBundle") { $this->badFormatMessageAndExit($archive_path); } $info = $archive_type::getArchiveInfo($archive_path); $tmp = unserialize($info["DESCRIPTION"]); $video_sources = $tmp[self::VIDEO_SOURCES]; $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt")); $num_generations = $generation_info['ACTIVE'] + 1; $archive = new WebArchiveBundle($archive_path . "/summaries"); $seen = 0; $generation = 0; $keypad = ""; while ($generation < $num_generations) { $partition = $archive->getPartition($generation, false); $shard_name = $archive_path . "/posting_doc_shards/index{$generation}"; crawlLog("Processing partition {$generation}"); if (file_exists($shard_name)) { crawlLog("..Unlinking old shard {$generation}"); @unlink($shard_name); } $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true); $seen_partition = 0; while ($seen_partition < $partition->count) { $num_to_get = min($partition->count - $seen_partition, 8000); $offset = $partition->iterator_pos; $objects = $partition->nextObjects($num_to_get); $cnt = 0; foreach ($objects as $object) { $cnt++; $site = $object[1]; if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") { $is_link = true; $doc_keys = $site[self::HTTP_CODE]; $site_url = $site[self::TITLE]; $host = UrlParser::getHost($site_url); $link_parts = explode('|', $site[self::HASH]); if (isset($link_parts[5])) { $link_origin = $link_parts[5]; } else { $link_origin = $site_url; } $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin); $link_to = "LINK TO:"; } else { $is_link = false; $site_url = str_replace('|', "%7C", $site[self::URL]); $host = UrlParser::getHost($site_url); $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1); $meta_ids = PhraseParser::calculateMetas($site, $video_sources); $link_to = ""; } $so_far_cnt = $seen_partition + $cnt; $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. "; crawlTimeoutLog($time_out_message); $seen++; $word_lists = array(); /* self::JUST_METAS check to avoid getting sitemaps in results for popular words */ $lang = NULL; if (!isset($site[self::JUST_METAS])) { $host_words = UrlParser::getWordsIfHostUrl($site_url); $path_words = UrlParser::getWordsLastPathPartUrl($site_url); if ($is_link) { $phrase_string = $site[self::DESCRIPTION]; } else { $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . 
$site[self::DESCRIPTION]; } if (isset($site[self::LANG])) { $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]); } $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); $len = strlen($phrase_string); if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) { $meta_ids[] = "safe:true"; $safe = true; } else { $meta_ids[] = "safe:false"; $safe = false; } } if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) { $score_keys = ""; foreach ($site[self::USER_RANKS] as $label => $score) { $score_keys .= packInt($score); } if (strlen($score_keys) % 8 != 0) { $score_keys .= $keypad; } $doc_keys .= $score_keys; } $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false); $offset = $object[0]; } $seen_partition += $num_to_get; } $shard->save(false, true); $generation++; } $this->reindexIndexArchive($archive_path); }
/**
 * This method adds robots metas to, or removes entirely, a summary
 * produced by a text page processor or one of its subclasses depending on
 * whether the summary title and description satisfy various rules
 * in $this->filter_rules
 *
 * @param array& $summary the summary data produced by the relevant page
 *     processor's handle method; modified in-place.
 * @param string $url the url where the summary contents came from
 */
function pageSummaryProcessing(&$summary, $url)
{
    $sites = array_keys($this->filter_rules);
    $filter_rules = $this->filter_rules;
    $rules = isset($filter_rules['default']) ? $filter_rules['default'] :
        array();
    foreach ($sites as $site) {
        if ($site == "default") {
            continue;
        }
        $sign = $site[0] == '-' ? false : true;
        if (!$sign || $site[0] == '+') {
            $check_url = substr($site, 1);
        } else {
            $check_url = $site;
        }
        if (($sign && UrlParser::urlMemberSiteArray($url,
            array($check_url), $url . $check_url)) ||
            (!$sign && !UrlParser::urlMemberSiteArray($url,
            array($check_url), $url . $check_url))) {
            $rules = array_merge($rules, $filter_rules[$site]);
        }
    }
    foreach ($rules as $rule) {
        $preconditions = $rule["PRECONDITIONS"];
        $actions = $rule["ACTIONS"];
        $filter_flag = $this->checkFilter($preconditions,
            $summary[self::TITLE], $summary[self::DESCRIPTION]);
        if ($filter_flag) {
            if (in_array("NOPROCESS", $actions)) {
                crawlLog(" Word filter plugin removed page.");
                $summary = false;
                break;
            } else {
                if (!isset($summary[self::ROBOT_METAS])) {
                    $summary[self::ROBOT_METAS] = array();
                }
                $summary[self::ROBOT_METAS] += $actions;
            }
        }
    }
}
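An illustrative shape for the plugin's filter_rules array. The concrete precondition syntax is interpreted by checkFilter() and is not shown in this excerpt, so the precondition strings below are placeholders:

// Hypothetical configuration: rules under "default" apply to every url;
// a '+'-prefixed site applies its rules to urls on that site, while a
// '-'-prefixed site applies its rules to urls NOT on that site.
$filter_rules = array(
    "default" => array(),
    "+www.example.com" => array(
        array("PRECONDITIONS" => "casino spam",          // placeholder syntax
              "ACTIONS" => array("NOPROCESS")),           // drop page entirely
    ),
    "-forum.example.com" => array(
        array("PRECONDITIONS" => "signup login",          // placeholder syntax
              "ACTIONS" => array("NOINDEX", "NOFOLLOW")), // add robot metas
    ),
);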
/** * This is the function that should be called to get the * classifier_trainer to start training a logistic regression instance for * a particular classifier. The class label corresponding to the * classifier to be finalized should be passed as the second command-line * argument. */ function start() { global $argv; CrawlDaemon::init($argv, "classifier_trainer"); $label = $argv[2]; crawlLog("Initializing classifier trainer log..", $label . '-classifier_trainer', true); $classifier = Classifier::getClassifier($label); $classifier->prepareToFinalize(); $classifier->finalize(); Classifier::setClassifier($classifier); crawlLog("Training complete.\n"); CrawlDaemon::stop('classifier_trainer', $label); }
/** * Write a message to log file depending on debug level for this subpackage * @param string $message what to write to the log */ function log($message) { if ($this->debug > 0) { crawlLog($message); } }
/**
 * Copies all feed items newer than $age to a new shard, then deletes the
 * old index shard and database entries older than $age. Finally sets the
 * copied shard to be the active one. If this method is going to take more
 * than max_execution_time/2 it returns false, so an additional job can be
 * scheduled; otherwise it returns true.
 *
 * @param int $age how many seconds old records should be deleted
 * @return bool whether the job executed to completion
 */
function rebuildFeedShard($age)
{
    $time = time();
    $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
    $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
    $prune_shard = new IndexShard($prune_shard_name);
    $too_old = $time - $age;
    if (!$prune_shard) {
        return false;
    }
    $pre_feeds = $this->getNewsSources();
    if (!$pre_feeds) {
        return false;
    }
    $feeds = array();
    foreach ($pre_feeds as $pre_feed) {
        if (!isset($pre_feed['NAME'])) {
            continue;
        }
        $feeds[$pre_feed['NAME']] = $pre_feed;
    }
    $db = $this->db;
    // we now rebuild the inverted index with the remaining items
    $sql = "SELECT * FROM FEED_ITEM " .
        "WHERE PUBDATE >= ? " .
        "ORDER BY PUBDATE DESC";
    $result = $db->execute($sql, array($too_old));
    if ($result) {
        $completed = true;
        crawlLog("..still deleting. Making new index of non-pruned items.");
        $i = 0;
        while ($item = $db->fetchArray($result)) {
            crawlTimeoutLog("..have added %s non-pruned items to index.",
                $i);
            $i++;
            if (!isset($item['SOURCE_NAME'])) {
                continue;
            }
            $source_name = $item['SOURCE_NAME'];
            if (isset($feeds[$source_name])) {
                $lang = $feeds[$source_name]['LANGUAGE'];
            } else {
                $lang = "";
            }
            $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $raw_guid = unbase64Hash($item["GUID"]);
            $doc_keys = crawlHash($item["LINK"], true) . $raw_guid .
                "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) .
                "/", true), 1);
            $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
                $source_name, $item["GUID"]);
            $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'],
                $word_lists, $meta_ids, PhraseParser::$materialized_metas,
                true, false);
        }
    }
    $prune_shard->save();
    @chmod($prune_shard_name, 0777);
    @chmod($feed_shard_name, 0777);
    @rename($prune_shard_name, $feed_shard_name);
    @chmod($feed_shard_name, 0777);
    $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
    $db->execute($sql, array($too_old));
}
/** * Removes from the passed array those elements $elt who either are in * the filter bundle or whose $elt[$field_name] is in the bundle. * * @param array& $arr the array to remove elements from * @param array $field_names if not NULL an array of field names of $arr * to use to do filtering */ function differenceFilter(&$arr, $field_names = NULL) { $incremental_time = microtime(); $num_filters = $this->num_filters; $count = count($arr); for ($i = 0; $i < $num_filters; $i++) { if ($i == $num_filters - 1) { $tmp_filter = $this->current_filter; } else { $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr"); } for ($j = 0; $j < $count; $j++) { if ($field_names === NULL) { $tmp =& $arr[$j]; if ($tmp !== false && $tmp_filter->contains($tmp)) { /* We deliberately don't try to add anything that has the hash field set to false. This is our cue to skip an element such as a link document which we know will almost always be unique and so be unnecessary to de-duplicate */ unset($arr[$j]); } } else { //now do the same strategy for the array of fields case foreach ($field_names as $field_name) { $tmp =& $arr[$j][$field_name]; if ($tmp !== false && $tmp_filter->contains($tmp)) { unset($arr[$j]); break; } } } if (changeInMicrotime($incremental_time) > 30) { crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}."); $incremental_time = microtime(); } } } }
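A usage sketch (assumes $url_filter is an instance of this filter bundle; the record field name in the second call is illustrative, not a documented key):

// Flat array case: drop hashes that are already present in the bloom filters.
$host_path_hashes = array(crawlHash("http://www.example.com/", true));
$url_filter->differenceFilter($host_path_hashes);
// Array-of-records case: filter each record on a named hash field instead.
$url_filter->differenceFilter($pages, array("HASH_URL"));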
/**
 * Scales the weights of elements in the queue so that the sum of the new
 * weights is $new_total
 *
 * This function is used periodically to prevent the queue from being
 * gummed up because all of the weights stored in it are too small.
 *
 * @param int $new_total what the new sum of weights of elements in the
 *     queue will be after normalization
 */
function normalize($new_total = NUM_URLS_QUEUE_RAM)
{
    $count = $this->count;
    $total_weight = $this->totalWeight();
    if ($total_weight <= 0) {
        crawlLog("Total queue weight was zero!! Doing uniform renormalization!");
    }
    for ($i = 1; $i <= $count; $i++) {
        $row = $this->getRow($i);
        if ($total_weight > 0) {
            $row[1] = $new_total * $row[1] / $total_weight;
        } else {
            $row[1] = $new_total / $count;
        }
        $this->putRow($i, $row);
    }
}
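A worked example of the rescaling (queue construction omitted; $queue is assumed to be an instance of this priority queue):

// Suppose the queue holds weights 2, 3, and 5 (total 10) and we normalize
// to a new total of 100: the rows become 20, 30, and 50.
// If the total weight were 0, each of the 3 rows would instead get 100 / 3.
$queue->normalize(100);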
/**
 * Determines, based on its size, if the index_shard should be added to
 * the active generation or if a new generation should be started.
 * In the latter case, a new generation is started, the old generation is
 * saved, and the dictionary of the old shard is copied to the bundle's
 * dictionary and a log-merge performed if needed
 *
 * @param int $add_num_docs number of docs in the shard about to be added
 * @param object $callback object with join function to be
 *     called if process is taking too long
 * @param bool $blocking whether there is an ongoing merge tiers operation
 *     occurring, if so don't do anything and return -1
 * @return int the active generation after the check and possible change
 *     has been performed
 */
function initGenerationToAdd($add_num_docs, $callback = NULL,
    $blocking = false)
{
    $current_num_docs = $this->getActiveShard()->num_docs;
    crawlLog("Current index shard has " . $current_num_docs .
        " documents.");
    $memory_limit = metricToInt(ini_get("memory_limit"));
    crawlLog("Memory Indexer limit is " . $memory_limit . ". Usage is " .
        memory_get_usage());
    if ($current_num_docs + $add_num_docs >
        $this->num_docs_per_generation ||
        0.65 * $memory_limit < memory_get_usage()) {
        if ($blocking == true) {
            return -1;
        }
        crawlLog("Switching Index Shard...");
        $switch_time = microtime();
        // Save current shard dictionary to main dictionary
        $this->forceSave();
        $this->addAdvanceGeneration($callback);
        crawlLog("Switch Index Shard time:" .
            changeInMicrotime($switch_time));
    }
    return $this->generation_info['ACTIVE'];
}
/** * Downloads the next file from the schedule of files to download received * from the web app. */ function copyNextSyncFile() { $dir = $this->sync_dir; $name_server = $this->name_server; $time = time(); $session = md5($time . AUTH_KEY); if (count($this->sync_schedule) <= 0) { return; } $file = array_pop($this->sync_schedule); crawlLog("Start syncing {$file['name']}.."); if ($file['is_dir']) { if (!file_exists("{$dir}/{$file['name']}")) { mkdir("{$dir}/{$file['name']}"); crawlLog(".. {$file['name']} directory created."); } else { crawlLog(".. {$file['name']} directory exists."); } } else { $request = "{$name_server}?c=resource&a=get&time={$time}&session={$session}" . "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&last_sync=" . $this->last_sync . "&f=cache&n=" . urlencode($file["name"]); if ($file["size"] < self::DOWNLOAD_RANGE) { $data = FetchUrl::getPage($request, NULL, true); if ($file["size"] != strlen($data)) { array_push($this->sync_schedule, $file); crawlLog(".. {$file['name']} error downloading, retrying."); return; } file_put_contents("{$dir}/{$file['name']}", $data); crawlLog(".. {$file['name']} file copied."); } else { $offset = 0; $fh = fopen("{$dir}/{$file['name']}", "wb"); $request .= "&l=" . self::DOWNLOAD_RANGE; while ($offset < $file['size']) { $data = FetchUrl::getPage($request . "&o={$offset}", NULL, true); $old_offset = $offset; $offset += self::DOWNLOAD_RANGE; $end_point = min($offset, $file["size"]); //crude check if we need to redownload segment if (strlen($data) != $end_point - $old_offset) { $offset = $old_offset; crawlLog(".. Download error re-requesting segment"); continue; } fwrite($fh, $data); crawlLog(".. {$file['name']} downloaded bytes {$old_offset} " . "to {$end_point}.."); } crawlLog(".. {$file['name']} file copied."); fclose($fh); } } }
/**
 * Used to remove from the queue urls that are no longer crawlable
 * because the allowed and disallowed sites have changed.
 */
function cullNoncrawlableSites()
{
    $count = $this->web_queue->to_crawl_queue->count;
    crawlLog("Scheduler: " .
        " Culling noncrawlable urls after change in crawl parameters;" .
        " Queue Size {$count}");
    $start_time = microtime();
    $fh = $this->web_queue->openUrlArchive();
    $delete_urls = array();
    $i = 1;
    while ($i < $count) {
        crawlTimeoutLog("..Scheduler: " .
            "still culling noncrawlable urls. Examining " .
            "location %s in queue of %s.", $i, $count);
        $tmp = $this->web_queue->peekQueue($i, $fh);
        list($url, $weight, $flag, $probe) = $tmp;
        if (!$this->allowedToCrawlSite($url) ||
            $this->disallowedToCrawlSite($url)) {
            $delete_urls[] = $url;
        }
        $i++;
    }
    $this->web_queue->closeUrlArchive($fh);
    crawlLog("...Scheduler: Done selecting cullable URLS, time so far:" .
        changeInMicrotime($start_time));
    $new_time = microtime();
    $num_deletes = count($delete_urls);
    $k = 0;
    foreach ($delete_urls as $delete_url) {
        $k++;
        crawlTimeoutLog("..Scheduler: Removing selected url %s of %s " .
            "from queue.", $k, $num_deletes);
        if ($delete_url) {
            $this->web_queue->removeQueue($delete_url);
        } else {
            /* if there was a hash table look up error still get rid of
               index from priority queue
             */
            $this->web_queue->to_crawl_queue->poll($k);
        }
    }
    crawlLog("...Scheduler: Removed {$k} cullable URLS from queue in time: " .
        changeInMicrotime($new_time));
}
/** * {@inheritDoc} * * @param string $db_host the hostname of where the database is located * (not used in all dbms's) * @param string $db_user the user to connect as * @param string $db_password the password of the user to connect as * @param string $db_name the name of the database on host we are * connecting to * @return mixed return false if not successful and some kind of * connection object/identifier otherwise */ function connect($db_host = DB_HOST, $db_user = DB_USER, $db_password = DB_PASSWORD, $db_name = DB_NAME) { try { $this->pdo = new PDO($db_host, $db_user, $db_password); } catch (PDOException $e) { $this->pdo = false; crawlLog('Connection failed: ' . $e->getMessage()); } $this->to_upper_dbms = false; if (stristr($db_host, 'PGSQL')) { $this->to_upper_dbms = 'PGSQL'; } return $this->pdo; }
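Note that $db_host is handed to PDO unchanged, so in this manager it is really a full PDO DSN rather than just a hostname. Hypothetical examples (the manager class name and credentials are assumptions for illustration):

// MySQL: the DSN goes in the "host" slot; per the code above, $db_name is
// accepted but not used when constructing the PDO object.
$db = new PdoManager();   // assumed PDO-based manager class
$db->connect("mysql:host=localhost;dbname=yioop", "yioop", "secret", "yioop");
// PostgreSQL: the 'pgsql' substring also switches on to_upper_dbms handling.
$db->connect("pgsql:host=localhost;dbname=yioop", "yioop", "secret", "yioop");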
/** * Used to extract the title, description and links from * a string consisting of webpage data. * * @param string $page web-page contents * @param string $url the url where the page contents came from, * used to canonicalize relative links * * @return array a summary of the contents of the page * */ function process($page, $url) { $summary = NULL; $is_centroid = $this->summarizer_option == self::CENTROID_SUMMARIZER; if (is_string($page)) { $page = preg_replace('/\\ \\;|\\&rdquo\\;|\\&ldquo\\;|\\&mdash\\;/si', ' ', $page); $page = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page); $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page); $dom = self::dom($dom_page); if ($dom !== false) { $summary[self::ROBOT_METAS] = self::getMetaRobots($dom); $summary[self::TITLE] = self::title($dom); if ($summary[self::TITLE] == "") { $summary[self::TITLE] = self::crudeTitle($dom_page); } $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url); if ($is_centroid) { $summary_cloud = CentroidSummarizer::getCentroidSummary($dom_page, $summary[self::LANG]); $summary[self::DESCRIPTION] = $summary_cloud[0]; $summary[self::WORD_CLOUD] = $summary_cloud[1]; crawlLog("..Using Centroid Summarizer"); } else { $summary[self::DESCRIPTION] = self::description($dom, $dom_page); crawlLog("..Using Basic Summarizer"); } $crude = false; if (trim($summary[self::DESCRIPTION]) == "") { $summary[self::DESCRIPTION] = self::crudeDescription($dom_page); crawlLog("..No text extracted. " . "Invoked crude description fallback."); $crude = true; } $summary[self::LINKS] = self::links($dom, $url); if ($summary[self::LINKS] == array()) { $summary[self::LINKS] = parent::extractHttpHttpsUrls($page); } $location = self::location($dom, $url); if ($location) { $summary[self::LINKS][$location] = "location:" . $url; $summary[self::LOCATION] = true; $summary[self::DESCRIPTION] .= $url . " => " . $location; if (!$summary[self::TITLE]) { $summary[self::TITLE] = $url; } } if (!$crude && !$location) { $location = self::relCanonical($dom, $url); if ($location) { $summary[self::LINKS] = array(); $summary[self::LINKS][$location] = "location:" . $url; $summary[self::LOCATION] = true; if (!$summary[self::DESCRIPTION]) { $summary[self::DESCRIPTION] .= $url . " => " . $location; } if (!$summary[self::TITLE]) { $summary[self::TITLE] = $url; } } } $summary[self::PAGE] = $page; if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0 && !$location) { /*maybe not html? treat as text with messed up tags still try to get urls */ $summary_text = parent::process(strip_tags($page), $url); foreach ($summary as $field => $value) { if (($value == "" || $value == array()) && isset($summary_text[$field])) { $summary[$field] = $summary_text[$field]; } } } } else { if ($dom == false) { $summary = parent::process($page, $url); } } } return $summary; }
/**
 * Creates a database archive iterator with the given parameters. This
 * kind of iterator is used to cycle through the results of a SQL query
 * to a database, so that the results might be indexed by Yioop.
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *     iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the arc archive bundle
 *     results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir, $result_timestamp,
    $result_dir)
{
    $this->iterate_timestamp = $iterate_timestamp;
    $this->iterate_dir = $iterate_dir;
    $this->result_timestamp = $result_timestamp;
    $this->result_dir = $result_dir;
    $ini = parse_ini_with_fallback(
        "{$this->iterate_dir}/arc_description.ini");
    $this->dbinfo = array("DBMS" => DBMS, "DB_HOST" => DB_HOST,
        "DB_NAME" => DB_NAME, "DB_USER" => DB_USER,
        "DB_PASSWORD" => DB_PASSWORD);
    foreach ($this->dbinfo as $key => $value) {
        $ini_key = strtolower($key);
        if (isset($ini[$ini_key])) {
            $this->dbinfo[$key] = $ini[$ini_key];
        }
    }
    $db_class = ucfirst($this->dbinfo["DBMS"]) . "Manager";
    $this->db = new $db_class();
    $this->db->connect($this->dbinfo['DB_HOST'], $this->dbinfo['DB_USER'],
        $this->dbinfo['DB_PASSWORD'], $this->dbinfo['DB_NAME']);
    if (isset($ini['sql'])) {
        $this->sql = $ini['sql'];
    } else {
        crawlLog("Database Archive Iterator needs a SQL statement to run");
        exit;
    }
    if (isset($ini['field_value_separator'])) {
        $this->field_value_separator = $ini['field_value_separator'];
    } else {
        $this->field_value_separator = "\n----\n";
    }
    if (isset($ini['column_separator'])) {
        $this->column_separator = $ini['column_separator'];
    } else {
        $this->column_separator = "\n====\n";
    }
    if (isset($ini['encoding'])) {
        $this->encoding = $ini['encoding'];
    } else {
        $this->encoding = "UTF-8";
    }
    if (!file_exists($result_dir)) {
        mkdir($result_dir);
    }
    if (file_exists("{$this->result_dir}/iterate_status.txt")) {
        $this->restoreCheckpoint();
    } else {
        $this->reset();
    }
}
/**
 * Creates a text archive iterator with the given parameters.
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *     iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over. If this
 *     iterator is used in a fetcher and the data is on a name server
 *     set this to false
 * @param string $result_timestamp timestamp of the arc archive bundle
 *     results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 * @param array $ini describes start_ and end_delimiter, file_extension,
 *     encoding, and compression method used for pages in this archive
 */
function __construct($iterate_timestamp, $iterate_dir, $result_timestamp,
    $result_dir, $ini = array())
{
    $this->iterate_timestamp = $iterate_timestamp;
    $this->iterate_dir = $iterate_dir;
    $this->result_timestamp = $result_timestamp;
    $this->result_dir = $result_dir;
    if (!file_exists($result_dir)) {
        mkdir($result_dir);
    }
    $this->partitions = array();
    if ($this->iterate_dir != false) { // false == network/fetcher iterator
        if ($ini == array()) {
            $ini = parse_ini_with_fallback(
                "{$this->iterate_dir}/arc_description.ini");
        }
        $extension = $ini['file_extension'];
    }
    $this->setIniInfo($ini);
    if ($this->start_delimiter == "" && $this->end_delimiter == "" &&
        $this->iterate_dir != false) {
        crawlLog("At least one of start or end delimiter must be set!!");
        exit;
    }
    if ($this->iterate_dir != false) {
        foreach (glob("{$this->iterate_dir}/*.{$extension}", GLOB_BRACE)
            as $filename) {
            $this->partitions[] = $filename;
        }
    }
    $this->num_partitions = count($this->partitions);
    $this->status_filename = "{$this->result_dir}/iterate_status.txt";
    $this->buffer_filename = $this->result_dir . "/buffer.txt";
    if (file_exists($this->status_filename)) {
        $this->restoreCheckpoint();
    } else {
        $this->reset();
    }
}
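A hypothetical arc_description.ini for a text archive; the key names follow the docblock above, but the particular values (delimiter pattern, compression setting) are only illustrative:

; each record in the .txt partitions begins at the start delimiter;
; at least one of start_delimiter/end_delimiter must be non-empty
file_extension = 'txt'
start_delimiter = '@@RECORD@@'
end_delimiter = ''
encoding = 'UTF-8'
compression = 'plain'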
/**
 * If news_update time has passed, then updates the news feeds associated
 * with this Yioop instance and rebuilds the feed shard
 */
function newsUpdate()
{
    $time = time();
    $something_updated = false;
    $delta = $time - $this->update_time;
    // every hour get items from feeds
    if ($delta > ONE_HOUR) {
        $this->update_time = $time;
        crawlLog("Performing news feeds update");
        $this->sourceModel->updateFeedItems(ONE_WEEK);
        $something_updated = true;
    }
    /* if anything changed rebuild shard */
    if ($something_updated) {
        crawlLog("Deleting feed items and rebuild shard...");
        $this->sourceModel->rebuildFeedShard(ONE_WEEK);
        crawlLog("... delete complete, shard rebuilt");
    } else {
        crawlLog("No updates needed.");
    }
}