Example 1
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
              //store inlinks so they can be searched by their link text
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
          /*
             $this->no_process_links is set when doing things like
             mix recrawls. In that case links will likely already appear
             in the index, so don't index them again. $site[self::JUST_METAS]
             is set when we have a sitemap or robots.txt (that flag is set
             later for this case). In that case link info is not particularly
             useful for indexing and can greatly slow building the inverted
             index.
          */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                  /* this check means we won't index links coming from
                      robots.txt. Sitemaps will still be in TO_CRAWL, but
                      that's handled elsewhere
                     */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
          $interim_elapse = changeInMicrotime($interim_time);
          if ($interim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
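For reference, the document key built above for a non-link page concatenates a hash of the page url, the content hash already stored in $site[self::HASH], and a host hash prefixed with the letter "d". A minimal sketch of that construction, using the crawlHash() and UrlParser::getHost() helpers that appear in the code above; the url and page text are hypothetical:

  // Illustrative only: mirrors how $doc_keys is assembled in the loop above.
  $site_url = "http://www.example.com/page.html";        // hypothetical url
  $content_hash = crawlHash("page body text", true);     // stands in for $site[self::HASH]
  $host = UrlParser::getHost($site_url);
  $doc_keys = crawlHash($site_url, true) . $content_hash .
      "d" . substr(crawlHash($host . "/", true), 1);
  // url hash, then content hash, then "d" plus the tail of the host hash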
Example 2
 /**
  * Gets $num many objects out of the web archive starting at byte $offset
  *
   * If $next_flag is true the archive iterator is advanced, and if $fh
   * is not NULL it is assumed to be an open resource pointing to the
   * archive (saving the time needed to open it).
  *
  * @param int $offset a valid byte offset into a web archive
  * @param int $num number of objects to return
  * @param bool $next_flag whether to advance the archive iterator
  * @param resource $fh either NULL or a file resource to the archive
  * @return array the $num objects beginning at $offset
  */
 function getObjects($offset, $num, $next_flag = true, $fh = NULL)
 {
     $open_flag = false;
     if ($fh == NULL) {
         $fh = $this->open();
         $open_flag = true;
     }
     $is_string = $this->is_string;
     $objects = array();
     $compressed_int_len = $this->compressor->compressedIntLen();
     if ($is_string) {
         $storage_len = strlen($this->storage);
     }
     if (!$is_string && fseek($fh, $offset) == 0 || $is_string && $offset < $storage_len) {
         for ($i = 0; $i < $num; $i++) {
             if (!$is_string && feof($fh)) {
                 break;
             }
             if ($is_string && $offset >= $storage_len) {
                 break;
             }
             $object = NULL;
             $compressed_len = $is_string ? substr($this->storage, $offset, $compressed_int_len) : fread($fh, $compressed_int_len);
             $len = $this->compressor->uncompressInt($compressed_len);
             if ($len > 0 && $len < MAX_ARCHIVE_OBJECT_SIZE) {
                 $compressed_file = $is_string ? substr($this->storage, $offset + $compressed_int_len, $len) : fread($fh, $len);
                 $file = $this->compressor->uncompress($compressed_file);
                 $object = @unserialize($file);
                 $offset += $compressed_int_len + $len;
                 $objects[] = array($offset, $object);
             } else {
                 crawlLog("Web archive saw blank line " . "when looked for offset {$offset}");
             }
         }
         if ($next_flag) {
             $this->iterator_pos = $offset;
         }
     }
     if ($open_flag) {
         $this->close($fh);
     }
     return $objects;
 }
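A minimal usage sketch, assuming $archive is an already constructed web archive object of the class this method belongs to (its constructor is not shown here):

  // Read up to 10 objects starting at the iterator's current position,
  // letting getObjects() open and close the archive itself ($fh = NULL).
  $objects = $archive->getObjects($archive->iterator_pos, 10);
  foreach ($objects as $pair) {
      // each entry is array(offset just after the object, unserialized object)
      list($next_offset, $object) = $pair;
  }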
Example 3
 /**
  * Deletes file associated with given node from disk
  * @param int $id is the id of the node whose file is to be deleted
  */
 function deleteNodeFile($id)
 {
     $node_file = $this->dir . "/{$id}.txt";
     if (file_exists($node_file)) {
         unlink($node_file);
     } else {
         crawlLog("Could not delete node {$id} from disk");
     }
 }
Example 4
 /**
  * Inserts the provided $key - $value pair into the hash table
  *
  * @param string $key the key to use for the insert (will be needed for
  *     lookup)
  * @param string $value the value associated with $key
  * @param int $probe if the location in the hash table is already known
  *     to be $probe then this variable can be used to save a lookup
  * @return bool whether the insert was successful or not
  */
 function insert($key, $value, $probe = false)
 {
     $null = $this->null;
     $deleted = $this->deleted;
     if ($probe === false) {
         $probe = $this->lookup($key, self::ALWAYS_RETURN_PROBE);
     }
     if ($probe === false) {
         /* this is a little slow
              the idea is we can't use deleted slots until we are sure
              $key isn't in the table
            */
         $probe = $this->lookupArray($key, array($null, $deleted), self::ALWAYS_RETURN_PROBE);
         if ($probe === false) {
             crawlLog("No space in hash table");
             return false;
         }
     }
     //there was a free slot so write entry...
     $data = pack("x" . ($this->key_size + $this->value_size));
      if (strlen($value) < $this->value_size) {
          /* this case should not happen; rather than give an error,
             we null-pad the string to the desired length
           */
          $value = str_pad($value, $this->value_size, "\x00");
      }
     //first the key
     for ($i = 0; $i < $this->key_size; $i++) {
         $data[$i] = $key[$i];
     }
     //then the value
     for ($i = 0; $i < $this->value_size; $i++) {
         $data[$i + $this->key_size] = $value[$i];
     }
     $this->put($probe, $data);
     $this->count++;
     $this->checkSave();
     return true;
 }
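A minimal usage sketch, assuming $hash_table is an already constructed hash table of this class built with 8-byte keys and 8-byte values (the constructor and the exact key length returned by crawlHash() are assumptions):

  // Values shorter than value_size are null-padded by insert() itself,
  // but padding explicitly makes the intent clearer.
  $key = crawlHash("http://www.example.com/", true);   // raw hash used as the key (assumption: 8 bytes)
  $value = str_pad(packInt(42), 8, "\x00");            // packed int padded to value_size
  if (!$hash_table->insert($key, $value)) {
      crawlLog("Insert failed: no space in hash table");
  }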
Example 5
 /**
  * Given the results of a getPage call, check whether or not the response
  * had the words NOTICE, WARNING, FATAL which might indicate an error on
  * the server. If it does, then the $response string is sent to the
  * crawlLog
  *
  * @param string $response getPage response in which to check for errors
  */
 static function checkResponseForErrors($response)
 {
     if (preg_match("/NOTICE|WARNING|FATAL/i", $response)) {
         crawlLog("There appears to have been an error in the server " . "response. Response was:");
         crawlLog(wordwrap($response));
     }
 }
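A usage sketch; $request_url stands for any url string, getPage() appears in other examples in this file, and the placement of checkResponseForErrors() on the same class is an assumption:

  // Log the full response if the server side reported a NOTICE, WARNING,
  // or FATAL while producing it.
  $response = FetchUrl::getPage($request_url);
  FetchUrl::checkResponseForErrors($response);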
Example 6
 /**
  * Used to stop a daemon that is running in the background
  *
  * @param string $name the main name of this daemon such as queue_server
  *     or fetcher.
  * @param string $subname the instance name if it is possible for more
  *     than one copy of the daemon to be running at the same time
  * @param bool $exit whether this method should just return (false) or
  *      call exit() (true)
  */
 static function stop($name, $subname = "", $exit = true)
 {
     $name_string = CrawlDaemon::getNameString($name, $subname);
     $lock_file = CrawlDaemon::getLockFileName($name, $subname);
     $not_web_setting = php_sapi_name() == 'cli';
     if (file_exists($lock_file)) {
         unlink($lock_file);
         if ($not_web_setting) {
             crawlLog("Sending stop signal to {$name_string}...");
         }
     } else {
         if ($not_web_setting) {
             crawlLog("{$name_string} does not appear to running...");
         }
     }
     if ($exit) {
         exit;
     }
 }
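A usage sketch; the daemon name comes from the docblock above, and the instance name is illustrative:

  // Ask the fetcher instance named "2" to stop, returning to the caller
  // instead of calling exit().
  CrawlDaemon::stop("fetcher", "2", false);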
Example 7
 /**
   * Implements post processing of recipes. Recipes are extracted,
   * their ingredients are scrubbed, and the recipes are clustered. The
   * clustered recipes are then added back to the index.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
      // only cluster if doing so would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         $recipes_ingredients = array();
         $count = count($recipes);
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             $count = count($cluster);
             for ($i = 0; $i < $count - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         if (isset($recipe_sites)) {
             crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
             $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         }
         $k = 0;
         foreach ($recipe_sites as $site) {
             $recipe = $site[self::TITLE];
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
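The edge weight computed in the double loop above is the Euclidean distance between the two recipes' binary ingredient vectors, scaled by one plus the number of ingredients present in exactly one of the recipes, and by 1000 when the detected main ingredients differ. A standalone sketch of that calculation (a hypothetical helper, not part of the plugin):

  function recipeEdgeWeight($ingredients1, $ingredients2, $main1, $main2)
  {
      $all_ingredients = array_unique(array_merge($ingredients1, $ingredients2));
      $sum_squares = 0;
      $matches = 1;
      foreach ($all_ingredients as $ingredient) {
          $in1 = in_array($ingredient, $ingredients1) ? 1 : 0;
          $in2 = in_array($ingredient, $ingredients2) ? 1 : 0;
          $diff = $in1 - $in2;
          $sum_squares += $diff * $diff;
          if (abs($diff) == 1) {
              $matches++;   // ingredient appears in exactly one of the two recipes
          }
      }
      $main_ingredient_match = ($main1 != $main2) ? 1000 : 1;
      return sqrt($sum_squares) * $matches * $main_ingredient_match;
  }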
Example 8
 /**
   * Adds the array of $pages to the WebArchiveBundle. Pages are stored in
   * the current write partition, and the offset at which each page is
   * stored is recorded in the page field named by $offset_field.
  *
  * @param string $offset_field field used to record offsets after storing
  * @param array& $pages data to store
  * @return int the write_partition the pages were stored in
  */
 function addPages($offset_field, &$pages)
 {
     $num_pages = count($pages);
     if ($this->num_docs_per_partition > 0 && $num_pages > $this->num_docs_per_partition) {
         crawlLog("ERROR! At most " . $this->num_docs_per_partition . "many pages can be added in one go!");
         exit;
     }
     $partition = $this->getPartition($this->write_partition);
     $part_count = $partition->count;
     if ($this->num_docs_per_partition > 0 && $num_pages + $part_count > $this->num_docs_per_partition) {
         $this->setWritePartition($this->write_partition + 1);
         $partition = $this->getPartition($this->write_partition);
     }
     $this->addCount($num_pages);
     //only adds to count on disk
     $this->count += $num_pages;
     $partition->addObjects($offset_field, $pages, NULL, NULL, false);
     return $this->write_partition;
 }
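A minimal usage sketch, assuming $archive_bundle is an existing bundle of this class; the offset field name and the page fields below are illustrative stand-ins for whatever CrawlConstants values a real caller would use:

  // $pages is modified in place: addPages() records, in the field named by
  // the first argument, the offset at which each page was stored.
  $pages = array(
      array("URL_FIELD" => "http://www.example.com/", "PAGE_FIELD" => "<html>...</html>"),
  );
  $partition = $archive_bundle->addPages("OFFSET_FIELD", $pages);
  crawlLog("Stored " . count($pages) . " pages in partition " . $partition);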
Example 9
 /**
  * Used to flush changes of hash_url indexes caused by adjusting weights
  * in the bundle's priority queue to its hash table.
  */
 function notifyFlush()
 {
     foreach ($this->notify_buffer as $hash_url => $index) {
         $both = $this->lookupHashTable($hash_url, HashTable::RETURN_BOTH);
         if ($both !== false) {
             list($probe, $value) = $both;
             $packed_offset = substr($value, 0, 4);
             $packed_flag = substr($value, 8, 4);
             $new_data = $packed_offset . packInt($index) . $packed_flag;
             $this->insertHashTable($hash_url, $new_data, $probe);
         } else {
             crawlLog("NOTIFY LOOKUP FAILED. INDEX WAS {$index}. DATA WAS " . bin2hex($hash_url));
         }
     }
     $this->notify_buffer = array();
 }
Example 10
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
   * words from the summaries' web archives into an inverted index.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to a IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
Example 11
 /**
  * This method adds robots metas to or removes entirely a summary
   * produced by a text page processor or its subclasses depending on
  * whether the summary title and description satisfy various rules
  * in $this->filter_rules
  *
  * @param array& $summary the summary data produced by the relevant page
  *     processor's handle method; modified in-place.
  * @param string $url the url where the summary contents came from
  */
 function pageSummaryProcessing(&$summary, $url)
 {
     $sites = array_keys($this->filter_rules);
     $filter_rules = $this->filter_rules;
      $rules = isset($filter_rules['default']) ? $filter_rules['default'] : array();
     foreach ($sites as $site) {
         if ($site == "default") {
             continue;
         }
         $sign = $site[0] == '-' ? false : true;
         if (!$sign || $site[0] == '+') {
             $check_url = substr($site, 1);
         } else {
             $check_url = $site;
         }
         if ($sign && UrlParser::urlMemberSiteArray($url, array($check_url), $url . $check_url) || !$sign && !UrlParser::urlMemberSiteArray($url, array($check_url), $url . $check_url)) {
             $rules = array_merge($rules, $filter_rules[$site]);
         }
     }
     foreach ($rules as $rule) {
         $preconditions = $rule["PRECONDITIONS"];
         $actions = $rule["ACTIONS"];
         $filter_flag = $this->checkFilter($preconditions, $summary[self::TITLE], $summary[self::DESCRIPTION]);
         if ($filter_flag) {
             if (in_array("NOPROCESS", $actions)) {
                 crawlLog("  Word filter plugin removed page.");
                 $summary = false;
                 break;
             } else {
                 if (!isset($summary[self::ROBOT_METAS])) {
                     $summary[self::ROBOT_METAS] = array();
                 }
                 $summary[self::ROBOT_METAS] += $actions;
             }
         }
     }
 }
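For reference, the shape of $this->filter_rules the loops above expect looks roughly like the following: keys are 'default' or site patterns, optionally prefixed with '+' (rules apply when $url matches) or '-' (rules apply when it does not), and each value is a list of rules whose ACTIONS are either robot metas to add or the special NOPROCESS action. The precondition syntax is whatever checkFilter() accepts and is only suggested here:

  // Hypothetical filter_rules array this method could consume.
  $filter_rules = array(
      'default' => array(
          array("PRECONDITIONS" => "spammy_term", "ACTIONS" => array("NOINDEX")),
      ),
      '-www.example.com' => array(
          // applied only to pages NOT on www.example.com
          array("PRECONDITIONS" => "free_download", "ACTIONS" => array("NOPROCESS")),
      ),
  );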
Example 12
 /**
  * This is the function that should be called to get the
  * classifier_trainer to start training a logistic regression instance for
  * a particular classifier. The class label corresponding to the
  * classifier to be finalized should be passed as the second command-line
  * argument.
  */
 function start()
 {
     global $argv;
     CrawlDaemon::init($argv, "classifier_trainer");
     $label = $argv[2];
     crawlLog("Initializing classifier trainer log..", $label . '-classifier_trainer', true);
     $classifier = Classifier::getClassifier($label);
     $classifier->prepareToFinalize();
     $classifier->finalize();
     Classifier::setClassifier($classifier);
     crawlLog("Training complete.\n");
     CrawlDaemon::stop('classifier_trainer', $label);
 }
Example 13
 /**
  * Write a message to log file depending on debug level for this subpackage
  * @param string $message what to write to the log
  */
 function log($message)
 {
     if ($this->debug > 0) {
         crawlLog($message);
     }
 }
Example 14
 /**
   * Copies all feed items newer than $age to a new shard, then deletes the
   * old index shard and the database entries older than $age. Finally it
   * sets the copied shard to be the active one. If this method is going to
   * take more than max_execution_time/2 it returns false, so an additional
   * job can be scheduled; otherwise it returns true.
  *
  * @param int $age how many seconds old records should be deleted
   * @return bool whether the job executed to completion
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     $prune_shard = new IndexShard($prune_shard_name);
     $too_old = $time - $age;
     if (!$prune_shard) {
         return false;
     }
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if ($result) {
         $completed = true;
         crawlLog("..still deleting. Making new index of non-pruned items.");
         $i = 0;
         while ($item = $db->fetchArray($result)) {
             crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
             $i++;
             if (!isset($item['SOURCE_NAME'])) {
                 continue;
             }
             $source_name = $item['SOURCE_NAME'];
             if (isset($feeds[$source_name])) {
                 $lang = $feeds[$source_name]['LANGUAGE'];
             } else {
                 $lang = "";
             }
             $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $raw_guid = unbase64Hash($item["GUID"]);
             $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
             $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
             $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
         }
     }
     $prune_shard->save();
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
 }
Example 15
 /**
   * Removes from the passed array those elements $elt that either are in
  * the filter bundle or whose $elt[$field_name] is in the bundle.
  *
  * @param array& $arr the array to remove elements from
  * @param array $field_names if not NULL an array of field names of $arr
  *     to use to do filtering
  */
 function differenceFilter(&$arr, $field_names = NULL)
 {
     $incremental_time = microtime();
     $num_filters = $this->num_filters;
     $count = count($arr);
     for ($i = 0; $i < $num_filters; $i++) {
         if ($i == $num_filters - 1) {
             $tmp_filter = $this->current_filter;
         } else {
             $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr");
         }
         for ($j = 0; $j < $count; $j++) {
             if ($field_names === NULL) {
                 $tmp =& $arr[$j];
                 if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                        We deliberately don't try to add anything that has
                        the hash field set to false. This is our cue to
                        skip an element such as a link document which we
                        know will almost always be unique and so be unnecessary
                        to de-duplicate
                     */
                     unset($arr[$j]);
                 }
             } else {
                 //now do the same strategy for the array of fields case
                 foreach ($field_names as $field_name) {
                     $tmp =& $arr[$j][$field_name];
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                         unset($arr[$j]);
                         break;
                     }
                 }
             }
             if (changeInMicrotime($incremental_time) > 30) {
                 crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}.");
                 $incremental_time = microtime();
             }
         }
     }
 }
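A minimal usage sketch, assuming $filter_bundle is a constructed filter bundle of this class, $summaries is an array of page summary arrays, and CrawlConstants::HASH is the same constant referred to as self::HASH elsewhere in this file (an assumption about where the constant is reachable):

  // Remove any summary whose content hash is already in one of the Bloom
  // filters; $summaries is passed by reference and shrinks in place.
  $filter_bundle->differenceFilter($summaries, array(CrawlConstants::HASH));
  crawlLog("Kept " . count($summaries) . " previously unseen summaries");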
Example 16
 /**
   * Scales the weights of elements in the queue so that the sum of the new
  * weights is $new_total
  *
  * This function is used periodically to prevent the queue from being
  * gummed up because all of the weights stored in it are too small.
  *
  * @param int $new_total what the new sum of weights of elements in the
  *     queue will be after normalization
  */
 function normalize($new_total = NUM_URLS_QUEUE_RAM)
 {
     $count = $this->count;
     $total_weight = $this->totalWeight();
     if ($total_weight <= 0) {
         crawlLog("Total queue weight was zero!! Doing uniform renormalization!");
     }
     for ($i = 1; $i <= $count; $i++) {
         $row = $this->getRow($i);
         if ($total_weight > 0) {
             $row[1] = $new_total * $row[1] / $total_weight;
         } else {
             $row[1] = $new_total / $count;
         }
         $this->putRow($i, $row);
     }
 }
Example 17
 /**
   * Determines, based on its size, whether index_shard should be added to
   * the active generation or whether a new generation should be started.
   * If so, a new generation is started, the old generation is saved, the
   * dictionary of the old shard is copied to the bundle's dictionary,
   * and a log-merge is performed if needed.
  *
  * @param int $add_num_docs number of docs in the shard about to be added
  * @param object $callback object with join function to be
  *     called if process is taking too long
  * @param bool $blocking whether there is an ongoing merge tiers operation
  *      occurring, if so don't do anything and return -1
  * @return int the active generation after the check and possible change has
  *     been performed
  */
 function initGenerationToAdd($add_num_docs, $callback = NULL, $blocking = false)
 {
     $current_num_docs = $this->getActiveShard()->num_docs;
     crawlLog("Current index shard has " . $current_num_docs . " documents.");
     $memory_limit = metricToInt(ini_get("memory_limit"));
     crawlLog("Memory Indexer limit is " . $memory_limit . ". Usage is " . memory_get_usage());
     if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation || 0.65 * $memory_limit < memory_get_usage()) {
         if ($blocking == true) {
             return -1;
         }
         crawlLog("Switching Index Shard...");
         $switch_time = microtime();
         // Save current shard dictionary to main dictionary
         $this->forceSave();
         $this->addAdvanceGeneration($callback);
         crawlLog("Switch Index Shard time:" . changeInMicrotime($switch_time));
     }
     return $this->generation_info['ACTIVE'];
 }
Example 18
 /**
  * Downloads the next file from the schedule of files to download received
  * from the web app.
  */
 function copyNextSyncFile()
 {
     $dir = $this->sync_dir;
     $name_server = $this->name_server;
     $time = time();
     $session = md5($time . AUTH_KEY);
     if (count($this->sync_schedule) <= 0) {
         return;
     }
     $file = array_pop($this->sync_schedule);
     crawlLog("Start syncing {$file['name']}..");
     if ($file['is_dir']) {
         if (!file_exists("{$dir}/{$file['name']}")) {
             mkdir("{$dir}/{$file['name']}");
             crawlLog(".. {$file['name']} directory created.");
         } else {
             crawlLog(".. {$file['name']} directory exists.");
         }
     } else {
         $request = "{$name_server}?c=resource&a=get&time={$time}&session={$session}" . "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&last_sync=" . $this->last_sync . "&f=cache&n=" . urlencode($file["name"]);
         if ($file["size"] < self::DOWNLOAD_RANGE) {
             $data = FetchUrl::getPage($request, NULL, true);
             if ($file["size"] != strlen($data)) {
                 array_push($this->sync_schedule, $file);
                 crawlLog(".. {$file['name']} error downloading, retrying.");
                 return;
             }
             file_put_contents("{$dir}/{$file['name']}", $data);
             crawlLog(".. {$file['name']} file copied.");
         } else {
             $offset = 0;
             $fh = fopen("{$dir}/{$file['name']}", "wb");
             $request .= "&l=" . self::DOWNLOAD_RANGE;
             while ($offset < $file['size']) {
                 $data = FetchUrl::getPage($request . "&o={$offset}", NULL, true);
                 $old_offset = $offset;
                 $offset += self::DOWNLOAD_RANGE;
                 $end_point = min($offset, $file["size"]);
                 //crude check if we need to redownload segment
                 if (strlen($data) != $end_point - $old_offset) {
                     $offset = $old_offset;
                     crawlLog(".. Download error re-requesting segment");
                     continue;
                 }
                 fwrite($fh, $data);
                 crawlLog(".. {$file['name']} downloaded bytes {$old_offset} " . "to {$end_point}..");
             }
             crawlLog(".. {$file['name']} file copied.");
             fclose($fh);
         }
     }
 }
Example 19
 /**
  * Used to remove from the queue urls that are no longer crawlable
  * because the allowed and disallowed sites have changed.
  */
 function cullNoncrawlableSites()
 {
     $count = $this->web_queue->to_crawl_queue->count;
     crawlLog("Scheduler: " . " Culling noncrawlable urls after change in crawl parameters;" . " Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
     $delete_urls = array();
     $i = 1;
     while ($i < $count) {
         crawlTimeoutLog("..Scheduler: " . "still culling noncrawlable urls. Examining " . "location %s in queue of %s.", $i, $count);
         $tmp = $this->web_queue->peekQueue($i, $fh);
         list($url, $weight, $flag, $probe) = $tmp;
         if (!$this->allowedToCrawlSite($url) || $this->disallowedToCrawlSite($url)) {
             $delete_urls[] = $url;
         }
         $i++;
     }
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     crawlLog("...Scheduler: Done selecting cullable URLS, time so far:" . changeInMicrotime($start_time));
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Scheduler: Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
             /*  if there was a hash table look up error still get rid of
                 index from priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} cullable URLS  from queue in time: " . changeInMicrotime($new_time));
 }
Example 20
 /**
  * {@inheritDoc}
  *
  * @param string $db_host the hostname of where the database is located
  *     (not used in all dbms's)
  * @param string $db_user the user to connect as
  * @param string $db_password the password of the user to connect as
  * @param string $db_name the name of the database on host we are
  * connecting to
  * @return mixed return false if not successful and some kind of
  *     connection object/identifier otherwise
  */
 function connect($db_host = DB_HOST, $db_user = DB_USER, $db_password = DB_PASSWORD, $db_name = DB_NAME)
 {
     try {
         $this->pdo = new PDO($db_host, $db_user, $db_password);
     } catch (PDOException $e) {
         $this->pdo = false;
         crawlLog('Connection failed: ' . $e->getMessage());
     }
     $this->to_upper_dbms = false;
     if (stristr($db_host, 'PGSQL')) {
         $this->to_upper_dbms = 'PGSQL';
     }
     return $this->pdo;
 }
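A usage sketch: since the constructor call above passes $db_host straight to PDO, the first argument is a PDO DSN string. The sqlite DSN is illustrative, and the PdoManager class name is an assumption based on the ucfirst(DBMS) . "Manager" naming scheme used in the database archive iterator example later in this file:

  // Connect using a sqlite DSN; user, password, and name are accepted
  // by the method but ignored by the sqlite driver.
  $db = new PdoManager();
  $connection = $db->connect("sqlite:" . CRAWL_DIR . "/data/default.db", "", "", "default");
  if (!$connection) {
      crawlLog("Could not connect to the database");
  }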
Example 21
 /**
  * Used to extract the title, description and links from
  * a string consisting of webpage data.
  *
  * @param string $page web-page contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     $is_centroid = $this->summarizer_option == self::CENTROID_SUMMARIZER;
     if (is_string($page)) {
         $page = preg_replace('/\\&nbsp\\;|\\&rdquo\\;|\\&ldquo\\;|\\&mdash\\;/si', ' ', $page);
         $page = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
         $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page);
         $dom = self::dom($dom_page);
         if ($dom !== false) {
             $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
             $summary[self::TITLE] = self::title($dom);
             if ($summary[self::TITLE] == "") {
                 $summary[self::TITLE] = self::crudeTitle($dom_page);
             }
             $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url);
             if ($is_centroid) {
                 $summary_cloud = CentroidSummarizer::getCentroidSummary($dom_page, $summary[self::LANG]);
                 $summary[self::DESCRIPTION] = $summary_cloud[0];
                 $summary[self::WORD_CLOUD] = $summary_cloud[1];
                 crawlLog("..Using Centroid Summarizer");
             } else {
                 $summary[self::DESCRIPTION] = self::description($dom, $dom_page);
                 crawlLog("..Using Basic Summarizer");
             }
             $crude = false;
             if (trim($summary[self::DESCRIPTION]) == "") {
                 $summary[self::DESCRIPTION] = self::crudeDescription($dom_page);
                 crawlLog("..No text extracted. " . "Invoked crude description fallback.");
                 $crude = true;
             }
             $summary[self::LINKS] = self::links($dom, $url);
             if ($summary[self::LINKS] == array()) {
                 $summary[self::LINKS] = parent::extractHttpHttpsUrls($page);
             }
             $location = self::location($dom, $url);
             if ($location) {
                 $summary[self::LINKS][$location] = "location:" . $url;
                 $summary[self::LOCATION] = true;
                 $summary[self::DESCRIPTION] .= $url . " => " . $location;
                 if (!$summary[self::TITLE]) {
                     $summary[self::TITLE] = $url;
                 }
             }
             if (!$crude && !$location) {
                 $location = self::relCanonical($dom, $url);
                 if ($location) {
                     $summary[self::LINKS] = array();
                     $summary[self::LINKS][$location] = "location:" . $url;
                     $summary[self::LOCATION] = true;
                     if (!$summary[self::DESCRIPTION]) {
                         $summary[self::DESCRIPTION] .= $url . " => " . $location;
                     }
                     if (!$summary[self::TITLE]) {
                         $summary[self::TITLE] = $url;
                     }
                 }
             }
             $summary[self::PAGE] = $page;
             if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0 && !$location) {
                  /* maybe not html? Treat as text with messed-up tags and
                     still try to get urls
                   */
                 $summary_text = parent::process(strip_tags($page), $url);
                 foreach ($summary as $field => $value) {
                     if (($value == "" || $value == array()) && isset($summary_text[$field])) {
                         $summary[$field] = $summary_text[$field];
                     }
                 }
             }
          } else {
              $summary = parent::process($page, $url);
          }
     }
     return $summary;
 }
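A usage sketch, assuming this method belongs to an HtmlProcessor-style page processor with a no-argument constructor and that the summary field constants are also reachable as CrawlConstants (both assumptions, since the class header is not shown):

  // Summarize a fetched page; the url is used to canonicalize relative links.
  $page_data = file_get_contents("http://www.example.com/");
  $processor = new HtmlProcessor();
  $summary = $processor->process($page_data, "http://www.example.com/");
  if ($summary) {
      crawlLog("Title: " . $summary[CrawlConstants::TITLE]);
      crawlLog("Extracted " . count($summary[CrawlConstants::LINKS]) . " links");
  }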
Example 22
 /**
   * Creates a database archive iterator with the given parameters. This
  * kind of iterator is used to cycle through the results of a SQL query
  * to a database, so that the results might be indexed by Yioop.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate  over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     $this->iterate_timestamp = $iterate_timestamp;
     $this->iterate_dir = $iterate_dir;
     $this->result_timestamp = $result_timestamp;
     $this->result_dir = $result_dir;
     $ini = parse_ini_with_fallback("{$this->iterate_dir}/arc_description.ini");
     $this->dbinfo = array("DBMS" => DBMS, "DB_HOST" => DB_HOST, "DB_NAME" => DB_NAME, "DB_USER" => DB_USER, "DB_PASSWORD" => DB_PASSWORD);
     foreach ($this->dbinfo as $key => $value) {
         $ini_key = strtolower($key);
         if (isset($ini[$ini_key])) {
             $this->dbinfo[$key] = $ini[$ini_key];
         }
     }
     $db_class = ucfirst($this->dbinfo["DBMS"]) . "Manager";
     $this->db = new $db_class();
     $this->db->connect($this->dbinfo['DB_HOST'], $this->dbinfo['DB_USER'], $this->dbinfo['DB_PASSWORD'], $this->dbinfo['DB_NAME']);
     if (isset($ini['sql'])) {
         $this->sql = $ini['sql'];
     } else {
         crawlLog("Database Archive Iterator needs a SQL statement to run");
         exit;
     }
     if (isset($ini['field_value_separator'])) {
         $this->field_value_separator = $ini['field_value_separator'];
     } else {
         $this->field_value_separator = "\n----\n";
     }
     if (isset($ini['column_separator'])) {
         $this->column_separator = $ini['column_separator'];
     } else {
         $this->column_separator = "\n====\n";
     }
     if (isset($ini['encoding'])) {
         $this->encoding = $ini['encoding'];
     } else {
         $this->encoding = "UTF-8";
     }
     if (!file_exists($result_dir)) {
         mkdir($result_dir);
     }
     if (file_exists("{$this->result_dir}/iterate_status.txt")) {
         $this->restoreCheckpoint();
     } else {
         $this->reset();
     }
 }
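A sketch of what the arc_description.ini read above might contain, expressed as the array parse_ini_with_fallback() would return for it; all values are illustrative, and any key that is omitted falls back to the defaults handled in the constructor:

  $ini = array(
      'dbms' => 'mysql',
      'db_host' => 'localhost',
      'db_name' => 'news_site',
      'db_user' => 'reader',
      'db_password' => 'secret',
      'sql' => 'SELECT TITLE, BODY, URL FROM ARTICLES',
      'encoding' => 'UTF-8',
      // field_value_separator and column_separator default to
      // "\n----\n" and "\n====\n" when not given
  );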
Example 23
 /**
   * Creates a text archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate  over the pages of
  * @param string $iterate_dir folder of files to iterate over. If this
  *     iterator is used in a fetcher and the data is on a name server
  *     set this to false
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  * @param array $ini describes start_ and end_delimiter, file_extension,
  *     encoding, and compression method used for pages in this archive
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir, $ini = array())
 {
     $this->iterate_timestamp = $iterate_timestamp;
     $this->iterate_dir = $iterate_dir;
     $this->result_timestamp = $result_timestamp;
     $this->result_dir = $result_dir;
     if (!file_exists($result_dir)) {
         mkdir($result_dir);
     }
     $this->partitions = array();
     if ($this->iterate_dir != false) {
         // false =network/fetcher iterator
         if ($ini == array()) {
             $ini = parse_ini_with_fallback("{$this->iterate_dir}/arc_description.ini");
         }
         $extension = $ini['file_extension'];
     }
     $this->setIniInfo($ini);
     if ($this->start_delimiter == "" && $this->end_delimiter == "" && $this->iterate_dir != false) {
         crawlLog("At least one of start or end delimiter must be set!!");
         exit;
     }
     if ($this->iterate_dir != false) {
         foreach (glob("{$this->iterate_dir}/*.{$extension}", GLOB_BRACE) as $filename) {
             $this->partitions[] = $filename;
         }
     }
     $this->num_partitions = count($this->partitions);
     $this->status_filename = "{$this->result_dir}/iterate_status.txt";
     $this->buffer_filename = $this->result_dir . "/buffer.txt";
     if (file_exists($this->status_filename)) {
         $this->restoreCheckpoint();
     } else {
         $this->reset();
     }
 }
Example 24
 /**
  * If news_update time has passed, then updates news feeds associated with
  * this Yioop instance
   */
 function newsUpdate()
 {
     $time = time();
     $something_updated = false;
     $delta = $time - $this->update_time;
     // every hour get items from feeds
     if ($delta > ONE_HOUR) {
         $this->update_time = $time;
         crawlLog("Performing news feeds update");
         $this->sourceModel->updateFeedItems(ONE_WEEK);
         $something_updated = true;
     }
     /*
        if anything changed rebuild shard
     */
     if ($something_updated) {
         crawlLog("Deleting feed items and rebuild shard...");
         $this->sourceModel->rebuildFeedShard(ONE_WEEK);
         crawlLog("... delete complete, shard rebuilt");
     } else {
         crawlLog("No updates needed.");
     }
 }