/** * Used to remove from the queue urls that are no longer crawlable * because the allowed and disallowed sites have changed. */ function cullNoncrawlableSites() { $count = $this->web_queue->to_crawl_queue->count; crawlLog("Scheduler: " . " Culling noncrawlable urls after change in crawl parameters;" . " Queue Size {$count}"); $start_time = microtime(); $fh = $this->web_queue->openUrlArchive(); $delete_urls = array(); $i = 1; while ($i < $count) { crawlTimeoutLog("..Scheduler: " . "still culling noncrawlable urls. Examining " . "location %s in queue of %s.", $i, $count); $tmp = $this->web_queue->peekQueue($i, $fh); list($url, $weight, $flag, $probe) = $tmp; if (!$this->allowedToCrawlSite($url) || $this->disallowedToCrawlSite($url)) { $delete_urls[] = $url; } $i++; } $this->web_queue->closeUrlArchive($fh); $new_time = microtime(); crawlLog("...Scheduler: Done selecting cullable URLS, time so far:" . changeInMicrotime($start_time)); $num_deletes = count($delete_urls); $k = 0; foreach ($delete_urls as $delete_url) { $k++; crawlTimeoutLog("..Scheduler: Removing selected url %s of %s " . "from queue.", $k, $num_deletes); if ($delete_url) { $this->web_queue->removeQueue($delete_url); } else { /* if there was a hash table look up error still get rid of index from priority queue */ $this->web_queue->to_crawl_queue->poll($k); } } crawlLog("...Scheduler: Removed {$k} cullable URLS from queue in time: " . changeInMicrotime($new_time)); }
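The allowed/disallowed checks above are methods not shown here; the sketch below is purely hypothetical (the real allowedToCrawlSite() and disallowedToCrawlSite() may match regular expressions or domain: patterns) and only illustrates the kind of prefix test such culling needs.

/* Hypothetical helper: returns true if $url falls under one of the site
   prefixes in $site_patterns. The actual scheduler methods may differ. */
function urlMatchesSitePrefix($url, $site_patterns)
{
    foreach ($site_patterns as $pattern) {
        if (strncmp($url, $pattern, strlen($pattern)) == 0) {
            return true;
        }
    }
    return false;
}
// A queued url then survives culling only when it matches some allowed
// pattern and no disallowed pattern.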
/** * Runs the QueryTool on the supplied command line arguments */ function start() { global $argv, $INDEXING_PLUGINS; if (!isset($argv[1])) { $this->usageMessageAndExit(); } $query = $argv[1]; $results_per_page = isset($argv[2]) ? $argv[2] : 10; $limit = isset($argv[3]) ? $argv[3] : 0; setLocaleObject(getLocaleTag()); $start_time = microtime(); $controller = new SearchController($INDEXING_PLUGINS); $data = $controller->queryRequest($query, $results_per_page, $limit); if (!isset($data['PAGES'])) { $data['PAGES'] = array(); } foreach ($data['PAGES'] as $page) { echo "============\n"; echo "TITLE: " . trim($page[self::TITLE]) . "\n"; echo "URL: " . trim($page[self::URL]) . "\n"; echo "IPs: "; if (isset($page[self::IP_ADDRESSES])) { foreach ($page[self::IP_ADDRESSES] as $address) { echo $address . " "; } } echo "\n"; echo "DESCRIPTION: " . wordwrap(trim($page[self::DESCRIPTION])) . "\n"; echo "Rank: " . $page[self::DOC_RANK] . "\n"; echo "Relevance: " . $page[self::RELEVANCE] . "\n"; echo "Proximity: " . $page[self::PROXIMITY] . "\n"; echo "Score: " . $page[self::SCORE] . "\n"; echo "============\n\n"; } $data['ELAPSED_TIME'] = changeInMicrotime($start_time); echo "QUERY STATISTICS\n"; echo "============\n"; echo "ELAPSED TIME: " . $data['ELAPSED_TIME'] . "\n"; if (isset($data['LIMIT'])) { echo "LOW: " . $data['LIMIT'] . "\n"; } if (isset($data['HIGH'])) { echo "HIGH: " . min($data['TOTAL_ROWS'], $data['LIMIT'] + $data['RESULTS_PER_PAGE']) . "\n"; } if (isset($data['TOTAL_ROWS'])) { echo "TOTAL ROWS: " . $data['TOTAL_ROWS'] . "\n"; } if (isset($data['ERROR'])) { echo $data['ERROR'] . "\n"; } }
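A usage sketch of the tool; the script name is an assumption, while the argument order (query, results per page, starting result) follows the code above.

/* Hypothetical invocation:
     php query_tool.php "chris pollett" 5 10
   runs the query "chris pollett" and prints summaries for five results
   starting at result 10, followed by the query statistics block. */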
/** * Sends an email (much like PHP's mail command, but not requiring * a configured smtp server on the current machine) * * @param string $subject subject line of the email * @param string $from sender email address * @param string $to recipient email address * @param string $message message body for the email */ function send($subject, $from, $to, $message) { $start_time = microtime(); if ($from == "") { $from = $this->sender_email; } $eol = self::EOL; if (USE_MAIL_PHP) { $header = "From: " . $from . $eol; mail($to, $subject, $message, $header); return; } $this->messages = ""; $mail = "Date: " . date(DATE_RFC822) . $eol; $mail .= "Subject: " . $subject . $eol; $mail .= "From: " . $from . $eol; $mail .= "To: " . $to . $eol; $mail .= $eol . $eol . $message . $eol . "."; $commands = array("MAIL FROM: <{$from}>" => self::OKAY, "RCPT TO: <{$to}>" => self::OKAY, "DATA" => self::START_INPUT, $mail => self::OKAY); if ($this->startSession()) { foreach ($commands as $command => $good_response) { $response = $this->smtpCommand($command); if ($response != $good_response) { $this->messages .= "{$command} failed!! {$response} {$good_response}\n"; break; } } $this->endSession(); } if (QUERY_STATISTICS) { $current_messages = AnalyticsManager::get("MAIL_MESSAGES"); if (!$current_messages) { $current_messages = array(); } $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME"); if (!$total_time) { $total_time = 0; } $elapsed_time = changeInMicrotime($start_time); $total_time += $elapsed_time; $current_messages[] = array("QUERY" => "<p>Send Mail</p>" . "<pre>" . wordwrap($this->messages, 60, "\n", true) . "</pre>", "ELAPSED_TIME" => $elapsed_time); AnalyticsManager::set("MAIL_MESSAGES", $current_messages); AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time); } }
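A minimal usage sketch, assuming send() lives on a mail-server style class whose constructor takes the sender address and SMTP connection settings; the constructor arguments shown are assumptions, not the actual signature.

// Hypothetical construction; only the send() call mirrors the method
// defined above.
$mailer = new MailServer("no-reply@example.com", "smtp.example.com", 25,
    "smtp_user", "smtp_password"); // assumed constructor arguments
$mailer->send("Account created", "no-reply@example.com",
    "user@example.com", "Welcome! Your account is ready to use.");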
/** * Determines, based on the size of the index shard, whether additions should * go into the active generation or whether a new generation should be started. * If a new generation is started, the old generation is saved, the dictionary * of the old shard is copied to the bundle's dictionary, and a log-merge is * performed if needed. * * @param int $add_num_docs number of docs in the shard about to be added * @param object $callback object with join function to be * called if the process is taking too long * @param bool $blocking whether there is an ongoing merge tiers operation * occurring; if so, don't do anything and return -1 * @return int the active generation after the check and possible change has * been performed */ function initGenerationToAdd($add_num_docs, $callback = NULL, $blocking = false) { $current_num_docs = $this->getActiveShard()->num_docs; crawlLog("Current index shard has " . $current_num_docs . " documents."); $memory_limit = metricToInt(ini_get("memory_limit")); crawlLog("Memory Indexer limit is " . $memory_limit . ". Usage is " . memory_get_usage()); if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation || 0.65 * $memory_limit < memory_get_usage()) { if ($blocking == true) { return -1; } crawlLog("Switching Index Shard..."); $switch_time = microtime(); // Save current shard dictionary to main dictionary $this->forceSave(); $this->addAdvanceGeneration($callback); crawlLog("Switch Index Shard time:" . changeInMicrotime($switch_time)); } return $this->generation_info['ACTIVE']; }
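The memory check relies on metricToInt() converting a php.ini shorthand such as "128M" into bytes. A minimal sketch of that kind of conversion is shown below, under the assumption that only the K/M/G suffixes matter; the real helper may handle more cases.

// Sketch of a php.ini shorthand-to-bytes conversion (assumption: the
// real metricToInt() behaves roughly like this).
function shorthandToBytes($value)
{
    $value = trim($value);
    $multipliers = array('K' => 1024, 'M' => 1048576, 'G' => 1073741824);
    $suffix = strtoupper(substr($value, -1));
    $number = (int) $value; // (int) keeps the leading digits only
    return isset($multipliers[$suffix]) ?
        $number * $multipliers[$suffix] : $number;
}
// With memory_limit = "128M" a shard switch is triggered once usage
// exceeds 0.65 * 134217728 bytes (about 87 MB), or once the document
// count would pass $num_docs_per_generation.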
/** * Builds an inverted index shard (word --> {docs it appears in}) * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. * This inverted index shard is then merged by a queue_server * into the inverted index of the current generation of the crawl. * The complete inverted index for the whole crawl is built out of these * inverted indexes for generations. The point of computing a partial * inverted index on the fetcher is to reduce some of the computational * burden on the queue server. The resulting mini index computed by * buildMiniInvertedIndex() is stored in * $this->found_sites[self::INVERTED_INDEX] * */ function buildMiniInvertedIndex() { $start_time = microtime(); $keypad = ""; crawlLog(" Start building mini inverted index ... Current Memory:" . memory_get_usage()); $num_seen = count($this->found_sites[self::SEEN_URLS]); $this->num_seen_sites += $num_seen; /* for the fetcher we are not saving the index shards so name doesn't matter. */ if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) { $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}"); } for ($i = 0; $i < $num_seen; $i++) { $interim_time = microtime(); $site = $this->found_sites[self::SEEN_URLS][$i]; if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) { continue; } $doc_rank = false; if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) { $doc_rank = $this->archive_iterator->weight($site); } if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") { $is_link = true; $doc_keys = $site[self::HTTP_CODE]; $site_url = $site[self::TITLE]; $host = UrlParser::getHost($site_url); $link_parts = explode('|', $site[self::HASH]); if (isset($link_parts[5])) { $link_origin = $link_parts[5]; } else { $link_origin = $site_url; } $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin); } else { $is_link = false; $site_url = str_replace('|', "%7C", $site[self::URL]); $host = UrlParser::getHost($site_url); $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1); $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources); } $word_lists = array(); /* self::JUST_METAS check to avoid getting sitemaps in results for popular words */ $lang = NULL; if (!isset($site[self::JUST_METAS])) { $host_words = UrlParser::getWordsIfHostUrl($site_url); $path_words = UrlParser::getWordsLastPathPartUrl($site_url); if ($is_link) { $phrase_string = $site[self::DESCRIPTION]; } else { if (isset($site[self::LANG])) { if (isset($this->programming_language_extension[$site[self::LANG]])) { $phrase_string = $site[self::DESCRIPTION]; } else { $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION]; } } else { $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . 
$site[self::DESCRIPTION]; } } if (isset($site[self::LANG])) { $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]); } $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); $len = strlen($phrase_string); if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) { $meta_ids[] = "safe:true"; $safe = true; } else { $meta_ids[] = "safe:false"; $safe = false; } } if (!$is_link) { //store inlinks so they can be searched by $num_links = count($site[self::LINKS]); if ($num_links > 0) { $link_rank = false; if ($doc_rank !== false) { $link_rank = max($doc_rank - 1, 1); } } else { $link_rank = false; } } $num_queue_servers = count($this->queue_servers); if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) { $score_keys = ""; foreach ($site[self::USER_RANKS] as $label => $score) { $score_keys .= packInt($score); } if (strlen($score_keys) % 8 != 0) { $score_keys .= $keypad; } $doc_keys .= $score_keys; } $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank); /* $this->no_process_links is set when doing things like mix recrawls. In this case links likely already will appear in what indexing, so don't index again. $site[self::JUST_META] is set when have a sitemap or robots.txt (this case set later). In this case link info is not particularly useful for indexing and can greatly slow building inverted index. */ if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) { foreach ($site[self::LINKS] as $url => $link_text) { /* this mysterious check means won't index links from robots.txt. Sitemap will still be in TO_CRAWL, but that's done elsewhere */ if (strlen($url) == 0 || is_numeric($url)) { continue; } $link_host = UrlParser::getHost($url); if (strlen($link_host) == 0) { continue; } $part_num = calculatePartition($link_host, $num_queue_servers); $summary = array(); if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) { $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array(); } $elink_flag = $link_host != $host ? true : false; $link_text = strip_tags($link_text); $ref = $elink_flag ? "eref" : "iref"; $url = str_replace('|', "%7C", $url); $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url; $elink_flag_string = $elink_flag ? "e" : "i"; $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . 
"/", true), 1); $summary[self::URL] = $link_id; $summary[self::TITLE] = $url; // stripping html to be on the safe side $summary[self::DESCRIPTION] = $link_text; $summary[self::TIMESTAMP] = $site[self::TIMESTAMP]; $summary[self::ENCODING] = $site[self::ENCODING]; $summary[self::HASH] = $link_id; $summary[self::TYPE] = "link"; $summary[self::HTTP_CODE] = $link_keys; $summary[self::LANG] = $lang; $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary; $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang); $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url); if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) { $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}"); } $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank); } } $iterim_elapse = changeInMicrotime($interim_time); if ($iterim_elapse > 5) { crawlLog("..Inverting " . $site[self::URL] . "...took > 5s."); } crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]); } if ($this->crawl_type == self::ARCHIVE_CRAWL) { $this->recrawl_check_scheduler = true; } crawlLog(" Build mini inverted index time " . changeInMicrotime($start_time)); }
/** * Removes from the passed array those elements $elt that are either in * the filter bundle or whose $elt[$field_name] is in the bundle. * * @param array& $arr the array to remove elements from * @param array $field_names if not NULL, an array of field names of $arr * to use to do the filtering */ function differenceFilter(&$arr, $field_names = NULL) { $incremental_time = microtime(); $num_filters = $this->num_filters; $count = count($arr); for ($i = 0; $i < $num_filters; $i++) { if ($i == $num_filters - 1) { $tmp_filter = $this->current_filter; } else { $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr"); } for ($j = 0; $j < $count; $j++) { if ($field_names === NULL) { $tmp =& $arr[$j]; if ($tmp !== false && $tmp_filter->contains($tmp)) { /* We deliberately don't try to add anything that has the hash field set to false. This is our cue to skip an element such as a link document which we know will almost always be unique and so be unnecessary to de-duplicate */ unset($arr[$j]); } } else { //now do the same strategy for the array of fields case foreach ($field_names as $field_name) { $tmp =& $arr[$j][$field_name]; if ($tmp !== false && $tmp_filter->contains($tmp)) { unset($arr[$j]); break; } } } if (changeInMicrotime($incremental_time) > 30) { crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}."); $incremental_time = microtime(); } } } }
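A usage sketch of the two calling conventions; $filter_bundle, the urls, and the HASH field name are placeholders for whatever the caller actually has.

// Flat array of hashes: entries already present in the bloom filters
// are unset from the array in place.
$doc_hashes = array(crawlHash("http://a.example.com/", true),
    crawlHash("http://b.example.com/", true));
$filter_bundle->differenceFilter($doc_hashes);
// Array of records filtered on one of their fields instead:
$records = array(array('HASH' => crawlHash("http://a.example.com/", true),
    'URL' => "http://a.example.com/"));
$filter_bundle->differenceFilter($records, array('HASH'));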
/** * Gets doc summaries of documents containing given words and meeting the * additional provided criteria * @param array $word_structs an array of word_structs. Here a word_struct * is an associative array with at least the following fields * KEYS -- an array of word keys * QUOTE_POSITIONS -- an array of positions of words that appeared in * quotes (so need to be matched exactly) * DISALLOW_PHRASES -- an array of words the document must not contain * WEIGHT -- a weight to multiply scores returned from this iterator by * INDEX_NAME -- an index timestamp to get results from * @param int $limit index of the first document in the result ordering to return * @param int $num number of documents to return summaries of * @param array& $filter an array of hashes of domains to filter from * results * @param bool $use_cache_if_allowed if true and USE_CACHE is true then * an attempt will be made to look up the results in either * the file cache or memcache. Otherwise, items will be recomputed * and then potentially restored in cache * @param int $raw ($raw == 0) normal grouping, ($raw > 0) * no grouping done on data. if ($raw == 1) no lookups of summaries * done * @param array $queue_servers a list of urls of yioop machines which might * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs * @param string $save_timestamp_name if this timestamp is not empty, then * save the iterate position, so one can resume on future queries that make * use of the timestamp. If used then $limit is ignored and we get the next $num * docs after $save_timestamp's previous iterate position. * @param bool $limit_news if true the number of media:news items to * allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT * * @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true) { global $CACHE; $indent = " "; $in2 = $indent . $indent; $in3 = $in2 . $indent; $in4 = $in2 . $in2; if (QUERY_STATISTICS) { $lookup_time = microtime(); } $use_proximity = false; $time = time(); if (count($word_structs) > 1 || isset($word_structs[0]["KEYS"]) && count($word_structs[0]["KEYS"]) > 1 || $word_structs == array() && substr_count($original_query, " ") > 1) { $use_proximity = true; } if (!isset($filter['time'])) { $filter['time'] = 0; } $filter_time = $filter['time']; unset($filter['time']); //iterators don't expect time field $pages = array(); $generation = 0; $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; if ($save_timestamp_name != "") { $to_retrieve = $num; $limit = 0; $start_slice = 0; } if (USE_CACHE && $save_timestamp_name == "") { $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name; $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num); if ($use_cache_if_allowed) { $cache_success = true; $results = $CACHE->get($summary_hash); if (!isset($results['TIME']) || $filter_time > $results['TIME']) { //if filter has changed since cached, then invalidate cache $results = false; } if (isset($results['TIME'])) { $cached_time = $time - $results['TIME']; } else { $cached_time = $time; } if ($cached_time > MAX_QUERY_CACHE_TIME) { $results = false; } if (isset($results['PAGES'])) { $close_prefix = WORK_DIRECTORY . "/schedules/" . 
self::index_closed_name; $has_changeable_results = false; $seen_times = array(); foreach ($results['PAGES'] as $page) { if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) { continue; } $seen_times[] = $page[self::CRAWL_TIME]; $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt"; if (!file_exists($current_closed)) { //either feed result or from active crawl $has_changeable_results = true; break; } } if ($has_changeable_results) { if ($cached_time > MIN_QUERY_CACHE_TIME) { $results = false; } } } if (QUERY_STATISTICS) { $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />"; } if ($results !== false) { return $results; } } } $old_to_retrieve = $to_retrieve; $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news); $num_retrieved = 0; $pages = array(); if (is_object($query_iterator)) { while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) { $pages += $next_docs; $num_retrieved = count($pages); } } if ($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) { // used for archive crawls of crawl mixes $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt"; $iterators = $query_iterator->save_iterators; $cnt_iterators = count($iterators); $save_point = array(); for ($i = 0; $i < $cnt_iterators; $i++) { $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord(); } $results["SAVE_POINT"] = $save_point; file_put_contents($save_file, serialize($save_point)); $this->db->setWorldPermissionsRecursive($save_file); } $pages = array_values($pages); $result_count = count($pages); $sort_time = 0; if ($raw == 0) { // initialize scores $sort_start = microtime(); $max_user_ranks = 0; for ($i = 0; $i < $result_count; $i++) { $pages[$i]["OUT_SCORE"] = 0; if (isset($pages[$i][self::USER_RANKS])) { $j = count($pages[$i][self::USER_RANKS]); if ($max_user_ranks < $j) { $max_user_ranks = $j; } } } if ($max_user_ranks > 0) { for ($i = 0; $i < $result_count; $i++) { for ($j = 0; $j < $max_user_ranks; $j++) { if (isset($pages[$i][self::USER_RANKS][$j])) { $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j]; } else { $pages[$i]["USCORE{$j}"] = 0; } } } } $subscore_fields = array(self::DOC_RANK, self::RELEVANCE); if ($use_proximity) { $subscore_fields[] = self::PROXIMITY; } if ($max_user_ranks > 0) { for ($j = 0; $j < $max_user_ranks; $j++) { $subscore_fields[] = "USCORE{$j}"; } } $num_fields = count($subscore_fields); // Compute Reciprocal Rank Fusion Score $alpha = 600 / $num_fields; if (isset($pages[0])) { foreach ($subscore_fields as $field) { orderCallback($pages[0], $pages[0], $field); usort($pages, "orderCallback"); $score = 0; for ($i = 0; $i < $result_count; $i++) { if ($i > 0) { if ($pages[$i - 1][$field] != $pages[$i][$field]) { $score++; } } $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score); } } orderCallback($pages[0], $pages[0], "OUT_SCORE"); } usort($pages, "orderCallback"); if ($use_proximity) { for ($i = 0; $i < $result_count; $i++) { $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"]; } } else { for ($i = 0; $i < $result_count; $i++) { $pages[$i][self::PROXIMITY] = 1; $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"]; } } $sort_time = changeInMicrotime($sort_start); } if ($num_retrieved < $to_retrieve) { $results['TOTAL_ROWS'] = $num_retrieved; } else { 
$results['TOTAL_ROWS'] = $query_iterator->num_docs; //this is only an approximation } if ($raw == 1 && $save_timestamp_name == "") { $pages = array_slice($pages, $start_slice); $pages = array_slice($pages, $limit - $start_slice, $num); $results['PAGES'] =& $pages; if ($old_to_retrieve != $to_retrieve) { $results['HARD_QUERY'] = $old_to_retrieve; } return $results; } if (QUERY_STATISTICS) { $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />"; $machine_times = AnalyticsManager::get("MACHINE_TIMES"); if ($machine_times) { $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />"; } $net_times = AnalyticsManager::get("NET_TIMES"); $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES"); if ($net_times && $max_machine_times) { $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />"; } if ($sort_time) { $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . "<br />"; } $summaries_time = microtime(); } $get_pages = array_slice($pages, $limit, $num); $to_get_count = count($get_pages); $groups_with_docs = false; if (preg_match("/\\bsite:doc\\b/", $original_query)) { $groups_with_docs = true; } $out_pages = array(); $cur_limit = $limit; while (count($out_pages) < $to_get_count && $get_pages) { $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs)); if ($save_timestamp_name != "") { break; } $cur_limit += $num; $get_pages = array_slice($pages, $cur_limit, $num); } $out_pages = array_slice($out_pages, 0, $num); if (QUERY_STATISTICS) { $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES"); if ($summary_times_string) { $round_summary_times = unserialize($summary_times_string); $summary_delta_time = changeInMicrotime($summaries_time); $summary_time_info = "{$summary_delta_time}<br /> {$in4}"; $sum_max_time = 0; foreach ($round_summary_times as $summary_times) { $i = 0; $max_time = 0; foreach ($summary_times as $summary_time) { $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}"; $max_time = $summary_time > $max_time ? $summary_time : $max_time; $i++; } $sum_max_time += $max_time; } $net_overhead = $summary_delta_time - $sum_max_time; $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead; } else { $summary_time_info = changeInMicrotime($summaries_time); } $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . "<br />"; } $results['PAGES'] =& $out_pages; $results['TIME'] = time(); $lang = guessLocaleFromString($original_query); $tokenizer = PhraseParser::getTokenizer($lang); //only use tokenizer if no meta word or disjuncts in query if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") && method_exists($tokenizer, "tagPartsOfSpeechPhrase") && isset($tokenizer->use_thesaurus)) { $results = $this->sortByThesaurusScore($results, $original_query, $lang); } if (USE_CACHE && $save_timestamp_name == "") { $CACHE->set($summary_hash, $results); } return $results; }
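The OUT_SCORE computation in getSummariesByHash() is a reciprocal-rank-fusion style combination: each subscore field contributes $alpha / (59 + rank), where the rank only advances when the field value changes and $alpha is 600 divided by the number of fields. A small worked example with made-up scores and two fields:

// Illustrative arithmetic only (not part of the class above).
$alpha = 600 / 2; // two subscore fields, e.g. DOC_RANK and RELEVANCE
// Page A ranks 0 on the first field and 1 on the second; page B the
// reverse, so their fused scores tie:
$score_a = $alpha / (59 + 0) + $alpha / (59 + 1); // about 10.08
$score_b = $alpha / (59 + 1) + $alpha / (59 + 0); // about 10.08
// A page ranked 0 on both fields would score 2 * $alpha / 59 (about
// 10.17) and so sort above pages that win only one field.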
/** * Test how fast insertions and deletions can be done */ function timingTestCase() { $start_time = microtime(); for ($i = 0; $i < 10000; $i++) { $this->test_objects['FILE1']->insert(crawlHash("hi{$i}", true), "0000" . packInt($i)); } $this->assertTrue(changeInMicrotime($start_time) < 2, "Insert 10000 into table of size 20000 takes less than 2 seconds"); $start_time = microtime(); for ($i = 0; $i < 10000; $i++) { $this->test_objects['FILE1']->delete(crawlHash("hi{$i}", true)); } $this->assertTrue(changeInMicrotime($start_time) < 2, "Delete 10000 from table of size 20000 takes less than 2 seconds"); }
/** * Executes the supplied SQL command on the database; depending on debug * levels, computes query statistics * * This method can be used for either query or data manipulation statements * * @param string $sql SQL statement to execute * @param array $params bind_name => value pairs to interpolate into * the $sql to be executed * @return mixed false if the query fails, a resource or true otherwise */ function execute($sql, $params = array()) { if (QUERY_STATISTICS) { $query_info = array(); $query_info['QUERY'] = $sql; if ($params != array()) { $query_info['QUERY'] .= "<br />" . print_r($params, true); } $start_time = microtime(); } $result = $this->exec($sql, $params); if (QUERY_STATISTICS) { $query_info['ELAPSED_TIME'] = changeInMicrotime($start_time); $this->total_time += $query_info['ELAPSED_TIME']; $this->query_log[] = $query_info; } return $result; }
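A usage sketch of the bound-parameter form; the table and column names are made up.

// Hypothetical table/columns; :user_id is passed to exec() as a bound
// value rather than being concatenated into the SQL string.
$db->execute("SELECT USER_NAME FROM USERS WHERE USER_ID = :user_id",
    array(":user_id" => 42));
// When QUERY_STATISTICS is true, the statement, its printed parameters,
// and its elapsed time are appended to $db->query_log.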
/** * Gets status and, if done processing all other mirroring activities, * gets a new list of files that have changed since the last synchronization * from the web app of the machine we are mirroring with. * * @return mixed array or bool. Returns false if we weren't successful in * contacting the web app; otherwise, returns an array with a status * and potentially a list of files to sync */ function checkScheduler() { $info = array(); $name_server = $this->name_server; $start_time = microtime(); $time = time(); $session = md5($time . AUTH_KEY); $request = $name_server . "?c=resource&time={$time}&session={$session}" . "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&last_sync=" . $this->last_sync; if ($this->start_sync <= $this->last_sync) { $request .= "&a=syncList"; $info_string = FetchUrl::getPage($request, NULL, true); if ($info_string === false) { return false; } $this->last_notify = $time; $info_string = trim($info_string); $info = unserialize(gzuncompress(base64_decode($info_string))); if (isset($info[self::STATUS]) && $info[self::STATUS] == self::CONTINUE_STATE) { $this->start_sync = time(); $this->sync_schedule = $info[self::DATA]; unset($info[self::DATA]); } } else { $info[self::STATUS] = self::CONTINUE_STATE; if ($time - $this->last_notify > MIRROR_NOTIFY_FREQUENCY) { $request .= "&a=syncNotify"; FetchUrl::getPage($request, NULL, true); $this->last_notify = $time; crawlLog("Notifying master that mirror is alive.."); } } if (count($this->sync_schedule) == 0) { $this->last_sync = $this->start_sync; $this->db->setWorldPermissionsRecursive($this->sync_dir, true); file_put_contents($this->last_sync_file, serialize($this->last_sync)); } crawlLog(" Time to check Scheduler " . changeInMicrotime($start_time)); return $info; }
/** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached * * @return mixed doc ids and score if there are docs left, -1 otherwise */ function findDocsWithWord() { $query = $this->base_query . "&num={$this->results_per_block}&limit={$this->limit}"; $sites = array(); $lookup = array(); $i = 0; $j = 0; foreach ($this->queue_servers as $server) { if ($this->more_flags[$i]) { $sites[$j][CrawlConstants::URL] = $server . "?" . $query . "&machine={$i}"; $lookup[$j] = $i; $j++; } $i++; } $net_times = AnalyticsManager::get("NET_TIMES"); $net_times = $net_times ? $net_times : 0; $download_time = microtime(); $downloads = array(); if (count($sites) > 0) { $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true); } $net_times += changeInMicrotime($download_time); AnalyticsManager::set("NET_TIMES", $net_times); $results = array(); $count = count($downloads); $this->num_docs = 0; $in4 = " "; $machine_times = AnalyticsManager::get("MACHINE_TIMES"); $indent = $machine_times ? "<br />{$in4}" : $in4; $machine_times = $machine_times ? $machine_times : ""; $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES"); $max_machine_times = $max_machine_times ? $max_machine_times : 0; $max_time = 0; $num_with_results = $count; for ($j = 0; $j < $count; $j++) { $download =& $downloads[$j]; if (isset($download[self::PAGE])) { $pre_result = @unserialize($download[self::PAGE]); if (!isset($pre_result["TOTAL_ROWS"]) || $pre_result["TOTAL_ROWS"] < $this->results_per_block) { $this->more_flags[$lookup[$j]] = false; $num_with_results--; } if (isset($pre_result["TOTAL_ROWS"])) { $this->num_docs += $pre_result["TOTAL_ROWS"]; } if (isset($pre_result["PAGES"])) { foreach ($pre_result["PAGES"] as $page_data) { if (isset($page_data[self::KEY])) { $results[$page_data[self::KEY]] = $page_data; $results[$page_data[self::KEY]][self::MACHINE_ID] = $lookup[$j]; } } } $max_time = max($max_time, $pre_result['ELAPSED_TIME']); $lookup_link = $this->makeLookupLink($sites, $lookup[$j]); $machine_times .= $indent . $lookup_link . $pre_result['ELAPSED_TIME'] . " "; $indent = ""; } } if (isset($pre_result["HARD_QUERY"])) { $this->hard_query = $pre_result["HARD_QUERY"]; } if ($num_with_results > 0) { $this->next_results_per_block = ceil(floatval($count * $this->results_per_block) / floatval($num_with_results)); } $max_machine_times += $max_time; AnalyticsManager::set("MACHINE_TIMES", $machine_times); AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times); if ($results == array()) { $results = -1; } if ($results != -1) { if ($this->filter != NULL) { foreach ($results as $keys => $data) { $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN); if (in_array($host_key, $this->filter)) { unset($results[$keys]); } } } } $this->count_block = count($results); $this->pages = $results; return $results; }
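The next_results_per_block adjustment at the end compensates for servers that have run out of results. For example, if five queue servers were each asked for twenty results and only two still returned a full block, the next round asks each remaining server for more:

// Worked example of the formula used above.
$count = 5;               // queue servers contacted this round
$results_per_block = 20;  // results requested per server
$num_with_results = 2;    // servers that still had a full block
$next_results_per_block = ceil(floatval($count * $results_per_block) /
    floatval($num_with_results)); // ceil(100 / 2) = 50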
/** * This is the main entry point for handling a search request. * * processRequest determines the type of search request (normal request, * cache request, or related request), or whether a user is returning from * the admin panel via signout. It then calls the appropriate method to * handle the given activity. Finally, it draws the search screen. */ function processRequest() { $data = array(); $start_time = microtime(); if (AD_LOCATION != "none") { $ad_fields = array('TOP_ADSCRIPT', 'SIDE_ADSCRIPT', 'GLOBAL_ADSCRIPT'); foreach ($ad_fields as $ad_field) { $ad = html_entity_decode(constant($ad_field), ENT_QUOTES); $ad = preg_replace("[(]", "(", $ad); $data[$ad_field] = preg_replace("[)]", ")", $ad); } } list($subsearches, $no_query) = $this->initializeSubsearches(); $format_info = $this->initializeResponseFormat(); if (!$format_info) { return; } list($view, $web_flag, $raw, $results_per_page, $limit) = $format_info; list($query, $activity, $arg) = $this->initializeUserAndDefaultActivity($data); if ($activity == "query" && $this->mirrorHandle()) { return; } list($index_timestamp, $index_info, $save_timestamp) = $this->initializeIndexInfo($web_flag, $raw, $data); unset($_SESSION['LAST_ACTIVITY']); if (isset($_REQUEST['q']) && strlen($_REQUEST['q']) > 0 || $activity != "query") { if ($activity != "cache") { $this->processQuery($data, $query, $activity, $arg, $results_per_page, $limit, $index_timestamp, $raw, $save_timestamp); // calculate the results of a search if there is one } else { if (isset($_REQUEST['repository'])) { $ui_array = array(); } else { $ui_array = array("highlight", "yioop_nav", "history", "summaries", "version"); if (isset($_REQUEST['from_cache'])) { $ui_array[] = "cache_link_referrer"; } if (isset($_REQUEST['hist_open'])) { $ui_array[] = "hist_ui_open"; } } $this->cacheRequestAndOutput($arg, $ui_array, $query, $index_timestamp); return; } } $data['ELAPSED_TIME'] = changeInMicrotime($start_time); if ($view == "serial" || $view == "json") { if (isset($data["PAGES"])) { $count = count($data["PAGES"]); for ($i = 0; $i < $count; $i++) { unset($data["PAGES"][$i]["OUT_SCORE"]); $data["PAGES"][$i][self::SCORE] = "" . round($data["PAGES"][$i][self::SCORE], 3); $data["PAGES"][$i][self::DOC_RANK] = "" . round($data["PAGES"][$i][self::DOC_RANK], 3); $data["PAGES"][$i][self::RELEVANCE] = "" . round($data["PAGES"][$i][self::RELEVANCE], 3); } } if ($view == "serial") { echo serialize($data); } else { $out_data = array(); $out_data["language"] = getLocaleTag(); $out_data["link"] = NAME_SERVER . 
"?f=json&q={$data['QUERY']}"; $out_data["totalResults"] = $data['TOTAL_ROWS']; $out_data["startIndex"] = $data['LIMIT']; $out_data["itemsPerPage"] = $data['RESULTS_PER_PAGE']; foreach ($data['PAGES'] as $page) { $item = array(); $item["title"] = $page[self::TITLE]; if (!isset($page[self::TYPE]) || isset($page[self::TYPE]) && $page[self::TYPE] != "link") { $item["link"] = $page[self::URL]; } else { $item["link"] = strip_tags($page[self::TITLE]); } $item["description"] = strip_tags($page[self::DESCRIPTION]); if (isset($page[self::THUMB]) && $page[self::THUMB] != 'NULL') { $item["thumb"] = $page[self::THUMB]; } if (isset($page[self::TYPE])) { $item["type"] = $page[self::TYPE]; } $out_data['item'][] = $item; } echo json_encode($out_data); } exit; } if ($web_flag) { $this->addSearchViewData($index_info, $no_query, $raw, $view, $subsearches, $data); } if (!isset($data["USERNAME"]) && isset($_SESSION['USER_ID'])) { $signin_model = $this->model("signin"); $data['USERNAME'] = $signin_model->getUserName($_SESSION['USER_ID']); } $this->displayView($view, $data); }
/** * Make multi_curl requests for an array of sites with urls or onion urls * * @param array $sites an array containing urls of pages to request * @param bool $timer flag, true means print timing statistics to log * @param int $page_range_request maximum number of bytes to download/page * 0 means download all * @param string $temp_dir folder to store temporary ip header info * @param string $key the component of $sites[$i] that has the value of * a url to get defaults to URL * @param string $value component of $sites[$i] in which to store the * page that was gotten * @param bool $minimal if true do a faster request of pages by not * doing things like extract HTTP headers sent, etcs * @param array $post_data data to be POST'd to each site * @param bool $follow whether to follow redirects or not * @param string $tor_proxy url of a proxy that knows how to download * .onion urls * @param array $proxy_servers if not array(), then an array of proxy * server to use rather than to directly download web pages from * the current machine * * @return array an updated array with the contents of those pages */ static function getPages($sites, $timer = false, $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL, $key = CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "", $proxy_servers = array()) { $agent_handler = curl_multi_init(); $active = NULL; $start_time = microtime(); if (!$minimal && $temp_dir == NULL) { $temp_dir = CRAWL_DIR . "/temp"; if (!file_exists($temp_dir)) { mkdir($temp_dir); } } //Set-up requests $num_sites = count($sites); for ($i = 0; $i < $num_sites; $i++) { $is_gopher = false; $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher; if (isset($sites[$i][$key])) { list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders($sites[$i][$key], $minimal, $proxy_servers); if ($headers == "gopher") { $is_gopher = true; $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher; $headers = array(); } $sites[$i][0] = curl_init(); if (!$minimal) { $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+'); curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]); curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true); } curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT); curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER); curl_setopt($sites[$i][0], CURLOPT_URL, $url); if (strcmp(substr($url, -10), "robots.txt") == 0) { $sites[$i]['ROBOT'] = true; $follow = true; /*wikipedia redirects their robot page. 
grr want to force this for robots pages */ } curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow); curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0); curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true); curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true); curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT); curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT); if (stripos($url, '.onion') !== false && $tor_proxy != "") { curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy); //CURLPROXY_SOCKS5_HOSTNAME = 7 curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7); if ($timer) { crawlLog("Using Tor proxy for {$url}.."); } } else { if ($proxy_servers != array() && !$is_gopher) { $select_proxy = rand(0, count($proxy_servers) - 1); $proxy_server = $proxy_servers[$select_proxy]; $proxy_parts = explode(":", $proxy_server); $proxy_ip = $proxy_parts[0]; if (!isset($proxy_parts[2]) || strtolower($proxy_parts[2]) == 'http') { $proxy_type = CURLPROXY_HTTP; } else { if (strtolower($proxy_parts[2]) == 'socks5') { $proxy_type = CURLPROXY_SOCKS5; } else { $proxy_type = $proxy_parts[2]; } } if (isset($proxy_parts[1])) { $proxy_port = $proxy_parts[1]; } else { $proxy_port = "80"; } curl_setopt($sites[$i][0], CURLOPT_PROXY, "{$proxy_ip}:{$proxy_port}"); curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type); if ($timer) { crawlLog("Selecting proxy {$select_proxy} for {$url}"); } } } if (!$minimal) { curl_setopt($sites[$i][0], CURLOPT_HEADER, true); } //make lighttpd happier if (!$is_gopher) { curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers); } curl_setopt($sites[$i][0], CURLOPT_ENCODING, ""); // ^ need to set for sites like att that use gzip if ($page_range_request > 0) { curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request); } if ($post_data != NULL) { curl_setopt($sites[$i][0], CURLOPT_POST, true); curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]); } curl_multi_add_handle($agent_handler, $sites[$i][0]); } } if ($timer) { crawlLog(" Init Get Pages " . changeInMicrotime($start_time)); } $start_time = microtime(); $start = time(); //Wait for responses $running = NULL; $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7; do { $mrc = curl_multi_exec($agent_handler, $running); $ready = curl_multi_select($agent_handler, 0.005); } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0); if (time() - $start > PAGE_TIMEOUT && $timer) { crawlLog(" TIMED OUT!!!"); } if ($timer) { crawlLog(" Page Request time " . changeInMicrotime($start_time)); } $start_time = microtime(); //Process returned pages for ($i = 0; $i < $num_sites; $i++) { if ($timer) { crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites); } if (!$minimal && isset($ip_holder[$i])) { rewind($ip_holder[$i]); $header = fread($ip_holder[$i], 8192); $ip_addresses = self::getCurlIp($header); fclose($ip_holder[$i]); } $is_gopher = false; if (isset($sites[$i][0]) && $sites[$i][0]) { // Get Data and Message Code $content = @curl_multi_getcontent($sites[$i][0]); $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL]; /* If the Transfer-encoding was chunked then the Range header we sent was ignored. 
So we manually truncate the data here */ if ($page_range_request > 0) { $content = substr($content, 0, $page_range_request); } if (isset($content) && !$minimal && !$is_gopher) { $site = self::parseHeaderPage($content, $value); $sites[$i] = array_merge($sites[$i], $site); if (isset($header)) { $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4); } else { $header = ""; } $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER]; unset($header); } else { if (isset($content) && !$minimal && $is_gopher) { $sites[$i][CrawlConstants::HEADER] = $header; $sites[$i][$value] = $content; unset($header); } else { $sites[$i][$value] = $content; } } if (!$minimal) { $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD); $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME); $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME); $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE); if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) { $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]); } else { $sites[$i][self::HTTP_CODE] = 200; } if ($ip_addresses) { $sites[$i][self::IP_ADDRESSES] = $ip_addresses; } else { $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0"); } //Get Time, Mime type and Character encoding $sites[$i][self::TIMESTAMP] = time(); if ($is_gopher) { $path = UrlParser::getPath($sites[$i][self::URL]); $filename = UrlParser::getDocumentFilename($sites[$i][self::URL]); if (isset($path[1])) { $gopher_type = $path[1]; } else { $gopher_type = 1; } if ($gopher_type == 1) { $sites[$i][self::TYPE] = "text/gopher"; } else { if (in_array($gopher_type, array(0, 3, 6))) { $sites[$i][self::TYPE] = "text/plain"; if ($gopher_type == 6) { $sites[$i][$value] = convert_uudecode($content); } } else { if ($gopher_type == 'h') { $sites[$i][self::TYPE] = "text/html"; } else { if ($gopher_type == 'g') { $sites[$i][self::TYPE] = "image/gif"; } } } } $path_info = pathinfo($filename); if (!isset($sites[$i][self::TYPE]) && isset($path_info['extension'])) { $sites[$i][self::TYPE] = UrlParser::guessMimeTypeFromFileName($filename); } else { if (!isset($sites[$i][self::TYPE])) { $sites[$i][self::TYPE] = "unknown"; } } } else { $type_parts = explode(";", curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE)); $sites[$i][self::TYPE] = strtolower(trim($type_parts[0])); } } //curl_multi_remove_handle($agent_handler, $sites[$i][0]); curl_close($sites[$i][0]); if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) { if (isset($sites[$i][self::TYPE]) && $sites[$i][self::TYPE] != "text/plain" && isset($sites[$i][CrawlConstants::LOCATION]) && count($site[CrawlConstants::LOCATION]) > 0) { $sites[$i][self::TYPE] = "text/plain"; $sites[$i][self::HTTP_CODE] = "200"; $tmp = wordwrap($sites[$i][$value], 80); $tmp_parts = explode("\n", $tmp); $tmp = "# Suspect server misconfiguration\n"; $tmp .= "# Assume shouldn't crawl this site.\n"; $tmp .= "# Pretending got following robots.txt.\n"; $tmp .= "User-agent: *\n"; $tmp .= "Disallow: /\n"; $tmp .= "# Original error code: " . $sites[$i][self::HTTP_CODE] . "\n"; $tmp .= "# Original content:\n"; foreach ($tmp_parts as $part) { $tmp = "#" . $part . "\n"; } $sites[$i][$value] = $tmp; $sites[$i][self::HTTP_CODE] = "200"; unset($site[CrawlConstants::LOCATION]); } } } //end big if } //end for if ($timer) { crawlLog(" Get Page Content time " . changeInMicrotime($start_time)); } curl_multi_close($agent_handler); return $sites; }
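A minimal usage sketch of getPages(); the urls are placeholders, and it assumes the CrawlConstants field names used inside the method are visible to the caller.

// Fetch two pages in one multi-curl batch, limiting each download to
// 50000 bytes; timing statistics are written to the crawl log.
$sites = array(
    array(CrawlConstants::URL => "http://www.example.com/"),
    array(CrawlConstants::URL => "http://www.example.org/robots.txt"),
);
$sites = FetchUrl::getPages($sites, true, 50000);
foreach ($sites as $site) {
    crawlLog($site[CrawlConstants::URL] . " returned HTTP code " .
        $site[CrawlConstants::HTTP_CODE]);
}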
/** * Main loop for the news updater. */ function loop() { crawlLog("In News Update Loop"); $info[self::STATUS] = self::CONTINUE_STATE; $local_archives = array(""); while (CrawlDaemon::processHandler()) { $start_time = microtime(); crawlLog("Checking if news feeds should be updated..."); $this->newsUpdate(); $sleep_time = max(0, ceil(MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time))); if ($sleep_time > 0) { crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time); sleep($sleep_time); } } //end while crawlLog("News Updater shutting down!!"); }
/** * Receives a request to get crawl summary data for an array of urls * from a remote name server and then looks these up on the local * queue server */ function getCrawlItems() { $crawl_model = $this->model("crawl"); $start_time = microtime(); if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"]) || !isset($_REQUEST["i"])) { return; } $num = $this->clean($_REQUEST["num"], "int"); $i = $this->clean($_REQUEST["i"], "int"); $crawl_model->current_machine = $i; $lookups = unserialize(webdecode($_REQUEST["arg"])); $our_lookups = array(); foreach ($lookups as $lookup => $lookup_info) { if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h' || $lookup_info[0][0] === 'r' || $lookup_info[0][0] === 'g')) { $our_lookups[$lookup] = $lookup_info; } else { $our_lookups[$lookup] = array(); foreach ($lookup_info as $lookup_item) { if (count($lookup_item) == 2) { $our_lookups[$lookup][] = $lookup_item; } else { list($index, , , , ) = $lookup_item; if ($index == $i) { $our_lookups[$lookup][] = $lookup_item; } } } } } $items = $crawl_model->getCrawlItems($our_lookups); $items["ELAPSED_TIME"] = changeInMicrotime($start_time); echo webencode(serialize($items)); }