Example #1
 /**
  * Used to remove urls from the queue that are no longer crawlable
  * because the allowed and disallowed sites have changed.
  */
 function cullNoncrawlableSites()
 {
     $count = $this->web_queue->to_crawl_queue->count;
     crawlLog("Scheduler: " . " Culling noncrawlable urls after change in crawl parameters;" . " Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
     $delete_urls = array();
     $i = 1;
     while ($i < $count) {
         crawlTimeoutLog("..Scheduler: " . "still culling noncrawlable urls. Examining " . "location %s in queue of %s.", $i, $count);
         $tmp = $this->web_queue->peekQueue($i, $fh);
         list($url, $weight, $flag, $probe) = $tmp;
         if (!$this->allowedToCrawlSite($url) || $this->disallowedToCrawlSite($url)) {
             $delete_urls[] = $url;
         }
         $i++;
     }
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     crawlLog("...Scheduler: Done selecting cullable URLS, time so far:" . changeInMicrotime($start_time));
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Scheduler: Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
             /*  if there was a hash table look up error still get rid of
                 index from priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} cullable URLS  from queue in time: " . changeInMicrotime($new_time));
 }
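All of the examples on this page share the same timing idiom: record microtime() before a unit of work, then pass that value to changeInMicrotime() afterwards to log the elapsed seconds. A minimal sketch of the idiom, assuming Yioop's crawlLog() and changeInMicrotime() helpers are loaded (doSomeWork() is a hypothetical placeholder):

 $start_time = microtime();
 doSomeWork(); // hypothetical placeholder for the work being timed
 crawlLog("Work took " . changeInMicrotime($start_time) . " seconds");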
Example #2
 /**
  * Runs the QueryTool on the supplied command line arguments
  */
 function start()
 {
     global $argv, $INDEXING_PLUGINS;
     if (!isset($argv[1])) {
         $this->usageMessageAndExit();
     }
     $query = $argv[1];
     $results_per_page = isset($argv[2]) ? $argv[2] : 10;
     $limit = isset($argv[3]) ? $argv[3] : 0;
     setLocaleObject(getLocaleTag());
     $start_time = microtime();
     $controller = new SearchController($INDEXING_PLUGINS);
     $data = $controller->queryRequest($query, $results_per_page, $limit);
     if (!isset($data['PAGES'])) {
         $data['PAGES'] = array();
     }
     foreach ($data['PAGES'] as $page) {
         echo "============\n";
         echo "TITLE: " . trim($page[self::TITLE]) . "\n";
         echo "URL: " . trim($page[self::URL]) . "\n";
         echo "IPs: ";
         if (isset($page[self::IP_ADDRESSES])) {
             foreach ($page[self::IP_ADDRESSES] as $address) {
                 echo $address . " ";
             }
         }
         echo "\n";
         echo "DESCRIPTION: " . wordwrap(trim($page[self::DESCRIPTION])) . "\n";
         echo "Rank: " . $page[self::DOC_RANK] . "\n";
         echo "Relevance: " . $page[self::RELEVANCE] . "\n";
         echo "Proximity: " . $page[self::PROXIMITY] . "\n";
         echo "Score: " . $page[self::SCORE] . "\n";
         echo "============\n\n";
     }
     $data['ELAPSED_TIME'] = changeInMicrotime($start_time);
     echo "QUERY STATISTICS\n";
     echo "============\n";
     echo "ELAPSED TIME: " . $data['ELAPSED_TIME'] . "\n";
     if (isset($data['LIMIT'])) {
         echo "LOW: " . $data['LIMIT'] . "\n";
     }
     if (isset($data['HIGH'])) {
         echo "HIGH: " . min($data['TOTAL_ROWS'], $data['LIMIT'] + $data['RESULTS_PER_PAGE']) . "\n";
     }
     if (isset($data['TOTAL_ROWS'])) {
         echo "TOTAL ROWS: " . $data['TOTAL_ROWS'] . "\n";
     }
     if (isset($data['ERROR'])) {
         echo $data['ERROR'] . "\n";
     }
 }
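The QueryTool reads its query, results-per-page count, and limit straight from $argv, so a hedged sketch of how start() might be driven looks like the following; the class name QueryTool and the need for the Yioop bootstrap are assumptions based on the docblock above:

 // Roughly equivalent to running: php query_tool.php "open source search" 5 0
 $argv = array("query_tool.php", "open source search", 5, 0);
 $query_tool = new QueryTool(); // assumed class wrapping start()
 $query_tool->start();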
Example #3
 /**
  * Sends an email (much like PHP's mail command, but not requiring
  * a configured smtp server on the current machine)
  *
  * @param string $subject subject line of the email
  * @param string $from sender email address
  * @param string $to recipient email address
  * @param string $message message body for the email
  */
 function send($subject, $from, $to, $message)
 {
     $start_time = microtime();
     if ($from == "") {
         $from = $this->sender_email;
     }
     $eol = self::EOL;
     if (USE_MAIL_PHP) {
         $header = "From: " . $from . $eol;
         mail($to, $subject, $message, $header);
         return;
     }
     $this->messages = "";
     $mail = "Date: " . date(DATE_RFC822) . $eol;
     $mail .= "Subject: " . $subject . $eol;
     $mail .= "From: " . $from . $eol;
     $mail .= "To: " . $to . $eol;
     $mail .= $eol . $eol . $message . $eol . ".";
      $commands = array(
          "MAIL FROM: <{$from}>" => self::OKAY,
          "RCPT TO: <{$to}>" => self::OKAY,
          "DATA" => self::START_INPUT,
          $mail => self::OKAY
      );
     if ($this->startSession()) {
         foreach ($commands as $command => $good_response) {
             $response = $this->smtpCommand($command);
             if ($response != $good_response) {
                 $this->messages .= "{$command} failed!! {$response} {$good_response}\n";
                 break;
             }
         }
         $this->endSession();
     }
     if (QUERY_STATISTICS) {
         $current_messages = AnalyticsManager::get("MAIL_MESSAGES");
         if (!$current_messages) {
             $current_messages = array();
         }
         $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
         if (!$total_time) {
             $total_time = 0;
         }
         $elapsed_time = changeInMicrotime($start_time);
         $total_time += $elapsed_time;
          $current_messages[] = array(
              "QUERY" => "<p>Send Mail</p>" . "<pre>" .
                  wordwrap($this->messages, 60, "\n", true) . "</pre>",
              "ELAPSED_TIME" => $elapsed_time
          );
         AnalyticsManager::set("MAIL_MESSAGES", $current_messages);
         AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time);
     }
 }
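A hedged usage sketch of send(); $mail_server is assumed to be an already-configured instance of the class defining the method above (in Yioop this appears to be the MailServer class), and the addresses are placeholders:

 // Passing "" as $from makes send() fall back to $this->sender_email.
 $mail_server->send("Test subject", "",
     "recipient@example.com", "Hello from the send() sketch.");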
Example #4
 /**
  * Determines, based on its size, whether index_shard should be added to
  * the active generation or whether a new generation should be started.
  * If so, a new generation is started, the old generation is saved, and
  * the dictionary of the old shard is copied to the bundle's dictionary
  * and a log-merge is performed if needed
  *
  * @param int $add_num_docs number of docs in the shard about to be added
  * @param object $callback object with join function to be
  *     called if process is taking too long
  * @param bool $blocking whether there is an ongoing merge tiers operation
  *      occurring, if so don't do anything and return -1
  * @return int the active generation after the check and possible change has
  *     been performed
  */
 function initGenerationToAdd($add_num_docs, $callback = NULL, $blocking = false)
 {
     $current_num_docs = $this->getActiveShard()->num_docs;
     crawlLog("Current index shard has " . $current_num_docs . " documents.");
     $memory_limit = metricToInt(ini_get("memory_limit"));
     crawlLog("Memory Indexer limit is " . $memory_limit . ". Usage is " . memory_get_usage());
     if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation || 0.65 * $memory_limit < memory_get_usage()) {
         if ($blocking == true) {
             return -1;
         }
         crawlLog("Switching Index Shard...");
         $switch_time = microtime();
         // Save current shard dictionary to main dictionary
         $this->forceSave();
         $this->addAdvanceGeneration($callback);
         crawlLog("Switch Index Shard time:" . changeInMicrotime($switch_time));
     }
     return $this->generation_info['ACTIVE'];
 }
Example #5
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
             //store inlinks so they can be searched by
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
          $this->found_sites[self::INVERTED_INDEX][$this->current_server]
              ->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
                  $word_lists, $meta_ids, PhraseParser::$materialized_metas,
                  true, $doc_rank);
          /*
             $this->no_process_links is set when doing things like
             mix recrawls. In this case, links will likely already appear
             in what we are indexing, so don't index them again.
             $site[self::JUST_METAS] is set when we have a sitemap or
             robots.txt (this case is set later). In that case link info
             is not particularly useful for indexing and can greatly slow
             building the inverted index.
          */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                  /* this mysterious check means we won't index links from
                      robots.txt files. The sitemap will still be in TO_CRAWL,
                      but that's handled elsewhere
                     */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                  $this->found_sites[self::INVERTED_INDEX][$part_num]
                      ->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG,
                          $link_word_lists, $link_meta_ids,
                          PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
          $interim_elapse = changeInMicrotime($interim_time);
          if ($interim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
Example #6
 /**
  * Removes from the passed array those elements $elt that either are in
  * the filter bundle or whose $elt[$field_name] is in the bundle.
  *
  * @param array& $arr the array to remove elements from
  * @param array $field_names if not NULL, an array of field names of $arr
  *     to use for filtering
  */
 function differenceFilter(&$arr, $field_names = NULL)
 {
     $incremental_time = microtime();
     $num_filters = $this->num_filters;
     $count = count($arr);
     for ($i = 0; $i < $num_filters; $i++) {
         if ($i == $num_filters - 1) {
             $tmp_filter = $this->current_filter;
         } else {
             $tmp_filter = BloomFilterFile::load($this->dir_name . "/filter_{$i}.ftr");
         }
         for ($j = 0; $j < $count; $j++) {
             if ($field_names === NULL) {
                 $tmp =& $arr[$j];
                 if ($tmp !== false && $tmp_filter->contains($tmp)) {
                     /*
                        We deliberately don't try to add anything that has
                        the hash field set to false. This is our cue to
                        skip an element such as a link document which we
                        know will almost always be unique and so be unnecessary
                        to de-duplicate
                     */
                     unset($arr[$j]);
                 }
             } else {
                 //now do the same strategy for the array of fields case
                 foreach ($field_names as $field_name) {
                     $tmp =& $arr[$j][$field_name];
                     if ($tmp !== false && $tmp_filter->contains($tmp)) {
                         unset($arr[$j]);
                         break;
                     }
                 }
             }
             if (changeInMicrotime($incremental_time) > 30) {
                 crawlLog("..Processing item {$j} of {$count} from filter " . "number {$i} of {$num_filters}.");
                 $incremental_time = microtime();
             }
         }
     }
 }
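A hedged usage sketch of differenceFilter(); $filter_bundle is assumed to be an instance of the class defining the method, and the summaries and hash values below are placeholders:

 // Filter out, in place, any summary whose HASH field is already in the bundle.
 $summaries = array(
     array(CrawlConstants::HASH => $hash_of_new_page),  // placeholder hash
     array(CrawlConstants::HASH => $hash_of_seen_page), // placeholder hash
 );
 $filter_bundle->differenceFilter($summaries, array(CrawlConstants::HASH));
 // Elements whose HASH was found in one of the filters have now been unset.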
Example #7
 /**
  * Gets doc summaries of documents containing given words and meeting the
  * additional provided criteria
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_PHRASES -- an array of words the document must not contain
  *     WEIGHT -- a weight to multiply scores returned from this iterator by
  *     INDEX_NAME -- an index timestamp to get results from
  * @param int $limit index of the first document in the result ordering to return
  * @param int $num number of documents to return summaries of
  * @param array& $filter an array of hashes of domains to filter from
  *     results
  * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
  *     an attempt will be made to look up the results in either
  *     the file cache or memcache. Otherwise, items will be recomputed
  *     and then potentially restored in cache
  * @param int $raw ($raw == 0) means normal grouping; ($raw > 0) means
  *     no grouping is done on the data; if ($raw == 1), no lookups of
  *     summaries are done
  * @param array $queue_servers a list of urls of yioop machines which might
  *     be used during lookup
  * @param string $original_query if set, the original query that corresponds
  *     to $word_structs
  * @param string $save_timestamp_name if this timestamp is not empty, then
  *     the iterate position is saved so that future queries that make use of
  *     the timestamp can resume. If used, then $limit is ignored and the next
  *     $num docs after $save_timestamp's previous iterate position are returned.
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return array document summaries
  */
 function getSummariesByHash($word_structs, $limit, $num, &$filter,
     $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(),
     $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     global $CACHE;
     $indent = "&nbsp;&nbsp;";
     $in2 = $indent . $indent;
     $in3 = $in2 . $indent;
     $in4 = $in2 . $in2;
     if (QUERY_STATISTICS) {
         $lookup_time = microtime();
     }
     $use_proximity = false;
     $time = time();
      if (count($word_structs) > 1 ||
          (isset($word_structs[0]["KEYS"]) &&
              count($word_structs[0]["KEYS"]) > 1) ||
          ($word_structs == array() &&
              substr_count($original_query, " ") > 1)) {
         $use_proximity = true;
     }
     if (!isset($filter['time'])) {
         $filter['time'] = 0;
     }
     $filter_time = $filter['time'];
     unset($filter['time']);
     //iterators don't expect time field
     $pages = array();
     $generation = 0;
     $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     if ($save_timestamp_name != "") {
         $to_retrieve = $num;
         $limit = 0;
         $start_slice = 0;
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name;
         $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num);
         if ($use_cache_if_allowed) {
             $cache_success = true;
             $results = $CACHE->get($summary_hash);
             if (!isset($results['TIME']) || $filter_time > $results['TIME']) {
                 //if filter has changed since cached, then invalidate cache
                 $results = false;
             }
             if (isset($results['TIME'])) {
                 $cached_time = $time - $results['TIME'];
             } else {
                 $cached_time = $time;
             }
             if ($cached_time > MAX_QUERY_CACHE_TIME) {
                 $results = false;
             }
             if (isset($results['PAGES'])) {
                 $close_prefix = WORK_DIRECTORY . "/schedules/" . self::index_closed_name;
                 $has_changeable_results = false;
                 $seen_times = array();
                 foreach ($results['PAGES'] as $page) {
                     if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) {
                         continue;
                     }
                     $seen_times[] = $page[self::CRAWL_TIME];
                     $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt";
                     if (!file_exists($current_closed)) {
                         //either feed result or from active crawl
                         $has_changeable_results = true;
                         break;
                     }
                 }
                 if ($has_changeable_results) {
                     if ($cached_time > MIN_QUERY_CACHE_TIME) {
                         $results = false;
                     }
                 }
             }
             if (QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
             }
             if ($results !== false) {
                 return $results;
             }
         }
     }
     $old_to_retrieve = $to_retrieve;
     $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news);
     $num_retrieved = 0;
     $pages = array();
     if (is_object($query_iterator)) {
         while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) {
             $pages += $next_docs;
             $num_retrieved = count($pages);
         }
     }
     if ($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) {
         // used for archive crawls of crawl mixes
         $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
         $iterators = $query_iterator->save_iterators;
         $cnt_iterators = count($iterators);
         $save_point = array();
         for ($i = 0; $i < $cnt_iterators; $i++) {
             $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord();
         }
         $results["SAVE_POINT"] = $save_point;
         file_put_contents($save_file, serialize($save_point));
         $this->db->setWorldPermissionsRecursive($save_file);
     }
     $pages = array_values($pages);
     $result_count = count($pages);
     $sort_time = 0;
     if ($raw == 0) {
         // initialize scores
         $sort_start = microtime();
         $max_user_ranks = 0;
         for ($i = 0; $i < $result_count; $i++) {
             $pages[$i]["OUT_SCORE"] = 0;
             if (isset($pages[$i][self::USER_RANKS])) {
                 $j = count($pages[$i][self::USER_RANKS]);
                 if ($max_user_ranks < $j) {
                     $max_user_ranks = $j;
                 }
             }
         }
         if ($max_user_ranks > 0) {
             for ($i = 0; $i < $result_count; $i++) {
                 for ($j = 0; $j < $max_user_ranks; $j++) {
                     if (isset($pages[$i][self::USER_RANKS][$j])) {
                         $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j];
                     } else {
                         $pages[$i]["USCORE{$j}"] = 0;
                     }
                 }
             }
         }
         $subscore_fields = array(self::DOC_RANK, self::RELEVANCE);
         if ($use_proximity) {
             $subscore_fields[] = self::PROXIMITY;
         }
         if ($max_user_ranks > 0) {
             for ($j = 0; $j < $max_user_ranks; $j++) {
                 $subscore_fields[] = "USCORE{$j}";
             }
         }
         $num_fields = count($subscore_fields);
         // Compute Reciprocal Rank Fusion Score
         $alpha = 600 / $num_fields;
         if (isset($pages[0])) {
             foreach ($subscore_fields as $field) {
                 orderCallback($pages[0], $pages[0], $field);
                 usort($pages, "orderCallback");
                 $score = 0;
                 for ($i = 0; $i < $result_count; $i++) {
                     if ($i > 0) {
                         if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                             $score++;
                         }
                     }
                     $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score);
                 }
             }
             orderCallback($pages[0], $pages[0], "OUT_SCORE");
         }
         usort($pages, "orderCallback");
         if ($use_proximity) {
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         } else {
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::PROXIMITY] = 1;
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         }
         $sort_time = changeInMicrotime($sort_start);
     }
     if ($num_retrieved < $to_retrieve) {
         $results['TOTAL_ROWS'] = $num_retrieved;
     } else {
         $results['TOTAL_ROWS'] = $query_iterator->num_docs;
         //this is only an approximation
     }
     if ($raw == 1 && $save_timestamp_name == "") {
         $pages = array_slice($pages, $start_slice);
         $pages = array_slice($pages, $limit - $start_slice, $num);
         $results['PAGES'] =& $pages;
         if ($old_to_retrieve != $to_retrieve) {
             $results['HARD_QUERY'] = $old_to_retrieve;
         }
         return $results;
     }
     if (QUERY_STATISTICS) {
         $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
         $machine_times = AnalyticsManager::get("MACHINE_TIMES");
         if ($machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />";
         }
         $net_times = AnalyticsManager::get("NET_TIMES");
         $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
         if ($net_times && $max_machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />";
         }
         if ($sort_time) {
             $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . "<br />";
         }
         $summaries_time = microtime();
     }
     $get_pages = array_slice($pages, $limit, $num);
     $to_get_count = count($get_pages);
     $groups_with_docs = false;
     if (preg_match("/\\bsite:doc\\b/", $original_query)) {
         $groups_with_docs = true;
     }
     $out_pages = array();
     $cur_limit = $limit;
     while (count($out_pages) < $to_get_count && $get_pages) {
         $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs));
         if ($save_timestamp_name != "") {
             break;
         }
         $cur_limit += $num;
         $get_pages = array_slice($pages, $cur_limit, $num);
     }
     $out_pages = array_slice($out_pages, 0, $num);
     if (QUERY_STATISTICS) {
         $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
         if ($summary_times_string) {
             $round_summary_times = unserialize($summary_times_string);
             $summary_delta_time = changeInMicrotime($summaries_time);
             $summary_time_info = "{$summary_delta_time}<br /> {$in4}";
             $sum_max_time = 0;
             foreach ($round_summary_times as $summary_times) {
                 $i = 0;
                 $max_time = 0;
                 foreach ($summary_times as $summary_time) {
                     $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}";
                     $max_time = $summary_time > $max_time ? $summary_time : $max_time;
                     $i++;
                 }
                 $sum_max_time += $max_time;
             }
             $net_overhead = $summary_delta_time - $sum_max_time;
             $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead;
         } else {
             $summary_time_info = changeInMicrotime($summaries_time);
         }
         $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . "<br />";
     }
     $results['PAGES'] =& $out_pages;
     $results['TIME'] = time();
     $lang = guessLocaleFromString($original_query);
     $tokenizer = PhraseParser::getTokenizer($lang);
     //only use tokenizer if no meta word or disjuncts in query
      if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer &&
          method_exists($tokenizer, "scoredThesaurusMatches") &&
          method_exists($tokenizer, "tagPartsOfSpeechPhrase") &&
          isset($tokenizer->use_thesaurus)) {
         $results = $this->sortByThesaurusScore($results, $original_query, $lang);
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         $CACHE->set($summary_hash, $results);
     }
     return $results;
 }
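To make the $word_structs parameter described in the docblock above more concrete, here is a hedged sketch of a single word_struct entry and a call; the key values, index timestamp, and $model instance are placeholders rather than anything taken from the source:

 $word_structs = array(
     array(
         "KEYS" => array($word_key),       // placeholder word key
         "QUOTE_POSITIONS" => array(),     // nothing was quoted in the query
         "DISALLOW_PHRASES" => array(),    // no excluded words
         "WEIGHT" => 1,                    // multiply this iterator's scores by 1
         "INDEX_NAME" => $index_timestamp  // timestamp of the index to search
     )
 );
 $filter = array();
 $results = $model->getSummariesByHash($word_structs, 0, 10, $filter);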
Example #8
 /**
  * Test how fast insertion and deletions can be done
  */
 function timingTestCase()
 {
     $start_time = microtime();
     for ($i = 0; $i < 10000; $i++) {
         $this->test_objects['FILE1']->insert(crawlHash("hi{$i}", true), "0000" . packInt($i));
     }
     $this->assertTrue(changeInMicrotime($start_time) < 2, "Insert 10000 into table of size 20000 takes less than 2 seconds");
     $start_time = microtime();
     for ($i = 0; $i < 10000; $i++) {
         $this->test_objects['FILE1']->delete(crawlHash("hi{$i}", true));
     }
     $this->assertTrue(changeInMicrotime($start_time) < 2, "Delete 10000 from table of size 20000 takes less than 2 seconds");
 }
Example #9
 /**
  * Executes the supplied SQL command on the database and, depending on
  * debug levels, computes query statistics
  *
  * This method handles both query and data manipulation statements
  *
  * @param string $sql  SQL statement to execute
  * @param array $params bind_name => value values to interpolate into
  *      the $sql to be executed
  * @return mixed false if query fails, resource or true otherwise
  */
 function execute($sql, $params = array())
 {
     if (QUERY_STATISTICS) {
         $query_info = array();
         $query_info['QUERY'] = $sql;
         if ($params != array()) {
             $query_info['QUERY'] .= "<br />" . print_r($params, true);
         }
         $start_time = microtime();
     }
     $result = $this->exec($sql, $params);
     if (QUERY_STATISTICS) {
         $query_info['ELAPSED_TIME'] = changeInMicrotime($start_time);
         $this->total_time += $query_info['ELAPSED_TIME'];
         $this->query_log[] = $query_info;
     }
     return $result;
 }
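A hedged usage sketch of execute() with bound parameters; $db is assumed to be an instance of the database manager class defining the method, and the table and column names are placeholders:

 $sql = "SELECT USER_NAME FROM USERS WHERE USER_ID = :user_id";
 $result = $db->execute($sql, array(":user_id" => 42));
 // With QUERY_STATISTICS on, the statement and its elapsed time are now
 // appended to $db->query_log and added to $db->total_time.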
Example #10
 /**
  * Gets status and, if done processing all other mirroring activities,
  * gets a new list of files that have changed since the last synchronization
  * from the web app of the machine we are mirroring with.
  *
  * @return mixed array or bool. Returns false if we weren't successful in
  *     contacting the web app; otherwise, returns an array with a status
  *     and potentially a list of files to sync
  */
 function checkScheduler()
 {
     $info = array();
     $name_server = $this->name_server;
     $start_time = microtime();
     $time = time();
     $session = md5($time . AUTH_KEY);
     $request = $name_server . "?c=resource&time={$time}&session={$session}" . "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&last_sync=" . $this->last_sync;
     if ($this->start_sync <= $this->last_sync) {
         $request .= "&a=syncList";
         $info_string = FetchUrl::getPage($request, NULL, true);
         if ($info_string === false) {
             return false;
         }
         $this->last_notify = $time;
         $info_string = trim($info_string);
         $info = unserialize(gzuncompress(base64_decode($info_string)));
         if (isset($info[self::STATUS]) && $info[self::STATUS] == self::CONTINUE_STATE) {
             $this->start_sync = time();
             $this->sync_schedule = $info[self::DATA];
             unset($info[self::DATA]);
         }
     } else {
         $info[self::STATUS] = self::CONTINUE_STATE;
         if ($time - $this->last_notify > MIRROR_NOTIFY_FREQUENCY) {
             $request .= "&a=syncNotify";
             FetchUrl::getPage($request, NULL, true);
             $this->last_notify = $time;
             CrawlLog("Notifying master that mirror is alive..");
         }
     }
     if (count($this->sync_schedule) == 0) {
         $this->last_sync = $this->start_sync;
         $this->db->setWorldPermissionsRecursive($this->sync_dir, true);
         file_put_contents($this->last_sync_file, serialize($this->last_sync));
     }
     crawlLog("  Time to check Scheduler " . changeInMicrotime($start_time));
     return $info;
 }
Example #11
 /**
  * Hook function used by currentDocsWithWord to return the current block
  * of docs if it is not cached
  *
  * @return mixed doc ids and score if there are docs left, -1 otherwise
  */
 function findDocsWithWord()
 {
     $query = $this->base_query . "&num={$this->results_per_block}&limit={$this->limit}";
     $sites = array();
     $lookup = array();
     $i = 0;
     $j = 0;
     foreach ($this->queue_servers as $server) {
         if ($this->more_flags[$i]) {
             $sites[$j][CrawlConstants::URL] = $server . "?" . $query . "&machine={$i}";
             $lookup[$j] = $i;
             $j++;
         }
         $i++;
     }
     $net_times = AnalyticsManager::get("NET_TIMES");
     $net_times = $net_times ? $net_times : 0;
     $download_time = microtime();
     $downloads = array();
     if (count($sites) > 0) {
         $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true);
     }
     $net_times += changeInMicrotime($download_time);
     AnalyticsManager::set("NET_TIMES", $net_times);
     $results = array();
     $count = count($downloads);
     $this->num_docs = 0;
     $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
     $machine_times = AnalyticsManager::get("MACHINE_TIMES");
     $indent = $machine_times ? "<br />{$in4}" : $in4;
     $machine_times = $machine_times ? $machine_times : "";
     $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
     $max_machine_times = $max_machine_times ? $max_machine_times : 0;
     $max_time = 0;
     $num_with_results = $count;
     for ($j = 0; $j < $count; $j++) {
         $download =& $downloads[$j];
         if (isset($download[self::PAGE])) {
             $pre_result = @unserialize($download[self::PAGE]);
             if (!isset($pre_result["TOTAL_ROWS"]) || $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                 $this->more_flags[$lookup[$j]] = false;
                 $num_with_results--;
             }
             if (isset($pre_result["TOTAL_ROWS"])) {
                 $this->num_docs += $pre_result["TOTAL_ROWS"];
             }
             if (isset($pre_result["PAGES"])) {
                 foreach ($pre_result["PAGES"] as $page_data) {
                     if (isset($page_data[self::KEY])) {
                         $results[$page_data[self::KEY]] = $page_data;
                         $results[$page_data[self::KEY]][self::MACHINE_ID] = $lookup[$j];
                     }
                 }
             }
             $max_time = max($max_time, $pre_result['ELAPSED_TIME']);
             $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
             $machine_times .= $indent . $lookup_link . $pre_result['ELAPSED_TIME'] . "&nbsp;&nbsp;";
             $indent = "";
         }
     }
     if (isset($pre_result["HARD_QUERY"])) {
         $this->hard_query = $pre_result["HARD_QUERY"];
     }
     if ($num_with_results > 0) {
         $this->next_results_per_block = ceil(floatval($count * $this->results_per_block) / floatval($num_with_results));
     }
     $max_machine_times += $max_time;
     AnalyticsManager::set("MACHINE_TIMES", $machine_times);
     AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
     if ($results == array()) {
         $results = -1;
     }
     if ($results != -1) {
         if ($this->filter != NULL) {
             foreach ($results as $keys => $data) {
                 $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                 if (in_array($host_key, $this->filter)) {
                     unset($results[$keys]);
                 }
             }
         }
     }
     $this->count_block = count($results);
     $this->pages = $results;
     return $results;
 }
Example #12
 /**
  * This is the main entry point for handling a search request.
  *
  * processRequest determines the type of search request (normal request,
  * cache request, or related request), or whether a user is returning
  * from the admin panel via signout. It then calls the appropriate method
  * to handle the given activity. Finally, it draws the search screen.
  */
 function processRequest()
 {
     $data = array();
     $start_time = microtime();
     if (AD_LOCATION != "none") {
         $ad_fields = array('TOP_ADSCRIPT', 'SIDE_ADSCRIPT', 'GLOBAL_ADSCRIPT');
         foreach ($ad_fields as $ad_field) {
             $ad = html_entity_decode(constant($ad_field), ENT_QUOTES);
             $ad = preg_replace("[&#40;]", "(", $ad);
             $data[$ad_field] = preg_replace("[&#41;]", ")", $ad);
         }
     }
     list($subsearches, $no_query) = $this->initializeSubsearches();
     $format_info = $this->initializeResponseFormat();
     if (!$format_info) {
         return;
     }
     list($view, $web_flag, $raw, $results_per_page, $limit) = $format_info;
     list($query, $activity, $arg) = $this->initializeUserAndDefaultActivity($data);
     if ($activity == "query" && $this->mirrorHandle()) {
         return;
     }
     list($index_timestamp, $index_info, $save_timestamp) = $this->initializeIndexInfo($web_flag, $raw, $data);
     unset($_SESSION['LAST_ACTIVITY']);
     if (isset($_REQUEST['q']) && strlen($_REQUEST['q']) > 0 || $activity != "query") {
         if ($activity != "cache") {
             $this->processQuery($data, $query, $activity, $arg, $results_per_page, $limit, $index_timestamp, $raw, $save_timestamp);
             // calculate the results of a search if there is one
         } else {
             if (isset($_REQUEST['repository'])) {
                 $ui_array = array();
             } else {
                 $ui_array = array("highlight", "yioop_nav", "history", "summaries", "version");
                 if (isset($_REQUEST['from_cache'])) {
                     $ui_array[] = "cache_link_referrer";
                 }
                 if (isset($_REQUEST['hist_open'])) {
                     $ui_array[] = "hist_ui_open";
                 }
             }
             $this->cacheRequestAndOutput($arg, $ui_array, $query, $index_timestamp);
             return;
         }
     }
     $data['ELAPSED_TIME'] = changeInMicrotime($start_time);
     if ($view == "serial" || $view == "json") {
         if (isset($data["PAGES"])) {
             $count = count($data["PAGES"]);
             for ($i = 0; $i < $count; $i++) {
                 unset($data["PAGES"][$i]["OUT_SCORE"]);
                 $data["PAGES"][$i][self::SCORE] = "" . round($data["PAGES"][$i][self::SCORE], 3);
                 $data["PAGES"][$i][self::DOC_RANK] = "" . round($data["PAGES"][$i][self::DOC_RANK], 3);
                 $data["PAGES"][$i][self::RELEVANCE] = "" . round($data["PAGES"][$i][self::RELEVANCE], 3);
             }
         }
         if ($view == "serial") {
             echo serialize($data);
         } else {
             $out_data = array();
             $out_data["language"] = getLocaleTag();
             $out_data["link"] = NAME_SERVER . "?f=json&amp;q={$data['QUERY']}";
             $out_data["totalResults"] = $data['TOTAL_ROWS'];
             $out_data["startIndex"] = $data['LIMIT'];
             $out_data["itemsPerPage"] = $data['RESULTS_PER_PAGE'];
             foreach ($data['PAGES'] as $page) {
                 $item = array();
                 $item["title"] = $page[self::TITLE];
                 if (!isset($page[self::TYPE]) || isset($page[self::TYPE]) && $page[self::TYPE] != "link") {
                     $item["link"] = $page[self::URL];
                 } else {
                     $item["link"] = strip_tags($page[self::TITLE]);
                 }
                 $item["description"] = strip_tags($page[self::DESCRIPTION]);
                 if (isset($page[self::THUMB]) && $page[self::THUMB] != 'NULL') {
                     $item["thumb"] = $page[self::THUMB];
                 }
                 if (isset($page[self::TYPE])) {
                     $item["type"] = $page[self::TYPE];
                 }
                 $out_data['item'][] = $item;
             }
             echo json_encode($out_data);
         }
         exit;
     }
     if ($web_flag) {
         $this->addSearchViewData($index_info, $no_query, $raw, $view, $subsearches, $data);
     }
     if (!isset($data["USERNAME"]) && isset($_SESSION['USER_ID'])) {
         $signin_model = $this->model("signin");
         $data['USERNAME'] = $signin_model->getUserName($_SESSION['USER_ID']);
     }
     $this->displayView($view, $data);
 }
Example #13
 /**
  * Make multi_curl requests for an array of sites with urls or onion urls
  *
  * @param array $sites  an array containing urls of pages to request
  * @param bool $timer  flag, true means print timing statistics to log
  * @param int $page_range_request maximum number of bytes to download/page
  *     0 means download all
  * @param string $temp_dir folder to store temporary ip header info
  * @param string $key  the component of $sites[$i] that has the value of
  *     a url to get; defaults to URL
  * @param string $value component of $sites[$i] in which to store the
  *     page that was gotten
  * @param bool $minimal if true, do a faster request of pages by not
  *     doing things like extracting the HTTP headers sent, etc.
  * @param array $post_data data to be POST'd to each site
  * @param bool $follow whether to follow redirects or not
  * @param string $tor_proxy url of a proxy that knows how to download
  *     .onion urls
  * @param array $proxy_servers if not array(), then an array of proxy
  *     servers to use rather than directly downloading web pages from
  *     the current machine
  *
  * @return array an updated array with the contents of those pages
  */
 static function getPages($sites, $timer = false,
     $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
     $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
     $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "",
     $proxy_servers = array())
 {
     $agent_handler = curl_multi_init();
     $active = NULL;
     $start_time = microtime();
     if (!$minimal && $temp_dir == NULL) {
         $temp_dir = CRAWL_DIR . "/temp";
         if (!file_exists($temp_dir)) {
             mkdir($temp_dir);
         }
     }
     //Set-up requests
     $num_sites = count($sites);
     for ($i = 0; $i < $num_sites; $i++) {
         $is_gopher = false;
         $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
         if (isset($sites[$i][$key])) {
             list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders($sites[$i][$key], $minimal, $proxy_servers);
             if ($headers == "gopher") {
                 $is_gopher = true;
                 $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                 $headers = array();
             }
             $sites[$i][0] = curl_init();
             if (!$minimal) {
                 $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
             curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
             curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
             curl_setopt($sites[$i][0], CURLOPT_URL, $url);
             if (strcmp(substr($url, -10), "robots.txt") == 0) {
                 $sites[$i]['ROBOT'] = true;
                 $follow = true;
                 /*wikipedia redirects their robot page. grr
                     want to force this for robots pages
                   */
             }
             curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
             curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
             curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
             curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
             curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
             curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
             if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                 curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                 //CURLPROXY_SOCKS5_HOSTNAME = 7
                 curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                 if ($timer) {
                     crawlLog("Using Tor proxy for {$url}..");
                 }
             } else {
                 if ($proxy_servers != array() && !$is_gopher) {
                     $select_proxy = rand(0, count($proxy_servers) - 1);
                     $proxy_server = $proxy_servers[$select_proxy];
                     $proxy_parts = explode(":", $proxy_server);
                     $proxy_ip = $proxy_parts[0];
                     if (!isset($proxy_parts[2]) || strtolower($proxy_parts[2]) == 'http') {
                         $proxy_type = CURLPROXY_HTTP;
                     } else {
                         if (strtolower($proxy_parts[2]) == 'socks5') {
                             $proxy_type = CURLPROXY_SOCKS5;
                         } else {
                             $proxy_type = $proxy_parts[2];
                         }
                     }
                     if (isset($proxy_parts[1])) {
                         $proxy_port = $proxy_parts[1];
                     } else {
                         $proxy_port = "80";
                     }
                     curl_setopt($sites[$i][0], CURLOPT_PROXY, "{$proxy_ip}:{$proxy_port}");
                     curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                     if ($timer) {
                         crawlLog("Selecting proxy {$select_proxy} for {$url}");
                     }
                 }
             }
             if (!$minimal) {
                 curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
             }
             //make lighttpd happier
             if (!$is_gopher) {
                 curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
             }
             curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
             // ^ need to set for sites like att that use gzip
             if ($page_range_request > 0) {
                 curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request);
             }
             if ($post_data != NULL) {
                 curl_setopt($sites[$i][0], CURLOPT_POST, true);
                 curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
             }
             curl_multi_add_handle($agent_handler, $sites[$i][0]);
         }
     }
     if ($timer) {
         crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     $start = time();
     //Wait for responses
     $running = NULL;
     $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
     do {
         $mrc = curl_multi_exec($agent_handler, $running);
         $ready = curl_multi_select($agent_handler, 0.005);
     } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0);
     if (time() - $start > PAGE_TIMEOUT && $timer) {
         crawlLog("  TIMED OUT!!!");
     }
     if ($timer) {
         crawlLog("  Page Request time " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     //Process returned pages
     for ($i = 0; $i < $num_sites; $i++) {
         if ($timer) {
             crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites);
         }
         if (!$minimal && isset($ip_holder[$i])) {
             rewind($ip_holder[$i]);
             $header = fread($ip_holder[$i], 8192);
             $ip_addresses = self::getCurlIp($header);
             fclose($ip_holder[$i]);
         }
         $is_gopher = false;
         if (isset($sites[$i][0]) && $sites[$i][0]) {
             // Get Data and Message Code
             $content = @curl_multi_getcontent($sites[$i][0]);
             $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
             /*
                If the Transfer-encoding was chunked then the Range header
                we sent was ignored. So we manually truncate the data
                here
             */
             if ($page_range_request > 0) {
                 $content = substr($content, 0, $page_range_request);
             }
             if (isset($content) && !$minimal && !$is_gopher) {
                 $site = self::parseHeaderPage($content, $value);
                 $sites[$i] = array_merge($sites[$i], $site);
                 if (isset($header)) {
                     $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4);
                 } else {
                     $header = "";
                 }
                 $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER];
                 unset($header);
             } else {
                 if (isset($content) && !$minimal && $is_gopher) {
                     $sites[$i][CrawlConstants::HEADER] = $header;
                     $sites[$i][$value] = $content;
                     unset($header);
                 } else {
                     $sites[$i][$value] = $content;
                 }
             }
             if (!$minimal) {
                 $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD);
                 $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME);
                 $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME);
                 $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                 if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                     $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                 } else {
                     $sites[$i][self::HTTP_CODE] = 200;
                 }
                 if ($ip_addresses) {
                     $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                 } else {
                     $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                 }
                 //Get Time, Mime type and Character encoding
                 $sites[$i][self::TIMESTAMP] = time();
                 if ($is_gopher) {
                     $path = UrlParser::getPath($sites[$i][self::URL]);
                     $filename = UrlParser::getDocumentFilename($sites[$i][self::URL]);
                     if (isset($path[1])) {
                         $gopher_type = $path[1];
                     } else {
                         $gopher_type = 1;
                     }
                     if ($gopher_type == 1) {
                         $sites[$i][self::TYPE] = "text/gopher";
                     } else {
                         if (in_array($gopher_type, array(0, 3, 6))) {
                             $sites[$i][self::TYPE] = "text/plain";
                             if ($gopher_type == 6) {
                                 $sites[$i][$value] = convert_uudecode($content);
                             }
                         } else {
                             if ($gopher_type == 'h') {
                                 $sites[$i][self::TYPE] = "text/html";
                             } else {
                                 if ($gopher_type == 'g') {
                                     $sites[$i][self::TYPE] = "image/gif";
                                 }
                             }
                         }
                     }
                     $path_info = pathinfo($filename);
                     if (!isset($sites[$i][self::TYPE]) && isset($path_info['extension'])) {
                         $sites[$i][self::TYPE] = UrlParser::guessMimeTypeFromFileName($filename);
                     } else {
                         if (!isset($sites[$i][self::TYPE])) {
                             $sites[$i][self::TYPE] = "unknown";
                         }
                     }
                 } else {
                     $type_parts = explode(";", curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                     $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
                 }
             }
             //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
             curl_close($sites[$i][0]);
             if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                 if (isset($sites[$i][self::TYPE]) && $sites[$i][self::TYPE] != "text/plain" && isset($sites[$i][CrawlConstants::LOCATION]) && count($site[CrawlConstants::LOCATION]) > 0) {
                     $sites[$i][self::TYPE] = "text/plain";
                     $sites[$i][self::HTTP_CODE] = "200";
                     $tmp = wordwrap($sites[$i][$value], 80);
                     $tmp_parts = explode("\n", $tmp);
                     $tmp = "# Suspect server misconfiguration\n";
                     $tmp .= "# Assume shouldn't crawl this site.\n";
                     $tmp .= "# Pretending got following robots.txt.\n";
                     $tmp .= "User-agent: *\n";
                     $tmp .= "Disallow: /\n";
                     $tmp .= "# Original error code: " . $sites[$i][self::HTTP_CODE] . "\n";
                     $tmp .= "# Original content:\n";
                     foreach ($tmp_parts as $part) {
                         $tmp = "#" . $part . "\n";
                     }
                     $sites[$i][$value] = $tmp;
                     $sites[$i][self::HTTP_CODE] = "200";
                     unset($site[CrawlConstants::LOCATION]);
                 }
             }
         }
         //end big if
     }
     //end for
     if ($timer) {
         crawlLog("  Get Page Content time " . changeInMicrotime($start_time));
     }
     curl_multi_close($agent_handler);
     return $sites;
 }
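A hedged usage sketch of FetchUrl::getPages(); the URLs are placeholders, only parameters visible in the signature above are passed, and Yioop's configuration constants (PAGE_RANGE_REQUEST, PAGE_TIMEOUT, USER_AGENT, CRAWL_DIR) are assumed to be defined:

 $sites = array(
     array(CrawlConstants::URL => "http://www.example.com/"),
     array(CrawlConstants::URL => "http://www.example.org/robots.txt"),
 );
 // $timer = true logs the per-phase timing messages seen in the code above.
 $sites = FetchUrl::getPages($sites, true);
 foreach ($sites as $site) {
     crawlLog($site[CrawlConstants::URL] . " HTTP code: " .
         $site[CrawlConstants::HTTP_CODE]);
 }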
Example #14
 /**
  * Main loop for the news updater.
  */
 function loop()
 {
     crawlLog("In News Update Loop");
     $info[self::STATUS] = self::CONTINUE_STATE;
     $local_archives = array("");
     while (CrawlDaemon::processHandler()) {
         $start_time = microtime();
         crawlLog("Checking if news feeds should be updated...");
         $this->newsUpdate();
         $sleep_time = max(0, ceil(MINIMUM_UPDATE_LOOP_TIME - changeInMicrotime($start_time)));
         if ($sleep_time > 0) {
             crawlLog("Ensure minimum loop time by sleeping..." . $sleep_time);
             sleep($sleep_time);
         }
     }
     //end while
     crawlLog("News Updater shutting down!!");
 }
Example #15
 /**
  * Receives a request to get crawl summary data for an array of urls
  * from a remote name server and then looks these up on the local
  * queue server
  */
 function getCrawlItems()
 {
     $crawl_model = $this->model("crawl");
     $start_time = microtime();
     if (!isset($_REQUEST["arg"]) || !isset($_REQUEST["num"]) || !isset($_REQUEST["i"])) {
         return;
     }
     $num = $this->clean($_REQUEST["num"], "int");
     $i = $this->clean($_REQUEST["i"], "int");
     $crawl_model->current_machine = $i;
     $lookups = unserialize(webdecode($_REQUEST["arg"]));
     $our_lookups = array();
     foreach ($lookups as $lookup => $lookup_info) {
         if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h' || $lookup_info[0][0] === 'r' || $lookup_info[0][0] === 'g')) {
             $our_lookups[$lookup] = $lookup_info;
         } else {
             $our_lookups[$lookup] = array();
             foreach ($lookup_info as $lookup_item) {
                 if (count($lookup_item) == 2) {
                     $our_lookups[$lookup][] = $lookup_item;
                 } else {
                     list($index, , , , ) = $lookup_item;
                     if ($index == $i) {
                         $our_lookups[$lookup][] = $lookup_item;
                     }
                 }
             }
         }
     }
     $items = $crawl_model->getCrawlItems($our_lookups);
     $items["ELAPSED_TIME"] = changeInMicrotime($start_time);
     echo webencode(serialize($items));
 }