Exemple #1
0
 /**
  * Sends an email (much like PHP's mail command, but not requiring
  * a configured smtp server on the current machine)
  *
  * @param string $subject subject line of the email
  * @param string $from sender email address
  * @param string $to recipient email address
  * @param string $message message body for the email
  */
 function send($subject, $from, $to, $message)
 {
     $start_time = microtime();
     if ($from == "") {
         // fall back to the sender address configured on this object
         $from = $this->sender_email;
     }
     // Header fields and envelope addresses are single-line values. Strip
     // CR/LF so caller-supplied text cannot add extra header lines or
     // extra SMTP commands.
     $line_breaks = array("\r", "\n");
     $subject = str_replace($line_breaks, "", (string) $subject);
     $from = str_replace($line_breaks, "", (string) $from);
     $to = str_replace($line_breaks, "", (string) $to);
     $eol = self::EOL;
     if (USE_MAIL_PHP) {
         // Delegate to PHP's mail(); no statistics are recorded on this path
         $header = "From: " . $from . $eol;
         mail($to, $subject, $message, $header);
         return;
     }
     $this->messages = "";
     // SMTP transparency (RFC 5321, section 4.5.2): a body line that starts
     // with "." must have an extra "." prepended, otherwise a line made of
     // a single dot would end the DATA phase early.
     $body = str_replace("\n.", "\n..", (string) $message);
     if (substr($body, 0, 1) === ".") {
         $body = "." . $body;
     }
     // DATE_RFC2822 emits a four-digit year; DATE_RFC822 only emits two
     $mail = "Date: " . date(DATE_RFC2822) . $eol;
     $mail .= "Subject: " . $subject . $eol;
     $mail .= "From: " . $from . $eol;
     $mail .= "To: " . $to . $eol;
     // the trailing lone "." is the end-of-data marker
     $mail .= $eol . $eol . $body . $eol . ".";
     // SMTP command => response this method treats as success, in the
     // order the commands are issued
     $commands = array("MAIL FROM: <{$from}>" => self::OKAY, "RCPT TO: <{$to}>" => self::OKAY, "DATA" => self::START_INPUT, $mail => self::OKAY);
     if ($this->startSession()) {
         foreach ($commands as $command => $good_response) {
             $response = $this->smtpCommand($command);
             if ($response != $good_response) {
                 // record the failure and abandon the rest of the dialogue
                 $this->messages .= "{$command} failed!! {$response} {$good_response}\n";
                 break;
             }
         }
         $this->endSession();
     }
     if (QUERY_STATISTICS) {
         // append this send (log text and elapsed time) to the running
         // mail statistics kept in AnalyticsManager
         $current_messages = AnalyticsManager::get("MAIL_MESSAGES");
         if (!$current_messages) {
             $current_messages = array();
         }
         $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
         if (!$total_time) {
             $total_time = 0;
         }
         $elapsed_time = changeInMicrotime($start_time);
         $total_time += $elapsed_time;
         $current_messages[] = array("QUERY" => "<p>Send Mail</p>" . "<pre>" . wordwrap($this->messages, 60, "\n", true) . "</pre>", "ELAPSED_TIME" => $elapsed_time);
         AnalyticsManager::set("MAIL_MESSAGES", $current_messages);
         AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time);
     }
 }
Exemple #2
0
 /**
  * Gets doc summaries of documents containing given words and meeting the
  * additional provided criteria
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_PHRASES -- an array of words the document must not contain
  *     WEIGHT -- a weight to multiply scores returned from this iterator by
  *     INDEX_NAME -- an index timestamp to get results from
  * @param int $limit index of the first document (in result order) to return
  * @param int $num number of documents to return summaries of
  * @param array& $filter an array of hashes of domains to filter from
  *     results
  * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
  *     an attempt will be made to look up the results in either
  *     the file cache or memcache. Otherwise, items will be recomputed
  *     and then potentially restored in cache
  * @param int $raw ($raw == 0) normal grouping, ($raw > 0)
  *     no grouping done on data. if ($raw == 1) no lookups of summaries
  *     done
  * @param array $queue_servers a list of urls of yioop machines which might
  *     be used during lookup
  * @param string $original_query if set, the original query that corresponds
  *     to $word_structs
  * @param string $save_timestamp_name if this timestamp is not empty, then
  *     save iterate position, so can resume on future queries that make
  *     use of the timestamp. If used then $limit ignored and get next $num
  *     docs after $save_timestamp 's previous iterate position.
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return array document summaries
  */
 function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     global $CACHE;
     // HTML indentation prefixes used when appending to the statistics log
     $indent = "&nbsp;&nbsp;";
     $in2 = $indent . $indent;
     $in3 = $in2 . $indent;
     $in4 = $in2 . $in2;
     if (QUERY_STATISTICS) {
         $lookup_time = microtime();
     }
     // proximity is only used as a ranking signal for multi-term queries
     $use_proximity = false;
     $time = time();
     if (count($word_structs) > 1 || isset($word_structs[0]["KEYS"]) && count($word_structs[0]["KEYS"]) > 1 || $word_structs == array() && substr_count($original_query, " ") > 1) {
         $use_proximity = true;
     }
     if (!isset($filter['time'])) {
         $filter['time'] = 0;
     }
     $filter_time = $filter['time'];
     unset($filter['time']);
     //iterators don't expect time field
     // retrieve whole multiples of NUM_CACHE_PAGES around the requested
     // window
     $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     if ($save_timestamp_name != "") {
         // resuming from a save point: $limit is ignored, fetch next $num
         $to_retrieve = $num;
         $limit = 0;
         $start_slice = 0;
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name;
         $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num);
         if ($use_cache_if_allowed) {
             $results = $CACHE->get($summary_hash);
             if (!isset($results['TIME']) || $filter_time > $results['TIME']) {
                 //if filter has changed since cached, then invalidate cache
                 $results = false;
             }
             if (isset($results['TIME'])) {
                 $cached_time = $time - $results['TIME'];
             } else {
                 $cached_time = $time;
             }
             if ($cached_time > MAX_QUERY_CACHE_TIME) {
                 $results = false;
             }
             if (isset($results['PAGES'])) {
                 $close_prefix = WORK_DIRECTORY . "/schedules/" . self::index_closed_name;
                 $has_changeable_results = false;
                 $seen_times = array();
                 // check each distinct crawl time once for its "closed"
                 // marker file
                 foreach ($results['PAGES'] as $page) {
                     if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) {
                         continue;
                     }
                     $seen_times[] = $page[self::CRAWL_TIME];
                     $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt";
                     if (!file_exists($current_closed)) {
                         //either feed result or from active crawl
                         $has_changeable_results = true;
                         break;
                     }
                 }
                 if ($has_changeable_results) {
                     // results that may still change get the shorter
                     // cache lifetime
                     if ($cached_time > MIN_QUERY_CACHE_TIME) {
                         $results = false;
                     }
                 }
             }
             if (QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
             }
             if ($results !== false) {
                 return $results;
             }
         }
     }
     // Start from a real array: after a cache miss $results is false (or
     // was never set), and assigning offsets into false is deprecated as
     // of PHP 8.1.
     $results = array();
     $old_to_retrieve = $to_retrieve;
     $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news);
     $has_iterator = is_object($query_iterator);
     $num_retrieved = 0;
     $pages = array();
     if ($has_iterator) {
         while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) {
             // array union: keys already collected are kept
             $pages += $next_docs;
             $num_retrieved = count($pages);
         }
     }
     // without an iterator there is no iterate position to save
     if ($save_timestamp_name != "" && $has_iterator && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) {
         // used for archive crawls of crawl mixes
         $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
         $iterators = $query_iterator->save_iterators;
         $cnt_iterators = count($iterators);
         $save_point = array();
         for ($i = 0; $i < $cnt_iterators; $i++) {
             $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord();
         }
         $results["SAVE_POINT"] = $save_point;
         file_put_contents($save_file, serialize($save_point));
         $this->db->setWorldPermissionsRecursive($save_file);
     }
     $pages = array_values($pages);
     $result_count = count($pages);
     $sort_time = 0;
     if ($raw == 0) {
         // initialize scores
         $sort_start = microtime();
         $max_user_ranks = 0;
         for ($i = 0; $i < $result_count; $i++) {
             $pages[$i]["OUT_SCORE"] = 0;
             if (isset($pages[$i][self::USER_RANKS])) {
                 $j = count($pages[$i][self::USER_RANKS]);
                 if ($max_user_ranks < $j) {
                     $max_user_ranks = $j;
                 }
             }
         }
         if ($max_user_ranks > 0) {
             // flatten user ranks into USCORE<j> fields, 0 where a page
             // lacks that rank
             for ($i = 0; $i < $result_count; $i++) {
                 for ($j = 0; $j < $max_user_ranks; $j++) {
                     if (isset($pages[$i][self::USER_RANKS][$j])) {
                         $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j];
                     } else {
                         $pages[$i]["USCORE{$j}"] = 0;
                     }
                 }
             }
         }
         $subscore_fields = array(self::DOC_RANK, self::RELEVANCE);
         if ($use_proximity) {
             $subscore_fields[] = self::PROXIMITY;
         }
         if ($max_user_ranks > 0) {
             for ($j = 0; $j < $max_user_ranks; $j++) {
                 $subscore_fields[] = "USCORE{$j}";
             }
         }
         $num_fields = count($subscore_fields);
         // Compute Reciprocal Rank Fusion Score
         $alpha = 600 / $num_fields;
         if (isset($pages[0])) {
             foreach ($subscore_fields as $field) {
                 // the three-argument call selects the field that the
                 // following usort() compares on
                 orderCallback($pages[0], $pages[0], $field);
                 usort($pages, "orderCallback");
                 $score = 0;
                 for ($i = 0; $i < $result_count; $i++) {
                     if ($i > 0) {
                         // neighbours with equal field values share a rank
                         if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                             $score++;
                         }
                     }
                     $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score);
                 }
             }
             orderCallback($pages[0], $pages[0], "OUT_SCORE");
         }
         usort($pages, "orderCallback");
         if ($use_proximity) {
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         } else {
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::PROXIMITY] = 1;
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         }
         $sort_time = changeInMicrotime($sort_start);
     }
     if ($num_retrieved < $to_retrieve || !$has_iterator) {
         $results['TOTAL_ROWS'] = $num_retrieved;
     } else {
         $results['TOTAL_ROWS'] = $query_iterator->num_docs;
         //this is only an approximation
     }
     if ($raw == 1 && $save_timestamp_name == "") {
         // raw mode 1: return the requested window without summary lookups
         $pages = array_slice($pages, $start_slice);
         $pages = array_slice($pages, $limit - $start_slice, $num);
         $results['PAGES'] =& $pages;
         if ($old_to_retrieve != $to_retrieve) {
             $results['HARD_QUERY'] = $old_to_retrieve;
         }
         return $results;
     }
     if (QUERY_STATISTICS) {
         $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
         $machine_times = AnalyticsManager::get("MACHINE_TIMES");
         if ($machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />";
         }
         $net_times = AnalyticsManager::get("NET_TIMES");
         $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
         if ($net_times && $max_machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />";
         }
         if ($sort_time) {
             $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . "<br />";
         }
         $summaries_time = microtime();
     }
     $get_pages = array_slice($pages, $limit, $num);
     $to_get_count = count($get_pages);
     $groups_with_docs = false;
     if (preg_match("/\\bsite:doc\\b/", $original_query)) {
         $groups_with_docs = true;
     }
     $out_pages = array();
     $cur_limit = $limit;
     // keep pulling further windows until enough summaries came back or
     // the retrieved pages are exhausted
     while (count($out_pages) < $to_get_count && $get_pages) {
         $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs));
         if ($save_timestamp_name != "") {
             break;
         }
         $cur_limit += $num;
         $get_pages = array_slice($pages, $cur_limit, $num);
     }
     $out_pages = array_slice($out_pages, 0, $num);
     if (QUERY_STATISTICS) {
         $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
         if ($summary_times_string) {
             $round_summary_times = unserialize($summary_times_string);
             $summary_delta_time = changeInMicrotime($summaries_time);
             $summary_time_info = "{$summary_delta_time}<br /> {$in4}";
             $sum_max_time = 0;
             // per round, list every recorded time and add the slowest
             // one to the running total
             foreach ($round_summary_times as $summary_times) {
                 $i = 0;
                 $max_time = 0;
                 foreach ($summary_times as $summary_time) {
                     $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}";
                     $max_time = $summary_time > $max_time ? $summary_time : $max_time;
                     $i++;
                 }
                 $sum_max_time += $max_time;
             }
             $net_overhead = $summary_delta_time - $sum_max_time;
             $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead;
         } else {
             $summary_time_info = changeInMicrotime($summaries_time);
         }
         $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . "<br />";
     }
     $results['PAGES'] =& $out_pages;
     $results['TIME'] = time();
     $lang = guessLocaleFromString($original_query);
     $tokenizer = PhraseParser::getTokenizer($lang);
     //only use tokenizer if no meta word or disjuncts in query
     if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") && method_exists($tokenizer, "tagPartsOfSpeechPhrase") && isset($tokenizer->use_thesaurus)) {
         $results = $this->sortByThesaurusScore($results, $original_query, $lang);
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         // $summary_hash was computed above under this same condition
         $CACHE->set($summary_hash, $results);
     }
     return $results;
 }
Exemple #3
0
 /**
  * In a multiple queue server setting, gets summaries for a set of document
  * by their url, or by group of 5-tuples of the form
  * (machine, key, index, generation, offset). This makes an execMachines
  * call to make a network request to the CrawlController's on each machine
  * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
  * on each machine. The results are then sent back to networkGetCrawlItems
  * and aggregated.
  *
  * @param array $lookups things whose summaries we are trying to look up
  * @param array $machine_urls an array of urls of yioop queue servers
  * @return array of summary data for the matching documents
  */
 function networkGetCrawlItems($lookups, $machine_urls)
 {
     //Set-up network request
     // Work out which machines to ask: only those named in the 5-tuples
     // when every lookup item is a 5-tuple, otherwise all of them.
     $machines = array();
     $num_machines = count($machine_urls);
     foreach ($lookups as $lookup => $lookup_info) {
         if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h' || $lookup_info[0][0] === 'r' || $lookup_info[0][0] === 'g')) {
             $machines = $machine_urls;
             break;
         } else {
             foreach ($lookup_info as $lookup_item) {
                 if (count($lookup_item) == 5) {
                     // first tuple entry is the machine index
                     $index = $lookup_item[0];
                     if (isset($machine_urls[$index])) {
                         $machines[$index] = $machine_urls[$index];
                     }
                 } else {
                     $machines = $machine_urls;
                     break;
                 }
             }
         }
     }
     //Make request
     $page_set = $this->execMachines("getCrawlItems", $machines, serialize($lookups), $num_machines);
     //Aggregate results
     $summaries = array();
     $elapsed_times = array();
     if (is_array($page_set)) {
         foreach ($page_set as $elt) {
             $description_hash = array();
             // NOTE(review): unserialize() is applied to data received over
             // the network; this is only safe if every queue server is
             // trusted.
             $result = isset($elt[self::PAGE]) ? @unserialize(webdecode($elt[self::PAGE])) : false;
             if (!is_array($result)) {
                 // unreadable response: record a zero time and move on
                 $elapsed_times[] = 0;
                 continue;
             }
             $elapsed_times[] = isset($result["ELAPSED_TIME"]) ? $result["ELAPSED_TIME"] : 0;
             unset($result["ELAPSED_TIME"]);
             $ellipsis = "";
             foreach ($result as $lookup => $summary) {
                 if (isset($summaries[$lookup])) {
                     // an earlier machine already answered this lookup;
                     // merge this answer into it
                     if (isset($summary[self::DESCRIPTION])) {
                         $description = trim($summary[self::DESCRIPTION]);
                         if (!isset($summaries[$lookup][self::DESCRIPTION])) {
                             $summaries[$lookup][self::DESCRIPTION] = "";
                         }
                         if (!isset($description_hash[$description])) {
                             // NOTE(review): this assignment replaces any
                             // description already stored for $lookup; the
                             // " .. " separator suggests appending (.=) may
                             // have been intended -- confirm before changing.
                             $summaries[$lookup][self::DESCRIPTION] = $ellipsis . $description;
                             $ellipsis = " .. ";
                             $description_hash[$description] = true;
                         }
                     }
                     // copy over only the attributes not already present
                     foreach ($summary as $attr => $value) {
                         if ($attr != self::DESCRIPTION && !isset($summaries[$lookup][$attr])) {
                             $summaries[$lookup][$attr] = $value;
                         }
                     }
                 } else {
                     $summaries[$lookup] = $summary;
                 }
             }
         }
         // append this round's per-machine times to the serialized list
         // kept under SUMMARY_TIMES
         $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
         if ($summary_times_string) {
             $all_elapsed_times = unserialize($summary_times_string);
         } else {
             $all_elapsed_times = array();
         }
         $all_elapsed_times[] = $elapsed_times;
         AnalyticsManager::set("SUMMARY_TIMES", serialize($all_elapsed_times));
     }
     return $summaries;
 }
Exemple #4
0
 /**
  * Send the provided view to output, drawing it with the given
  * data variable, using the current locale for translation, and
  * writing mode
  *
  * @param string $view   the name of the view to draw
  * @param array $data   an array of values to use in drawing the view
  */
 function displayView($view, $data)
 {
     // locale-dependent layout values the view needs
     $data['LOCALE_TAG'] = getLocaleTag();
     $data['LOCALE_DIR'] = getLocaleDirection();
     $data['BLOCK_PROGRESSION'] = getBlockProgression();
     $data['WRITING_MODE'] = getWritingMode();
     if (QUERY_STATISTICS) {
         $data['QUERY_STATISTICS'] = array();
         $machine = isset($_SERVER["HTTP_HOST"]) ? htmlentities($_SERVER["HTTP_HOST"]) : "localhost";
         $machine_uri = isset($_SERVER['REQUEST_URI']) ? htmlentities($_SERVER['REQUEST_URI']) : "/";
         // $_SERVER["HTTPS"] is non-empty for HTTPS requests, but IIS/ISAPI
         // sets it to "off" for plain HTTP, so isset() alone is not enough.
         $is_https = !empty($_SERVER["HTTPS"]) && strtolower((string) $_SERVER["HTTPS"]) !== "off";
         $protocol = $is_https ? "https://" : "http://";
         if ($machine == '::1') {
             //IPv6 :(
             // bracket the literal address; the path separator comes from
             // $machine_uri
             $machine = "[::1]";
             //used if the fetching and queue serving on the same machine
         }
         $data['YIOOP_INSTANCE'] = $protocol . $machine . $machine_uri;
         $data['TOTAL_ELAPSED_TIME'] = 0;
         // gather the query log and elapsed time of every model's db
         foreach ($this->model_instances as $model_name => $model) {
             $data['QUERY_STATISTICS'] = array_merge($model->db->query_log, $data['QUERY_STATISTICS']);
             $data['TOTAL_ELAPSED_TIME'] += $model->db->total_time;
         }
         $locale_info = getLocaleQueryStatistics();
         $data['QUERY_STATISTICS'] = array_merge($locale_info['QUERY_LOG'], $data['QUERY_STATISTICS']);
         $data['TOTAL_ELAPSED_TIME'] += $locale_info['TOTAL_ELAPSED_TIME'];
         $mail_total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
         $mail_messages = AnalyticsManager::get("MAIL_MESSAGES");
         if ($mail_total_time && $mail_messages) {
             $data['QUERY_STATISTICS'] = array_merge($mail_messages, $data['QUERY_STATISTICS']);
             $data['TOTAL_ELAPSED_TIME'] += $mail_total_time;
         }
     }
     $data['c'] = isset($_REQUEST['c']) ? $_REQUEST['c'] : NULL;
     if (isset($_SESSION['DISPLAY_MESSAGE'])) {
         // hand the message to the view once, then clear it from the session
         $data['DISPLAY_MESSAGE'] = $_SESSION['DISPLAY_MESSAGE'];
         unset($_SESSION['DISPLAY_MESSAGE']);
     }
     $this->view($view)->render($data);
 }
Exemple #5
0
 /**
  * Hook function used by currentDocsWithWord to return the current block
  * of docs if it is not cached
  *
  * @return mixed doc ids and score if there are docs left, -1 otherwise
  */
 function findDocsWithWord()
 {
     $query = $this->base_query . "&num={$this->results_per_block}&limit={$this->limit}";
     $sites = array();
     $lookup = array();
     $i = 0;
     $j = 0;
     // build one request per queue server still flagged as having more
     // results; $lookup maps request position back to server index
     foreach ($this->queue_servers as $server) {
         if ($this->more_flags[$i]) {
             $sites[$j][CrawlConstants::URL] = $server . "?" . $query . "&machine={$i}";
             $lookup[$j] = $i;
             $j++;
         }
         $i++;
     }
     $net_times = AnalyticsManager::get("NET_TIMES");
     $net_times = $net_times ? $net_times : 0;
     $download_time = microtime();
     $downloads = array();
     if (count($sites) > 0) {
         $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true);
     }
     $net_times += changeInMicrotime($download_time);
     AnalyticsManager::set("NET_TIMES", $net_times);
     $results = array();
     $count = count($downloads);
     $this->num_docs = 0;
     $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
     $machine_times = AnalyticsManager::get("MACHINE_TIMES");
     $indent = $machine_times ? "<br />{$in4}" : $in4;
     $machine_times = $machine_times ? $machine_times : "";
     $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
     $max_machine_times = $max_machine_times ? $max_machine_times : 0;
     $max_time = 0;
     $num_with_results = $count;
     for ($j = 0; $j < $count; $j++) {
         if (isset($downloads[$j][self::PAGE])) {
             // NOTE(review): unserialize() is applied to a downloaded
             // response; this is only safe if the queue servers are trusted.
             $pre_result = @unserialize($downloads[$j][self::PAGE]);
             if (!isset($pre_result["TOTAL_ROWS"]) || $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                 // a short (or unreadable) block means this server has
                 // nothing more to give
                 $this->more_flags[$lookup[$j]] = false;
                 $num_with_results--;
             }
             if (isset($pre_result["TOTAL_ROWS"])) {
                 $this->num_docs += $pre_result["TOTAL_ROWS"];
             }
             if (isset($pre_result["PAGES"])) {
                 foreach ($pre_result["PAGES"] as $page_data) {
                     if (isset($page_data[self::KEY])) {
                         $results[$page_data[self::KEY]] = $page_data;
                         $results[$page_data[self::KEY]][self::MACHINE_ID] = $lookup[$j];
                     }
                 }
             }
             // the response may have failed to unserialize or may lack a
             // timing entry; count that as zero
             $elapsed_time = isset($pre_result['ELAPSED_TIME']) ? $pre_result['ELAPSED_TIME'] : 0;
             $max_time = max($max_time, $elapsed_time);
             $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
             $machine_times .= $indent . $lookup_link . $elapsed_time . "&nbsp;&nbsp;";
             $indent = "";
         }
     }
     // NOTE(review): $pre_result here is whatever the last decoded response
     // was, so HARD_QUERY from earlier responses is ignored -- confirm that
     // this is intended.
     if (isset($pre_result["HARD_QUERY"])) {
         $this->hard_query = $pre_result["HARD_QUERY"];
     }
     if ($num_with_results > 0) {
         // spread the same total request size over the servers that still
         // have results
         $this->next_results_per_block = ceil(floatval($count * $this->results_per_block) / floatval($num_with_results));
     }
     $max_machine_times += $max_time;
     AnalyticsManager::set("MACHINE_TIMES", $machine_times);
     AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
     if ($results == array()) {
         $results = -1;
     }
     if ($results != -1) {
         if ($this->filter != NULL) {
             // drop documents whose host key is on the filter list
             foreach ($results as $keys => $data) {
                 $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                 if (in_array($host_key, $this->filter)) {
                     unset($results[$keys]);
                 }
             }
         }
     }
     // count() on the -1 sentinel throws TypeError on PHP 8; older versions
     // returned 1 for a scalar, so that value is kept for the sentinel.
     $this->count_block = is_array($results) ? count($results) : 1;
     $this->pages = $results;
     return $results;
 }