/**
 * Sends an email (much like PHP's mail command, but not requiring
 * a configured smtp server on the current machine)
 *
 * If USE_MAIL_PHP is set the message is handed to PHP's mail() and the
 * method returns immediately. Otherwise an SMTP session is opened with
 * startSession(); the MAIL FROM, RCPT TO, DATA and message commands are
 * issued in order through smtpCommand(), stopping at the first response
 * that differs from the expected code, and the session is closed with
 * endSession(). Failures are appended to $this->messages. When
 * QUERY_STATISTICS is on, the elapsed time and collected messages are
 * recorded with AnalyticsManager under MAIL_TOTAL_TIME / MAIL_MESSAGES.
 *
 * @param string $subject subject line of the email
 * @param string $from sender email address; if empty then
 *     $this->sender_email is used
 * @param string $to recipient email address
 * @param string $message message body for the email
 */
function send($subject, $from, $to, $message)
{
    $start_time = microtime();
    /* $subject, $from and $to are copied verbatim into header lines and
       SMTP commands, so an embedded CR or LF would let a caller smuggle
       in additional headers or commands. Strip them before use. */
    $line_breaks = array("\r", "\n");
    $subject = str_replace($line_breaks, "", (string)$subject);
    $from = str_replace($line_breaks, "", (string)$from);
    $to = str_replace($line_breaks, "", (string)$to);
    if ($from == "") {
        $from = $this->sender_email;
    }
    $eol = self::EOL;
    if (USE_MAIL_PHP) {
        $header = "From: " . $from . $eol;
        mail($to, $subject, $message, $header);
        return;
    }
    $this->messages = "";
    /* SMTP transparency (RFC 5321, 4.5.2): a body line that begins with
       "." must be sent with an extra leading ".", otherwise a line
       consisting of a single dot would end the DATA phase early. */
    $body = str_replace("\n.", "\n..", (string)$message);
    if (substr($body, 0, 1) === ".") {
        $body = "." . $body;
    }
    // RFC 2822 date format carries the four-digit year mail requires
    $mail = "Date: " . date(DATE_RFC2822) . $eol;
    $mail .= "Subject: " . $subject . $eol;
    $mail .= "From: " . $from . $eol;
    $mail .= "To: " . $to . $eol;
    // the trailing lone "." is the end-of-data marker
    $mail .= $eol . $eol . $body . $eol . ".";
    // each command is mapped to the response expected for it
    $commands = array(
        "MAIL FROM: <{$from}>" => self::OKAY,
        "RCPT TO: <{$to}>" => self::OKAY,
        "DATA" => self::START_INPUT,
        $mail => self::OKAY
    );
    if ($this->startSession()) {
        foreach ($commands as $command => $good_response) {
            $response = $this->smtpCommand($command);
            if ($response != $good_response) {
                $this->messages .=
                    "{$command} failed!! {$response} {$good_response}\n";
                break;
            }
        }
        $this->endSession();
    }
    if (QUERY_STATISTICS) {
        $current_messages = AnalyticsManager::get("MAIL_MESSAGES");
        if (!$current_messages) {
            $current_messages = array();
        }
        $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
        if (!$total_time) {
            $total_time = 0;
        }
        $elapsed_time = changeInMicrotime($start_time);
        $total_time += $elapsed_time;
        $current_messages[] = array(
            "QUERY" => "<p>Send Mail</p>" . "<pre>" .
                wordwrap($this->messages, 60, "\n", true) . "</pre>",
            "ELAPSED_TIME" => $elapsed_time
        );
        AnalyticsManager::set("MAIL_MESSAGES", $current_messages);
        AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time);
    }
}
/**
 * Gets doc summaries of documents containing given words and meeting the
 * additional provided criteria
 *
 * The method first tries the query cache (when allowed), then pulls
 * documents from the iterator returned by getQueryIterator(), optionally
 * writes a save point for resumable queries, ranks the documents by
 * reciprocal rank fusion when $raw == 0, fetches the summaries for the
 * requested window with getSummariesFromOffsets(), and finally stores
 * the result in the cache.
 *
 * @param array $word_structs an array of word_structs. Here a word_struct
 *     is an associative array with at least the following fields
 *     KEYS -- an array of word keys
 *     QUOTE_POSITIONS -- an array of positions of words that appeared in
 *         quotes (so need to be matched exactly)
 *     DISALLOW_PHRASES -- an array of words the document must not contain
 *     WEIGHT -- a weight to multiply scores returned from this iterator by
 *     INDEX_NAME -- an index timestamp to get results from
 * @param int $limit number of first document in order to return
 * @param int $num number of documents to return summaries of
 * @param array& $filter an array of hashes of domains to filter from
 *     results. Its 'time' entry, if any, is read and then removed from
 *     the array (the array is passed by reference, so the caller sees
 *     the removal)
 * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
 *     an attempt will be made to look up the results in either
 *     the file cache or memcache. Otherwise, items will be recomputed
 *     and then potentially restored in cache
 * @param int $raw ($raw == 0) normal grouping, ($raw > 0)
 *     no grouping done on data. if ($raw == 1) no lookups of summaries
 *     done
 * @param array $queue_servers a list of urls of yioop machines which might
 *     be used during lookup
 * @param string $original_query if set, the original query that corresponds
 *     to $word_structs
 * @param string $save_timestamp_name if this timestamp is not empty, then
 *     save iterate position, so can resume on future queries that make
 *     use of the timestamp. If used then $limit ignored and get next $num
 *     docs after $save_timestamp 's previous iterate position.
 * @param bool $limit_news if true the number of media:news items to
 *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
 *
 * @return array document summaries
 */
function getSummariesByHash($word_structs, $limit, $num, &$filter,
    $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(),
    $original_query = "", $save_timestamp_name = "", $limit_news = true)
{
    global $CACHE;
    // indentation prefixes used when building the statistics output
    $indent = " ";
    $in2 = $indent . $indent;
    $in3 = $in2 . $indent;
    $in4 = $in2 . $in2;
    if (QUERY_STATISTICS) {
        $lookup_time = microtime();
    }
    $use_proximity = false;
    $time = time();
    // proximity only makes sense when more than one term is involved
    if (count($word_structs) > 1 ||
        (isset($word_structs[0]["KEYS"]) &&
        count($word_structs[0]["KEYS"]) > 1) ||
        ($word_structs == array() &&
        substr_count($original_query, " ") > 1)) {
        $use_proximity = true;
    }
    if (!isset($filter['time'])) {
        $filter['time'] = 0;
    }
    $filter_time = $filter['time'];
    unset($filter['time']); //iterators don't expect time field
    // round the retrieval window out to whole cache pages
    $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) *
        self::NUM_CACHE_PAGES;
    $start_slice = floor($limit / self::NUM_CACHE_PAGES) *
        self::NUM_CACHE_PAGES;
    if ($save_timestamp_name != "") {
        $to_retrieve = $num;
        $limit = 0;
        $start_slice = 0;
    }
    if (USE_CACHE && $save_timestamp_name == "") {
        $mem_tmp = serialize($raw) . serialize($word_structs) .
            $original_query . $this->index_name;
        $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num);
        if ($use_cache_if_allowed) {
            $results = $CACHE->get($summary_hash);
            if (!isset($results['TIME']) ||
                $filter_time > $results['TIME']) {
                //if filter has changed since cached, then invalidate cache
                $results = false;
            }
            if (isset($results['TIME'])) {
                $cached_time = $time - $results['TIME'];
            } else {
                $cached_time = $time;
            }
            if ($cached_time > MAX_QUERY_CACHE_TIME) {
                $results = false;
            }
            if (isset($results['PAGES'])) {
                $close_prefix = WORK_DIRECTORY . "/schedules/" .
                    self::index_closed_name;
                $has_changeable_results = false;
                $seen_times = array();
                foreach ($results['PAGES'] as $page) {
                    if (!isset($page[self::CRAWL_TIME]) ||
                        in_array($page[self::CRAWL_TIME], $seen_times)) {
                        continue;
                    }
                    $seen_times[] = $page[self::CRAWL_TIME];
                    $current_closed = $close_prefix .
                        $page[self::CRAWL_TIME] . ".txt";
                    if (!file_exists($current_closed)) {
                        //either feed result or from active crawl
                        $has_changeable_results = true;
                        break;
                    }
                }
                if ($has_changeable_results) {
                    if ($cached_time > MIN_QUERY_CACHE_TIME) {
                        $results = false;
                    }
                }
            }
            if (QUERY_STATISTICS) {
                $this->query_info['QUERY'] .=
                    "{$in2}<b>Cache Lookup Time</b>: " .
                    changeInMicrotime($lookup_time) . "<br />";
            }
            if ($results !== false) {
                return $results;
            }
        }
    }
    /* On a cache miss $results holds false at this point; start from a
       real array so the writes below do not rely on converting false
       into an array (deprecated as of PHP 8.1). */
    $results = array();
    $old_to_retrieve = $to_retrieve;
    $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw,
        $to_retrieve, $queue_servers, $original_query,
        $save_timestamp_name, $limit_news);
    $have_iterator = is_object($query_iterator);
    $num_retrieved = 0;
    $pages = array();
    if ($have_iterator) {
        while ($num_retrieved < $to_retrieve &&
            is_array($next_docs = $query_iterator->nextDocsWithWord())) {
            // += keeps the first occurrence of a key
            $pages += $next_docs;
            $num_retrieved = count($pages);
        }
    }
    if ($save_timestamp_name != "" && ($queue_servers == array() ||
        $this->isSingleLocalhost($queue_servers))) {
        // used for archive crawls of crawl mixes
        $save_file = CRAWL_DIR . '/schedules/' . self::save_point .
            $save_timestamp_name . ".txt";
        // without an iterator there is nothing to record: empty save point
        $iterators = $have_iterator ? $query_iterator->save_iterators :
            array();
        $cnt_iterators = count($iterators);
        $save_point = array();
        for ($i = 0; $i < $cnt_iterators; $i++) {
            $save_point[$i] =
                $iterators[$i]->currentGenDocOffsetWithWord();
        }
        $results["SAVE_POINT"] = $save_point;
        file_put_contents($save_file, serialize($save_point));
        $this->db->setWorldPermissionsRecursive($save_file);
    }
    $pages = array_values($pages);
    $result_count = count($pages);
    $sort_time = 0;
    if ($raw == 0) {
        // initialize scores
        $sort_start = microtime();
        $max_user_ranks = 0;
        for ($i = 0; $i < $result_count; $i++) {
            $pages[$i]["OUT_SCORE"] = 0;
            if (isset($pages[$i][self::USER_RANKS])) {
                $j = count($pages[$i][self::USER_RANKS]);
                if ($max_user_ranks < $j) {
                    $max_user_ranks = $j;
                }
            }
        }
        if ($max_user_ranks > 0) {
            // give every page the same set of USCOREj fields
            for ($i = 0; $i < $result_count; $i++) {
                for ($j = 0; $j < $max_user_ranks; $j++) {
                    if (isset($pages[$i][self::USER_RANKS][$j])) {
                        $pages[$i]["USCORE{$j}"] =
                            $pages[$i][self::USER_RANKS][$j];
                    } else {
                        $pages[$i]["USCORE{$j}"] = 0;
                    }
                }
            }
        }
        $subscore_fields = array(self::DOC_RANK, self::RELEVANCE);
        if ($use_proximity) {
            $subscore_fields[] = self::PROXIMITY;
        }
        if ($max_user_ranks > 0) {
            for ($j = 0; $j < $max_user_ranks; $j++) {
                $subscore_fields[] = "USCORE{$j}";
            }
        }
        $num_fields = count($subscore_fields);
        // Compute Reciprocal Rank Fusion Score
        $alpha = 600 / $num_fields;
        if (isset($pages[0])) {
            foreach ($subscore_fields as $field) {
                /* the three argument call is made before each sort to
                   choose the field that orderCallback compares on */
                orderCallback($pages[0], $pages[0], $field);
                usort($pages, "orderCallback");
                $score = 0;
                for ($i = 0; $i < $result_count; $i++) {
                    if ($i > 0) {
                        // pages tied on this field share a rank
                        if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                            $score++;
                        }
                    }
                    $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score);
                }
            }
            orderCallback($pages[0], $pages[0], "OUT_SCORE");
        }
        usort($pages, "orderCallback");
        if ($use_proximity) {
            for ($i = 0; $i < $result_count; $i++) {
                $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
            }
        } else {
            for ($i = 0; $i < $result_count; $i++) {
                $pages[$i][self::PROXIMITY] = 1;
                $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
            }
        }
        $sort_time = changeInMicrotime($sort_start);
    }
    if ($num_retrieved < $to_retrieve) {
        $results['TOTAL_ROWS'] = $num_retrieved;
    } else {
        //this is only an approximation
        $results['TOTAL_ROWS'] = $have_iterator ?
            $query_iterator->num_docs : $num_retrieved;
    }
    if ($raw == 1 && $save_timestamp_name == "") {
        $pages = array_slice($pages, $start_slice);
        $pages = array_slice($pages, $limit - $start_slice, $num);
        $results['PAGES'] =& $pages;
        if ($old_to_retrieve != $to_retrieve) {
            $results['HARD_QUERY'] = $old_to_retrieve;
        }
        return $results;
    }
    if (QUERY_STATISTICS) {
        $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " .
            changeInMicrotime($lookup_time) . "<br />";
        $machine_times = AnalyticsManager::get("MACHINE_TIMES");
        if ($machine_times) {
            $this->query_info['QUERY'] .=
                "{$in3}<i>Machine Sub-Times</i>:<br />" .
                $machine_times . "<br />";
        }
        $net_times = AnalyticsManager::get("NET_TIMES");
        $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
        if ($net_times && $max_machine_times) {
            $this->query_info['QUERY'] .=
                "{$in3}<i>Network Overhead Sub-Time</i>: " .
                ($net_times - $max_machine_times) . "<br />";
        }
        if ($sort_time) {
            $this->query_info['QUERY'] .=
                "{$in3}<i>Merge-Rank Sub-Time</i>: " .
                $sort_time . "<br />";
        }
        $summaries_time = microtime();
    }
    $get_pages = array_slice($pages, $limit, $num);
    $to_get_count = count($get_pages);
    $groups_with_docs = false;
    if (preg_match("/\\bsite:doc\\b/", $original_query)) {
        $groups_with_docs = true;
    }
    $out_pages = array();
    $cur_limit = $limit;
    /* keep fetching further windows of $num pages until enough summaries
       have come back or the retrieved pages run out */
    while (count($out_pages) < $to_get_count && $get_pages) {
        $out_pages = array_merge($out_pages,
            $this->getSummariesFromOffsets($get_pages, $queue_servers,
            $raw, $groups_with_docs));
        if ($save_timestamp_name != "") {
            break;
        }
        $cur_limit += $num;
        $get_pages = array_slice($pages, $cur_limit, $num);
    }
    $out_pages = array_slice($out_pages, 0, $num);
    if (QUERY_STATISTICS) {
        $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
        if ($summary_times_string) {
            $round_summary_times = unserialize($summary_times_string);
            $summary_delta_time = changeInMicrotime($summaries_time);
            $summary_time_info = "{$summary_delta_time}<br /> {$in4}";
            $sum_max_time = 0;
            foreach ($round_summary_times as $summary_times) {
                $i = 0;
                $max_time = 0;
                foreach ($summary_times as $summary_time) {
                    $summary_time_info .= "ID_{$i}: " . $summary_time .
                        "{$indent}";
                    $max_time = $summary_time > $max_time ?
                        $summary_time : $max_time;
                    $i++;
                }
                $sum_max_time += $max_time;
            }
            $net_overhead = $summary_delta_time - $sum_max_time;
            $summary_time_info .=
                "<br />{$in3}<i>Network Overhead Sub-Time</i>: " .
                $net_overhead;
        } else {
            $summary_time_info = changeInMicrotime($summaries_time);
        }
        $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " .
            $summary_time_info . "<br />";
    }
    $results['PAGES'] =& $out_pages;
    $results['TIME'] = time();
    $lang = guessLocaleFromString($original_query);
    $tokenizer = PhraseParser::getTokenizer($lang);
    //only use tokenizer if no meta word or disjuncts in query
    if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer &&
        method_exists($tokenizer, "scoredThesaurusMatches") &&
        method_exists($tokenizer, "tagPartsOfSpeechPhrase") &&
        isset($tokenizer->use_thesaurus)) {
        $results = $this->sortByThesaurusScore($results, $original_query,
            $lang);
    }
    if (USE_CACHE && $save_timestamp_name == "") {
        $CACHE->set($summary_hash, $results);
    }
    return $results;
}
/**
 * In a multiple queue server setting, gets summaries for a set of document
 * by their url, or by group of 5-tuples of the form
 * (machine, key, index, generation, offset). This makes an execMachines
 * call to make a network request to the CrawlController's on each machine
 * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
 * on each machine. The results are then sent back to networkGetCrawlItems
 * and aggregated.
 *
 * The elapsed time reported by each machine is appended, as one more
 * round, to the serialized SUMMARY_TIMES list kept by AnalyticsManager.
 *
 * @param array $lookups things whose summaries we are trying to look up,
 *     keyed by lookup name. Each value is either a two element array
 *     whose first element is a string beginning with 'h', 'r' or 'g'
 *     (presumably the url style of lookup -- confirm against callers),
 *     or a list of 5-tuples as described above
 * @param array $machine_urls an array of urls of yioop queue servers
 * @return array of summary data for the matching documents
 */
function networkGetCrawlItems($lookups, $machine_urls)
{
    //Set-up network request
    $machines = array();
    $num_machines = count($machine_urls);
    foreach ($lookups as $lookup => $lookup_info) {
        if (count($lookup_info) == 2 && ($lookup_info[0][0] === 'h' ||
            $lookup_info[0][0] === 'r' || $lookup_info[0][0] === 'g')) {
            // this kind of lookup has to be sent to every machine
            $machines = $machine_urls;
            break;
        }
        foreach ($lookup_info as $lookup_item) {
            if (!is_array($lookup_item) || count($lookup_item) != 5) {
                /* not a 5-tuple, so the owning machine is unknown: ask
                   all of them. Once every machine is selected no later
                   lookup can change the set, so stop scanning. */
                $machines = $machine_urls;
                break 2;
            }
            // first component of the tuple is the machine's index
            $index = $lookup_item[0];
            if (isset($machine_urls[$index])) {
                $machines[$index] = $machine_urls[$index];
            }
        }
    }
    //Make request
    $page_set = $this->execMachines("getCrawlItems", $machines,
        serialize($lookups), $num_machines);
    //Aggregate results
    $summaries = array();
    $elapsed_times = array();
    if (is_array($page_set)) {
        foreach ($page_set as $elt) {
            $description_hash = array();
            /* NOTE(review): unserialize() is applied to data received
               over the network; this is only safe if the queue servers
               are trusted. */
            $result = @unserialize(webdecode($elt[self::PAGE]));
            if (!is_array($result)) {
                $elapsed_times[] = 0;
                continue;
            }
            // a response without a timing is recorded as taking 0
            $elapsed_times[] = isset($result["ELAPSED_TIME"]) ?
                $result["ELAPSED_TIME"] : 0;
            unset($result["ELAPSED_TIME"]);
            $ellipsis = "";
            foreach ($result as $lookup => $summary) {
                if (!isset($summaries[$lookup])) {
                    $summaries[$lookup] = $summary;
                    continue;
                }
                // lookup already answered by an earlier machine: merge
                if (isset($summary[self::DESCRIPTION])) {
                    $description = trim($summary[self::DESCRIPTION]);
                    if (!isset($summaries[$lookup][self::DESCRIPTION])) {
                        $summaries[$lookup][self::DESCRIPTION] = "";
                    }
                    if (!isset($description_hash[$description])) {
                        /* NOTE(review): this assignment replaces the
                           description stored so far rather than appending
                           to it, although the " .. " separator suggests
                           appending may have been intended. Behaviour is
                           left unchanged -- confirm the intent. */
                        $summaries[$lookup][self::DESCRIPTION] =
                            $ellipsis . $description;
                        $ellipsis = " .. ";
                        $description_hash[$description] = true;
                    }
                }
                // other attributes: first machine to supply one wins
                foreach ($summary as $attr => $value) {
                    if ($attr != self::DESCRIPTION &&
                        !isset($summaries[$lookup][$attr])) {
                        $summaries[$lookup][$attr] = $value;
                    }
                }
            }
        }
        $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
        if ($summary_times_string) {
            $all_elapsed_times = unserialize($summary_times_string);
        } else {
            $all_elapsed_times = array();
        }
        $all_elapsed_times[] = $elapsed_times;
        AnalyticsManager::set("SUMMARY_TIMES",
            serialize($all_elapsed_times));
    }
    return $summaries;
}
/**
 * Send the provided view to output, drawing it with the given
 * data variable, using the current locale for translation, and
 * writing mode
 *
 * Before rendering, locale fields are added to $data. If
 * QUERY_STATISTICS is on, the query logs and elapsed times of every
 * model in $this->model_instances, of the locale system, and of any
 * mail sent are merged into $data as well. A pending
 * $_SESSION['DISPLAY_MESSAGE'] is moved into $data and cleared.
 *
 * @param string $view the name of the view to draw
 * @param array $data an array of values to use in drawing the view
 */
function displayView($view, $data)
{
    $data['LOCALE_TAG'] = getLocaleTag();
    $data['LOCALE_DIR'] = getLocaleDirection();
    $data['BLOCK_PROGRESSION'] = getBlockProgression();
    $data['WRITING_MODE'] = getWritingMode();
    if (QUERY_STATISTICS) {
        $data['QUERY_STATISTICS'] = array();
        $machine = isset($_SERVER["HTTP_HOST"]) ?
            htmlentities($_SERVER["HTTP_HOST"]) : "localhost";
        $machine_uri = isset($_SERVER['REQUEST_URI']) ?
            htmlentities($_SERVER['REQUEST_URI']) : "/";
        /* $_SERVER["HTTPS"] being set is not enough to conclude https:
           under IIS it is set to "off" for plain http requests. */
        $is_https = !empty($_SERVER["HTTPS"]) &&
            strtolower($_SERVER["HTTPS"]) !== "off";
        $protocol = $is_https ? "https://" : "http://";
        if ($machine == '::1') { //IPv6 :(
            /* used if the fetching and queue serving on the same machine.
               No trailing slash here: $machine_uri already begins with
               one, so adding another produced "[::1]//...". */
            $machine = "[::1]";
        }
        $data['YIOOP_INSTANCE'] = $protocol . $machine . $machine_uri;
        $data['TOTAL_ELAPSED_TIME'] = 0;
        foreach ($this->model_instances as $model_name => $model) {
            $data['QUERY_STATISTICS'] = array_merge(
                $model->db->query_log, $data['QUERY_STATISTICS']);
            $data['TOTAL_ELAPSED_TIME'] += $model->db->total_time;
        }
        $locale_info = getLocaleQueryStatistics();
        $data['QUERY_STATISTICS'] = array_merge(
            $locale_info['QUERY_LOG'], $data['QUERY_STATISTICS']);
        $data['TOTAL_ELAPSED_TIME'] += $locale_info['TOTAL_ELAPSED_TIME'];
        $mail_total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
        $mail_messages = AnalyticsManager::get("MAIL_MESSAGES");
        if ($mail_total_time && $mail_messages) {
            $data['QUERY_STATISTICS'] = array_merge($mail_messages,
                $data['QUERY_STATISTICS']);
            $data['TOTAL_ELAPSED_TIME'] += $mail_total_time;
        }
    }
    $data['c'] = isset($_REQUEST['c']) ? $_REQUEST['c'] : NULL;
    if (isset($_SESSION['DISPLAY_MESSAGE'])) {
        // message is shown once, so remove it from the session
        $data['DISPLAY_MESSAGE'] = $_SESSION['DISPLAY_MESSAGE'];
        unset($_SESSION['DISPLAY_MESSAGE']);
    }
    $this->view($view)->render($data);
}
/**
 * Hook function used by currentDocsWithWord to return the current block
 * of docs if it is not cached
 *
 * Requests the current block from every queue server whose more_flags
 * entry is still set, merges the returned pages keyed by their KEY field
 * (tagging each with the MACHINE_ID it came from), removes pages whose
 * host key is in $this->filter, and updates $this->num_docs,
 * $this->more_flags, $this->next_results_per_block, $this->count_block
 * and $this->pages. Download and per-machine timings are accumulated in
 * AnalyticsManager under NET_TIMES, MACHINE_TIMES and MAX_MACHINE_TIMES.
 *
 * @return mixed doc ids and score if there are docs left, -1 otherwise
 */
function findDocsWithWord()
{
    $query = $this->base_query .
        "&num={$this->results_per_block}&limit={$this->limit}";
    $sites = array();
    // $lookup maps position in $sites back to the queue server's index
    $lookup = array();
    $i = 0;
    $j = 0;
    foreach ($this->queue_servers as $server) {
        if ($this->more_flags[$i]) {
            $sites[$j][CrawlConstants::URL] =
                $server . "?" . $query . "&machine={$i}";
            $lookup[$j] = $i;
            $j++;
        }
        $i++;
    }
    $net_times = AnalyticsManager::get("NET_TIMES");
    $net_times = $net_times ? $net_times : 0;
    $download_time = microtime();
    $downloads = array();
    if (count($sites) > 0) {
        $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true);
    }
    $net_times += changeInMicrotime($download_time);
    AnalyticsManager::set("NET_TIMES", $net_times);
    $results = array();
    $count = count($downloads);
    $this->num_docs = 0;
    $in4 = " ";
    $machine_times = AnalyticsManager::get("MACHINE_TIMES");
    $indent = $machine_times ? "<br />{$in4}" : $in4;
    $machine_times = $machine_times ? $machine_times : "";
    $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
    $max_machine_times = $max_machine_times ? $max_machine_times : 0;
    $max_time = 0;
    $num_with_results = $count;
    $pre_result = array();
    for ($j = 0; $j < $count; $j++) {
        if (!isset($downloads[$j][self::PAGE])) {
            continue;
        }
        $pre_result = @unserialize($downloads[$j][self::PAGE]);
        if (!is_array($pre_result)) {
            /* undecodable response: treat it as an empty result rather
               than indexing into false/a scalar below */
            $pre_result = array();
        }
        if (!isset($pre_result["TOTAL_ROWS"]) ||
            $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
            // a short block means this server has nothing further
            $this->more_flags[$lookup[$j]] = false;
            $num_with_results--;
        }
        if (isset($pre_result["TOTAL_ROWS"])) {
            $this->num_docs += $pre_result["TOTAL_ROWS"];
        }
        if (isset($pre_result["PAGES"])) {
            foreach ($pre_result["PAGES"] as $page_data) {
                if (isset($page_data[self::KEY])) {
                    $results[$page_data[self::KEY]] = $page_data;
                    $results[$page_data[self::KEY]][self::MACHINE_ID] =
                        $lookup[$j];
                }
            }
        }
        $elapsed_time = isset($pre_result['ELAPSED_TIME']) ?
            $pre_result['ELAPSED_TIME'] : 0;
        $max_time = max($max_time, $elapsed_time);
        $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
        $machine_times .= $indent . $lookup_link . $elapsed_time . " ";
        $indent = "";
    }
    // only the last response that carried a page is consulted here
    if (isset($pre_result["HARD_QUERY"])) {
        $this->hard_query = $pre_result["HARD_QUERY"];
    }
    if ($num_with_results > 0) {
        /* spread the block size over the servers that still have
           results */
        $this->next_results_per_block = ceil(
            floatval($count * $this->results_per_block) /
            floatval($num_with_results));
    }
    $max_machine_times += $max_time;
    AnalyticsManager::set("MACHINE_TIMES", $machine_times);
    AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
    if ($results == array()) {
        $results = -1;
    }
    if (is_array($results)) {
        if ($this->filter != NULL) {
            foreach ($results as $keys => $data) {
                $host_key =
                    substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                if (in_array($host_key, $this->filter)) {
                    unset($results[$keys]);
                }
            }
        }
    }
    /* $results is -1 when nothing came back; count() on an int is a
       TypeError as of PHP 8, so report an empty block in that case. */
    $this->count_block = is_array($results) ? count($results) : 0;
    $this->pages = $results;
    return $results;
}