/**
 * In a multiple queue server setting, gets summaries for a set of documents
 * by their url, or by group of 5-tuples of the form
 * (machine, key, index, generation, offset). This makes an execMachines
 * call to make a network request to the CrawlController's on each machine
 * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems)
 * on each machine. The results are then sent back to networkGetCrawlItems
 * and aggregated.
 *
 * Side effect: when execMachines returns an array, the per-machine elapsed
 * times of this call are appended to the serialized "SUMMARY_TIMES" entry
 * kept by AnalyticsManager.
 *
 * @param array $lookups things whose summaries we are trying to look up.
 *     Each value is either a two element array whose first element is a
 *     string starting with 'h', 'r' or 'g' (in which case every machine is
 *     queried), or an array of 5-tuples whose first component is an index
 *     into $machine_urls
 * @param array $machine_urls an array of urls of yioop queue servers
 * @return array of summary data for the matching documents
 */
function networkGetCrawlItems($lookups, $machine_urls)
{
    //Set-up network request
    $machines = array();
    $num_machines = count($machine_urls);
    foreach ($lookups as $lookup => $lookup_info) {
        if (count($lookup_info) == 2) {
            // isset() avoids an offset warning when the first element is
            // an empty string
            $first_char = isset($lookup_info[0][0]) ?
                $lookup_info[0][0] : NULL;
            if ($first_char === 'h' || $first_char === 'r' ||
                $first_char === 'g') {
                $machines = $machine_urls;
                break;
            }
        }
        foreach ($lookup_info as $lookup_item) {
            if (count($lookup_item) != 5) {
                // not a 5-tuple, so we cannot tell which machine holds the
                // item: ask all of them. Nothing a later lookup could add
                // would change $machines, so stop scanning altogether.
                $machines = $machine_urls;
                break 2;
            }
            $index = $lookup_item[0];
            // skip tuples naming a machine we have no url for rather than
            // handing a null url on to execMachines
            if (isset($machine_urls[$index])) {
                $machines[$index] = $machine_urls[$index];
            }
        }
    }
    //Make request
    $page_set = $this->execMachines("getCrawlItems", $machines,
        serialize($lookups), $num_machines);
    //Aggregate results
    $summaries = array();
    $elapsed_times = array();
    if (!is_array($page_set)) {
        return $summaries;
    }
    foreach ($page_set as $elt) {
        // both of these are deliberately reset for every machine response
        $description_hash = array();
        $ellipsis = "";
        // NOTE(review): unserialize() is applied to data received over the
        // network; only safe if the queue servers are trusted.
        $result = isset($elt[self::PAGE]) ?
            @unserialize(webdecode($elt[self::PAGE])) : false;
        if (!is_array($result)) {
            $elapsed_times[] = 0;
            continue;
        }
        // a response lacking a timing is recorded as 0, the same value
        // used for an undecodable response
        $elapsed_times[] = isset($result["ELAPSED_TIME"]) ?
            $result["ELAPSED_TIME"] : 0;
        unset($result["ELAPSED_TIME"]);
        foreach ($result as $lookup => $summary) {
            if (!isset($summaries[$lookup])) {
                // first time we see this lookup: take the summary as is
                $summaries[$lookup] = $summary;
                continue;
            }
            if (isset($summary[self::DESCRIPTION])) {
                $description = trim($summary[self::DESCRIPTION]);
                if (!isset($summaries[$lookup][self::DESCRIPTION])) {
                    $summaries[$lookup][self::DESCRIPTION] = "";
                }
                if (!isset($description_hash[$description])) {
                    // NOTE(review): this assignment replaces the stored
                    // description instead of appending to it, although an
                    // ellipsis separator is maintained; behaviour is kept
                    // as found -- confirm whether ".=" was intended.
                    $summaries[$lookup][self::DESCRIPTION] =
                        $ellipsis . $description;
                    $ellipsis = " .. ";
                    $description_hash[$description] = true;
                }
            }
            // fill in any attribute the stored summary does not have yet
            foreach ($summary as $attr => $value) {
                if ($attr != self::DESCRIPTION &&
                    !isset($summaries[$lookup][$attr])) {
                    $summaries[$lookup][$attr] = $value;
                }
            }
        }
    }
    $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
    if ($summary_times_string) {
        $all_elapsed_times = unserialize($summary_times_string);
    } else {
        $all_elapsed_times = array();
    }
    $all_elapsed_times[] = $elapsed_times;
    AnalyticsManager::set("SUMMARY_TIMES", serialize($all_elapsed_times));
    return $summaries;
}
/**
 * Sends an email (much like PHP's mail command, but not requiring
 * a configured smtp server on the current machine)
 *
 * If USE_MAIL_PHP is set the message is handed to PHP's mail() and the
 * method returns immediately. Otherwise an SMTP session is opened with
 * startSession() and the MAIL FROM, RCPT TO, DATA and message commands are
 * issued in turn via smtpCommand(); the first unexpected response is
 * recorded in $this->messages and stops the exchange. When
 * QUERY_STATISTICS is set, the elapsed time and collected messages are
 * added to the "MAIL_MESSAGES" / "MAIL_TOTAL_TIME" AnalyticsManager entries.
 *
 * @param string $subject subject line of the email
 * @param string $from sender email address; if empty the object's
 *     sender_email is used instead
 * @param string $to recipient email address
 * @param string $message message body for the email
 */
function send($subject, $from, $to, $message)
{
    $start_time = microtime();
    if ($from == "") {
        $from = $this->sender_email;
    }
    $eol = self::EOL;
    if (USE_MAIL_PHP) {
        $header = "From: " . $from . $eol;
        mail($to, $subject, $message, $header);
        return;
    }
    $this->messages = "";
    /* SMTP transparency (RFC 5321, section 4.5.2): within DATA a line that
       starts with "." must have that dot doubled, otherwise the receiving
       server strips it, or -- for a line consisting of a lone "." -- treats
       it as the end of the message and reads the rest as commands.
     */
    $body = preg_replace('/^\./m', '..', (string) $message);
    if ($body === NULL) {
        // preg failure: fall back to the unmodified text
        $body = $message;
    }
    $mail = "Date: " . date(DATE_RFC822) . $eol;
    $mail .= "Subject: " . $subject . $eol;
    $mail .= "From: " . $from . $eol;
    $mail .= "To: " . $to . $eol;
    // blank line(s) end the headers; the final "." terminates DATA input
    $mail .= $eol . $eol . $body . $eol . ".";
    // each command mapped to the response that lets us carry on
    $commands = array(
        "MAIL FROM: <{$from}>" => self::OKAY,
        "RCPT TO: <{$to}>" => self::OKAY,
        "DATA" => self::START_INPUT,
        $mail => self::OKAY
    );
    if ($this->startSession()) {
        foreach ($commands as $command => $good_response) {
            $response = $this->smtpCommand($command);
            if ($response != $good_response) {
                $this->messages .=
                    "{$command} failed!! {$response} {$good_response}\n";
                break;
            }
        }
        $this->endSession();
    }
    if (QUERY_STATISTICS) {
        $current_messages = AnalyticsManager::get("MAIL_MESSAGES");
        if (!$current_messages) {
            $current_messages = array();
        }
        $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
        if (!$total_time) {
            $total_time = 0;
        }
        // presumably the time elapsed since $start_time -- the helper is
        // defined outside this block
        $elapsed_time = changeInMicrotime($start_time);
        $total_time += $elapsed_time;
        $current_messages[] = array(
            "QUERY" => "<p>Send Mail</p>" . "<pre>" .
                wordwrap($this->messages, 60, "\n", true) . "</pre>",
            "ELAPSED_TIME" => $elapsed_time
        );
        AnalyticsManager::set("MAIL_MESSAGES", $current_messages);
        AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time);
    }
}
/**
 * Hook function used by currentDocsWithWord to return the current block
 * of docs if it is not cached
 *
 * Builds one request url per queue server whose more_flags entry is still
 * set, downloads them with FetchUrl::getPages, and merges the PAGES of each
 * unserialized response into one array keyed by page key, tagging every
 * page with the index of the machine it came from. Along the way it
 * updates num_docs, more_flags, hard_query, next_results_per_block,
 * count_block and pages on this object as well as the "NET_TIMES",
 * "MACHINE_TIMES" and "MAX_MACHINE_TIMES" AnalyticsManager entries.
 *
 * @return mixed doc ids and score if there are docs left, -1 otherwise
 */
function findDocsWithWord()
{
    $query = $this->base_query .
        "&num={$this->results_per_block}&limit={$this->limit}";
    $sites = array();
    // maps position in $sites/$downloads back to queue server index
    $lookup = array();
    $i = 0;
    $j = 0;
    foreach ($this->queue_servers as $server) {
        if ($this->more_flags[$i]) {
            $sites[$j][CrawlConstants::URL] =
                $server . "?" . $query . "&machine={$i}";
            $lookup[$j] = $i;
            $j++;
        }
        $i++;
    }
    $net_times = AnalyticsManager::get("NET_TIMES");
    $net_times = $net_times ? $net_times : 0;
    $download_time = microtime();
    $downloads = array();
    if (count($sites) > 0) {
        $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true);
    }
    $net_times += changeInMicrotime($download_time);
    AnalyticsManager::set("NET_TIMES", $net_times);
    $results = array();
    $count = count($downloads);
    $this->num_docs = 0;
    $in4 = " ";
    $machine_times = AnalyticsManager::get("MACHINE_TIMES");
    $indent = $machine_times ? "<br />{$in4}" : $in4;
    $machine_times = $machine_times ? $machine_times : "";
    $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
    $max_machine_times = $max_machine_times ? $max_machine_times : 0;
    $max_time = 0;
    $num_with_results = $count;
    // holds the last decoded response; consulted for HARD_QUERY after the
    // loop
    $pre_result = NULL;
    for ($j = 0; $j < $count; $j++) {
        if (!isset($downloads[$j][self::PAGE])) {
            continue;
        }
        // NOTE(review): unserialize() is applied to data fetched over the
        // network; only safe if the queue servers are trusted.
        $pre_result = @unserialize($downloads[$j][self::PAGE]);
        $has_rows = isset($pre_result["TOTAL_ROWS"]);
        if (!$has_rows ||
            $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
            // machine returned less than a full block (or nothing usable):
            // do not query it again
            $this->more_flags[$lookup[$j]] = false;
            $num_with_results--;
        }
        if ($has_rows) {
            $this->num_docs += $pre_result["TOTAL_ROWS"];
        }
        if (isset($pre_result["PAGES"]) && is_array($pre_result["PAGES"])) {
            foreach ($pre_result["PAGES"] as $page_data) {
                if (isset($page_data[self::KEY])) {
                    $page_key = $page_data[self::KEY];
                    $results[$page_key] = $page_data;
                    $results[$page_key][self::MACHINE_ID] = $lookup[$j];
                }
            }
        }
        // a failed unserialize or a response without timing contributes
        // nothing to the maximum and prints as an empty time
        $elapsed_time = "";
        if (isset($pre_result['ELAPSED_TIME'])) {
            $elapsed_time = $pre_result['ELAPSED_TIME'];
            $max_time = max($max_time, $elapsed_time);
        }
        $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
        $machine_times .= $indent . $lookup_link . $elapsed_time . " ";
        $indent = "";
    }
    if (isset($pre_result["HARD_QUERY"])) {
        $this->hard_query = $pre_result["HARD_QUERY"];
    }
    if ($num_with_results > 0) {
        // spread the same total request size over the machines that may
        // still have results
        $this->next_results_per_block = ceil(
            floatval($count * $this->results_per_block) /
            floatval($num_with_results));
    }
    $max_machine_times += $max_time;
    AnalyticsManager::set("MACHINE_TIMES", $machine_times);
    AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
    if ($results == array()) {
        $results = -1;
    } else if ($this->filter != NULL) {
        foreach ($results as $keys => $data) {
            $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
            if (in_array($host_key, $this->filter)) {
                unset($results[$keys]);
            }
        }
    }
    // $results may be the -1 sentinel here; count() on a non-array is a
    // TypeError from PHP 8 on (and a warning yielding 1 before that), so
    // an exhausted block is recorded as holding 0 docs
    $this->count_block = is_array($results) ? count($results) : 0;
    $this->pages = $results;
    return $results;
}