Example #1
0
 /**
  * In a multiple queue server setting, gets summaries for a set of documents
  * by their url, or by group of 5-tuples of the form
  * (machine, key, index, generation, offset). This makes an execMachines
  * call to make a network request to the CrawlControllers on each machine,
  * which in turn call getCrawlItems (and thence nonNetworkGetCrawlItems)
  * on each machine. The results are then sent back to networkGetCrawlItems
  * and aggregated.
  *
  * Side effect: the list of per-machine elapsed times for this request is
  * appended to the serialized "SUMMARY_TIMES" value kept by AnalyticsManager.
  *
  * @param array $lookups things whose summaries we are trying to look up.
  *     Each value is either a two element lookup whose first entry starts
  *     with 'h', 'r' or 'g' (presumably a url -- confirm against callers),
  *     or a list of 5-tuples whose first component is an index into
  *     $machine_urls
  * @param array $machine_urls an array of urls of yioop queue servers
  * @return array of summary data for the matching documents, keyed by
  *     lookup; empty if the network request returned nothing usable
  */
 function networkGetCrawlItems($lookups, $machine_urls)
 {
     //Set-up network request
     $num_machines = count($machine_urls);
     $machines = array();
     $ask_all_machines = false;
     foreach ($lookups as $lookup_info) {
         $is_url_lookup = count($lookup_info) == 2 &&
             isset($lookup_info[0][0]) &&
             in_array($lookup_info[0][0], array('h', 'r', 'g'), true);
         if ($is_url_lookup) {
             $ask_all_machines = true;
             break;
         }
         foreach ($lookup_info as $lookup_item) {
             /* anything that is not a 5-tuple cannot be routed to a single
                machine, so every machine has to be asked. The is_array
                test keeps count() from being called on a string, which is
                a TypeError as of PHP 8.
              */
             if (!is_array($lookup_item) || count($lookup_item) != 5) {
                 $ask_all_machines = true;
                 break 2;
             }
             $index = $lookup_item[0];
             // skip machine indexes we have no url for
             if (isset($machine_urls[$index])) {
                 $machines[$index] = $machine_urls[$index];
             }
         }
     }
     if ($ask_all_machines) {
         $machines = $machine_urls;
     }
     //Make request
     $page_set = $this->execMachines("getCrawlItems", $machines, serialize($lookups), $num_machines);
     //Aggregate results
     $summaries = array();
     if (!is_array($page_set)) {
         return $summaries;
     }
     $elapsed_times = array();
     foreach ($page_set as $elt) {
         $description_hash = array();
         /* NOTE(review): unserialize() is applied to data received over the
            network; this is only safe if the queue servers are trusted.
          */
         $result = false;
         if (isset($elt[self::PAGE])) {
             $result = @unserialize(webdecode($elt[self::PAGE]));
         }
         if (!is_array($result)) {
             // machine gave no usable answer; still record a time slot
             $elapsed_times[] = 0;
             continue;
         }
         $elapsed_times[] = isset($result["ELAPSED_TIME"]) ?
             $result["ELAPSED_TIME"] : 0;
         unset($result["ELAPSED_TIME"]);
         $ellipsis = "";
         foreach ($result as $lookup => $summary) {
             if (!isset($summaries[$lookup])) {
                 //first machine to answer for this lookup
                 $summaries[$lookup] = $summary;
                 continue;
             }
             if (isset($summary[self::DESCRIPTION])) {
                 $description = trim($summary[self::DESCRIPTION]);
                 if (!isset($summaries[$lookup][self::DESCRIPTION])) {
                     $summaries[$lookup][self::DESCRIPTION] = "";
                 }
                 if (!isset($description_hash[$description])) {
                     /* NOTE(review): this assignment replaces the
                        description gathered so far rather than appending
                        to it, and $ellipsis/$description_hash are shared
                        by all lookups of one machine's response. Kept as
                        is to preserve behavior -- confirm whether ".="
                        with per-lookup state was intended.
                      */
                     $summaries[$lookup][self::DESCRIPTION] = $ellipsis . $description;
                     $ellipsis = " .. ";
                     $description_hash[$description] = true;
                 }
             }
             //fill in attributes earlier machines did not supply
             foreach ($summary as $attr => $value) {
                 if ($attr != self::DESCRIPTION && !isset($summaries[$lookup][$attr])) {
                     $summaries[$lookup][$attr] = $value;
                 }
             }
         }
     }
     $all_elapsed_times = array();
     $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
     if ($summary_times_string) {
         $all_elapsed_times = unserialize($summary_times_string);
         if (!is_array($all_elapsed_times)) {
             // stored value was corrupt; start the log over
             $all_elapsed_times = array();
         }
     }
     $all_elapsed_times[] = $elapsed_times;
     AnalyticsManager::set("SUMMARY_TIMES", serialize($all_elapsed_times));
     return $summaries;
 }
Example #2
0
 /**
  * Sends an email (much like PHP's mail command, but not requiring
  * a configured smtp server on the current machine)
  *
  * If USE_MAIL_PHP is set the message is handed to PHP's mail() and the
  * method returns immediately. Otherwise an SMTP session is opened with
  * startSession() and the MAIL FROM, RCPT TO, DATA and message commands
  * are issued in turn, stopping at the first unexpected response. The
  * first failure is recorded in $this->messages and, when QUERY_STATISTICS
  * is on, timing and messages are stored through AnalyticsManager.
  *
  * Carriage returns and line feeds are removed from $subject, $from and
  * $to so they cannot inject extra headers or SMTP commands, and the
  * message body sent over SMTP is dot-stuffed so that a line consisting of
  * a single "." cannot end the DATA section early.
  *
  * @param string $subject subject line of the email
  * @param string $from sender email address; if empty,
  *     $this->sender_email is used
  * @param string $to recipient email address
  * @param string $message message body for the email
  */
 function send($subject, $from, $to, $message)
 {
     $start_time = microtime();
     if ($from == "") {
         $from = $this->sender_email;
     }
     // header values must stay on one line, otherwise a caller controlled
     // value could add headers or (for SMTP) whole commands
     $line_breaks = array("\r", "\n");
     $subject = str_replace($line_breaks, " ", $subject);
     $from = str_replace($line_breaks, "", $from);
     $to = str_replace($line_breaks, "", $to);
     $eol = self::EOL;
     if (USE_MAIL_PHP) {
         $header = "From: " . $from . $eol;
         mail($to, $subject, $message, $header);
         return;
     }
     /* Prepare the body for the SMTP DATA section: use one kind of line
        ending throughout and double any dot that starts a line (RFC 5321
        section 4.5.2), since a lone "." line marks the end of the data.
      */
     $body = str_replace(array("\r\n", "\r"), "\n", $message);
     if (isset($body[0]) && $body[0] === ".") {
         $body = "." . $body;
     }
     $body = str_replace("\n.", "\n..", $body);
     $body = str_replace("\n", $eol, $body);
     $this->messages = "";
     $mail = "Date: " . date(DATE_RFC822) . $eol;
     $mail .= "Subject: " . $subject . $eol;
     $mail .= "From: " . $from . $eol;
     $mail .= "To: " . $to . $eol;
     // the final $eol . "." is the end of data marker
     $mail .= $eol . $eol . $body . $eol . ".";
     //each command is mapped to the response expected when it succeeds
     $commands = array("MAIL FROM: <{$from}>" => self::OKAY, "RCPT TO: <{$to}>" => self::OKAY, "DATA" => self::START_INPUT, $mail => self::OKAY);
     if ($this->startSession()) {
         foreach ($commands as $command => $good_response) {
             $response = $this->smtpCommand($command);
             if ($response != $good_response) {
                 $this->messages .= "{$command} failed!! {$response} {$good_response}\n";
                 break;
             }
         }
         $this->endSession();
     }
     if (QUERY_STATISTICS) {
         $current_messages = AnalyticsManager::get("MAIL_MESSAGES");
         if (!$current_messages) {
             $current_messages = array();
         }
         $total_time = AnalyticsManager::get("MAIL_TOTAL_TIME");
         if (!$total_time) {
             $total_time = 0;
         }
         $elapsed_time = changeInMicrotime($start_time);
         $total_time += $elapsed_time;
         $current_messages[] = array("QUERY" => "<p>Send Mail</p>" . "<pre>" . wordwrap($this->messages, 60, "\n", true) . "</pre>", "ELAPSED_TIME" => $elapsed_time);
         AnalyticsManager::set("MAIL_MESSAGES", $current_messages);
         AnalyticsManager::set("MAIL_TOTAL_TIME", $total_time);
     }
 }
Example #3
0
 /**
  * Hook function used by currentDocsWithWord to return the current block
  * of docs if it is not cached
  *
  * Queries every queue server whose more_flags entry is still set, merges
  * the pages they return (keyed by each page's KEY and tagged with the
  * MACHINE_ID of the server that supplied it), removes pages whose host
  * key appears in $this->filter, and updates num_docs, more_flags,
  * next_results_per_block, hard_query, count_block and pages on this
  * object. Network and per-machine timings are accumulated through
  * AnalyticsManager.
  *
  * @return mixed array of page data keyed by doc key if any server returned
  *     pages (possibly empty after filtering), -1 otherwise
  */
 function findDocsWithWord()
 {
     $query = $this->base_query . "&num={$this->results_per_block}&limit={$this->limit}";
     //only servers that might still have results are asked; $lookup maps
     //the position of a request in $sites back to the server's index
     $sites = array();
     $lookup = array();
     $i = 0;
     $j = 0;
     foreach ($this->queue_servers as $server) {
         if ($this->more_flags[$i]) {
             $sites[$j][CrawlConstants::URL] = $server . "?" . $query . "&machine={$i}";
             $lookup[$j] = $i;
             $j++;
         }
         $i++;
     }
     $net_times = AnalyticsManager::get("NET_TIMES");
     $net_times = $net_times ? $net_times : 0;
     $download_time = microtime();
     $downloads = array();
     if (count($sites) > 0) {
         $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true);
     }
     $net_times += changeInMicrotime($download_time);
     AnalyticsManager::set("NET_TIMES", $net_times);
     $results = array();
     $count = count($downloads);
     $this->num_docs = 0;
     $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
     $machine_times = AnalyticsManager::get("MACHINE_TIMES");
     $indent = $machine_times ? "<br />{$in4}" : $in4;
     $machine_times = $machine_times ? $machine_times : "";
     $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
     $max_machine_times = $max_machine_times ? $max_machine_times : 0;
     $max_time = 0;
     $num_with_results = $count;
     $pre_result = false;
     for ($j = 0; $j < $count; $j++) {
         if (!isset($downloads[$j][self::PAGE])) {
             continue;
         }
         /* NOTE(review): unserialize() is applied to data received over the
            network; this is only safe if the queue servers are trusted.
          */
         $pre_result = @unserialize($downloads[$j][self::PAGE]);
         // a server that returned less than a full block has nothing more
         if (!isset($pre_result["TOTAL_ROWS"]) || $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
             $this->more_flags[$lookup[$j]] = false;
             $num_with_results--;
         }
         if (isset($pre_result["TOTAL_ROWS"])) {
             $this->num_docs += $pre_result["TOTAL_ROWS"];
         }
         if (isset($pre_result["PAGES"]) && is_array($pre_result["PAGES"])) {
             foreach ($pre_result["PAGES"] as $page_data) {
                 if (isset($page_data[self::KEY])) {
                     $results[$page_data[self::KEY]] = $page_data;
                     $results[$page_data[self::KEY]][self::MACHINE_ID] = $lookup[$j];
                 }
             }
         }
         // response may have failed to decode or may lack a timing entry
         $elapsed_time = isset($pre_result['ELAPSED_TIME']) ?
             $pre_result['ELAPSED_TIME'] : 0;
         $max_time = max($max_time, $elapsed_time);
         /* NOTE(review): $sites is indexed by request position while
            $lookup[$j] is a server index -- confirm this is the index
            makeLookupLink expects.
          */
         $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
         $machine_times .= $indent . $lookup_link . $elapsed_time . "&nbsp;&nbsp;";
         $indent = "";
     }
     //only the last response that had a page is consulted here
     if (isset($pre_result["HARD_QUERY"])) {
         $this->hard_query = $pre_result["HARD_QUERY"];
     }
     if ($num_with_results > 0) {
         $this->next_results_per_block = ceil(floatval($count * $this->results_per_block) / floatval($num_with_results));
     }
     $max_machine_times += $max_time;
     AnalyticsManager::set("MACHINE_TIMES", $machine_times);
     AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
     if ($results == array()) {
         $results = -1;
     } else if ($this->filter != NULL) {
         foreach ($results as $keys => $data) {
             $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
             if (in_array($host_key, $this->filter)) {
                 unset($results[$keys]);
             }
         }
     }
     // count() must not be given the -1 sentinel: that yields 1 on old PHP
     // versions and throws a TypeError on PHP 8
     $this->count_block = is_array($results) ? count($results) : 0;
     $this->pages = $results;
     return $results;
 }