Example #1
0
 /**
  * For a collection of grouped pages generates a grouped summary for each
  * group and returns an array of out pages consisting
  * of single summarized documents for each group. These single summarized
  * documents have aggregated scores.
  *
  * @param array& $pre_out_pages array of groups of pages for which out pages
  *     are to be generated.
  * @return array $out_pages array of single summarized documents
  */
 function computeOutPages(&$pre_out_pages)
 {
     $out_pages = array();
     foreach ($pre_out_pages as $hash_url => $group_infos) {
         // the first page of a group seeds that group's summarized document
         $merged = $group_infos[0];
         $needs_lookup = false;
         if ($this->network_flag) {
             $page_hash = $merged[self::HASH];
             $is_location =
                 (crawlHash($hash_url . "LOCATION", true) == $page_hash);
             // an extra lookup entry is prepended below when the first page
             // is not flagged IS_DOC or its hash is the LOCATION hash
             $needs_lookup = (!$merged[self::IS_DOC] || $is_location);
         }
         unset($merged[self::GENERATION]);
         // collect the summary offsets of the group's pages locally and
         // store them on the merged document once the group is processed
         $offsets = array();
         $num_pages = $merged[self::HASH_URL_COUNT];
         for ($j = 0; $j < $num_pages; $j++) {
             $info = $group_infos[$j];
             if (!isset($info[self::GENERATION])) {
                 continue;
             }
             if (is_int($info[self::SUMMARY_OFFSET])) {
                 // a single offset becomes a 5-tuple
                 // (machine, key, crawl time, generation, offset)
                 if (isset($info[self::MACHINE_ID])) {
                     $machine_id = $info[self::MACHINE_ID];
                 } else {
                     $machine_id = $this->current_machine;
                 }
                 $offsets[] = array(
                     $machine_id,
                     $info[self::KEY],
                     $info[self::CRAWL_TIME],
                     $info[self::GENERATION],
                     $info[self::SUMMARY_OFFSET]
                 );
             } else if (is_array($info[self::SUMMARY_OFFSET])) {
                 // already a list of offset entries, so append them all
                 $offsets = array_merge($offsets,
                     $info[self::SUMMARY_OFFSET]);
             }
         }
         if ($needs_lookup) {
             $prefix = $is_location ? "location:" : "info:";
             $word_key = $prefix . base64Hash($hash_url);
             // the (word key, crawl time) lookup goes ahead of all offsets
             array_unshift($offsets,
                 array($word_key, $group_infos[0][self::CRAWL_TIME]));
         }
         $merged[self::SUMMARY_OFFSET] = $offsets;
         // the summarized document's score is the group's summed score
         $merged[self::SCORE] = $merged[self::HASH_SUM_SCORE];
         $out_pages[$hash_url] = $merged;
     }
     return $out_pages;
 }
Example #2
0
 /**
  * Gets summaries on a particular machine for a set of documents by
  * their url, or by group of 5-tuples of the form
  * (machine, key, index, generation, offset).
  * This may be used in either the single queue_server setting or
  * it may be called indirectly by a particular machine's
  * CrawlController as part of fulfilling a network-based getCrawlItems
  * request. $lookups contains items which are to be grouped (as came
  * from same url or site with the same cache). So this function aggregates
  * their descriptions.
  *
  * @param array $lookups things whose summaries we are trying to look up,
  *     keyed by lookup name
  * @return mixed array of summary data for the matching documents, or
  *     false if the summary offset of a url lookup could not be found
  */
 function nonNetworkGetCrawlItems($lookups)
 {
     $summary_offset = NULL;
     $generation = NULL;
     $summaries = array();
     $db = $this->db;
     // the feed item query is the same for every lookup, so build it once
     $sql = "SELECT * FROM FEED_ITEM WHERE GUID=?";
     foreach ($lookups as $lookup => $lookup_info) {
         $scheme = (isset($lookup_info[0]) && is_string($lookup_info[0])) ?
             substr($lookup_info[0], 0, 3) : "";
         if (count($lookup_info) == 2 && ($scheme == 'htt' ||
             $scheme == 'gop' || $scheme == 'rec')) {
             // lookup is a single (url, index name) pair
             list($url, $index_name) = $lookup_info;
             $index_archive = IndexManager::getIndex($index_name);
             $offset_gen_arr = $this->lookupSummaryOffsetGeneration($url,
                 $index_name);
             if ($offset_gen_arr === false) {
                 // kept from the original contract: one failed url lookup
                 // aborts the whole request with false
                 return false;
             }
             list($summary_offset, $generation) = $offset_gen_arr;
             $summary = $index_archive->getPage($summary_offset,
                 $generation);
         } else {
             // lookup is a group of items whose pages are merged into a
             // single summary with an aggregated description
             $summary = array();
             $ellipsis = "";
             $description_hash = array();
             foreach ($lookup_info as $lookup_item) {
                 // only 5-tuples carry a key; reset it so a 2-tuple can
                 // never reuse the key of an earlier item
                 $key = NULL;
                 if (count($lookup_item) == 2) {
                     list($word_key, $index_name) = $lookup_item;
                     $offset_info = $this->lookupSummaryOffsetGeneration(
                         $word_key, $index_name, true);
                     if (!is_array($offset_info)) {
                         continue;
                     }
                     list($summary_offset, $generation) = $offset_info;
                 } else {
                     list($machine, $key, $index_name, $generation,
                         $summary_offset) = $lookup_item;
                 }
                 if (strcmp($index_name, "feed") != 0) {
                     $index = IndexManager::getIndex($index_name);
                     $index->setCurrentShard($generation, true);
                     if (is_integer($summary_offset) &&
                         is_integer($generation)) {
                         $page = @$index->getPage($summary_offset);
                     } else {
                         $page = NULL;
                     }
                 } else {
                     // feed items are stored in the database rather than in
                     // an index archive
                     $page = false;
                     if (isset($key)) {
                         $guid = base64Hash(substr($key,
                             IndexShard::DOC_KEY_LEN,
                             IndexShard::DOC_KEY_LEN));
                         $result = $db->execute($sql, array($guid));
                         $row = $result ? $db->fetchArray($result) : false;
                         if ($row) {
                             // build the page as a real array; writing
                             // elements into false is deprecated in PHP 8.1
                             $page = array(
                                 self::TITLE => $row["TITLE"],
                                 self::DESCRIPTION => $row["DESCRIPTION"],
                                 self::URL => $row["LINK"],
                                 self::SOURCE_NAME => $row["SOURCE_NAME"]
                             );
                         }
                     }
                 }
                 if (!$page) {
                     // covers NULL, false and the empty array
                     continue;
                 }
                 $copy = false;
                 if ($summary == array()) {
                     // the first usable page seeds the summary
                     if (isset($page[self::DESCRIPTION])) {
                         $description = trim($page[self::DESCRIPTION]);
                         $page[self::DESCRIPTION] = $description;
                         $description_hash[$description] = true;
                     }
                     $ellipsis = " .. ";
                     $summary = $page;
                 } else {
                     if (isset($page[self::DESCRIPTION])) {
                         $description = trim($page[self::DESCRIPTION]);
                         if (!isset($summary[self::DESCRIPTION])) {
                             $summary[self::DESCRIPTION] = "";
                         }
                         // append each distinct description only once
                         if (!isset($description_hash[$description])) {
                             $summary[self::DESCRIPTION] .=
                                 $ellipsis . $description;
                             $ellipsis = " .. ";
                             $description_hash[$description] = true;
                         }
                     }
                     $copy = true;
                 }
                 // the seeding page may have had no description at all, so
                 // treat a missing description as length 0
                 $description_length = isset($summary[self::DESCRIPTION]) ?
                     strlen($summary[self::DESCRIPTION]) : 0;
                 if ($description_length > self::MIN_DESCRIPTION_LENGTH) {
                     break;
                 }
                 if ($copy) {
                     // fill in attributes the summary does not have yet
                     foreach ($page as $attr => $value) {
                         if ($attr != self::DESCRIPTION &&
                             !isset($summary[$attr])) {
                             $summary[$attr] = $value;
                         }
                     }
                 }
             }
         }
         if ($summary != array()) {
             $summaries[$lookup] = $summary;
         }
     }
     return $summaries;
 }