/**
 * Collapses each group of pages into a single summarized document.
 *
 * The first page of a group serves as the base of the summarized document.
 * Its SUMMARY_OFFSET is replaced by a list of lookup entries gathered from
 * the members of the group, its GENERATION is removed, and its SCORE is
 * set to its HASH_SUM_SCORE.
 *
 * @param array& $pre_out_pages array, keyed by hash of url, of groups of
 *     pages for which out pages are to be generated.
 * @return array $out_pages array, using the same keys, of single
 *     summarized documents
 */
function computeOutPages(&$pre_out_pages)
{
    $out_pages = array();
    foreach ($pre_out_pages as $hash_url => $group_infos) {
        // Work on a local copy of the group's first page; it is stored
        // into the result once it has been fully assembled.
        $merged = $group_infos[0];
        $is_location = false;
        $needs_lookup = false;
        if ($this->network_flag) {
            $stored_hash = $merged[self::HASH];
            $is_location =
                (crawlHash($hash_url . "LOCATION", true) == $stored_hash);
            $needs_lookup = (!$merged[self::IS_DOC] || $is_location);
        }
        $merged[self::SUMMARY_OFFSET] = array();
        unset($merged[self::GENERATION]);
        $num_members = $merged[self::HASH_URL_COUNT];
        for ($index = 0; $index < $num_members; $index++) {
            $member = $group_infos[$index];
            if (!isset($member[self::GENERATION])) {
                continue;
            }
            if (is_int($member[self::SUMMARY_OFFSET])) {
                // a single offset: record a 5-tuple for this member,
                // using the current machine if the member names none
                if (isset($member[self::MACHINE_ID])) {
                    $machine_id = $member[self::MACHINE_ID];
                } else {
                    $machine_id = $this->current_machine;
                }
                $merged[self::SUMMARY_OFFSET][] = array(
                    $machine_id,
                    $member[self::KEY],
                    $member[self::CRAWL_TIME],
                    $member[self::GENERATION],
                    $member[self::SUMMARY_OFFSET]
                );
            } elseif (is_array($member[self::SUMMARY_OFFSET])) {
                // already a list of lookup entries: append all of them
                $merged[self::SUMMARY_OFFSET] = array_merge(
                    $merged[self::SUMMARY_OFFSET],
                    $member[self::SUMMARY_OFFSET]
                );
            }
        }
        $merged[self::SCORE] = $merged[self::HASH_SUM_SCORE];
        if ($needs_lookup) {
            // put a (word key, crawl time) pair ahead of the other entries
            $word_key = ($is_location ? "location:" : "info:") .
                base64Hash($hash_url);
            array_unshift(
                $merged[self::SUMMARY_OFFSET],
                array($word_key, $group_infos[0][self::CRAWL_TIME])
            );
        }
        $out_pages[$hash_url] = $merged;
    }
    return $out_pages;
}
/**
 * Gets summaries on a particular machine for a set of documents by
 * their url, or by group of 5-tuples of the form
 * (machine, key, index, generation, offset)
 * This may be used in either the single queue_server setting or
 * it may be called indirectly by a particular machine's
 * CrawlController as part of fulfilling a network-based getCrawlItems
 * request. $lookups contains items which are to be grouped (as came
 * from same url or site with the same cache). So this function aggregates
 * their descriptions.
 *
 * @param array $lookups things whose summaries we are trying to look up.
 *     Each entry is either a pair (url, index_name) whose first element
 *     starts with 'htt', 'gop' or 'rec', or a list of items each of which
 *     is a pair (word_key, index_name) or a 5-tuple as described above
 * @return mixed array of summary data for the matching documents, keyed
 *     by the same keys as $lookups; false if the summary offset and
 *     generation of a url-based lookup could not be found
 */
function nonNetworkGetCrawlItems($lookups)
{
    $summary_offset = NULL;
    $generation = NULL;
    $summaries = array();
    $db = $this->db;
    $sql = "SELECT * FROM FEED_ITEM WHERE GUID=?";
    foreach ($lookups as $lookup => $lookup_info) {
        $scheme = (isset($lookup_info[0]) && is_string($lookup_info[0])) ?
            substr($lookup_info[0], 0, 3) : "";
        if (count($lookup_info) == 2 && ($scheme == 'htt' ||
            $scheme == 'gop' || $scheme == 'rec')) {
            // lookup of a single document by its url
            list($url, $index_name) = $lookup_info;
            $index_archive = IndexManager::getIndex($index_name);
            $offset_gen_arr = $this->lookupSummaryOffsetGeneration($url,
                $index_name);
            if ($offset_gen_arr === false) {
                return false;
            }
            list($summary_offset, $generation) = $offset_gen_arr;
            $summary = $index_archive->getPage($summary_offset,
                $generation);
        } else {
            // lookup of a group of items whose pages are merged into one
            // summary
            $summary = array();
            $ellipsis = "";
            $description_hash = array();
            foreach ($lookup_info as $lookup_item) {
                // Reset for every item so that a 2-element item can never
                // pick up the document key of an earlier 5-tuple.
                $key = NULL;
                if (count($lookup_item) == 2) {
                    list($word_key, $index_name) = $lookup_item;
                    $offset_info = $this->lookupSummaryOffsetGeneration(
                        $word_key, $index_name, true);
                    if (!is_array($offset_info)) {
                        continue;
                    }
                    list($summary_offset, $generation) = $offset_info;
                } else {
                    // the leading machine component is not needed here
                    list(, $key, $index_name, $generation,
                        $summary_offset) = $lookup_item;
                }
                if (strcmp($index_name, "feed") != 0) {
                    $index = IndexManager::getIndex($index_name);
                    $index->setCurrentShard($generation, true);
                    if (is_integer($summary_offset) &&
                        is_integer($generation)) {
                        // errors from getPage are deliberately suppressed;
                        // an unusable page is skipped below
                        $page = @$index->getPage($summary_offset);
                    } else {
                        $page = NULL;
                    }
                } else {
                    $page = false;
                    // a feed item is found through the GUID derived from
                    // its key, so without a key there is nothing to fetch
                    if (is_string($key)) {
                        $guid = base64Hash(substr($key,
                            IndexShard::DOC_KEY_LEN,
                            IndexShard::DOC_KEY_LEN));
                        $result = $db->execute($sql, array($guid));
                        if ($result) {
                            $row = $db->fetchArray($result);
                            if ($row) {
                                // build a real array rather than assigning
                                // keys onto the boolean false
                                $page = array(
                                    self::TITLE => $row["TITLE"],
                                    self::DESCRIPTION => $row["DESCRIPTION"],
                                    self::URL => $row["LINK"],
                                    self::SOURCE_NAME => $row["SOURCE_NAME"]
                                );
                            }
                        }
                    }
                }
                if (!$page) {
                    continue;
                }
                $is_first_page = ($summary == array());
                if ($is_first_page) {
                    // the first usable page becomes the base summary
                    if (isset($page[self::DESCRIPTION])) {
                        $description = trim($page[self::DESCRIPTION]);
                        $page[self::DESCRIPTION] = $description;
                        $description_hash[$description] = true;
                    }
                    $ellipsis = " .. ";
                    $summary = $page;
                } elseif (isset($page[self::DESCRIPTION])) {
                    // later pages only contribute descriptions that have
                    // not been seen yet
                    $description = trim($page[self::DESCRIPTION]);
                    if (!isset($summary[self::DESCRIPTION])) {
                        $summary[self::DESCRIPTION] = "";
                    }
                    if (!isset($description_hash[$description])) {
                        $summary[self::DESCRIPTION] .= $ellipsis .
                            $description;
                        $ellipsis = " .. ";
                        $description_hash[$description] = true;
                    }
                }
                // the summary has no description at all when the first
                // page lacked one, so guard before measuring it
                $description_length = isset($summary[self::DESCRIPTION]) ?
                    strlen($summary[self::DESCRIPTION]) : 0;
                if ($description_length > self::MIN_DESCRIPTION_LENGTH) {
                    break;
                }
                if (!$is_first_page) {
                    // fill in attributes the summary does not have yet
                    foreach ($page as $attr => $value) {
                        if ($attr != self::DESCRIPTION &&
                            !isset($summary[$attr])) {
                            $summary[$attr] = $value;
                        }
                    }
                }
            }
        }
        if ($summary != array()) {
            $summaries[$lookup] = $summary;
        }
    }
    return $summaries;
}