/**
 * Using the supplied $word_structs, constructs an iterator for getting
 * results to a query
 *
 * @param array $word_structs an array of word_structs. Here a word_struct
 *      is an associative array with at least the following fields
 *      KEYS -- an array of word keys
 *      QUOTE_POSITIONS -- an array of positions of words that appeared in
 *          quotes (so need to be matched exactly)
 *      DISALLOW_KEYS -- an array of keys of words the document must not
 *          contain
 *      WEIGHT -- a weight to multiply scores returned from this iterator by
 *      INDEX_NAME -- an index timestamp to get results from
 * @param array& $filter an array of hashes of domains to filter from
 *      results and then potentially restored in cache
 * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
 *      no grouping done on data also no summaries returned (only lookup
 *      info), $raw > 1 return summaries but no grouping
 * @param int& $to_retrieve number of items to retrieve from location in
 *      in iterator. Passed by reference: this method sets it to -1 in the
 *      local, no-save-point case where an intersect iterator was built but
 *      no doc/word iterator set the min group override (see end of method)
 * @param array $queue_servers a list of urls of yioop machines which might
 *      be used during lookup
 * @param string $original_query if set, the original query that corresponds
 *      to $word_structs
 * @param string $save_timestamp_name if this timestamp is non empty, then
 *      when making iterator get sub-iterators to advance to gen doc_offset
 *      stored with respect to save_timestamp if exists.
 * @param bool $limit_news if true the number of media:news items to
 *      allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
 *
 * @return &object an iterator for iterating through results to the
 *      query, or NULL if no iterator could be constructed
 */
function getQueryIterator($word_structs, &$filter, $raw, &$to_retrieve,
    $queue_servers = array(), $original_query = "",
    $save_timestamp_name = "", $limit_news = true)
{
    $iterators = array();
    $total_iterators = 0;
    $network_flag = false;
    $min_group_flag = false;
    $min_group_override = false;
    /* Network case: when queue servers are given and they are not just a
       single localhost, the whole query is delegated to one
       NetworkIterator and no local word iterators are built.
     */
    if ($queue_servers != array() &&
        !$this->isSingleLocalhost($queue_servers)) {
        $network_flag = true;
        $total_iterators = 1;
        if (!in_array(NAME_SERVER, $queue_servers)) {
            $queue_servers[] = NAME_SERVER; //name server might still have news
        }
        $num_servers = count($queue_servers);
        // prefer this object's index_name; fall back to the first word_struct
        if ((!isset($this->index_name) || !$this->index_name) &&
            isset($word_structs[0]["INDEX_NAME"])) {
            $index_name = $word_structs[0]["INDEX_NAME"];
        } else {
            $index_name = $this->index_name;
        }
        $iterators[0] = new NetworkIterator($original_query,
            $queue_servers, $index_name, $filter, $save_timestamp_name,
            $limit_news);
    }
    /* Local case: build one base iterator per word_struct */
    if (!$network_flag) {
        /* 9 character hash prefixes of the site:any and site:doc terms;
           a key starting with one of these gets a DocIterator rather
           than word iterators
         */
        $doc_iterate_hashes = array(substr(crawlHashWord("site:any"), 0, 9),
            substr(crawlHash("site:any"), 0, 9),
            substr(crawlHashWord("site:doc"), 0, 9),
            substr(crawlHash("site:doc"), 0, 9));
        if ($save_timestamp_name != "") {
            // used for archive crawls of crawl mixes
            $save_file = CRAWL_DIR . '/schedules/' . self::save_point .
                $save_timestamp_name . ".txt";
            // $save_point stays undefined if the file is missing; later
            // uses are guarded with isset()
            if (file_exists($save_file)) {
                $save_point =
                    unserialize(file_get_contents($save_file));
            }
            $save_count = 0;
        }
        foreach ($word_structs as $word_struct) {
            if (!is_array($word_struct)) {
                continue;
            }
            $word_keys = $word_struct["KEYS"];
            /* Collapse repeated keys so only one iterator is made per
               distinct key. A key is either a string or an array; an array
               whose first entry is a string is deduplicated on that entry,
               any other array is always kept.
             */
            $distinct_word_keys = array();
            $seen_keys = array();
            foreach ($word_keys as $wkey) {
                if (is_string($wkey) || is_string($wkey[0])) {
                    $tmp_key = is_string($wkey) ? $wkey : $wkey[0];
                    if (!isset($seen_keys[$tmp_key])) {
                        $seen_keys[$tmp_key] = true;
                        $distinct_word_keys[] = $wkey;
                    }
                } else {
                    $distinct_word_keys[] = $wkey;
                }
            }
            $quote_positions = $word_struct["QUOTE_POSITIONS"];
            $disallow_keys = $word_struct["DISALLOW_KEYS"];
            $index_name = $word_struct["INDEX_NAME"];
            $weight = $word_struct["WEIGHT"];
            $num_word_keys = count($word_keys);
            $total_iterators = count($distinct_word_keys);
            $word_iterators = array();
            // maps a position in $word_keys to the index of the iterator
            // built for the matching distinct key
            $word_iterator_map = array();
            if ($num_word_keys < 1) {
                continue;
            }
            // running total of the counts returned by arrayColumnCount();
            // it is not reset between distinct keys of this word_struct
            $sum = 0;
            for ($i = 0; $i < $total_iterators; $i++) {
                // dig out the leading string of the (possibly nested) key
                $current_key = is_string($distinct_word_keys[$i]) ?
                    $distinct_word_keys[$i] :
                    (is_string($distinct_word_keys[$i][0]) ?
                    $distinct_word_keys[$i][0] :
                    $distinct_word_keys[$i][0][0]);
                if (!is_string($current_key)) {
                    $current_key = $current_key[0];
                }
                if (in_array(substr($current_key, 0, 9),
                    $doc_iterate_hashes)) {
                    $word_iterators[$i] = new DocIterator($index_name,
                        $filter, $to_retrieve);
                    $min_group_override = true;
                } else {
                    //can happen if exact phrase search suffix approach used
                    if (isset($distinct_word_keys[$i][0][0]) &&
                        is_array($distinct_word_keys[$i][0][0])) {
                        $distinct_keys = array(
                            $distinct_word_keys[$i][0][1]);
                    } else {
                        if (isset($distinct_word_keys[$i][0]) &&
                            is_array($distinct_word_keys[$i][0])) {
                            $distinct_keys = $distinct_word_keys[$i];
                        } else {
                            $distinct_keys = array($distinct_word_keys[$i]);
                        }
                    }
                    $out_keys = array();
                    $old_distinct_key_id = "";
                    foreach ($distinct_keys as $distinct_key) {
                        if (is_array($distinct_key)) {
                            /* NOTE(review): this appends to the array being
                               iterated by value; whether the appended entry
                               is ever visited depends on PHP's foreach
                               semantics -- confirm intent
                             */
                            if (!isset($distinct_key[2]) &&
                                isset($distinct_key[1])) {
                                $distinct_keys[] = $distinct_key[1];
                            }
                            $shift = isset($distinct_key[1]) ?
                                $distinct_key[1] : 0;
                            /* NOTE(review): the default mask literal reads
                               as two empty strings here -- confirm no
                               non-printing bytes were lost from it
                             */
                            $mask = isset($distinct_key[2]) ?
                                $distinct_key[2] : "" . "";
                            if (isset($distinct_key[3])) {
                                $old_distinct_key_id =
                                    unbase64Hash($distinct_key[3]);
                            }
                            $distinct_key_id = unbase64Hash(
                                $distinct_key[0]);
                        } else {
                            $shift = 0;
                            $mask = "" . "";
                            $distinct_key_id = unbase64Hash($distinct_key);
                        }
                        $lookup_cutoff = max(MIN_RESULTS_TO_GROUP,
                            $to_retrieve);
                        $info = IndexManager::getWordInfo($index_name,
                            $distinct_key_id, $shift, $mask);
                        // if an old-style key id was supplied, look it up
                        // too and combine whichever lookups succeeded
                        if ($old_distinct_key_id != "") {
                            $old_info = IndexManager::getWordInfo(
                                $index_name, $old_distinct_key_id, $shift,
                                $mask);
                            if ($info !== false && $old_info !== false) {
                                $info = array_merge($info, $old_info);
                            } else {
                                if ($old_info !== false) {
                                    $info = $old_info;
                                }
                            }
                        }
                        if ($info != array()) {
                            $tmp_keys = arrayColumnCount($info, 4, 3);
                            $sum += array_sum($tmp_keys);
                            $out_keys = array_merge($out_keys, $tmp_keys);
                        }
                        // stop looking up further keys once enough results
                        // have been counted
                        if ($sum > $lookup_cutoff) {
                            break;
                        }
                    }
                    // keep the (at most) 50 keys with the largest counts
                    arsort($out_keys);
                    $out_keys = array_keys(array_slice($out_keys, 0, 50));
                    $tmp_word_iterators = array();
                    $m = 0;
                    // only keep word iterators that have dictionary info
                    // or feed items; $m counts the ones kept
                    foreach ($out_keys as $distinct_key) {
                        $tmp_word_iterators[$m] =
                            new WordIterator($distinct_key, $index_name,
                            true, $filter, $to_retrieve, $limit_news);
                        if ($tmp_word_iterators[$m]->dictionary_info !=
                            array() ||
                            $tmp_word_iterators[$m]->feed_count > 0) {
                            $min_group_override = true;
                            $m++;
                        } else {
                            unset($tmp_word_iterators[$m]);
                        }
                    }
                    // a single surviving iterator is used directly; zero
                    // or several are wrapped in a DisjointIterator
                    if ($m == 1) {
                        $word_iterators[$i] = $tmp_word_iterators[0];
                    } else {
                        $word_iterators[$i] = new DisjointIterator(
                            $tmp_word_iterators);
                    }
                }
                // record which original key positions this iterator serves
                foreach ($word_keys as $index => $key) {
                    if (isset($distinct_word_keys[$i]) &&
                        $key == $distinct_word_keys[$i]) {
                        $word_iterator_map[$index] = $i;
                    }
                }
            }
            /* Disallowed words become NegationIterators stored after the
               word iterators, at indices starting from $num_word_keys
             */
            $num_disallow_keys = count($disallow_keys);
            if ($num_disallow_keys > 0) {
                for ($i = 0; $i < $num_disallow_keys; $i++) {
                    $disallow_iterator =
                        new WordIterator($disallow_keys[$i], $index_name,
                        false, $filter);
                    $word_iterators[$num_word_keys + $i] =
                        new NegationIterator($disallow_iterator);
                }
            }
            $num_word_keys += $num_disallow_keys;
            // a lone key with weight 1 needs no intersection wrapper
            if ($num_word_keys == 1 && $weight == 1) {
                $base_iterator = $word_iterators[0];
            } else {
                $base_iterator = new IntersectIterator(
                    $word_iterators, $word_iterator_map, $quote_positions,
                    $weight);
                $min_group_flag = true;
                // the sync timer is only on when not using a save point
                if ($save_timestamp_name == "") {
                    $base_iterator->sync_timer_on = true;
                } else {
                    $base_iterator->sync_timer_on = false;
                }
            }
            // advance to the saved position for this word_struct, if the
            // save point has one (-1 means nothing to advance to)
            if ($save_timestamp_name != "") {
                if (isset($save_point[$save_count]) &&
                    $save_point[$save_count] != -1) {
                    $base_iterator->advance($save_point[$save_count]);
                }
                $save_count++;
            }
            $iterators[] = $base_iterator;
        }
    }
    $num_iterators = count($iterators); //if network_flag should be 1
    if ($num_iterators < 1) {
        return NULL;
    } else {
        if ($num_iterators == 1) {
            $union_iterator = $iterators[0];
        } else {
            $union_iterator = new UnionIterator($iterators);
        }
    }
    // any positive $raw skips grouping and returns the union iterator
    $raw = intval($raw);
    if ($raw > 0) {
        $group_iterator = $union_iterator;
    } else {
        $group_iterator = new GroupIterator($union_iterator,
            $total_iterators, $this->current_machine, $network_flag);
    }
    if ($network_flag) {
        // scale the block size by SERVER_ALPHA over the number of servers
        $union_iterator->results_per_block =
            ceil(SERVER_ALPHA *
                $group_iterator->results_per_block / $num_servers);
    } else {
        if ($save_timestamp_name != "") {
            $group_iterator->save_iterators = $iterators;
        } else {
            /* an intersect iterator was built but no doc/word iterator
               set the override: use a small block size and signal the
               caller through the by-reference $to_retrieve
             */
            if ($min_group_flag && !$min_group_override) {
                $group_iterator->results_per_block =
                    max(MIN_RESULTS_TO_GROUP / 20, 1);
                $to_retrieve = -1;
            }
        }
    }
    return $group_iterator;
}
/**
 * Adds $item to FEED_ITEM table in db if it isn't already there
 *
 * An item is identified by the crawlHash of its "guid" field, or of its
 * "link" field when no non-empty guid is present. Items lacking a link,
 * title or description, items older than $age seconds, and items whose
 * hashed guid is already in FEED_ITEM are not added.
 *
 * @param array $item data from a single news feed item
 * @param string $source_name string name of the news feed $item was found
 *      on
 * @param string $lang locale-tag of the news feed (accepted for interface
 *      compatibility; this method does not currently read it)
 * @param int $age how many seconds old records should be ignored
 * @return bool whether an item was added
 */
function addFeedItemIfNew($item, $source_name, $lang, $age)
{
    if (!isset($item["link"]) || !isset($item["title"]) ||
        !isset($item["description"])) {
        return false;
    }
    // identify the item by its guid when it has one, otherwise by its link
    if (!isset($item["guid"]) || $item["guid"] == "") {
        $item["guid"] = crawlHash($item["link"]);
    } else {
        $item["guid"] = crawlHash($item["guid"]);
    }
    if (!isset($item["pubDate"]) || $item["pubDate"] == "") {
        $item["pubDate"] = time();
    } else {
        // strtotime() yields false for a date it cannot parse; in the
        // subtraction below that counts as 0, so such an item looks as old
        // as the current timestamp
        $item["pubDate"] = strtotime($item["pubDate"]);
    }
    if (time() - $item["pubDate"] > $age) {
        return false;
    }
    $db = $this->db;
    $sql = "SELECT COUNT(*) AS NUMBER FROM FEED_ITEM WHERE GUID = ?";
    $result = $db->execute($sql, array($item["guid"]));
    if (!$result) {
        // the existence check itself failed, so nothing was inserted;
        // report that no item was added rather than claiming success
        return false;
    }
    $row = $db->fetchArray($result);
    if ($row["NUMBER"] > 0) {
        return false;
    }
    $sql = "INSERT INTO FEED_ITEM VALUES (?, ?, ?, ?, ?, ?)";
    $result = $db->execute($sql, array($item['guid'],
        $item['title'], $item['link'], $item['description'],
        $item['pubDate'], $source_name));
    if (!$result) {
        return false;
    }
    return true;
}