示例#1
0
 /**
  * Using the supplied $word_structs, constructs an iterator for getting
  * results to a query
  *
  * If $queue_servers is non-empty and is not a single localhost (as decided
  * by isSingleLocalhost()), the query is delegated to one NetworkIterator
  * and $word_structs is only consulted for a fallback INDEX_NAME. Otherwise
  * one iterator is built per word_struct and, when there is more than one,
  * they are combined with a UnionIterator. Unless $raw > 0 the result is
  * then wrapped in a GroupIterator.
  *
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_KEYS -- an array of keys of words the document must not
  *         contain
  *     WEIGHT -- a weight to multiply scores returned from this iterator by
  *     INDEX_NAME -- an index timestamp to get results from
  *     Entries of $word_structs that are not arrays are skipped.
  * @param array& $filter an array of hashes of domains to filter from
  *     results
  *     and then potentially restored in cache
  * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
  *     no grouping done on data also no summaries returned (only lookup
  *     info), $raw > 1 return summaries but no grouping. Within this method
  *     the only distinction made is $raw > 0 (no GroupIterator) versus
  *     $raw <= 0 (results wrapped in a GroupIterator).
  * @param int& $to_retrieve number of items to retrieve from location in
  *     the iterator. Passed by reference: it is set to -1 when, in the
  *     non-network case without a save timestamp, an IntersectIterator was
  *     used and no DocIterator or non-empty WordIterator was created.
  * @param array $queue_servers a list of urls of yioop machines which might
  *     be used during lookup
  * @param string $original_query if set, the original query that corresponds
  *     to $word_structs
  * @param string $save_timestamp_name if this timestamp is non empty, then
  *     when making iterator get sub-iterators to advance to gen doc_offset
  *     stored with respect to save_timestamp if exists.
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return object an iterator for iterating through results to the
  * query, or NULL if no iterator could be constructed
  */
 function getQueryIterator($word_structs, &$filter, $raw, &$to_retrieve, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     $iterators = array();
     $total_iterators = 0;
     $network_flag = false;
     // set when an IntersectIterator is used for some word_struct
     $min_group_flag = false;
     // set when a DocIterator or a WordIterator with data was created
     $min_group_override = false;
     // network case: the whole query is handed to a single NetworkIterator
     if ($queue_servers != array() && !$this->isSingleLocalhost($queue_servers)) {
         $network_flag = true;
         $total_iterators = 1;
         if (!in_array(NAME_SERVER, $queue_servers)) {
             $queue_servers[] = NAME_SERVER;
             //name server might still have news
         }
         // only defined in the network case; used below to scale
         // results_per_block
         $num_servers = count($queue_servers);
         // prefer this object's index_name; fall back to the first
         // word_struct's INDEX_NAME when it is unset or empty
         if ((!isset($this->index_name) || !$this->index_name) && isset($word_structs[0]["INDEX_NAME"])) {
             $index_name = $word_structs[0]["INDEX_NAME"];
         } else {
             $index_name = $this->index_name;
         }
         $iterators[0] = new NetworkIterator($original_query, $queue_servers, $index_name, $filter, $save_timestamp_name, $limit_news);
     }
     // local case: build one iterator per word_struct
     if (!$network_flag) {
         // 9 character hash prefixes of the site:any and site:doc meta words;
         // a key starting with one of these gets a DocIterator below
         $doc_iterate_hashes = array(substr(crawlHashWord("site:any"), 0, 9), substr(crawlHash("site:any"), 0, 9), substr(crawlHashWord("site:doc"), 0, 9), substr(crawlHash("site:doc"), 0, 9));
         if ($save_timestamp_name != "") {
             // used for archive crawls of crawl mixes
             $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
             // $save_point stays undefined if the file is missing; the
             // isset() check further down copes with that
             if (file_exists($save_file)) {
                 $save_point = unserialize(file_get_contents($save_file));
             }
             // index into $save_point, advanced once per processed word_struct
             $save_count = 0;
         }
         foreach ($word_structs as $word_struct) {
             if (!is_array($word_struct)) {
                 continue;
             }
             $word_keys = $word_struct["KEYS"];
             // de-duplicate keys: a key that is a string, or an array whose
             // first element is a string, is kept only the first time that
             // string is seen; any other key is always kept
             $distinct_word_keys = array();
             $seen_keys = array();
             foreach ($word_keys as $wkey) {
                 if (is_string($wkey) || is_string($wkey[0])) {
                     $tmp_key = is_string($wkey) ? $wkey : $wkey[0];
                     if (!isset($seen_keys[$tmp_key])) {
                         $seen_keys[$tmp_key] = true;
                         $distinct_word_keys[] = $wkey;
                     }
                 } else {
                     $distinct_word_keys[] = $wkey;
                 }
             }
             $quote_positions = $word_struct["QUOTE_POSITIONS"];
             $disallow_keys = $word_struct["DISALLOW_KEYS"];
             $index_name = $word_struct["INDEX_NAME"];
             $weight = $word_struct["WEIGHT"];
             $num_word_keys = count($word_keys);
             // overwritten for every word_struct, so the value later passed
             // to GroupIterator comes from the last array word_struct seen
             $total_iterators = count($distinct_word_keys);
             $word_iterators = array();
             // maps a position in $word_keys to an index in $word_iterators
             $word_iterator_map = array();
             if ($num_word_keys < 1) {
                 continue;
             }
             // running total of the counts collected for this word_struct;
             // compared against $lookup_cutoff below
             $sum = 0;
             for ($i = 0; $i < $total_iterators; $i++) {
                 // dig the string key out of a possibly nested array key
                 $current_key = is_string($distinct_word_keys[$i]) ? $distinct_word_keys[$i] : (is_string($distinct_word_keys[$i][0]) ? $distinct_word_keys[$i][0] : $distinct_word_keys[$i][0][0]);
                 if (!is_string($current_key)) {
                     $current_key = $current_key[0];
                 }
                 if (in_array(substr($current_key, 0, 9), $doc_iterate_hashes)) {
                     $word_iterators[$i] = new DocIterator($index_name, $filter, $to_retrieve);
                     $min_group_override = true;
                 } else {
                     // normalize the key into a list $distinct_keys of
                     // candidate keys to look up
                     //can happen if exact phrase search suffix approach used
                     if (isset($distinct_word_keys[$i][0][0]) && is_array($distinct_word_keys[$i][0][0])) {
                         $distinct_keys = array($distinct_word_keys[$i][0][1]);
                     } else {
                         if (isset($distinct_word_keys[$i][0]) && is_array($distinct_word_keys[$i][0])) {
                             $distinct_keys = $distinct_word_keys[$i];
                         } else {
                             $distinct_keys = array($distinct_word_keys[$i]);
                         }
                     }
                     // key => count pairs gathered from the lookups below
                     $out_keys = array();
                     // not reset inside the loop, so once set it also
                     // applies to later candidate keys
                     $old_distinct_key_id = "";
                     foreach ($distinct_keys as $distinct_key) {
                         if (is_array($distinct_key)) {
                             // NOTE(review): this appends to the array being
                             // iterated by value; in PHP 7+ a by-value foreach
                             // works on a copy, so the appended element would
                             // not be visited by this loop -- confirm intent
                             if (!isset($distinct_key[2]) && isset($distinct_key[1])) {
                                 $distinct_keys[] = $distinct_key[1];
                             }
                             $shift = isset($distinct_key[1]) ? $distinct_key[1] : 0;
                             // NOTE(review): "" . "" is just the empty string;
                             // possibly a mangled literal -- confirm
                             $mask = isset($distinct_key[2]) ? $distinct_key[2] : "" . "";
                             if (isset($distinct_key[3])) {
                                 $old_distinct_key_id = unbase64Hash($distinct_key[3]);
                             }
                             $distinct_key_id = unbase64Hash($distinct_key[0]);
                         } else {
                             $shift = 0;
                             $mask = "" . "";
                             $distinct_key_id = unbase64Hash($distinct_key);
                         }
                         $lookup_cutoff = max(MIN_RESULTS_TO_GROUP, $to_retrieve);
                         $info = IndexManager::getWordInfo($index_name, $distinct_key_id, $shift, $mask);
                         // if an old-style key id is known, look it up too and
                         // combine the two results
                         if ($old_distinct_key_id != "") {
                             $old_info = IndexManager::getWordInfo($index_name, $old_distinct_key_id, $shift, $mask);
                             if ($info !== false && $old_info !== false) {
                                 $info = array_merge($info, $old_info);
                             } else {
                                 if ($old_info !== false) {
                                     $info = $old_info;
                                 }
                             }
                         }
                         if ($info != array()) {
                             // presumably maps word key => occurrence count
                             // (verify against arrayColumnCount); the values
                             // are summed and the keys are later used to
                             // construct WordIterators
                             $tmp_keys = arrayColumnCount($info, 4, 3);
                             $sum += array_sum($tmp_keys);
                             $out_keys = array_merge($out_keys, $tmp_keys);
                         }
                         // enough results found; skip remaining candidates
                         if ($sum > $lookup_cutoff) {
                             break;
                         }
                     }
                     // keep the keys of the 50 largest counts
                     arsort($out_keys);
                     $out_keys = array_keys(array_slice($out_keys, 0, 50));
                     $tmp_word_iterators = array();
                     // number of WordIterators kept so far
                     $m = 0;
                     foreach ($out_keys as $distinct_key) {
                         $tmp_word_iterators[$m] = new WordIterator($distinct_key, $index_name, true, $filter, $to_retrieve, $limit_news);
                         // keep the iterator only if it has dictionary info
                         // or feed items; otherwise drop it and reuse slot $m
                         if ($tmp_word_iterators[$m]->dictionary_info != array() || $tmp_word_iterators[$m]->feed_count > 0) {
                             $min_group_override = true;
                             $m++;
                         } else {
                             unset($tmp_word_iterators[$m]);
                         }
                     }
                     // a single iterator is used directly; any other count
                     // (including zero) is wrapped in a DisjointIterator
                     if ($m == 1) {
                         $word_iterators[$i] = $tmp_word_iterators[0];
                     } else {
                         $word_iterators[$i] = new DisjointIterator($tmp_word_iterators);
                     }
                 }
                 // record every position in the original key list that is
                 // served by iterator $i
                 foreach ($word_keys as $index => $key) {
                     if (isset($distinct_word_keys[$i]) && $key == $distinct_word_keys[$i]) {
                         $word_iterator_map[$index] = $i;
                     }
                 }
             }
             // disallowed keys become NegationIterators stored starting at
             // index $num_word_keys
             $num_disallow_keys = count($disallow_keys);
             if ($num_disallow_keys > 0) {
                 for ($i = 0; $i < $num_disallow_keys; $i++) {
                     $disallow_iterator = new WordIterator($disallow_keys[$i], $index_name, false, $filter);
                     $word_iterators[$num_word_keys + $i] = new NegationIterator($disallow_iterator);
                 }
             }
             $num_word_keys += $num_disallow_keys;
             // exactly one key, nothing disallowed and weight 1: no
             // intersection needed
             if ($num_word_keys == 1 && $weight == 1) {
                 $base_iterator = $word_iterators[0];
             } else {
                 $base_iterator = new IntersectIterator($word_iterators, $word_iterator_map, $quote_positions, $weight);
                 $min_group_flag = true;
                 // the sync timer is on only when no save timestamp is in use
                 if ($save_timestamp_name == "") {
                     $base_iterator->sync_timer_on = true;
                 } else {
                     $base_iterator->sync_timer_on = false;
                 }
             }
             // resume from the stored save point for this word_struct, if
             // one exists and is not -1
             if ($save_timestamp_name != "") {
                 if (isset($save_point[$save_count]) && $save_point[$save_count] != -1) {
                     $base_iterator->advance($save_point[$save_count]);
                 }
                 $save_count++;
             }
             $iterators[] = $base_iterator;
         }
     }
     $num_iterators = count($iterators);
     // in the network case $iterators holds exactly the one NetworkIterator
     if ($num_iterators < 1) {
         return NULL;
     } else {
         if ($num_iterators == 1) {
             $union_iterator = $iterators[0];
         } else {
             $union_iterator = new UnionIterator($iterators);
         }
     }
     $raw = intval($raw);
     // any positive $raw skips grouping
     if ($raw > 0) {
         $group_iterator = $union_iterator;
     } else {
         $group_iterator = new GroupIterator($union_iterator, $total_iterators, $this->current_machine, $network_flag);
     }
     if ($network_flag) {
         // scale the block size asked of the network iterator by SERVER_ALPHA
         // and the number of servers
         $union_iterator->results_per_block = ceil(SERVER_ALPHA * $group_iterator->results_per_block / $num_servers);
     } else {
         if ($save_timestamp_name != "") {
             // expose the per-word_struct iterators on the returned iterator
             $group_iterator->save_iterators = $iterators;
         } else {
             // an intersection was used and no DocIterator or non-empty
             // WordIterator was created: use a small block size and signal
             // the caller through the by-reference $to_retrieve
             if ($min_group_flag && !$min_group_override) {
                 $group_iterator->results_per_block = max(MIN_RESULTS_TO_GROUP / 20, 1);
                 $to_retrieve = -1;
             }
         }
     }
     return $group_iterator;
 }
示例#2
0
 /**
  * Adds $item to the FEED_ITEM table in the db if it isn't already there
  *
  * Nothing is added (and false is returned) when the item lacks a link,
  * title or description, when its publication date cannot be parsed or is
  * more than $age seconds in the past, when a row with the same hashed guid
  * already exists, or when either database statement fails.
  *
  * @param array $item data from a single news feed item. Must contain
  *     "link", "title" and "description"; "guid" and "pubDate" are optional
  *     (a hash of the link and the current time are used when missing/empty)
  * @param string $source_name string name of the news feed $item was found
  *     on
  * @param string $lang locale-tag of the news feed (currently not used by
  *     this method)
  * @param int $age how many seconds old records should be ignored
  * @return bool whether an item was added
  */
 function addFeedItemIfNew($item, $source_name, $lang, $age)
 {
     if (!isset($item["link"]) || !isset($item["title"]) || !isset($item["description"])) {
         return false;
     }
     // the stored GUID is always a hash: of the feed's own guid when it has
     // one, otherwise of the item's link
     if (!isset($item["guid"]) || $item["guid"] == "") {
         $item["guid"] = crawlHash($item["link"]);
     } else {
         $item["guid"] = crawlHash($item["guid"]);
     }
     if (!isset($item["pubDate"]) || $item["pubDate"] == "") {
         $item["pubDate"] = time();
     } else {
         $item["pubDate"] = strtotime($item["pubDate"]);
         if ($item["pubDate"] === false) {
             // strtotime() could not parse the date; reject the item rather
             // than doing arithmetic on, or storing, a boolean
             return false;
         }
     }
     if (time() - $item["pubDate"] > $age) {
         return false;
     }
     $sql = "SELECT COUNT(*) AS NUMBER FROM FEED_ITEM WHERE GUID = ?";
     $db = $this->db;
     $result = $db->execute($sql, array($item["guid"]));
     if (!$result) {
         // the duplicate check itself failed, so nothing was inserted
         return false;
     }
     $row = $db->fetchArray($result);
     // isset() guards against fetchArray() not returning a row; in that case
     // the insert is still attempted, as it was before
     if (isset($row["NUMBER"]) && $row["NUMBER"] > 0) {
         return false;
     }
     $sql = "INSERT INTO FEED_ITEM VALUES (?, ?, ?, ?, ?, ?)";
     $result = $db->execute($sql, array($item['guid'], $item['title'], $item['link'], $item['description'], $item['pubDate'], $source_name));
     if (!$result) {
         return false;
     }
     return true;
 }