Example #1
 /**
  * Gets a list of URLs from the current fetch batch provided by the queue
  * server, then downloads those pages. Finally, reschedules, if
  * possible, pages that were not successfully downloaded.
  *
  * @return array an associative array of web pages and meta data
  * fetched from the internet
  */
 function downloadPagesWebCrawl()
 {
     $start_time = microtime();
     $can_schedule_again = false;
     if (count($this->to_crawl) > 0) {
         $can_schedule_again = true;
     }
     $sites = $this->getFetchSites();
     crawlLog("Done getting list of " . count($sites) . " to download...");
     if (!$sites) {
         crawlLog("No seeds to fetch...");
         sleep(max(0, ceil(MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time))));
         return array();
     }
     $prefix = $this->fetcher_num . "-";
     $tmp_dir = CRAWL_DIR . "/{$prefix}temp";
     $filtered_sites = array();
     $site_pages = array();
     foreach ($sites as $site) {
         $hard_coded_parts = explode("###!", $site[self::URL]);
         if (count($hard_coded_parts) > 1) {
             if (!isset($hard_coded_parts[2])) {
                 $hard_coded_parts[2] = "";
             }
             $site[self::URL] = $hard_coded_parts[0];
             $title = urldecode($hard_coded_parts[1]);
             $description = urldecode($hard_coded_parts[2]);
              $site[self::PAGE] = "<html><head><title>{$title}" .
                  "</title></head><body><h1>{$title}</h1>" .
                  "<p>{$description}</p></body></html>";
             $site[self::HTTP_CODE] = 200;
             $site[self::TYPE] = "text/html";
             $site[self::ENCODING] = "UTF-8";
             $site[self::IP_ADDRESSES] = array("0.0.0.0");
             $site[self::TIMESTAMP] = time();
             $site_pages[] = $site;
         } else {
             $filtered_sites[] = $site;
         }
     }
     $site_pages = array_merge($site_pages,
         FetchUrl::getPages($filtered_sites, true, $this->page_range_request,
             $tmp_dir, self::URL, self::PAGE, false, NULL, false,
             $this->tor_proxy, $this->proxy_servers));
     crawlLog("..getPages call complete..");
     for ($j = 0; $j < count($site_pages); $j++) {
         if (isset($site_pages[$j][self::REPOSITORY_TYPE])) {
             $git_repository_url = $site_pages[$j][self::URL];
             $git_compressed_content = FetchGitRepositoryUrls::getGitdata($git_repository_url);
             $git_uncompressed_content = gzuncompress($git_compressed_content);
             $length = strlen($git_uncompressed_content);
             $git_hash_end = strpos($git_uncompressed_content, self::HEX_NULL_CHARACTER);
             $git_uncompressed_content = substr($git_uncompressed_content, $git_hash_end + 1, $length);
             $site_pages[$j][self::PAGE] = $git_uncompressed_content;
             $mime_type = UrlParser::guessMimeTypeFromFileName($site_pages[$j][self::FILE_NAME]);
             $site_pages[$j][self::TYPE] = $mime_type;
         }
     }
     list($downloaded_pages, $schedule_again_pages) = $this->reschedulePages($site_pages);
     if ($can_schedule_again == true) {
         //only schedule to crawl again on fail sites without crawl-delay
         crawlLog("  Scheduling again..");
         foreach ($schedule_again_pages as $schedule_again_page) {
             if (isset($schedule_again_page[self::CRAWL_DELAY]) &&
                 $schedule_again_page[self::CRAWL_DELAY] == 0) {
                 $this->to_crawl_again[] = array(
                     $schedule_again_page[self::URL],
                     $schedule_again_page[self::WEIGHT],
                     $schedule_again_page[self::CRAWL_DELAY]);
             }
             crawlLog("....reschedule count:" . count($this->to_crawl_again));
         }
         crawlLog("....done.");
     }
     crawlLog("Downloading complete");
     return $downloaded_pages;
 }
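The loop at the top of downloadPagesWebCrawl() treats any URL containing the ###! separator as a hard-coded page: the text before the first separator is the URL, and the next two url-encoded parts become the title and description of a synthetic HTML page, bypassing FetchUrl::getPages() entirely. Below is a minimal standalone sketch of just that decoding step; the sample values are made up and only the separator and field order are taken from the code above.

 <?php
 // Decode a hard-coded "url###!title###!description" entry into a synthetic
 // page, mirroring the first loop in downloadPagesWebCrawl() above.
 $raw = "http://example.com/###!" . urlencode("My Title") . "###!" .
     urlencode("A short description.");
 $parts = explode("###!", $raw);
 if (count($parts) > 1) {
     $url = $parts[0];
     $title = urldecode($parts[1]);
     $description = urldecode(isset($parts[2]) ? $parts[2] : "");
     $page = "<html><head><title>{$title}</title></head><body>" .
         "<h1>{$title}</h1><p>{$description}</p></body></html>";
     echo "Synthetic page for {$url}:\n{$page}\n";
 }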
Example #2
 /**
  * Returns the statuses of the fetchers and queue_server of each machine
  * in the machine table, as well as the name and URL of each machine
  *
  * @param array $machines an array of machines to check the status for
  * @return array  a list of machines, together with all their properties
  * and the statuses of their fetchers and queue_servers
  */
 function getMachineStatuses($machines = array())
 {
     $num_machines = count($machines);
     $time = time();
     $session = md5($time . AUTH_KEY);
     for ($i = 0; $i < $num_machines; $i++) {
         $hash_url = crawlHash($machines[$i]["URL"]);
         $machines[$i][CrawlConstants::URL] = $machines[$i]["URL"] .
             "?c=machine&a=statuses&time={$time}" .
             "&session={$session}&arg={$hash_url}";
     }
     $statuses = FetchUrl::getPages($machines);
     for ($i = 0; $i < $num_machines; $i++) {
         foreach ($statuses as $status) {
             if ($machines[$i][CrawlConstants::URL] == $status[CrawlConstants::URL]) {
                 $pre_status = json_decode($status[CrawlConstants::PAGE], true);
                 if (is_array($pre_status)) {
                     $machines[$i]["STATUSES"] = $pre_status;
                 } else {
                     $machines[$i]["STATUSES"] = "NOT_CONFIGURED_ERROR";
                 }
             }
         }
     }
     $sql = "SELECT * FROM ACTIVE_FETCHER";
     $result = $this->db->execute($sql);
     if (!$result) {
         return $machines;
     }
     $active_fetchers = array();
     while ($row = $this->db->fetchArray($result)) {
         for ($i = 0; $i < $num_machines; $i++) {
             if ($machines[$i]['NAME'] == $row['NAME']) {
                 if (!isset($machines[$i]["STATUSES"]["fetcher"][$row['FETCHER_ID']])) {
                     $machines[$i]["STATUSES"]["fetcher"][$row['FETCHER_ID']] = 0;
                 }
             }
         }
     }
     // prime the static sort field used by stringROrderCallback so the
     // usort below orders machines by NAME
     stringROrderCallback("", "", "NAME");
     if ($machines != array()) {
         usort($machines, "stringROrderCallback");
     }
     $name_server_statuses = CrawlDaemon::statuses();
     $machines['NAME_SERVER']['news_updater'] = 0;
     if (isset($name_server_statuses['news_updater'])) {
         $machines['NAME_SERVER']['news_updater'] = 1;
     }
     return $machines;
 }
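getMachineStatuses() authenticates each status request by appending a session token of md5(time . AUTH_KEY) plus a hash of the machine's URL before handing the list to FetchUrl::getPages(). A rough standalone sketch of how one such request URL is assembled; the AUTH_KEY value is a placeholder and md5 stands in for Yioop's crawlHash().

 <?php
 // Placeholder key; in Yioop AUTH_KEY comes from the installation's profile.
 define('AUTH_KEY', 'example-auth-key');

 $machine_url = "http://machine1.example.com/";
 $time = time();
 $session = md5($time . AUTH_KEY);   // same token scheme as the code above
 $hash_url = md5($machine_url);      // stand-in for crawlHash($machine_url)
 $status_url = $machine_url . "?c=machine&a=statuses&time={$time}" .
     "&session={$session}&arg={$hash_url}";
 echo $status_url . "\n";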
Example #3
 /**
  * Hook function used by currentDocsWithWord to return the current block
  * of docs if it is not cached
  *
  * @return mixed doc ids and score if there are docs left, -1 otherwise
  */
 function findDocsWithWord()
 {
     $query = $this->base_query . "&num={$this->results_per_block}&limit={$this->limit}";
     $sites = array();
     $lookup = array();
     $i = 0;
     $j = 0;
     foreach ($this->queue_servers as $server) {
         if ($this->more_flags[$i]) {
             $sites[$j][CrawlConstants::URL] = $server . "?" . $query . "&machine={$i}";
             $lookup[$j] = $i;
             $j++;
         }
         $i++;
     }
     $net_times = AnalyticsManager::get("NET_TIMES");
     $net_times = $net_times ? $net_times : 0;
     $download_time = microtime();
     $downloads = array();
     if (count($sites) > 0) {
         $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true);
     }
     $net_times += changeInMicrotime($download_time);
     AnalyticsManager::set("NET_TIMES", $net_times);
     $results = array();
     $count = count($downloads);
     $this->num_docs = 0;
     $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
     $machine_times = AnalyticsManager::get("MACHINE_TIMES");
     $indent = $machine_times ? "<br />{$in4}" : $in4;
     $machine_times = $machine_times ? $machine_times : "";
     $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
     $max_machine_times = $max_machine_times ? $max_machine_times : 0;
     $max_time = 0;
     $num_with_results = $count;
     for ($j = 0; $j < $count; $j++) {
         $download =& $downloads[$j];
         if (isset($download[self::PAGE])) {
             $pre_result = @unserialize($download[self::PAGE]);
             if (!isset($pre_result["TOTAL_ROWS"]) || $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                 $this->more_flags[$lookup[$j]] = false;
                 $num_with_results--;
             }
             if (isset($pre_result["TOTAL_ROWS"])) {
                 $this->num_docs += $pre_result["TOTAL_ROWS"];
             }
             if (isset($pre_result["PAGES"])) {
                 foreach ($pre_result["PAGES"] as $page_data) {
                     if (isset($page_data[self::KEY])) {
                         $results[$page_data[self::KEY]] = $page_data;
                         $results[$page_data[self::KEY]][self::MACHINE_ID] = $lookup[$j];
                     }
                 }
             }
             $max_time = max($max_time, $pre_result['ELAPSED_TIME']);
             $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
             $machine_times .= $indent . $lookup_link . $pre_result['ELAPSED_TIME'] . "&nbsp;&nbsp;";
             $indent = "";
         }
     }
     if (isset($pre_result["HARD_QUERY"])) {
         $this->hard_query = $pre_result["HARD_QUERY"];
     }
     if ($num_with_results > 0) {
         $this->next_results_per_block = ceil(
             floatval($count * $this->results_per_block) /
             floatval($num_with_results));
     }
     $max_machine_times += $max_time;
     AnalyticsManager::set("MACHINE_TIMES", $machine_times);
     AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
     if ($results == array()) {
         $results = -1;
     }
     if ($results != -1) {
         if ($this->filter != NULL) {
             foreach ($results as $keys => $data) {
                 $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                 if (in_array($host_key, $this->filter)) {
                     unset($results[$keys]);
                 }
             }
         }
     }
     $this->count_block = count($results);
     $this->pages = $results;
     return $results;
 }
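The assignment to $this->next_results_per_block near the end rescales the per-machine request size: when only some machines still have a full block of results, each remaining machine is asked for proportionally more so the combined block size stays roughly constant. A tiny standalone sketch of that arithmetic with made-up numbers:

 <?php
 // Made-up numbers: 4 machines were asked for 20 results each, but only 2
 // of them still had a full block left.
 $count = 4;                // machines queried this round
 $results_per_block = 20;   // results requested from each machine
 $num_with_results = 2;     // machines that can still return a full block

 $next_results_per_block = ceil(
     floatval($count * $results_per_block) / floatval($num_with_results));
 echo $next_results_per_block . "\n";   // 40: each remaining machine is asked
                                        // for twice as many results next time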
Example #4
 /**
  * This method is invoked by other ParallelModel methods (@see CrawlModel
  * for examples) when they want their method performed on an array of
  * other Yioop instances. The results returned can then be aggregated.
  * The invocation sequence is: crawlModelMethodA invokes execMachines
  * with a list of URLs of other Yioop instances; execMachines makes REST
  * requests of those instances with the given command and optional
  * arguments; each request is handled by a CrawlController, which in turn
  * calls crawlModelMethodA on that Yioop instance, serializes the result,
  * and returns it to execMachines and then to the originally calling
  * function.
  *
  * @param string $command the ParallelModel method to invoke on the remote
  *     Yioop instances
  * @param array $machine_urls machines to invoke this command on
  * @param string $arg additional arguments to be passed to the remote
  *      machine
  * @param int $num_machines the integer to be used in calculating partition
  * @return array a list of outputs from each machine that was called.
  */
 function execMachines($command, $machine_urls, $arg = NULL, $num_machines = 0)
 {
     if ($num_machines == 0) {
         $num_machines = count($machine_urls);
     }
     $time = time();
     $session = md5($time . AUTH_KEY);
     $query = "c=crawl&a={$command}&time={$time}&session={$session}" . "&num={$num_machines}";
     if ($arg != NULL) {
         $arg = webencode($arg);
         $query .= "&arg={$arg}";
     }
     $sites = array();
     $post_data = array();
     $i = 0;
     foreach ($machine_urls as $index => $machine_url) {
         $sites[$i][CrawlConstants::URL] = $machine_url;
         $post_data[$i] = $query . "&i={$index}";
         $i++;
     }
     $outputs = array();
     if (count($sites) > 0) {
         $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
             self::PAGE, true, $post_data);
     }
     return $outputs;
 }
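execMachines() fans the same signed query out to every machine, differing only in the &i= index appended to each machine's POST data, so FetchUrl::getPages() can issue all the REST requests in parallel. A minimal sketch of just that request-assembly step; the command name and AUTH_KEY are placeholders, and the plain 'URL' key stands in for CrawlConstants::URL.

 <?php
 define('AUTH_KEY', 'example-auth-key');    // placeholder key
 $command = "someParallelModelMethod";      // hypothetical command name
 $machine_urls = array(
     "http://yioop1.example.com/",
     "http://yioop2.example.com/",
 );
 $num_machines = count($machine_urls);
 $time = time();
 $session = md5($time . AUTH_KEY);
 $query = "c=crawl&a={$command}&time={$time}&session={$session}" .
     "&num={$num_machines}";
 $sites = array();
 $post_data = array();
 $i = 0;
 foreach ($machine_urls as $index => $machine_url) {
     $sites[$i]['URL'] = $machine_url;        // one request per machine...
     $post_data[$i] = $query . "&i={$index}"; // ...tagged with its index
     $i++;
 }
 print_r($post_data);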
Example #5
 /**
  * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems.
  * For each feed source, downloads the feed, checks which items are
  * not yet in the database, and adds them. This method does not update
  * the inverted index shard.
  *
  * @param array $feeds list of feeds to download
  * @param int $age how many seconds old records should be ignored
  */
 function updateFeedItemsOneGo($feeds, $age = ONE_WEEK)
 {
     $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL",
         CrawlConstants::PAGE, true, NULL, true);
     $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
     foreach ($feeds as $feed) {
         $is_html = $feed['TYPE'] == 'html' ? true : false;
         crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
         if (!$feed[CrawlConstants::PAGE]) {
             crawlLog("...No data in feed skipping.");
             continue;
         }
         $dom = new DOMDocument();
         if ($is_html) {
             @$dom->loadHTML($feed[CrawlConstants::PAGE]);
         } else {
             @$dom->loadXML($feed[CrawlConstants::PAGE]);
         }
         crawlLog("...done. Extracting info about whole feed.");
         $lang = "";
         if ($feed['TYPE'] != 'html' &&
             (!isset($feed["LANGUAGE"]) || $feed["LANGUAGE"] == "")) {
             $languages = $dom->getElementsByTagName('language');
             if ($languages && is_object($languages) && is_object($languages->item(0))) {
                 $lang = $languages->item(0)->textContent;
                 $this->db->execute($sql, array($lang, $feed['TIMESTAMP']));
             }
         } else {
             if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                 $lang = $feed["LANGUAGE"];
             } else {
                 $lang = DEFAULT_LOCALE;
             }
         }
         crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
         if ($is_html) {
             $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
             if (!$sub_dom) {
                 crawlLog("... Scraper couldn't parse channel" . " path so bailing on this feed.");
                 continue;
             } else {
                 crawlLog("...Channel scraped.");
             }
             $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
             $rss_elements = array("title" => $feed['TITLE_PATH'], "description" => $feed['DESCRIPTION_PATH'], "link" => $feed['LINK_PATH']);
         } else {
             $nodes = $dom->getElementsByTagName('item');
             $rss_elements = array("title" => "title", "description" => "description", "link" => "link", "guid" => "guid", "pubDate" => "pubDate");
             if ($nodes->length == 0) {
                 // maybe we're dealing with atom rather than rss
                 $nodes = $dom->getElementsByTagName('entry');
                 $rss_elements = array("title" => "title", "description" => "summary", "link" => "link", "guid" => "id", "pubDate" => "updated");
             }
         }
         crawlLog("...done extracting info. Check for new news " . "items in {$feed['NAME']}.");
         $num_added = 0;
         $num_seen = 0;
         foreach ($nodes as $node) {
             $item = array();
             foreach ($rss_elements as $db_element => $feed_element) {
                 crawlTimeoutLog("..still adding feed items to index.");
                 if ($is_html) {
                     $tag_nodes = $this->getTags($node, $feed_element);
                     if (!isset($tag_nodes[0])) {
                         $tag_node = NULL;
                     } else {
                         $tag_node = $tag_nodes[0];
                     }
                     $element_text = is_object($tag_node) ? $tag_node->textContent : "";
                 } else {
                     $tag_node = $node->getElementsByTagName($feed_element)->item(0);
                     $element_text = is_object($tag_node) ? $tag_node->nodeValue : "";
                 }
                 if ($db_element == "link" && $tag_node && ($element_text == "" || $is_html)) {
                     if ($is_html) {
                         $element_text = $tag_node->documentElement->getAttribute("href");
                     } else {
                         $element_text = $tag_node->getAttribute("href");
                     }
                     $element_text = UrlParser::canonicalLink($element_text, $feed["SOURCE_URL"]);
                 }
                 $item[$db_element] = strip_tags($element_text);
             }
             $did_add = $this->addFeedItemIfNew($item, $feed['NAME'], $lang, $age);
             if ($did_add) {
                 $num_added++;
             }
             $num_seen++;
         }
         crawlLog("...added {$num_added} news items of {$num_seen} " . "on rss page.\n Done Processing {$feed['NAME']}.");
     }
 }
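For non-HTML feeds, the item extraction above reduces to walking the <item> (or Atom <entry>) nodes with DOMDocument and reading one child tag per database field via the $rss_elements map. A compact standalone sketch of that RSS branch; the two-item feed below is made up.

 <?php
 // Parse a made-up RSS snippet with the same tag mapping the code above
 // applies to real feeds.
 $rss = "<rss><channel>" .
     "<item><title>First</title><link>http://example.com/a</link>" .
     "<description>Item A</description></item>" .
     "<item><title>Second</title><link>http://example.com/b</link>" .
     "<description>Item B</description></item>" .
     "</channel></rss>";
 $rss_elements = array("title" => "title", "description" => "description",
     "link" => "link");
 $dom = new DOMDocument();
 @$dom->loadXML($rss);
 foreach ($dom->getElementsByTagName('item') as $node) {
     $item = array();
     foreach ($rss_elements as $db_element => $feed_element) {
         $tag_node = $node->getElementsByTagName($feed_element)->item(0);
         $item[$db_element] = is_object($tag_node) ?
             strip_tags($tag_node->nodeValue) : "";
     }
     print_r($item);
 }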