/**
 * Get a list of urls from the current fetch batch provided by the queue
 * server. Then downloads these pages. Finally, reschedules, if
 * possible, pages that did not successfully get downloaded.
 *
 * @return array an associative array of web pages and meta data
 *     fetched from the internet
 */
function downloadPagesWebCrawl()
{
    $start_time = microtime();
    $can_schedule_again = false;
    if (count($this->to_crawl) > 0) {
        $can_schedule_again = true;
    }
    $sites = $this->getFetchSites();
    crawlLog("Done getting list of " . count($sites) . " to download...");
    if (!$sites) {
        crawlLog("No seeds to fetch...");
        sleep(max(0, ceil(MINIMUM_FETCH_LOOP_TIME -
            changeInMicrotime($start_time))));
        return array();
    }
    $prefix = $this->fetcher_num . "-";
    $tmp_dir = CRAWL_DIR . "/{$prefix}temp";
    $filtered_sites = array();
    $site_pages = array();
    foreach ($sites as $site) {
        $hard_coded_parts = explode("###!", $site[self::URL]);
        if (count($hard_coded_parts) > 1) {
            if (!isset($hard_coded_parts[2])) {
                $hard_coded_parts[2] = "";
            }
            $site[self::URL] = $hard_coded_parts[0];
            $title = urldecode($hard_coded_parts[1]);
            $description = urldecode($hard_coded_parts[2]);
            $site[self::PAGE] = "<html><head><title>{$title}" .
                "</title></head><body><h1>{$title}</h1>" .
                "<p>{$description}</p></body></html>";
            $site[self::HTTP_CODE] = 200;
            $site[self::TYPE] = "text/html";
            $site[self::ENCODING] = "UTF-8";
            $site[self::IP_ADDRESSES] = array("0.0.0.0");
            $site[self::TIMESTAMP] = time();
            $site_pages[] = $site;
        } else {
            $filtered_sites[] = $site;
        }
    }
    $site_pages = array_merge($site_pages,
        FetchUrl::getPages($filtered_sites, true, $this->page_range_request,
            $tmp_dir, self::URL, self::PAGE, false, NULL, false,
            $this->tor_proxy, $this->proxy_servers));
    crawlLog("..getPages call complete..");
    for ($j = 0; $j < count($site_pages); $j++) {
        if (isset($site_pages[$j][self::REPOSITORY_TYPE])) {
            $git_repository_url = $site_pages[$j][self::URL];
            $git_compressed_content =
                FetchGitRepositoryUrls::getGitdata($git_repository_url);
            $git_uncompressed_content =
                gzuncompress($git_compressed_content);
            $length = strlen($git_uncompressed_content);
            $git_hash_end = strpos($git_uncompressed_content,
                self::HEX_NULL_CHARACTER);
            $git_uncompressed_content = substr($git_uncompressed_content,
                $git_hash_end + 1, $length);
            $site_pages[$j][self::PAGE] = $git_uncompressed_content;
            $mime_type = UrlParser::guessMimeTypeFromFileName(
                $site_pages[$j][self::FILE_NAME]);
            $site_pages[$j][self::TYPE] = $mime_type;
        }
    }
    list($downloaded_pages, $schedule_again_pages) =
        $this->reschedulePages($site_pages);
    if ($can_schedule_again == true) {
        //only schedule to crawl again on fail sites without crawl-delay
        crawlLog(" Scheduling again..");
        foreach ($schedule_again_pages as $schedule_again_page) {
            if (isset($schedule_again_page[self::CRAWL_DELAY]) &&
                $schedule_again_page[self::CRAWL_DELAY] == 0) {
                $this->to_crawl_again[] = array(
                    $schedule_again_page[self::URL],
                    $schedule_again_page[self::WEIGHT],
                    $schedule_again_page[self::CRAWL_DELAY]);
            }
            crawlLog("....reschedule count:" .
                count($this->to_crawl_again));
        }
        crawlLog("....done.");
    }
    crawlLog("Downloading complete");
    return $downloaded_pages;
}
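/*
 * Illustrative usage sketch (not from the original source): inside the
 * fetcher's main work loop, the downloaded batch would typically be handed
 * to the page processors. $fetcher is an assumed Fetcher instance and the
 * processFetchPages() call is shown only as an assumption about the
 * surrounding loop.
 */
$downloaded_pages = $fetcher->downloadPagesWebCrawl();
if ($downloaded_pages != array()) {
    // entries are associative arrays keyed by CrawlConstants fields
    // such as URL, PAGE, HTTP_CODE, and TYPE
    $summarized_pages = $fetcher->processFetchPages($downloaded_pages);
}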
/**
 * Returns the statuses of the fetchers and queue_server associated with
 * each machine in the machine table, as well as the name and URL of each
 * machine.
 *
 * @param array $machines an array of machines to check the status for
 * @return array a list of machines, together with all their properties
 *     and the statuses of their fetchers and queue_servers
 */
function getMachineStatuses($machines = array())
{
    $num_machines = count($machines);
    $time = time();
    $session = md5($time . AUTH_KEY);
    for ($i = 0; $i < $num_machines; $i++) {
        $hash_url = crawlHash($machines[$i]["URL"]);
        $machines[$i][CrawlConstants::URL] = $machines[$i]["URL"] .
            "?c=machine&a=statuses&time={$time}" .
            "&session={$session}&arg={$hash_url}";
    }
    $statuses = FetchUrl::getPages($machines);
    for ($i = 0; $i < $num_machines; $i++) {
        foreach ($statuses as $status) {
            if ($machines[$i][CrawlConstants::URL] ==
                $status[CrawlConstants::URL]) {
                $pre_status =
                    json_decode($status[CrawlConstants::PAGE], true);
                if (is_array($pre_status)) {
                    $machines[$i]["STATUSES"] = $pre_status;
                } else {
                    $machines[$i]["STATUSES"] = "NOT_CONFIGURED_ERROR";
                }
            }
        }
    }
    $sql = "SELECT * FROM ACTIVE_FETCHER";
    $result = $this->db->execute($sql);
    if (!$result) {
        return $machines;
    }
    $active_fetchers = array();
    while ($row = $this->db->fetchArray($result)) {
        for ($i = 0; $i < $num_machines; $i++) {
            if ($machines[$i]['NAME'] == $row['NAME']) {
                if (!isset($machines[$i]["STATUSES"]["fetcher"][
                    $row['FETCHER_ID']])) {
                    $machines[$i]["STATUSES"]["fetcher"][
                        $row['FETCHER_ID']] = 0;
                }
            }
        }
    }
    stringROrderCallback("", "", "NAME");
    if ($machines != array()) {
        usort($machines, "stringROrderCallback");
    }
    $name_server_statuses = CrawlDaemon::statuses();
    $machines['NAME_SERVER']['news_updater'] = 0;
    if (isset($name_server_statuses['news_updater'])) {
        $machines['NAME_SERVER']['news_updater'] = 1;
    }
    return $machines;
}
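/*
 * Illustrative usage sketch (assumed calling code, not from the original
 * source): a controller could build the machine rows itself, ask the model
 * for their statuses, and flag machines whose status page did not decode.
 * $machine_model and the example NAME/URL values are assumptions.
 */
$machines = array(
    array("NAME" => "machine1", "URL" => "http://www.example.com/yioop/"),
);
$machines = $machine_model->getMachineStatuses($machines);
foreach ($machines as $machine) {
    if (isset($machine["STATUSES"]) &&
        $machine["STATUSES"] == "NOT_CONFIGURED_ERROR") {
        crawlLog("Machine " . $machine["NAME"] . " is not configured.");
    }
}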
/**
 * Hook function used by currentDocsWithWord to return the current block
 * of docs if it is not cached
 *
 * @return mixed doc ids and score if there are docs left, -1 otherwise
 */
function findDocsWithWord()
{
    $query = $this->base_query .
        "&num={$this->results_per_block}&limit={$this->limit}";
    $sites = array();
    $lookup = array();
    $i = 0;
    $j = 0;
    foreach ($this->queue_servers as $server) {
        if ($this->more_flags[$i]) {
            $sites[$j][CrawlConstants::URL] = $server . "?" . $query .
                "&machine={$i}";
            $lookup[$j] = $i;
            $j++;
        }
        $i++;
    }
    $net_times = AnalyticsManager::get("NET_TIMES");
    $net_times = $net_times ? $net_times : 0;
    $download_time = microtime();
    $downloads = array();
    if (count($sites) > 0) {
        $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true);
    }
    $net_times += changeInMicrotime($download_time);
    AnalyticsManager::set("NET_TIMES", $net_times);
    $results = array();
    $count = count($downloads);
    $this->num_docs = 0;
    $in4 = " ";
    $machine_times = AnalyticsManager::get("MACHINE_TIMES");
    $indent = $machine_times ? "<br />{$in4}" : $in4;
    $machine_times = $machine_times ? $machine_times : "";
    $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
    $max_machine_times = $max_machine_times ? $max_machine_times : 0;
    $max_time = 0;
    $num_with_results = $count;
    for ($j = 0; $j < $count; $j++) {
        $download =& $downloads[$j];
        if (isset($download[self::PAGE])) {
            $pre_result = @unserialize($download[self::PAGE]);
            if (!isset($pre_result["TOTAL_ROWS"]) ||
                $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                $this->more_flags[$lookup[$j]] = false;
                $num_with_results--;
            }
            if (isset($pre_result["TOTAL_ROWS"])) {
                $this->num_docs += $pre_result["TOTAL_ROWS"];
            }
            if (isset($pre_result["PAGES"])) {
                foreach ($pre_result["PAGES"] as $page_data) {
                    if (isset($page_data[self::KEY])) {
                        $results[$page_data[self::KEY]] = $page_data;
                        $results[$page_data[self::KEY]][self::MACHINE_ID] =
                            $lookup[$j];
                    }
                }
            }
            $max_time = max($max_time, $pre_result['ELAPSED_TIME']);
            $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
            $machine_times .= $indent . $lookup_link .
                $pre_result['ELAPSED_TIME'] . " ";
            $indent = "";
        }
    }
    if (isset($pre_result["HARD_QUERY"])) {
        $this->hard_query = $pre_result["HARD_QUERY"];
    }
    if ($num_with_results > 0) {
        $this->next_results_per_block = ceil(
            floatval($count * $this->results_per_block) /
            floatval($num_with_results));
    }
    $max_machine_times += $max_time;
    AnalyticsManager::set("MACHINE_TIMES", $machine_times);
    AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
    if ($results == array()) {
        $results = -1;
    }
    if ($results != -1) {
        if ($this->filter != NULL) {
            foreach ($results as $keys => $data) {
                $host_key =
                    substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                if (in_array($host_key, $this->filter)) {
                    unset($results[$keys]);
                }
            }
        }
    }
    $this->count_block = count($results);
    $this->pages = $results;
    return $results;
}
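/*
 * Illustrative sketch (not from the original source): findDocsWithWord()
 * rebalances the block size when some queue servers run out of results.
 * For example, with 8 servers queried, a results_per_block of 20, and only
 * 4 servers still returning rows, the next round asks each remaining server
 * for ceil((8 * 20) / 4) = 40 rows so a full block can still be assembled.
 * The standalone variables below mirror the method's fields for
 * illustration only.
 */
$count = 8;                // servers queried this round
$results_per_block = 20;   // rows requested per server
$num_with_results = 4;     // servers that still had rows
$next_results_per_block = ceil(
    floatval($count * $results_per_block) / floatval($num_with_results));
// $next_results_per_block is now 40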
/**
 * This method is invoked by other ParallelModel (@see CrawlModel
 * for examples) methods when they want to have their method performed
 * on an array of other Yioop instances. The results returned can then
 * be aggregated. The invocation sequence is:
 * crawlModelMethodA invokes execMachines with a list of
 * urls of other Yioop instances. execMachines makes REST requests to
 * those instances with the given command and optional arguments.
 * Each request is handled by a CrawlController, which in turn
 * calls crawlModelMethodA on the given Yioop instance, serializes the
 * result, and gives it back to execMachines and then back to the
 * originally calling function.
 *
 * @param string $command the ParallelModel method to invoke on the remote
 *     Yioop instances
 * @param array $machine_urls machines to invoke this command on
 * @param string $arg additional arguments to be passed to the remote
 *     machine
 * @param int $num_machines the integer to be used in calculating the
 *     partition
 * @return array a list of outputs from each machine that was called.
 */
function execMachines($command, $machine_urls, $arg = NULL,
    $num_machines = 0)
{
    if ($num_machines == 0) {
        $num_machines = count($machine_urls);
    }
    $time = time();
    $session = md5($time . AUTH_KEY);
    $query = "c=crawl&a={$command}&time={$time}&session={$session}" .
        "&num={$num_machines}";
    if ($arg != NULL) {
        $arg = webencode($arg);
        $query .= "&arg={$arg}";
    }
    $sites = array();
    $post_data = array();
    $i = 0;
    foreach ($machine_urls as $index => $machine_url) {
        $sites[$i][CrawlConstants::URL] = $machine_url;
        $post_data[$i] = $query . "&i={$index}";
        $i++;
    }
    $outputs = array();
    if (count($sites) > 0) {
        $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true, $post_data);
    }
    return $outputs;
}
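/*
 * Illustrative sketch (assumed calling code, not from the original source):
 * a ParallelModel method can fan a request out to several Yioop instances
 * with execMachines() and then unpack the serialized responses.
 * $crawl_model, the method name "someCrawlInfoMethod", and the exact
 * decoding steps are assumptions here.
 */
$outputs = $crawl_model->execMachines("someCrawlInfoMethod", $machine_urls,
    serialize($crawl_timestamp));
$aggregate = array();
foreach ($outputs as $output) {
    if (isset($output[CrawlConstants::PAGE])) {
        $result = @unserialize(webdecode($output[CrawlConstants::PAGE]));
        if (is_array($result)) {
            $aggregate = array_merge($aggregate, $result);
        }
    }
}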
/**
 * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems
 * For each feed source, downloads the feed, checks which items are
 * not in the database, and adds them. This method does not update
 * the inverted index shard.
 *
 * @param array $feeds list of feeds to download
 * @param int $age how many seconds old records should be ignored
 */
function updateFeedItemsOneGo($feeds, $age = ONE_WEEK)
{
    $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL",
        CrawlConstants::PAGE, true, NULL, true);
    $db = $this->db;
    $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
    foreach ($feeds as $feed) {
        $is_html = $feed['TYPE'] == 'html' ? true : false;
        crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
        if (!$feed[CrawlConstants::PAGE]) {
            crawlLog("...No data in feed skipping.");
            continue;
        }
        $dom = new DOMDocument();
        if ($is_html) {
            @$dom->loadHTML($feed[CrawlConstants::PAGE]);
        } else {
            @$dom->loadXML($feed[CrawlConstants::PAGE]);
        }
        crawlLog("...done. Extracting info about whole feed.");
        $lang = "";
        if ($feed['TYPE'] != 'html' &&
            (!isset($feed["LANGUAGE"]) || $feed["LANGUAGE"] == "")) {
            $languages = $dom->getElementsByTagName('language');
            if ($languages && is_object($languages) &&
                is_object($languages->item(0))) {
                $lang = $languages->item(0)->textContent;
                $db->execute($sql, array($lang, $feed['TIMESTAMP']));
            }
        } else {
            if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                $lang = $feed["LANGUAGE"];
            } else {
                $lang = DEFAULT_LOCALE;
            }
        }
        crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
        if ($is_html) {
            $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
            if (!$sub_dom) {
                crawlLog("... Scraper couldn't parse channel" .
                    " path so bailing on this feed.");
                continue;
            } else {
                crawlLog("...Channel scraped.");
            }
            $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
            $rss_elements = array("title" => $feed['TITLE_PATH'],
                "description" => $feed['DESCRIPTION_PATH'],
                "link" => $feed['LINK_PATH']);
        } else {
            $nodes = $dom->getElementsByTagName('item');
            $rss_elements = array("title" => "title",
                "description" => "description", "link" => "link",
                "guid" => "guid", "pubDate" => "pubDate");
            if ($nodes->length == 0) {
                // maybe we're dealing with atom rather than rss
                $nodes = $dom->getElementsByTagName('entry');
                $rss_elements = array("title" => "title",
                    "description" => "summary", "link" => "link",
                    "guid" => "id", "pubDate" => "updated");
            }
        }
        crawlLog("...done extracting info. Check for new news " .
            "items in {$feed['NAME']}.");
        $num_added = 0;
        $num_seen = 0;
        foreach ($nodes as $node) {
            $item = array();
            foreach ($rss_elements as $db_element => $feed_element) {
                crawlTimeoutLog("..still adding feed items to index.");
                if ($is_html) {
                    $tag_nodes = $this->getTags($node, $feed_element);
                    if (!isset($tag_nodes[0])) {
                        $tag_node = NULL;
                    } else {
                        $tag_node = $tag_nodes[0];
                    }
                    $element_text = is_object($tag_node) ?
                        $tag_node->textContent : "";
                } else {
                    $tag_node = $node->getElementsByTagName(
                        $feed_element)->item(0);
                    $element_text = is_object($tag_node) ?
                        $tag_node->nodeValue : "";
                }
                if ($db_element == "link" && $tag_node &&
                    ($element_text == "" || $is_html)) {
                    if ($is_html) {
                        $element_text = $tag_node->documentElement->
                            getAttribute("href");
                    } else {
                        $element_text = $tag_node->getAttribute("href");
                    }
                    $element_text = UrlParser::canonicalLink(
                        $element_text, $feed["SOURCE_URL"]);
                }
                $item[$db_element] = strip_tags($element_text);
            }
            $did_add = $this->addFeedItemIfNew($item, $feed['NAME'],
                $lang, $age);
            if ($did_add) {
                $num_added++;
            }
            $num_seen++;
        }
        crawlLog("...added {$num_added} news items of {$num_seen} " .
            "on rss page.\n Done Processing {$feed['NAME']}.");
    }
}
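/*
 * Illustrative sketch (assumed data, not from the original source): the
 * $feeds rows come from the MEDIA_SOURCE table; the fields shown below are
 * among those updateFeedItemsOneGo() reads for a non-HTML feed. The values
 * and the $source_model instance are made up for the example; HTML-scraped
 * sources would also need CHANNEL_PATH, ITEM_PATH, TITLE_PATH,
 * DESCRIPTION_PATH, and LINK_PATH fields.
 */
$feeds = array(
    array("NAME" => "Example RSS Feed", "TYPE" => "rss",
        "SOURCE_URL" => "http://www.example.com/rss",
        "LANGUAGE" => "en-US", "TIMESTAMP" => 1234567890),
);
$source_model->updateFeedItemsOneGo($feeds, ONE_WEEK);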