/**
 * Gets the next doc from the iterator
 *
 * @param bool $no_process if true then just return the page string found,
 *     not any additional meta data
 * @return mixed associative array for doc or just string of doc
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $matches = array();
    while (preg_match($this->delimiter, $this->buffer, $matches,
        PREG_OFFSET_CAPTURE) != 1) {
        crawlTimeoutLog("..still looking for a page in local buffer");
        $block = $this->getFileBlock();
        if (!$block || !$this->checkFileHandle() || $this->checkEof()) {
            return NULL;
        }
        $this->buffer .= $block;
    }
    $delim_len = strlen($matches[0][0]);
    $pos = $matches[0][1] + $delim_len;
    $page_pos = $this->start_delimiter == "" ? $pos : $pos - $delim_len;
    $page = substr($this->buffer, 0, $page_pos);
    if ($this->end_delimiter == "") {
        $page = $this->remainder . $page;
        $this->remainder = $matches[0][0];
    }
    $this->buffer = substr($this->buffer, $pos + $delim_len);
    if ($this->start_delimiter != "") {
        $matches = array();
        if (preg_match($this->start_delimiter, $this->buffer, $matches,
            PREG_OFFSET_CAPTURE) != 1) {
            if (isset($matches[0][1])) {
                $page = substr($page, $matches[0][1]);
            }
        }
    }
    if ($no_process == true) {
        return $page;
    }
    $site = array();
    $site[self::HEADER] = "text_archive_bundle_iterator extractor";
    $site[self::IP_ADDRESSES] = array("0.0.0.0");
    $site[self::TIMESTAMP] = date("U", time());
    $site[self::TYPE] = "text/plain";
    $site[self::PAGE] = $page;
    $site[self::HASH] = FetchUrl::computePageHash($page);
    $site[self::URL] = "record:" . webencode($site[self::HASH]);
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = $this->encoding;
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $site[self::WEIGHT] = 1;
    return $site;
}
/**
 * Gets the next doc from the iterator
 *
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $tag_data = $this->getNextTagsData(array("Topic", "ExternalPage"));
    if (!$tag_data) {
        return false;
    }
    list($page_info, $tag) = $tag_data;
    if ($no_process) {
        return $page_info;
    }
    // strip the RDF namespace prefixes so the snippet parses as plain XML
    $page_info = str_replace("r:id", "id", $page_info);
    $page_info = str_replace("r:resource", "resource", $page_info);
    $page_info = str_replace("d:Title", "Title", $page_info);
    $page_info = str_replace("d:Description", "Description", $page_info);
    $dom = new DOMDocument();
    $dom->loadXML($page_info);
    $processMethod = "process" . $tag;
    $site = array();
    $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
    $site[self::MODIFIED] = time();
    $site[self::TIMESTAMP] = time();
    $site[self::TYPE] = "text/html";
    $site[self::HEADER] = "odp_rdf_bundle_iterator extractor";
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = "UTF-8";
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $this->{$processMethod}($dom, $site);
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    return $site;
}
/**
 * Gets the next doc from the iterator
 *
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    static $minimal_regexes = false;
    static $first_call = true;
    if ($first_call) {
        $this->initializeSubstitutions($this->header['base_address']);
    }
    $page_info = $this->getNextTagData("page");
    if ($no_process) {
        return $page_info;
    }
    $dom = new DOMDocument();
    @$dom->loadXML($page_info);
    $site = array();
    $pre_url = $this->getTextContent($dom, "/page/title");
    $pre_url = str_replace(" ", "_", $pre_url);
    $site[self::URL] = $this->header['base_address'] . $pre_url;
    $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
    $pre_timestamp = $this->getTextContent($dom,
        "/page/revision/timestamp");
    $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
    $site[self::TIMESTAMP] = time();
    $site[self::TYPE] = "text/html";
    $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = "UTF-8";
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $site[self::PAGE] = "<html lang='" . $this->header['lang'] . "' >\n" .
        "<head><title>{$pre_url}</title>\n" .
        WIKI_PAGE_STYLES . "\n</head>\n" .
        "<body><h1>{$pre_url}</h1>\n";
    $pre_page = $this->getTextContent($dom, "/page/revision/text");
    $current_hash = crawlHash($pre_page);
    if ($first_call) {
        $this->saveCheckPoint(); //ensure we remember to advance one on fail
        $first_call = false;
    }
    $pre_page = $this->parser->parse($pre_page, false, true);
    $pre_page = preg_replace("/{{Other uses}}/i",
        "<div class='indent'>\"\$1\". (<a href='" . $site[self::URL] .
        "_(disambiguation)'>{$pre_url}</a>)</div>", $pre_page);
    $site[self::PAGE] .= $pre_page;
    $site[self::PAGE] .= "\n</body>\n</html>";
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = ceil(max(
        log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
    return $site;
}
/**
 * Gets the next at most $num many docs from the iterator. It might return
 * less than $num many documents if the partition changes or the end of the
 * bundle is reached.
 *
 * @param int $num number of docs to get
 * @param bool $no_process do not do any processing on page data
 * @return array associative arrays for $num pages
 */
function nextPages($num, $no_process = false) {
    $pages = array();
    $page_count = 0;
    $db = $this->db;
    $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
    $result = $db->execute($query);
    $i = 0;
    while ($row = $db->fetchArray($result)) {
        crawlTimeoutLog("..Still getting pages from archive iterator. At %s"
            . " of %s", $i, $num);
        $page = "";
        foreach ($row as $key => $value) {
            $page .= "{$key}{$this->field_value_separator}" .
                "{$value}{$this->column_separator}";
        }
        if ($no_process) {
            $pages[] = $page;
        } else {
            $site = array();
            $site[self::HEADER] = "database_bundle_iterator extractor";
            $site[self::IP_ADDRESSES] = array("0.0.0.0");
            $site[self::TIMESTAMP] = date("U", time());
            $site[self::TYPE] = "text/plain";
            $site[self::PAGE] = $page;
            $site[self::HASH] = FetchUrl::computePageHash($page);
            $site[self::URL] = "record:" . webencode($site[self::HASH]);
            $site[self::HTTP_CODE] = 200;
            $site[self::ENCODING] = $this->encoding;
            $site[self::SERVER] = "unknown";
            $site[self::SERVER_VERSION] = "unknown";
            $site[self::OPERATING_SYSTEM] = "unknown";
            $site[self::WEIGHT] = 1;
            $pages[] = $site;
        }
        $page_count++;
    }
    $this->limit += $page_count;
    if ($page_count < $num) {
        $this->end_of_iterator = true;
    }
    $this->saveCheckpoint();
    return $pages;
}
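/*
 * Illustrative sketch only, not part of the original iterator: a consumer
 * that wants the original columns back out of a record produced by
 * nextPages() above could split on the two separators. The default
 * separator values below are placeholders; the real ones come from the
 * iterator's configuration ($this->field_value_separator and
 * $this->column_separator).
 */
function decodeDatabaseRecord($page, $field_value_separator = "\x1E",
    $column_separator = "\x1F") {
    $row = array();
    foreach (explode($column_separator, $page) as $column) {
        if ($column === "") {
            continue; // the trailing separator leaves an empty final chunk
        }
        $parts = explode($field_value_separator, $column, 2);
        if (count($parts) == 2) {
            $row[$parts[0]] = $parts[1];
        }
    }
    return $row;
}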
/**
 * Sends the crawl, robot, and index data to the current queue server.
 * If this data is more than post_max_size, it splits it into chunks
 * which are then reassembled by the queue server web app before being
 * put into the appropriate schedule sub-directory.
 *
 * @param string $queue_server url of the current queue server
 * @param array $byte_counts has four fields: TOTAL, ROBOT, SCHEDULE,
 *     INDEX. These give the number of bytes overall for the
 *     'data' field of $post_data and for each of these components.
 * @param array $post_data data to be uploaded to the queue server web app
 */
function uploadCrawlData($queue_server, $byte_counts, &$post_data) {
    $post_data['fetcher_peak_memory'] = memory_get_peak_usage();
    $post_data['byte_counts'] = webencode(serialize($byte_counts));
    $len = strlen($post_data['data']);
    $max_len = $this->post_max_size - 10 * 1024; // non-data post vars < 10K
    $post_data['num_parts'] = ceil($len / $max_len);
    $num_parts = $post_data['num_parts'];
    $data =& $post_data['data'];
    unset($post_data['data']);
    $post_data['hash_data'] = crawlHash($data);
    $offset = 0;
    for ($i = 1; $i <= $num_parts; $i++) {
        $time = time();
        $session = md5($time . AUTH_KEY);
        $post_data['time'] = $time;
        $post_data['session'] = $session;
        $post_data['part'] = substr($data, $offset, $max_len);
        $post_data['hash_part'] = crawlHash($post_data['part']);
        $post_data['current_part'] = $i;
        $offset += $max_len;
        $part_len = strlen($post_data['part']);
        crawlLog("Sending Queue Server Part {$i} of {$num_parts}...");
        crawlLog("...sending about {$part_len} bytes.");
        $sleep = false;
        do {
            if ($sleep == true) {
                crawlLog("Trouble sending to the scheduler at url:");
                crawlLog($queue_server);
                crawlLog("Response was:");
                crawlLog("{$info_string}");
                $info = @unserialize($info_string);
                $time = time();
                $session = md5($time . AUTH_KEY);
                $post_data['time'] = $time;
                $post_data['session'] = $session;
                if (isset($info[self::STATUS]) &&
                    $info[self::STATUS] == self::REDO_STATE) {
                    crawlLog("Server requested last item to be re-sent...");
                    if (isset($info[self::SUMMARY])) {
                        crawlLog($info[self::SUMMARY]);
                    }
                    crawlLog("Trying again in 5 seconds...");
                } else {
                    crawlLog("Trying again in 5 seconds. You might want");
                    crawlLog("to check the queue server url and server");
                    crawlLog("key. Queue Server post_max_size is:" .
                        $this->post_max_size);
                }
                if ($i == 1 && !defined('FORCE_SMALL') &&
                    $this->post_max_size > 1000000) {
                    /* maybe the server has limited memory and too high a
                       post_max_size */
                    crawlLog("Using smaller post size to see if helps");
                    define('FORCE_SMALL', true);
                    $this->post_max_size = 1000000;
                    $info[self::POST_MAX_SIZE] = 1000001;
                    /* set to small value before try again. */
                }
                sleep(5);
            }
            $sleep = true;
            $info_string = FetchUrl::getPage($queue_server, $post_data,
                true);
            $info = unserialize(trim($info_string));
            if (isset($info[self::LOGGING])) {
                crawlLog("Messages from Fetch Controller:");
                crawlLog($info[self::LOGGING]);
            }
            if (isset($info[self::POST_MAX_SIZE]) &&
                $this->post_max_size > $info[self::POST_MAX_SIZE]) {
                if (!defined('FORCE_SMALL')) {
                    crawlLog("post_max_size has changed was " .
                        "{$this->post_max_size}. Now is " .
                        $info[self::POST_MAX_SIZE] . ".");
                    $this->post_max_size = $info[self::POST_MAX_SIZE];
                } else {
                    crawlLog("...Using Force Small Rule on Server Posting");
                }
                if ($max_len > $this->post_max_size) {
                    crawlLog("Restarting upload...");
                    if (isset($post_data["resized_once"])) {
                        crawlLog("Restart failed");
                        return;
                    }
                    $post_data['data'] = $data;
                    $post_data["resized_once"] = true;
                    return $this->uploadCrawlData($queue_server,
                        $byte_counts, $post_data);
                }
            }
        } while (!isset($info[self::STATUS]) ||
            $info[self::STATUS] != self::CONTINUE_STATE);
        crawlLog("Queue Server info response code: " . $info[self::STATUS]);
        crawlLog("Queue Server's crawl time is: " .
            $info[self::CRAWL_TIME]);
        crawlLog("Web Server peak memory usage: " .
            $info[self::MEMORY_USAGE]);
        crawlLog("This fetcher peak memory usage: " .
            memory_get_peak_usage());
    }
    crawlLog("Updated Queue Server, sent approximately" .
        " {$byte_counts['TOTAL']} bytes:");
}
/**
 * Downloads the next file from the schedule of files to download received
 * from the web app.
 */
function copyNextSyncFile() {
    $dir = $this->sync_dir;
    $name_server = $this->name_server;
    $time = time();
    $session = md5($time . AUTH_KEY);
    if (count($this->sync_schedule) <= 0) {
        return;
    }
    $file = array_pop($this->sync_schedule);
    crawlLog("Start syncing {$file['name']}..");
    if ($file['is_dir']) {
        if (!file_exists("{$dir}/{$file['name']}")) {
            mkdir("{$dir}/{$file['name']}");
            crawlLog(".. {$file['name']} directory created.");
        } else {
            crawlLog(".. {$file['name']} directory exists.");
        }
    } else {
        $request =
            "{$name_server}?c=resource&a=get&time={$time}&session={$session}" .
            "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI .
            "&last_sync=" . $this->last_sync . "&f=cache&n=" .
            urlencode($file["name"]);
        if ($file["size"] < self::DOWNLOAD_RANGE) {
            $data = FetchUrl::getPage($request, NULL, true);
            if ($file["size"] != strlen($data)) {
                array_push($this->sync_schedule, $file);
                crawlLog(".. {$file['name']} error downloading, retrying.");
                return;
            }
            file_put_contents("{$dir}/{$file['name']}", $data);
            crawlLog(".. {$file['name']} file copied.");
        } else {
            $offset = 0;
            $fh = fopen("{$dir}/{$file['name']}", "wb");
            $request .= "&l=" . self::DOWNLOAD_RANGE;
            while ($offset < $file['size']) {
                $data = FetchUrl::getPage($request . "&o={$offset}", NULL,
                    true);
                $old_offset = $offset;
                $offset += self::DOWNLOAD_RANGE;
                $end_point = min($offset, $file["size"]);
                //crude check if we need to redownload segment
                if (strlen($data) != $end_point - $old_offset) {
                    $offset = $old_offset;
                    crawlLog(".. Download error re-requesting segment");
                    continue;
                }
                fwrite($fh, $data);
                crawlLog(".. {$file['name']} downloaded bytes {$old_offset} " .
                    "to {$end_point}..");
            }
            crawlLog(".. {$file['name']} file copied.");
            fclose($fh);
        }
    }
}
/**
 * Gets the next doc from the iterator
 *
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $indexable_records = array('response', 'resource');
    do {
        $this->getRecordStart();
        $page_info = $this->getWarcHeaders();
        if ($page_info == NULL || !isset($page_info[self::SIZE])) {
            return NULL;
        }
        $length = intval($page_info[self::SIZE]);
        $page_info[self::SIZE] = $length;
        $header_and_page = ltrim($this->fileRead($length + 2));
        $this->fileGets();
        $this->fileGets();
        if (!$header_and_page) {
            return NULL;
        }
    } while (!in_array($page_info['warc-type'], $indexable_records) ||
        substr($page_info[self::URL], 0, 4) == 'dns:');
        //ignore warcinfo, request, metadata, revisit, etc. records
    if ($no_process) {
        return $header_and_page;
    }
    unset($page_info['line']);
    unset($page_info['warc-type']);
    $site = $page_info;
    $site_contents = FetchUrl::parseHeaderPage($header_and_page);
    $site = array_merge($site, $site_contents);
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = 1;
    if (!isset($site[self::TYPE])) {
        $site[self::TYPE] = "text/plain";
    }
    return $site;
}
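/*
 * Illustrative sketch only (assumption, not original code): draining an
 * archive bundle iterator such as the one above. nextPage() returns NULL
 * when no further records can be read, so a simple loop suffices. The
 * $iterator variable and the indexSite() callback are placeholders.
 */
while (($site = $iterator->nextPage()) !== NULL) {
    // $site now holds self::URL, self::PAGE, self::HASH, etc. for one record
    indexSite($site);
}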
/**
 * This method is invoked by other ParallelModel (@see CrawlModel
 * for examples) methods when they want to have their method performed
 * on an array of other Yioop instances. The results returned can then
 * be aggregated. The invocation sequence is:
 * crawlModelMethodA invokes execMachines with a list of urls of other
 * Yioop instances. execMachines makes REST requests of those instances
 * with the given command and optional arguments. Each request is handled
 * by a CrawlController, which in turn calls crawlModelMethodA on that
 * Yioop instance, serializes the result, and gives it back to
 * execMachines and thus back to the originally calling function.
 *
 * @param string $command the ParallelModel method to invoke on the remote
 *     Yioop instances
 * @param array $machine_urls machines to invoke this command on
 * @param string $arg additional arguments to be passed to the remote
 *     machine
 * @param int $num_machines the integer to be used in calculating partition
 * @return array a list of outputs from each machine that was called.
 */
function execMachines($command, $machine_urls, $arg = NULL,
    $num_machines = 0) {
    if ($num_machines == 0) {
        $num_machines = count($machine_urls);
    }
    $time = time();
    $session = md5($time . AUTH_KEY);
    $query = "c=crawl&a={$command}&time={$time}&session={$session}" .
        "&num={$num_machines}";
    if ($arg != NULL) {
        $arg = webencode($arg);
        $query .= "&arg={$arg}";
    }
    $sites = array();
    $post_data = array();
    $i = 0;
    foreach ($machine_urls as $index => $machine_url) {
        $sites[$i][CrawlConstants::URL] = $machine_url;
        $post_data[$i] = $query . "&i={$index}";
        $i++;
    }
    $outputs = array();
    if (count($sites) > 0) {
        $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true, $post_data);
    }
    return $outputs;
}
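/*
 * Illustrative sketch only (assumption, not original code): the kind of
 * aggregation step the docblock above describes. someCrawlInfoMethod is a
 * placeholder command, and the assumption that each reply's self::PAGE
 * field holds a webencode()'d serialized array is for illustration only.
 */
function exampleAggregate($machine_urls) {
    $outputs = $this->execMachines("someCrawlInfoMethod", $machine_urls);
    $aggregate = array();
    foreach ($outputs as $output) {
        if (!isset($output[self::PAGE])) {
            continue; // machine did not respond
        }
        $result = @unserialize(webdecode($output[self::PAGE]));
        if (is_array($result)) {
            $aggregate = array_merge($aggregate, $result);
        }
    }
    return $aggregate;
}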
/**
 * Used to start or stop a queue_server, fetcher, or mirror instance on
 * a machine managed by the current one
 *
 * @param string $machine_name name of machine
 * @param string $action "start" or "stop"
 * @param int $fetcher_num if the action is for a fetcher this value is not
 *     NULL and indicates which fetcher
 * @param bool $is_mirror whether the requested machine is a mirror of
 *     another machine. (If $fetcher_num is NULL and this is false,
 *     then the message is for a queue server)
 */
function update($machine_name, $action, $fetcher_num = NULL,
    $is_mirror = false) {
    $db = $this->db;
    $value = $action == "start" ? "true" : "false";
    $time = time();
    $session = md5($time . AUTH_KEY);
    $sql = "SELECT URL FROM MACHINE WHERE NAME=?";
    $result = $db->execute($sql, array($machine_name));
    $row = $db->fetchArray($result);
    if ($row) {
        $url = $row["URL"] . "?c=machine&a=update&time={$time}" .
            "&session={$session}";
        if ($fetcher_num !== NULL) {
            $url .= "&fetcher[{$fetcher_num}]={$value}";
            $sql = "DELETE FROM ACTIVE_FETCHER WHERE NAME=? AND FETCHER_ID=?";
            $db->execute($sql, array($machine_name, $fetcher_num));
            if ($action == "start") {
                $sql = "INSERT INTO ACTIVE_FETCHER VALUES (?, ?)";
            }
            $db->execute($sql, array($machine_name, $fetcher_num));
        } else {
            if ($is_mirror) {
                $url .= "&mirror={$value}";
            } else {
                $url .= "&queue_server={$value}";
            }
        }
        echo FetchUrl::getPage($url);
    }
}
/**
 * Hook function used by currentDocsWithWord to return the current block
 * of docs if it is not cached
 *
 * @return mixed doc ids and score if there are docs left, -1 otherwise
 */
function findDocsWithWord() {
    $query = $this->base_query .
        "&num={$this->results_per_block}&limit={$this->limit}";
    $sites = array();
    $lookup = array();
    $i = 0;
    $j = 0;
    foreach ($this->queue_servers as $server) {
        if ($this->more_flags[$i]) {
            $sites[$j][CrawlConstants::URL] = $server . "?" . $query .
                "&machine={$i}";
            $lookup[$j] = $i;
            $j++;
        }
        $i++;
    }
    $net_times = AnalyticsManager::get("NET_TIMES");
    $net_times = $net_times ? $net_times : 0;
    $download_time = microtime();
    $downloads = array();
    if (count($sites) > 0) {
        $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true);
    }
    $net_times += changeInMicrotime($download_time);
    AnalyticsManager::set("NET_TIMES", $net_times);
    $results = array();
    $count = count($downloads);
    $this->num_docs = 0;
    $in4 = " ";
    $machine_times = AnalyticsManager::get("MACHINE_TIMES");
    $indent = $machine_times ? "<br />{$in4}" : $in4;
    $machine_times = $machine_times ? $machine_times : "";
    $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
    $max_machine_times = $max_machine_times ? $max_machine_times : 0;
    $max_time = 0;
    $num_with_results = $count;
    for ($j = 0; $j < $count; $j++) {
        $download =& $downloads[$j];
        if (isset($download[self::PAGE])) {
            $pre_result = @unserialize($download[self::PAGE]);
            if (!isset($pre_result["TOTAL_ROWS"]) ||
                $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                $this->more_flags[$lookup[$j]] = false;
                $num_with_results--;
            }
            if (isset($pre_result["TOTAL_ROWS"])) {
                $this->num_docs += $pre_result["TOTAL_ROWS"];
            }
            if (isset($pre_result["PAGES"])) {
                foreach ($pre_result["PAGES"] as $page_data) {
                    if (isset($page_data[self::KEY])) {
                        $results[$page_data[self::KEY]] = $page_data;
                        $results[$page_data[self::KEY]][self::MACHINE_ID] =
                            $lookup[$j];
                    }
                }
            }
            $max_time = max($max_time, $pre_result['ELAPSED_TIME']);
            $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
            $machine_times .= $indent . $lookup_link .
                $pre_result['ELAPSED_TIME'] . " ";
            $indent = "";
        }
    }
    if (isset($pre_result["HARD_QUERY"])) {
        $this->hard_query = $pre_result["HARD_QUERY"];
    }
    if ($num_with_results > 0) {
        /* some servers may be exhausted; ask the remaining ones for
           proportionally more results next round so block sizes stay
           roughly constant */
        $this->next_results_per_block = ceil(
            floatval($count * $this->results_per_block) /
            floatval($num_with_results));
    }
    $max_machine_times += $max_time;
    AnalyticsManager::set("MACHINE_TIMES", $machine_times);
    AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
    if ($results == array()) {
        $results = -1;
    }
    if ($results != -1) {
        if ($this->filter != NULL) {
            foreach ($results as $keys => $data) {
                $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                if (in_array($host_key, $this->filter)) {
                    unset($results[$keys]);
                }
            }
        }
    }
    $this->count_block = count($results);
    $this->pages = $results;
    return $results;
}
/**
 * Gets the cached version of a web page from the machine on which it was
 * fetched.
 *
 * Complete cached versions of web pages typically only live on a fetcher
 * machine. The queue server machine typically only maintains summaries.
 * This method makes a REST request of a fetcher machine for a cached page
 * and gets the results back.
 *
 * @param string $machine the ip address or domain name of the machine the
 *     cached page lives on
 * @param string $machine_uri the path from document root on $machine where
 *     the yioop scripts live
 * @param int $partition the partition in the WebArchiveBundle the page is
 *     in
 * @param int $offset the offset in bytes into the WebArchive partition in
 *     the WebArchiveBundle at which the cached page lives
 * @param string $crawl_time the timestamp of the crawl the cached page is
 *     from
 * @param int $instance_num which fetcher instance for the particular
 *     fetcher crawled the page (if more than one), false otherwise
 * @return array page data of the cached page
 */
function getCacheFile($machine, $machine_uri, $partition, $offset,
    $crawl_time, $instance_num = false) {
    $time = time();
    $session = md5($time . AUTH_KEY);
    if ($machine == '::1') { //IPv6 :(
        $machine = "[::1]";
        //used if the fetching and queue serving were on the same machine
    }
    // we assume all machines use the same scheme & port of the name server
    $port = UrlParser::getPort(NAME_SERVER);
    $scheme = UrlParser::getScheme(NAME_SERVER);
    $request =
        "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" .
        "time={$time}&session={$session}&partition={$partition}" .
        "&offset={$offset}&crawl_time={$crawl_time}";
    if ($instance_num !== false) {
        $request .= "&instance_num={$instance_num}";
    }
    $tmp = FetchUrl::getPage($request);
    $page = @unserialize(base64_decode($tmp));
    $page['REQUEST'] = $request;
    return $page;
}
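/*
 * Illustrative sketch only (assumption, not original code): calling
 * getCacheFile() with location data a queue server summary might supply.
 * All of the literal values and the $crawl_model variable are placeholders.
 */
$crawl_time = "1393190400";
$cache_page = $crawl_model->getCacheFile("127.0.0.1", "/", 0, 123456,
    $crawl_time, 0);
// $cache_page['REQUEST'] records the REST url that was used; the rest of
// the array is whatever page data the remote archive controller returned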
/**
 * Gets the next doc from the iterator
 *
 * @param bool $no_process do not do any processing on page data
 * @return array associative array for doc or string if no_process true
 */
function nextPage($no_process = false) {
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    do {
        $page_info = $this->fileGets();
        if (trim($page_info) == "") {
            return NULL;
        }
        $info_parts = explode(" ", $page_info);
        $num_parts = count($info_parts);
        $length = intval($info_parts[$num_parts - 1]);
        $header_and_page = $this->fileRead($length + 1);
        if (!$header_and_page) {
            return NULL;
        }
    } while (substr($page_info, 0, 3) == 'dns' ||
        substr($page_info, 0, 8) == 'filedesc');
        //ignore dns entries in arc and ignore first record
    if ($no_process) {
        return $header_and_page;
    }
    $site = array();
    $site[self::URL] = $info_parts[0];
    $site[self::IP_ADDRESSES] = array($info_parts[1]);
    $site[self::TIMESTAMP] = date("U", strtotime($info_parts[2]));
    $site[self::TYPE] = $info_parts[3];
    $site_contents = FetchUrl::parseHeaderPage($header_and_page);
    $site = array_merge($site, $site_contents);
    $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
    $site[self::WEIGHT] = 1;
    return $site;
}
/**
 * Used to check if there are any mirrors of the current server.
 * If so, it tries to distribute the query requests randomly amongst
 * the mirrors
 *
 * @return bool whether or not a mirror of the current site handled it
 */
function mirrorHandle() {
    $mirror_table_name = CRAWL_DIR . "/" . self::mirror_table_name;
    $handled = false;
    if (file_exists($mirror_table_name)) {
        $mirror_table = unserialize(file_get_contents($mirror_table_name));
        $mirrors = array();
        $time = time();
        foreach ($mirror_table['machines'] as $entry) {
            if ($time - $entry[3] < 2 * MIRROR_NOTIFY_FREQUENCY) {
                if ($entry[0] == "::1") {
                    $entry[0] = "[::1]";
                }
                $request = "http://" . $entry[0] . $entry[1];
                $mirrors[] = $request;
            }
        }
        $count = count($mirrors);
        if ($count > 0) {
            mt_srand();
            $rand = mt_rand(0, $count);
            // if == $count, we'll let the current machine handle it
            if ($rand < $count) {
                $request = $mirrors[$rand] . "?" . $_SERVER["QUERY_STRING"];
                echo FetchUrl::getPage($request);
                $handled = true;
            }
        }
    }
    return $handled;
}
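/*
 * Illustrative sketch only (assumption, not original code): a front-end
 * script could try to offload the current query to a mirror first and only
 * do the work itself when no mirror took it. processQueryLocally() is a
 * placeholder.
 */
if (!$this->mirrorHandle()) {
    // no fresh mirror was available (or the random draw picked this
    // machine), so answer the query here
    $this->processQueryLocally($_SERVER["QUERY_STRING"]);
}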
/**
 * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems
 * For each feed source, downloads the feed, checks which items are
 * not in the database, and adds them. This method does not update
 * the inverted index shard.
 *
 * @param array $feeds list of feeds to download
 * @param int $age how many seconds old records should be ignored
 */
function updateFeedItemsOneGo($feeds, $age = ONE_WEEK) {
    $db = $this->db; // local handle used for the LANGUAGE update query below
    $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL",
        CrawlConstants::PAGE, true, NULL, true);
    $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
    foreach ($feeds as $feed) {
        $is_html = $feed['TYPE'] == 'html' ? true : false;
        crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
        if (!$feed[CrawlConstants::PAGE]) {
            crawlLog("...No data in feed skipping.");
            continue;
        }
        $dom = new DOMDocument();
        if ($is_html) {
            @$dom->loadHTML($feed[CrawlConstants::PAGE]);
        } else {
            @$dom->loadXML($feed[CrawlConstants::PAGE]);
        }
        crawlLog("...done. Extracting info about whole feed.");
        $lang = "";
        if ($feed['TYPE'] != 'html' &&
            !isset($feed["LANGUAGE"]) || $feed["LANGUAGE"] == "") {
            $languages = $dom->getElementsByTagName('language');
            if ($languages && is_object($languages) &&
                is_object($languages->item(0))) {
                $lang = $languages->item(0)->textContent;
                $db->execute($sql, array($lang, $feed['TIMESTAMP']));
            }
        } else {
            if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                $lang = $feed["LANGUAGE"];
            } else {
                $lang = DEFAULT_LOCALE;
            }
        }
        crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
        if ($is_html) {
            $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
            if (!$sub_dom) {
                crawlLog("... Scraper couldn't parse channel" .
                    " path so bailing on this feed.");
                continue;
            } else {
                crawlLog("...Channel scraped.");
            }
            $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
            $rss_elements = array("title" => $feed['TITLE_PATH'],
                "description" => $feed['DESCRIPTION_PATH'],
                "link" => $feed['LINK_PATH']);
        } else {
            $nodes = $dom->getElementsByTagName('item');
            $rss_elements = array("title" => "title",
                "description" => "description", "link" => "link",
                "guid" => "guid", "pubDate" => "pubDate");
            if ($nodes->length == 0) {
                // maybe we're dealing with atom rather than rss
                $nodes = $dom->getElementsByTagName('entry');
                $rss_elements = array("title" => "title",
                    "description" => "summary", "link" => "link",
                    "guid" => "id", "pubDate" => "updated");
            }
        }
        crawlLog("...done extracting info. Check for new news " .
            "items in {$feed['NAME']}.");
        $num_added = 0;
        $num_seen = 0;
        foreach ($nodes as $node) {
            $item = array();
            foreach ($rss_elements as $db_element => $feed_element) {
                crawlTimeoutLog("..still adding feed items to index.");
                if ($is_html) {
                    $tag_nodes = $this->getTags($node, $feed_element);
                    if (!isset($tag_nodes[0])) {
                        $tag_node = NULL;
                    } else {
                        $tag_node = $tag_nodes[0];
                    }
                    $element_text = is_object($tag_node) ?
                        $tag_node->textContent : "";
                } else {
                    $tag_node = $node->getElementsByTagName(
                        $feed_element)->item(0);
                    $element_text = is_object($tag_node) ?
                        $tag_node->nodeValue : "";
                }
                if ($db_element == "link" && $tag_node &&
                    ($element_text == "" || $is_html)) {
                    if ($is_html) {
                        $element_text =
                            $tag_node->documentElement->getAttribute("href");
                    } else {
                        $element_text = $tag_node->getAttribute("href");
                    }
                    $element_text = UrlParser::canonicalLink($element_text,
                        $feed["SOURCE_URL"]);
                }
                $item[$db_element] = strip_tags($element_text);
            }
            $did_add = $this->addFeedItemIfNew($item, $feed['NAME'], $lang,
                $age);
            if ($did_add) {
                $num_added++;
            }
            $num_seen++;
        }
        crawlLog("...added {$num_added} news items of {$num_seen} " .
            "on rss page.\n Done Processing {$feed['NAME']}.");
    }
}