/**
  * Gets the next doc from the iterator
  * @param bool $no_process if true, just return the page string found,
  *     without any additional meta data
  * @return mixed associative array for the doc, or just the doc string if
  *     $no_process is true
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $matches = array();
     while (preg_match($this->delimiter, $this->buffer, $matches, PREG_OFFSET_CAPTURE) != 1) {
         crawlTimeoutLog("..still looking for a page in local buffer");
         $block = $this->getFileBlock();
         if (!$block || !$this->checkFileHandle() || $this->checkEof()) {
             return NULL;
         }
         $this->buffer .= $block;
     }
     $delim_len = strlen($matches[0][0]);
     $pos = $matches[0][1] + $delim_len;
     $page_pos = $this->start_delimiter == "" ? $pos : $pos - $delim_len;
     $page = substr($this->buffer, 0, $page_pos);
     if ($this->end_delimiter == "") {
         $page = $this->remainder . $page;
         $this->remainder = $matches[0][0];
     }
     $this->buffer = substr($this->buffer, $pos + $delim_len);
      if ($this->start_delimiter != "") {
          $matches = array();
          // trim off anything in the extracted page before its start delimiter
          if (preg_match($this->start_delimiter, $page, $matches, PREG_OFFSET_CAPTURE) == 1) {
              if (isset($matches[0][1])) {
                  $page = substr($page, $matches[0][1]);
              }
          }
      }
     if ($no_process == true) {
         return $page;
     }
     $site = array();
     $site[self::HEADER] = "text_archive_bundle_iterator extractor";
     $site[self::IP_ADDRESSES] = array("0.0.0.0");
     $site[self::TIMESTAMP] = date("U", time());
     $site[self::TYPE] = "text/plain";
     $site[self::PAGE] = $page;
     $site[self::HASH] = FetchUrl::computePageHash($page);
     $site[self::URL] = "record:" . webencode($site[self::HASH]);
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = $this->encoding;
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     $site[self::WEIGHT] = 1;
     return $site;
 }
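 The delimiter bookkeeping above is easiest to see on a toy buffer. A minimal self-contained sketch (the sample buffer and delimiter below are made up, and the start/end delimiter corner cases the real method handles are omitted):
  // find the record delimiter in a buffer, cut the page off the front,
  // and keep the rest buffered for the next call
  $buffer = "first record\n----\nsecond record\n----\nleftover";
  $delimiter = "/----\n/";
  $matches = array();
  if (preg_match($delimiter, $buffer, $matches, PREG_OFFSET_CAPTURE) == 1) {
      $delim_len = strlen($matches[0][0]);
      $page = substr($buffer, 0, $matches[0][1]);              // "first record\n"
      $buffer = substr($buffer, $matches[0][1] + $delim_len);  // rest stays buffered
  }
  echo $page;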
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $tag_data = $this->getNextTagsData(array("Topic", "ExternalPage"));
     if (!$tag_data) {
         return false;
     }
     list($page_info, $tag) = $tag_data;
     if ($no_process) {
         return $page_info;
     }
     $page_info = str_replace("r:id", "id", $page_info);
     $page_info = str_replace("r:resource", "resource", $page_info);
     $page_info = str_replace("d:Title", "Title", $page_info);
     $page_info = str_replace("d:Description", "Description", $page_info);
     $dom = new DOMDocument();
     $dom->loadXML($page_info);
     $processMethod = "process" . $tag;
      $site = array();
      $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
     $site[self::MODIFIED] = time();
     $site[self::TIMESTAMP] = time();
     $site[self::TYPE] = "text/html";
     $site[self::HEADER] = "odp_rdf_bundle_iterator extractor";
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = "UTF-8";
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     $this->{$processMethod}($dom, $site);
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     return $site;
 }
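 The $processMethod line above relies on PHP's variable method call syntax: the tag name returned with the record picks the handler. A small self-contained sketch of that dispatch pattern (the class and handlers here are illustrative, not the real iterator):
  class TagDispatchSketch
  {
      function processTopic($dom, &$site) { $site['kind'] = 'topic'; }
      function processExternalPage($dom, &$site) { $site['kind'] = 'external page'; }
      function handle($tag, $dom, &$site)
      {
          $method = "process" . $tag;      // e.g. "processExternalPage"
          $this->{$method}($dom, $site);   // variable method call, as above
      }
  }
  $sketch = new TagDispatchSketch();
  $site = array();
  $sketch->handle("ExternalPage", NULL, $site);
  echo $site['kind'];  // external page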
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     static $minimal_regexes = false;
     static $first_call = true;
     if ($first_call) {
         $this->initializeSubstitutions($this->header['base_address']);
     }
     $page_info = $this->getNextTagData("page");
     if ($no_process) {
         return $page_info;
     }
     $dom = new DOMDocument();
     @$dom->loadXML($page_info);
     $site = array();
     $pre_url = $this->getTextContent($dom, "/page/title");
     $pre_url = str_replace(" ", "_", $pre_url);
     $site[self::URL] = $this->header['base_address'] . $pre_url;
     $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
     $pre_timestamp = $this->getTextContent($dom, "/page/revision/timestamp");
     $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
     $site[self::TIMESTAMP] = time();
     $site[self::TYPE] = "text/html";
     $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = "UTF-8";
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
      $site[self::PAGE] = "<html lang='" . $this->header['lang'] . "' >\n" .
          "<head><title>{$pre_url}</title>\n" .
          WIKI_PAGE_STYLES . "\n</head>\n" .
          "<body><h1>{$pre_url}</h1>\n";
     $pre_page = $this->getTextContent($dom, "/page/revision/text");
     $current_hash = crawlHash($pre_page);
     if ($first_call) {
         $this->saveCheckPoint();
         //ensure we remember to advance one on fail
         $first_call = false;
     }
     $pre_page = $this->parser->parse($pre_page, false, true);
     $pre_page = preg_replace("/{{Other uses}}/i", "<div class='indent'>\"\$1\". (<a href='" . $site[self::URL] . "_(disambiguation)'>{$pre_url}</a>)</div>", $pre_page);
     $site[self::PAGE] .= $pre_page;
     $site[self::PAGE] .= "\n</body>\n</html>";
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = ceil(max(log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
     return $site;
 }
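 The WEIGHT assignment above grows with the base-2 log of the rendered page length, with a floor of 1, so short articles all get weight 1 and very long ones only slightly more. A worked sketch of that formula (the page sizes are arbitrary):
  function weightSketch($page_len)
  {
      return ceil(max(log($page_len + 1, 2) - 10, 1));
  }
  echo weightSketch(1000) . "\n";    // 1  (log2(1001) is about 10)
  echo weightSketch(100000) . "\n";  // 7  (log2(100001) is about 16.6)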
 /**
  * Gets at most the next $num docs from the iterator. It may return
  * fewer than $num documents if the partition changes or the end of the
  * bundle is reached.
  *
  * @param int $num number of docs to get
  * @param bool $no_process do not do any processing on page data
  * @return array associative arrays for $num pages
  */
 function nextPages($num, $no_process = false)
 {
     $pages = array();
     $page_count = 0;
     $db = $this->db;
     $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
     $result = $db->execute($query);
      while ($row = $db->fetchArray($result)) {
          crawlTimeoutLog("..Still getting pages from archive iterator. At %s" . " of %s", $page_count, $num);
         $page = "";
         foreach ($row as $key => $value) {
             $page .= "{$key}{$this->field_value_separator}" . "{$value}{$this->column_separator}";
         }
         if ($no_process) {
             $pages[] = $page;
         } else {
             $site = array();
             $site[self::HEADER] = "database_bundle_iterator extractor";
             $site[self::IP_ADDRESSES] = array("0.0.0.0");
             $site[self::TIMESTAMP] = date("U", time());
             $site[self::TYPE] = "text/plain";
             $site[self::PAGE] = $page;
             $site[self::HASH] = FetchUrl::computePageHash($page);
             $site[self::URL] = "record:" . webencode($site[self::HASH]);
             $site[self::HTTP_CODE] = 200;
             $site[self::ENCODING] = $this->encoding;
             $site[self::SERVER] = "unknown";
             $site[self::SERVER_VERSION] = "unknown";
             $site[self::OPERATING_SYSTEM] = "unknown";
             $site[self::WEIGHT] = 1;
             $pages[] = $site;
         }
         $page_count++;
     }
     $this->limit += $page_count;
     if ($page_count < $num) {
         $this->end_of_iterator = true;
     }
     $this->saveCheckpoint();
     return $pages;
 }
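 A self-contained sketch of the paging arithmetic used above (the LIMIT/OFFSET string builder is a stand-in for the real database layer, and the row counts are made up):
  function limitOffsetSketch($offset, $num)
  {
      return "LIMIT {$num} OFFSET {$offset}";
  }
  $limit = 0;        // rows consumed so far across calls
  $num = 100;        // rows requested in this call
  echo "SELECT * FROM MY_TABLE " . limitOffsetSketch($limit, $num) . "\n";
  $page_count = 37;  // suppose only 37 rows came back
  $limit += $page_count;                    // advance the window
  $end_of_iterator = ($page_count < $num);  // true: bundle exhausted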
Example #5
 /**
  * Sends the crawl, robot, and index data to the current queue server.
  * If this data is more than post_max_size, it splits it into chunks
  * which are then reassembled by the queue server web app before being
  * put into the appropriate schedule sub-directory.
  *
  * @param string $queue_server url of the current queue server
  * @param array $byte_counts has four fields: TOTAL, ROBOT, SCHEDULE,
  *     INDEX. These give the number of bytes overall for the
  *     'data' field of $post_data and for each of these components.
  * @param array $post_data data to be uploaded to the queue server web app
  */
 function uploadCrawlData($queue_server, $byte_counts, &$post_data)
 {
     $post_data['fetcher_peak_memory'] = memory_get_peak_usage();
     $post_data['byte_counts'] = webencode(serialize($byte_counts));
     $len = strlen($post_data['data']);
     $max_len = $this->post_max_size - 10 * 1024;
     // non-data post vars < 10K
     $post_data['num_parts'] = ceil($len / $max_len);
     $num_parts = $post_data['num_parts'];
     $data =& $post_data['data'];
     unset($post_data['data']);
     $post_data['hash_data'] = crawlHash($data);
     $offset = 0;
     for ($i = 1; $i <= $num_parts; $i++) {
         $time = time();
         $session = md5($time . AUTH_KEY);
         $post_data['time'] = $time;
         $post_data['session'] = $session;
         $post_data['part'] = substr($data, $offset, $max_len);
         $post_data['hash_part'] = crawlHash($post_data['part']);
         $post_data['current_part'] = $i;
         $offset += $max_len;
         $part_len = strlen($post_data['part']);
         crawlLog("Sending Queue Server Part {$i} of {$num_parts}...");
         crawlLog("...sending about {$part_len} bytes.");
         $sleep = false;
         do {
             if ($sleep == true) {
                 crawlLog("Trouble sending to the scheduler at url:");
                 crawlLog($queue_server);
                 crawlLog("Response was:");
                 crawlLog("{$info_string}");
                 $info = @unserialize($info_string);
                 $time = time();
                 $session = md5($time . AUTH_KEY);
                 $post_data['time'] = $time;
                 $post_data['session'] = $session;
                 if (isset($info[self::STATUS]) && $info[self::STATUS] == self::REDO_STATE) {
                     crawlLog("Server requested last item to be re-sent...");
                     if (isset($info[self::SUMMARY])) {
                         crawlLog($info[self::SUMMARY]);
                     }
                     crawlLog("Trying again in 5 seconds...");
                 } else {
                     crawlLog("Trying again in 5 seconds. You might want");
                     crawlLog("to check the queue server url and server");
                     crawlLog("key. Queue Server post_max_size is:" . $this->post_max_size);
                 }
                 if ($i == 1 && !defined('FORCE_SMALL') && $this->post_max_size > 1000000) {
                    /* maybe the server has limited memory
                       and too high a post_max_size
                     */
                     crawlLog("Using smaller post size to see if helps");
                     define('FORCE_SMALL', true);
                     $this->post_max_size = 1000000;
                     $info[self::POST_MAX_SIZE] = 1000001;
                      /* set to a small value before trying again */
                 }
                 sleep(5);
             }
             $sleep = true;
             $info_string = FetchUrl::getPage($queue_server, $post_data, true);
             $info = unserialize(trim($info_string));
             if (isset($info[self::LOGGING])) {
                 crawlLog("Messages from Fetch Controller:");
                 crawlLog($info[self::LOGGING]);
             }
             if (isset($info[self::POST_MAX_SIZE]) && $this->post_max_size > $info[self::POST_MAX_SIZE]) {
                 if (!defined('FORCE_SMALL')) {
                     crawlLog("post_max_size has changed was " . "{$this->post_max_size}. Now is " . $info[self::POST_MAX_SIZE] . ".");
                     $this->post_max_size = $info[self::POST_MAX_SIZE];
                 } else {
                     crawlLog("...Using Force Small Rule on Server Posting");
                 }
                 if ($max_len > $this->post_max_size) {
                     crawlLog("Restarting upload...");
                     if (isset($post_data["resized_once"])) {
                         crawlLog("Restart failed");
                         return;
                     }
                     $post_data['data'] = $data;
                     $post_data["resized_once"] = true;
                     return $this->uploadCrawlData($queue_server, $byte_counts, $post_data);
                 }
             }
         } while (!isset($info[self::STATUS]) || $info[self::STATUS] != self::CONTINUE_STATE);
         crawlLog("Queue Server info response code: " . $info[self::STATUS]);
         crawlLog("Queue Server's crawl time is: " . $info[self::CRAWL_TIME]);
         crawlLog("Web Server peak memory usage: " . $info[self::MEMORY_USAGE]);
         crawlLog("This fetcher peak memory usage: " . memory_get_peak_usage());
     }
     crawlLog("Updated Queue Server, sent approximately" . " {$byte_counts['TOTAL']} bytes:");
 }
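 The splitting logic above is fixed-size slicing of the 'data' field into ceil(len / max_len) parts. A self-contained sketch of that arithmetic (the sizes here are made up):
  $post_max_size = 2000000;               // illustrative server limit
  $max_len = $post_max_size - 10 * 1024;  // leave room for non-data post vars
  $data = str_repeat("x", 4500000);       // illustrative payload
  $num_parts = ceil(strlen($data) / $max_len);
  for ($i = 1, $offset = 0; $i <= $num_parts; $i++, $offset += $max_len) {
      $part = substr($data, $offset, $max_len);
      echo "part {$i} of {$num_parts}: " . strlen($part) . " bytes\n";
  }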
Example #6
 /**
  * Downloads the next file from the schedule of files to download received
  * from the web app.
  */
 function copyNextSyncFile()
 {
     $dir = $this->sync_dir;
     $name_server = $this->name_server;
     $time = time();
     $session = md5($time . AUTH_KEY);
     if (count($this->sync_schedule) <= 0) {
         return;
     }
     $file = array_pop($this->sync_schedule);
     crawlLog("Start syncing {$file['name']}..");
     if ($file['is_dir']) {
         if (!file_exists("{$dir}/{$file['name']}")) {
             mkdir("{$dir}/{$file['name']}");
             crawlLog(".. {$file['name']} directory created.");
         } else {
             crawlLog(".. {$file['name']} directory exists.");
         }
     } else {
          $request = "{$name_server}?c=resource&a=get&time={$time}&session={$session}" .
              "&robot_instance=" . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI .
              "&last_sync=" . $this->last_sync . "&f=cache&n=" . urlencode($file["name"]);
         if ($file["size"] < self::DOWNLOAD_RANGE) {
             $data = FetchUrl::getPage($request, NULL, true);
             if ($file["size"] != strlen($data)) {
                 array_push($this->sync_schedule, $file);
                 crawlLog(".. {$file['name']} error downloading, retrying.");
                 return;
             }
             file_put_contents("{$dir}/{$file['name']}", $data);
             crawlLog(".. {$file['name']} file copied.");
         } else {
             $offset = 0;
             $fh = fopen("{$dir}/{$file['name']}", "wb");
             $request .= "&l=" . self::DOWNLOAD_RANGE;
             while ($offset < $file['size']) {
                 $data = FetchUrl::getPage($request . "&o={$offset}", NULL, true);
                 $old_offset = $offset;
                 $offset += self::DOWNLOAD_RANGE;
                 $end_point = min($offset, $file["size"]);
                 //crude check if we need to redownload segment
                 if (strlen($data) != $end_point - $old_offset) {
                     $offset = $old_offset;
                     crawlLog(".. Download error re-requesting segment");
                     continue;
                 }
                 fwrite($fh, $data);
                 crawlLog(".. {$file['name']} downloaded bytes {$old_offset} " . "to {$end_point}..");
             }
             crawlLog(".. {$file['name']} file copied.");
             fclose($fh);
         }
     }
 }
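 For files of at least DOWNLOAD_RANGE bytes, the loop above pulls fixed-size segments and re-requests any segment whose returned length does not match. A self-contained sketch of the offset bookkeeping (the range and file size are made up):
  $download_range = 2000000;  // stand-in for self::DOWNLOAD_RANGE
  $file_size = 4500000;       // stand-in for the remote file's size
  $offset = 0;
  while ($offset < $file_size) {
      $end_point = min($offset + $download_range, $file_size);
      $expected = $end_point - $offset;  // bytes this segment should return
      echo "request o={$offset}, expecting {$expected} bytes\n";
      $offset = $end_point;              // only advance on a full segment
  }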
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     $indexable_records = array('response', 'resource');
     do {
         $this->getRecordStart();
         $page_info = $this->getWarcHeaders();
         if ($page_info == NULL || !isset($page_info[self::SIZE])) {
             return NULL;
         }
         $length = intval($page_info[self::SIZE]);
         $page_info[self::SIZE] = $length;
         $header_and_page = ltrim($this->fileRead($length + 2));
         $this->fileGets();
         $this->fileGets();
         if (!$header_and_page) {
             return NULL;
         }
     } while (!in_array($page_info['warc-type'], $indexable_records) || substr($page_info[self::URL], 0, 4) == 'dns:');
     //ignore warcinfo, request, metadata, revisit, etc. records
     if ($no_process) {
         return $header_and_page;
     }
     unset($page_info['line']);
     unset($page_info['warc-type']);
     $site = $page_info;
     $site_contents = FetchUrl::parseHeaderPage($header_and_page);
     $site = array_merge($site, $site_contents);
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = 1;
     if (!isset($site[self::TYPE])) {
         $site[self::TYPE] = "text/plain";
     }
     return $site;
 }
Example #8
 /**
  * This method is invoked by other ParallelModel methods (@see CrawlModel
  * for examples) when they want to have their method performed
  * on an array of other Yioop instances. The results returned can then
  * be aggregated. The invocation sequence is: crawlModelMethodA invokes
  * execMachines with a list of urls of other Yioop instances; execMachines
  * makes REST requests of those instances with the given command and
  * optional arguments; each request is handled by a CrawlController,
  * which in turn calls crawlModelMethodA on the given Yioop instance,
  * serializes the result, and gives it back to execMachines and then back
  * to the originally calling function.
  *
  * @param string $command the ParallelModel method to invoke on the remote
  *     Yioop instances
  * @param array $machine_urls machines to invoke this command on
  * @param string $arg additional arguments to be passed to the remote
  *      machine
  * @param int $num_machines the integer to be used in calculating partition
  * @return array a list of outputs from each machine that was called.
  */
 function execMachines($command, $machine_urls, $arg = NULL, $num_machines = 0)
 {
     if ($num_machines == 0) {
         $num_machines = count($machine_urls);
     }
     $time = time();
     $session = md5($time . AUTH_KEY);
     $query = "c=crawl&a={$command}&time={$time}&session={$session}" . "&num={$num_machines}";
     if ($arg != NULL) {
         $arg = webencode($arg);
         $query .= "&arg={$arg}";
     }
     $sites = array();
     $post_data = array();
     $i = 0;
     foreach ($machine_urls as $index => $machine_url) {
         $sites[$i][CrawlConstants::URL] = $machine_url;
         $post_data[$i] = $query . "&i={$index}";
         $i++;
     }
     $outputs = array();
     if (count($sites) > 0) {
         $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true, $post_data);
     }
     return $outputs;
 }
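 The time/session pair built above is the lightweight request check used throughout these examples: both sides share AUTH_KEY, and the receiving controller can presumably recompute md5($time . AUTH_KEY) to validate the request. A self-contained sketch of the query construction (the key and command name are illustrative):
  define('AUTH_KEY', 'illustrative-shared-key');
  $time = time();
  $session = md5($time . AUTH_KEY);       // receiver recomputes and compares
  $command = "someParallelModelMethod";   // illustrative command name
  $query = "c=crawl&a={$command}&time={$time}&session={$session}&num=3";
  echo $query . "&i=0\n";                 // &i= tells the remote machine its index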
Example #9
 /**
  * Used to start or stop a queue_server, fetcher, or mirror instance on
  * a machine managed by the current one
  *
  * @param string $machine_name name of machine
  * @param string $action "start" or "stop"
  * @param int $fetcher_num if the action is for a fetcher, this value is
  *      not NULL and indicates which fetcher
  * @param bool $is_mirror whether the requested machine is a mirror of
  *      another machine (if $fetcher_num is NULL and this is false,
  *      then the message is for a queue server)
  *
  */
 function update($machine_name, $action, $fetcher_num = NULL, $is_mirror = false)
 {
     $db = $this->db;
     $value = $action == "start" ? "true" : "false";
     $time = time();
     $session = md5($time . AUTH_KEY);
     $sql = "SELECT URL FROM MACHINE WHERE NAME=?";
     $result = $db->execute($sql, array($machine_name));
     $row = $db->fetchArray($result);
     if ($row) {
         $url = $row["URL"] . "?c=machine&a=update&time={$time}" . "&session={$session}";
         if ($fetcher_num !== NULL) {
             $url .= "&fetcher[{$fetcher_num}]={$value}";
            $sql = "DELETE FROM ACTIVE_FETCHER WHERE NAME=? AND FETCHER_ID=?";
             $db->execute($sql, array($machine_name, $fetcher_num));
             if ($action == "start") {
                 $sql = "INSERT INTO ACTIVE_FETCHER VALUES (?, ?)";
             }
             $db->execute($sql, array($machine_name, $fetcher_num));
         } else {
             if ($is_mirror) {
                 $url .= "&mirror={$value}";
             } else {
                 $url .= "&queue_server={$value}";
             }
         }
         echo FetchUrl::getPage($url);
     }
 }
Example #10
 /**
  * Hook function used by currentDocsWithWord to return the current block
  * of docs if it is not cached
  *
  * @return mixed doc ids and scores if there are docs left, -1 otherwise
  */
 function findDocsWithWord()
 {
     $query = $this->base_query . "&num={$this->results_per_block}&limit={$this->limit}";
     $sites = array();
     $lookup = array();
     $i = 0;
     $j = 0;
     foreach ($this->queue_servers as $server) {
         if ($this->more_flags[$i]) {
             $sites[$j][CrawlConstants::URL] = $server . "?" . $query . "&machine={$i}";
             $lookup[$j] = $i;
             $j++;
         }
         $i++;
     }
     $net_times = AnalyticsManager::get("NET_TIMES");
     $net_times = $net_times ? $net_times : 0;
     $download_time = microtime();
     $downloads = array();
     if (count($sites) > 0) {
         $downloads = FetchUrl::getPages($sites, false, 0, NULL, self::URL, self::PAGE, true);
     }
     $net_times += changeInMicrotime($download_time);
     AnalyticsManager::set("NET_TIMES", $net_times);
     $results = array();
     $count = count($downloads);
     $this->num_docs = 0;
     $in4 = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
     $machine_times = AnalyticsManager::get("MACHINE_TIMES");
     $indent = $machine_times ? "<br />{$in4}" : $in4;
     $machine_times = $machine_times ? $machine_times : "";
     $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
     $max_machine_times = $max_machine_times ? $max_machine_times : 0;
     $max_time = 0;
     $num_with_results = $count;
     for ($j = 0; $j < $count; $j++) {
         $download =& $downloads[$j];
         if (isset($download[self::PAGE])) {
             $pre_result = @unserialize($download[self::PAGE]);
             if (!isset($pre_result["TOTAL_ROWS"]) || $pre_result["TOTAL_ROWS"] < $this->results_per_block) {
                 $this->more_flags[$lookup[$j]] = false;
                 $num_with_results--;
             }
             if (isset($pre_result["TOTAL_ROWS"])) {
                 $this->num_docs += $pre_result["TOTAL_ROWS"];
             }
             if (isset($pre_result["PAGES"])) {
                 foreach ($pre_result["PAGES"] as $page_data) {
                     if (isset($page_data[self::KEY])) {
                         $results[$page_data[self::KEY]] = $page_data;
                         $results[$page_data[self::KEY]][self::MACHINE_ID] = $lookup[$j];
                     }
                 }
             }
             $max_time = max($max_time, $pre_result['ELAPSED_TIME']);
             $lookup_link = $this->makeLookupLink($sites, $lookup[$j]);
             $machine_times .= $indent . $lookup_link . $pre_result['ELAPSED_TIME'] . "&nbsp;&nbsp;";
             $indent = "";
         }
     }
     if (isset($pre_result["HARD_QUERY"])) {
         $this->hard_query = $pre_result["HARD_QUERY"];
     }
     if ($num_with_results > 0) {
         $this->next_results_per_block = ceil(floatval($count * $this->results_per_block) / floatval($num_with_results));
     }
     $max_machine_times += $max_time;
     AnalyticsManager::set("MACHINE_TIMES", $machine_times);
     AnalyticsManager::set("MAX_MACHINE_TIMES", $max_machine_times);
     if ($results == array()) {
         $results = -1;
     }
     if ($results != -1) {
         if ($this->filter != NULL) {
             foreach ($results as $keys => $data) {
                 $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
                 if (in_array($host_key, $this->filter)) {
                     unset($results[$keys]);
                 }
             }
         }
     }
     $this->count_block = count($results);
     $this->pages = $results;
     return $results;
 }
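 The next_results_per_block computation above scales the per-server block size up as servers run out of results, so the total number of rows requested per round stays roughly constant. A worked sketch with made-up counts:
  $count = 4;                // queue servers queried this round
  $results_per_block = 200;  // rows requested from each server
  $num_with_results = 3;     // servers that still returned a full block
  $next_results_per_block = ceil(floatval($count * $results_per_block) /
      floatval($num_with_results));
  echo $next_results_per_block;  // 267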
Example #11
 /**
  * Gets the cached version of a web page from the machine on which it was
  * fetched.
  *
  * Complete cached versions of web pages typically only live on a fetcher
  * machine. The queue server machine typically only maintains summaries.
  * This method makes a REST request of a fetcher machine for a cached page
  * and get the results back.
  *
  * @param string $machine the ip address or domain name of the machine the
  *     cached page lives on
  * @param string $machine_uri the path from document root on $machine where
  *     the yioop scripts live
  * @param int $partition the partition in the WebArchiveBundle the page is
  *      in
  * @param int $offset the offset in bytes into the WebArchive partition in
  *     the WebArchiveBundle at which the cached page lives.
  * @param string $crawl_time the timestamp of the crawl the cache page is
  *     from
  * @param int $instance_num which fetcher instance crawled the page (if the
  *     machine runs more than one fetcher), false otherwise
  * @return array page data of the cached page
  */
 function getCacheFile($machine, $machine_uri, $partition, $offset, $crawl_time, $instance_num = false)
 {
     $time = time();
     $session = md5($time . AUTH_KEY);
     if ($machine == '::1') {
         //IPv6 :(
         $machine = "[::1]";
         //used if the fetching and queue serving were on the same machine
     }
      // we assume all machines use the same scheme & port as the name server
     $port = UrlParser::getPort(NAME_SERVER);
     $scheme = UrlParser::getScheme(NAME_SERVER);
      $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" .
          "time={$time}&session={$session}&partition={$partition}&offset={$offset}" .
          "&crawl_time={$crawl_time}";
     if ($instance_num !== false) {
         $request .= "&instance_num={$instance_num}";
     }
     $tmp = FetchUrl::getPage($request);
     $page = @unserialize(base64_decode($tmp));
     $page['REQUEST'] = $request;
     return $page;
 }
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     if (!$this->checkFileHandle()) {
         return NULL;
     }
     do {
         $page_info = $this->fileGets();
         if (trim($page_info) == "") {
             return NULL;
         }
         $info_parts = explode(" ", $page_info);
         $num_parts = count($info_parts);
         $length = intval($info_parts[$num_parts - 1]);
         $header_and_page = $this->fileRead($length + 1);
         if (!$header_and_page) {
             return NULL;
         }
     } while (substr($page_info, 0, 3) == 'dns' || substr($page_info, 0, 8) == 'filedesc');
     //ignore dns entries in arc and ignore first record
     if ($no_process) {
         return $header_and_page;
     }
     $site = array();
     $site[self::URL] = $info_parts[0];
     $site[self::IP_ADDRESSES] = array($info_parts[1]);
     $site[self::TIMESTAMP] = date("U", strtotime($info_parts[2]));
     $site[self::TYPE] = $info_parts[3];
     $site_contents = FetchUrl::parseHeaderPage($header_and_page);
     $site = array_merge($site, $site_contents);
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = 1;
     return $site;
 }
Example #13
 /**
  * Used to check if there are any mirrors of the current server.
  * If so, it tries to distribute the query requests randomly amongst
  * the mirrors
  * @return bool whether or not a mirror of the current site handled it
  */
 function mirrorHandle()
 {
     $mirror_table_name = CRAWL_DIR . "/" . self::mirror_table_name;
     $handled = false;
     if (file_exists($mirror_table_name)) {
         $mirror_table = unserialize(file_get_contents($mirror_table_name));
         $mirrors = array();
         $time = time();
         foreach ($mirror_table['machines'] as $entry) {
             if ($time - $entry[3] < 2 * MIRROR_NOTIFY_FREQUENCY) {
                 if ($entry[0] == "::1") {
                     $entry[0] = "[::1]";
                 }
                 $request = "http://" . $entry[0] . $entry[1];
                 $mirrors[] = $request;
             }
         }
         $count = count($mirrors);
         if ($count > 0) {
             mt_srand();
             $rand = mt_rand(0, $count);
             // if ==$count, we'll let the current machine handle it
             if ($rand < $count) {
                 $request = $mirrors[$rand] . "?" . $_SERVER["QUERY_STRING"];
                 echo FetchUrl::getPage($request);
                 $handled = true;
             }
         }
     }
     return $handled;
 }
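 The random selection above deliberately draws from 0..count inclusive, so the one extra slot means the current machine keeps the query itself. A self-contained sketch (the mirror URLs are made up):
  $mirrors = array("http://192.0.2.10/yioop/", "http://192.0.2.11/yioop/");
  $count = count($mirrors);
  mt_srand();
  $rand = mt_rand(0, $count);  // note the inclusive upper bound
  if ($rand < $count) {
      echo "forward the query to " . $mirrors[$rand] . "\n";
  } else {
      echo "let the current machine handle the query\n";
  }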
Example #14
 /**
  * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems.
  * For each feed source, it downloads the feed, checks which items are
  * not yet in the database, and adds them. This method does not update
  * the inverted index shard.
  *
  * @param array $feeds list of feeds to download
  * @param int $age how many seconds old records should be ignored
  */
 function updateFeedItemsOneGo($feeds, $age = ONE_WEEK)
 {
     $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL", CrawlConstants::PAGE, true, NULL, true);
      $db = $this->db;  // database handle needed for the LANGUAGE update below
      $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
     foreach ($feeds as $feed) {
         $is_html = $feed['TYPE'] == 'html' ? true : false;
         crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
         if (!$feed[CrawlConstants::PAGE]) {
             crawlLog("...No data in feed skipping.");
             continue;
         }
         $dom = new DOMDocument();
         if ($is_html) {
             @$dom->loadHTML($feed[CrawlConstants::PAGE]);
         } else {
             @$dom->loadXML($feed[CrawlConstants::PAGE]);
         }
         crawlLog("...done. Extracting info about whole feed.");
         $lang = "";
          if ($feed['TYPE'] != 'html' && (!isset($feed["LANGUAGE"]) || $feed["LANGUAGE"] == "")) {
             $languages = $dom->getElementsByTagName('language');
             if ($languages && is_object($languages) && is_object($languages->item(0))) {
                 $lang = $languages->item(0)->textContent;
                 $db->execute($sql, array($lang, $feed['TIMESTAMP']));
             }
         } else {
             if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                 $lang = $feed["LANGUAGE"];
             } else {
                 $lang = DEFAULT_LOCALE;
             }
         }
         crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
         if ($is_html) {
             $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
             if (!$sub_dom) {
                 crawlLog("... Scraper couldn't parse channel" . " path so bailing on this feed.");
                 continue;
             } else {
                 crawlLog("...Channel scraped.");
             }
             $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
             $rss_elements = array("title" => $feed['TITLE_PATH'], "description" => $feed['DESCRIPTION_PATH'], "link" => $feed['LINK_PATH']);
         } else {
             $nodes = $dom->getElementsByTagName('item');
             $rss_elements = array("title" => "title", "description" => "description", "link" => "link", "guid" => "guid", "pubDate" => "pubDate");
             if ($nodes->length == 0) {
                 // maybe we're dealing with atom rather than rss
                 $nodes = $dom->getElementsByTagName('entry');
                 $rss_elements = array("title" => "title", "description" => "summary", "link" => "link", "guid" => "id", "pubDate" => "updated");
             }
         }
         crawlLog("...done extracting info. Check for new news " . "items in {$feed['NAME']}.");
         $num_added = 0;
         $num_seen = 0;
         foreach ($nodes as $node) {
             $item = array();
             foreach ($rss_elements as $db_element => $feed_element) {
                 crawlTimeoutLog("..still adding feed items to index.");
                 if ($is_html) {
                     $tag_nodes = $this->getTags($node, $feed_element);
                     if (!isset($tag_nodes[0])) {
                         $tag_node = NULL;
                     } else {
                         $tag_node = $tag_nodes[0];
                     }
                     $element_text = is_object($tag_node) ? $tag_node->textContent : "";
                 } else {
                     $tag_node = $node->getElementsByTagName($feed_element)->item(0);
                     $element_text = is_object($tag_node) ? $tag_node->nodeValue : "";
                 }
                 if ($db_element == "link" && $tag_node && ($element_text == "" || $is_html)) {
                     if ($is_html) {
                         $element_text = $tag_node->documentElement->getAttribute("href");
                     } else {
                         $element_text = $tag_node->getAttribute("href");
                     }
                     $element_text = UrlParser::canonicalLink($element_text, $feed["SOURCE_URL"]);
                 }
                 $item[$db_element] = strip_tags($element_text);
             }
             $did_add = $this->addFeedItemIfNew($item, $feed['NAME'], $lang, $age);
             if ($did_add) {
                 $num_added++;
             }
             $num_seen++;
         }
         crawlLog("...added {$num_added} news items of {$num_seen} " . "on rss page.\n Done Processing {$feed['NAME']}.");
     }
 }
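 The item extraction above falls back from RSS <item> tags to Atom <entry> tags with renamed child elements when a feed has no items. A self-contained sketch of that fallback (the feed snippet is made up and namespaces are omitted for brevity):
  $feed_xml = "<feed><entry><title>T</title><summary>S</summary>" .
      "<link href='http://example.com/a'/><id>tag:1</id>" .
      "<updated>2024-01-01</updated></entry></feed>";
  $dom = new DOMDocument();
  @$dom->loadXML($feed_xml);
  $nodes = $dom->getElementsByTagName('item');
  $rss_elements = array("title" => "title", "description" => "description",
      "link" => "link", "guid" => "guid", "pubDate" => "pubDate");
  if ($nodes->length == 0) {  // no RSS items, so treat the feed as Atom
      $nodes = $dom->getElementsByTagName('entry');
      $rss_elements = array("title" => "title", "description" => "summary",
          "link" => "link", "guid" => "id", "pubDate" => "updated");
  }
  echo $nodes->length . " entries; description comes from <" .
      $rss_elements['description'] . ">\n";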