/**
 * When a crawl is being shut down, this function is called to write
 * the contents of the web queue bundle back to schedules. This allows
 * crawls to be resumed without losing urls. This function can also be
 * called if the queue gets clogged, to reschedule its contents for a
 * later time.
 *
 * @param bool $for_reschedule if true the call is to reschedule the urls
 *     to be crawled at a later time; otherwise, it is to save the urls
 *     because the crawl is being halted.
 */
function dumpQueueToSchedules($for_reschedule = false)
{
    if (!$for_reschedule) {
        $this->writeAdminMessage("SHUTDOWN_QUEUE");
    }
    if (!isset($this->web_queue->to_crawl_queue)) {
        crawlLog("URL queue appears to be empty or NULL");
        return;
    }
    crawlLog("Writing queue contents back to schedules...");
    $dir = CRAWL_DIR . "/schedules/" . self::schedule_data_base_name .
        $this->crawl_time;
    if (!file_exists($dir)) {
        mkdir($dir);
        chmod($dir, 0777);
    }
    $now = time();
    if ($for_reschedule) {
        $day = floor($now / ONE_DAY);
        $note_string = "Reschedule";
    } else {
        $day = floor($this->crawl_time / ONE_DAY) - 1;
            //want before all other schedules, so will be reloaded first
        $note_string = "";
    }
    $dir .= "/{$day}";
    if (!file_exists($dir)) {
        mkdir($dir);
        chmod($dir, 0777);
    }
    //get rid of previous restart attempts, if present
    if (!$for_reschedule) {
        $this->db->unlinkRecursive($dir, false);
    }
    $count = $this->web_queue->to_crawl_queue->count;
    $old_time = 1;
    $schedule_data = array();
    $schedule_data[self::SCHEDULE_TIME] = $this->crawl_time;
    $schedule_data[self::TO_CRAWL] = array();
    $fh = $this->web_queue->openUrlArchive();
    for ($time = 1; $time < $count; $time++) {
        crawlTimeoutLog("..have written %s urls of %s urls so far",
            $time, $count);
        $tmp = $this->web_queue->peekQueue($time, $fh);
        // if queue error, skip (check before destructuring $tmp)
        if ($tmp === false) {
            continue;
        }
        list($url, $weight, , ) = $tmp;
        if (strcmp($url, "LOOKUP ERROR") == 0) {
            continue;
        }
        /* For the fetcher, the hash is normally a hash of
           link_num . hash_of_page_link_on. Either the url or the hash can
           be used to determine whether a page has been seen. In the case
           of a dump, we choose the hash so that only the url affects
           deduplication. */
        $hash = crawlHash($now . $url);
        if ($for_reschedule) {
            $schedule_time = $time + $now;
        } else {
            $schedule_time = $time;
        }
        $schedule_data[self::TO_CRAWL][] = array($url, $weight, $hash);
        if ($time - $old_time >= MAX_FETCH_SIZE) {
            if (count($schedule_data[self::TO_CRAWL]) > 0) {
                $data_string = webencode(
                    gzcompress(serialize($schedule_data)));
                $data_hash = crawlHash($data_string);
                file_put_contents($dir . "/At" . $schedule_time .
                    "From127-0-0-1" . $note_string .
                    "WithHash{$data_hash}.txt", $data_string);
                $data_string = "";
                $schedule_data[self::TO_CRAWL] = array();
            }
            $old_time = $time;
        }
    }
    $this->web_queue->closeUrlArchive($fh);
    if (count($schedule_data[self::TO_CRAWL]) > 0) {
        $data_string = webencode(gzcompress(serialize($schedule_data)));
        $data_hash = crawlHash($data_string);
        if ($for_reschedule) {
            $schedule_time = $time + $now;
        } else {
            $schedule_time = $time;
        }
        file_put_contents($dir . "/At" . $schedule_time .
            "From127-0-0-1" . $note_string . "WithHash{$data_hash}.txt",
            $data_string);
    }
    $this->db->setWorldPermissionsRecursive(
        CRAWL_DIR . '/cache/' . self::queue_base_name . $this->crawl_time);
    $this->db->setWorldPermissionsRecursive($dir);
}
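/* Illustrative sketch (not part of the original source): reading back one
   of the schedule files written above by reversing the
   webencode(gzcompress(serialize(...))) chain. Assumes Yioop's webdecode()
   helper and the CrawlConstants interface are loaded; $schedule_file is a
   hypothetical path to one of the At...From127-0-0-1WithHash....txt
   files. */
$data_string = file_get_contents($schedule_file);
$schedule_data = unserialize(gzuncompress(webdecode($data_string)));
foreach ($schedule_data[CrawlConstants::TO_CRAWL] as $entry) {
    list($url, $weight, $hash) = $entry;
    // $url can now be re-queued; $hash drives dedup as noted above
}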
/**
 * Gets the next doc from the iterator
 * @param bool $no_process if true then just return the page string found,
 *     not any additional meta data.
 * @return mixed associative array for doc or just string of doc
 */
function nextPage($no_process = false)
{
    if (!$this->checkFileHandle()) {
        return NULL;
    }
    $matches = array();
    while (preg_match($this->delimiter, $this->buffer, $matches,
        PREG_OFFSET_CAPTURE) != 1) {
        crawlTimeoutLog("..still looking for a page in local buffer");
        $block = $this->getFileBlock();
        if (!$block || !$this->checkFileHandle() || $this->checkEof()) {
            return NULL;
        }
        $this->buffer .= $block;
    }
    $delim_len = strlen($matches[0][0]);
    $pos = $matches[0][1] + $delim_len;
    $page_pos = ($this->start_delimiter == "") ? $pos : $pos - $delim_len;
    $page = substr($this->buffer, 0, $page_pos);
    if ($this->end_delimiter == "") {
        $page = $this->remainder . $page;
        $this->remainder = $matches[0][0];
    }
    $this->buffer = substr($this->buffer, $pos + $delim_len);
    if ($this->start_delimiter != "") {
        $matches = array();
        // trim anything before the start delimiter off the page
        if (preg_match($this->start_delimiter, $this->buffer, $matches,
            PREG_OFFSET_CAPTURE) == 1) {
            if (isset($matches[0][1])) {
                $page = substr($page, $matches[0][1]);
            }
        }
    }
    if ($no_process == true) {
        return $page;
    }
    $site = array();
    $site[self::HEADER] = "text_archive_bundle_iterator extractor";
    $site[self::IP_ADDRESSES] = array("0.0.0.0");
    $site[self::TIMESTAMP] = date("U", time());
    $site[self::TYPE] = "text/plain";
    $site[self::PAGE] = $page;
    $site[self::HASH] = FetchUrl::computePageHash($page);
    $site[self::URL] = "record:" . webencode($site[self::HASH]);
    $site[self::HTTP_CODE] = 200;
    $site[self::ENCODING] = $this->encoding;
    $site[self::SERVER] = "unknown";
    $site[self::SERVER_VERSION] = "unknown";
    $site[self::OPERATING_SYSTEM] = "unknown";
    $site[self::WEIGHT] = 1;
    return $site;
}
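/* Illustrative sketch (not part of the original source): the
   PREG_OFFSET_CAPTURE flag used in nextPage() above makes each entry of
   $matches a pair [matched_text, byte_offset], which is what lets the
   iterator split the buffer at a delimiter. The buffer contents and the
   delimiter pattern here are made-up examples. */
$buffer = "record one\n---\nrecord two\n---\n";
if (preg_match('/---\n/', $buffer, $matches, PREG_OFFSET_CAPTURE) == 1) {
    $delim_len = strlen($matches[0][0]);        // length of "---\n"
    $page = substr($buffer, 0, $matches[0][1]); // "record one\n"
    $buffer = substr($buffer, $matches[0][1] + $delim_len);
}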
/**
 * Sends the to-crawl, robot, and index data to the current queue server.
 * If this data is more than post_max_size, it splits it into chunks
 * which are then reassembled by the queue server web app before being
 * put into the appropriate schedule sub-directory.
 *
 * @param string $queue_server url of the current queue server
 * @param array $byte_counts has four fields: TOTAL, ROBOT, SCHEDULE,
 *     INDEX. These give the number of bytes overall for the
 *     'data' field of $post_data and for each of these components.
 * @param array $post_data data to be uploaded to the queue server web app
 */
function uploadCrawlData($queue_server, $byte_counts, &$post_data)
{
    $post_data['fetcher_peak_memory'] = memory_get_peak_usage();
    $post_data['byte_counts'] = webencode(serialize($byte_counts));
    $len = strlen($post_data['data']);
    $max_len = $this->post_max_size - 10 * 1024; // non-data post vars < 10K
    $post_data['num_parts'] = ceil($len / $max_len);
    $num_parts = $post_data['num_parts'];
    $data =& $post_data['data'];
    unset($post_data['data']);
    $post_data['hash_data'] = crawlHash($data);
    $offset = 0;
    for ($i = 1; $i <= $num_parts; $i++) {
        $time = time();
        $session = md5($time . AUTH_KEY);
        $post_data['time'] = $time;
        $post_data['session'] = $session;
        $post_data['part'] = substr($data, $offset, $max_len);
        $post_data['hash_part'] = crawlHash($post_data['part']);
        $post_data['current_part'] = $i;
        $offset += $max_len;
        $part_len = strlen($post_data['part']);
        crawlLog("Sending Queue Server Part {$i} of {$num_parts}...");
        crawlLog("...sending about {$part_len} bytes.");
        $sleep = false;
        do {
            if ($sleep == true) {
                crawlLog("Trouble sending to the scheduler at url:");
                crawlLog($queue_server);
                crawlLog("Response was:");
                crawlLog("{$info_string}");
                $info = @unserialize($info_string);
                $time = time();
                $session = md5($time . AUTH_KEY);
                $post_data['time'] = $time;
                $post_data['session'] = $session;
                if (isset($info[self::STATUS]) &&
                    $info[self::STATUS] == self::REDO_STATE) {
                    crawlLog("Server requested last item to be re-sent...");
                    if (isset($info[self::SUMMARY])) {
                        crawlLog($info[self::SUMMARY]);
                    }
                    crawlLog("Trying again in 5 seconds...");
                } else {
                    crawlLog("Trying again in 5 seconds. You might want");
                    crawlLog("to check the queue server url and server");
                    crawlLog("key. Queue Server post_max_size is:" .
                        $this->post_max_size);
                }
                if ($i == 1 && !defined('FORCE_SMALL') &&
                    $this->post_max_size > 1000000) {
                    /* maybe the server has limited memory and too high a
                       post_max_size */
                    crawlLog("Using smaller post size to see if it helps");
                    define('FORCE_SMALL', true);
                    $this->post_max_size = 1000000;
                    $info[self::POST_MAX_SIZE] = 1000001;
                        // set to small value before trying again
                }
                sleep(5);
            }
            $sleep = true;
            $info_string = FetchUrl::getPage($queue_server, $post_data,
                true);
            $info = unserialize(trim($info_string));
            if (isset($info[self::LOGGING])) {
                crawlLog("Messages from Fetch Controller:");
                crawlLog($info[self::LOGGING]);
            }
            if (isset($info[self::POST_MAX_SIZE]) &&
                $this->post_max_size > $info[self::POST_MAX_SIZE]) {
                if (!defined('FORCE_SMALL')) {
                    crawlLog("post_max_size has changed. Was " .
                        "{$this->post_max_size}. Now is " .
                        $info[self::POST_MAX_SIZE] . ".");
                    $this->post_max_size = $info[self::POST_MAX_SIZE];
                } else {
                    crawlLog("...Using Force Small Rule on Server Posting");
                }
                if ($max_len > $this->post_max_size) {
                    crawlLog("Restarting upload...");
                    if (isset($post_data["resized_once"])) {
                        crawlLog("Restart failed");
                        return;
                    }
                    $post_data['data'] = $data;
                    $post_data["resized_once"] = true;
                    return $this->uploadCrawlData($queue_server,
                        $byte_counts, $post_data);
                }
            }
        } while (!isset($info[self::STATUS]) ||
            $info[self::STATUS] != self::CONTINUE_STATE);
        crawlLog("Queue Server info response code: " .
            $info[self::STATUS]);
        crawlLog("Queue Server's crawl time is: " .
            $info[self::CRAWL_TIME]);
        crawlLog("Web Server peak memory usage: " .
            $info[self::MEMORY_USAGE]);
        crawlLog("This fetcher peak memory usage: " .
            memory_get_peak_usage());
    }
    crawlLog("Updated Queue Server, sent approximately" .
        " {$byte_counts['TOTAL']} bytes.");
}
/**
 * Checks to see whether there are more pages to extract from the current
 * archive, and if so returns the next batch to the requesting fetcher. The
 * iteration progress is automatically saved on each call to nextPages, so
 * that the next fetcher will get the next batch of pages. If there is no
 * current archive to iterate over, or the iterator has reached the end of
 * the archive, then indicate that there is no more data by setting the
 * status to NO_DATA_STATE.
 */
function archiveSchedule()
{
    $view = "fetch";
    $request_start = time();
    if (isset($_REQUEST['crawl_time'])) {
        $crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'),
            0, TIMESTAMP_LEN);
    } else {
        $crawl_time = 0;
    }
    $messages_filename = CRAWL_DIR . '/schedules/name_server_messages.txt';
    $lock_filename = WORK_DIRECTORY . "/schedules/name_server_lock.txt";
    if ($crawl_time > 0 && file_exists($messages_filename)) {
        $fetch_pages = true;
        $info = unserialize(file_get_contents($messages_filename));
        if ($info[self::STATUS] == 'STOP_CRAWL') {
            /* The stop crawl message gets created by the admin_controller
               when the "stop crawl" button is pressed. */
            if (file_exists($messages_filename)) {
                unlink($messages_filename);
            }
            if (file_exists($lock_filename)) {
                unlink($lock_filename);
            }
            $fetch_pages = false;
            $info = array();
        }
        $this->checkRestart(self::ARCHIVE_CRAWL);
    } else {
        $fetch_pages = false;
        $info = array();
    }
    $pages = array();
    $got_lock = true;
    if (file_exists($lock_filename)) {
        $lock_time = unserialize(file_get_contents($lock_filename));
        if ($request_start - $lock_time < ini_get('max_execution_time')) {
            $got_lock = false;
        }
    }
    $chunk = false;
    $archive_iterator = NULL;
    if ($fetch_pages && $got_lock) {
        file_put_contents($lock_filename, serialize($request_start));
        if ($info[self::ARC_DIR] == "MIX" ||
            file_exists($info[self::ARC_DIR])) {
            $iterate_timestamp = $info[self::CRAWL_INDEX];
            $result_timestamp = $crawl_time;
            $result_dir = WORK_DIRECTORY . "/schedules/" .
                self::name_archive_iterator . $crawl_time;
            $arctype = $info[self::ARC_TYPE];
            $iterator_name = $arctype . "Iterator";
            if (!class_exists($iterator_name)) {
                $info['ARCHIVE_BUNDLE_ERROR'] =
                    "Invalid bundle iterator: '{$iterator_name}'";
            } else {
                if ($info[self::ARC_DIR] == "MIX") {
                    //recrawl of crawl mix case
                    $archive_iterator = new $iterator_name(
                        $iterate_timestamp, $result_timestamp);
                } else {
                    //any other archive crawl except web archive recrawls
                    $archive_iterator = new $iterator_name(
                        $iterate_timestamp, $info[self::ARC_DIR],
                        $result_timestamp, $result_dir);
                }
            }
        }
        $pages = false;
        if ($archive_iterator && !$archive_iterator->end_of_iterator) {
            if (generalIsA($archive_iterator,
                "TextArchiveBundleIterator")) {
                $pages = $archive_iterator->nextChunk();
                $chunk = true;
            } else {
                $pages = $archive_iterator->nextPages(ARCHIVE_BATCH_SIZE);
            }
        }
        if (file_exists($lock_filename)) {
            unlink($lock_filename);
        }
    }
    if ($archive_iterator && $archive_iterator->end_of_iterator) {
        $info[self::END_ITERATOR] = true;
    }
    if (($chunk && $pages) || ($pages && !empty($pages))) {
        $pages_string = webencode(gzcompress(serialize($pages)));
    } else {
        $info[self::STATUS] = self::NO_DATA_STATE;
        $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
        $pages = array();
        $pages_string = webencode(gzcompress(serialize($pages)));
    }
    $info[self::DATA] = $pages_string;
    $info_string = serialize($info);
    $data['MESSAGE'] = $info_string;
    $this->displayView($view, $data);
}
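/* Illustrative sketch (hypothetical fetcher side): unpacking the response
   built by archiveSchedule() above, reversing the
   webencode(gzcompress(serialize(...))) chain. Assumes webdecode() and the
   CrawlConstants interface are loaded, and that $info_string holds the
   MESSAGE text from the response. */
$info = unserialize($info_string);
if (!isset($info[CrawlConstants::STATUS]) ||
    $info[CrawlConstants::STATUS] != CrawlConstants::NO_DATA_STATE) {
    $pages = unserialize(gzuncompress(
        webdecode($info[CrawlConstants::DATA])));
}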
/**
 * Stores in the cache a key-value pair
 *
 * Only when a key is set is there a check for whether to invalidate
 * a cache bin. A bin is deleted as invalid if it was last expired more
 * than MIN_QUERY_CACHE_TIME seconds ago and it holds more than
 * self::MAX_FILES_IN_A_BIN items, or if it holds more than ten times
 * self::MAX_FILES_IN_A_BIN items regardless of age.
 *
 * @param string $key to associate with value
 * @param mixed $value to store
 */
function set($key, $value)
{
    $checksum_block = $this->checksum($key);
    $checksum_dir = $this->dir_name . "/{$checksum_block}";
    if (file_exists("{$checksum_dir}/last_expired.txt")) {
        $data = unserialize(
            file_get_contents("{$checksum_dir}/last_expired.txt"));
    }
    if (!isset($data['last_expired'])) {
        $data = array('last_expired' => time(), 'count' => 0);
    }
    if ((time() - $data['last_expired'] > MIN_QUERY_CACHE_TIME &&
        $data['count'] > self::MAX_FILES_IN_A_BIN) ||
        $data['count'] > 10 * self::MAX_FILES_IN_A_BIN) {
        $db_class = ucfirst(DBMS) . "Manager";
        $db = new $db_class();
        $db->unlinkRecursive($checksum_dir);
    }
    $data['count']++;
    if (!file_exists($checksum_dir)) {
        mkdir($checksum_dir);
        $data['last_expired'] = time();
    }
    file_put_contents("{$checksum_dir}/last_expired.txt",
        serialize($data));
    $cache_file = "{$checksum_dir}/" . webencode($key);
    file_put_contents($cache_file, serialize($value));
}
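/* Illustrative sketch (hypothetical companion to set(); the actual cache
   class defines its own lookup): a get() just reverses the path scheme
   used above, hashing the key to a bin directory and webencode-ing the
   key itself into a file name. */
function get($key)
{
    $checksum_dir = $this->dir_name . "/" . $this->checksum($key);
    $cache_file = "{$checksum_dir}/" . webencode($key);
    if (!file_exists($cache_file)) {
        return false;
    }
    return unserialize(file_get_contents($cache_file));
}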
/**
 * Gets the next at most $num many docs from the iterator. It might return
 * fewer than $num many documents if the partition changes or the end of
 * the bundle is reached.
 *
 * @param int $num number of docs to get
 * @param bool $no_process do not do any processing on page data
 * @return array associative arrays for $num pages
 */
function nextPages($num, $no_process = false)
{
    $pages = array();
    $page_count = 0;
    $db = $this->db;
    $query = "{$this->sql} " . $db->limitOffset($this->limit, $num);
    $result = $db->execute($query);
    while ($row = $db->fetchArray($result)) {
        crawlTimeoutLog("..Still getting pages from archive iterator. " .
            "At %s of %s", $page_count, $num);
        $page = "";
        foreach ($row as $key => $value) {
            $page .= "{$key}{$this->field_value_separator}" .
                "{$value}{$this->column_separator}";
        }
        if ($no_process) {
            $pages[] = $page;
        } else {
            $site = array();
            $site[self::HEADER] = "database_bundle_iterator extractor";
            $site[self::IP_ADDRESSES] = array("0.0.0.0");
            $site[self::TIMESTAMP] = date("U", time());
            $site[self::TYPE] = "text/plain";
            $site[self::PAGE] = $page;
            $site[self::HASH] = FetchUrl::computePageHash($page);
            $site[self::URL] = "record:" . webencode($site[self::HASH]);
            $site[self::HTTP_CODE] = 200;
            $site[self::ENCODING] = $this->encoding;
            $site[self::SERVER] = "unknown";
            $site[self::SERVER_VERSION] = "unknown";
            $site[self::OPERATING_SYSTEM] = "unknown";
            $site[self::WEIGHT] = 1;
            $pages[] = $site;
        }
        $page_count++;
    }
    $this->limit += $page_count;
    if ($page_count < $num) {
        $this->end_of_iterator = true;
    }
    $this->saveCheckpoint();
    return $pages;
}
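/* Illustrative sketch (not part of the original source): paging through a
   whole table with the method above. Because nextPages() adds $page_count
   to $this->limit and saves a checkpoint on each call, repeated calls walk
   the result set in limit/offset windows. $iterator is a hypothetical,
   already-configured instance of this iterator class. */
while (!$iterator->end_of_iterator) {
    $pages = $iterator->nextPages(100);
    foreach ($pages as $page) {
        // each $page is a summary array keyed by CrawlConstants constants
    }
}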
/**
 * This method is invoked by other ParallelModel (@see CrawlModel
 * for examples) methods when they want to have their method performed
 * on an array of other Yioop instances. The results returned can then
 * be aggregated. The invocation sequence is:
 * crawlModelMethodA invokes execMachines with a list of
 * urls of other Yioop instances. execMachines makes REST requests of
 * those instances of the given command and optional arguments.
 * This request would be handled by a CrawlController which in turn
 * calls crawlModelMethodA on the given Yioop instance, serializes the
 * result, and gives it back to execMachines and then back to the
 * originally calling function.
 *
 * @param string $command the ParallelModel method to invoke on the remote
 *     Yioop instances
 * @param array $machine_urls machines to invoke this command on
 * @param string $arg additional arguments to be passed to the remote
 *     machine
 * @param int $num_machines the integer to be used in calculating partition
 * @return array a list of outputs from each machine that was called.
 */
function execMachines($command, $machine_urls, $arg = NULL,
    $num_machines = 0)
{
    if ($num_machines == 0) {
        $num_machines = count($machine_urls);
    }
    $time = time();
    $session = md5($time . AUTH_KEY);
    $query = "c=crawl&a={$command}&time={$time}&session={$session}" .
        "&num={$num_machines}";
    if ($arg != NULL) {
        $arg = webencode($arg);
        $query .= "&arg={$arg}";
    }
    $sites = array();
    $post_data = array();
    $i = 0;
    foreach ($machine_urls as $index => $machine_url) {
        $sites[$i][CrawlConstants::URL] = $machine_url;
        $post_data[$i] = $query . "&i={$index}";
        $i++;
    }
    $outputs = array();
    if (count($sites) > 0) {
        $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
            self::PAGE, true, $post_data);
    }
    return $outputs;
}
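/* Illustrative sketch (hypothetical caller): fanning a method out over
   queue servers with execMachines() and decoding the replies. The method
   name, the argument shape, and the webdecode/unserialize step are
   assumptions about how a typical ParallelModel method would consume
   these outputs; the exact payload depends on the command invoked. */
$outputs = $this->execMachines("crawlStalled", $machine_urls,
    serialize($timestamp));
foreach ($outputs as $output) {
    $result = unserialize(webdecode($output[CrawlConstants::PAGE]));
    // aggregate $result across machines
}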
/**
 * Add the provided urls to the schedule directory of URLs that will
 * be crawled
 *
 * @param string $timestamp Unix timestamp of crawl to add to schedule of
 * @param array $inject_urls urls to be added to the schedule of
 *     the active crawl
 * @param array $machine_urls an array of urls of yioop queue servers
 * @return bool true if urls were added to the schedule; false otherwise
 */
function injectUrlsCurrentCrawl($timestamp, $inject_urls,
    $machine_urls = NULL)
{
    if ($machine_urls != NULL &&
        !$this->isSingleLocalhost($machine_urls, $timestamp)) {
        $this->execMachines("injectUrlsCurrentCrawl", $machine_urls,
            serialize(array($timestamp, $inject_urls)));
        return;
    }
    $dir = CRAWL_DIR . "/schedules/" . self::schedule_data_base_name .
        $timestamp;
    if (!file_exists($dir)) {
        mkdir($dir);
        chmod($dir, 0777);
    }
    $day = floor($timestamp / ONE_DAY) - 1;
        // want before all other schedules, so it will be executed next
    $dir .= "/{$day}";
    if (!file_exists($dir)) {
        mkdir($dir);
        chmod($dir, 0777);
    }
    $count = count($inject_urls);
    if ($count > 0) {
        $now = time();
        $schedule_data = array();
        $schedule_data[self::SCHEDULE_TIME] = $timestamp;
        $schedule_data[self::TO_CRAWL] = array();
        for ($i = 0; $i < $count; $i++) {
            $url = $inject_urls[$i];
            $hash = crawlHash($now . $url);
            $schedule_data[self::TO_CRAWL][] = array($url, 1, $hash);
        }
        $data_string = webencode(gzcompress(serialize($schedule_data)));
        $data_hash = crawlHash($data_string);
        file_put_contents($dir . "/At1From127-0-0-1" .
            "WithHash{$data_hash}.txt", $data_string);
        return true;
    }
    return false;
}
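/* Illustrative sketch (not part of the original source): a typical call.
   The timestamp and urls are made-up values; each injected url gets
   weight 1, and the At1 file name sorts it ahead of ordinary schedules.
   $crawl_model is a hypothetical instance of the model above. */
$crawl_model->injectUrlsCurrentCrawl(1369754208,
    array("https://www.example.com/", "https://www.example.org/"));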
/**
 * Creates a fresh array from an existing page summary array, and augments
 * it with extra data relevant to the labeling interface on the client.
 *
 * @param array $page original page summary array
 * @param float $score classification score (estimated by the Naive Bayes
 *     text classification algorithm) for $page
 * @param float $disagreement disagreement score computed for $page
 * @param int $crawl_time index the page came from
 * @param string $keywords query supplied to the crawl mix used to find
 *     $page
 * @return array reduced page summary structure containing only the
 *     information that the client needs to display a summary of the page
 */
function prepareUnlabelledDocument($page, $score, $disagreement,
    $crawl_time, $keywords)
{
    $phrase_model = $this->model("phrase");
    // Highlight the query keywords, if any.
    $disjunct_phrases = explode("|", $keywords);
    $words = array();
    foreach ($disjunct_phrases as $disjunct_phrase) {
        list($word_struct, $format_words) =
            $phrase_model->parseWordStructConjunctiveQuery(
                $disjunct_phrase);
        $words = array_merge($words, $format_words);
    }
    $title = $phrase_model->boldKeywords($page[self::TITLE], $words);
    $description = $phrase_model->getSnippets(
        strip_tags($page[self::DESCRIPTION]), $words, 400);
    $description = $phrase_model->boldKeywords($description, $words);
    $cache_link = "?c=search&a=cache" .
        "&q=" . urlencode($keywords) .
        "&arg=" . urlencode($page[self::URL]) .
        "&its=" . $crawl_time;
    /* Note that the confidence is a transformation of the score that
       converts it into a value between 0 and 1, where it's 0 if the score
       was exactly 0.5, and increases toward 1 as the score either
       increases toward 1 or decreases toward 0. */
    return array(
        'title' => $title,
        'url' => $page[self::URL],
        'key' => webencode(Classifier::makeKey($page)),
        'cache_link' => $cache_link,
        'description' => $description,
        'score' => $score,
        'positive' => $score >= 0.5 ? 1 : 0,
        'confidence' => abs($score - 0.5) / 0.5,
        'disagreement' => $disagreement);
}
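/* Illustrative sketch (not part of the original source): the confidence
   transform used above, applied to some made-up scores. A maximally
   uncertain score of 0.5 yields confidence 0, while 0.9 and 0.1 both
   yield 0.8, differing only in the 'positive' flag. */
foreach (array(0.5, 0.9, 0.1) as $score) {
    $positive = $score >= 0.5 ? 1 : 0;     // 1, 1, 0
    $confidence = abs($score - 0.5) / 0.5; // 0, 0.8, 0.8
}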
/**
 * Handles the request to get the array of news feed sources which hash to
 * a particular value, i.e., those whose hash index matches the index of
 * the requesting machine's hashed url/name within the array of available
 * machine hashes.
 */
function getNewsSources()
{
    if (!isset($_REQUEST["arg"])) {
        return;
    }
    $source_model = $this->model("source");
    $current_machine = $this->clean(webdecode($_REQUEST["arg"]),
        "string");
    $machine_hashes = $source_model->getMachineHashUrls();
    $machine_index_match = array_search($current_machine,
        $machine_hashes);
    if ($machine_index_match === false) {
        echo webencode(serialize(array()));
        return;
    }
    $num_machines = count($machine_hashes);
    $pre_feeds = $source_model->getMediaSources("rss");
    $pre_feeds = array_merge($pre_feeds,
        $source_model->getMediaSources("html"));
    if (!$pre_feeds) {
        return false;
    }
    $feeds = array();
    foreach ($pre_feeds as $pre_feed) {
        if (!isset($pre_feed['NAME'])) {
            continue;
        }
        $hash_int = unpack("N", crawlHash($pre_feed['NAME']));
        if (!isset($hash_int[1])) {
            continue;
        }
        $hash_index = $hash_int[1] % $num_machines;
        if ($machine_index_match != $hash_index) {
            continue;
        }
        if ($pre_feed['TYPE'] == 'html') {
            list($pre_feed['CHANNEL_PATH'], $pre_feed['ITEM_PATH'],
                $pre_feed['TITLE_PATH'], $pre_feed['DESCRIPTION_PATH'],
                $pre_feed['LINK_PATH']) = explode("###",
                html_entity_decode($pre_feed['AUX_INFO']));
        }
        $feeds[] = $pre_feed;
    }
    echo webencode(serialize($feeds));
}
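/* Illustrative sketch (not part of the original source): the assignment
   rule used above. Each feed name hashes to a 32-bit unsigned int
   (unpack("N", ...) reads the first four bytes of the hash), and the feed
   belongs to the machine whose index equals that int mod the machine
   count, so each feed is fetched by exactly one machine. The name and
   count here are made-up; assumes crawlHash() is loaded. */
$num_machines = 3;
$hash_int = unpack("N", crawlHash("Example Feed"));
$hash_index = $hash_int[1] % $num_machines; // always one of 0, 1, 2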
/**
 * Given a list of class labels, returns an array mapping each class label
 * to an array of data necessary for initializing a classifier for that
 * label. This static method is used to prepare a collection of classifiers
 * for distribution to fetchers, so that each fetcher can classify pages as
 * it downloads them. The only extra properties passed along in addition to
 * the base classification data are the final features and final algorithm,
 * both necessary for classifying new documents.
 *
 * @param array $labels flat array of class labels for which to load data
 * @return array associative array mapping class labels to arrays of data
 *     necessary for initializing the associated classifier
 */
static function loadClassifiersData($labels)
{
    $fields = array('classifier', 'final_features', 'final_algorithm');
    $classifiers_data = array();
    foreach ($labels as $label) {
        $basedir = WORK_DIRECTORY . "/classifiers/{$label}";
        $classifier_data = array();
        foreach ($fields as $field) {
            $filename = "{$basedir}/{$field}.txt";
            if (file_exists($filename)) {
                /* The data is web-encoded because it will be sent in an
                   HTTP response to each fetcher as it prepares for a new
                   crawl. */
                $classifier_data[$field] =
                    webencode(file_get_contents($filename));
            } else {
                $classifier_data = false;
                break;
            }
        }
        $classifiers_data[$label] = $classifier_data;
    }
    return $classifiers_data;
}
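/* Illustrative sketch (hypothetical fetcher side): consuming the output
   of loadClassifiersData() after it arrives over HTTP. Field names mirror
   the $fields array above; the unserialize step is an assumption about
   how the 'classifier' payload would be revived. */
foreach ($classifiers_data as $label => $classifier_data) {
    if ($classifier_data === false) {
        continue; // one or more classifier files were missing
    }
    $classifier = unserialize(
        webdecode($classifier_data['classifier']));
}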