/**
 * Writes status information about the current crawl so that the webserver
 * app can use it for its display.
 *
 * Merges the on-disk status (if it belongs to this crawl), refreshes the
 * counts and the rolling one-hour history of visited-URL samples, then
 * serializes the result back to schedules/crawl_status.txt.
 *
 * @param array $sites contains the most recently crawled sites
 */
function writeCrawlStatus(&$sites)
{
    $crawl_status = array();
    $stat_file = CRAWL_DIR . "/schedules/crawl_status.txt";
    if (file_exists($stat_file)) {
        // NOTE(review): unserialize of a local file written by this process
        // family; presumably trusted, but worth confirming no other writer
        // exists since unserialize of tampered data is unsafe.
        $crawl_status = unserialize(file_get_contents($stat_file));
        if (!isset($crawl_status['CRAWL_TIME']) ||
            $crawl_status['CRAWL_TIME'] != $this->crawl_time) {
            $crawl_status = array(); // status of some other crawl
        }
    }
    $crawl_status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
    if (isset($sites[self::RECENT_URLS])) {
        $crawl_status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
    }
    $crawl_status['CRAWL_TIME'] = $this->crawl_time;
    // Pull current document counts and the crawl description out of the
    // index archive bundle for this crawl's timestamp
    $info_bundle = IndexArchiveBundle::getArchiveInfo(CRAWL_DIR . '/cache/' .
        self::index_data_base_name . $this->crawl_time);
    $index_archive_info = unserialize($info_bundle['DESCRIPTION']);
    $crawl_status['COUNT'] = $info_bundle['COUNT'];
    $now = time();
    // Prune hourly_crawl_data: pop (timestamp, visited-count) pairs off the
    // old end until one is within the last hour. Seed change_in_time just
    // above ONE_HOUR so the loop runs at least once when data exists.
    $change_in_time = ONE_HOUR + 1;
    while (count($this->hourly_crawl_data) > 0 && $change_in_time > ONE_HOUR) {
        $least_recent_hourly_pair = array_pop($this->hourly_crawl_data);
        $change_in_time = $now - $least_recent_hourly_pair[0];
    }
    // The last popped pair was still recent enough; put it back.
    // ($least_recent_hourly_pair is always set here: this branch is only
    // reachable after the loop body ran at least once.)
    if ($change_in_time <= ONE_HOUR) {
        $this->hourly_crawl_data[] = $least_recent_hourly_pair;
    }
    // Newest sample goes on the front of the history
    array_unshift($this->hourly_crawl_data,
        array($now, $info_bundle['VISITED_URLS_COUNT']));
    $crawl_status['VISITED_COUNT_HISTORY'] = $this->hourly_crawl_data;
    $crawl_status['VISITED_URLS_COUNT'] = $info_bundle['VISITED_URLS_COUNT'];
    $crawl_status['DESCRIPTION'] = $index_archive_info['DESCRIPTION'];
    $crawl_status['QUEUE_PEAK_MEMORY'] = memory_get_peak_usage();
    // LOCK_EX so the webserver app never reads a half-written status file;
    // 0777 presumably because webserver and crawler may run as different
    // users -- TODO(review) confirm this permissiveness is required
    file_put_contents($stat_file, serialize($crawl_status), LOCK_EX);
    chmod($stat_file, 0777);
    crawlLog("End checking for new URLs data memory usage" .
        memory_get_usage());
    crawlLog("The current crawl description is: " .
        $index_archive_info['DESCRIPTION']);
    crawlLog("Number of unique pages so far: " .
        $info_bundle['VISITED_URLS_COUNT']);
    crawlLog("Total urls extracted so far: " . $info_bundle['COUNT']);
    if (isset($sites[self::RECENT_URLS])) {
        crawlLog("Of these, the most recent urls are:");
        foreach ($sites[self::RECENT_URLS] as $url) {
            // Transliterate for the log; //IGNORE drops characters with no
            // ISO-8859-1 equivalent rather than failing
            crawlLog("URL: " . iconv("UTF-8", "ISO-8859-1//IGNORE", $url));
        }
    }
}
/**
 * Gets a list of all index archives of crawls that have been conducted
 *
 * @param bool $return_arc_bundles whether index bundles used for indexing
 *     arc or other archive bundles should be included in the list
 * @param bool $return_recrawls whether index archive bundles generated as
 *     a result of recrawling should be included in the result
 * @param array $machine_urls an array of urls of yioop queue servers
 * @param bool $cache whether to try to get/set the data to a cache file
 *
 * @return array available IndexArchiveBundle directories and
 *     their meta information this meta information includes the time of
 *     the crawl, its description, the number of pages downloaded, and the
 *     number of partitions used in storing the inverted index
 */
function getCrawlList($return_arc_bundles = false, $return_recrawls = false,
    $machine_urls = NULL, $cache = false)
{
    if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
        // Encode the two boolean filters as 0..3 so each combination gets
        // its own network cache file
        $arg = $return_arc_bundles && $return_recrawls ? 3 :
            ($return_recrawls ? 2 : ($return_arc_bundles ? 1 : 0));
        $cache_file = CRAWL_DIR . "/cache/" .
            self::network_crawllist_base_name . "{$arg}.txt";
        // Serve a cached aggregate list if it is under five minutes old
        if ($cache && file_exists($cache_file) &&
            filemtime($cache_file) + 300 > time()) {
            return unserialize(file_get_contents($cache_file));
        }
        $list_strings = $this->execMachines("getCrawlList", $machine_urls,
            $arg);
        $list = $this->aggregateCrawlList($list_strings);
        if ($cache) {
            file_put_contents($cache_file, serialize($list));
        }
        return $list;
    }
    $list = array();
    $dirs = glob(CRAWL_DIR . '/cache/' . self::index_data_base_name . '*',
        GLOB_ONLYDIR);
    foreach ($dirs as $dir) {
        $crawl = array();
        // The crawl timestamp is whatever follows the bundle base name in
        // the directory name
        $pre_timestamp = strstr($dir, self::index_data_base_name);
        $crawl['CRAWL_TIME'] = substr($pre_timestamp,
            strlen(self::index_data_base_name));
        $info = IndexArchiveBundle::getArchiveInfo($dir);
        $index_info = @unserialize($info['DESCRIPTION']);
        $crawl['DESCRIPTION'] = "";
        if (!$return_recrawls && isset($index_info[self::CRAWL_TYPE]) &&
            $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
            // Caller doesn't want recrawls; skip archive-crawl bundles
            continue;
        } else if ($return_recrawls && isset($index_info[self::CRAWL_TYPE]) &&
            $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) {
            $crawl['DESCRIPTION'] = "RECRAWL::";
        }
        $sched_path = CRAWL_DIR . '/schedules/' .
            self::schedule_data_base_name . $crawl['CRAWL_TIME'];
        $crawl['RESUMABLE'] = false;
        // A crawl is resumable if any schedule subfolder still contains a
        // data file (names beginning with "At")
        if (is_dir($sched_path)) {
            $sched_dir = opendir($sched_path);
            while (($name = readdir($sched_dir)) !== false) {
                $sub_path = "{$sched_path}/{$name}";
                if (!is_dir($sub_path) || $name == '.' || $name == '..') {
                    continue;
                }
                $sub_dir = opendir($sub_path);
                $i = 0;
                while (($sub_name = readdir($sub_dir)) !== false && $i < 5) {
                    // Don't let . / .. consume the small scan budget
                    if ($sub_name == '.' || $sub_name == '..') {
                        continue;
                    }
                    $i++; // fix: $i was never incremented, so the $i < 5
                          // bound was dead and every entry was scanned
                    if ($sub_name[0] == 'A' && $sub_name[1] == 't') {
                        $crawl['RESUMABLE'] = true;
                        break;
                    }
                }
                closedir($sub_dir); // fix: old break 2 leaked this handle
                if ($crawl['RESUMABLE']) {
                    break;
                }
            }
            closedir($sched_dir);
        }
        if (isset($index_info['DESCRIPTION'])) {
            $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
        }
        $crawl['VISITED_URLS_COUNT'] = isset($info['VISITED_URLS_COUNT']) ?
            $info['VISITED_URLS_COUNT'] : 0;
        $crawl['COUNT'] = isset($info['COUNT']) ? $info['COUNT'] : 0;
        $crawl['NUM_DOCS_PER_PARTITION'] =
            isset($info['NUM_DOCS_PER_PARTITION']) ?
            $info['NUM_DOCS_PER_PARTITION'] : 0;
        $crawl['WRITE_PARTITION'] = isset($info['WRITE_PARTITION']) ?
            $info['WRITE_PARTITION'] : 0;
        $list[] = $crawl;
    }
    if ($return_arc_bundles) {
        // Also list raw archive folders under /archives that carry an
        // arc_description.ini telling us their type
        $dirs = glob(CRAWL_DIR . '/archives/*', GLOB_ONLYDIR);
        foreach ($dirs as $dir) {
            $crawl = array();
            // crc32 of the path stands in for a crawl timestamp
            $crawl['CRAWL_TIME'] = crc32($dir);
            $crawl['DESCRIPTION'] = "ARCFILE::";
            $crawl['ARC_DIR'] = $dir;
            $ini_file = "{$dir}/arc_description.ini";
            if (!file_exists($ini_file)) {
                continue;
            } else {
                $ini = parse_ini_with_fallback($ini_file);
                $crawl['ARC_TYPE'] = $ini['arc_type'];
                $crawl['DESCRIPTION'] .= $ini['description'];
            }
            $crawl['VISITED_URLS_COUNT'] = 0;
            $crawl['COUNT'] = 0;
            $crawl['NUM_DOCS_PER_PARTITION'] = 0;
            $crawl['WRITE_PARTITION'] = 0;
            $list[] = $crawl;
        }
    }
    return $list;
}