Code Example #1
File: string_array.php Project: yakar/yioop
 /**
  * Load a StringArray from a file
  *
  * @param string $fname the name of the file to load the StringArray from
  * @return object the PersistentStructure loaded
  */
 static function load($fname)
 {
     $fh = fopen($fname, "rb");
     $array_size = unpackInt(fread($fh, 4));
     $array = fread($fh, $array_size);
     $object = unserialize(fread($fh, filesize($fname) - 4 - $array_size));
     $object->string_array =& $array;
     fclose($fh);
     return $object;
 }
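For context, the byte layout this loader expects (a 4-byte packed length, the raw string array data of that length, then the serialized remainder of the object) could be produced by a save routine along the following lines. This is only an illustrative sketch, not yioop's actual save code; it assumes a packInt() helper that is the inverse of the unpackInt() call above.

 function saveStringArraySketch($fname, $string_array, $object)
 {
     // 4-byte packed length of the raw string array data (packInt() is
     // assumed to be the inverse of unpackInt())
     $data = packInt(strlen($string_array));
     // the raw string array bytes themselves
     $data .= $string_array;
     // the rest of the PersistentStructure, serialized
     $data .= serialize($object);
     file_put_contents($fname, $data);
 }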
Code Example #2
File: queue_server.php Project: yakar/yioop
 /**
  * Adds the summary and index data in $file to the summary bundle and word index
  *
  * @param string $file file containing web page summaries and a mini-inverted
  *     index for their content
  * @param bool $blocking this method might be called by the indexer
  *     subcomponent when a merge tier phase is ongoing, to allow for
  *     other processing to occur. If so, we don't want a regress
  *     where the indexer calls this code, which calls the indexer, etc.
  *     If the blocking flag is set, the indexer subcomponent won't
  *     be called
  */
 function processIndexArchive($file, $blocking)
 {
     static $blocked = false;
     if ($blocking && $blocked) {
         crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. B");
         return;
     }
     if (!$blocking) {
         $blocked = false;
     }
     crawlLog("{$this->server_name} is starting to process index data," . " memory usage: " . memory_get_usage() . "...");
     crawlLog("Indexer: Processing index data in {$file}...");
     $start_time = microtime();
     $start_total_time = microtime();
     $pre_sites = webdecode(file_get_contents($file));
     $len_urls = unpackInt(substr($pre_sites, 0, 4));
     $seen_urls_string = substr($pre_sites, 4, $len_urls);
     $pre_sites = substr($pre_sites, 4 + $len_urls);
     $sites[self::SEEN_URLS] = array();
     $pos = 0;
     $num = 0;
     $bad = false;
     $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER * (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
     while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
          crawlTimeoutLog("..Indexer still processing index data at position" . " %s out of %s", $pos, $len_urls);
         $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
         if ($len_site > 2 * $this->page_range_request) {
             crawlLog("Indexer: Site string too long, {$len_site}," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $pos += 4;
         $site_string = substr($seen_urls_string, $pos, $len_site);
         $pos += strlen($site_string);
         $tmp = unserialize(gzuncompress($site_string));
         if (!$tmp || !is_array($tmp)) {
             crawlLog("Compressed array null," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $sites[self::SEEN_URLS][] = $tmp;
         $num++;
     }
     if ($num > $max_batch_sites_and_links * SEEN_URLS_BEFORE_UPDATE_SCHEDULER || $bad) {
         crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " . "may be corrupt so skipping this file.");
         crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
         unlink($file);
         return;
     }
     crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard", $pre_sites);
     unset($pre_sites);
     crawlLog("B. Indexer Load Sent shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $start_time = microtime();
     //do deduplication of summaries
     if (isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) {
         $seen_sites = $sites[self::SEEN_URLS];
         $seen_sites = array_values($seen_sites);
         unset($sites[self::SEEN_URLS]);
         $num_seen = count($seen_sites);
         crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
     } else {
         $num_seen = 0;
     }
     $visited_urls_count = 0;
     $recent_urls_count = 0;
     $recent_urls = array();
     for ($i = 0; $i < $num_seen; $i++) {
         $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL], true);
         $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
         if (strcmp("url", $link_url_parts[0]) == 0) {
             $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
             $seen_sites[$i][self::HASH_URL] = crawlHash($link_url_parts[1], true) . crawlHash($seen_sites[$i][self::URL], true) . $reftype . substr(crawlHash(UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
             $seen_sites[$i][self::IS_DOC] = false;
         } else {
             $seen_sites[$i][self::IS_DOC] = true;
             $visited_urls_count++;
             array_push($recent_urls, $seen_sites[$i][self::URL]);
             if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                 array_shift($recent_urls);
             }
             $recent_urls_count++;
         }
     }
     if (isset($sites[self::INVERTED_INDEX])) {
         $index_shard =& $sites[self::INVERTED_INDEX];
         $generation = $this->index_archive->initGenerationToAdd($index_shard->num_docs, $this, $blocking);
         if ($generation == -1) {
             crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. A");
             $blocked = true;
             return;
         }
         $summary_offsets = array();
         if (isset($seen_sites)) {
             $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count);
             foreach ($seen_sites as $site) {
                 if ($site[self::IS_DOC]) {
                      // a document summary, not a link record
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $hash = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                 } else {
                     $hash = $site[self::HASH_URL];
                 }
                 $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
             }
             unset($seen_sites);
         }
         crawlLog("C. Indexer init local shard, store " . "Summaries memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
          // add summary offset info to the inverted index data
         $index_shard->changeDocumentOffsets($summary_offsets);
         crawlLog("D. Indexer Update shard offsets. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         $this->index_archive->addIndexData($index_shard);
         $this->index_dirty = true;
     }
     crawlLog("E. Indexer Add index shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
     if (isset($recent_urls)) {
         $sites[self::RECENT_URLS] =& $recent_urls;
         $this->writeCrawlStatus($sites);
     }
     if (file_exists($file)) {
          // Haven't tracked down why yet, but this file can get deleted twice, which gives a warning
         unlink($file);
     }
 }
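To make the parsing above easier to follow, here is an illustrative sketch of how a producer could lay out the blob that processIndexArchive() unpacks: a 4-byte packed length for the seen-URLs block, the block itself (a sequence of records, each a 4-byte packed length followed by a gzcompressed serialized summary), and finally the serialized fetcher shard. This is not yioop's actual fetcher code; packInt() and webencode() are assumed to be the inverses of unpackInt() and webdecode().

 function buildIndexArchiveBlobSketch($summaries, $serialized_shard)
 {
     $seen_urls_string = "";
     foreach ($summaries as $summary) {
         // each record: 4-byte packed length, then the gzcompressed
         // serialized summary
         $site_string = gzcompress(serialize($summary));
         $seen_urls_string .= packInt(strlen($site_string)) . $site_string;
     }
     // 4-byte length of the seen-URLs block, the block itself, then the
     // shard data, all webencoded for transport
     return webencode(packInt(strlen($seen_urls_string)) .
         $seen_urls_string . $serialized_shard);
 }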
Code Example #3
File: fetcher.php Project: yakar/yioop
 /**
  * Get status, current crawl, crawl order, and new site information from
  * the queue_server.
  *
  * @return mixed array or bool. If we are doing a web crawl and still
  *     have pages to crawl, returns true; if the scheduler page fails to
  *     download, returns false; otherwise, returns an array of info from
  *     the scheduler.
  */
 function checkScheduler()
 {
     $prefix = $this->fetcher_num . "-";
     $info = array();
     $to_crawl_count = count($this->to_crawl);
     $to_crawl_again_count = count($this->to_crawl_again);
     if ($this->recrawl_check_scheduler) {
         crawlLog("Archive Crawl checking ... Recrawl.");
     }
     if ((count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) && !$this->recrawl_check_scheduler) {
         crawlLog("  Current to crawl count:" . $to_crawl_count);
         crawlLog("  Current to crawl try again count:" . $to_crawl_again_count);
         crawlLog("So not checking scheduler.");
         return true;
     }
     $this->selectCurrentServerAndUpdateIfNeeded(false);
     $this->recrawl_check_scheduler = false;
     $queue_server = $this->queue_servers[$this->current_server];
     crawlLog("Checking  {$queue_server} for a new schedule.");
     // hosts with error counts cleared with each schedule
     $this->hosts_with_errors = array();
     $start_time = microtime();
     $time = time();
     $session = md5($time . AUTH_KEY);
     $request = $queue_server . "?c=fetch&a=schedule&time={$time}&session={$session}" . "&robot_instance=" . $prefix . ROBOT_INSTANCE . "&machine_uri=" . WEB_URI . "&crawl_time=" . $this->crawl_time . "&check_crawl_time=" . $this->check_crawl_time;
     $info_string = FetchUrl::getPage($request, NULL, true);
     crawlLog("Making schedule request: " . $request);
     if ($info_string === false) {
         crawlLog("The request failed!!!!");
         return false;
     }
     $info_string = trim($info_string);
     $tok = strtok($info_string, "\n");
     $info = unserialize(base64_decode($tok));
     $this->setCrawlParamsFromArray($info);
     if (isset($info[self::SITES])) {
         $tok = strtok("\n");
         //skip meta info
         $this->to_crawl = array();
         while ($tok !== false) {
             $string = base64_decode($tok);
             $weight = unpackFloat(substr($string, 0, 4));
             $delay = unpackInt(substr($string, 4, 4));
             $url = substr($string, 8);
             $this->to_crawl[] = array($url, $weight, $delay);
             $tok = strtok("\n");
         }
         $dir = CRAWL_DIR . "/schedules";
         file_put_contents("{$dir}/{$prefix}" . self::fetch_batch_name . "{$this->crawl_time}.txt", serialize($this->to_crawl));
         $this->db->setWorldPermissionsRecursive("{$dir}/{$prefix}" . self::fetch_batch_name . "{$this->crawl_time}.txt");
         unset($info[self::SITES]);
         file_put_contents("{$dir}/{$prefix}" . self::fetch_crawl_info . "{$this->crawl_time}.txt", serialize($info));
     }
     crawlLog("Time to check Scheduler " . changeInMicrotime($start_time));
     return $info;
 }
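Each schedule line after the meta-info token is a base64 string whose first 4 bytes are a packed float weight, the next 4 bytes a packed int crawl delay, and the remaining bytes the URL. A hedged sketch of how the queue_server side might encode one such entry is shown below; encodeScheduleEntrySketch(), packFloat(), and packInt() are illustrative names, with the latter two assumed to be the inverses of unpackFloat() and unpackInt().

 function encodeScheduleEntrySketch($url, $weight, $delay)
 {
     // 4-byte packed weight, 4-byte packed crawl delay, then the URL itself
     $record = packFloat($weight) . packInt($delay) . $url;
     // one base64-encoded record per newline-separated schedule line
     return base64_encode($record) . "\n";
 }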
Code Example #4
File: non_compressor.php Project: yakar/yioop
 /**
  * Used to uncompress an int from a fixed length string in the format of
  * the compression algorithm underlying the compressor. Since this
  * compressor doesn't compress, we just use unpack.
  *
  * @param string $my_compressed_int the fixed length string containing
  *     the packed int to extract
  * @return int the integer contained in that string
  */
 function uncompressInt($my_compressed_int)
 {
     return unpackInt($my_compressed_int);
 }
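For reference, unpackInt() itself is essentially a thin wrapper around PHP's unpack(). A minimal sketch, assuming 4-byte big-endian integers (PHP's "N" pack format), might look like the following; yioop's actual helper may differ in details such as error handling.

 function unpackIntSketch($str)
 {
     // reject anything that is not at least a 4-byte string
     if (!is_string($str) || strlen($str) < 4) {
         return false;
     }
     // "N" = unsigned 32-bit big-endian; unpack() returns a 1-based array
     $tmp = unpack("N", $str);
     return $tmp[1];
 }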
Code Example #5
File: web_queue_bundle.php Project: yakar/yioop
 /**
  * Checks if the given $url is allowed to be crawled based on stored
  * robots.txt info.
  * @param string $url to check
  * @return bool whether it was allowed or not
  */
 function checkRobotOkay($url)
 {
      // local cache of recently looked-up robots.txt info
     static $robot_cache = array();
     $cache_size = 2000;
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     $path = urldecode($path);
     $key = crawlHash($host, true);
     if (isset($robot_cache[$key])) {
         $robot_object = $robot_cache[$key];
     } else {
         $data = $this->robot_table->lookup($key);
         $offset = unpackInt($data);
         $robot_object = $this->robot_archive->getObjects($offset, 1);
         $robot_cache[$key] = $robot_object;
         if (count($robot_cache) > $cache_size) {
             array_shift($robot_cache);
         }
     }
     $robot_paths = isset($robot_object[0][1]) ? $robot_object[0][1] : array();
     //these should have been urldecoded in RobotProcessor
     $robots_okay = true;
     $robots_not_okay = false;
     if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
         $robots_not_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::DISALLOWED_SITES]);
         $robots_okay = !$robots_not_okay;
     }
     if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
         $robots_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::ALLOWED_SITES]);
     }
     return $robots_okay || !$robots_not_okay;
 }
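An illustrative usage sketch: a crawl loop could consult checkRobotOkay() before scheduling candidate URLs. The variable names below are hypothetical; $web_queue is assumed to be a WebQueueBundle instance whose robot table and archive have already been populated.

 $candidate_urls = array(
     "http://www.example.com/about",
     "http://www.example.com/private/data"
 );
 foreach ($candidate_urls as $candidate_url) {
     if ($web_queue->checkRobotOkay($candidate_url)) {
         crawlLog("Robots okay, scheduling: " . $candidate_url);
     } else {
         crawlLog("Disallowed by robots.txt, skipping: " . $candidate_url);
     }
 }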