Example #1
 /**
  * Checks that UrlParser::getScheme extracts the expected scheme from a url
  */
 function getSchemeTestCase()
 {
     $test_links = array(
         array("http://www.example.com/", "http", "Simple HTTP 1"),
         array("https://www.example.com/", "https", "Simple HTTPS 1"),
         array("gopher://www.example.com/", "gopher", "Simple GOPHER 1"),
         array("./", "http", "Simple HTTP 2")
     );
     foreach ($test_links as $test_link) {
         $result = UrlParser::getScheme($test_link[0]);
         $this->assertEqual($result, $test_link[1], $test_link[2]);
     }
 }
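For reference, here is a minimal standalone sketch of calling UrlParser::getScheme outside a test harness (assuming the Yioop UrlParser class is loaded; the scheme-less "./" link defaulting to "http" mirrors the last test case above):
 $links = array("https://www.example.com/", "gopher://www.example.com/", "./");
 foreach ($links as $link) {
     // getScheme() returns the scheme portion of the url; per the test
     // case above, a relative link with no scheme is reported as "http"
     $scheme = UrlParser::getScheme($link);
     echo "$link => $scheme\n";
 }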
Example #2
 /**
  * Used to handle data from the suggest-a-url-to-crawl form
  * (suggest_view.php). It saves any submitted url to a file which can
  * then be imported in manageCrawls
  *
  * @return array $data contains fields with the current value for
  *     the url (if set but not submitted) as well as for a captcha
  */
 function suggestUrl()
 {
     $data["REFRESH"] = "suggest";
     $visitor_model = $this->model("visitor");
     $clear = false;
     if (CAPTCHA_MODE != IMAGE_CAPTCHA) {
         unset($_SESSION["captcha_text"]);
     }
     if (CAPTCHA_MODE != TEXT_CAPTCHA) {
         unset($_SESSION['CAPTCHA']);
         unset($_SESSION['CAPTCHA_ANSWERS']);
     }
     if (CAPTCHA_MODE != HASH_CAPTCHA) {
         $num_captchas = self::NUM_CAPTCHA_QUESTIONS;
         unset($_SESSION["request_time"]);
         unset($_SESSION["level"]);
         unset($_SESSION["random_string"]);
     } else {
         $data['INCLUDE_SCRIPTS'] = array("sha1", "hash_captcha");
     }
     if (!isset($_SESSION['BUILD_TIME']) || !isset($_REQUEST['build_time']) ||
         $_SESSION['BUILD_TIME'] != $_REQUEST['build_time'] ||
         $this->clean($_REQUEST['build_time'], "int") <= 0) {
         if (CAPTCHA_MODE == HASH_CAPTCHA) {
             $time = time();
             $_SESSION["request_time"] = $time;
             $_SESSION["level"] = self::HASH_CAPTCHA_LEVEL;
             $_SESSION["random_string"] = md5($time . AUTH_KEY);
         }
         $clear = true;
         if (isset($_REQUEST['url'])) {
             unset($_REQUEST['url']);
         }
         if (isset($_REQUEST['arg'])) {
             unset($_REQUEST['arg']);
         }
         $data['build_time'] = time();
         $_SESSION['BUILD_TIME'] = $data['build_time'];
     } else {
         $data['build_time'] = $_SESSION['BUILD_TIME'];
     }
     $data['url'] = "";
     if (isset($_REQUEST['url'])) {
         $data['url'] = $this->clean($_REQUEST['url'], "string");
     }
     $missing = array();
     $save = isset($_REQUEST['arg']) && $_REQUEST['arg'];
     if (CAPTCHA_MODE == TEXT_CAPTCHA) {
         for ($i = 0; $i < $num_captchas; $i++) {
             $data["question_{$i}"] = "-1";
             if ($clear && isset($_REQUEST["question_{$i}"])) {
                 unset($_REQUEST["question_{$i}"]);
             }
         }
         if (!isset($_SESSION['CAPTCHA']) || !isset($_SESSION['CAPTCHA_ANSWERS'])) {
             list($captchas, $answers) = $this->selectQuestionsAnswers($this->captchas_qa, $num_captchas, self::NUM_CAPTCHA_CHOICES);
             $data['CAPTCHA'] = $captchas;
             $data['build_time'] = time();
             $_SESSION['BUILD_TIME'] = $data['build_time'];
             $_SESSION['CAPTCHA_ANSWERS'] = $answers;
             $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
         } else {
             $data['CAPTCHA'] = $_SESSION['CAPTCHA'];
         }
         for ($i = 0; $i < $num_captchas; $i++) {
             $field = "question_{$i}";
             $captchas = isset($_SESSION['CAPTCHA'][$i]) ? $_SESSION['CAPTCHA'][$i] : array();
             if ($save) {
                 if (!isset($_REQUEST[$field]) || $_REQUEST[$field] == "-1" || !in_array($_REQUEST[$field], $captchas)) {
                     $missing[] = $field;
                 } else {
                     $data[$field] = $_REQUEST[$field];
                 }
             }
         }
     }
     $data['MISSING'] = $missing;
     $fail = false;
     if (CAPTCHA_MODE == IMAGE_CAPTCHA && !$save) {
         $this->setupGraphicalCaptchaViewData($data);
     }
     if ($save && isset($_REQUEST['url'])) {
         $url = $this->clean($_REQUEST['url'], "string");
         $url_parts = @parse_url($url);
         if (!isset($url_parts['scheme'])) {
             $url = "http://" . $url;
         }
         $suggest_host = UrlParser::getHost($url);
         $scheme = UrlParser::getScheme($url);
         if (strlen($suggest_host) < 12 || !$suggest_host || !in_array($scheme, array("http", "https"))) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_invalid_url') . "</h1>');";
             $fail = true;
         } elseif ($missing != array()) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_error_fields') . "</h1>');";
             $fail = true;
         }
         if (CAPTCHA_MODE == IMAGE_CAPTCHA && $fail) {
             $this->setupGraphicalCaptchaViewData($data);
         }
         if ($fail) {
             return $data;
         }
         switch (CAPTCHA_MODE) {
             case HASH_CAPTCHA:
                 if (!$this->validateHashCode()) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_hashcode') . "</h1>');";
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     return $data;
                 }
                 break;
             case TEXT_CAPTCHA:
                 $fail = false;
                 if (!$this->checkCaptchaAnswers()) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_human') . "</h1>');";
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     $data['build_time'] = time();
                     $_SESSION['BUILD_TIME'] = $data['build_time'];
                     $fail = true;
                 }
                 for ($i = 0; $i < $num_captchas; $i++) {
                     $data["question_{$i}"] = "-1";
                 }
                 list($captchas, $answers) = $this->selectQuestionsAnswers($this->captchas_qa, $num_captchas, self::NUM_CAPTCHA_CHOICES);
                 $data['CAPTCHA'] = $captchas;
                 $_SESSION['CAPTCHA_ANSWERS'] = $answers;
                 $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
                 if ($fail) {
                     return $data;
                 }
                 break;
             case IMAGE_CAPTCHA:
                 $user_captcha_text = isset($_REQUEST['user_captcha_text']) ? $this->clean($_REQUEST['user_captcha_text'], "string") : "";
                 if (isset($_SESSION['captcha_text']) && $_SESSION['captcha_text'] != trim($user_captcha_text)) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_graphical_human') . "</h1>');";
                     unset($_SESSION['captcha_text']);
                     $this->setupGraphicalCaptchaViewData($data);
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     return $data;
                 }
                 $this->setupGraphicalCaptchaViewData($data);
                 break;
         }
         // Handle cases where captcha was okay
         if (!$this->model("crawl")->appendSuggestSites($url)) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_suggest_full') . "</h1>');";
             return $data;
         }
         $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_url_submitted') . "</h1>');";
         $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "suggest_day_exceeded", ONE_DAY, ONE_DAY, MAX_SUGGEST_URLS_ONE_DAY);
         $data['build_time'] = time();
         $_SESSION['BUILD_TIME'] = $data['build_time'];
         $data['url'] = "";
     }
     return $data;
 }
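Below is a hedged sketch that pulls out just the url normalization and validation step used by suggestUrl() above, for illustration. The helper name validateSuggestedUrl is hypothetical and not part of the source; the scheme-prepending, host-length, and scheme checks mirror the controller code.
 function validateSuggestedUrl($url) // hypothetical helper, not in the source
 {
     $url_parts = @parse_url($url);
     if (!isset($url_parts['scheme'])) {
         // scheme-less suggestions are assumed to be http, as above
         $url = "http://" . $url;
     }
     $host = UrlParser::getHost($url);
     $scheme = UrlParser::getScheme($url);
     // reject short or empty hosts and non-web schemes, as suggestUrl() does
     if (strlen($host) < 12 || !$host ||
         !in_array($scheme, array("http", "https"))) {
         return false;
     }
     return $url;
 }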
Example #3
 /**
  * Produces a schedule.txt file of url data for a fetcher to crawl next.
  *
  * The hard part of scheduling is to make sure that the overall crawl
  * process obeys robots.txt files. This involves checking the url is in
  * an allowed path for that host and it also involves making sure the
  * Crawl-delay directive is respected. The first fetcher that contacts the
  * server requesting data to crawl will get the schedule.txt
  * produced by produceFetchBatch(), at which point it will be unlinked
  * (these latter steps are controlled in FetchController).
  *
  * @see FetchController
  */
 function produceFetchBatch()
 {
     $i = 1;
     // array implementation of priority queue starts at 1 not 0
     $fetch_size = 0;
     crawlLog("Scheduler: Start Produce Fetch Batch Memory usage" . memory_get_usage());
     $count = $this->web_queue->to_crawl_queue->count;
     $schedule_time = time();
     $first_line = $this->calculateScheduleMetaInfo($schedule_time);
     $sites = array();
     $delete_urls = array();
     $crawl_delay_hosts = array();
     $time_per_request_guess = MINIMUM_FETCH_LOOP_TIME;
     // it would be impressive if we could achieve this speed
     $current_crawl_index = -1;
     crawlLog("Scheduler: Trying to Produce Fetch Batch; Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
     /*
         $delete_urls - array of items we will delete from the queue after
             we have selected all of the items for the fetch batch
         $sites - array of urls for the fetch batch; indices in this array
             we'll call slots. Crawl-delayed host urls are spaced by a
             certain number of slots
     */
     while ($i <= $count && $fetch_size < MAX_FETCH_SIZE) {
         crawlTimeoutLog("..Scheduler: still producing fetch batch. " . "Examining location %s in queue of %s.", $i, $count);
         //look in queue for url and its weight
         $tmp = $this->web_queue->peekQueue($i, $fh);
         list($url, $weight, $flag, $probe) = $tmp;
         // if there was a queue lookup error, remove the entry and continue the loop
         if ($tmp === false || strcmp($url, "LOOKUP ERROR") == 0) {
             $delete_urls[$i] = false;
             crawlLog("Scheduler: Removing lookup error at" . " {$i} during produce fetch");
             $i++;
             continue;
         }
         $no_flags = false;
         $hard_coded = false;
         $host_url = UrlParser::getHost($url);
         if ($flag == WebQueueBundle::NO_FLAGS) {
             $hard_coded_pos = strpos($url, "###!");
             if ($hard_coded_pos > 0) {
                 $has_robots = true;
                 $hard_coded = true;
                 $is_robot = false;
             } else {
                 $has_robots = $this->web_queue->containsGotRobotTxt($host_url);
                 $scheme = UrlParser::getScheme($host_url);
                 if ($scheme == "gopher") {
                     $is_robot = strcmp($host_url . "/0/robots.txt", $url) == 0;
                 } else {
                     $is_robot = strcmp($host_url . "/robots.txt", $url) == 0;
                 }
             }
             $no_flags = true;
         } else {
             $is_robot = $flag == WebQueueBundle::ROBOT;
             if ($flag >= WebQueueBundle::SCHEDULABLE) {
                 $has_robots = true;
                 if ($flag > WebQueueBundle::SCHEDULABLE) {
                     $delay = $flag - WebQueueBundle::SCHEDULABLE;
                 }
             }
         }
         //if $url is a robots.txt url, see whether we need to schedule it or not
         if ($is_robot) {
             if ($has_robots) {
                 $delete_urls[$i] = $url;
                 $i++;
             } else {
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                     /* note: don't add to the seen url filter,
                          since robots.txt is rechecked every 24 hours
                          as needed
                        */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                     $i++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::ROBOT);
                     }
                 }
             }
             continue;
         }
         //Now handle the non-robots.txt url case
         $robots_okay = true;
         if ($has_robots) {
             if ($no_flags) {
                 if (!isset($hard_coded) || !$hard_coded) {
                     $robots_okay = $this->web_queue->checkRobotOkay($url);
                 } else {
                     $robots_okay = true;
                 }
                 if (!$robots_okay) {
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     $i++;
                     continue;
                 }
                 $delay = $this->web_queue->getCrawlDelay($host_url);
             }
             if (!$this->withinQuota($url)) {
                 //we're not allowed to schedule $url till next hour
                 $delete_urls[$i] = $url;
                 //delete from queue (so no clog) but don't mark seen
                 $i++;
                 continue;
             }
             //each host has two entries in $this->waiting_hosts
             $num_waiting = floor(count($this->waiting_hosts) / 2);
             if ($delay > 0) {
                 // handle adding a url if there is a crawl delay
                 $hash_host = crawlHash($host_url);
                 $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
                 if (!$is_waiting_host && $num_waiting < MAX_WAITING_HOSTS || $is_waiting_host && $this->waiting_hosts[$hash_host] == $schedule_time) {
                     $this->waiting_hosts[$hash_host] = $schedule_time;
                     $this->waiting_hosts[$schedule_time][] = $hash_host;
                     $request_batches_per_delay = ceil($delay / $time_per_request_guess);
                     if (!isset($crawl_delay_hosts[$hash_host])) {
                         $next_earliest_slot = $current_crawl_index;
                         $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
                     } else {
                         $next_earliest_slot = $crawl_delay_hosts[$hash_host] + $request_batches_per_delay * NUM_MULTI_CURL_PAGES;
                     }
                     if (($next_slot = $this->getEarliestSlot($next_earliest_slot, $sites)) < MAX_FETCH_SIZE) {
                         $crawl_delay_hosts[$hash_host] = $next_slot;
                         $delete_urls[$i] = $url;
                         $sites[$next_slot] = array($url, $weight, $delay);
                         $this->web_queue->addSeenUrlFilter($url);
                         /* we might miss some sites by marking them
                              seen after only scheduling them
                            */
                         $fetch_size++;
                     } elseif ($no_flags) {
                         $this->web_queue->setQueueFlag($url, $delay + WebQueueBundle::SCHEDULABLE);
                     }
                 } elseif (!$is_waiting_host) {
                     // has crawl delay but too many already waiting
                     $delete_urls[$i] = $url;
                     //delete from queue (so no clog) but don't mark seen
                     $i++;
                     continue;
                 }
             } else {
                 // add a url no crawl delay
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     /* we might miss some sites by marking them
                          seen after only scheduling them
                        */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::SCHEDULABLE);
                     }
                 }
             }
             //end if ($delay > 0) ... else
         }
         // end if ($has_robots), i.e., containsGotRobotTxt; robots.txt urls
         // themselves were handled earlier in the loop
         $i++;
     }
     //end while
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     crawlLog("...Scheduler: Done selecting URLS for fetch batch time " . "so far:" . changeInMicrotime($start_time));
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
             /*  if there was a hash table look up error still get rid of
                 index from priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} URLS for fetch batch from " . "queue in time: " . changeInMicrotime($new_time));
     $new_time = microtime();
     if (isset($sites) && count($sites) > 0) {
         $dummy_slot = array(self::DUMMY, 0.0, 0);
         /* dummy slots are used for crawl-delayed sites with longer delays
              when we don't have much else to crawl.
            */
         $cnt = 0;
         for ($j = 0; $j < MAX_FETCH_SIZE; $j++) {
             if (isset($sites[$j])) {
                 $cnt++;
                 if ($cnt == $fetch_size) {
                     break;
                 }
             } elseif ($j % NUM_MULTI_CURL_PAGES == 0) {
                 $sites[$j] = $dummy_slot;
             }
         }
         ksort($sites);
         //write schedule to disk
         $fh = fopen(CRAWL_DIR . "/schedules/" . self::schedule_name . $this->crawl_time . ".txt", "wb");
         fwrite($fh, $first_line);
         $num_sites = count($sites);
         $k = 0;
         foreach ($sites as $site) {
             crawlTimeoutLog("..Scheduler: Still Writing fetch schedule %s" . " of %s.", $k, $num_sites);
             $k++;
             $extracted_etag = NULL;
             list($url, $weight, $delay) = $site;
             $key = crawlHash($url, true);
             if (USE_ETAG_EXPIRES) {
                 /*check if we have cache validation data for a URL. If both
                    ETag and Expires timestamp are found or only an expires
                    timestamp is found, the timestamp is compared with the current
                    time. If the current time is less than the expires timestamp,
                    the URL is not added to the fetch batch. If only an ETag is
                    found, the ETag is appended to the URL so that it can be
                    processed by the fetcher.
                   */
                 $value = $this->web_queue->etag_btree->findValue($key);
                 if ($value !== NULL) {
                     $cache_validation_data = $value[1];
                     if ($cache_validation_data['etag'] !== -1 &&
                         $cache_validation_data['expires'] !== -1) {
                         $expires_timestamp = $cache_validation_data['expires'];
                         $current_time = time();
                         if ($current_time < $expires_timestamp) {
                             continue;
                         } else {
                             $etag = $cache_validation_data['etag'];
                             $extracted_etag = "ETag: " . $etag;
                         }
                     } elseif ($cache_validation_data['etag'] !== -1) {
                         $etag = $cache_validation_data['etag'];
                         $extracted_etag = "ETag: " . $etag;
                     } elseif ($cache_validation_data['expires'] !== -1) {
                         $expires_timestamp = $cache_validation_data['expires'];
                         $current_time = time();
                         if ($current_time < $expires_timestamp) {
                             continue;
                         }
                     }
                 }
             }
             $host_url = UrlParser::getHost($url);
             $dns_lookup = $this->web_queue->dnsLookup($host_url);
             if ($dns_lookup) {
                 $url .= "###" . urlencode($dns_lookup);
             }
             if ($extracted_etag !== NULL) {
                 $url .= $extracted_etag;
             }
             $out_string = base64_encode(packFloat($weight) . packInt($delay) . $url) . "\n";
             fwrite($fh, $out_string);
         }
         fclose($fh);
         crawlLog("...Scheduler: Sort URLS and write schedule time: " . changeInMicrotime($new_time));
         crawlLog("Scheduler: End Produce Fetch Batch Memory usage" . memory_get_usage());
         crawlLog("Scheduler: Created fetch batch of size {$num_sites}." . " {$num_deletes} urls were deleted." . " Queue size is now " . $this->web_queue->to_crawl_queue->count . "...Total Time to create batch: " . changeInMicrotime($start_time));
     } else {
         crawlLog("Scheduler: No fetch batch created!! " . "Time failing to make a fetch batch:" . changeInMicrotime($start_time) . ". Loop properties:{$i} {$count}" . " {$num_deletes} urls were deleted in failed attempt.");
         $max_links = max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP);
         if ($num_deletes < 5 && $i >= $count && $count >= NUM_URLS_QUEUE_RAM - SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             crawlLog("Scheduler: Queue Full and Couldn't produce Fetch " . "Batch!! Or Delete any URLS!!!");
             crawlLog("Scheduler: Rescheduling Queue Contents " . "(not marking seen) to try to unjam!");
             $this->dumpQueueToSchedules(true);
             $this->clearWebQueue();
         }
     }
 }
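Each line written to schedule.txt above is base64_encode(packFloat($weight) . packInt($delay) . $url). A rough reader-side sketch follows, assuming Yioop-style unpackFloat()/unpackInt() helpers that reverse 4-byte packed values (an assumption; the fetcher's actual parsing code is not shown here):
 // hedged sketch: decode one schedule line produced by produceFetchBatch()
 $line = trim(fgets($fh));
 $decoded = base64_decode($line);
 $weight = unpackFloat(substr($decoded, 0, 4)); // assumed 4-byte float helper
 $delay = unpackInt(substr($decoded, 4, 4));    // assumed 4-byte int helper
 $url = substr($decoded, 8);                    // the remainder is the url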
Example #4
 /**
  * Gets the cached version of a web page from the machine on which it was
  * fetched.
  *
  * Complete cached versions of web pages typically only live on a fetcher
  * machine. The queue server machine typically only maintains summaries.
  * This method makes a REST request of a fetcher machine for a cached page
  * and gets the results back.
  *
  * @param string $machine the ip address or domain name of the machine the
  *     cached page lives on
  * @param string $machine_uri the path from document root on $machine where
  *     the yioop scripts live
  * @param int $partition the partition in the WebArchiveBundle the page is
  *      in
  * @param int $offset the offset in bytes into the WebArchive partition in
  *     the WebArchiveBundle at which the cached page lives.
  * @param string $crawl_time the timestamp of the crawl the cache page is
  *     from
  * @param int $instance_num which fetcher instance on the particular
  *     fetcher machine crawled the page (if there is more than one),
  *     false otherwise
  * @return array page data of the cached page
  */
 function getCacheFile($machine, $machine_uri, $partition, $offset, $crawl_time, $instance_num = false)
 {
     $time = time();
     $session = md5($time . AUTH_KEY);
     if ($machine == '::1') {
         //IPv6 :(
         $machine = "[::1]";
         //used if the fetching and queue serving were on the same machine
     }
     // we assume all machines use the same scheme & port as the name server
     $port = UrlParser::getPort(NAME_SERVER);
     $scheme = UrlParser::getScheme(NAME_SERVER);
     $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" . "time={$time}&session={$session}&partition={$partition}&offset={$offset}" . "&crawl_time={$crawl_time}";
     if ($instance_num !== false) {
         $request .= "&instance_num={$instance_num}";
     }
     $tmp = FetchUrl::getPage($request);
     $page = @unserialize(base64_decode($tmp));
     $page['REQUEST'] = $request;
     return $page;
 }
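A hypothetical call, assuming it is made from within the same class, with made-up machine, partition, offset, and crawl timestamp values just to show the parameter order; the only key the code above guarantees is 'REQUEST', which echoes back the url that was requested:
 // all argument values below are illustrative only
 $page = $this->getCacheFile("192.168.1.5", "/yioop/", 3, 42000,
     "1400000000", 0);
 echo $page['REQUEST']; // the REST url used to retrieve the cached page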