コード例 #1
0
 /**
  * Check whether a URL is available and build the updated status record for it.
  *
  * The check is skipped when the URL is not yet due (its 'next_check' timestamp
  * lies in the future) unless $force is set. When robots.txt blocks the URL no
  * fetch is made: the previous status is carried over and the next check is
  * scheduled six months out.
  *
  * NOTE(review): this method only returns the record; persisting it is
  * presumably the caller's job - confirm.
  *
  * @param $last_check array of the data from the last check for the URL; 'url' is
  *        read unconditionally, while 'id', 'next_check', 'status' and
  *        'last_checked' are optional
  * @param bool $force true if the check should be forced to happen, even if it's not yet scheduled
  * @return array|bool false when the check is not yet due, otherwise an array with
  *         the keys id, url, last_checked, next_check, status, message and details
  */
 public function check($last_check, $force = false)
 {
     $url = $last_check['url'];

     //TODO: Unify ID generation
     if (isset($last_check['id'])) {
         $id = $last_check['id'];
     } else {
         $id = md5($url);
     }

     /* Not due yet? Then there is nothing to do unless the caller insists */
     $due_at = 0;
     if (isset($last_check['next_check'])) {
         $due_at = $last_check['next_check'];
     }
     if ($due_at > time() && !$force) {
         return false;
     }

     /* Base date for the robots.txt back-off; taken before any network access */
     $base_date = new DateTime();
     $message = null;
     $fetch_result = null;

     if (AmberRobots::robots_allowed($url)) {
         $fetch_result = AmberNetworkUtils::open_url($url, array(CURLOPT_FAILONERROR => false));
         $status = $this->is_up($fetch_result);

         $previous_status = isset($last_check['status']) ? $last_check['status'] : null;
         $previous_checked = isset($last_check['last_checked']) ? $last_check['last_checked'] : null;
         $previous_next = isset($last_check['next_check']) ? $last_check['next_check'] : null;
         $next = $this->next_check_date($previous_status, $previous_checked, $previous_next, $status);
     } else {
         /* Blocked by robots.txt: keep the old status and look again in 6 months */
         $six_months = new DateInterval("P6M");
         $next = $base_date->add($six_months)->getTimestamp();
         $status = isset($last_check['status']) ? $last_check['status'] : null;
         $log_parts = array(__FILE__, __METHOD__, "Blocked by robots.txt", $url);
         error_log(implode(":", $log_parts));
         $message = "Blocked by robots.txt";
     }

     /* Normalise the status to 1/0, leaving it NULL when it is unknown */
     $status_flag = null;
     if (isset($status)) {
         $status_flag = $status ? 1 : 0;
     }

     $checked_at = new DateTime();
     return array(
         'id' => $id,
         'url' => $url,
         'last_checked' => $checked_at->getTimestamp(),
         'next_check' => $next,
         'status' => $status_flag,
         'message' => $message,
         'details' => $fetch_result,
     );
 }
コード例 #2
0
    /**
     * A robots.txt that disallows "/" for the "Amber" user agent must make
     * AmberRobots::url_permitted() report the path "/Settings" as not permitted.
     */
    public function testAmberExcluded_11622()
    {
        $robots_txt = <<<EOD
user-agent: Amber
disallow: /
EOD;
        $permitted = AmberRobots::url_permitted($robots_txt, "/Settings");
        $this->assertFalse($permitted);
    }
コード例 #3
0
 /**
  * Find out if the access to the given URL is permitted by the robots.txt
  *
  * The robots.txt location is built from the scheme, host and (optional) port
  * of $url. Access counts as permitted when the URL cannot be parsed into a
  * scheme and host, when robots.txt is not answered with HTTP 200, or when its
  * body is empty; otherwise the decision is delegated to
  * AmberRobots::url_permitted().
  *
  * @param $url string the URL whose access should be checked
  * @return bool true if access is permitted (or robots.txt could not be consulted)
  */
 public static function robots_allowed($url)
 {
     $p = parse_url($url);
     /* parse_url() returns false for seriously malformed URLs and omits missing
        components; without a scheme and host there is no robots.txt to consult,
        so fall back to "allowed" just like when the fetch fails */
     if (!is_array($p) || !isset($p['scheme']) || !isset($p['host'])) {
         return true;
     }
     $robots_url = $p['scheme'] . "://" . $p['host'] . (isset($p['port']) ? ":" . $p['port'] : '') . '/robots.txt';
     $data = AmberNetworkUtils::open_url($robots_url, array(CURLOPT_FAILONERROR => FALSE));
     if (isset($data['info']['http_code']) && $data['info']['http_code'] == 200) {
         /* A missing or empty body places no restrictions on the URL */
         $body = isset($data['body']) ? $data['body'] : '';
         return !$body || AmberRobots::url_permitted($body, $url);
     }
     return true;
 }