Ejemplo n.º 1
0
 /**
  * Check whether a URL is currently reachable and build its updated status record.
  *
  * The check is skipped (returning false) when the stored 'next_check' time is
  * still in the future, unless $force is set. When robots.txt disallows the URL
  * no fetch is made: the previous status is carried over, the next check is
  * scheduled six months out and the event is written to the error log.
  *
  * @param array $last_check data from the last check for the URL; 'url' is read
  *                          unconditionally, while 'id', 'status', 'last_checked'
  *                          and 'next_check' are optional
  * @param bool $force true if the check should happen even if it is not yet scheduled
  * @return array|bool false when the check was skipped, otherwise an array with the
  *                    keys 'id', 'url', 'last_checked', 'next_check', 'status',
  *                    'message' and 'details'
  */
 public function check($last_check, $force = false)
 {
     $url = $last_check['url'];
     if (isset($last_check['id'])) {
         $id = $last_check['id'];
     } else {
         //TODO: Unify ID generation
         $id = md5($url);
     }

     $previous_status = isset($last_check['status']) ? $last_check['status'] : null;
     $previous_checked = isset($last_check['last_checked']) ? $last_check['last_checked'] : null;
     $previous_next = isset($last_check['next_check']) ? $last_check['next_check'] : null;

     /* Bail out unless the URL is due for a check (or the caller insists) */
     $due_at = isset($last_check['next_check']) ? $last_check['next_check'] : 0;
     if ($due_at > time() && !$force) {
         return false;
     }

     $message = null;
     $details = null;
     $started = new DateTime();
     if (AmberRobots::robots_allowed($url)) {
         $details = AmberNetworkUtils::open_url($url, array(CURLOPT_FAILONERROR => FALSE));
         $status = $this->is_up($details);
         $next = $this->next_check_date($previous_status, $previous_checked, $previous_next, $status);
     } else {
         /* Blocked by robots.txt: keep the old status and look again in 6 months */
         $next = $started->add(new DateInterval("P6M"))->getTimestamp();
         $status = $previous_status;
         error_log(join(":", array(__FILE__, __METHOD__, "Blocked by robots.txt", $url)));
         $message = "Blocked by robots.txt";
     }

     /* Normalise the status to 1/0, leaving it null when nothing is known */
     if (isset($status)) {
         $status_flag = $status ? 1 : 0;
     } else {
         $status_flag = null;
     }

     $finished = new DateTime();
     return array(
         'id' => $id,
         'url' => $url,
         'last_checked' => $finished->getTimestamp(),
         'next_check' => $next,
         'status' => $status_flag,
         'message' => $message,
         'details' => $details,
     );
 }
Ejemplo n.º 2
0
 /**
  * Submit a URL to the Perma API for archiving and describe the resulting archive.
  *
  * @param string $url URL to archive
  * @return array description of the archive with the keys 'id', 'url', 'type',
  *               'date', 'location', 'size', 'provider' and 'provider_id'
  * @throws RuntimeException if $url is empty, the request fails, or the response
  *                          does not contain a GUID
  * @throws InvalidArgumentException if no API key is configured
  */
 public function fetch($url)
 {
     if (!$url) {
         throw new RuntimeException("Empty URL");
     }
     if (!$this->apiKey) {
         throw new InvalidArgumentException("Missing required API key for accessing Perma");
     }
     $api_endpoint = $this->apiUrl . '/v1/archives/?api_key=' . $this->apiKey;
     $curl_options = array(
         CURLOPT_POST => TRUE,
         CURLOPT_POSTFIELDS => json_encode(array('url' => $url)),
         CURLOPT_HTTPHEADER => array("Content-type: application/json"),
         CURLOPT_FOLLOWLOCATION => TRUE,
     );
     $perma_result = AmberNetworkUtils::open_single_url($api_endpoint, $curl_options);
     /* Make sure that we got a valid response from Perma */
     if ($perma_result === FALSE || $perma_result['body'] === FALSE) {
         $message = "";
         if (isset($perma_result['info']['http_code'])) {
             $message = "HTTP response code=" . $perma_result['info']['http_code'];
         }
         throw new RuntimeException(join(":", array("Error submitting URL to Perma", $message)));
     }
     $json_result = json_decode($perma_result['body'], true);
     if (!isset($json_result['guid'])) {
         throw new RuntimeException("Perma response did not include GUID");
     }

     /* Only the GUID is verified above, so do not assume the other fields are
        present: fall back to the URL we submitted and to the current time */
     $archived_url = isset($json_result['url']) ? $json_result['url'] : $url;
     $created = isset($json_result['creation_timestamp']) ? strtotime($json_result['creation_timestamp']) : FALSE;
     if ($created === FALSE) {
         $created = time();
     }

     $result = array(
         'id' => md5($archived_url),
         'url' => $archived_url,
         'type' => '',
         'date' => $created,
         'location' => join("/", array($this->archiveUrl, $json_result['guid'])),
         'size' => 0,
         'provider' => 1,
         'provider_id' => $json_result['guid'],
     );
     return $result;
 }
Ejemplo n.º 3
0
 /**
  * Report whether fetching the given URL yields an HTTP 200 status code.
  * @param string $url URL to test
  * @return bool true if the response reports status 200; false for any other
  *              status or when no status code is available
  */
 public function up($url)
 {
     $response = AmberNetworkUtils::open_url($url, array(CURLOPT_FAILONERROR => FALSE));
     if (!isset($response['info']['http_code'])) {
         return false;
     }
     return $response['info']['http_code'] == 200;
 }
 /**
  * Query the Timegate server for a memento for this URL and date
  * @param  string $url    URL to query
  * @param  string $date   preferred date for the memento, in any format strtotime() understands
  * @return array          array with the keys 'url' (memento location) and 'date' when the
  *                        server answers with a Location header, otherwise an empty array
  */
 public function getMemento($url, $date)
 {
     /* Accept-Datetime must be an HTTP-date ending in the literal "GMT" (RFC 7089);
        PHP's DATE_RFC1123 format prints the zone as "+0000", so spell the format out */
     $accept_datetime = gmdate('D, d M Y H:i:s', strtotime($date)) . ' GMT';
     $header = array('Accept-Datetime: ' . $accept_datetime);
     /* CURLOPT_NOBODY: only the response headers are of interest */
     $options = array(CURLOPT_NOBODY => true, CURLOPT_HTTPHEADER => $header);
     /* Be forgiving of trailing slashes (or lack thereof) in server URL */
     $query_url = implode("/", array(trim($this->serverUrl, "/"), $url));
     $result = AmberNetworkUtils::open_single_url($query_url, $options, FALSE);
     if ($result !== FALSE && isset($result['headers']['Location'])) {
         $memento_url = $result['headers']['Location'];
         return array('url' => $memento_url, 'date' => $this->getArchiveDate($memento_url));
     } else {
         return array();
     }
 }
Ejemplo n.º 5
0
 /**
  * Query the NetClerk server for the status of the URLs in a particular country
  * @param  array  $urls    array of URLs to query
  * @param  string $country two-character ISO code for the user's country
  * @return string|bool     body of the response from the NetClerk server, or FALSE
  *                         if the request failed or returned no body
  */
 public function query_status_from_netclerk(array $urls, $country)
 {
     $fields = array('country' => $country, 'url' => $urls);
     /* Force "&" as separator so the POST body does not depend on the
        arg_separator.output ini setting */
     $fields_string = http_build_query($fields, '', '&');
     /* http_build_query represents arrays as "url[0]=foo&url[1]=bar",
        but we need "url[]=foo&url[]=bar". Only rewrite the parameter names
        (which always follow the start of the string or a separator), so that
        an encoded "[0]" inside one of the submitted URLs is left untouched */
     $fields_string = preg_replace('/(^|&)url%5B[0-9]+%5D=/', '${1}url%5B%5D=', $fields_string);
     $options = array(CURLOPT_POST => true, CURLOPT_POSTFIELDS => $fields_string);
     $result = AmberNetworkUtils::open_single_url($this->serverUrl . "/statuses", $options);
     if ($result !== FALSE && isset($result['body'])) {
         return $result['body'];
     } else {
         return FALSE;
     }
 }
 /**
  * Ask the Internet Archive to save a URL and describe the resulting archive.
  * @param string $url URL to archive
  * @return array description of the archive with the keys 'id', 'url', 'type',
  *               'date', 'location', 'size', 'provider' and 'provider_id'
  * @throws RuntimeException if $url is empty, the request fails, the Archive
  *                          answers 403, or no Content-Location header is returned
  */
 public function fetch($url)
 {
     if (!$url) {
         throw new RuntimeException("Empty URL");
     }

     $save_url = $this->archiveUrl . "/save/" . $url;
     $response = AmberNetworkUtils::open_single_url($save_url, array(), FALSE);

     /* Make sure that we got a valid response from the Archive */
     if ($response === FALSE) {
         throw new RuntimeException("Error submitting to Internet Archive");
     }
     $forbidden = isset($response['info']['http_code']) && $response['info']['http_code'] == 403;
     if ($forbidden) {
         throw new RuntimeException("Permission denied when submitting to Internet Archive (may be blocked by robots.txt)");
     }
     if (!isset($response['headers']['Content-Location'])) {
         throw new RuntimeException("Internet Archive response did not include archive location");
     }

     /* Content type and length come from the X-Archive-Orig-* response headers, when given */
     $location = $response['headers']['Content-Location'];
     $type = "";
     if (isset($response['headers']['X-Archive-Orig-Content-Type'])) {
         $type = $response['headers']['X-Archive-Orig-Content-Type'];
     }
     $length = 0;
     if (isset($response['headers']['X-Archive-Orig-Content-Length'])) {
         $length = $response['headers']['X-Archive-Orig-Content-Length'];
     }

     return array(
         'id' => md5($url),
         'url' => $url,
         'type' => $type,
         'date' => time(),
         'location' => $this->archiveUrl . $location,
         'size' => $length,
         'provider' => 2,
         'provider_id' => $location,
     );
 }
Ejemplo n.º 7
0
 /**
  * AmberNetworkUtils::clean_up_path() should resolve ".." segments, dropping
  * any that would climb above the start of the path.
  */
 public function testCleanUpPathString()
 {
     /* Each case is array(input path, expected cleaned-up path) */
     $cases = array(
         array("../common.css", "common.css"),
         array("_v_1.0.32/personal/common.css", "_v_1.0.32/personal/common.css"),
         array("_v_1.0.32/personal/photo/../common.css", "_v_1.0.32/personal/common.css"),
         array("_v_1.0.32/personal/photo/../../common.css", "_v_1.0.32/common.css"),
         array("_v_1.0.32/personal/photo/../../../../common.css", "common.css"),
     );
     foreach ($cases as $case) {
         list($input, $expected) = $case;
         $this->assertEquals($expected, AmberNetworkUtils::clean_up_path($input));
     }
 }
Ejemplo n.º 8
0
 /**
  * Find out if the access to the given URL is permitted by the robots.txt
  *
  * Access is treated as permitted whenever no robots.txt can be consulted:
  * the URL has no scheme or host, the robots.txt request does not return
  * HTTP 200, or the robots.txt body is empty.
  *
  * @param string $url URL whose site's robots.txt should be checked
  * @return bool true if access is permitted (or cannot be determined), false if disallowed
  */
 public static function robots_allowed($url)
 {
     $p = parse_url($url);
     /* parse_url() returns false for malformed URLs and omits missing components;
        without a scheme and host there is no robots.txt location to build */
     if (!isset($p['scheme'], $p['host'])) {
         return true;
     }
     $robots_url = $p['scheme'] . "://" . $p['host'] . (isset($p['port']) ? ":" . $p['port'] : '') . '/robots.txt';
     $data = AmberNetworkUtils::open_url($robots_url, array(CURLOPT_FAILONERROR => FALSE));
     if (isset($data['info']['http_code']) && $data['info']['http_code'] == 200) {
         $body = $data['body'];
         return !$body || AmberRobots::url_permitted($body, $url);
     }
     return true;
 }
Ejemplo n.º 9
0
 /**
  * Respect the "noarchive" meta tag as described here: http://noarchive.net/meta/
  * Sample tags that will prevent archiving:
  *   <meta name="robots" content="noarchive">
  *   <meta name="amber" content="noarchive">
  *   <meta name="robots" content="noarchive, noindex">
  *   <meta name="amber" content="noindex">
  * Only the part of the document returned by AmberNetworkUtils::get_head() is
  * searched, and the name attribute must precede the content attribute.
  * @param  string $body HTML document to examine
  * @return boolean      true if there is an applicable no-archive tag, false otherwise
  */
 public static function find_meta_no_archive($body)
 {
     $head = AmberNetworkUtils::get_head($body);
     /* [^>]* keeps the match inside a single <meta ...> tag and [^'\">]* inside a
        single content value, so a "noindex" appearing in a later tag on the same
        line cannot trigger a match; unlike ".", these classes also match newlines,
        so a tag wrapped over several lines is still recognised */
     $pattern = "/<meta\\s+name\\s*=\\s*['\"](robots|amber)['\"][^>]*content\\s*=\\s*['\"][^'\">]*(noarchive|noindex)[^'\">]*['\"]/i";
     return preg_match($pattern, $head) === 1;
 }
    /**
     * Documents whose head contains a robots/amber meta tag with "noindex"
     * must be reported as not archivable.
     */
    public function testMetaNoArchiveTagDetectionNoIndex()
    {
        /* NOTE(review): the first two cases are identical; possibly one of them
           was meant to cover name="amber" - confirm before changing */
        $meta_tags = array(
            '<meta name="robots" content="noindex">',
            '<meta name="robots" content="noindex">',
            '<meta name="amber" content="noarchive, noindex">',
            '<meta name="robots" content="noindex,noarchive">',
        );
        foreach ($meta_tags as $meta_tag) {
            $document = <<<EOD
<html>
<head><title>bad man</title>
{$meta_tag}
</head>
<body>
The meta tag only works in the head
</body>
</html>
EOD;
            $this->assertTrue(AmberNetworkUtils::find_meta_no_archive($document));
        }
    }