Ejemplo n.º 1
0
 /**
  * Re-check a single URL and build the status record describing the outcome.
  *
  * When the check is not forced and the previously scheduled 'next_check' time is still
  * in the future, nothing is fetched and false is returned. When robots.txt disallows the
  * URL, the URL is not fetched: the previous status is carried over and the next check is
  * scheduled six months out.
  *
  * @param $last_check array of the data from the last check for the URL; 'url' is read
  *                    unconditionally, 'id', 'status', 'last_checked' and 'next_check'
  *                    are optional
  * @param bool $force true if the check should be forced to happen, even if it's not yet scheduled
  * @return array|bool false if the check was skipped, otherwise an array with the keys
  *                    id, url, last_checked, next_check, status, message and details
  */
 public function check($last_check, $force = false)
 {
     $url = $last_check['url'];

     /* Make sure we're still scheduled to check the $url */
     if (!$force) {
         $due_at = isset($last_check['next_check']) ? $last_check['next_check'] : 0;
         if ($due_at > time()) {
             return false;
         }
     }

     //TODO: Unify ID generation
     if (isset($last_check['id'])) {
         $id = $last_check['id'];
     } else {
         $id = md5($url);
     }

     $previous_status = isset($last_check['status']) ? $last_check['status'] : NULL;
     $message = NULL;
     $fetch_result = NULL;

     /* Captured before the robots.txt lookup; the six-month offset below is applied to this instant */
     $started = new DateTime();

     if (AmberRobots::robots_allowed($url)) {
         $fetch_result = AmberNetworkUtils::open_url($url, array(CURLOPT_FAILONERROR => FALSE));
         $status = $this->is_up($fetch_result);
         $previous_checked = isset($last_check['last_checked']) ? $last_check['last_checked'] : NULL;
         $previous_next = isset($last_check['next_check']) ? $last_check['next_check'] : NULL;
         $next = $this->next_check_date($previous_status, $previous_checked, $previous_next, $status);
     } else {
         /* If blocked by robots.txt, keep the old status and schedule next check for 6 months out */
         $message = "Blocked by robots.txt";
         error_log(implode(":", array(__FILE__, __METHOD__, $message, $url)));
         $status = $previous_status;
         $started->add(new DateInterval("P6M"));
         $next = $started->getTimestamp();
     }

     /* Normalise the status to 1/0, leaving it NULL when no status is known */
     if (isset($status)) {
         $status_flag = $status ? 1 : 0;
     } else {
         $status_flag = NULL;
     }

     $finished = new DateTime();
     return array(
         'id' => $id,
         'url' => $url,
         'last_checked' => $finished->getTimestamp(),
         'next_check' => $next,
         'status' => $status_flag,
         'message' => $message,
         'details' => $fetch_result,
     );
 }
Ejemplo n.º 2
0
 /**
  * Fetch the URL and associated assets and pass it on to the designated Storage service
  *
  * For documents whose Content-Type header is an HTML mime type, the assets referenced by
  * the body are downloaded, and the body is passed through the asset helper's
  * rewrite_links() and insert_banner() before being saved.
  *
  * @param $url string the URL to fetch and store
  * @return array description of the stored item, with the keys id, url, type, date,
  *               location, size, provider and provider_id
  * @throws RuntimeException if the URL is empty, is blocked by robots.txt, is not cacheable,
  *                          has an empty body, exceeds the size limit, cannot be saved, or
  *                          the storage metadata is missing or incomplete
  */
 public function fetch($url)
 {
     if (!$url) {
         throw new RuntimeException("Empty URL");
     }
     // Check the robots.txt
     if (!AmberRobots::robots_allowed($url)) {
         throw new RuntimeException("Blocked by robots.txt");
     }
     // Send a GET request
     $root_item = AmberNetworkUtils::open_url($url);
     // Decide whether the item should be cached; $reason is filled in by cacheable_item()
     if (!$this->cacheable_item($root_item, $reason)) {
         throw new RuntimeException($reason);
     }
     $size = $root_item['info']['size_download'];
     if ($size == 0) {
         throw new RuntimeException("Empty document");
     }
     // Get other assets
     $content_type = isset($root_item['headers']['Content-Type']) ? $root_item['headers']['Content-Type'] : NULL;
     if ($content_type && AmberNetworkUtils::is_html_mime_type($content_type)) {
         $body = $root_item['body'];
         /* Use the url of the document we end up downloading as a reference point for
            relative asset references, since we may have been redirected from the one
            we originally requested. */
         $final_url = $root_item['info']['url'];
         $asset_paths = $this->assetHelper->extract_assets($body);
         $assets = $this->assetHelper->expand_asset_references($final_url, $asset_paths, $this->assetHelper->extract_base_tag($body));
         $assets = $this->download_assets($assets, $final_url);
         /* NOTE(review): $size is handed to download_css_assets_recursive(), presumably by
            reference so that it accumulates the size of the assets - confirm. */
         $assets = $this->download_css_assets_recursive($assets, $final_url, $size);
         /* Check total size of the file combined with its assets before spending any
            time rewriting a document that is going to be rejected anyway */
         if ($size > $this->maxFileSize * 1024) {
             throw new RuntimeException("File size of document + assets too large");
         }
         $body = $this->assetHelper->rewrite_links($body, $assets);
         $body = $this->assetHelper->insert_banner($body, $this->headerText, array("url" => $url, "date" => date('Y/m/d')));
         $root_item['body'] = $body;
     }
     if (!$this->storage || !$root_item) {
         throw new RuntimeException("Content empty or could not save to disk");
     }
     $result = $this->storage->save($url, $root_item['body'], $root_item['headers'], isset($assets) ? $assets : array());
     if (!$result) {
         throw new RuntimeException("Could not save cache");
     }
     $storage_metadata = $this->storage->get_metadata($url);
     if (empty($storage_metadata)) {
         throw new RuntimeException("Could not retrieve metadata");
     }
     /* The metadata can come back without id/url/cache populated (perhaps due to permissions
        errors in saving the cache). Fail with an exception callers already handle instead of
        reading keys that are not set. */
     if (!isset($storage_metadata['id'], $storage_metadata['url'], $storage_metadata['cache']['amber']['date'], $storage_metadata['cache']['amber']['location'])) {
         throw new RuntimeException("Incomplete storage metadata for cached item");
     }
     $amber_cache = $storage_metadata['cache']['amber'];
     return array(
         'id' => $storage_metadata['id'],
         'url' => $storage_metadata['url'],
         'type' => isset($storage_metadata['type']) ? $storage_metadata['type'] : 'application/octet-stream',
         'date' => strtotime($amber_cache['date']),
         'location' => $amber_cache['location'],
         'size' => $size,
         'provider' => $this->storage->provider_id(),
         'provider_id' => $storage_metadata['id'],
     );
 }