/**
 * Re-check the availability of a URL and build its updated status record.
 *
 * Unless $force is set, the check is skipped while the stored 'next_check'
 * timestamp is still in the future. A URL that robots.txt disallows is not
 * fetched: its previous status is carried over and the next check is pushed
 * six months out. Otherwise the URL is fetched and the next check time is
 * derived from the previous record and the new status.
 *
 * @param $last_check array of the data from the last check for the URL. 'url' is
 *   read unconditionally; 'id', 'status', 'last_checked' and 'next_check' are optional.
 * @param bool $force true if the check should be forced to happen, even if it's not yet scheduled
 * @return array|bool false when the check is not yet due, otherwise an array with the
 *   keys id, url, last_checked, next_check, status, message and details
 */
public function check($last_check, $force = false)
{
    $url = $last_check['url'];

    //TODO: Unify ID generation
    if (isset($last_check['id'])) {
        $id = $last_check['id'];
    } else {
        $id = md5($url);
    }

    /* Make sure we're still scheduled to check the $url */
    $scheduled_for = isset($last_check['next_check']) ? $last_check['next_check'] : 0;
    $is_due = $force || !($scheduled_for > time());
    if (!$is_due) {
        return false;
    }

    /* Taken before the robots.txt lookup; the six month offset below is relative to it */
    $started = new DateTime();

    /* Optional fields of the previous record, NULL when absent */
    $previous_status = isset($last_check['status']) ? $last_check['status'] : null;
    $previous_checked = isset($last_check['last_checked']) ? $last_check['last_checked'] : null;
    $previous_next = isset($last_check['next_check']) ? $last_check['next_check'] : null;

    $message = null;
    $details = null;

    if (AmberRobots::robots_allowed($url)) {
        $details = AmberNetworkUtils::open_url($url, array(CURLOPT_FAILONERROR => false));
        $status = $this->is_up($details);
        $next = $this->next_check_date($previous_status, $previous_checked, $previous_next, $status);
    } else {
        /* If blocked by robots.txt, schedule next check for 6 months out */
        $next = $started->add(new DateInterval("P6M"))->getTimestamp();
        $status = $previous_status;
        error_log(implode(":", array(__FILE__, __METHOD__, "Blocked by robots.txt", $url)));
        $message = "Blocked by robots.txt";
    }

    /* Normalise the status to 1/0, keeping NULL when there is no status at all */
    if (isset($status)) {
        $status_flag = $status ? 1 : 0;
    } else {
        $status_flag = null;
    }

    /* Taken after the fetch, so 'last_checked' reflects when the check finished */
    $finished = new DateTime();

    return array(
        'id' => $id,
        'url' => $url,
        'last_checked' => $finished->getTimestamp(),
        'next_check' => $next,
        'status' => $status_flag,
        'message' => $message,
        'details' => $details,
    );
}
/**
 * Fetch the URL and associated assets and pass them on to the designated Storage service.
 *
 * For HTML documents the referenced assets are downloaded as well, the links in
 * the body are rewritten to point at them and a banner is inserted into the body
 * before saving.
 *
 * @param $url the URL of the document to fetch
 * @return array description of the stored item with the keys id, url, type, date,
 *   location, size, provider and provider_id
 * @throws RuntimeException if the URL is empty, blocked by robots.txt, judged not
 *   cacheable, empty, too large together with its assets, or if it cannot be saved
 *   or its metadata cannot be read back completely from the storage service
 */
public function fetch($url)
{
    if (!$url) {
        throw new RuntimeException("Empty URL");
    }

    // Check the robots.txt
    if (!AmberRobots::robots_allowed($url)) {
        throw new RuntimeException("Blocked by robots.txt");
    }

    // Send a GET request
    $root_item = AmberNetworkUtils::open_url($url);

    // Decide whether the item should be cached
    // NOTE(review): $reason is presumably filled in by reference by cacheable_item() - confirm
    if (!$this->cacheable_item($root_item, $reason)) {
        throw new RuntimeException($reason);
    }

    $size = $root_item['info']['size_download'];
    if ($size == 0) {
        throw new RuntimeException("Empty document");
    }

    // Get other assets
    if (isset($root_item['headers']['Content-Type'])
        && ($content_type = $root_item['headers']['Content-Type'])
        && AmberNetworkUtils::is_html_mime_type($content_type)) {
        $body = $root_item['body'];
        $asset_paths = $this->assetHelper->extract_assets($body);

        /* Use the url of the document we end up downloading as a reference point for
           relative asset references, since we may have been redirected from the one
           we originally requested. */
        $document_url = $root_item['info']['url'];
        $assets = $this->assetHelper->expand_asset_references(
            $document_url,
            $asset_paths,
            $this->assetHelper->extract_base_tag($body)
        );
        $assets = $this->download_assets($assets, $document_url);
        // NOTE(review): $size is presumably increased by reference here so that the
        // check below covers the assets too - confirm against download_css_assets_recursive()
        $assets = $this->download_css_assets_recursive($assets, $document_url, $size);

        $body = $this->assetHelper->rewrite_links($body, $assets);
        $body = $this->assetHelper->insert_banner(
            $body,
            $this->headerText,
            array("url" => $url, "date" => date('Y/m/d'))
        );
        $root_item['body'] = $body;

        /* Check total size of the file combined with its assets */
        // NOTE(review): the * 1024 suggests maxFileSize is expressed in kilobytes - confirm
        if ($size > $this->maxFileSize * 1024) {
            throw new RuntimeException("File size of document + assets too large");
        }
    }

    if (!$this->storage || !$root_item) {
        throw new RuntimeException("Content empty or could not save to disk");
    }

    $result = $this->storage->save(
        $url,
        $root_item['body'],
        $root_item['headers'],
        isset($assets) ? $assets : array()
    );
    if (!$result) {
        throw new RuntimeException("Could not save cache");
    }

    $storage_metadata = $this->storage->get_metadata($url);
    if (empty($storage_metadata)) {
        throw new RuntimeException("Could not retrieve metadata");
    }

    /* The metadata can come back only partially populated (perhaps due to permission
       errors while saving the cache). Fail with a regular exception instead of
       building a result from keys that are not set. */
    $metadata_complete = isset(
        $storage_metadata['id'],
        $storage_metadata['url'],
        $storage_metadata['cache']['amber']['date'],
        $storage_metadata['cache']['amber']['location']
    );
    if (!$metadata_complete) {
        throw new RuntimeException("Incomplete metadata for cached item");
    }

    $amber_cache = $storage_metadata['cache']['amber'];

    return array(
        'id' => $storage_metadata['id'],
        'url' => $storage_metadata['url'],
        'type' => isset($storage_metadata['type']) ? $storage_metadata['type'] : 'application/octet-stream',
        'date' => strtotime($amber_cache['date']),
        'location' => $amber_cache['location'],
        'size' => $size,
        'provider' => $this->storage->provider_id(),
        'provider_id' => $storage_metadata['id'],
    );
}