/**
  * Parses a robots.txt-file and returns regular-expression-rules corresponding to the contained "disallow"-rules
  * that are addressed to the given user-agent.
  *
  * @param PHPCrawlerURLDescriptor $BaseUrl           The root-URL all rules from the robots-txt-file should relate to
  * @param string                  $user_agent_string The useragent all rules from the robots-txt-file should relate to
  * @param string                  $robots_txt_uri    Optional. The location of the robots.txt-file as URI.
  *                                                   If not set, the default robots.txt-file for the given BaseUrl gets parsed.
  *
  * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
  *               that's addressed to the given user-agent.
  */
 public function parseRobotsTxt(PHPCrawlerURLDescriptor $BaseUrl, $user_agent_string, $robots_txt_uri = null)
 {
     // One identifier shared by start() and stop(). The stop()-call used to pass
     // "processing_robots.txt", which does not match the identifier given to start(),
     // so the two calls never referred to the same benchmark.
     $benchmark_id = "processing_robotstxt";
     PHPCrawlerBenchmark::start($benchmark_id);

     // No explicit location given: use the default robots.txt-URL belonging to the base-URL.
     if ($robots_txt_uri === null) {
         $robots_txt_uri = self::getRobotsTxtURL($BaseUrl->url_rebuild);
     }

     // Get the robots.txt-content, passing the user-agent string along with the request.
     $robots_txt_content = PHPCrawlerUtils::getURIContent($robots_txt_uri, $user_agent_string);

     // Stays empty if no content was found, i.e. nothing gets disallowed.
     $non_follow_reg_exps = array();

     // Loose comparison kept on purpose: an empty result is treated like "no content found".
     if ($robots_txt_content != null) {
         // Lines of the robots.txt-content that apply to our user-agent.
         $applying_lines = $this->getUserAgentLines($robots_txt_content, $user_agent_string);

         // Turn the applying lines into regular expressions relative to the root-URL of the base-URL.
         $root_url = PHPCrawlerUtils::getRootUrl($BaseUrl->url_rebuild);
         $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, $root_url);
     }

     PHPCrawlerBenchmark::stop($benchmark_id);

     return $non_follow_reg_exps;
 }
 /**
  * Reads the response-content.
  *
  * @param bool    $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
  *                                this method will not return the content as a string.
  * @param int     &$error_code    Error-code by reference if an error occurred.
  * @param string  &$error_string  Error-string by reference
  * @param bool    &$document_received_completely Flag indicating whether the content was received completely, passed by reference
  *
  * @return string  The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
  */
 protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely)
 {
     $this->content_bytes_received = 0;

     // When streaming, the temporary file has to be writable before anything gets read.
     if ($stream_to_file) {
         $fp = @fopen($this->tmpFile, "w");
         if (!$fp) {
             $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
             $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
             return "";
         }
     }

     // Working state of the read-loop
     $source_complete = "";          // content collected in memory (only appended to when not streaming)
     $source_portion = "";           // content that was not handed to the link-finder yet
     $gzip_encoded_content = null;   // NULL until the first chunk was inspected
     $document_completed = false;
     $document_received_completely = true;

     // Resume data-transfer-time benchmark
     PHPCrawlerBenchmark::start("data_transfer_time");

     do {
         // Read the next chunk; the flags and error-values are updated by reference
         $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
         $source_portion .= $content_chunk;

         // The gzip-check is done once, on the first chunk only
         if ($gzip_encoded_content === null) {
             $gzip_encoded_content = (bool) PHPCrawlerEncodingUtils::isGzipEncoded($content_chunk);
         }

         // Chunk goes either to the temporary file or to the in-memory source
         if ($stream_to_file) {
             @fwrite($fp, $content_chunk);
         } else {
             $source_complete .= $content_chunk;
         }

         // Gzip-encoded content gets decoded in one go as soon as the document is complete
         if ($document_completed && $gzip_encoded_content) {
             $source_complete = $source_portion = PHPCrawlerEncodingUtils::decodeGZipContent($source_complete);
         }

         // Links are searched when the buffer filled up (plain, in-memory content only)
         // or, regardless of everything else, when the document is complete.
         $buffer_filled = !$gzip_encoded_content && !$stream_to_file && strlen($source_portion) >= $this->content_buffer_size;

         if (($buffer_filled || $document_completed)
             && PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)
         ) {
             // Link-searching is excluded from the data-transfer-time
             PHPCrawlerBenchmark::stop("data_transfer_time");

             $this->LinkFinder->findLinksInHTMLChunk($source_portion);

             // Keep the configured overlap so that the tail of this portion is searched again with the next one
             $source_portion = $this->source_overlap_size > 0
                 ? substr($source_portion, -$this->source_overlap_size)
                 : "";

             PHPCrawlerBenchmark::start("data_transfer_time");
         }
     } while (!$document_completed);

     if ($stream_to_file) {
         @fclose($fp);
     }

     // Stop data-transfer-time benchmark and remember the measured time
     PHPCrawlerBenchmark::stop("data_transfer_time");
     $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");

     return $source_complete;
 }
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array  Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");

     $url_parts = PHPCrawlerUtils::splitURL($target_url);

     // The domain is derived from a crawled URL, so it is bound as a parameter
     // instead of being concatenated into the SQL-string.
     $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");

     if ($Statement === false || $Statement->execute(array($url_parts["domain"])) === false) {
         // The query couldn't be prepared or executed: report "no cookies" instead of
         // calling methods on a non-object.
         PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
         return array();
     }

     $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
     $Statement->closeCursor();

     if (!is_array($rows)) {
         $rows = array();
     }

     // Indexed by cookie-name, so a later row with the same name replaces an earlier one
     $return_cookies = array();

     foreach ($rows as $row) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         // (the "#"-delimiter is passed to preg_quote() so that a "#" in the value can't end the pattern)
         $domain_matches = $row["domain"] == $url_parts["host"]
             || preg_match("#" . preg_quote($row["domain"], "#") . "\$#", $url_parts["host"]);

         if (!$domain_matches) {
             continue;
         }

         // Does the path match? (the cookie-path has to be a prefix of the URL-path)
         if (!preg_match("#^" . preg_quote($row["path"], "#") . "#", $url_parts["path"])) {
             continue;
         }

         $Cookie = new PHPCrawlerCookieDescriptor($row["source_url"], $row["name"], $row["value"], $row["expires"], $row["path"], $row["domain"]);
         $return_cookies[$Cookie->name] = $Cookie;
     }

     // Convert to numeric array
     $return_cookies = array_values($return_cookies);

     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");

     return $return_cookies;
 }
 /**
  * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
  */
 public function findLinksInHTMLChunk(&$html_source)
 {
     PHPCrawlerBenchmark::start("searching_for_links_in_page");

     // Base-URL and meta-tags are evaluated only once, on the first chunk passed in
     if (!$this->top_lines_processed) {
         $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);

         if ($meta_base_url != null) {
             // A base-URL from a meta-tag replaces the base all further links get resolved against
             $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
             $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
         }

         $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);
         $this->top_lines_processed = true;
     }

     // Prepare HTML-chunk
     $this->prepareHTMLChunk($html_source);

     // Alternation of all attribute-names links get extracted from, e.g. "href|src|..."
     $tag_names = array();
     $tag_count = count($this->extract_tags);
     for ($i = 0; $i < $tag_count; $i++) {
         $tag_names[] = $this->extract_tags[$i];
     }
     $tag_regex_part = implode("|", $tag_names);

     // Pass 1: well formed <a ...>LINKTEXT</a>-tags, yields the link AND the linktext.
     // This pass has to run before all others.
     $anchor_pattern = "#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*"
         . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>"
         . "((?:(?!<\\s*\\/a\\s*>).){0,500})"
         . "<\\s*\\/a\\s*># is";

     preg_match_all($anchor_pattern, $html_source, $matches);

     $match_count = count($matches[0]);
     for ($i = 0; $i < $match_count; $i++) {
         $link_raw = trim($matches[1][$i]);
         if (empty($link_raw)) {
             continue;
         }

         $this->addLinkToCache($link_raw, trim($matches[0][$i]), $matches[2][$i]);
     }

     // Pass 2: everything that could be a link inside of any <>-tag (no linktext available)
     $tag_pattern = "#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*"
         . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is";

     preg_match_all($tag_pattern, $html_source, $matches);

     $match_count = count($matches[0]);
     for ($i = 0; $i < $match_count; $i++) {
         $link_raw = trim($matches[1][$i]);
         if (empty($link_raw)) {
             continue;
         }

         $this->addLinkToCache($link_raw, trim($matches[0][$i]), "");
     }

     // Pass 3 (aggressive search only): link-like things outside of regular tags
     if ($this->aggressive_search == true) {
         $pregs = array(
             // Links like "...:url("animage.gif")..."
             "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is",
             // Everything like "...href="bla.html"..." with quotes
             "/[\\s\\.:;\"'](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is",
             // Everything like "...href=bla.html..." without quotes
             "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is",
         );

         foreach ($pregs as $pattern) {
             // Start from a clean slate so results of a previous pattern can never be picked up again
             unset($matches);
             preg_match_all($pattern, $html_source, $matches);

             $match_count = count($matches[0]);
             for ($i = 0; $i < $match_count; $i++) {
                 // In all three patterns the link itself is captured by the second group
                 $this->addLinkToCache(trim($matches[2][$i]), trim($matches[0][$i]), "");
             }
         }
     }

     $this->found_links_map = array();

     PHPCrawlerBenchmark::stop("searching_for_links_in_page");
 }
Esempio n. 5
0
 /**
  * Checks if the crawling-process should be aborted.
  *
  * @return int NULL if the process shouldn't be aborted yet, otherwise one of the PHPCrawlerAbortReasons::ABORTREASON-constants.
  */
 protected function checkForAbort()
 {
     PHPCrawlerBenchmark::start("checkning_for_abort");

     $status = $this->CrawlerStatusHandler->getCrawlerStatus();

     // Start out with the abort-reason already stored in the status (NULL if there is none)
     $abort_reason = $status->abort_reason;

     // Traffic-limit set and reached?
     $traffic_limit_reached = $this->traffic_limit > 0 && $status->bytes_received >= $this->traffic_limit;
     if ($traffic_limit_reached) {
         $abort_reason = PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED;
     }

     // Request-limit set and reached? Depending on the setting, either the received
     // documents or the followed links are the ones counted against the limit.
     if ($this->request_limit > 0) {
         $counted_requests = $this->only_count_received_documents
             ? $status->documents_received
             : $status->links_followed;

         if ($counted_requests >= $this->request_limit) {
             $abort_reason = PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED;
         }
     }

     // Write the (possibly unchanged) abort-reason back to the crawler-status
     $this->CrawlerStatusHandler->updateCrawlerStatus(null, $abort_reason);

     PHPCrawlerBenchmark::stop("checkning_for_abort");

     return $abort_reason;
 }
 /**
  * Updates the status of the crawler
  *
  * @param PHPCrawlerDocumentInfo $PageInfo          The PHPCrawlerDocumentInfo-object of the last received document
  *                                                  or NULL if no document was received.
  * @param int                    $abort_reason      One of the PHPCrawlerAbortReasons::ABORTREASON-constants if the crawling-process
  *                                                  should get aborted, otherwise NULL
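  * @param string                 $first_content_url The first URL some content was found in (or NULL if no content was found so far).
  * @param mixed                  $last_request_time Optional. If not NULL, it gets stored as the last request-time in the crawler-status.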
  */
 public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_content_url = null, $last_request_time = null)
 {
     PHPCrawlerBenchmark::start("updating_crawler_status");

     // Acquire a semaphore around the read-modify-write below if status-updates have to be locked
     if ($this->lock_status_updates) {
         $sem_key = sem_get($this->crawler_uniqid);
         sem_acquire($sem_key);
     }

     $status = $this->getCrawlerStatus();

     // Counters and benchmark-sums only change if a document-info was passed in
     if ($PageInfo != null) {
         // Every passed document-info counts as one followed link
         ++$status->links_followed;

         // Only documents that were actually received count as received
         if ($PageInfo->received) {
             ++$status->documents_received;
         }

         // Received bytes include content and header
         $status->bytes_received += $PageInfo->bytes_received + $PageInfo->header_bytes_received;

         // Timing-values are summed up for requests without errors only
         if (!$PageInfo->error_occured) {
             // server connect time
             $status->sum_server_connect_time += $PageInfo->server_connect_time;
             ++$status->sum_server_connects;

             // server response time
             $status->sum_server_response_time += $PageInfo->server_response_time;
             ++$status->sum_server_responses;

             // data transfer time
             $status->sum_data_transfer_time += $PageInfo->data_transfer_time;

             // unbuffered bytes read
             $status->unbuffered_bytes_read += $PageInfo->unbuffered_bytes_read;
         }
     }

     // Optional values overwrite the stored ones only if they were given (not NULL)
     if ($abort_reason !== null) {
         $status->abort_reason = $abort_reason;
     }

     if ($first_content_url !== null) {
         $status->first_content_url = $first_content_url;
     }

     if ($last_request_time !== null) {
         $status->last_request_time = $last_request_time;
     }

     // Write crawler-status back
     $this->setCrawlerStatus($status);

     // Release the semaphore again
     if ($this->lock_status_updates) {
         sem_release($sem_key);
     }

     PHPCrawlerBenchmark::stop("updating_crawler_status");
 }
 /**
  * Creates the sqlite-db-file and opens connection to it.
  *
  * @param bool $create_tables Defines whether all necessary tables should be created
  */
 protected function openConnection($create_tables = false)
 {
     PHPCrawlerBenchmark::start("connecting_to_sqlite_db");

     // Open sqlite-file
     try {
         $this->PDO = new PDO("sqlite:" . $this->sqlite_db_file);
     } catch (Exception $e) {
         // Re-throw with a hint for the user, but keep the original exception attached
         // as "previous" so its type and stack-trace don't get lost.
         throw new Exception("Error creating SQLite-cache-file, " . $e->getMessage() . ", try installing sqlite3-extension for PHP.", 0, $e);
     }

     // Connection-settings: no journal, errors reported as warnings, timeout of 100
     $this->PDO->exec("PRAGMA journal_mode = OFF");
     $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
     $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);

     if ($create_tables == true) {
         // Create url-table (if not exists)
         $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,\n                                                         in_process bool DEFAULT 0,\n                                                         processed bool DEFAULT 0,\n                                                         priority_level integer,\n                                                         distinct_hash TEXT UNIQUE,\n                                                         link_raw TEXT,\n                                                         linkcode TEXT,\n                                                         linktext TEXT,\n                                                         refering_url TEXT,\n                                                         url_rebuild TEXT,\n                                                         is_redirect_url bool,\n                                                         url_link_depth integer);");

         // Create indexes (seems that indexes make the whole thing slower)
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");
         $this->PDO->exec("ANALYZE;");
     }

     PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
 }
 /**
  * Filters the given URLs (contained in the given PHPCrawlerDocumentInfo-object) by the given rules.
  *
  * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
  */
 public function filterUrls(PHPCrawlerDocumentInfo $DocumentInfo)
 {
     PHPCrawlerBenchmark::start("filtering_urls");

     // Made available as the current document for the duration of the rule-checks
     $this->CurrentDocumentInfo = $DocumentInfo;

     // Links that don't match the rules get set to NULL in place (they are not removed),
     // so the remaining entries keep their indexes.
     $cnt = count($DocumentInfo->links_found_url_descriptors);
     for ($x = 0; $x < $cnt; $x++) {
         if (!$this->urlMatchesRules($DocumentInfo->links_found_url_descriptors[$x])) {
             $DocumentInfo->links_found_url_descriptors[$x] = null;
         }
     }

     $this->CurrentDocumentInfo = null;

     PHPCrawlerBenchmark::stop("filtering_urls");
 }