/**
  * Updates the status of the crawler
  *
  * Aggregates counters and benchmark-sums from the last received document into the
  * shared crawler-status object and optionally sets abort-reason, first-content-URL
  * and last request-time. When $this->lock_status_updates is enabled (multi-process
  * mode), the whole read-modify-write cycle is serialized through a SysV-semaphore
  * keyed by the crawler's unique id.
  *
  * @param PHPCrawlerDocumentInfo $PageInfo The PHPCrawlerDocumentInfo-object of the last received document
  *                                                  or NULL if no document was received.
  * @param int $abort_reason One of the PHPCrawlerAbortReasons::ABORTREASON-constants if the crawling-process
  *                                                  should get aborted, otherwise NULL
  * @param string $first_content_url The first URL some content was found in (or NULL if no content was found so far).
  * @param mixed $last_request_time Duration of the last request (or NULL to leave the status-value untouched).
  *                                                  Stored as-is into $crawler_status->last_request_time.
  */
 public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_content_url = null, $last_request_time = null)
 {
     PHPCrawlerBenchmark::start("updating_crawler_status");
     // Set semaphore/lock if status-updates have to be synchronized between processes
     if ($this->lock_status_updates == true) {
         $sem_key = sem_get($this->crawler_uniqid);
         sem_acquire($sem_key);
     }
     // Get current Status
     $crawler_status = $this->getCrawlerStatus();
     // Update status
     if ($PageInfo != null) {
         // Increase number of followed links
         $crawler_status->links_followed++;
         // Increase documents_received-counter
         if ($PageInfo->received == true) {
             $crawler_status->documents_received++;
         }
         // Increase bytes-counter (body plus header bytes)
         $crawler_status->bytes_received += $PageInfo->bytes_received + $PageInfo->header_bytes_received;
         // Benchmarks (only accumulated for requests that completed without error)
         if ($PageInfo->error_occured == false) {
             // server connect time
             $crawler_status->sum_server_connect_time += $PageInfo->server_connect_time;
             $crawler_status->sum_server_connects++;
             // server response time
             $crawler_status->sum_server_response_time += $PageInfo->server_response_time;
             $crawler_status->sum_server_responses++;
             // data transfer time
             $crawler_status->sum_data_transfer_time += $PageInfo->data_transfer_time;
             // unbuffered bytes read
             $crawler_status->unbuffered_bytes_read += $PageInfo->unbuffered_bytes_read;
         }
     }
     // Set abortreason
     if ($abort_reason !== null) {
         $crawler_status->abort_reason = $abort_reason;
     }
     // Set first_content_url
     if ($first_content_url !== null) {
         $crawler_status->first_content_url = $first_content_url;
     }
     // Set last request-time
     if ($last_request_time !== null) {
         $crawler_status->last_request_time = $last_request_time;
     }
     // Write crawler-status back
     $this->setCrawlerStatus($crawler_status);
     // Remove semaphore/lock ($sem_key is set above under the same condition)
     if ($this->lock_status_updates == true) {
         sem_release($sem_key);
     }
     PHPCrawlerBenchmark::stop("updating_crawler_status");
 }
 /**
  * Parses a robots.txt-file and returns regular-expression-rules corresponding to the containing "disallow"-rules
  * that are addressed to the given user-agent.
  *
  * @param PHPCrawlerURLDescriptor $BaseUrl The root-URL all rules from the robots-txt-file should relate to
  * @param string $user_agent_string The useragent all rules from the robots-txt-file should relate to
  * @param string $robots_txt_uri Optional. The location of the robots.txt-file as URI.
  *                               If not set, the default robots.txt-file for the given BaseUrl gets parsed.
  *
  * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
  *               that's addressed to the given user-agent. Empty array if no robots.txt-content was found.
  */
 public function parseRobotsTxt(PHPCrawlerURLDescriptor $BaseUrl, $user_agent_string, $robots_txt_uri = null)
 {
     PHPCrawlerBenchmark::start("processing_robotstxt");
     // If robots_txt_uri not given, use the default one for the given BaseUrl
     if ($robots_txt_uri === null) {
         $robots_txt_uri = self::getRobotsTxtURL($BaseUrl->url_rebuild);
     }
     // Get robots.txt-content
     $robots_txt_content = PHPCrawlerUtils::getURIContent($robots_txt_uri, $user_agent_string);
     $non_follow_reg_exps = array();
     // If content was found
     if ($robots_txt_content != null) {
         // Get all lines in the robots.txt-content that are addressed to our user-agent.
         $applying_lines = $this->getUserAgentLines($robots_txt_content, $user_agent_string);
         // Get valid reg-expressions for the given disallow-paths.
         $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, PHPCrawlerUtils::getRootUrl($BaseUrl->url_rebuild));
     }
     // BUGFIX: the benchmark was started as "processing_robotstxt" but stopped as
     // "processing_robots.txt", so the timer was never stopped. Use one consistent tag.
     PHPCrawlerBenchmark::stop("processing_robotstxt");
     return $non_follow_reg_exps;
 }
 /**
  * Creates the sqlite-db-file and opens connection to it.
  *
  * Opens a PDO-connection to the SQLite-file given in $this->sqlite_db_file, turns the
  * journal off for speed, switches PDO into exception error-mode and optionally creates
  * the urls-table together with its indexes.
  *
  * @param bool $create_tables Defines whether all necessary tables should be created
  * @throws Exception If the PDO/SQLite-connection couldn't be established.
  */
 protected function openConnection($create_tables = false)
 {
     PHPCrawlerBenchmark::start("connecting_to_sqlite_db");
     // Open sqlite-file
     try {
         $this->PDO = new PDO("sqlite:" . $this->sqlite_db_file);
     } catch (Exception $e) {
         // Chain the original exception as "previous" so the root cause isn't lost
         throw new Exception("Error creating SQLite-cache-file, " . $e->getMessage() . ", try installing sqlite3-extension for PHP.", 0, $e);
     }
     // Journal disabled on purpose: the cache-file is throwaway, durability doesn't matter
     $this->PDO->exec("PRAGMA journal_mode = OFF");
     $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
     $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);
     if ($create_tables == true) {
         // Create url-table (if not exists)
         $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,\n                                                         in_process bool DEFAULT 0,\n                                                         processed bool DEFAULT 0,\n                                                         priority_level integer,\n                                                         distinct_hash TEXT UNIQUE,\n                                                         link_raw TEXT,\n                                                         linkcode TEXT,\n                                                         linktext TEXT,\n                                                         refering_url TEXT,\n                                                         url_rebuild TEXT,\n                                                         is_redirect_url bool,\n                                                         url_link_depth integer);");
         // Create indexes (seems that indexes make the whole thingy slower)
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
         $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");
         $this->PDO->exec("ANALYZE;");
     }
     PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
 }
 /**
  * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
  *
  * On the first chunk of a document the top of the source is scanned for a meta base-URL
  * (<base href>) and for meta-tags. Afterwards up to three regex-passes extract candidate
  * links: (1) well-formed <a ...>...</a> tags (link AND linktext), (2) any configured
  * link-attribute inside arbitrary tags, and (3) -- in aggressive mode -- loosely
  * quoted/unquoted attribute- and url()-style values.
  *
  * @param string &$html_source The HTML-chunk to search in. Passed by reference; may get
  *                             modified by prepareHTMLChunk().
  */
 public function findLinksInHTMLChunk(&$html_source)
 {
     PHPCrawlerBenchmark::start("searching_for_links_in_page");
     // Check for meta-base-URL and meta-tags in top of HTML-source (done only once per document)
     if ($this->top_lines_processed == false) {
         $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
         if ($meta_base_url != null) {
             // A <base href="..."> overrides the document-URL as base for relative links
             $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
             $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
         }
         // Get all meta-tags
         $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);
         // Set flag that top-lines of source were processed
         $this->top_lines_processed = true;
     }
     // Prepare HTML-chunk
     $this->prepareHTMLChunk($html_source);
     // Build the RegEx-part for html-attributes to search links in, e.g. "href|src|url"
     $tag_regex_part = "";
     $cnt = count($this->extract_tags);
     for ($x = 0; $x < $cnt; $x++) {
         $tag_regex_part .= "|" . $this->extract_tags[$x];
     }
     // Strip the leading "|" added by the loop above
     $tag_regex_part = substr($tag_regex_part, 1);
     // 1. <a href="...">LINKTEXT</a> (well formed link with </a> at the end and quotes around the link)
     // Get the link AND the linktext from these tags
     // This has to be done FIRST !!
     // (the branch-reset group (?|...) makes the link land in $matches[1] regardless of quoting style;
     //  the linktext is capped at 500 chars to bound backtracking)
     preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" . "((?:(?!<\\s*\\/a\\s*>).){0,500})" . "<\\s*\\/a\\s*># is", $html_source, $matches);
     $cnt = count($matches[0]);
     for ($x = 0; $x < $cnt; $x++) {
         $link_raw = trim($matches[1][$x]);
         $linktext = $matches[2][$x];
         $linkcode = trim($matches[0][$x]);
         if (!empty($link_raw)) {
             $this->addLinkToCache($link_raw, $linkcode, $linktext);
         }
     }
     // Second regex (everything that could be a link inside of <>-tags); no linktext available here
     preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);
     $cnt = count($matches[0]);
     for ($x = 0; $x < $cnt; $x++) {
         $link_raw = trim($matches[1][$x]);
         $linktext = "";
         $linkcode = trim($matches[0][$x]);
         if (!empty($link_raw)) {
             $this->addLinkToCache($link_raw, $linkcode, $linktext);
         }
     }
     // Now, if aggressive_mode is set to true, we look for some
     // other things (values outside of tags, e.g. in CSS or inline scripts)
     $pregs = array();
     if ($this->aggressive_search == true) {
         // Links like "...:url("animage.gif")..."
         $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is";
         // Everything like "...href="bla.html"..." with quotes
         $pregs[] = "/[\\s\\.:;\"'](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is";
         // Everything like "...href=bla.html..." without quotes
         $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is";
         for ($x = 0; $x < count($pregs); $x++) {
             unset($matches);
             preg_match_all($pregs[$x], $html_source, $matches);
             $cnt = count($matches[0]);
             for ($y = 0; $y < $cnt; $y++) {
                 // In these patterns the link itself is always the 2nd capture-group
                 $link_raw = trim($matches[2][$y]);
                 $linkcode = trim($matches[0][$y]);
                 $linktext = "";
                 $this->addLinkToCache($link_raw, $linkcode, $linktext);
             }
         }
     }
     // Reset the per-chunk map of found links -- presumably a dedupe-map filled by
     // addLinkToCache(); NOTE(review): verify against the rest of the class.
     $this->found_links_map = array();
     PHPCrawlerBenchmark::stop("searching_for_links_in_page");
 }
// Beispiel #5 (0)
 /**
  * Starts the crawler by using multi processes.
  *
  * When using this method instead of the {@link go()}-method to start the crawler, phpcrawl will use the given
  * number of processes simultaneously for spidering the target-url.
  * Using multi processes will speed up the crawling-progress dramatically in most cases.
  *
  * There are some requirements though to successfully run the crawler in multi-process mode:
  * <ul>
  * <li> The multi-process mode only works on unix-based systems (linux)</li>
  * <li> Scripts using the crawler have to be run from the commandline (cli)</li>
  * <li> The <a href="http://php.net/manual/en/pcntl.installation.php">PCNTL-extension</a> for php (process control) has to be installed and activated.</li>
  * <li> The <a href="http://php.net/manual/en/sem.installation.php">SEMAPHORE-extension</a> for php has to be installed and activated.</li>
  * <li>The <a href="http://de.php.net/manual/en/posix.installation.php">POSIX-extension</a> for php has to be installed and activated.</li>
  * <li> The <a href="http://de2.php.net/manual/en/pdo.installation.php">PDO-extension</a> together with the SQLite-driver (PDO_SQLITE) has to be installed and activated.</li>
  * </ul>
  *
  * PHPCrawl supports two different modes of multiprocessing:
  * <ol>
  * <li><b>{@link PHPCrawlerMultiProcessModes}::MPMODE_PARENT_EXECUTES_USERCODE</b>
  *
  * The crawler uses multi processes simultaneously for spidering the target URL, but the usercode provided to
  * the overridable function {@link handleDocumentInfo()} gets always executed on the same main-process. This
  * means that the <b>usercode never gets executed simultaneously</b> and so you don't have to care about
  * concurrent file/database/handle-accesses or similar things.
  * But on the other side the usercode may slow down the crawling-procedure because every child-process has to
  * wait until the usercode got executed on the main-process. <b>This is the recommended multiprocess-mode!</b>
  * </li>
  * <li><b>{@link PHPCrawlerMultiProcessModes}::MPMODE_CHILDS_EXECUTES_USERCODE</b>
  *
  * The crawler uses multi processes simultaneously for spidering the target URL, and every child-process executes
  * the usercode provided to the overridable function {@link handleDocumentInfo()} directly from its process. This
  * means that the <b>usercode gets executed simultaneously</b> by the different child-processes and you should
  * take care of concurrent file/data/handle-accesses properly (if used).
  *
  * When using this mode and you use any handles like database-connections or filestreams in your extended
  * crawler-class, you should open them within the overridden method {@link initChildProcess()} instead of opening
  * them from the constructor. For more details see the documentation of the {@link initChildProcess()}-method.
  * </li>
  * </ol>
  *
  * Example for starting the crawler with 5 processes using the recommended MPMODE_PARENT_EXECUTES_USERCODE-mode:
  * <code>
  * $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE);
  * </code>
  *
  * Please note that increasing the number of processes to high values doesn't automatically mean that the crawling-process
  * will go off faster! Using 3 to 5 processes should be good values to start from.
  *
  * @param int $process_count Number of processes to use
  * @param int $multiprocess_mode The multiprocess-mode to use.
  *                               One of the {@link PHPCrawlerMultiProcessModes}-constants
  * @throws Exception If any of the required extensions (PCNTL, SEMAPHORE, POSIX, PDO) is missing.
  * @section 1 Basic settings
  */
 public function goMultiProcessed($process_count = 3, $multiprocess_mode = 1)
 {
     $this->multiprocess_mode = $multiprocess_mode;
     $this->child_process_count = $process_count;
     // Check if fork is supported
     if (!function_exists("pcntl_fork")) {
         throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function pcntl_fork() missing)." . "Try running from command-line (cli) and/or installing the PHP PCNTL-extension.");
     }
     if (!function_exists("sem_get")) {
         throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function sem_get() missing)." . "Try installing the PHP SEMAPHORE-extension.");
     }
     if (!function_exists("posix_kill")) {
         throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function posix_kill() missing)." . "Try installing the PHP POSIX-extension.");
     }
     if (!class_exists("PDO")) {
         throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (class PDO missing)." . "Try installing the PHP PDO-extension.");
     }
     PHPCrawlerBenchmark::start("crawling_process");
     // Set url-cache-type to sqlite (the file-based cache is shareable between processes).
     $this->url_cache_type = PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE;
     // Init process
     $this->initCrawlerProcess();
     // Process robots.txt
     if ($this->obey_robots_txt == true) {
         $this->processRobotsTxt();
     }
     // Fork off child-processes
     $pids = array();
     for ($i = 1; $i <= $process_count; $i++) {
         $pids[$i] = pcntl_fork();
         if (!$pids[$i]) {
             // Childprocess goes here (pcntl_fork() returns 0 in the child)
             // NOTE(review): "is_chlid_process" keeps its typo -- the property is declared
             // elsewhere in the class under this name.
             $this->is_chlid_process = true;
             $this->child_process_number = $i;
             $this->ProcessHandler->registerChildPID(getmypid());
             $this->startChildProcessLoop();
         }
     }
     // Set flag "parent-process"
     $this->is_parent_process = true;
     // Determinate all child-PIDs
     $this->child_pids = $this->ProcessHandler->getChildPIDs($process_count);
     // If crawler runs in MPMODE_PARENT_EXECUTES_USERCODE-mode -> start controller-loop
     if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
         $this->startControllerProcessLoop();
     }
     // Wait for childs to finish (avoids zombie-processes)
     for ($i = 1; $i <= $process_count; $i++) {
         pcntl_waitpid($pids[$i], $status, WUNTRACED);
     }
     // Get crawler-status (needed for process-report)
     $this->crawlerStatus = $this->CrawlerStatusHandler->getCrawlerStatus();
     // Cleanup crawler
     $this->cleanup();
     PHPCrawlerBenchmark::stop("crawling_process");
 }
 /**
  * Filters the given URLs (contained in the given PHPCrawlerDocumentInfo-object) by the given rules.
  *
  * URL-descriptors that don't match the follow-rules get replaced with NULL in-place
  * within $DocumentInfo->links_found_url_descriptors (the array is modified, not copied).
  *
  * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
  */
 public function filterUrls(PHPCrawlerDocumentInfo $DocumentInfo)
 {
     PHPCrawlerBenchmark::start("filtering_urls");
     // Make the current document available to urlMatchesRules() while filtering
     $this->CurrentDocumentInfo = $DocumentInfo;
     // (removed unused local $filtered_urls -- it was never written to or returned)
     $cnt = count($DocumentInfo->links_found_url_descriptors);
     for ($x = 0; $x < $cnt; $x++) {
         // Null out every URL-descriptor that doesn't match the configured rules
         if (!$this->urlMatchesRules($DocumentInfo->links_found_url_descriptors[$x])) {
             $DocumentInfo->links_found_url_descriptors[$x] = null;
         }
     }
     $this->CurrentDocumentInfo = null;
     PHPCrawlerBenchmark::stop("filtering_urls");
 }
 /**
  * Returns all cookies from the cache that are addressed to the given URL
  *
  * @param string $target_url The target-URL
  * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
  */
 public function getCookiesForUrl($target_url)
 {
     PHPCrawlerBenchmark::start("getting_cookies_from_cache");
     $url_parts = PHPCrawlerUtils::splitURL($target_url);
     $return_cookies = array();
     // SECURITY-FIX: use a prepared statement instead of interpolating the domain into
     // the SQL-string -- the domain is derived from an arbitrary URL and must not be
     // trusted (a quote in it previously broke or injected into the query).
     $Result = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
     $Result->execute(array($url_parts["domain"]));
     $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
     $Result->closeCursor();
     $cnt = count($rows);
     for ($x = 0; $x < $cnt; $x++) {
         // Does the cookie-domain match?
         // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
         // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
         // FIX: pass the pattern-delimiter "#" to preg_quote() so a literal "#" in the
         // stored value cannot break the pattern.
         if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"], "#") . "\$#", $url_parts["host"])) {
             // Does the path match?
             if (preg_match("#^" . preg_quote($rows[$x]["path"], "#") . "#", $url_parts["path"])) {
                 $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
                 // Use cookie-name as index to avoid double-cookies
                 $return_cookies[$Cookie->name] = $Cookie;
             }
         }
     }
     // Convert to numeric array
     $return_cookies = array_values($return_cookies);
     PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
     return $return_cookies;
 }
 /**
  * Reads the response-content.
  *
  * Reads the body in chunks, optionally streaming it to the temporary file, detects and
  * (on completion) decodes gzip-encoded content, and periodically hands buffered source
  * to the LinkFinder whenever the buffer exceeds $this->content_buffer_size or the
  * document is complete.
  *
  * NOTE(review): the required by-reference parameters follow an optional parameter --
  * deprecated signature-order as of PHP 8.0; changing it would affect all callers, so
  * it is left as-is here.
  *
  * @param bool $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
  *                             this method will not return the content as a string.
  * @param int &$error_code Error-code by reference if an error occurred.
  * @param string &$error_string Error-string by reference
  * @param bool &$document_received_completely Flag indicating whether the content was received completely, passed by reference
  *
  * @return string The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
  */
 protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely)
 {
     $this->content_bytes_received = 0;
     // If content should be streamed to file
     if ($stream_to_file == true) {
         $fp = @fopen($this->tmpFile, "w");
         if ($fp == false) {
             // Report the failure through the by-reference out-params and bail out
             $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
             $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
             return "";
         }
     }
     // Init
     $source_portion = "";
     $source_complete = "";
     $document_received_completely = true;
     $document_completed = false;
     $gzip_encoded_content = null;
     // Resume data-transfer-time benchmark
     PHPCrawlerBenchmark::start("data_transfer_time");
     while ($document_completed == false) {
         // Get chunk from content
         $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
         $source_portion .= $content_chunk;
         // Check if content is gzip-encoded (check only first chunk; null means "not checked yet")
         if ($gzip_encoded_content === null) {
             if (PHPCrawlerEncodingUtils::isGzipEncoded($content_chunk)) {
                 $gzip_encoded_content = true;
             } else {
                 $gzip_encoded_content = false;
             }
         }
         // Stream to file or store source in memory
         if ($stream_to_file == true) {
             @fwrite($fp, $content_chunk);
         } else {
             $source_complete .= $content_chunk;
         }
         // Decode gzip-encoded content when done with document
         // (can't decode earlier -- the complete compressed stream is needed)
         if ($document_completed == true && $gzip_encoded_content == true) {
             $source_complete = $source_portion = PHPCrawlerEncodingUtils::decodeGZipContent($source_complete);
         }
         // Find links in portion of the source.
         // NOTE(review): due to &&/|| precedence this fires when the buffer-condition holds
         // OR whenever the document is completed -- which appears intentional (the final,
         // possibly gzip-decoded portion must always be searched).
         if ($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= $this->content_buffer_size || $document_completed == true) {
             if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
                 // Pause the transfer-benchmark while link-searching (CPU-time, not transfer-time)
                 PHPCrawlerBenchmark::stop("data_transfer_time");
                 $this->LinkFinder->findLinksInHTMLChunk($source_portion);
                 // Keep an overlap so links spanning a chunk-boundary aren't missed
                 if ($this->source_overlap_size > 0) {
                     $source_portion = substr($source_portion, -$this->source_overlap_size);
                 } else {
                     $source_portion = "";
                 }
                 PHPCrawlerBenchmark::start("data_transfer_time");
             }
         }
     }
     if ($stream_to_file == true) {
         @fclose($fp);
     }
     // Stop data-transfer-time benchmark
     PHPCrawlerBenchmark::stop("data_transfer_time");
     $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
     return $source_complete;
 }