/**
 * Parses a robots.txt-file and returns regular-expression-rules corresponding to the contained
 * "disallow"-rules that are addressed to the given user-agent.
 *
 * @param PHPCrawlerURLDescriptor $BaseUrl           The root-URL all rules from the robots.txt-file should relate to
 * @param string                  $user_agent_string The useragent all rules from the robots.txt-file should relate to
 * @param string                  $robots_txt_uri    Optional. The location of the robots.txt-file as URI.
 *                                                   If not set, the default robots.txt-file for the given BaseUrl gets parsed.
 *
 * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the
 *               robots.txt-file that's addressed to the given user-agent.
 */
public function parseRobotsTxt(PHPCrawlerURLDescriptor $BaseUrl, $user_agent_string, $robots_txt_uri = null)
{
  PHPCrawlerBenchmark::start("processing_robotstxt");

  // If robots_txt_uri not given, use the default one for the given BaseUrl
  if ($robots_txt_uri === null)
  {
    $robots_txt_uri = self::getRobotsTxtURL($BaseUrl->url_rebuild);
  }

  // Get robots.txt-content
  $robots_txt_content = PHPCrawlerUtils::getURIContent($robots_txt_uri, $user_agent_string);

  $non_follow_reg_exps = array();

  // If content was found
  if ($robots_txt_content != null)
  {
    // Get all lines in the robots.txt-content that are addressed to our user-agent.
    $applying_lines = $this->getUserAgentLines($robots_txt_content, $user_agent_string);

    // Get valid reg-expressions for the given disallow-pathes.
    $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, PHPCrawlerUtils::getRootUrl($BaseUrl->url_rebuild));
  }

  // Fix: the stop-tag has to match the start-tag ("processing_robotstxt").
  // The original code stopped "processing_robots.txt", so this benchmark-timer never got stopped.
  PHPCrawlerBenchmark::stop("processing_robotstxt");

  return $non_follow_reg_exps;
}
/**
 * Creates the sqlite-db-file (if necessary) and opens the PDO-connection to it.
 *
 * Sets up the connection with exceptions as error-mode, a generous busy-timeout and
 * journaling turned off (pure cache-file, durability not needed).
 *
 * @param bool $create_tables Defines whether all necessary tables should be created
 * @throws Exception If the SQLite-file couldn't be created/opened (e.g. missing PDO-sqlite driver).
 */
protected function openConnection($create_tables = false)
{
  PHPCrawlerBenchmark::start("connecting_to_sqlite_db");

  // Open the sqlite-file through PDO
  try
  {
    $this->PDO = new PDO("sqlite:" . $this->sqlite_db_file);
  }
  catch (Exception $e)
  {
    throw new Exception("Error creating SQLite-cache-file, " . $e->getMessage() . ", try installing sqlite3-extension for PHP.");
  }

  // Journaling off: this is just a throwaway cache-db, speed over durability
  $this->PDO->exec("PRAGMA journal_mode = OFF");

  $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
  $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);

  if ($create_tables == true)
  {
    // Create url-table (if not exists)
    $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,\n in_process bool DEFAULT 0,\n processed bool DEFAULT 0,\n priority_level integer,\n distinct_hash TEXT UNIQUE,\n link_raw TEXT,\n linkcode TEXT,\n linktext TEXT,\n refering_url TEXT,\n url_rebuild TEXT,\n is_redirect_url bool,\n url_link_depth integer);");

    // Create indexes and update the query-planner statistics.
    // (Original author's note kept: indexes seem to make the whole thingy slower.)
    $setup_statements = array(
      "CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);",
      "CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);",
      "CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);",
      "CREATE INDEX IF NOT EXISTS processed ON urls (processed);",
      "ANALYZE;",
    );

    foreach ($setup_statements as $sql_statement)
    {
      $this->PDO->exec($sql_statement);
    }
  }

  PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
}
/**
 * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
 *
 * Works on a chunk (not necessarily the complete document), so a "top of document" pass
 * (meta base-URL and meta-tags) is only done once per document, guarded by
 * $this->top_lines_processed. Three search-strategies are applied in order:
 *   1. Well-formed <a ...>linktext</a> tags (captures link AND linktext) — must run first.
 *   2. Any link-attribute inside any <...> tag (no linktext available).
 *   3. Optional "aggressive" patterns (url(...), quoted/unquoted attribute-values anywhere)
 *      when $this->aggressive_search is enabled.
 *
 * @param string &$html_source The HTML-chunk to search, passed by reference (prepareHTMLChunk()
 *                             may modify it in place).
 */
public function findLinksInHTMLChunk(&$html_source)
{
  PHPCrawlerBenchmark::start("searching_for_links_in_page");

  // Check for meta-base-URL and meta-tags in top of HTML-source (only once per document)
  if ($this->top_lines_processed == false)
  {
    // A <base href="..."> meta-tag overrides the base-URL all relative links get resolved against
    $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
    if ($meta_base_url != null)
    {
      $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
      $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
    }

    // Get all meta-tags
    $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);

    // Set flag that top-lines of source were processed
    $this->top_lines_processed = true;
  }

  // Prepare HTML-chunk
  $this->prepareHTMLChunk($html_source);

  // Build the RegEx-part for html-tag-attributes to search links in,
  // e.g. "href|src|content" joined from $this->extract_tags
  $tag_regex_part = "";
  $cnt = count($this->extract_tags);
  for ($x = 0; $x < $cnt; $x++)
  {
    $tag_regex_part .= "|" . $this->extract_tags[$x];
  }
  $tag_regex_part = substr($tag_regex_part, 1);

  // 1. <a href="...">LINKTEXT</a> (well formed link with </a> at the end and quotes around the link)
  // Get the link AND the linktext from these tags
  // This has to be done FIRST !!
  // (branch-reset group (?|...) keeps the URL in $matches[1] regardless of quoting-style;
  //  linktext capped at 500 chars to avoid catastrophic backtracking)
  preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" .
                 "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" .
                 "((?:(?!<\\s*\\/a\\s*>).){0,500})" .
                 "<\\s*\\/a\\s*># is", $html_source, $matches);

  $cnt = count($matches[0]);
  for ($x = 0; $x < $cnt; $x++)
  {
    $link_raw = trim($matches[1][$x]);   // the raw URL from the attribute
    $linktext = $matches[2][$x];         // text between <a ...> and </a>
    $linkcode = trim($matches[0][$x]);   // the complete matched tag-code

    if (!empty($link_raw))
    {
      $this->addLinkToCache($link_raw, $linkcode, $linktext);
    }
  }

  // Second regex (everything that could be a link inside of <>-tags)
  preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" .
                 "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);

  $cnt = count($matches[0]);
  for ($x = 0; $x < $cnt; $x++)
  {
    $link_raw = trim($matches[1][$x]);
    $linktext = "";                      // no linktext available for bare attributes
    $linkcode = trim($matches[0][$x]);

    if (!empty($link_raw))
    {
      $this->addLinkToCache($link_raw, $linkcode, $linktext);
    }
  }

  // Now, if agressive_mode is set to true, we look for some
  // other things
  $pregs = array();

  if ($this->aggressive_search == true)
  {
    // Links like "...:url("animage.gif")..." (e.g. CSS url()-references)
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is";

    // Everything like "...href="bla.html"..." with qoutes
    $pregs[] = "/[\\s\\.:;\"'](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is";

    // Everything like "...href=bla.html..." without qoutes
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is";

    for ($x = 0; $x < count($pregs); $x++)
    {
      unset($matches);
      preg_match_all($pregs[$x], $html_source, $matches);

      $cnt = count($matches[0]);
      for ($y = 0; $y < $cnt; $y++)
      {
        // NOTE: in the aggressive-patterns the URL sits in capture-group 2 (group 1 is the quote-char)
        $link_raw = trim($matches[2][$y]);
        $linkcode = trim($matches[0][$y]);
        $linktext = "";

        $this->addLinkToCache($link_raw, $linkcode, $linktext);
      }
    }
  }

  // Reset the per-chunk deduplication-map — presumably used by addLinkToCache();
  // TODO(review): confirm against the declaration of found_links_map
  $this->found_links_map = array();

  PHPCrawlerBenchmark::stop("searching_for_links_in_page");
}
/**
 * Starts the crawler by using multi processes.
 *
 * When using this method instead of the {@link go()}-method to start the crawler, phpcrawl will use the given
 * number of processes simultaneously for spidering the target-url.
 * Using multi processes will speed up the crawling-progress dramatically in most cases.
 *
 * There are some requirements though to successfully run the crawler in multi-process mode:
 * <ul>
 * <li> The multi-process mode only works on unix-based systems (linux)</li>
 * <li> Scripts using the crawler have to be run from the commandline (cli)</li>
 * <li> The <a href="http://php.net/manual/en/pcntl.installation.php">PCNTL-extension</a> for php (process control) has to be installed and activated.</li>
 * <li> The <a href="http://php.net/manual/en/sem.installation.php">SEMAPHORE-extension</a> for php has to be installed and activated.</li>
 * <li> The <a href="http://de.php.net/manual/en/posix.installation.php">POSIX-extension</a> for php has to be installed and activated.</li>
 * <li> The <a href="http://de2.php.net/manual/en/pdo.installation.php">PDO-extension</a> together with the SQLite-driver (PDO_SQLITE) has to be installed and activated.</li>
 * </ul>
 *
 * PHPCrawl supports two different modes of multiprocessing:
 * <ol>
 * <li><b>{@link PHPCrawlerMultiProcessModes}::MPMODE_PARENT_EXECUTES_USERCODE</b>
 *
 * The crawler uses multi processes simultaneously for spidering the target URL, but the usercode provided to
 * the overridable function {@link handleDocumentInfo()} gets always executed on the same main-process. This
 * means that the <b>usercode never gets executed simultaneously</b> and so you don't have to care about
 * concurrent file/database/handle-accesses or similar things.
 * But on the other side the usercode may slow down the crawling-procedure because every child-process has to
 * wait until the usercode got executed on the main-process.
 * <b>This is the recommended multiprocess-mode!</b>
 * </li>
 * <li><b>{@link PHPCrawlerMultiProcessModes}::MPMODE_CHILDS_EXECUTES_USERCODE</b>
 *
 * The crawler uses multi processes simultaneously for spidering the target URL, and every child-process executes
 * the usercode provided to the overridable function {@link handleDocumentInfo()} directly from it's process. This
 * means that the <b>usercode gets executed simultaneously</b> by the different child-processes and you should
 * take care of concurrent file/data/handle-accesses properly (if used).
 *
 * When using this mode and you use any handles like database-connections or filestreams in your extended
 * crawler-class, you should open them within the overridden method {@link initChildProcess()} instead of opening
 * them from the constructor. For more details see the documentation of the {@link initChildProcess()}-method.
 * </li>
 * </ol>
 *
 * Example for starting the crawler with 5 processes using the recommended MPMODE_PARENT_EXECUTES_USERCODE-mode:
 * <code>
 * $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE);
 * </code>
 *
 * Please note that increasing the number of processes to high values doesn't automatically mean that the crawling-process
 * will go off faster! Using 3 to 5 processes should be good values to start from.
 *
 * @param int $process_count     Number of processes to use
 * @param int $multiprocess_mode The multiprocess-mode to use.
 *                               One of the {@link PHPCrawlerMultiProcessModes}-constants
 * @throws Exception If any of the required extensions (PCNTL, SEMAPHORE, POSIX, PDO) is missing.
 * @section 1 Basic settings
 */
public function goMultiProcessed($process_count = 3, $multiprocess_mode = 1)
{
  $this->multiprocess_mode = $multiprocess_mode;
  $this->child_process_count = $process_count;

  // Check if fork is supported
  // NOTE(review): the concatenated exception-messages below are missing a space between
  // "missing)." and "Try ..." — left as-is since they are runtime strings.
  if (!function_exists("pcntl_fork"))
  {
    throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function pcntl_fork() missing)." .
                        "Try running from command-line (cli) and/or installing the PHP PCNTL-extension.");
  }

  if (!function_exists("sem_get"))
  {
    throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function sem_get() missing)." .
                        "Try installing the PHP SEMAPHORE-extension.");
  }

  if (!function_exists("posix_kill"))
  {
    throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function posix_kill() missing)." .
                        "Try installing the PHP POSIX-extension.");
  }

  if (!class_exists("PDO"))
  {
    throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (class PDO missing)." .
                        "Try installing the PHP PDO-extension.");
  }

  PHPCrawlerBenchmark::start("crawling_process");

  // Set url-cache-type to sqlite (the file-based cache is shareable between processes).
  $this->url_cache_type = PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE;

  // Init process
  $this->initCrawlerProcess();

  // Process robots.txt
  if ($this->obey_robots_txt == true)
  {
    $this->processRobotsTxt();
  }

  // Fork off child-processes
  $pids = array();

  for ($i = 1; $i <= $process_count; $i++)
  {
    $pids[$i] = pcntl_fork();

    if (!$pids[$i])
    {
      // Childprocess goes here
      // NOTE(review): "is_chlid_process" looks misspelled, but the property is presumably
      // declared with this exact name elsewhere in the class — verify before renaming.
      $this->is_chlid_process = true;
      $this->child_process_number = $i;
      $this->ProcessHandler->registerChildPID(getmypid());
      // startChildProcessLoop() is expected not to return (child exits inside it);
      // otherwise the child would fall through and fork again on the next iteration.
      $this->startChildProcessLoop();
    }
  }

  // Set flag "parent-process"
  $this->is_parent_process = true;

  // Determinate all child-PIDs
  $this->child_pids = $this->ProcessHandler->getChildPIDs($process_count);

  // If crawler runs in MPMODE_PARENT_EXECUTES_USERCODE-mode -> start controller-loop
  // (parent executes the usercode on behalf of the children)
  if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  {
    $this->startControllerProcessLoop();
  }

  // Wait for childs to finish (reap every forked child to avoid zombies)
  for ($i = 1; $i <= $process_count; $i++)
  {
    pcntl_waitpid($pids[$i], $status, WUNTRACED);
  }

  // Get crawler-status (needed for process-report)
  $this->crawlerStatus = $this->CrawlerStatusHandler->getCrawlerStatus();

  // Cleanup crawler
  $this->cleanup();

  PHPCrawlerBenchmark::stop("crawling_process");
}
/**
 * Filters the given URLs (contained in the given PHPCrawlerDocumentInfo-object) by the given rules.
 *
 * URL-descriptors that do NOT match the follow-rules get replaced with NULL in the
 * links_found_url_descriptors-array in place; the array keeps its size and keys.
 *
 * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links
 *                                             of the current document.
 */
public function filterUrls(PHPCrawlerDocumentInfo $DocumentInfo)
{
  PHPCrawlerBenchmark::start("filtering_urls");

  // Make the document available to the rule-checks (urlMatchesRules() may read it)
  $this->CurrentDocumentInfo = $DocumentInfo;

  // Fix: removed the unused local $filtered_urls (it was initialized but never read or returned).
  $cnt = count($DocumentInfo->links_found_url_descriptors);
  for ($x = 0; $x < $cnt; $x++)
  {
    // Null out every link that doesn't pass the configured follow-rules
    if (!$this->urlMatchesRules($DocumentInfo->links_found_url_descriptors[$x]))
    {
      $DocumentInfo->links_found_url_descriptors[$x] = null;
    }
  }

  $this->CurrentDocumentInfo = null;

  PHPCrawlerBenchmark::stop("filtering_urls");
}
/**
 * Updates the status of the crawler.
 *
 * Reads the current status-object, folds in the counters/benchmarks of the given document,
 * optionally sets abort-reason, first-content-URL and last-request-time, and writes the
 * status back. When lock_status_updates is enabled, the whole read-modify-write cycle is
 * guarded by a semaphore keyed on the crawler's unique id.
 *
 * @param PHPCrawlerDocumentInfo $PageInfo          The PHPCrawlerDocumentInfo-object of the last received
 *                                                  document or NULL if no document was received.
 * @param int    $abort_reason      One of the PHPCrawlerAbortReasons::ABORTREASON-constants if the
 *                                  crawling-process should get aborted, otherwise NULL.
 * @param string $first_content_url The first URL some content was found in (or NULL if none so far).
 * @param int    $last_request_time Timestamp of the last request (or NULL to leave unchanged).
 */
public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_content_url = null, $last_request_time = null)
{
  PHPCrawlerBenchmark::start("updating_crawler_status");

  // Acquire the semaphore/lock if status-updates should be serialized
  if ($this->lock_status_updates == true)
  {
    $semaphore = sem_get($this->crawler_uniqid);
    sem_acquire($semaphore);
  }

  // Get current status
  $status = $this->getCrawlerStatus();

  // Fold the received document into the status-counters
  if ($PageInfo != null)
  {
    // Every processed document counts as a followed link
    $status->links_followed++;

    // ...but only actually received documents increase the received-counter
    if ($PageInfo->received == true)
    {
      $status->documents_received++;
    }

    // Total bytes = body + headers
    $status->bytes_received += $PageInfo->bytes_received + $PageInfo->header_bytes_received;

    // Benchmark-sums are only meaningful for error-free requests
    if ($PageInfo->error_occured == false)
    {
      // Server connect time
      $status->sum_server_connect_time += $PageInfo->server_connect_time;
      $status->sum_server_connects++;

      // Server response time
      $status->sum_server_response_time += $PageInfo->server_response_time;
      $status->sum_server_responses++;

      // Data transfer time
      $status->sum_data_transfer_time += $PageInfo->data_transfer_time;

      // Unbuffered bytes read
      $status->unbuffered_bytes_read += $PageInfo->unbuffered_bytes_read;
    }
  }

  // Optional scalar updates (only applied when explicitly passed)
  if ($abort_reason !== null)
  {
    $status->abort_reason = $abort_reason;
  }

  if ($first_content_url !== null)
  {
    $status->first_content_url = $first_content_url;
  }

  if ($last_request_time !== null)
  {
    $status->last_request_time = $last_request_time;
  }

  // Write crawler-status back
  $this->setCrawlerStatus($status);

  // Release the semaphore/lock again
  if ($this->lock_status_updates == true)
  {
    sem_release($semaphore);
  }

  PHPCrawlerBenchmark::stop("updating_crawler_status");
}
/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);

  $return_cookies = array();

  // Fix: use a prepared statement instead of interpolating the domain into the SQL-string.
  // The domain is derived from an (untrusted) URL, so string-building the query was an
  // SQL-injection risk (and broke on domains containing a single-quote).
  $Statement = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
  $Statement->execute(array($url_parts["domain"]));
  $rows = $Statement->fetchAll(PDO::FETCH_ASSOC);
  $Statement->closeCursor();

  $cnt = count($rows);
  for ($x = 0; $x < $cnt; $x++)
  {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
    // Fix: pass the "#"-delimiter to preg_quote() so a literal "#" in the value gets escaped too.
    if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"], "#") . "\$#", $url_parts["host"]))
    {
      // Does the path match?
      if (preg_match("#^" . preg_quote($rows[$x]["path"], "#") . "#", $url_parts["path"]))
      {
        $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");

  return $return_cookies;
}
/**
 * Reads the response-content.
 *
 * Reads the body chunk-wise, optionally streaming it directly to the temporary file,
 * transparently decodes gzip-encoded content (decoded once, when the document is complete),
 * and hands portions of the source to the LinkFinder as soon as enough data is buffered.
 * The data-transfer benchmark is paused while link-searching runs so it measures transfer
 * time only.
 *
 * NOTE(review): the optional $stream_to_file precedes required by-reference parameters —
 * deprecated signature-shape since PHP 8.0; all callers must pass all four arguments anyway.
 *
 * @param bool $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
 *                             this method will not return the content as a string.
 * @param int    &$error_code                   Error-code by reference if an error occurred.
 * @param string &$error_string                 Error-string by reference.
 * @param bool   &$document_received_completely Flag indicating whether the content was received completely,
 *                                              passed by reference.
 *
 * @return string The response-content/source. May be empty if an error occurred or data was streamed
 *                to the tmp-file.
 */
protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely)
{
  $this->content_bytes_received = 0;

  // If content should be streamed to file
  if ($stream_to_file == true)
  {
    $fp = @fopen($this->tmpFile, "w");

    if ($fp == false)
    {
      // Report the failure through the by-reference out-params and bail out early
      $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
      $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
      return "";
    }
  }

  // Init
  $source_portion = "";            // sliding window handed to the LinkFinder
  $source_complete = "";           // full document (only kept when not streaming to file)
  $document_received_completely = true;
  $document_completed = false;
  $gzip_encoded_content = null;    // unknown until the first chunk was inspected

  // Resume data-transfer-time benchmark
  PHPCrawlerBenchmark::start("data_transfer_time");

  while ($document_completed == false)
  {
    // Get chunk from content
    $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
    $source_portion .= $content_chunk;

    // Check if content is gzip-encoded (check only first chunk)
    if ($gzip_encoded_content === null)
    {
      if (PHPCrawlerEncodingUtils::isGzipEncoded($content_chunk))
      {
        $gzip_encoded_content = true;
      }
      else
      {
        $gzip_encoded_content = false;
      }
    }

    // Stream to file or store source in memory
    if ($stream_to_file == true)
    {
      @fwrite($fp, $content_chunk);
    }
    else
    {
      $source_complete .= $content_chunk;
    }

    // Decode gzip-encoded content when done with document
    // (gzip can only be decoded once the complete stream is available)
    if ($document_completed == true && $gzip_encoded_content == true)
    {
      $source_complete = $source_portion = PHPCrawlerEncodingUtils::decodeGZipContent($source_complete);
    }

    // Find links in portion of the source.
    // Precedence-note: condition reads as (plain content AND in-memory AND buffer full) OR document done,
    // so the final pass also runs for gzip/streamed documents once they are complete.
    if ($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= $this->content_buffer_size || $document_completed == true)
    {
      if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types))
      {
        // Pause the transfer-benchmark while link-searching runs
        PHPCrawlerBenchmark::stop("data_transfer_time");

        $this->LinkFinder->findLinksInHTMLChunk($source_portion);

        // Keep an overlap so links spanning a chunk-boundary aren't missed
        if ($this->source_overlap_size > 0)
        {
          $source_portion = substr($source_portion, -$this->source_overlap_size);
        }
        else
        {
          $source_portion = "";
        }

        PHPCrawlerBenchmark::start("data_transfer_time");
      }
    }
  }

  if ($stream_to_file == true)
  {
    @fclose($fp);
  }

  // Stop data-transfer-time benchmark
  PHPCrawlerBenchmark::stop("data_transfer_time");
  $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");

  return $source_complete;
}