/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url) {
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $return_cookies = array();

  // Use query-placeholders instead of string-concatenation to avoid SQL-injection
  $result = db_query("SELECT * FROM {" . $this->table . "} WHERE source_domain = :domain AND crawler_id = :crawler_id", array(
    ':domain' => $url_parts["domain"],
    ':crawler_id' => $this->crawler_id,
  ));
  $rows = $result->fetchAllAssoc('id');

  // fetchAllAssoc() keys the result-rows by their id-column, so iterate
  // with foreach instead of a numeric for-loop.
  foreach ($rows as $row) {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($row->domain == $url_parts["host"] || preg_match("#" . preg_quote($row->domain, "#") . "\$#", $url_parts["host"])) {
      // Does the path match?
      if (preg_match("#^" . preg_quote($row->path, "#") . "#", $url_parts["path"])) {
        $Cookie = new PHPCrawlerCookieDescriptor($row->source_url, $row->name, $row->value, $row->expires, $row->path, $row->domain);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
  return $return_cookies;
}
/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url) {
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $return_cookies = array();

  // Quote the domain-value to avoid SQL-injection
  $Result = $this->PDO->query("SELECT * FROM cookies WHERE source_domain = " . $this->PDO->quote($url_parts["domain"]) . ";");
  $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
  $Result->closeCursor();

  $cnt = count($rows);
  for ($x = 0; $x < $cnt; $x++) {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"], "#") . "\$#", $url_parts["host"])) {
      // Does the path match?
      if (preg_match("#^" . preg_quote($rows[$x]["path"], "#") . "#", $url_parts["path"])) {
        $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
  return $return_cookies;
}
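// Usage sketch (hedged): given a populated cookie-cache, collect all cookies
// that apply to a target-URL and build a "Cookie:"-request-header from them.
// The $CookieCache instance and its cached content are hypothetical here.
$cookies = $CookieCache->getCookiesForUrl("http://shipping.crate.acme.com/orders/list.php");

$header_parts = array();
foreach ($cookies as $Cookie) {
  $header_parts[] = $Cookie->name . "=" . $Cookie->value;
}

// E.g. "Cookie: sessid=abc123; tracker=xyz"
$cookie_header = "Cookie: " . implode("; ", $header_parts);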
/**
 * Filters the given URLs (contained in the given PHPCrawlerDocumentInfo-object) by the given rules.
 *
 * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
 */
public function filterUrls(PHPCrawlerDocumentInfo $DocumentInfo) {
  PHPCrawlerBenchmark::start("filtering_urls");

  $this->CurrentDocumentInfo = $DocumentInfo;

  $cnt = count($DocumentInfo->links_found_url_descriptors);
  for ($x = 0; $x < $cnt; $x++) {
    // Discard URL-descriptors that don't match the filter-rules
    if (!$this->urlMatchesRules($DocumentInfo->links_found_url_descriptors[$x])) {
      $DocumentInfo->links_found_url_descriptors[$x] = null;
    }
  }

  PHPCrawlerBenchmark::stop("filtering_urls");
}
/**
 * Parses the robots.txt-file related to the given URL and returns regular-expression-rules
 * corresponding to the contained "disallow"-rules that are addressed to the given user-agent.
 *
 * @param PHPCrawlerURLDescriptor $Url The URL
 * @param string $user_agent_string User-agent.
 *
 * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
 *               that's addressed to the given user-agent.
 */
public function parseRobotsTxt(PHPCrawlerURLDescriptor $Url, $user_agent_string) {
  PHPCrawlerBenchmark::start("processing_robotstxt");

  // URL of the robots.txt-file
  $RobotsTxtUrl = self::getRobotsTxtURL($Url);

  // Get robots.txt-content related to the given URL
  $robots_txt_content = $this->getRobotsTxtContent($RobotsTxtUrl);

  $non_follow_reg_exps = array();

  // If content was found
  if ($robots_txt_content != null) {
    // Get all lines in the robots.txt-content that are addressed to our user-agent.
    $applying_lines = $this->getApplyingLines($robots_txt_content, $user_agent_string);

    // Get valid reg-expressions for the given disallow-paths.
    $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, PHPCrawlerUtils::getRootUrl($Url->url_rebuild));
  }

  // Identifier must match the one used in start() above, otherwise the clock never stops
  PHPCrawlerBenchmark::stop("processing_robotstxt");

  return $non_follow_reg_exps;
}
/**
 * Parses a robots.txt-file and returns regular-expression-rules corresponding to the contained "disallow"-rules
 * that are addressed to the given user-agent.
 *
 * @param PHPCrawlerURLDescriptor $BaseUrl The root-URL all rules from the robots-txt-file should relate to
 * @param string $user_agent_string The useragent all rules from the robots-txt-file should relate to
 * @param string $robots_txt_uri Optional. The location of the robots.txt-file as URI.
 *                               If not set, the default robots.txt-file for the given BaseUrl gets parsed.
 *
 * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
 *               that's addressed to the given user-agent.
 */
public function parseRobotsTxt(PHPCrawlerURLDescriptor $BaseUrl, $user_agent_string, $robots_txt_uri = null) {
  PHPCrawlerBenchmark::start("processing_robotstxt");

  // If robots_txt_uri wasn't given, use the default one for the given BaseUrl
  if ($robots_txt_uri === null) {
    $robots_txt_uri = self::getRobotsTxtURL($BaseUrl->url_rebuild);
  }

  // Get robots.txt-content
  $robots_txt_content = PHPCrawlerUtils::getURIContent($robots_txt_uri, $user_agent_string);

  $non_follow_reg_exps = array();

  // If content was found
  if ($robots_txt_content != null) {
    // Get all lines in the robots.txt-content that are addressed to our user-agent.
    $applying_lines = $this->getUserAgentLines($robots_txt_content, $user_agent_string);

    // Get valid reg-expressions for the given disallow-paths.
    $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, PHPCrawlerUtils::getRootUrl($BaseUrl->url_rebuild));
  }

  // Identifier must match the one used in start() above, otherwise the clock never stops
  PHPCrawlerBenchmark::stop("processing_robotstxt");

  return $non_follow_reg_exps;
}
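// Usage sketch (hedged): fetch the disallow-rules for a site and check a
// candidate URL against them before following it. The $RobotsTxtParser
// instance and the example-URLs are hypothetical; the regexes are the ones
// built by buildRegExpressions() above.
$BaseUrl = new PHPCrawlerURLDescriptor("http://www.example.com/");
$disallow_regexes = $RobotsTxtParser->parseRobotsTxt($BaseUrl, "PHPCrawl");

$candidate_url = "http://www.example.com/private/index.html";
$url_allowed = true;

foreach ($disallow_regexes as $regex) {
  if (preg_match($regex, $candidate_url)) {
    // URL is covered by a "Disallow"-rule
    $url_allowed = false;
    break;
  }
}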
/**
 * Creates the sqlite-db-file and opens connection to it.
 *
 * @param bool $create_tables Defines whether all necessary tables should be created
 */
protected function openConnection($create_tables = false) {
  PHPCrawlerBenchmark::start("connecting_to_sqlite_db");

  // Open sqlite-file
  try {
    $this->PDO = new PDO("sqlite:" . $this->sqlite_db_file);
  }
  catch (Exception $e) {
    throw new Exception("Error creating SQLite-cache-file, " . $e->getMessage() . ", try installing sqlite3-extension for PHP.");
  }

  $this->PDO->exec("PRAGMA journal_mode = OFF");

  $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
  $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);

  if ($create_tables == true) {
    // Create url-table (if not exists)
    $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,
                                                       in_process bool DEFAULT 0,
                                                       processed bool DEFAULT 0,
                                                       priority_level integer,
                                                       distinct_hash TEXT UNIQUE,
                                                       link_raw TEXT,
                                                       linkcode TEXT,
                                                       linktext TEXT,
                                                       refering_url TEXT,
                                                       url_rebuild TEXT,
                                                       is_redirect_url bool,
                                                       url_link_depth integer);");

    // Create indexes (note: in practice the indexes seem to slow the cache down overall)
    $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
    $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
    $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
    $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");

    $this->PDO->exec("ANALYZE;");
  }

  PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
}
/**
 * Resets the clocks of all benchmarks.
 *
 * @param array $retain_benchmarks Optional. Numeric array containing benchmark-identifiers that should NOT get reset.
 */
public static function resetAll($retain_benchmarks = array()) {
  // If no benchmarks should be retained -> simply clear all results
  if (count($retain_benchmarks) == 0) {
    self::$benchmark_results = array();
    return;
  }

  // Otherwise reset all benchmarks EXCEPT the ones listed in $retain_benchmarks
  // (foreach replaces the deprecated each()-loop)
  foreach (self::$benchmark_results as $identifier => $value) {
    if (!in_array($identifier, $retain_benchmarks)) {
      self::$benchmark_results[$identifier] = 0;
    }
  }
}
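// Usage sketch (hedged): time a sub-task, read its clock, then reset all
// benchmarks except the overall process-clock. The identifier-names here
// are examples, not fixed API-values.
PHPCrawlerBenchmark::start("crawling_process");

PHPCrawlerBenchmark::start("fetching_document");
// ... do some work ...
PHPCrawlerBenchmark::stop("fetching_document");

$elapsed = PHPCrawlerBenchmark::getElapsedTime("fetching_document");

// Reset everything except the "crawling_process"-benchmark
PHPCrawlerBenchmark::resetAll(array("crawling_process"));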
/**
 * Returns summarizing report-information about the crawling-process after it has finished.
 *
 * @return PHPCrawlerProcessReport PHPCrawlerProcessReport-object containing process-summary-information
 * @section 1 Basic settings
 */
public function getProcessReport() {
  // Get current crawler-status
  $CrawlerStatus = $this->crawlerStatus;

  // Create report
  $Report = new PHPCrawlerProcessReport();

  $Report->links_followed = $CrawlerStatus->links_followed;
  $Report->files_received = $CrawlerStatus->documents_received;
  $Report->bytes_received = $CrawlerStatus->bytes_received;
  $Report->process_runtime = PHPCrawlerBenchmark::getElapsedTime("crawling_process");

  if ($Report->process_runtime > 0) {
    $Report->data_throughput = $Report->bytes_received / $Report->process_runtime;
  }

  // Process abort-reason
  $Report->abort_reason = $CrawlerStatus->abort_reason;

  if ($CrawlerStatus->abort_reason == PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED) {
    $Report->traffic_limit_reached = true;
  }

  if ($CrawlerStatus->abort_reason == PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED) {
    $Report->file_limit_reached = true;
  }

  if ($CrawlerStatus->abort_reason == PHPCrawlerAbortReasons::ABORTREASON_USERABORT) {
    $Report->user_abort = true;
  }

  // Peak memory-usage
  if (function_exists("memory_get_peak_usage")) {
    $Report->memory_peak_usage = memory_get_peak_usage(true);
  }

  return $Report;
}
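// Usage sketch (hedged): after the crawling-process has finished, read the
// summary-report. $crawler is assumed to be a configured PHPCrawler-instance.
$crawler->go();
$Report = $crawler->getProcessReport();

echo "Links followed:     " . $Report->links_followed . "\n";
echo "Documents received: " . $Report->files_received . "\n";
echo "Bytes received:     " . $Report->bytes_received . "\n";
echo "Process runtime:    " . $Report->process_runtime . " sec\n";

if ($Report->traffic_limit_reached == true) {
  echo "Process was aborted because the traffic-limit was reached.\n";
}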
/**
 * Updates the status of the crawler
 *
 * @param PHPCrawlerDocumentInfo $PageInfo The PHPCrawlerDocumentInfo-object of the last received document
 *                                         or NULL if no document was received.
 * @param int $abort_reason One of the PHPCrawlerAbortReasons::ABORTREASON-constants if the crawling-process
 *                          should get aborted, otherwise NULL
 * @param string $first_content_url The first URL some content was found in (or NULL if no content was found so far).
 * @param int $last_request_time Time of the last request (or NULL if it shouldn't get updated).
 */
public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_content_url = null, $last_request_time = null) {
  PHPCrawlerBenchmark::start("updating_crawler_status");

  // Set semaphore/lock if status-updates should be locked
  if ($this->lock_status_updates == true) {
    $sem_key = sem_get($this->crawler_uniqid);
    sem_acquire($sem_key);
  }

  // Get current status
  $crawler_status = $this->getCrawlerStatus();

  // Update status
  if ($PageInfo != null) {
    // Increase number of followed links
    $crawler_status->links_followed++;

    // Increase documents_received-counter
    if ($PageInfo->received == true) {
      $crawler_status->documents_received++;
    }

    // Increase bytes-counter
    $crawler_status->bytes_received += $PageInfo->bytes_received + $PageInfo->header_bytes_received;

    // Benchmarks
    if ($PageInfo->error_occured == false) {
      // Server connect time
      $crawler_status->sum_server_connect_time += $PageInfo->server_connect_time;
      $crawler_status->sum_server_connects++;

      // Server response time
      $crawler_status->sum_server_response_time += $PageInfo->server_response_time;
      $crawler_status->sum_server_responses++;

      // Data transfer time
      $crawler_status->sum_data_transfer_time += $PageInfo->data_transfer_time;

      // Unbuffered bytes read
      $crawler_status->unbuffered_bytes_read += $PageInfo->unbuffered_bytes_read;
    }
  }

  // Set abort-reason
  if ($abort_reason !== null) {
    $crawler_status->abort_reason = $abort_reason;
  }

  // Set first_content_url
  if ($first_content_url !== null) {
    $crawler_status->first_content_url = $first_content_url;
  }

  // Set last request-time
  if ($last_request_time !== null) {
    $crawler_status->last_request_time = $last_request_time;
  }

  // Write crawler-status back
  $this->setCrawlerStatus($crawler_status);

  // Remove semaphore/lock
  if ($this->lock_status_updates == true) {
    sem_release($sem_key);
  }

  PHPCrawlerBenchmark::stop("updating_crawler_status");
}
/**
 * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
 */
public function findLinksInHTMLChunk(&$html_source) {
  PHPCrawlerBenchmark::start("searching_for_links_in_page");

  // Check for meta-base-URL and meta-tags in top of HTML-source
  if ($this->top_lines_processed == false) {
    $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
    if ($meta_base_url != null) {
      $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
      $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
    }

    // Get all meta-tags
    $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);

    // Set flag that top-lines of source were processed
    $this->top_lines_processed = true;
  }

  // Build the RegEx-part for html-tags to search links in
  $tag_regex_part = "";
  $cnt = count($this->extract_tags);
  for ($x = 0; $x < $cnt; $x++) {
    $tag_regex_part .= "|" . $this->extract_tags[$x];
  }
  $tag_regex_part = substr($tag_regex_part, 1);

  // 1. <a href="...">LINKTEXT</a> (well-formed link with </a> at the end and quotes around the link)
  // Get the link AND the linktext from these tags.
  // This has to be done FIRST!
  preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" .
                 "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" .
                 "((?:(?!<\\s*\\/a\\s*>).){0,500})" .
                 "<\\s*\\/a\\s*># is", $html_source, $matches);

  $cnt = count($matches[0]);
  for ($x = 0; $x < $cnt; $x++) {
    $link_raw = trim($matches[1][$x]);
    $linktext = $matches[2][$x];
    $linkcode = trim($matches[0][$x]);

    if (!empty($link_raw)) {
      $this->addLinkToCache($link_raw, $linkcode, $linktext);
    }
  }

  // 2. Second regex (everything that could be a link inside of <>-tags)
  preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" .
                 "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);

  $cnt = count($matches[0]);
  for ($x = 0; $x < $cnt; $x++) {
    $link_raw = trim($matches[1][$x]);
    $linktext = "";
    $linkcode = trim($matches[0][$x]);

    if (!empty($link_raw)) {
      $this->addLinkToCache($link_raw, $linkcode, $linktext);
    }
  }

  // 3. If aggressive_search is set to true, look for a few more link-patterns
  $pregs = array();
  if ($this->aggressive_search == true) {
    // Links like "...:url("animage.gif")..."
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is";

    // Everything like "...href="bla.html"..." with quotes
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is";

    // Everything like "...href=bla.html..." without quotes
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is";

    for ($x = 0; $x < count($pregs); $x++) {
      unset($matches);
      preg_match_all($pregs[$x], $html_source, $matches);

      $cnt = count($matches[0]);
      for ($y = 0; $y < $cnt; $y++) {
        $link_raw = trim($matches[2][$y]);
        $linkcode = trim($matches[0][$y]);
        $linktext = "";

        $this->addLinkToCache($link_raw, $linkcode, $linktext);
      }
    }
  }

  $this->found_links_map = array();

  PHPCrawlerBenchmark::stop("searching_for_links_in_page");
}
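// Illustration sketch: what the first (well-formed-link) regex above matches.
// With an example $tag_regex_part of "href|src" and a small hypothetical HTML
// snippet, the branch-reset group (?|...) always puts the raw link in group 1
// and the linktext in group 2.
$tag_regex_part = "href|src";
$html = '<p>See <a href="/docs/intro.html" class="nav">the intro</a>.</p>';

preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" .
               "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" .
               "((?:(?!<\\s*\\/a\\s*>).){0,500})" .
               "<\\s*\\/a\\s*># is", $html, $matches);

// $matches[1][0] == "/docs/intro.html"     (link_raw)
// $matches[2][0] == "the intro"            (linktext)
// $matches[0][0] == the complete <a>-tag   (linkcode)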
/**
 * Reads the response-content.
 *
 * @param bool   $stream_to_file                If TRUE, the content will be streamed directly to the temporary file and
 *                                              this method will not return the content as a string.
 * @param int    &$error_code                   Error-code by reference if an error occurred.
 * @param string &$error_string                 Error-string by reference.
 * @param bool   &$document_received_completely Flag indicating whether the content was received completely, passed by reference.
 *
 * @return string The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
 */
protected function readResponseContent($stream_to_file = false, &$error_code = null, &$error_string = null, &$document_received_completely = null) {
  $this->content_bytes_received = 0;

  // If content should be streamed to file
  if ($stream_to_file == true) {
    $fp = @fopen($this->tmpFile, "w");

    if ($fp == false) {
      $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
      $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
      return "";
    }
  }

  // Init
  $source_portion = "";
  $source_complete = "";
  $document_received_completely = true;
  $document_completed = false;
  $gzip_encoded_content = null;

  // Resume data-transfer-time benchmark
  PHPCrawlerBenchmark::start("data_transfer_time");

  while ($document_completed == false) {
    // Get chunk from content
    $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
    $source_portion .= $content_chunk;

    // Check if content is gzip-encoded (check only first chunk)
    if ($gzip_encoded_content === null) {
      if (PHPCrawlerUtils::isGzipEncoded($content_chunk)) {
        $gzip_encoded_content = true;
      }
      else {
        $gzip_encoded_content = false;
      }
    }

    // Stream to file or store source in memory
    if ($stream_to_file == true) {
      @fwrite($fp, $content_chunk);
    }
    else {
      $source_complete .= $content_chunk;
    }

    // Decode gzip-encoded content when done with the document
    if ($document_completed == true && $gzip_encoded_content == true) {
      $source_complete = $source_portion = PHPCrawlerUtils::decodeGZipContent($source_complete);
    }

    // Find links in the current portion of the source.
    // (Explicit parentheses: the link-search runs on every completed document, and on every
    // 200000-byte portion of non-gzipped content that is kept in memory.)
    if (($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= 200000) || $document_completed == true) {
      if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
        PHPCrawlerBenchmark::stop("data_transfer_time");
        $this->LinkFinder->findLinksInHTMLChunk($source_portion);
        $source_portion = substr($source_portion, -1500);
        PHPCrawlerBenchmark::start("data_transfer_time");
      }
    }
  }

  if ($stream_to_file == true) {
    @fclose($fp);
  }

  // Stop data-transfer-time benchmark
  PHPCrawlerBenchmark::stop("data_transfer_time");
  $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");

  return $source_complete;
}
/**
 * Reads the response-content.
 *
 * @param bool   $stream_to_file                If TRUE, the content will be streamed directly to the temporary file and
 *                                              this method will not return the content as a string.
 * @param int    &$error_code                   Error-code by reference if an error occurred.
 * @param string &$error_string                 Error-string by reference.
 * @param bool   &$document_received_completely Flag indicating whether the content was received completely, passed by reference.
 * @param int    &$bytes_received               Number of bytes received, passed by reference.
 * @return string The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
 */
protected function readResponseContent($stream_to_file = false, &$error_code = null, &$error_string = null, &$document_received_completely = null, &$bytes_received = null) {
  PHPCrawlerBenchmark::start("retreiving_content");
  PHPCrawlerBenchmark::start("data_transfer_time", true);

  // If content should be streamed to file
  if ($stream_to_file == true) {
    $fp = @fopen($this->tmpFile, "w");

    if ($fp == false) {
      $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
      $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
      return "";
    }
  }

  // Init
  $status = socket_get_status($this->socket);
  $source_portion = "";
  $source_complete = "";
  $bytes_received = 0;
  $document_received_completely = true;
  $stop_receiving = false;

  while ($stop_receiving == false) {
    socket_set_timeout($this->socket, $this->socketReadTimeout);

    // Read from socket.
    // The @ suppresses the pointless "SSL fatal protocol error"-warning.
    $line_read = @fread($this->socket, 1024);

    // Check socket-status
    $status = socket_get_status($this->socket);

    // Check for EOF
    if ($status["eof"] == true) {
      $stop_receiving = true;
    }

    // Socket timed out
    if ($status["timed_out"] == true) {
      $stop_receiving = true;
      $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
      $error_string = "Socket-stream timed out (timeout set to " . $this->socketReadTimeout . " sec).";
      $document_received_completely = false;
    }
    else {
      $source_portion .= $line_read;
      $bytes_received += strlen($line_read);
      $this->global_traffic_count += strlen($line_read);

      // Stream to file or store source in memory
      if ($stream_to_file == true) {
        @fwrite($fp, $line_read);
      }
      else {
        $source_complete .= $line_read;
      }
    }

    // Check if the content-length stated in the header is reached
    if ($this->lastResponseHeader->content_length == $bytes_received) {
      $stop_receiving = true;
    }

    // Check if the content-size-limit is reached
    if ($this->content_size_limit > 0 && $this->content_size_limit <= $bytes_received) {
      $stop_receiving = true;
    }

    // Find links in the current portion of the source
    if (strlen($source_portion) >= 100000 || $stop_receiving == true) {
      if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
        PHPCrawlerBenchmark::stop("retreiving_content");
        PHPCrawlerBenchmark::stop("data_transfer_time");
        $this->LinkFinder->findLinksInHTMLChunk($source_portion);
        $source_portion = substr($source_portion, -1500);
        PHPCrawlerBenchmark::start("retreiving_content");
        PHPCrawlerBenchmark::start("data_transfer_time", true);
      }
    }
  }

  if ($stream_to_file == true) {
    @fclose($fp);
  }

  PHPCrawlerBenchmark::stop("retreiving_content");
  PHPCrawlerBenchmark::stop("data_transfer_time");

  $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
  PHPCrawlerBenchmark::reset("data_transfer_time");

  return $source_complete;
}
/**
 * Initiates a new PHPCrawlerCookieDescriptor-object.
 *
 * @param string $source_url URL the cookie was sent from.
 * @param string $name       Cookie-name
 * @param string $value      Cookie-value
 * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
 * @param string $path       Cookie-path
 * @param string $domain     Cookie-domain
 * @internal
 */
public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null) {
  // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html

  $this->name = $name;
  $this->value = $value;
  $this->expires = $expires;
  $this->path = $path;
  $this->domain = $domain;

  $source_url_parts = PHPCrawlerUtils::splitURL($source_url);

  // Source-domain
  $this->source_domain = $source_url_parts["domain"];

  // Source-URL
  $this->source_url = $source_url;

  // Send-time
  $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

  // Expire-date to timestamp
  if ($this->expires != null) {
    $this->expire_timestamp = @strtotime($this->expires);
  }

  // If the domain doesn't start with "." -> add it (see RFC)
  if ($this->domain != null && substr($this->domain, 0, 1) != ".") {
    $this->domain = "." . $this->domain;
  }

  // Complete missing values

  // If domain not set -> domain is the host of the source-url WITHOUT leading "." (see RFC)
  if ($this->domain == null) {
    $this->domain = $source_url_parts["host"];
  }

  // If path not set -> use the path of the source-url
  if ($this->path == null) {
    $this->path = $source_url_parts["path"];
  }
}
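// Usage sketch (hedged): build a cookie-descriptor from a Set-Cookie-header
// received from a hypothetical URL. Path and domain are omitted here, so they
// get completed from the source-URL as described in the constructor above.
$Cookie = new PHPCrawlerCookieDescriptor(
  "http://www.example.com/shop/index.php", // source-URL
  "sessid",                                // name
  "abc123",                                // value
  "Sat, 08-Aug-2020 23:59:08 GMT"          // expires
);

// $Cookie->domain should now be "www.example.com" (host of the source-URL),
// and $Cookie->path the source-URL's path as returned by splitURL().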
/**
 * Updates the status of the crawler
 *
 * @param PHPCrawlerDocumentInfo $PageInfo The PHPCrawlerDocumentInfo-object of the last received document
 *                                         or NULL if no document was received.
 * @param int $abort_reason One of the PHPCrawlerAbortReasons::ABORTREASON-constants if the crawling-process
 *                          should get aborted, otherwise NULL
 * @param string $first_content_url The first URL some content was found in (or NULL if no content was found so far).
 */
public function updateCrawlerStatus($PageInfo, $abort_reason = null, $first_content_url = null) {
  PHPCrawlerBenchmark::start("updating_crawler_status");

  // Set semaphore if crawler is multiprocessed
  if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE) {
    $sem_key = sem_get($this->crawler_uniqid);
    sem_acquire($sem_key);
  }

  // Get current status
  $crawler_status = $this->getCrawlerStatus();

  // Update status
  if ($PageInfo != null) {
    // Increase number of followed links
    $crawler_status->links_followed++;

    // Increase documents_received-counter
    if ($PageInfo->received == true) {
      $crawler_status->documents_received++;
    }

    // Increase bytes-counter
    $crawler_status->bytes_received += $PageInfo->bytes_received;
  }

  // Set abort-reason
  if ($abort_reason !== null) {
    $crawler_status->abort_reason = $abort_reason;
  }

  // Set first_content_url
  if ($first_content_url !== null) {
    $crawler_status->first_content_url = $first_content_url;
  }

  // Write crawler-status back
  $this->setCrawlerStatus($crawler_status);

  // Remove semaphore if crawler is multiprocessed
  if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE) {
    sem_release($sem_key);
  }

  PHPCrawlerBenchmark::stop("updating_crawler_status");
}
/**
 * Checks whether there are URLs left in the cache that should be processed or not.
 *
 * @return bool
 */
public function containsURLs() {
  PHPCrawlerBenchmark::start("checking_for_urls_in_cache");

  // Use a query-placeholder instead of string-concatenation to avoid SQL-injection
  $result = db_query("SELECT id FROM {" . $this->table . "} WHERE (processed = 0 OR in_process = 1) AND crawler_id = :crawler_id LIMIT 1", array(
    ':crawler_id' => $this->crawler_id,
  ));
  $has_rows = $result->fetchField();

  PHPCrawlerBenchmark::stop("checking_for_urls_in_cache");

  return (bool)$has_rows;
}
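// Usage sketch (hedged): a crawler main-loop keeps fetching URLs as long as
// the cache still holds unprocessed entries. $UrlCache is assumed to be an
// instance of this cache-class with URLs already added; the method-names
// getNextUrl() and markUrlAsFollowed() follow the SQLite-cache variant and
// are assumptions here.
while ($UrlCache->containsURLs() == true) {
  $UrlDescriptor = $UrlCache->getNextUrl();

  if ($UrlDescriptor === null) {
    // No URL ready for processing right now
    break;
  }

  // ... request and process the URL ...

  $UrlCache->markUrlAsFollowed($UrlDescriptor);
}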