/**
 * Initiates a new PHPCrawlerCookieDescriptor-object.
 *
 * @param string $source_url URL the cookie was sent from.
 * @param string $name       Cookie-name
 * @param string $value      Cookie-value
 * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
 * @param string $path       Cookie-path
 * @param string $domain     Cookie-domain
 * @internal
 */
public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
{
  // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html

  $this->name = $name;
  $this->value = $value;
  $this->expires = $expires;
  $this->path = $path;
  $this->domain = $domain;

  $source_url_parts = PHPCrawlerUtils::splitURL($source_url);

  // Source-domain
  $this->source_domain = $source_url_parts["domain"];

  // Source-URL
  $this->source_url = $source_url;

  // Send-time
  $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

  // Expire-date to timestamp
  if ($this->expires != null) {
    $this->expire_timestamp = @strtotime($this->expires);
  }

  // If domain doesn't start with "." -> add it (see RFC)
  if ($this->domain != null && substr($this->domain, 0, 1) != ".") {
    $this->domain = "." . $this->domain;
  }

  // Complete missing values

  // If domain not set -> domain is the host of the source-URL WITHOUT leading "."! (see RFC)
  if ($this->domain == null) {
    $this->domain = $source_url_parts["host"];
  }

  // If path not set
  if ($this->path == null) {
    $this->path = $source_url_parts["path"];
  }
}
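// A minimal usage sketch (not part of the original source; all literal values
// below are made-up examples): constructing a descriptor for a cookie received
// from a hypothetical URL. Only $expires, $path and $domain may be omitted and
// are then completed from the source-URL as shown in the constructor above.
$Cookie = new PHPCrawlerCookieDescriptor(
  "http://www.example.com/shop/index.php", // source-URL the cookie came from
  "sessionid",                             // cookie-name
  "abc123",                                // cookie-value
  "Sat, 08-Aug-2020 23:59:08 GMT",         // expire-string (optional)
  null,                                    // path: falls back to "/shop/"
  null                                     // domain: falls back to "www.example.com"
);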
/**
 * Returns all cookies from the cache that are addressed to the given URL
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $return_cookies = array();

  // Use a prepared statement instead of string-concatenation to avoid SQL-injection
  $Result = $this->PDO->prepare("SELECT * FROM cookies WHERE source_domain = ?;");
  $Result->execute(array($url_parts["domain"]));
  $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
  $Result->closeCursor();

  $cnt = count($rows);
  for ($x = 0; $x < $cnt; $x++) {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"], "#") . "\$#", $url_parts["host"])) {
      // Does the path match?
      if (preg_match("#^" . preg_quote($rows[$x]["path"], "#") . "#", $url_parts["path"])) {
        $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
  return $return_cookies;
}
/**
 * Returns all cookies from the cache that are addressed to the given URL
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $target_domain = $url_parts["domain"]; // e.g. acme.com

  $return_cookies = array();

  // Iterate over all cookies of this domain
  // (foreach instead of the deprecated each()-construct)
  if (isset($this->cookies[$target_domain])) {
    foreach ($this->cookies[$target_domain] as $hash => $Cookie) {
      // Does the cookie-domain match?
      // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
      // A domain attribute of "acme.com" would match host names "anvil.acme.com"
      // as well as "shipping.crate.acme.com"
      if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain, "#") . "\$#", $url_parts["host"])) {
        // Does the path match?
        if (preg_match("#^" . preg_quote($Cookie->path, "#") . "#", $url_parts["path"])) {
          $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
        }
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  return $return_cookies;
}
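// Illustration of the tail-matching rule used above (a standalone sketch, not
// part of the original classes): a stored cookie-domain matches a host when it
// equals the host or is a suffix of it.
function cookieDomainMatches($cookie_domain, $host)
{
  return $cookie_domain == $host ||
         preg_match("#" . preg_quote($cookie_domain, "#") . "\$#", $host) == 1;
}

var_dump(cookieDomainMatches(".acme.com", "anvil.acme.com"));          // bool(true)
var_dump(cookieDomainMatches(".acme.com", "shipping.crate.acme.com")); // bool(true)
var_dump(cookieDomainMatches(".acme.com", "acme.org"));                // bool(false)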
/**
 * Returns all cookies from the cache that are addressed to the given URL
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $return_cookies = array();

  // Use db_query-placeholders instead of string-concatenation to avoid SQL-injection
  $result = db_query("SELECT * FROM {" . $this->table . "} WHERE source_domain = :domain AND crawler_id = :crawler_id;",
                     array(":domain" => $url_parts["domain"], ":crawler_id" => $this->crawler_id));
  $rows = $result->fetchAllAssoc('id');

  // fetchAllAssoc() keys the rows by their 'id'-field, so iterate with foreach
  // instead of a numeric for-loop
  foreach ($rows as $row) {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($row->domain == $url_parts["host"] || preg_match("#" . preg_quote($row->domain, "#") . "\$#", $url_parts["host"])) {
      // Does the path match?
      if (preg_match("#^" . preg_quote($row->path, "#") . "#", $url_parts["path"])) {
        $Cookie = new PHPCrawlerCookieDescriptor($row->source_url, $row->name, $row->value, $row->expires, $row->path, $row->domain);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");
  return $return_cookies;
}
/**
 * Sets/writes the current crawler-status
 *
 * @param PHPCrawlerStatus $crawler_status The status to set
 */
public function setCrawlerStatus(PHPCrawlerStatus $crawler_status)
{
  $this->crawlerStatus = $crawler_status;

  // Write crawler-status back to file
  if ($this->write_status_to_file == true) {
    PHPCrawlerUtils::serializeToFile($this->working_directory . "crawlerstatus.tmp", $crawler_status);
  }
}
/**
 * Initiates a new PHPCrawlerResponseHeader.
 *
 * @param string $header_string A complete response-header as it was sent by the server
 * @param string $source_url    The URL of the website the header was received from.
 * @internal
 */
public function __construct($header_string, $source_url)
{
  $this->header_raw = $header_string;
  $this->source_url = $source_url;

  $this->http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header_string);
  $this->content_type = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, "content-type"));
  $this->content_length = strtolower(PHPCrawlerUtils::getHeaderValue($header_string, "content-length"));
  $this->cookies = PHPCrawlerUtils::getCookiesFromHeader($header_string, $source_url);
}
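// A sketch of what the constructor extracts from a raw header (the sample
// header below is made up; getHTTPStatusCode() and getHeaderValue() are the
// PHPCrawlerUtils-methods called above):
$raw = "HTTP/1.1 200 OK\r\n" .
       "Content-Type: text/html; charset=UTF-8\r\n" .
       "Content-Length: 1024\r\n\r\n";

$Header = new PHPCrawlerResponseHeader($raw, "http://www.example.com/");
// $Header->http_status_code -> 200
// $Header->content_type     -> "text/html; charset=utf-8" (lowercased)
// $Header->content_length   -> "1024"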
/**
 * Returns/reads the current crawler-status
 *
 * @return PHPCrawlerStatus The current crawler-status as a PHPCrawlerStatus-object
 */
public function getCrawlerStatus()
{
  // Get crawler-status from file if crawler is multiprocessed
  if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE) {
    $this->crawlerStatus = PHPCrawlerUtils::deserializeFromFile($this->working_directory . "crawlerstatus.tmp");

    if ($this->crawlerStatus == null) {
      $this->crawlerStatus = new PHPCrawlerStatus();
    }
  }

  return $this->crawlerStatus;
}
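// Sketch of the file-based status hand-off between setCrawlerStatus() and
// getCrawlerStatus() above (assuming PHPCrawlerUtils::serializeToFile() /
// deserializeFromFile() simply serialize()/unserialize() a value to/from the
// given file; the path is a made-up example):
$status = new PHPCrawlerStatus();
PHPCrawlerUtils::serializeToFile("/tmp/crawlerstatus.tmp", $status);

$restored = PHPCrawlerUtils::deserializeFromFile("/tmp/crawlerstatus.tmp");
// $restored is a PHPCrawlerStatus-object again (or null if the file is unreadable,
// in which case getCrawlerStatus() falls back to a fresh PHPCrawlerStatus)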
/**
 * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
 *
 * @param string $url The URL
 * @return PHPCrawlerUrlPartsDescriptor
 */
public static function fromURL($url)
{
  $parts = PHPCrawlerUtils::splitURL($url);

  $tmp = new PHPCrawlerUrlPartsDescriptor();

  $tmp->protocol = $parts["protocol"];
  $tmp->host = $parts["host"];
  $tmp->path = $parts["path"];
  $tmp->file = $parts["file"];
  $tmp->domain = $parts["domain"];
  $tmp->port = $parts["port"];
  $tmp->auth_username = $parts["auth_username"];
  $tmp->auth_password = $parts["auth_password"];

  return $tmp;
}
/**
 * Adds a basic-authentication (username and password) to the list of authentications that will be sent
 * with requests.
 *
 * @param string $url_regex Regular expression defining the URL(s) the authentication should be sent to.
 * @param string $username  The username
 * @param string $password  The password
 *
 * @return bool
 */
public function addBasicAuthentication($url_regex, $username, $password)
{
  // Check regex
  $regex_okay = PHPCrawlerUtils::checkRegexPattern($url_regex);

  if ($regex_okay == true) {
    // Add authentication to basic_authentications-array
    $tmp = array();
    $tmp["url_regex"] = $url_regex;
    $tmp["username"] = $username;
    $tmp["password"] = $password;

    $this->basic_authentications[] = $tmp;
    return true;
  } else {
    return false;
  }
}
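// Usage sketch (hypothetical URL and credentials; $Request stands for an
// instance of the class defining the method above): send the given credentials
// with every request whose URL matches the regex.
$Request->addBasicAuthentication("#^http://www\.example\.com/members/#", "myusername", "mypasswd"); // true

// An invalid preg-pattern is rejected by checkRegexPattern() and nothing is added:
$Request->addBasicAuthentication("not a valid pattern (", "user", "pass"); // false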
/**
 * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler.
 *
 * @param string $regex Regular expression defining the rule
 * @return bool TRUE if the rule was added successfully
 */
public function addURLFilterRule($regex)
{
  $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

  if ($check == true) {
    $this->url_filter_rules[] = trim($regex);
  }

  return $check;
}
/**
 * Checks whether the hostname of the given URL is already cached
 *
 * @param PHPCrawlerURLDescriptor $URL The URL
 * @return bool
 */
public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
{
  $url_parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
  return $this->hostInCache($url_parts["host"]);
}
protected function addLinkToCache($link_raw, $link_code, $link_text = "")
{
  // If the link was already found and processed -> skip it
  if (isset($this->found_links_map[$link_raw])) {
    return;
  }

  // Rebuild URL from link
  $url_rebuild = PHPCrawlerUtils::buildURLFromLink($link_raw, $this->baseUrlParts);

  // If the link couldn't be rebuilt
  if ($url_rebuild == null) {
    return;
  }

  // Create a PHPCrawlerURLDescriptor-object with the URL-data
  $UrlDescriptor = new PHPCrawlerURLDescriptor($url_rebuild, $link_raw, $link_code, $link_text, $this->SourceUrl->url_rebuild);

  // Add the PHPCrawlerURLDescriptor-object to the LinkCache
  $this->LinkCache->addURL($UrlDescriptor);

  // Add the PHPCrawlerURLDescriptor-object to the found-links-array
  $map_key = $link_raw;
  $this->found_links_map[$map_key] = true;
}
function receivePage($url_to_crawl, $referer_url)
{
  // Check if tmp-file was set by the user, otherwise set a default one
  if ($this->tmp_file == "") {
    $this->tmp_file = uniqid(time()) . ".tmp";
  }

  // Define some vars
  $source_read = "";
  $bytes_received = 0;
  $stream_to_memory = false;
  $stream_to_file = false;

  // Split the URL to crawl into its elements (host, path, port and stuff)
  $url_parts = PHPCrawlerUtils::splitURL($url_to_crawl);

  $protocol = $url_parts["protocol"];
  $host = $url_parts["host"];
  $path = $url_parts["path"];
  $query = $url_parts["query"];
  $file = $url_parts["file"];
  $port = $url_parts["port"];

  // If the host was already visited so far
  // -> get the IP from our host-ip-array, otherwise
  // get the IP and add the entry to the array.
  if (isset($this->host_ip_table[$host])) {
    $ip = $this->host_ip_table[$host];
  } else {
    $ip = $this->host_ip_table[$host] = gethostbyname($host);

    // Host switched and wasn't "visited" before.
    // So read the robots.txt-file for this new host (if wanted)
    if ($this->use_robots_txt_files == true) {
      $this->robotsTxtHandler->processRobotsTxt($protocol, $host, $port, $this->user_agent_string);
    }
  }

  // Is this URL allowed to be requested by the robots.txt-file of this host?
  $url_disallowed = false;
  if ($this->use_robots_txt_files == true) {
    $host_url = $protocol . $host . ":" . $port;
    $url_disallowed = $this->robotsTxtHandler->checkIfUrlDisallowed($url_to_crawl, $host_url);
  }

  // Check the protocol (http or https) and build the
  // host-string for fsockopen
  if ($protocol == "https://") {
    $host_str = "ssl://" . $ip;
  } else {
    $host_str = $ip; // normal connect
  }

  // Check if an authentication should be sent
  $authentication = PHPCrawlerUtils::getAuthenticationForURL($this->basic_authentications, $url_to_crawl);

  // Error-codes:
  // 0 - couldn't connect to server / page within timeout-time
  // 1 - stopped reading from socket, read-timeout reached BEFORE EOF()

  // Open socket-connection
  if ($url_disallowed == false) {
    $s = @fsockopen($host_str, $port, $e, $t, $this->socket_mean_timeout);
  } else {
    return false; // Return false if the URL was completely ignored
  }

  if ($s == false) {
    $error_string = $t;
    $error_code = $e;

    if ($t == "" && $e == "") {
      $error_code = 0;
      $error_string = "Couldn't connect to server";
    }
  } else {
    $header_found = false; // will get true if the header of the page was extracted

    // Build header to send
    $headerlines_to_send[] = "GET " . $path . $file . $query . " HTTP/1.0\r\n";
    $headerlines_to_send[] = "HOST: " . $host . "\r\n";

    // Referer
    if ($referer_url != "") {
      $headerlines_to_send[] = "Referer: {$referer_url}\r\n";
    }

    // Cookies
    if ($this->handle_cookies == true) {
      $cookie_string = PHPCrawlerUtils::buildHeaderCookieString($this->cookies, $host);
    }

    if (isset($cookie_string)) {
      $headerlines_to_send[] = "Cookie: " . $cookie_string . "\r\n";
    }

    // Authentication
    if (count($authentication) > 0) {
      $auth_string = base64_encode($authentication["username"] . ":" . $authentication["password"]);
      $headerlines_to_send[] = "Authorization: Basic " . $auth_string . "\r\n";
    }

    // Rest of header
    $headerlines_to_send[] = "User-Agent: " . str_replace("\n", "", $this->user_agent_string) . "\r\n";
    $headerlines_to_send[] = "Connection: close\r\n";
    $headerlines_to_send[] = "\r\n";

    // Now send the header
    for ($x = 0; $x < count($headerlines_to_send); $x++) {
      // Send header-line
      fputs($s, $headerlines_to_send[$x]);

      // Put together lines to $header_send
      if (isset($header_send)) {
        $header_send .= $headerlines_to_send[$x];
      } else {
        $header_send = $headerlines_to_send[$x];
      }
    }

    unset($headerlines_to_send);

    $status = socket_get_status($s);

    // Now read from socket
    // UNTIL timeout reached OR eof() OR content-type shouldn't be followed
    // OR traffic-limit reached or ...
    while (!isset($stop)) {
      socket_set_timeout($s, $this->socket_read_timeout);

      // Read from socket. The @ is to avoid the strange "SSL fatal protocol
      // error"-warning that appears in some environments without any reasons
      $line_read = @fgets($s, 1024);

      $source_read .= $line_read; // do this anyway

      // If we want the content in tmp-file -> write line to TMP-file
      if ($header_found == true && $stream_to_file == true && $line_read) {
        unset($check);
        $check = @fwrite($fp, $line_read);
        if ($check == false) {
          $error_code = "2000";
          $error_string = "Couldn't write to TMP-file " . $this->tmp_file;
        }
      }

      // Count bytes of the content (not the header)
      if ($header_found == true) {
        $bytes_received = $bytes_received + strlen($line_read);
      }

      // Check for traffic limit and stop receiving if reached
      if ($this->traffic_limit_complete_page == false && $this->traffic_limit_all > 0) {
        if (strlen($source_read) + $this->traffic_all > $this->traffic_limit_all) {
          $stop = true;
          $received_completly = false;
          $page_data["traffic_limit_reached"] = true;
        }
      }

      // Check for pagesize-limit
      if ($header_found == true && $bytes_received > $this->pagesize_limit && $this->pagesize_limit > 0) {
        $stop = true;
        $received_completly = false;
      }

      // "Cut" header into separate var $header and handle it
      if ($header_found == false && substr($source_read, -4, 4) == "\r\n\r\n") {
        $header = substr($source_read, 0, strlen($source_read) - 2);
        $actual_content_type = PHPCrawlerUtils::getHeaderTag("content-type", $header);
        $source_read = "";
        $header_found = true;

        // Get the http-status-code
        $http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header);

        // Should this content-type be streamed into memory (true/false)?
        $stream_to_memory = PHPCrawlerUtils::decideStreamToMemory($header, $this->receive_to_memory_matches);

        // Should this content-type be streamed into tmp-file (true/false)?
        $stream_to_file = PHPCrawlerUtils::decideStreamToTmpFile($header, $this->receive_to_file_matches);

        // Yes? Then open the TMP-file for the stream
        if ($stream_to_file == true) {
          $fp = @fopen($this->tmp_file, "w");
          if ($fp == false) {
            $error_code = "2000";
            $error_string = "Couldn't open TMP-file " . $this->tmp_file;
          }
        }

        // Header found here -> check if source should be followed (content-type)
        $follow = PHPCrawlerUtils::decideFollow($header, $this->follow_content_type);

        // No? Then stop with this page!
        if ($follow == false) {
          $stop = true;
        } else {
          $received_completly = true; // just init, may switch later on!
        }

        // Check if a cookie was sent with the header and store it (if wanted)
        if ($this->handle_cookies == true) {
          PHPCrawlerUtils::getCookieData($header, $this->cookies, $host);
        }
      } // end cut and handle header

      // Get status of socket to check timeout and EOF
      $status = socket_get_status($s);

      // Now, if the source-buffer is filled or EOF is reached
      // -> look for links in the buffer, put the found links into
      // array $links_found_in_page and then empty the buffer BUT
      // COPY THE LAST FEW BYTES of the old buffer into the new one!
      // This has to be done because of links that take more than a single line!
      // And yes, this only makes sense if we don't want to have the whole
      // content in memory anyway AND if the content-type is text/html!
      if ($header_found == true && $stream_to_memory == false) {
        if (strlen($source_read) >= 100000 || $status["eof"] == true) {
          if (preg_match("/text\\/html/ i", $actual_content_type)) {
            $links_found_in_buffer = PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
            $source_read = substr($source_read, -1500);
          }
        }
      }

      // Check timeout
      if ($status["timed_out"] == true) {
        $error_code = 1000; // ahem..which int to give??
        $error_string = "socketstream timed out";
        $stop = true;
        $received_completly = false;
      }

      // Check eof
      if ($status["eof"] == true) {
        $stop = true;
      }
    }

    fclose($s); // close socket

    if (isset($fp) && $fp != false) {
      fclose($fp); // close tmp file if used
    }
  }

  // Now, HERE, if the whole content/source was received into memory,
  // we are looking for the links in the complete source (faster).
  // This only makes sense if the content-type is text/html!
  if ($stream_to_memory == true) {
    unset($links_found_in_page);
    if (preg_match("/text\\/html/ i", $actual_content_type)) {
      PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
    }
  }

  // Add the "referer_url" to the array-elements
  if (isset($links_found_in_page)) {
    for ($x = 0; $x < count($links_found_in_page); $x++) {
      $links_found_in_page[$x]["referer_url"] = $url_to_crawl;
    }
  }

  // Page crawled,
  // return header, source, followed (true/false) and all we got here
  unset($page_data);

  if (isset($error_code)) { $page_data["error_code"] = $error_code; }
  else { $page_data["error_code"] = false; }

  if (isset($error_string)) { $page_data["error_string"] = $error_string; }
  else { $page_data["error_string"] = false; }

  if (isset($follow)) { $page_data["received"] =& $follow; }
  else { $page_data["received"] = false; }

  if (isset($received_completly)) { $page_data["received_completly"] =& $received_completly; }
  else { $page_data["received_completly"] = false; }

  // Wrote "completely" wrong in prev. version, keep the alias for compatibility
  $page_data["received_completely"] =& $page_data["received_completly"];

  if (isset($bytes_received)) { $page_data["bytes_received"] = $bytes_received; }
  else { $page_data["bytes_received"] = 0; }

  if (isset($header)) { $page_data["header"] =& $header; }
  else { $page_data["header"] = false; }

  if (isset($http_status_code)) { $page_data["http_status_code"] =& $http_status_code; }
  else { $page_data["http_status_code"] = false; }

  if (isset($actual_content_type)) { $page_data["content_type"] = $actual_content_type; }
  else { $page_data["content_type"] = false; }

  // TMP-file infos and that
  $page_data["content_tmp_file"] = $page_data["received_to_file"] = false;
  $page_data["source"] = $page_data["content"] = $page_data["received_to_memory"] = false;

  if (isset($page_data["received"])) {
    if ($stream_to_file == true) {
      $page_data["content_tmp_file"] = $this->tmp_file;
      $page_data["received_to_file"] = true;
    }
    if ($stream_to_memory == true) {
      $page_data["source"] =& $source_read;
      $page_data["content"] =& $source_read;
      $page_data["received_to_memory"] = true;
    }
  }

  // Additional infos for the override-function handlePageData()
  $page_data["protocol"] = $protocol;
  $page_data["port"] = $port;
  $page_data["host"] = $host;
  $page_data["path"] = $path;
  $page_data["file"] = $file;
  $page_data["query"] = $query;
  $page_data["header_send"] =& $header_send;
  $page_data["referer_url"] = $referer_url;

  // "Normalized" URL and referer-URL (e.g. without port if port is 80 and protocol is http)
  $page_data["url"] = $url_to_crawl;

  // All links found in this page
  $page_data["links_found"] =& $links_found_in_page;

  // Increase SUM of traffic altogether this instance received
  $this->traffic_all = $this->traffic_all + strlen($page_data["header"]) + $page_data["bytes_received"];

  // Set flag if traffic-limit is reached
  if ($this->traffic_all > $this->traffic_limit_all && $this->traffic_limit_all != 0) {
    $page_data["traffic_limit_reached"] = true;
  }

  if (!isset($page_data["traffic_limit_reached"])) {
    $page_data["traffic_limit_reached"] = false;
  }

  return $page_data;
}
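// For reference, a hypothetical example (values made up, not produced by the
// code above): for the URL "http://www.example.com/dir/page.php?id=1" with a
// referer set, cookie-handling enabled and the user-agent "PHPCrawl", the
// header-lines assembled in receivePage() combine to a raw HTTP/1.0 request
// like this:
//
//   GET /dir/page.php?id=1 HTTP/1.0
//   HOST: www.example.com
//   Referer: http://www.example.com/
//   Cookie: sessionid=abc123
//   User-Agent: PHPCrawl
//   Connection: close
//
// followed by the empty line ("\r\n") that terminates the header.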
/**
 * Adds a Link-Priority-Level
 *
 * @param string $regex
 * @param int    $level
 */
public function addLinkPriority($regex, $level)
{
  $c = count($this->url_priorities);
  $this->url_priorities[$c]["match"] = trim($regex);
  $this->url_priorities[$c]["level"] = trim($level);

  // Sort the url-priority-array so that high priority-levels come first.
  PHPCrawlerUtils::sort2dArray($this->url_priorities, "level", SORT_DESC);
}
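// Illustration (made-up patterns): after these two calls the internal array is
// sorted by level descending, so URLs matching "/forum/" get handed out before
// URLs matching "/\.gif/", which in turn come before unmatched URLs (level 0):
//
//   $this->addLinkPriority("/forum/", 10);
//   $this->addLinkPriority("/\.gif/", 5);
//
//   $this->url_priorities == array(
//     array("match" => "/forum/", "level" => 10),
//     array("match" => "/\.gif/", "level" => 5),
//   );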
function addBasicAuthentication($expression, $username, $password)
{
  $this->initCrawler();

  $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern

  if ($check == true) {
    $c = count($this->pageRequest->basic_authentications);
    $this->pageRequest->basic_authentications[$c]["match"] = $expression;
    $this->pageRequest->basic_authentications[$c]["username"] = $username;
    $this->pageRequest->basic_authentications[$c]["password"] = $password;
    return true;
  } else {
    return false;
  }
}
/**
 * Returns the Robots.txt-URL related to the given URL
 *
 * @param PHPCrawlerURLDescriptor $Url The URL as PHPCrawlerURLDescriptor-object
 * @return PHPCrawlerURLDescriptor URL of the robots.txt-file related to the passed URL.
 */
public static function getRobotsTxtURL(PHPCrawlerURLDescriptor $Url)
{
  $url_parts = PHPCrawlerUtils::splitURL($Url->url_rebuild);
  $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";

  return new PHPCrawlerURLDescriptor($robots_txt_url);
}
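// Example (hypothetical input; assuming the static method lives on the
// robots.txt-handler class as in this library): the robots.txt always sits at
// the host-root, with protocol and port preserved.
$Url = new PHPCrawlerURLDescriptor("http://www.example.com:8080/some/page.html");
$RobotsUrl = PHPCrawlerRobotsTxtParser::getRobotsTxtURL($Url);
// $RobotsUrl->url_rebuild -> "http://www.example.com:8080/robots.txt"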
/**
 * Adds a rule to the list of rules that decide in which kinds of documents the crawler
 * should search for links (regarding their content-type)
 *
 * By default the crawler ONLY searches for links in documents of type "text/html".
 * Use this method to add one or more other content-types the crawler should check for links.
 *
 * Example:
 * <code>
 * $crawler->addLinkSearchContentType("#text/css# i");
 * $crawler->addLinkSearchContentType("#text/xml# i");
 * </code>
 * These rules let the crawler search for links in HTML-, CSS- and XML-documents.
 *
 * <b>Please note:</b> It is NOT recommended to let the crawler check for links in EVERY
 * document-type! This could slow down the crawling-process dramatically (e.g. if the crawler
 * receives large binary-files like images and tries to find links in them).
 *
 * @param string $regex Regular-expression defining the rule
 * @return bool TRUE if the rule was successfully added
 */
function addLinkSearchContentType($regex)
{
  $this->initCrawler();

  $check = PHPCrawlerUtils::checkExpressionPattern($regex); // Check pattern

  if ($check == true) {
    $this->pageRequest->linksearch_content_types[] = trim($regex);
  }

  return $check;
}
function buidlNonFollowMatches($applying_lines, $base_url)
{
  // First, get all "Disallow:"-paths
  $disallow_pathes = array();
  for ($x = 0; $x < count($applying_lines); $x++) {
    if (preg_match("#^Disallow:# i", $applying_lines[$x])) {
      preg_match("#^Disallow:[ ]*(.*)#", $applying_lines[$x], $match);
      $disallow_pathes[] = trim($match[1]);
    }
  }

  // Works like this:
  // The base-URL is "http://www.foo.com".
  // The directive says: "Disallow: /bla/".
  // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"

  $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);

  $non_follow_expressions = array();

  for ($x = 0; $x < count($disallow_pathes); $x++) {
    // If the disallow-path is empty -> simply ignore it
    if ($disallow_pathes[$x] == "") {
      continue;
    }

    $non_follow_path_complpete = $normalized_base_url . substr($disallow_pathes[$x], 1); // "http://www.foo.com/bla/"
    $non_follow_exp = preg_quote($non_follow_path_complpete, "#"); // "http://www\.foo\.com/bla/"
    $non_follow_exp = "#^" . $non_follow_exp . "#"; // "#^http://www\.foo\.com/bla/#"

    $non_follow_expressions[] = $non_follow_exp;
  }

  return $non_follow_expressions;
}
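// Worked example (values taken from the in-code comments above): with the
// base-URL "http://www.foo.com/" and the robots.txt-line "Disallow: /bla/",
// the loop produces:
//
//   $non_follow_path_complpete = "http://www.foo.com/bla/"
//   $non_follow_exp            = "#^http://www\.foo\.com/bla/#"
//
// Every URL beginning with "http://www.foo.com/bla/" then matches this
// expression and is skipped by the crawler.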
function buildURL($link, $actual_url, $url_parts_actual = "")
{
  // Important: This function has to return a FULL URL, including the port!

  if ($url_parts_actual == "") {
    $url_parts_actual = PHPCrawlerUtils::splitURL($actual_url);
  }

  // Entities-replacements
  $entities = array("'&(quot|#34);'i", "'&(amp|#38);'i", "'&(lt|#60);'i", "'&(gt|#62);'i",
                    "'&(nbsp|#160);'i", "'&(iexcl|#161);'i", "'&(cent|#162);'i",
                    "'&(pound|#163);'i", "'&(copy|#169);'i");
  $replace = array("\"", "&", "<", ">", " ", chr(161), chr(162), chr(163), chr(169));

  $link = str_replace("\n", "", $link);
  $link = str_replace("\r", "", $link);

  // Remove "#..." at the end, but ONLY at the end,
  // not if "#" is at the beginning!
  $link = preg_replace("/^(.{1,})#.{0,}\$/", "\\1", $link);

  // Cases

  // Protocol-relative link like "//foo.com/page.htm" -> make it "http://foo.com/page.htm"
  if (substr($link, 0, 2) == "//") {
    $link = "http:" . $link;
    $link = PHPCrawlerUtils::rebuildURL(PHPCrawlerUtils::splitURL($link));
  }
  // Link starts with "/" -> absolute path on the current host
  elseif (substr($link, 0, 1) == "/") {
    $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $link;
  }
  // Link starts with "./" -> relative to the current path
  elseif (substr($link, 0, 2) == "./") {
    $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $url_parts_actual["path"] . substr($link, 2);
  }
  // Link is already a full URL with some protocol
  elseif (preg_match("/^[^\\/]{1,}(:\\/\\/)/", $link)) {
    if (substr($link, 0, 7) == "http://" || substr($link, 0, 8) == "https://") {
      $link = PHPCrawlerUtils::rebuildURL(PHPCrawlerUtils::splitURL($link));
    } else {
      $link = ""; // Kick out unsupported protocols
    }
  }
  // Other protocol-like prefixes (javascript:, mailto: etc.) -> kick out
  elseif (preg_match("/^[a-zA-Z]{0,}:[^\\/]{0,1}/", $link)) {
    $link = "";
  }
  // Link starts with "../" -> walk up the current path
  elseif (substr($link, 0, 3) == "../") {
    $new_path = $url_parts_actual["path"];

    while (substr($link, 0, 3) == "../") {
      $new_path = preg_replace('/\\/[^\\/]{0,}\\/$/', "/", $new_path);
      $link = substr($link, 3);
    }

    $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $new_path . $link;
  }
  // Pure anchor-link like "#top" -> ignore
  elseif (substr($link, 0, 1) == "#") {
    $link = "";
  }
  // Empty link -> points to the current URL
  elseif ($link == "") {
    $link = $actual_url;
  }
  // Everything else -> relative to the current path
  else {
    $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $url_parts_actual["path"] . $link;
  }

  // Now, at last, replace all HTML-entities with normal text.
  // I.e.: HTML-code of the link is: <a href="index.php?x=1&amp;y=2">
  // -> Link has to be "index.php?x=1&y=2"
  $link = preg_replace($entities, $replace, $link);

  $link = rawurldecode($link);
  $link = str_replace(" ", "%20", $link);

  // "Normalize" URL
  $link = PHPCrawlerUtils::normalizeUrl($link);

  return $link;
}
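// Resolution examples (illustrative only), assuming the current URL is
// "http://www.example.com:80/dir/page.html" and assuming the final
// normalizeUrl()-call drops the default port 80 again (as described for
// "normalized" URLs elsewhere in this source):
//
//   "//cdn.example.com/x.js" -> "http://cdn.example.com/x.js"      (protocol-relative)
//   "/img/a.gif"             -> "http://www.example.com/img/a.gif"
//   "./sub/b.html"           -> "http://www.example.com/dir/sub/b.html"
//   "../c.html"              -> "http://www.example.com/c.html"
//   "mailto:foo@example.com" -> ""  (unsupported protocol, kicked out)
//   "#anchor"                -> ""  (pure anchor-link)
//   "d.html"                 -> "http://www.example.com/dir/d.html"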
/**
 * Adds a regular expression together with a priority-level to the list of rules that decide which links should be preferred.
 *
 * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
 * All links that don't match any of the given rules will get the level 0 (lowest level) automatically.
 *
 * The level can be any positive integer.
 *
 * <b>Example:</b>
 *
 * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
 * <code>
 * $crawler->addLinkPriority("/forum/", 10);
 * $crawler->addLinkPriority("/\.gif/", 5);
 * </code>
 *
 * @param string $regex Regular expression defining the rule
 * @param int    $level The priority-level
 *
 * @return bool TRUE if a valid preg-pattern is given as argument and was successfully added, otherwise it returns FALSE.
 * @section 10 Other settings
 */
function addLinkPriority($regex, $level)
{
  $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

  if ($check == true && preg_match("/^[0-9]*\$/", $level)) {
    $c = count($this->link_priority_array);
    $this->link_priority_array[$c]["match"] = trim($regex);
    $this->link_priority_array[$c]["level"] = trim($level);

    return true;
  } else {
    return false;
  }
}
/**
 * Adds a rule to the list of rules that decide which kinds of documents should get
 * checked for links (regarding their content-type)
 *
 * @param string $regex Regular-expression defining the rule
 * @return bool TRUE if the rule was successfully added
 */
public function addLinkSearchContentType($regex)
{
  $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

  if ($check == true) {
    $this->linksearch_content_types[] = trim($regex);
  }

  return $check;
}
function handleDocumentInfo($DocInfo)
{
  echo "<table class=intbl>";

  // Loop over the output-array and print info if wanted
  // (foreach instead of the deprecated each()-construct)
  foreach ($this->output_array as $key => $value) {
    if ($key == "requested_url") {
      $str = '<a href="' . $DocInfo->url . '" target=blank>' . $DocInfo->url . '</a>';
      echo "<tr><td width=130><nobr>Page requested:</nobr></td><td width=470>" . $str . "</td></tr>";
    }

    if ($key == "http_status_code") {
      if ($DocInfo->http_status_code) { $str = $DocInfo->http_status_code; }
      else { $str = "-"; }
      echo "<tr><td>HTTP-Status:</td><td>" . $str . "</td></tr>";
    }

    if ($key == "content_type") {
      if ($DocInfo->content_type) { $str = $DocInfo->content_type; }
      else { $str = "-"; }
      echo "<tr><td>Content-Type:</td><td>" . $str . "</td></tr>";
    }

    if ($key == "content_size") {
      $str = PHPCrawlerUtils::getHeaderValue($DocInfo->header, "content-length");
      if (trim($str) == "") { $str = "??"; }
      echo "<tr><td>Content-Size:</td><td >" . $str . " bytes</td></tr>";
    }

    if ($key == "content_received") {
      if ($DocInfo->received == true) { $str = "Yes"; } else { $str = "No"; }
      echo "<tr><td>Content received:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "content_received_completely") {
      if ($DocInfo->received_completely == true) { $str = "Yes"; } else { $str = "No"; }
      echo "<tr><td><nobr>Received completely:</nobr></td><td >" . $str . "</td></tr>";
    }

    if ($key == "bytes_received") {
      echo "<tr><td>Bytes received:</td><td>" . $DocInfo->bytes_received . " bytes</td></tr>";
    }

    if ($key == "referer_url") {
      if ($DocInfo->referer_url == "") { $str = "-"; }
      else { $str = $DocInfo->referer_url; } // was mistakenly read from the undefined $page_data-array
      echo "<tr><td><nobr>Refering URL</nobr>:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "refering_linkcode") {
      if ($DocInfo->refering_linkcode == "") { $str = "-"; }
      else { $str = str_replace("\n", "<br>", htmlentities($DocInfo->refering_linkcode)); }
      echo "<tr><td valign=top><nobr>Refering linkcode:</nobr></td><td >" . $str . "</td></tr>";
    }

    if ($key == "refering_link_raw") {
      if ($DocInfo->refering_link_raw == "") { $str = "-"; }
      else { $str = $DocInfo->refering_link_raw; }
      echo "<tr><td><nobr>Refering Link RAW: </nobr></td><td >" . $str . "</td></tr>";
    }

    if ($key == "refering_linktext") {
      if ($DocInfo->refering_linktext == "") { $str = "-"; }
      else { $str = str_replace("\n", "<br>", htmlentities($DocInfo->refering_linktext)); }
      echo "<tr><td valign=top><nobr>Refering linktext</nobr>:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "header_send") {
      if ($DocInfo->header_send) { $str = str_replace("\n", "<br>", trim($DocInfo->header_send)); }
      else { $str = "-"; }
      echo "<tr><td valign=top>Send header:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "header") {
      if ($DocInfo->header) { $str = str_replace("\n", "<br>", trim($DocInfo->header)); }
      else { $str = "-"; }
      echo "<tr><td valign=top>Received header:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "nr_found_links") {
      $str = count($DocInfo->links_found);
      echo "<tr><td valign=top>Links found:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "all_found_links") {
      echo "<tr><td valign=top>List of found links:</td>";
      echo "<td>";
      for ($x = 0; $x < count($DocInfo->links_found_url_descriptors); $x++) {
        echo $DocInfo->links_found_url_descriptors[$x]->url_rebuild . "<br>";
      }
      if (count($DocInfo->links_found_url_descriptors) == 0) { echo "-"; }
      echo "</td>";
      echo "</tr>";
    }

    if ($key == "received_to_file") {
      if ($DocInfo->received_to_file) { $str = "Yes"; } else { $str = "No"; }
      echo "<tr><td valign=top>Received to TMP-file:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "tmpfile_name_size") {
      if ($DocInfo->content_tmp_file) { $str = $DocInfo->content_tmp_file . " (" . filesize($DocInfo->content_tmp_file) . " bytes)"; }
      else { $str = "-"; }
      echo "<tr><td valign=top>Content TMP-file:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "received_to_memory") {
      if ($DocInfo->received_to_memory) { $str = "Yes"; } else { $str = "No"; }
      echo "<tr><td valign=top>Received to memory:</td><td >" . $str . "</td></tr>";
    }

    if ($key == "memory_content_size") {
      echo "<tr><td valign=top>Memory-content-size:</td><td >" . strlen($DocInfo->source) . " bytes</td></tr>";
    }
  }

  // Output error if there's one
  if ($DocInfo->error_occured) {
    echo "<tr>\n <td class=red>Error:</td>\n <td class=red>" . $DocInfo->error_string . "</td>\n </tr>";
  }

  echo "</table> <br>";

  $this->flushOutput();
}
/**
 * Returns the default Robots.txt-URL related to the given URL
 *
 * @param string $url The URL
 * @return string URL of the related robots.txt-file
 */
public static function getRobotsTxtURL($url)
{
  $url_parts = PHPCrawlerUtils::splitURL($url);
  $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";

  return $robots_txt_url;
}