/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $return_cookies = array();

  $Result = $this->PDO->query("SELECT * FROM cookies WHERE source_domain = '" . $url_parts["domain"] . "';");
  $rows = $Result->fetchAll(PDO::FETCH_ASSOC);
  $Result->closeCursor();

  $cnt = count($rows);
  for ($x = 0; $x < $cnt; $x++)
  {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($rows[$x]["domain"] == $url_parts["host"] || preg_match("#" . preg_quote($rows[$x]["domain"]) . "\$#", $url_parts["host"]))
    {
      // Does the path match?
      if (preg_match("#^" . preg_quote($rows[$x]["path"]) . "#", $url_parts["path"]))
      {
        $Cookie = new PHPCrawlerCookieDescriptor($rows[$x]["source_url"], $rows[$x]["name"], $rows[$x]["value"], $rows[$x]["expires"], $rows[$x]["path"], $rows[$x]["domain"]);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");

  return $return_cookies;
}
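// Illustrative sketch (not part of the cache class): the same domain tail-matching and
// path-prefix checks used above, shown in isolation. Only plain PHP is assumed;
// the example cookie values and hosts are invented.
function cookieMatchesTarget($cookie_domain, $cookie_path, $target_host, $target_path)
{
  $domain_matches = ($cookie_domain == $target_host ||
                     preg_match("#" . preg_quote($cookie_domain) . "\$#", $target_host));
  $path_matches = (bool)preg_match("#^" . preg_quote($cookie_path) . "#", $target_path);
  return $domain_matches && $path_matches;
}

// ".acme.com" tail-matches "shipping.crate.acme.com", and path "/" is a prefix of "/store/":
var_dump(cookieMatchesTarget(".acme.com", "/", "shipping.crate.acme.com", "/store/")); // bool(true)
var_dump(cookieMatchesTarget(".acme.com", "/", "anvil.example.org", "/store/"));       // bool(false)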
/**
 * Initiates a new PHPCrawlerCookieDescriptor-object.
 *
 * @param string $source_url URL the cookie was sent from.
 * @param string $name       Cookie-name
 * @param string $value      Cookie-value
 * @param string $expires    Expire-string, e.g. "Sat, 08-Aug-2020 23:59:08 GMT"
 * @param string $path       Cookie-path
 * @param string $domain     Cookie-domain
 * @internal
 */
public function __construct($source_url, $name, $value, $expires = null, $path = null, $domain = null)
{
  // For cookie-specs, see e.g. http://curl.haxx.se/rfc/cookie_spec.html

  $this->name = $name;
  $this->value = $value;
  $this->expires = $expires;
  $this->path = $path;
  $this->domain = $domain;

  $source_url_parts = PHPCrawlerUtils::splitURL($source_url);

  // Source-domain
  $this->source_domain = $source_url_parts["domain"];

  // Source-URL
  $this->source_url = $source_url;

  // Send-time
  $this->cookie_send_time = PHPCrawlerBenchmark::getmicrotime();

  // Convert expire-date to timestamp
  if ($this->expires != null)
  {
    $this->expire_timestamp = @strtotime($this->expires);
  }

  // If domain doesn't start with "." -> add it (see RFC)
  if ($this->domain != null && substr($this->domain, 0, 1) != ".")
  {
    $this->domain = "." . $this->domain;
  }

  // Complete missing values

  // If domain not set -> domain is the host of the source-URL WITHOUT leading "."! (see RFC)
  if ($this->domain == null)
  {
    $this->domain = $source_url_parts["host"];
  }

  // If path not set
  if ($this->path == null)
  {
    $this->path = $source_url_parts["path"];
  }
}
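// Usage sketch (assumes the PHPCrawl classes are loaded; URL and cookie values are invented):
// constructing a descriptor without explicit path/domain lets the constructor derive
// them from the source-URL, as shown in the branches above.
$Cookie = new PHPCrawlerCookieDescriptor("http://www.example.com/shop/index.php", "SESSID", "abc123");

echo $Cookie->domain; // host of the source-URL, no leading "." (here "www.example.com")
echo $Cookie->path;   // directory-path of the source-URL (here "/shop/")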
/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $target_domain = $url_parts["domain"]; // e.g. acme.com

  $return_cookies = array();

  // Iterate over all cookies of this domain
  @reset($this->cookies[$target_domain]);
  while (list($hash) = @each($this->cookies[$target_domain]))
  {
    $Cookie = $this->cookies[$target_domain][$hash];

    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($Cookie->domain == $url_parts["host"] || preg_match("#" . preg_quote($Cookie->domain) . "\$#", $url_parts["host"]))
    {
      // Does the path match?
      if (preg_match("#^" . preg_quote($Cookie->path) . "#", $url_parts["path"]))
      {
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  return $return_cookies;
}
/**
 * Returns all cookies from the cache that are addressed to the given URL.
 *
 * @param string $target_url The target-URL
 * @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
 */
public function getCookiesForUrl($target_url)
{
  PHPCrawlerBenchmark::start("getting_cookies_from_cache");

  $url_parts = PHPCrawlerUtils::splitURL($target_url);
  $return_cookies = array();

  $result = db_query("SELECT * FROM {" . $this->table . "} WHERE source_domain = '" . $url_parts["domain"] . "' AND crawler_id = '" . $this->crawler_id . "';");
  $rows = $result->fetchAllAssoc('id');

  // fetchAllAssoc() keys the rows by their id-column, so iterate with foreach
  // instead of a numeric for-loop.
  foreach ($rows as $row)
  {
    // Does the cookie-domain match?
    // Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
    // A domain attribute of "acme.com" would match host names "anvil.acme.com"
    // as well as "shipping.crate.acme.com"
    if ($row->domain == $url_parts["host"] || preg_match("#" . preg_quote($row->domain) . "\$#", $url_parts["host"]))
    {
      // Does the path match?
      if (preg_match("#^" . preg_quote($row->path) . "#", $url_parts["path"]))
      {
        $Cookie = new PHPCrawlerCookieDescriptor($row->source_url, $row->name, $row->value, $row->expires, $row->path, $row->domain);
        $return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
      }
    }
  }

  // Convert to numeric array
  $return_cookies = array_values($return_cookies);

  PHPCrawlerBenchmark::stop("getting_cookies_from_cache");

  return $return_cookies;
}
/**
 * Returns the PHPCrawlerUrlPartsDescriptor-object for the given URL.
 *
 * @return PHPCrawlerUrlPartsDescriptor
 */
public static function fromURL($url)
{
  $parts = PHPCrawlerUtils::splitURL($url);

  $tmp = new PHPCrawlerUrlPartsDescriptor();

  $tmp->protocol = $parts["protocol"];
  $tmp->host = $parts["host"];
  $tmp->path = $parts["path"];
  $tmp->file = $parts["file"];
  $tmp->domain = $parts["domain"];
  $tmp->port = $parts["port"];
  $tmp->auth_username = $parts["auth_username"];
  $tmp->auth_password = $parts["auth_password"];

  return $tmp;
}
/**
 * Returns the Robots.txt-URL related to the given URL.
 *
 * @param PHPCrawlerURLDescriptor $Url The URL as PHPCrawlerURLDescriptor-object
 * @return PHPCrawlerURLDescriptor Robots.txt-URL related to the passed URL.
 */
public static function getRobotsTxtURL(PHPCrawlerURLDescriptor $Url)
{
  $url_parts = PHPCrawlerUtils::splitURL($Url->url_rebuild);
  $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";

  return new PHPCrawlerURLDescriptor($robots_txt_url);
}
/**
 * Sets the port to connect to for crawling the starting-url set in setUrl().
 *
 * The default port is 80.
 *
 * Note:
 * <code>
 * $crawler->setURL("http://www.foo.com");
 * $crawler->setPort(443);
 * </code>
 * has the same effect as
 * <code>
 * $crawler->setURL("http://www.foo.com:443");
 * </code>
 *
 * @param int $port The port
 * @return bool
 * @section 1 Basic settings
 */
public function setPort($port)
{
  // Check port
  if (!preg_match("#^[0-9]{1,5}\$#", $port))
  {
    return false;
  }

  // Add port to the starting-URL
  $url_parts = PHPCrawlerUtils::splitURL($this->starting_url);
  $url_parts["port"] = $port;
  $this->starting_url = PHPCrawlerUtils::buildURLFromParts($url_parts, true);

  return true;
}
/**
 * Checks whether a given URL matches the rules.
 *
 * @param PHPCrawlerURLDescriptor $url The URL as a PHPCrawlerURLDescriptor-object
 * @return bool TRUE if the URL matches the defined rules.
 */
protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
{
  // URL-parts of the URL to check against the filter-rules
  $url_parts = PHPCrawlerUtils::splitURL($url->url_rebuild);

  // Kick out all links that are NOT of protocol "http" or "https"
  if ($url_parts["protocol"] != "http://" && $url_parts["protocol"] != "https://")
  {
    return false;
  }

  // If meta-tag "robots"->"nofollow" is present and obey_nofollow_tags is TRUE -> always kick out URL
  if ($this->obey_nofollow_tags == true && isset($this->CurrentDocumentInfo->meta_attributes["robots"]) &&
      preg_match("#nofollow# i", $this->CurrentDocumentInfo->meta_attributes["robots"]))
  {
    return false;
  }

  // If linkcode contains "rel='nofollow'" and obey_nofollow_tags is TRUE -> always kick out URL
  if ($this->obey_nofollow_tags == true)
  {
    if (preg_match("#^<[^>]*rel\\s*=\\s*(?|\"\\s*nofollow\\s*\"|'\\s*nofollow\\s*'|\\s*nofollow\\s*)[^>]*>#", $url->linkcode))
    {
      return false;
    }
  }

  // Filter URLs to other domains if wanted
  if ($this->general_follow_mode >= 1)
  {
    if ($url_parts["domain"] != $this->starting_url_parts["domain"])
    {
      return false;
    }
  }

  // Filter URLs to other hosts if wanted
  if ($this->general_follow_mode >= 2)
  {
    // Ignore "www." at the beginning of the host, because "www.foo.com" is the same host as "foo.com"
    if (preg_replace("#^www\\.#", "", $url_parts["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"]))
    {
      return false;
    }
  }

  // Filter URLs leading path-up if wanted
  if ($this->general_follow_mode == 3)
  {
    if ($url_parts["protocol"] != $this->starting_url_parts["protocol"] ||
        preg_replace("#^www\\.#", "", $url_parts["host"]) != preg_replace("#^www\\.#", "", $this->starting_url_parts["host"]) ||
        substr($url_parts["path"], 0, strlen($this->starting_url_parts["path"])) != $this->starting_url_parts["path"])
    {
      return false;
    }
  }

  // Filter URLs by url_filter_rules
  for ($x = 0; $x < count($this->url_filter_rules); $x++)
  {
    if (preg_match($this->url_filter_rules[$x], $url->url_rebuild))
    {
      return false;
    }
  }

  // Filter URLs by url_follow_rules
  if (count($this->url_follow_rules) > 0)
  {
    $match_found = false;
    for ($x = 0; $x < count($this->url_follow_rules); $x++)
    {
      if (preg_match($this->url_follow_rules[$x], $url->url_rebuild))
      {
        $match_found = true;
        break;
      }
    }

    if ($match_found == false)
    {
      return false;
    }
  }

  return true;
}
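// Minimal standalone sketch (not the library API) of the follow-mode checks above,
// reduced to the domain/host/path comparisons. PHPCrawlerUtils::splitURL() is assumed
// to be available; the example URLs are invented.
function followModeAllows($follow_mode, $starting_url, $candidate_url)
{
  $start = PHPCrawlerUtils::splitURL($starting_url);
  $cand  = PHPCrawlerUtils::splitURL($candidate_url);

  // mode >= 1: same domain only
  if ($follow_mode >= 1 && $cand["domain"] != $start["domain"]) return false;

  // mode >= 2: same host only ("www." is ignored)
  if ($follow_mode >= 2 &&
      preg_replace("#^www\\.#", "", $cand["host"]) != preg_replace("#^www\\.#", "", $start["host"])) return false;

  // mode == 3: additionally same protocol and no "path-up" links
  if ($follow_mode == 3 &&
      ($cand["protocol"] != $start["protocol"] ||
       substr($cand["path"], 0, strlen($start["path"])) != $start["path"])) return false;

  return true;
}

// With starting URL http://www.foo.com/bar/: mode 1 still allows http://sub.foo.com/x.html
// (same domain), mode 2 rejects it (different host), and mode 3 additionally rejects
// http://www.foo.com/other/x.html (leads path-up).
var_dump(followModeAllows(2, "http://www.foo.com/bar/", "http://sub.foo.com/x.html")); // bool(false)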
/**
 * Checks whether the hostname of the given URL is already cached.
 *
 * @param PHPCrawlerURLDescriptor $URL The URL
 * @return bool
 */
public function urlHostInCache(PHPCrawlerURLDescriptor $URL)
{
  $url_parts = PHPCrawlerUtils::splitURL($URL->url_rebuild);
  return $this->hostInCache($url_parts["host"]);
}
/**
 * Returns the default Robots.txt-URL related to the given URL.
 *
 * @param string $url The URL
 * @return string URL of the related robots.txt file
 */
public static function getRobotsTxtURL($url)
{
  $url_parts = PHPCrawlerUtils::splitURL($url);
  $robots_txt_url = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"] . "/robots.txt";

  return $robots_txt_url;
}
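// Usage sketch: the robots.txt-URL is built from protocol, host and port only;
// path and file of the given URL are dropped. ("RobotsTxtHandlerClass" is a placeholder
// for whatever class this static method belongs to, and the URL is invented.)
$robots_txt_url = RobotsTxtHandlerClass::getRobotsTxtURL("http://www.example.com:8080/some/page.html");
echo $robots_txt_url; // "http://www.example.com:8080/robots.txt"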
/**
 * Sets the URL for the request.
 *
 * @param PHPCrawlerURLDescriptor $UrlDescriptor A PHPCrawlerURLDescriptor-object containing the URL to request
 */
public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
{
  $this->UrlDescriptor = $UrlDescriptor;

  // Split the URL into its parts
  $this->url_parts = PHPCrawlerUtils::splitURL($UrlDescriptor->url_rebuild);
}
/**
 * Normalizes the given URL: rebuilds it without the port if the port is the
 * default one for the protocol (80 for http, 443 for https).
 */
function normalizeURL($url)
{
  $url_parts = PHPCrawlerUtils::splitURL($url);

  if (($url_parts["protocol"] == "http://" && $url_parts["port"] == 80) ||
      ($url_parts["protocol"] == "https://" && $url_parts["port"] == 443))
  {
    $url_rebuild = $url_parts["protocol"] . $url_parts["host"] . $url_parts["path"] . $url_parts["file"] . $url_parts["query"];
  }
  else
  {
    $url_rebuild = $url;
  }

  return $url_rebuild;
}
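// Example of the normalization above, assuming normalizeURL() is available as the
// standalone function shown (the URLs are invented): default ports are dropped,
// non-default ports are left untouched.
echo normalizeURL("http://www.example.com:80/index.html");    // "http://www.example.com/index.html"
echo normalizeURL("https://www.example.com:8443/index.html"); // unchanged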
function go()
{
  connectToDb($db);

  $starting_time = $this->getmicrotime();

  // Init, split given URL into host, port, path, file and so on
  $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl);

  // Set base-host and base-path "global" for this class,
  // we need them very often (I guess at this point...)
  $this->base_path = $url_parts["path"];
  $this->base_host = $url_parts["host"];
  $this->base_domain = $url_parts["domain"];

  // If the base-port wasn't set by the user ->
  // take the one from the given start-URL.
  if ($this->base_port == "")
  {
    $this->base_port = $url_parts["port"];
  }

  // If the base-port WAS set by the user
  $url_parts["port"] = $this->base_port;

  // Reset the base_url
  $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts);
  $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl);

  // Init counters
  $links_followed = 0;
  $files_received = 0;

  // Put the first URL into our main-array
  $tmp[0]["url_rebuild"] = $this->url_to_crawl;
  PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches);

  if (isset($tmp[0]["url_rebuild"]) && $tmp[0]["url_rebuild"] != "")
  {
    PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
  }

  // MAIN-LOOP -------------------------------------------------------------------

  // It works like this:
  // The first loop looks through all the "priority"-arrays and checks if any
  // of these arrays is filled with URLs.
  for ($pri_level = $this->max_priority_level + 1; $pri_level > -1; $pri_level--)
  {
    // Yep. Found a priority-array with at least one URL
    if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling))
    {
      // Now "process" all URLs in this priority-array
      @reset($this->urls_to_crawl[$pri_level]);
      while (list($key) = @each($this->urls_to_crawl[$pri_level]))
      {
        $all_start = $this->getmicrotime();

        $stop_crawling_this_level = false; // init

        // Request URL (crawl())
        unset($page_data);

        if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"]))
        {
          $this->urls_to_crawl[$pri_level][$key]["referer_url"] = "";
        }

        // Increment the number of HTTP requests sent, as fsockopen is called next
        if ($db)
        {
          incrementHttpRequests($db, $this->testId);
        }

        $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"],
                                                     $this->urls_to_crawl[$pri_level][$key]["referer_url"]);

        // If the request-object just ignored the URL ->
        // -> stop and remove the URL from the array
        if ($page_data == false)
        {
          unset($this->urls_to_crawl[$pri_level][$key]);
          continue;
        }

        $links_followed++;

        // Now $page_data["links_found"] contains all found links at this point

        // Check if a "<base href..>"-tag is given in the source and extract
        // the base-URL.
        // !! Doesn't have to be rebuilt because it can only be a fully
        // qualified URL !!
        $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]);
        if ($base_url == "")
        {
          $actual_url =& $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
        }
        else
        {
          $actual_url = $base_url;
        }

        // Set flag "content_found" if content was found
        if (isset($page_data["http_status_code"]) && $page_data["http_status_code"] == 200)
        {
          $content_found = true;
        }

        // Check for a REDIRECT-header and, if wanted, put it into the array of found links
        $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]);
        if ($redirect && $this->follow_redirects == true)
        {
          $tmp_array["link_raw"] = $redirect;
          $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
          $page_data["links_found"][] = $tmp_array;
        }

        // Count files that have been received completely
        if ($page_data["received"] == true)
        {
          $files_received++;
        }

        // If traffic-limit is reached -> stop crawling
        if ($page_data["traffic_limit_reached"] == true)
        {
          $stop_crawling = true;
        }

        // Check if the page-limit is reached, if set
        // (and check WHICH page-limit was set)
        if ($this->page_limit_all > 0)
        {
          if ($this->page_limit_count_ct_only == true && $files_received >= $this->page_limit_all)
          {
            $stop_crawling = true;
          }
          elseif ($this->page_limit_count_ct_only == false && $links_followed >= $this->page_limit_all)
          {
            $stop_crawling = true;
          }
        }

        // Add the actual referer to the page_data-array for the handlePageData()-method
        $page_data["refering_linktext"] =& $this->urls_to_crawl[$pri_level][$key]["linktext"];
        $page_data["refering_link_raw"] =& $this->urls_to_crawl[$pri_level][$key]["link_raw"];
        $page_data["refering_linkcode"] =& $this->urls_to_crawl[$pri_level][$key]["linkcode"];

        // Build new absolute URLs from found links
        $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url);

        // Call the overridable user-function here, but first
        // "save" the found links from user-manipulation
        $links_found = $page_data["links_found"];
        $user_return = $this->handlePageData($page_data);

        // Stop crawling if the user returned a negative value
        if ($user_return < 0)
        {
          $stop_crawling = true;
          $page_data["user_abort"] = true;
        }

        // Compare the found links with the link-priorities set by the user
        // and add the priority-level to our array $links_found
        if ($this->benchmark == true)
        {
          $bm_start = $this->getmicrotime();
        }

        PHPCrawlerUtils::addURLPriorities($links_found, $this->link_priorities);

        if ($this->benchmark == true)
        {
          echo "addUrlPriorities(): " . ($this->getmicrotime() - $bm_start) . "<br>";
        }

        // Here we can delete the tmp-file maybe created by the pageRequest-object
        if (file_exists($this->pageRequest->tmp_file))
        {
          @unlink($this->pageRequest->tmp_file);
        }

        // Stop everything if a limit was reached
        if (isset($stop_crawling))
        {
          break;
        }

        // Remove links to other hosts if follow_mode is 2 or 3
        if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3)
        {
          PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
        }

        // Remove links to other domains if follow_mode=1
        if ($this->general_follow_mode == 1)
        {
          PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
        }

        // Remove "pathUp"-links if follow_mode=3
        // (e.g. base-site: www.foo.com/bar/index.htm -> don't follow: www.foo.com/anotherbar/xyz)
        if ($this->general_follow_mode == 3)
        {
          PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl);
        }

        // If given, don't follow "not matching"-links
        // (don't follow given preg_matches)
        if (count($this->not_follow_matches) > 0)
        {
          PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches);
        }

        // If given, just follow "matching"-links
        // (only follow given preg_matches)
        if (count($this->follow_matches) > 0)
        {
          $links_found =& PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches);
        }

        // Add found and filtered links to the main-array urls_to_crawl
        if ($this->benchmark == true)
        {
          $bm_start = $this->getmicrotime();
        }

        PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);

        if ($this->benchmark == true)
        {
          echo "addToArray(): " . ($this->getmicrotime() - $bm_start) . "<br>";
        }

        // If there wasn't any content found so far (code 200) and there's
        // a redirect-location
        // -> follow it, no matter which follow-mode was chosen!
        // (put it into the main-array!)
        if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true)
        {
          $rd[0]["url_rebuild"] = PHPCrawlerUtils::buildURL($redirect, $actual_url);
          $rd[0]["priority_level"] = 0;
          PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
        }

        // Now we remove the actual URL from the priority-array
        unset($this->urls_to_crawl[$pri_level][$key]);

        // Now we check if a priority-array with a higher priority contains URLs
        // and if so, stop processing this pri-array and "switch" to the higher one
        for ($pri_level_check = $this->max_priority_level + 1; $pri_level_check > $pri_level; $pri_level_check--)
        {
          if (isset($this->urls_to_crawl[$pri_level_check]) && $pri_level_check > $pri_level)
          {
            $stop_crawling_this_level = true;
          }
        }

        // Stop crawling this level
        if ($stop_crawling_this_level == true)
        {
          $pri_level = $this->max_priority_level + 1;
          break;
        }

        // Unset crawled URL, not needed anymore
        unset($this->urls_to_crawl[$pri_level][$key]);

        // echo "All:".($this->getmicrotime()-$all_start);
      } // end of loop over priority-array

      // If a priority_level was crawled completely -> unset the whole array
      if ($stop_crawling_this_level == false)
      {
        unset($this->urls_to_crawl[$pri_level]);
      }
    } // end if priority-level exists
  } // end of main loop

  // Loop stopped here, build report-array (status_return)
  $this->status_return["links_followed"] = $links_followed;
  $this->status_return["files_received"] = $files_received;
  $this->status_return["bytes_received"] = $this->pageRequest->traffic_all;
  $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"];

  if (isset($page_data["file_limit_reached"]))
  {
    $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"];
  }
  else
  {
    $this->status_return["file_limit_reached"] = false;
  }

  if (isset($page_data["user_abort"]))
  {
    $this->status_return["user_abort"] = $page_data["user_abort"];
  }
  else
  {
    $this->status_return["user_abort"] = false;
  }

  if (isset($stop_crawling))
  {
    $this->status_return["limit_reached"] = true;
  }
  else
  {
    $this->status_return["limit_reached"] = false;
  }

  // Process-time
  $this->status_return["process_runtime"] = $this->getMicroTime() - $starting_time;

  // Average bandwidth / throughput
  $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]);

  if ($this->firstCrawl)
  {
    $query = "UPDATE tests SET status = 'Finished Crawling!' WHERE id = {$this->testId};";

    if (connectToDb($db))
    {
      $db->query($query);

      $duration = $this->status_return["process_runtime"];
      $query = "UPDATE tests SET duration = {$duration} WHERE id = {$this->testId};";
      $db->query($query);
    }
  }
}
function receivePage($url_to_crawl, $referer_url)
{
  // Check if a tmp-file was set by the user, otherwise set a default one
  if ($this->tmp_file == "")
  {
    $this->tmp_file = uniqid(time()) . ".tmp";
  }

  // Define some vars
  $source_read = "";
  $bytes_received = 0;
  $stream_to_memory = false;
  $stream_to_file = false;

  // Split the URL to crawl into its elements (host, path, port and stuff)
  $url_parts = PHPCrawlerUtils::splitURL($url_to_crawl);

  $protocol = $url_parts["protocol"];
  $host = $url_parts["host"];
  $path = $url_parts["path"];
  $query = $url_parts["query"];
  $file = $url_parts["file"];
  $port = $url_parts["port"];

  // If the host was already visited so far
  // -> get the IP from our host-ip-array, otherwise
  // get the IP and add the entry to the array.
  if (isset($this->host_ip_table[$host]))
  {
    $ip = $this->host_ip_table[$host];
  }
  else
  {
    $ip = $this->host_ip_table[$host] = gethostbyname($host);

    // Host switched and wasn't "visited" before,
    // so read the robots.txt-file for this new host (if wanted)
    if ($this->use_robots_txt_files == true)
    {
      $this->robotsTxtHandler->processRobotsTxt($protocol, $host, $port, $this->user_agent_string);
    }
  }

  // Is this URL allowed to be requested by the robots.txt-file of this host?
  $url_disallowed = false;
  if ($this->use_robots_txt_files == true)
  {
    $host_url = $protocol . $host . ":" . $port;
    $url_disallowed = $this->robotsTxtHandler->checkIfUrlDisallowed($url_to_crawl, $host_url);
  }

  // Check the protocol (http or https) and build the
  // host-string for fsockopen
  if ($protocol == "https://")
  {
    $host_str = "ssl://" . $ip;
  }
  else
  {
    $host_str = $ip; // normal connect
  }

  // Check if an authentication should be sent
  $authentication = PHPCrawlerUtils::getAuthenticationForURL($this->basic_authentications, $url_to_crawl);

  // Error-codes
  // 0 - couldn't connect to server / page within timeout-time
  // 1 - stopped reading from socket, read-timeout reached BEFORE EOF()

  // Open socket-connection
  if ($url_disallowed == false)
  {
    $s = @fsockopen($host_str, $port, $e, $t, $this->socket_mean_timeout);
  }
  else
  {
    return false; // Return false if the URL was completely ignored
  }

  if ($s == false)
  {
    $error_string = $t;
    $error_code = $e;

    if ($t == "" && $e == "")
    {
      $error_code = 0;
      $error_string = "Couldn't connect to server";
    }
  }
  else
  {
    $header_found = false; // will get true if the header of the page was extracted

    // Build header to send
    $headerlines_to_send[] = "GET " . $path . $file . $query . " HTTP/1.0\r\n";
    $headerlines_to_send[] = "HOST: " . $host . "\r\n";

    // Referer
    if ($referer_url != "")
    {
      $headerlines_to_send[] = "Referer: {$referer_url}\r\n";
    }

    // Cookies
    if ($this->handle_cookies == true)
    {
      $cookie_string = PHPCrawlerUtils::buildHeaderCookieString($this->cookies, $host);
    }

    if (isset($cookie_string))
    {
      $headerlines_to_send[] = "Cookie: " . $cookie_string . "\r\n";
    }

    // Authentication
    if (count($authentication) > 0)
    {
      $auth_string = base64_encode($authentication["username"] . ":" . $authentication["password"]);
      $headerlines_to_send[] = "Authorization: Basic " . $auth_string . "\r\n";
    }

    // Rest of header
    $headerlines_to_send[] = "User-Agent: " . str_replace("\n", "", $this->user_agent_string) . "\r\n";
    $headerlines_to_send[] = "Connection: close\r\n";
    $headerlines_to_send[] = "\r\n";

    // Now send the header
    for ($x = 0; $x < count($headerlines_to_send); $x++)
    {
      // Send header-line
      fputs($s, $headerlines_to_send[$x]);

      // Put together lines to $header_send
      if (isset($header_send))
      {
        $header_send .= $headerlines_to_send[$x];
      }
      else
      {
        $header_send = $headerlines_to_send[$x];
      }
    }

    unset($headerlines_to_send);

    $status = socket_get_status($s);

    // Now read from socket
    // UNTIL timeout reached OR eof() OR content-type shouldn't be followed
    // OR traffic-limit reached or ...
    while (!isset($stop))
    {
      socket_set_timeout($s, $this->socket_read_timeout);

      // Read from socket.
      // The @ is to avoid the strange "SSL fatal protocol error"-warning that
      // appears in some environments without any reason
      $line_read = @fgets($s, 1024);

      $source_read .= $line_read; // do this anyway

      // If we want the content in a tmp-file -> write line to TMP-file
      if ($header_found == true && $stream_to_file == true && $line_read)
      {
        unset($check);
        $check = @fwrite($fp, $line_read);
        if ($check == false)
        {
          $error_code = "2000";
          $error_string = "Couldn't write to TMP-file " . $this->tmp_file;
        }
      }

      // Count bytes of the content (not the header)
      if ($header_found == true)
      {
        $bytes_received = $bytes_received + strlen($line_read);
      }

      // Check for traffic-limit and stop receiving if reached
      if ($this->traffic_limit_complete_page == false && $this->traffic_limit_all > 0)
      {
        if (strlen($source_read) + $this->traffic_all > $this->traffic_limit_all)
        {
          $stop = true;
          $received_completly = false;
          $page_data["traffic_limit_reached"] = true;
        }
      }

      // Check for pagesize-limit
      if ($header_found == true && $bytes_received > $this->pagesize_limit && $this->pagesize_limit > 0)
      {
        $stop = true;
        $received_completly = false;
      }

      // "Cut" the header into the separate var $header and handle it
      if ($header_found == false && substr($source_read, -4, 4) == "\r\n\r\n")
      {
        $header = substr($source_read, 0, strlen($source_read) - 2);
        $actual_content_type = PHPCrawlerUtils::getHeaderTag("content-type", $header);
        $source_read = "";
        $header_found = true;

        // Get the http-status-code
        $http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header);

        // Should this content-type be streamed into memory (true/false)?
        $stream_to_memory = PHPCrawlerUtils::decideStreamToMemory($header, $this->receive_to_memory_matches);

        // Should this content-type be streamed into a tmp-file (true/false)?
        $stream_to_file = PHPCrawlerUtils::decideStreamToTmpFile($header, $this->receive_to_file_matches);

        // Yes? Then open the TMP-file for the stream
        if ($stream_to_file == true)
        {
          $fp = @fopen($this->tmp_file, "w");
          if ($fp == false)
          {
            $error_code = "2000";
            $error_string = "Couldn't open TMP-file " . $this->tmp_file;
          }
        }

        // Header found here -> check if source should be followed (content-type)
        $follow = PHPCrawlerUtils::decideFollow($header, $this->follow_content_type);

        // No? Then stop with this page!
        if ($follow == false)
        {
          $stop = true;
        }
        else
        {
          $received_completly = true; // just init, may switch later on!
        }

        // Check if a cookie was sent with the header and store it
        // (if wanted)
        if ($this->handle_cookies == true)
        {
          PHPCrawlerUtils::getCookieData($header, $this->cookies, $host);
        }
      } // end cut and handle header

      // Get status of socket to check timeout and EOF
      $status = socket_get_status($s);

      // Now, if the source-buffer is filled or EOF is reached
      // -> look for links in the buffer, put the found links into
      // array $links_found_in_page and then empty the buffer BUT
      // COPY THE LAST FEW BYTES of the old buffer into the new one!
      // This has to be done because of links that span more than a single line!
      // And yes, it only makes sense if we don't want the whole content
      // in memory anyway AND if the content-type is text/html!
      if ($header_found == true && $stream_to_memory == false)
      {
        if (strlen($source_read) >= 100000 || $status["eof"] == true)
        {
          if (preg_match("/text\\/html/ i", $actual_content_type))
          {
            $links_found_in_buffer = PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
            $source_read = substr($source_read, -1500);
          }
        }
      }

      // Check timeout
      if ($status["timed_out"] == true)
      {
        $error_code = 1000; // socketstream timed out
        $error_string = "socketstream timed out";
        $stop = true;
        $received_completly = false;
      }

      // Check EOF
      if ($status["eof"] == true)
      {
        $stop = true;
      }
    }

    fclose($s); // close socket

    if (isset($fp) && $fp != false)
    {
      fclose($fp); // close tmp-file if used
    }
  }

  // echo "Get page:".($this->getmicrotime() - $start);

  // Now, HERE, if the whole content/source was received into memory,
  // we are looking for the links in the complete source (faster).
  // It only makes sense if the content-type is text/html!
  if ($stream_to_memory == true)
  {
    unset($links_found_in_page);

    if (preg_match("/text\\/html/ i", $actual_content_type))
    {
      // $start = $this->getmicrotime();
      PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
      // echo "Find links:".($this->getmicrotime() - $start);
    }
  }

  // Add the "referer_url" to the array-elements
  if (isset($links_found_in_page))
  {
    for ($x = 0; $x < count($links_found_in_page); $x++)
    {
      $links_found_in_page[$x]["referer_url"] = $url_to_crawl;
    }
  }

  // Page crawled,
  // return header, source, followed (true/false) and all we got here
  unset($page_data);

  if (isset($error_code))
  {
    $page_data["error_code"] = $error_code;
  }
  else
  {
    $page_data["error_code"] = false;
  }

  if (isset($error_string))
  {
    $page_data["error_string"] = $error_string;
  }
  else
  {
    $page_data["error_string"] = false;
  }

  if (isset($follow))
  {
    $page_data["received"] =& $follow;
  }
  else
  {
    $page_data["received"] = false;
  }

  if (isset($received_completly))
  {
    $page_data["received_completly"] =& $received_completly;
  }
  else
  {
    $page_data["received_completly"] = false;
  }

  // "completely" was misspelled as "completly" in previous versions,
  // so both keys are provided
  $page_data["received_completely"] =& $page_data["received_completly"];

  if (isset($bytes_received))
  {
    $page_data["bytes_received"] = $bytes_received;
  }
  else
  {
    $page_data["bytes_received"] = 0;
  }

  if (isset($header))
  {
    $page_data["header"] =& $header;
  }
  else
  {
    $page_data["header"] = false;
  }

  if (isset($http_status_code))
  {
    $page_data["http_status_code"] =& $http_status_code;
  }
  else
  {
    $page_data["http_status_code"] = false;
  }

  if (isset($actual_content_type))
  {
    $page_data["content_type"] = $actual_content_type;
  }
  else
  {
    $page_data["content_type"] = false;
  }

  // TMP-file infos and so on
  $page_data["content_tmp_file"] = $page_data["received_to_file"] = false;
  $page_data["source"] = $page_data["content"] = $page_data["received_to_memory"] = false;

  if (isset($page_data["received"]))
  {
    if ($stream_to_file == true)
    {
      $page_data["content_tmp_file"] = $this->tmp_file;
      $page_data["received_to_file"] = true;
    }

    if ($stream_to_memory == true)
    {
      $page_data["source"] =& $source_read;
      $page_data["content"] =& $source_read;
      $page_data["received_to_memory"] = true;
    }
  }

  // Additional infos for the override-function handlePageData()
  $page_data["protocol"] = $protocol;
  $page_data["port"] = $port;
  $page_data["host"] = $host;
  $page_data["path"] = $path;
  $page_data["file"] = $file;
  $page_data["query"] = $query;
  $page_data["header_send"] =& $header_send;
  $page_data["referer_url"] = $referer_url;

  // "Normalized" URL and referer-URL (e.g. without port if port is 80 and protocol is http)
  $page_data["url"] = $url_to_crawl;

  // All links found in this page
  $page_data["links_found"] =& $links_found_in_page;

  // Increase SUM of traffic this instance received altogether
  $this->traffic_all = $this->traffic_all + strlen($page_data["header"]) + $page_data["bytes_received"];

  // Set flag if traffic-limit is reached
  if ($this->traffic_all > $this->traffic_limit_all && $this->traffic_limit_all != 0)
  {
    $page_data["traffic_limit_reached"] = true;
  }

  if (!isset($page_data["traffic_limit_reached"]))
  {
    $page_data["traffic_limit_reached"] = false;
  }

  return $page_data;
}
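// Usage sketch for the returned $page_data-array (the "$pageRequest" instance and the URL
// are assumed for illustration; only keys that receivePage() actually sets are read here).
$page_data = $pageRequest->receivePage("http://www.example.com/index.html", "");

if ($page_data === false)
{
  // URL was ignored (e.g. disallowed by robots.txt)
}
elseif ($page_data["error_code"] !== false)
{
  echo "Request failed: " . $page_data["error_string"];
}
else
{
  echo $page_data["http_status_code"] . " " . $page_data["content_type"] . "\n";

  if (is_array($page_data["links_found"]))
  {
    echo count($page_data["links_found"]) . " links found\n"; // each entry carries a "referer_url"
  }
}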