function go() { connectToDb($db); $starting_time = $this->getmicrotime(); // Init, split given URL into host, port, path and file a.s.o. $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl); // Set base-host and base-path "global" for this class, // we need it very often (i guess at this point...) $this->base_path = $url_parts["path"]; $this->base_host = $url_parts["host"]; $this->base_domain = $url_parts["domain"]; // If the base port wasnt set by the user -> // take the one from the given start-URL. if ($this->base_port == "") { $this->base_port = $url_parts["port"]; } // if the base-port WAS set by the user $url_parts["port"] = $this->base_port; // Reset the base_url $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts); $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl); // Init counters $links_followed = 0; $files_received = 0; // Put the first url into our main-array $tmp[0]["url_rebuild"] = $this->url_to_crawl; PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches); if (isset($tmp[0]["url_rebuild"]) && $tmp[0]["url_rebuild"] != "") { PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo); } // MAIN-LOOP ------------------------------------------------------------------- // It works like this: // The first loop looks through all the "Priority"-arrays and checks if any // of these arrays is filled with URLS. for ($pri_level = $this->max_priority_level + 1; $pri_level > -1; $pri_level--) { // Yep. Found a priority-array with at least one URL if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling)) { // Now "process" all URLS in this priroity-array @reset($this->urls_to_crawl[$pri_level]); while (list($key) = @each($this->urls_to_crawl[$pri_level])) { $all_start = $this->getmicrotime(); $stop_crawling_this_level = false; // init // Request URL (crawl()) unset($page_data); if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"])) { $this->urls_to_crawl[$pri_level][$key]["referer_url"] = ""; } if ($db) { incrementHttpRequests($db, $this->testId); } //Increment number of HTTP requests sent as fsockopen is called next $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"], $this->urls_to_crawl[$pri_level][$key]["referer_url"]); // If the request-object just irnored the URL -> // -> Stop and remove URL from Array if ($page_data == false) { unset($this->urls_to_crawl[$pri_level][$key]); continue; } $links_followed++; // Now $page_data["links_found"] contains all found links at this point // Check if a "<base href.."-tag is given in the source and xtract // the base URL // !! Doesnt have to be rebuild cause it only can be a full // qualified URL !! $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]); if ($base_url == "") { $actual_url =& $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]; } else { $actual_url = $base_url; } // Set flag "content_found" if..content was found if (isset($page_data["http_status_code"]) && $page_data["http_status_code"] == 200) { $content_found = true; } // Check for a REDIRECT-header and if wanted, put it into the array of found links $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]); if ($redirect && $this->follow_redirects == true) { $tmp_array["link_raw"] = $redirect; $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]; $page_data["links_found"][] = $tmp_array; } // Count files that have been received completly if ($page_data["received"] == true) { $files_received++; } // If traffic-limit is reached -> stop crawling if ($page_data["traffic_limit_reached"] == true) { $stop_crawling = true; } // Check if pagelimit is reached if set // (and check WHICH page-limit was set) if ($this->page_limit_all > 0) { if ($this->page_limit_count_ct_only == true && $files_received >= $this->page_limit_all) { $stop_crawling = true; } elseif ($this->page_limit_count_ct_only == false && $links_followed >= $this->page_limit_all) { $stop_crawling = true; } } // Add the actual referer to the page_data array for the handlePageData-method $page_data["refering_linktext"] =& $this->urls_to_crawl[$pri_level][$key]["linktext"]; $page_data["refering_link_raw"] =& $this->urls_to_crawl[$pri_level][$key]["link_raw"]; $page_data["refering_linkcode"] =& $this->urls_to_crawl[$pri_level][$key]["linkcode"]; // build new absolute URLs from found links $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url); // Call the overridable user-function here, but first // "save" the found links from user-manipulation $links_found = $page_data["links_found"]; $user_return = $this->handlePageData($page_data); // Stop crawling if user returned a negative value if ($user_return < 0) { $stop_crawling = true; $page_data["user_abort"] = true; } // Compare the found links with link-priorities set by the user // and add the priority-level to our array $links_found if ($this->benchmark == true) { $bm_start = $this->getmicrotime(); } PHPCrawlerUtils::addURLPriorities($links_found, $this->link_priorities); if ($this->benchmark == true) { echo "addUrlPriorities(): " . ($this->getmicrotime() - $bm_start) . "<br>"; } // Here we can delete the tmp-file maybe created by the pageRequest-object if (file_exists($this->pageRequest->tmp_file)) { @unlink($this->pageRequest->tmp_file); } // Stop everything if a limit was reached if (isset($stop_crawling)) { break; $pri_level = 1000; } // Remove links to other hosts if follow_mode is 2 or 3 if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3) { PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]); } // Remove links to other domains if follow_mode=1 if ($this->general_follow_mode == 1) { PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]); } // Remove "pathUp"-links if follow_mode=3 // (fe: base-site: www.foo.com/bar/index.htm -> dont follow: www.foo.com/anotherbar/xyz) if ($this->general_follow_mode == 3) { PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl); } // If given, dont follow "not matching"-links // (dont follow given preg_matches) if (count($this->not_follow_matches) > 0) { PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches); } // If given, just follow "matching"-links // (only follow given preg_matches) if (count($this->follow_matches) > 0) { $links_found =& PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches); } // Add found and filtered links to the main_array urls_to_crawl if ($this->benchmark == true) { $bm_start = $this->getmicrotime(); } PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo); if ($this->benchmark == true) { echo "addToArray(): " . ($this->getmicrotime() - $bm_start) . "<br>"; } // If there is wasnt any content found so far (code 200) and theres // a redirect location // -> follow it, doesnt matter what follow-mode was choosen ! // (put it into the main-array !) if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true) { $rd[0]["url_rebuild"] = phpcrawlerutils::buildURL($redirect, $actual_url); $rd[0]["priority_level"] = 0; PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo); } // Now we remove the actual URL from the priority-array unset($this->urls_to_crawl[$pri_level][$key]); // Now we check if a priority-array with a higher priority // contains URLS and if so, stop processing this pri-array and "switch" to the higher // one for ($pri_level_check = $this->max_priority_level + 1; $pri_level_check > $pri_level; $pri_level_check--) { if (isset($this->urls_to_crawl[$pri_level_check]) && $pri_level_check > $pri_level) { $stop_crawling_this_level = true; } } // Stop crawling this level if ($stop_crawling_this_level == true) { $pri_level = $this->max_priority_level + 1; break; } // Unset crawled URL, not nedded anymore unset($this->urls_to_crawl[$pri_level][$key]); // echo "All:".($this->getmicrotime()-$all_start); } // end of loop over priority-array // If a priority_level was crawled completely -> unset the whole array if ($stop_crawling_this_level == false) { unset($this->urls_to_crawl[$pri_level]); } } // end if priority-level exists } // end of main loop // Loop stopped here, build report-array (status_return) $this->status_return["links_followed"] = $links_followed; $this->status_return["files_received"] = $files_received; $this->status_return["bytes_received"] = $this->pageRequest->traffic_all; $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"]; if (isset($page_data["file_limit_reached"])) { $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"]; } else { $this->status_return["file_limit_reached"] = false; } if (isset($page_data["user_abort"])) { $this->status_return["user_abort"] = $page_data["user_abort"]; } else { $this->status_return["user_abort"] = false; } if (isset($stop_crawling)) { $this->status_return["limit_reached"] = true; } else { $this->status_return["limit_reached"] = false; } // Process-time $this->status_return["process_runtime"] = $this->getMicroTime() - $starting_time; // Average bandwith / throughput $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]); if ($this->firstCrawl) { $query = "UPDATE tests SET status = 'Finished Crawling!' WHERE id = {$this->testId};"; if (connectToDb($db)) { $db->query($query); $duration = $this->status_return["process_runtime"]; $query = "UPDATE tests SET duration = {$duration} WHERE id = {$this->testId};"; $db->query($query); } } }