/**
 * Decides whether a response should be streamed to a temporary file.
 *
 * Extracts the "content-type" tag from the given header and tests it against
 * every regular expression in $match_array.
 *
 * @param mixed $header      Header data handed to phpcrawlerutils::getHeaderTag()
 *                           (taken by reference, not modified here).
 * @param array $match_array PCRE patterns the content-type is tested against
 *                           (taken by reference, not modified here).
 *
 * @return bool true as soon as one pattern matches the content-type,
 *              false if none matches or no pattern list was given.
 */
function decideStreamToTmpFile(&$header, &$match_array)
 {
     // Get the content-type from header
     $content_type = phpcrawlerutils::getHeaderTag("content-type", $header);

     // No usable pattern list -> nothing can match.
     if (!is_array($match_array)) {
         return false;
     }

     // foreach replaces the former each()-loop: each() was deprecated in
     // PHP 7.2 and removed in PHP 8.0.
     foreach ($match_array as $pattern) {
         if (preg_match($pattern, $content_type)) {
             return true;
         }
     }

     return false;
 }
// Example #2
// 0
 /**
  * Runs the crawling process, starting at $this->url_to_crawl.
  *
  * URLs waiting to be crawled are kept in $this->urls_to_crawl, grouped by
  * priority level. The method repeatedly picks the highest non-empty level,
  * requests each URL through $this->pageRequest, passes the result to
  * $this->handlePageData(), filters the links that were found according to
  * the follow-settings and queues them again. Crawling ends when every level
  * is empty, when a traffic/page limit is reached or when handlePageData()
  * returns a negative value.
  *
  * Side effects visible in this method:
  *  - normalizes $this->url_to_crawl and sets base_path/base_host/base_domain
  *    (and base_port if it was empty),
  *  - fills $this->status_return with the crawl report,
  *  - calls incrementHttpRequests() before every request if a DB handle exists,
  *  - if $this->firstCrawl is set, updates status and duration in table "tests".
  *
  * @return void
  */
 function go()
 {
     // Pre-set so that the "if ($db)" check further down is well defined even
     // if connectToDb() does not hand a connection back through its argument.
     $db = null;
     connectToDb($db);
     $starting_time = $this->getmicrotime();
     // Init, split given URL into host, port, path, file and so on.
     $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl);
     // Set base-host and base-path "global" for this class,
     // they are needed very often.
     $this->base_path = $url_parts["path"];
     $this->base_host = $url_parts["host"];
     $this->base_domain = $url_parts["domain"];
     // If the base port wasn't set by the user ->
     // take the one from the given start-URL.
     if ($this->base_port == "") {
         $this->base_port = $url_parts["port"];
     }
     // From here on the base-port (user-given or taken from the URL) is used.
     $url_parts["port"] = $this->base_port;
     // Rebuild and normalize the start-URL
     $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts);
     $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl);
     // Init counters
     $links_followed = 0;
     $files_received = 0;
     // Put the first URL into the main-array (unless a not-follow rule removes it)
     $tmp[0]["url_rebuild"] = $this->url_to_crawl;
     PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches);
     if (isset($tmp[0]["url_rebuild"]) && $tmp[0]["url_rebuild"] != "") {
         PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
     }
     // MAIN-LOOP -------------------------------------------------------------------
     // It works like this:
     // The outer loop walks through all the "priority"-arrays, highest level
     // first, and checks whether any of them contains URLs.
     for ($pri_level = $this->max_priority_level + 1; $pri_level > -1; $pri_level--) {
         // Found a priority-array (and no limit has stopped the crawl yet)
         if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling)) {
             // Initialised here so the check after the loop is well defined
             // even when the priority-array turns out to be empty.
             $stop_crawling_this_level = false;
             // Now process all URLs in this priority-array.
             // each() was removed in PHP 8.0. A plain foreach would iterate
             // over a snapshot and miss URLs that addToArray() appends to this
             // very level while it is being processed, so the first remaining
             // key is fetched anew on every pass. Every pass that does not
             // break out of the loop removes its entry, so the loop terminates
             // and visits the entries in insertion order.
             while (!empty($this->urls_to_crawl[$pri_level])) {
                 reset($this->urls_to_crawl[$pri_level]);
                 $key = key($this->urls_to_crawl[$pri_level]);
                 $all_start = $this->getmicrotime();
                 $stop_crawling_this_level = false;
                 // Request URL (crawl())
                 unset($page_data);
                 if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"])) {
                     $this->urls_to_crawl[$pri_level][$key]["referer_url"] = "";
                 }
                 // Increment number of HTTP requests sent, the request itself follows next
                 if ($db) {
                     incrementHttpRequests($db, $this->testId);
                 }
                 $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"], $this->urls_to_crawl[$pri_level][$key]["referer_url"]);
                 // If the request-object just ignored the URL ->
                 // -> stop and remove URL from array
                 if ($page_data == false) {
                     unset($this->urls_to_crawl[$pri_level][$key]);
                     continue;
                 }
                 $links_followed++;
                 // Now $page_data["links_found"] contains all found links at this point.
                 // Check if a "<base href.."-tag is given in the source and extract
                 // the base URL.
                 // !! Doesn't have to be rebuilt because it can only be a fully
                 // qualified URL !!
                 $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]);
                 if ($base_url == "") {
                     // Plain copy (the original bound a reference here, which let
                     // the else-branch of a later pass write through it).
                     $actual_url = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                 } else {
                     $actual_url = $base_url;
                 }
                 // Set flag "content_found" if content was found (it is never
                 // reset, i.e. it means "content found so far")
                 if (isset($page_data["http_status_code"]) && $page_data["http_status_code"] == 200) {
                     $content_found = true;
                 }
                 // Check for a REDIRECT-header and, if wanted, put it into the array of found links
                 $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]);
                 if ($redirect && $this->follow_redirects == true) {
                     $tmp_array["link_raw"] = $redirect;
                     $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                     $page_data["links_found"][] = $tmp_array;
                 }
                 // Count files that have been received completely
                 if ($page_data["received"] == true) {
                     $files_received++;
                 }
                 // If traffic-limit is reached -> stop crawling
                 if ($page_data["traffic_limit_reached"] == true) {
                     $stop_crawling = true;
                 }
                 // Check if the page-limit is reached, if one is set
                 // (and check WHICH counter the limit applies to)
                 if ($this->page_limit_all > 0) {
                     if ($this->page_limit_count_ct_only == true && $files_received >= $this->page_limit_all) {
                         $stop_crawling = true;
                     } elseif ($this->page_limit_count_ct_only == false && $links_followed >= $this->page_limit_all) {
                         $stop_crawling = true;
                     }
                 }
                 // Add the actual referer to the page_data array for the handlePageData-method.
                 // Bound by reference: a missing key is created as null instead of
                 // raising an undefined-index notice.
                 $page_data["refering_linktext"] =& $this->urls_to_crawl[$pri_level][$key]["linktext"];
                 $page_data["refering_link_raw"] =& $this->urls_to_crawl[$pri_level][$key]["link_raw"];
                 $page_data["refering_linkcode"] =& $this->urls_to_crawl[$pri_level][$key]["linkcode"];
                 // Build new absolute URLs from found links
                 $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url);
                 // Call the overridable user-function here, but first
                 // "save" the found links from user-manipulation
                 $links_found = $page_data["links_found"];
                 $user_return = $this->handlePageData($page_data);
                 // Stop crawling if user returned a negative value
                 if ($user_return < 0) {
                     $stop_crawling = true;
                     $page_data["user_abort"] = true;
                 }
                 // Compare the found links with link-priorities set by the user
                 // and add the priority-level to our array $links_found
                 if ($this->benchmark == true) {
                     $bm_start = $this->getmicrotime();
                 }
                 PHPCrawlerUtils::addURLPriorities($links_found, $this->link_priorities);
                 if ($this->benchmark == true) {
                     echo "addUrlPriorities(): " . ($this->getmicrotime() - $bm_start) . "<br>";
                 }
                 // Here we can delete the tmp-file maybe created by the pageRequest-object
                 if (file_exists($this->pageRequest->tmp_file)) {
                     @unlink($this->pageRequest->tmp_file);
                 }
                 // Stop everything if a limit was reached. The outer loop does
                 // not enter any further level once $stop_crawling is set.
                 if (isset($stop_crawling)) {
                     break;
                 }
                 // Remove links to other hosts if follow_mode is 2 or 3
                 if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3) {
                     PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                 }
                 // Remove links to other domains if follow_mode=1
                 if ($this->general_follow_mode == 1) {
                     PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                 }
                 // Remove "pathUp"-links if follow_mode=3
                 // (e.g. base-site: www.foo.com/bar/index.htm -> don't follow: www.foo.com/anotherbar/xyz)
                 if ($this->general_follow_mode == 3) {
                     PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl);
                 }
                 // If given, don't follow "not matching"-links
                 // (don't follow given preg_matches)
                 if (count($this->not_follow_matches) > 0) {
                     PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches);
                 }
                 // If given, just follow "matching"-links
                 // (only follow given preg_matches)
                 if (count($this->follow_matches) > 0) {
                     $links_found = PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches);
                 }
                 // Add found and filtered links to the main-array urls_to_crawl
                 if ($this->benchmark == true) {
                     $bm_start = $this->getmicrotime();
                 }
                 PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                 if ($this->benchmark == true) {
                     echo "addToArray(): " . ($this->getmicrotime() - $bm_start) . "<br>";
                 }
                 // If there wasn't any content found so far (code 200) and there's
                 // a redirect location
                 // -> follow it, no matter which follow-mode was chosen!
                 // (put it into the main-array)
                 if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true) {
                     $rd[0]["url_rebuild"] = phpcrawlerutils::buildURL($redirect, $actual_url);
                     $rd[0]["priority_level"] = 0;
                     PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                 }
                 // Now remove the processed URL from the priority-array
                 unset($this->urls_to_crawl[$pri_level][$key]);
                 // Check if a priority-array with a higher priority contains
                 // URLs and if so, stop processing this array and "switch" to
                 // the higher one
                 for ($pri_level_check = $this->max_priority_level + 1; $pri_level_check > $pri_level; $pri_level_check--) {
                     if (isset($this->urls_to_crawl[$pri_level_check])) {
                         $stop_crawling_this_level = true;
                         break;
                     }
                 }
                 // Stop crawling this level. The outer loop's decrement then
                 // resumes the search at $this->max_priority_level.
                 if ($stop_crawling_this_level == true) {
                     $pri_level = $this->max_priority_level + 1;
                     break;
                 }
                 // echo "All:".($this->getmicrotime()-$all_start);
             }
             // end of loop over priority-array
             // If the loop was not left to switch levels -> unset the whole array
             if ($stop_crawling_this_level == false) {
                 unset($this->urls_to_crawl[$pri_level]);
             }
         }
         // end if priority-level exists
     }
     // end of main loop
     // Loop stopped here, build report-array (status_return)
     $this->status_return["links_followed"] = $links_followed;
     $this->status_return["files_received"] = $files_received;
     $this->status_return["bytes_received"] = $this->pageRequest->traffic_all;
     // $page_data is unset when nothing was requested and false when the last
     // URL was ignored by the request-object, so every flag gets a default.
     if (isset($page_data["traffic_limit_reached"])) {
         $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"];
     } else {
         $this->status_return["traffic_limit_reached"] = false;
     }
     if (isset($page_data["file_limit_reached"])) {
         $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"];
     } else {
         $this->status_return["file_limit_reached"] = false;
     }
     if (isset($page_data["user_abort"])) {
         $this->status_return["user_abort"] = $page_data["user_abort"];
     } else {
         $this->status_return["user_abort"] = false;
     }
     $this->status_return["limit_reached"] = isset($stop_crawling);
     // Process-time
     $this->status_return["process_runtime"] = $this->getmicrotime() - $starting_time;
     // Average bandwidth / throughput; guarded because a zero runtime would
     // be a division by zero (DivisionByZeroError as of PHP 8.0).
     if ($this->status_return["process_runtime"] > 0) {
         $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]);
     } else {
         $this->status_return["data_throughput"] = 0.0;
     }
     if ($this->firstCrawl) {
         // NOTE(review): testId and the duration are interpolated straight into
         // the SQL text - presumably testId is an internally generated integer;
         // confirm, or switch to a prepared statement.
         $query = "UPDATE tests SET status = 'Finished Crawling!' WHERE id = {$this->testId};";
         if (connectToDb($db)) {
             $db->query($query);
             $duration = $this->status_return["process_runtime"];
             $query = "UPDATE tests SET duration = {$duration} WHERE id = {$this->testId};";
             $db->query($query);
         }
     }
 }