PHP PHPCrawlerUtils::addURLPriorities 예제들

프로그래밍 언어: PHP

클래스/타입: PHPCrawlerUtils

메소드/함수: addURLPriorities

hotexamples.com에서의 예제들: 1

PHP PHPCrawlerUtils::addURLPriorities - 1개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 PHP의 PHPCrawlerUtils::addURLPriorities에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

splitURL(14)

checkRegexPattern(4)

normalizeURL(4)

getHeaderValue(3)

getRootUrl(2)

checkExpressionPattern(2)

getRedirectURLFromHeader(2)

checkStringAgainstRegexArray(2)

serializeToFile(2)

rmDir(2)

deserializeFromFile(2)

getHeaderTag(2)

getHTTPStatusCode(2)

buildHeaderCookieString(1)

isUTF8String(1)

isUrlEncodedString(1)

isValidUrlString(1)

buildURLFromParts(1)

normalizeUrl(1)

rebuildURL(1)

removeMatchingLinks(1)

isGzipEncoded(1)

removePathUpLinks(1)

removeURLsToOtherDomains(1)

removeURLsToOtherHosts(1)

sort2dArray(1)

removeNotMatchingLinks(1)

getRedirectLocation(1)

getURIContent(1)

getAuthenticationForURL(1)

buildURLFromLink(1)

decideFollow(1)

decideStreamToMemory(1)

decideStreamToTmpFile(1)

decodeGZipContent(1)

findLinks(1)

getBasePathFromTag(1)

getSystemTempDir(1)

getBaseUrlFromMetaTag(1)

getCookieData(1)

getCookiesFromHeader(1)

addURLPriorities(1)

getMetaTagAttributes(1)

buildURLs(1)

addToArray(1)

예제 #1

파일 보기

파일: phpcrawler.class.php 프로젝트: 3nj0y/webvulscan

 function go()
 {
     connectToDb($db);
     $starting_time = $this->getmicrotime();
     // Init, split given URL into host, port, path and file a.s.o.
     $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl);
     // Set base-host and base-path "global" for this class,
     // we need it very often (i guess at this point...)
     $this->base_path = $url_parts["path"];
     $this->base_host = $url_parts["host"];
     $this->base_domain = $url_parts["domain"];
     // If the base port wasnt set by the user ->
     // take the one from the given start-URL.
     if ($this->base_port == "") {
         $this->base_port = $url_parts["port"];
     }
     // if the base-port WAS set by the user
     $url_parts["port"] = $this->base_port;
     // Reset the base_url
     $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts);
     $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl);
     // Init counters
     $links_followed = 0;
     $files_received = 0;
     // Put the first url into our main-array
     $tmp[0]["url_rebuild"] = $this->url_to_crawl;
     PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches);
     if (isset($tmp[0]["url_rebuild"]) && $tmp[0]["url_rebuild"] != "") {
         PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
     }
     // MAIN-LOOP -------------------------------------------------------------------
     // It works like this:
     // The first loop looks through all the "Priority"-arrays and checks if any
     // of these arrays is filled with URLS.
     for ($pri_level = $this->max_priority_level + 1; $pri_level > -1; $pri_level--) {
         // Yep. Found a priority-array with at least one URL
         if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling)) {
             // Now "process" all URLS in this priroity-array
             @reset($this->urls_to_crawl[$pri_level]);
             while (list($key) = @each($this->urls_to_crawl[$pri_level])) {
                 $all_start = $this->getmicrotime();
                 $stop_crawling_this_level = false;
                 // init
                 // Request URL (crawl())
                 unset($page_data);
                 if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"])) {
                     $this->urls_to_crawl[$pri_level][$key]["referer_url"] = "";
                 }
                 if ($db) {
                     incrementHttpRequests($db, $this->testId);
                 }
                 //Increment number of HTTP requests sent as fsockopen is called next
                 $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"], $this->urls_to_crawl[$pri_level][$key]["referer_url"]);
                 // If the request-object just irnored the URL ->
                 // -> Stop and remove URL from Array
                 if ($page_data == false) {
                     unset($this->urls_to_crawl[$pri_level][$key]);
                     continue;
                 }
                 $links_followed++;
                 // Now $page_data["links_found"] contains all found links at this point
                 // Check if a "<base href.."-tag is given in the source and xtract
                 // the base URL
                 // !! Doesnt have to be rebuild cause it only can be a full
                 // qualified URL !!
                 $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]);
                 if ($base_url == "") {
                     $actual_url =& $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                 } else {
                     $actual_url = $base_url;
                 }
                 // Set flag "content_found" if..content was found
                 if (isset($page_data["http_status_code"]) && $page_data["http_status_code"] == 200) {
                     $content_found = true;
                 }
                 // Check for a REDIRECT-header and if wanted, put it into the array of found links
                 $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]);
                 if ($redirect && $this->follow_redirects == true) {
                     $tmp_array["link_raw"] = $redirect;
                     $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                     $page_data["links_found"][] = $tmp_array;
                 }
                 // Count files that have been received completly
                 if ($page_data["received"] == true) {
                     $files_received++;
                 }
                 // If traffic-limit is reached -> stop crawling
                 if ($page_data["traffic_limit_reached"] == true) {
                     $stop_crawling = true;
                 }
                 // Check if pagelimit is reached if set
                 // (and check WHICH page-limit was set)
                 if ($this->page_limit_all > 0) {
                     if ($this->page_limit_count_ct_only == true && $files_received >= $this->page_limit_all) {
                         $stop_crawling = true;
                     } elseif ($this->page_limit_count_ct_only == false && $links_followed >= $this->page_limit_all) {
                         $stop_crawling = true;
                     }
                 }
                 // Add the actual referer to the page_data array for the handlePageData-method
                 $page_data["refering_linktext"] =& $this->urls_to_crawl[$pri_level][$key]["linktext"];
                 $page_data["refering_link_raw"] =& $this->urls_to_crawl[$pri_level][$key]["link_raw"];
                 $page_data["refering_linkcode"] =& $this->urls_to_crawl[$pri_level][$key]["linkcode"];
                 // build new absolute URLs from found links
                 $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url);
                 // Call the overridable user-function here, but first
                 // "save" the found links from user-manipulation
                 $links_found = $page_data["links_found"];
                 $user_return = $this->handlePageData($page_data);
                 // Stop crawling if user returned a negative value
                 if ($user_return < 0) {
                     $stop_crawling = true;
                     $page_data["user_abort"] = true;
                 }
                 // Compare the found links with link-priorities set by the user
                 // and add the priority-level to our array $links_found
                 if ($this->benchmark == true) {
                     $bm_start = $this->getmicrotime();
                 }
                 PHPCrawlerUtils::addURLPriorities($links_found, $this->link_priorities);
                 if ($this->benchmark == true) {
                     echo "addUrlPriorities(): " . ($this->getmicrotime() - $bm_start) . "<br>";
                 }
                 // Here we can delete the tmp-file maybe created by the pageRequest-object
                 if (file_exists($this->pageRequest->tmp_file)) {
                     @unlink($this->pageRequest->tmp_file);
                 }
                 // Stop everything if a limit was reached
                 if (isset($stop_crawling)) {
                     break;
                     $pri_level = 1000;
                 }
                 // Remove links to other hosts if follow_mode is 2 or 3
                 if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3) {
                     PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                 }
                 // Remove links to other domains if follow_mode=1
                 if ($this->general_follow_mode == 1) {
                     PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                 }
                 // Remove "pathUp"-links if follow_mode=3
                 // (fe: base-site: www.foo.com/bar/index.htm -> dont follow: www.foo.com/anotherbar/xyz)
                 if ($this->general_follow_mode == 3) {
                     PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl);
                 }
                 // If given, dont follow "not matching"-links
                 // (dont follow given preg_matches)
                 if (count($this->not_follow_matches) > 0) {
                     PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches);
                 }
                 // If given, just follow "matching"-links
                 // (only follow given preg_matches)
                 if (count($this->follow_matches) > 0) {
                     $links_found =& PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches);
                 }
                 // Add found and filtered links to the main_array urls_to_crawl
                 if ($this->benchmark == true) {
                     $bm_start = $this->getmicrotime();
                 }
                 PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                 if ($this->benchmark == true) {
                     echo "addToArray(): " . ($this->getmicrotime() - $bm_start) . "<br>";
                 }
                 // If there is wasnt any content found so far (code 200) and theres
                 // a redirect location
                 // -> follow it, doesnt matter what follow-mode was choosen !
                 // (put it into the main-array !)
                 if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true) {
                     $rd[0]["url_rebuild"] = phpcrawlerutils::buildURL($redirect, $actual_url);
                     $rd[0]["priority_level"] = 0;
                     PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                 }
                 // Now we remove the actual URL from the priority-array
                 unset($this->urls_to_crawl[$pri_level][$key]);
                 // Now we check if a priority-array with a higher priority
                 // contains URLS and if so, stop processing this pri-array and "switch" to the higher
                 // one
                 for ($pri_level_check = $this->max_priority_level + 1; $pri_level_check > $pri_level; $pri_level_check--) {
                     if (isset($this->urls_to_crawl[$pri_level_check]) && $pri_level_check > $pri_level) {
                         $stop_crawling_this_level = true;
                     }
                 }
                 // Stop crawling this level
                 if ($stop_crawling_this_level == true) {
                     $pri_level = $this->max_priority_level + 1;
                     break;
                 }
                 // Unset crawled URL, not nedded anymore
                 unset($this->urls_to_crawl[$pri_level][$key]);
                 // echo "All:".($this->getmicrotime()-$all_start);
             }
             // end of loop over priority-array
             // If a priority_level was crawled completely -> unset the whole array
             if ($stop_crawling_this_level == false) {
                 unset($this->urls_to_crawl[$pri_level]);
             }
         }
         // end if priority-level exists
     }
     // end of main loop
     // Loop stopped here, build report-array (status_return)
     $this->status_return["links_followed"] = $links_followed;
     $this->status_return["files_received"] = $files_received;
     $this->status_return["bytes_received"] = $this->pageRequest->traffic_all;
     $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"];
     if (isset($page_data["file_limit_reached"])) {
         $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"];
     } else {
         $this->status_return["file_limit_reached"] = false;
     }
     if (isset($page_data["user_abort"])) {
         $this->status_return["user_abort"] = $page_data["user_abort"];
     } else {
         $this->status_return["user_abort"] = false;
     }
     if (isset($stop_crawling)) {
         $this->status_return["limit_reached"] = true;
     } else {
         $this->status_return["limit_reached"] = false;
     }
     // Process-time
     $this->status_return["process_runtime"] = $this->getMicroTime() - $starting_time;
     // Average bandwith / throughput
     $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]);
     if ($this->firstCrawl) {
         $query = "UPDATE tests SET status = 'Finished Crawling!' WHERE id = {$this->testId};";
         if (connectToDb($db)) {
             $db->query($query);
             $duration = $this->status_return["process_runtime"];
             $query = "UPDATE tests SET duration = {$duration} WHERE id = {$this->testId};";
             $db->query($query);
         }
     }
 }