PHP PHPCrawlerUtils::getAuthenticationForURL 예제들

프로그래밍 언어: PHP
클래스/타입: PHPCrawlerUtils
메소드/함수: getAuthenticationForURL
hotexamples.com에서의 예제들: 1
PHP PHPCrawlerUtils::getAuthenticationForURL - 1개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 PHP의 PHPCrawlerUtils::getAuthenticationForURL에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.
자주 사용되는 메소드들
보기 숨기기
splitURL(14)
checkRegexPattern(4)
normalizeURL(4)
getHeaderValue(3)
getRootUrl(2)
checkExpressionPattern(2)
getRedirectURLFromHeader(2)
checkStringAgainstRegexArray(2)
serializeToFile(2)
rmDir(2)
deserializeFromFile(2)
getHeaderTag(2)
getHTTPStatusCode(2)
buildHeaderCookieString(1)
isUTF8String(1)
isUrlEncodedString(1)
isValidUrlString(1)
buildURLFromParts(1)
normalizeUrl(1)
rebuildURL(1)
removeMatchingLinks(1)
isGzipEncoded(1)
removePathUpLinks(1)
removeURLsToOtherDomains(1)
removeURLsToOtherHosts(1)
sort2dArray(1)
removeNotMatchingLinks(1)
getRedirectLocation(1)
getURIContent(1)
getAuthenticationForURL(1)
buildURLFromLink(1)
decideFollow(1)
decideStreamToMemory(1)
decideStreamToTmpFile(1)
decodeGZipContent(1)
findLinks(1)
getBasePathFromTag(1)
getSystemTempDir(1)
getBaseUrlFromMetaTag(1)
getCookieData(1)
getCookiesFromHeader(1)
addURLPriorities(1)
getMetaTagAttributes(1)
buildURLs(1)
addToArray(1)
예제 #1
파일 보기
파일: phpcrawlerpagerequest.class.php 프로젝트: edusegzy/2tao
 function receivePage($url_to_crawl, $referer_url)
 {
     // Check if tmp-file was set by the user, otherwise set a default one
     if ($this->tmp_file == "") {
         $this->tmp_file = uniqid(time()) . ".tmp";
     }
     // Define some vars
     $source_read = "";
     $bytes_received = 0;
     $stream_to_memory = false;
     $stream_to_file = false;
     // Split the url to crawl into its elements (host, path, port and stuff)
     $url_parts = PHPCrawlerUtils::splitURL($url_to_crawl);
     $protocol = $url_parts["protocol"];
     $host = $url_parts["host"];
     $path = $url_parts["path"];
     $query = $url_parts["query"];
     $file = $url_parts["file"];
     $port = $url_parts["port"];
     // If the host was already visited so far
     // -> get the ip from our host-ip-array, otherwise
     // get the IP and add the entry to the array.
     if (isset($this->host_ip_table[$host])) {
         $ip = $this->host_ip_table[$host];
     } else {
         $ip = $this->host_ip_table[$host] = gethostbyname($host);
         // Host switched and wasnt "visited" before.
         // So read the robots.txt-file for this new host (if wanted)
         if ($this->use_robots_txt_files == true) {
             $this->robotsTxtHandler->processRobotsTxt($protocol, $host, $port, $this->user_agent_string);
         }
     }
     // Is this URL allowed to be requested by the robots.txt-file of this host?
     $url_disallowed = false;
     if ($this->use_robots_txt_files == true) {
         $host_url = $protocol . $host . ":" . $port;
         $url_disallowed = $this->robotsTxtHandler->checkIfUrlDisallowed($url_to_crawl, $host_url);
     }
     // Check the protocol (http or https) and build the
     // host-string for fsockopen
     if ($protocol == "https://") {
         $host_str = "ssl://" . $ip;
     } else {
         $host_str = $ip;
     }
     // normal connect
     // Check if an authentication should be send
     $authentication = PHPCrawlerUtils::getAuthenticationForURL($this->basic_authentications, $url_to_crawl);
     // Error-codes
     // 0 - couldnt connect to server / page within timeout-time
     // 1 - stopped reading from socket, read-timeout reached BEFORE EOF()
     // Open socket-connection
     if ($url_disallowed == false) {
         $s = @fsockopen($host_str, $port, $e, $t, $this->socket_mean_timeout);
     } else {
         return false;
         // Return false if the URL was completely ignored
     }
     if ($s == false) {
         $error_string = $t;
         $error_code = $e;
         if ($t == "" && $e == "") {
             $error_code = 0;
             $error_string = "Couldn't connect to server";
         }
     } else {
         $header_found = false;
         // will get true if the header of the page was extracted
         // Build header to send
         $headerlines_to_send[] = "GET " . $path . $file . $query . " HTTP/1.0\r\n";
         $headerlines_to_send[] = "HOST: " . $host . "\r\n";
         // Referer
         if ($referer_url != "") {
             $headerlines_to_send[] = "Referer: {$referer_url}\r\n";
         }
         // Cookies
         if ($this->handle_cookies == true) {
             $cookie_string = PHPCrawlerUtils::buildHeaderCookieString($this->cookies, $host);
         }
         if (isset($cookie_string)) {
             $headerlines_to_send[] = "Cookie: " . $cookie_string . "\r\n";
         }
         // Authentication
         if (count($authentication) > 0) {
             $auth_string = base64_encode($authentication["username"] . ":" . $authentication["password"]);
             $headerlines_to_send[] = "Authorization: Basic " . $auth_string . "\r\n";
         }
         // Rest of header
         $headerlines_to_send[] = "User-Agent: " . str_replace("\n", "", $this->user_agent_string) . "\r\n";
         $headerlines_to_send[] = "Connection: close\r\n";
         $headerlines_to_send[] = "\r\n";
         // Now send the header
         for ($x = 0; $x < count($headerlines_to_send); $x++) {
             // Send header-line
             fputs($s, $headerlines_to_send[$x]);
             // Put together lines to $header_send
             if (isset($header_send)) {
                 $header_send .= $headerlines_to_send[$x];
             } else {
                 $header_send = $headerlines_to_send[$x];
             }
         }
         unset($header_lines);
         $status = socket_get_status($s);
         // Now read from socket
         // UNTIL timeout reached OR eof() OR content-type shouldnt be followed
         // OR traffic-limit reached or ...
         while (!isset($stop)) {
             socket_set_timeout($s, $this->socket_read_timeout);
             // Read from socket
             $line_read = @fgets($s, 1024);
             // The @ is to avoid the strange "SSL fatal protocol error"-warning that
             // appears in some environments without any reasons
             $source_read .= $line_read;
             // do this anyway
             // If we want the content in tmp-file -> write line to TMP-file
             if ($header_found == true && $stream_to_file == true && $line_read) {
                 unset($check);
                 $check = @fwrite($fp, $line_read);
                 if ($check == false) {
                     $error_code = "2000";
                     $error_string = "Couldn't write to TMP-file " . $this->tmp_file;
                 }
             }
             // Count bytes of the content (not the header)
             if ($header_found == true) {
                 $bytes_received = $bytes_received + strlen($line_read);
             }
             // Check for traffic limit and stop receiving if reached
             if ($this->traffic_limit_complete_page == false && $this->traffic_limit_all > 0) {
                 if (strlen($source_read) + $this->traffic_all > $this->traffic_limit_all) {
                     $stop = true;
                     $received_completly = false;
                     $page_data["traffic_limit_reached"] = true;
                 }
             }
             // Check for pagesize-limit
             if ($header_found == true && $bytes_received > $this->pagesize_limit && $this->pagesize_limit > 0) {
                 $stop = true;
                 $received_completly = false;
             }
             // "Cut" Header in seperate var $header and handle it
             if ($header_found == false && substr($source_read, -4, 4) == "\r\n\r\n") {
                 $header = substr($source_read, 0, strlen($source_read) - 2);
                 $actual_content_type = PHPCrawlerUtils::getHeaderTag("content-type", $header);
                 $source_read = "";
                 $header_found = true;
                 // Get the http-status-code
                 $http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header);
                 // Should this content-type be streamed into memory (true/false) ?
                 $stream_to_memory = PHPCrawlerUtils::decideStreamToMemory($header, $this->receive_to_memory_matches);
                 // Should this content-type be streamed into tmp-file (true/false) ?
                 $stream_to_file = PHPCrawlerUtils::decideStreamToTmpFile($header, $this->receive_to_file_matches);
                 // No ? then open TMP-file for the stream
                 if ($stream_to_file == true) {
                     $fp = @fopen($this->tmp_file, "w");
                     if ($fp == false) {
                         $error_code = "2000";
                         $error_string = "Couldn't open TMP-file" . $this->tmp_file;
                     }
                 }
                 // Header found here -> check if source should be followed (content-type)
                 $follow = PHPCrawlerUtils::decideFollow($header, $this->follow_content_type);
                 // no ?? then stop with this page !
                 if ($follow == false) {
                     $stop = true;
                 } else {
                     $received_completly = true;
                     // just init, may switch later on !
                 }
                 // Check if a cookie was send with the header and store it
                 // (if wanted)
                 if ($this->handle_cookies == true) {
                     PHPCrawlerUtils::getCookieData($header, $this->cookies, $host);
                 }
             }
             // end cut and handle header
             // Get status of socket to check timeout and EOF
             $status = socket_get_status($s);
             // Now, if the source-buffer is filled or EOF is reached
             // -> look for links in the buffer, put the found links into
             // array $links_found_in_page and then empty the buffer BUT
             // COPY THE LAST FEW BYTES of the old buffer into the new one !
             // This has to be done because of links that take more than a single
             // line !
             // And yes, only makes sense if we dont want to have the whole content
             // in memory anyway AND if the content-type is text/html!
             if ($header_found == true && $stream_to_memory == false) {
                 if (strlen($source_read) >= 100000 || $status["eof"] == true) {
                     if (preg_match("/text\\/html/ i", $actual_content_type)) {
                         $links_found_in_buffer = PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
                         $source_read = substr($source_read, -1500);
                     }
                 }
             }
             // Check timeout
             if ($status["timed_out"] == true) {
                 $error_code = 1000;
                 // ahem..which int to give ??
                 $error_string = "socketstream timed out";
                 $stop = true;
                 $received_completly = false;
             }
             // Check eof
             if ($status["eof"] == true) {
                 $stop = true;
             }
         }
         fclose($s);
         // close socket
         if (isset($fp) && $fp != false) {
             fclose($fp);
         }
         // close tmp file if used
     }
     // echo "Get page:".($this->getmicrotime() - $start);
     // Now, HERE, if the whole content/source was received into memory,
     // we are looking for the links in the complete source (faster)
     // it only makes sense if content-type is text/html !
     if ($stream_to_memory == true) {
         unset($links_found_in_page);
         if (preg_match("/text\\/html/ i", $actual_content_type)) {
             // $start = $this->getmicrotime();
             PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
             // echo "Find links:".($this->getmicrotime() - $start);
         }
     }
     // Add the "refering_url" to the array-elements
     if (isset($links_found_in_page)) {
         for ($x = 0; $x < count($links_found_in_page); $x++) {
             $links_found_in_page[$x]["referer_url"] = $url_to_crawl;
         }
     }
     // Page crawled,
     // return header, source, followed (true/false) and all we got here
     unset($page_data);
     if (isset($error_code)) {
         $page_data["error_code"] = $error_code;
     } else {
         $page_data["error_code"] = false;
     }
     if (isset($error_string)) {
         $page_data["error_string"] = $error_string;
     } else {
         $page_data["error_string"] = false;
     }
     if (isset($follow)) {
         $page_data["received"] =& $follow;
     } else {
         $page_data["received"] = false;
     }
     if (isset($received_completly)) {
         $page_data["received_completly"] =& $received_completly;
     } else {
         $page_data["received_completly"] = false;
     }
     $page_data["received_completely"] =& $page_data["received_completly"];
     // Wrote "completely" it wrong in prev. version,
     if (isset($bytes_received)) {
         $page_data["bytes_received"] = $bytes_received;
     } else {
         $page_data["bytes_received"] = 0;
     }
     if (isset($header)) {
         $page_data["header"] =& $header;
     } else {
         $page_data["header"] = false;
     }
     if (isset($http_status_code)) {
         $page_data["http_status_code"] =& $http_status_code;
     } else {
         $page_data["http_status_code"] = false;
     }
     if (isset($actual_content_type)) {
         $page_data["content_type"] = $actual_content_type;
     } else {
         $page_data["content_type"] = false;
     }
     // TMP-file infos and that
     $page_data["content_tmp_file"] = $page_data["received_to_file"] = false;
     $page_data["source"] = $page_data["content"] = $page_data["received_to_memory"] = false;
     if (isset($page_data["received"])) {
         if ($stream_to_file == true) {
             $page_data["content_tmp_file"] = $this->tmp_file;
             $page_data["received_to_file"] = true;
         }
         if ($stream_to_memory == true) {
             $page_data["source"] =& $source_read;
             $page_data["content"] =& $source_read;
             $page_data["received_to_memory"] = true;
         }
     }
     // Additional infos for the override-function handlePageData()
     $page_data["protocol"] = $protocol;
     $page_data["port"] = $port;
     $page_data["host"] = $host;
     $page_data["path"] = $path;
     $page_data["file"] = $file;
     $page_data["query"] = $query;
     $page_data["header_send"] =& $header_send;
     $page_data["referer_url"] = $referer_url;
     // "Normailzed" URL and referer-URL (f.e. without port if port is 80 and protocol is http)
     $page_data["url"] = $url_to_crawl;
     // All links found in this page
     $page_data["links_found"] =& $links_found_in_page;
     // Increase SUM of traffic alltogether this instance received
     $this->traffic_all = $this->traffic_all + strlen($page_data["header"]) + $page_data["bytes_received"];
     // Set flag if traffic-limit is reached
     if ($this->traffic_all > $this->traffic_limit_all && $this->traffic_limit_all != 0) {
         $page_data["traffic_limit_reached"] = true;
     }
     if (!isset($page_data["traffic_limit_reached"])) {
         $page_data["traffic_limit_reached"] = false;
     }
     return $page_data;
 }