Ejemplo n.º 1
0
 function receivePage($url_to_crawl, $referer_url)
 {
     // Check if tmp-file was set by the user, otherwise set a default one
     if ($this->tmp_file == "") {
         $this->tmp_file = uniqid(time()) . ".tmp";
     }
     // Define some vars
     $source_read = "";
     $bytes_received = 0;
     $stream_to_memory = false;
     $stream_to_file = false;
     // Split the url to crawl into its elements (host, path, port and stuff)
     $url_parts = PHPCrawlerUtils::splitURL($url_to_crawl);
     $protocol = $url_parts["protocol"];
     $host = $url_parts["host"];
     $path = $url_parts["path"];
     $query = $url_parts["query"];
     $file = $url_parts["file"];
     $port = $url_parts["port"];
     // If the host was already visited so far
     // -> get the ip from our host-ip-array, otherwise
     // get the IP and add the entry to the array.
     if (isset($this->host_ip_table[$host])) {
         $ip = $this->host_ip_table[$host];
     } else {
         $ip = $this->host_ip_table[$host] = gethostbyname($host);
         // Host switched and wasnt "visited" before.
         // So read the robots.txt-file for this new host (if wanted)
         if ($this->use_robots_txt_files == true) {
             $this->robotsTxtHandler->processRobotsTxt($protocol, $host, $port, $this->user_agent_string);
         }
     }
     // Is this URL allowed to be requested by the robots.txt-file of this host?
     $url_disallowed = false;
     if ($this->use_robots_txt_files == true) {
         $host_url = $protocol . $host . ":" . $port;
         $url_disallowed = $this->robotsTxtHandler->checkIfUrlDisallowed($url_to_crawl, $host_url);
     }
     // Check the protocol (http or https) and build the
     // host-string for fsockopen
     if ($protocol == "https://") {
         $host_str = "ssl://" . $ip;
     } else {
         $host_str = $ip;
     }
     // normal connect
     // Check if an authentication should be send
     $authentication = PHPCrawlerUtils::getAuthenticationForURL($this->basic_authentications, $url_to_crawl);
     // Error-codes
     // 0 - couldnt connect to server / page within timeout-time
     // 1 - stopped reading from socket, read-timeout reached BEFORE EOF()
     // Open socket-connection
     if ($url_disallowed == false) {
         $s = @fsockopen($host_str, $port, $e, $t, $this->socket_mean_timeout);
     } else {
         return false;
         // Return false if the URL was completely ignored
     }
     if ($s == false) {
         $error_string = $t;
         $error_code = $e;
         if ($t == "" && $e == "") {
             $error_code = 0;
             $error_string = "Couldn't connect to server";
         }
     } else {
         $header_found = false;
         // will get true if the header of the page was extracted
         // Build header to send
         $headerlines_to_send[] = "GET " . $path . $file . $query . " HTTP/1.0\r\n";
         $headerlines_to_send[] = "HOST: " . $host . "\r\n";
         // Referer
         if ($referer_url != "") {
             $headerlines_to_send[] = "Referer: {$referer_url}\r\n";
         }
         // Cookies
         if ($this->handle_cookies == true) {
             $cookie_string = PHPCrawlerUtils::buildHeaderCookieString($this->cookies, $host);
         }
         if (isset($cookie_string)) {
             $headerlines_to_send[] = "Cookie: " . $cookie_string . "\r\n";
         }
         // Authentication
         if (count($authentication) > 0) {
             $auth_string = base64_encode($authentication["username"] . ":" . $authentication["password"]);
             $headerlines_to_send[] = "Authorization: Basic " . $auth_string . "\r\n";
         }
         // Rest of header
         $headerlines_to_send[] = "User-Agent: " . str_replace("\n", "", $this->user_agent_string) . "\r\n";
         $headerlines_to_send[] = "Connection: close\r\n";
         $headerlines_to_send[] = "\r\n";
         // Now send the header
         for ($x = 0; $x < count($headerlines_to_send); $x++) {
             // Send header-line
             fputs($s, $headerlines_to_send[$x]);
             // Put together lines to $header_send
             if (isset($header_send)) {
                 $header_send .= $headerlines_to_send[$x];
             } else {
                 $header_send = $headerlines_to_send[$x];
             }
         }
         unset($header_lines);
         $status = socket_get_status($s);
         // Now read from socket
         // UNTIL timeout reached OR eof() OR content-type shouldnt be followed
         // OR traffic-limit reached or ...
         while (!isset($stop)) {
             socket_set_timeout($s, $this->socket_read_timeout);
             // Read from socket
             $line_read = @fgets($s, 1024);
             // The @ is to avoid the strange "SSL fatal protocol error"-warning that
             // appears in some environments without any reasons
             $source_read .= $line_read;
             // do this anyway
             // If we want the content in tmp-file -> write line to TMP-file
             if ($header_found == true && $stream_to_file == true && $line_read) {
                 unset($check);
                 $check = @fwrite($fp, $line_read);
                 if ($check == false) {
                     $error_code = "2000";
                     $error_string = "Couldn't write to TMP-file " . $this->tmp_file;
                 }
             }
             // Count bytes of the content (not the header)
             if ($header_found == true) {
                 $bytes_received = $bytes_received + strlen($line_read);
             }
             // Check for traffic limit and stop receiving if reached
             if ($this->traffic_limit_complete_page == false && $this->traffic_limit_all > 0) {
                 if (strlen($source_read) + $this->traffic_all > $this->traffic_limit_all) {
                     $stop = true;
                     $received_completly = false;
                     $page_data["traffic_limit_reached"] = true;
                 }
             }
             // Check for pagesize-limit
             if ($header_found == true && $bytes_received > $this->pagesize_limit && $this->pagesize_limit > 0) {
                 $stop = true;
                 $received_completly = false;
             }
             // "Cut" Header in seperate var $header and handle it
             if ($header_found == false && substr($source_read, -4, 4) == "\r\n\r\n") {
                 $header = substr($source_read, 0, strlen($source_read) - 2);
                 $actual_content_type = PHPCrawlerUtils::getHeaderTag("content-type", $header);
                 $source_read = "";
                 $header_found = true;
                 // Get the http-status-code
                 $http_status_code = PHPCrawlerUtils::getHTTPStatusCode($header);
                 // Should this content-type be streamed into memory (true/false) ?
                 $stream_to_memory = PHPCrawlerUtils::decideStreamToMemory($header, $this->receive_to_memory_matches);
                 // Should this content-type be streamed into tmp-file (true/false) ?
                 $stream_to_file = PHPCrawlerUtils::decideStreamToTmpFile($header, $this->receive_to_file_matches);
                 // No ? then open TMP-file for the stream
                 if ($stream_to_file == true) {
                     $fp = @fopen($this->tmp_file, "w");
                     if ($fp == false) {
                         $error_code = "2000";
                         $error_string = "Couldn't open TMP-file" . $this->tmp_file;
                     }
                 }
                 // Header found here -> check if source should be followed (content-type)
                 $follow = PHPCrawlerUtils::decideFollow($header, $this->follow_content_type);
                 // no ?? then stop with this page !
                 if ($follow == false) {
                     $stop = true;
                 } else {
                     $received_completly = true;
                     // just init, may switch later on !
                 }
                 // Check if a cookie was send with the header and store it
                 // (if wanted)
                 if ($this->handle_cookies == true) {
                     PHPCrawlerUtils::getCookieData($header, $this->cookies, $host);
                 }
             }
             // end cut and handle header
             // Get status of socket to check timeout and EOF
             $status = socket_get_status($s);
             // Now, if the source-buffer is filled or EOF is reached
             // -> look for links in the buffer, put the found links into
             // array $links_found_in_page and then empty the buffer BUT
             // COPY THE LAST FEW BYTES of the old buffer into the new one !
             // This has to be done because of links that take more than a single
             // line !
             // And yes, only makes sense if we dont want to have the whole content
             // in memory anyway AND if the content-type is text/html!
             if ($header_found == true && $stream_to_memory == false) {
                 if (strlen($source_read) >= 100000 || $status["eof"] == true) {
                     if (preg_match("/text\\/html/ i", $actual_content_type)) {
                         $links_found_in_buffer = PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
                         $source_read = substr($source_read, -1500);
                     }
                 }
             }
             // Check timeout
             if ($status["timed_out"] == true) {
                 $error_code = 1000;
                 // ahem..which int to give ??
                 $error_string = "socketstream timed out";
                 $stop = true;
                 $received_completly = false;
             }
             // Check eof
             if ($status["eof"] == true) {
                 $stop = true;
             }
         }
         fclose($s);
         // close socket
         if (isset($fp) && $fp != false) {
             fclose($fp);
         }
         // close tmp file if used
     }
     // echo "Get page:".($this->getmicrotime() - $start);
     // Now, HERE, if the whole content/source was received into memory,
     // we are looking for the links in the complete source (faster)
     // it only makes sense if content-type is text/html !
     if ($stream_to_memory == true) {
         unset($links_found_in_page);
         if (preg_match("/text\\/html/ i", $actual_content_type)) {
             // $start = $this->getmicrotime();
             PHPCrawlerUtils::findLinks($source_read, $links_found_in_page, $this->aggressive_link_extraction, $this->linktags_to_extract, $page_url_map);
             // echo "Find links:".($this->getmicrotime() - $start);
         }
     }
     // Add the "refering_url" to the array-elements
     if (isset($links_found_in_page)) {
         for ($x = 0; $x < count($links_found_in_page); $x++) {
             $links_found_in_page[$x]["referer_url"] = $url_to_crawl;
         }
     }
     // Page crawled,
     // return header, source, followed (true/false) and all we got here
     unset($page_data);
     if (isset($error_code)) {
         $page_data["error_code"] = $error_code;
     } else {
         $page_data["error_code"] = false;
     }
     if (isset($error_string)) {
         $page_data["error_string"] = $error_string;
     } else {
         $page_data["error_string"] = false;
     }
     if (isset($follow)) {
         $page_data["received"] =& $follow;
     } else {
         $page_data["received"] = false;
     }
     if (isset($received_completly)) {
         $page_data["received_completly"] =& $received_completly;
     } else {
         $page_data["received_completly"] = false;
     }
     $page_data["received_completely"] =& $page_data["received_completly"];
     // Wrote "completely" it wrong in prev. version,
     if (isset($bytes_received)) {
         $page_data["bytes_received"] = $bytes_received;
     } else {
         $page_data["bytes_received"] = 0;
     }
     if (isset($header)) {
         $page_data["header"] =& $header;
     } else {
         $page_data["header"] = false;
     }
     if (isset($http_status_code)) {
         $page_data["http_status_code"] =& $http_status_code;
     } else {
         $page_data["http_status_code"] = false;
     }
     if (isset($actual_content_type)) {
         $page_data["content_type"] = $actual_content_type;
     } else {
         $page_data["content_type"] = false;
     }
     // TMP-file infos and that
     $page_data["content_tmp_file"] = $page_data["received_to_file"] = false;
     $page_data["source"] = $page_data["content"] = $page_data["received_to_memory"] = false;
     if (isset($page_data["received"])) {
         if ($stream_to_file == true) {
             $page_data["content_tmp_file"] = $this->tmp_file;
             $page_data["received_to_file"] = true;
         }
         if ($stream_to_memory == true) {
             $page_data["source"] =& $source_read;
             $page_data["content"] =& $source_read;
             $page_data["received_to_memory"] = true;
         }
     }
     // Additional infos for the override-function handlePageData()
     $page_data["protocol"] = $protocol;
     $page_data["port"] = $port;
     $page_data["host"] = $host;
     $page_data["path"] = $path;
     $page_data["file"] = $file;
     $page_data["query"] = $query;
     $page_data["header_send"] =& $header_send;
     $page_data["referer_url"] = $referer_url;
     // "Normailzed" URL and referer-URL (f.e. without port if port is 80 and protocol is http)
     $page_data["url"] = $url_to_crawl;
     // All links found in this page
     $page_data["links_found"] =& $links_found_in_page;
     // Increase SUM of traffic alltogether this instance received
     $this->traffic_all = $this->traffic_all + strlen($page_data["header"]) + $page_data["bytes_received"];
     // Set flag if traffic-limit is reached
     if ($this->traffic_all > $this->traffic_limit_all && $this->traffic_limit_all != 0) {
         $page_data["traffic_limit_reached"] = true;
     }
     if (!isset($page_data["traffic_limit_reached"])) {
         $page_data["traffic_limit_reached"] = false;
     }
     return $page_data;
 }