/**
 * Reads the response-content chunk by chunk, optionally streaming it to the temporary file,
 * and hands the received source to the link finder.
 *
 * NOTE: $stream_to_file declares a default, but because required parameters follow it the
 * default can never actually be used by a caller; the signature is kept as-is for compatibility.
 *
 * @param bool   $stream_to_file                If TRUE, the content is streamed directly to the temporary
 *                                              file and this method does not return the content as a string.
 * @param int    &$error_code                   Error-code by reference, set if an error occurred.
 * @param string &$error_string                 Error-string by reference, set if an error occurred.
 * @param bool   &$document_received_completely Flag by reference indicating whether the content was received
 *                                              completely. Set to FALSE if the temporary file can't be opened.
 *
 * @return string The response-content/source. Empty if the temporary file could not be opened
 *                or if the data was streamed to the tmp-file.
 */
protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely)
{
    $this->content_bytes_received = 0;

    // If content should be streamed to file, open the temporary file first
    $fp = null;
    if ($stream_to_file == true) {
        $fp = @fopen($this->tmpFile, "w");

        if ($fp == false) {
            $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
            $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";

            // Nothing was read at all, so don't leave the caller's flag undefined on this path.
            $document_received_completely = false;

            return "";
        }
    }

    // Init
    $source_portion = "";           // Buffer handed to the link finder
    $source_complete = "";          // Complete source (only filled when NOT streaming to file)
    $document_received_completely = true;
    $document_completed = false;
    $gzip_encoded_content = null;   // null = not yet determined (decided on the first chunk)

    // Resume data-transfer-time benchmark
    PHPCrawlerBenchmark::start("data_transfer_time");

    while ($document_completed == false) {
        // Get chunk from content. $document_completed is passed in and the loop relies on the
        // helper (defined outside this block) switching it to true once the document is done.
        $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
        $source_portion .= $content_chunk;

        // Check if content is gzip-encoded (check only first chunk)
        if ($gzip_encoded_content === null) {
            $gzip_encoded_content = (bool) PHPCrawlerUtils::isGzipEncoded($content_chunk);
        }

        // Stream to file or store source in memory.
        // Note that the raw chunks are written to the file, i.e. gzip content is stored undecoded.
        if ($stream_to_file == true) {
            @fwrite($fp, $content_chunk);
        } else {
            $source_complete .= $content_chunk;
        }

        // Decode gzip-encoded content when done with document.
        // For gzip content the intermediate link search below never runs, so $source_portion
        // still holds the entire body here, in both modes. Decoding $source_complete instead
        // would decode an empty string when streaming to file and throw the buffered body away
        // before the link finder sees it.
        if ($document_completed == true && $gzip_encoded_content == true) {
            $source_portion = PHPCrawlerUtils::decodeGZipContent($source_portion);

            // When streaming, the returned source stays empty as documented.
            if ($stream_to_file == false) {
                $source_complete = $source_portion;
            }
        }

        // Find links in portion of the source: either a large enough plain in-memory portion
        // has been buffered, or the document is done (then always search the remaining buffer).
        $portion_ready = ($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= 200000);

        if ($portion_ready || $document_completed == true) {
            if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
                // Link finding is not data transfer, so pause the benchmark meanwhile
                PHPCrawlerBenchmark::stop("data_transfer_time");

                $this->LinkFinder->findLinksInHTMLChunk($source_portion);

                // Keep only the tail of the buffer; presumably so that a link split across
                // two portions is not missed by the next search.
                $source_portion = substr($source_portion, -1500);

                PHPCrawlerBenchmark::start("data_transfer_time");
            }
        }
    }

    if ($stream_to_file == true) {
        @fclose($fp);
    }

    // Stop data-transfer-time benchmark
    PHPCrawlerBenchmark::stop("data_transfer_time");
    $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");

    return $source_complete;
}
/**
 * Returns summarizing report-information about the crawling-process after it has finished.
 *
 * The report is assembled from the current crawler-status, the "crawling_process" benchmark
 * and (if available) PHP's peak memory usage.
 *
 * @return PHPCrawlerProcessReport PHPCrawlerProcessReport-object containing process-summary-information
 * @section 1 Basic settings
 */
public function getProcessReport()
{
    $status = $this->crawlerStatus;
    $report = new PHPCrawlerProcessReport();

    // Plain counters taken over from the crawler-status
    $report->links_followed = $status->links_followed;
    $report->files_received = $status->documents_received;
    $report->bytes_received = $status->bytes_received;

    // Runtime and throughput (throughput is only set for a positive runtime, avoiding a division by zero)
    $report->process_runtime = PHPCrawlerBenchmark::getElapsedTime("crawling_process");
    if ($report->process_runtime > 0) {
        $report->data_throughput = $report->bytes_received / $report->process_runtime;
    }

    // Abort-reason plus the matching convenience flag(s) on the report
    $abort_reason = $status->abort_reason;
    $report->abort_reason = $abort_reason;

    $flag_for_reason = array(
        "traffic_limit_reached" => PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED,
        "file_limit_reached"    => PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED,
        "user_abort"            => PHPCrawlerAbortReasons::ABORTREASON_USERABORT,
    );

    foreach ($flag_for_reason as $flag_name => $reason) {
        // Each reason is checked independently; flags that don't match are left untouched
        if ($abort_reason == $reason) {
            $report->$flag_name = true;
        }
    }

    // Peak memory-usage (real size allocated from the system), if this PHP build provides it
    if (function_exists("memory_get_peak_usage")) {
        $report->memory_peak_usage = memory_get_peak_usage(true);
    }

    return $report;
}
/**
 * Reads the response-content from the socket, optionally streaming it to the temporary file,
 * and hands the received source to the link finder portion by portion.
 *
 * NOTE: $stream_to_file declares a default, but because required parameters follow it the
 * default can never actually be used by a caller; the signature is kept as-is for compatibility.
 *
 * @param bool   $stream_to_file                If TRUE, the content is streamed directly to the temporary
 *                                              file and this method does not return the content as a string.
 * @param int    &$error_code                   Error-code by reference, set if an error occurred.
 * @param string &$error_string                 Error-string by reference, set if an error occurred.
 * @param bool   &$document_received_completely Flag by reference indicating whether the content was received
 *                                              completely. FALSE after a socket timeout or if the temporary
 *                                              file can't be opened.
 * @param int    &$bytes_received               Number of bytes received, passed by reference.
 *
 * @return string The response-content/source. Empty if the temporary file could not be opened
 *                or if the data was streamed to the tmp-file.
 */
protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely, &$bytes_received)
{
    // Give the by-reference counter a defined value on every path, including the early return below.
    $bytes_received = 0;

    // If content should be streamed to file, open the temporary file first.
    // This happens BEFORE the benchmarks are started, so a failure here can't leave them running.
    $fp = null;
    if ($stream_to_file == true) {
        $fp = @fopen($this->tmpFile, "w");

        if ($fp == false) {
            $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
            $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";

            // Nothing was read at all, so don't leave the caller's flag undefined on this path.
            $document_received_completely = false;

            return "";
        }
    }

    PHPCrawlerBenchmark::start("retreiving_content");
    PHPCrawlerBenchmark::start("data_transfer_time", true);

    // Init
    $source_portion = "";    // Buffer handed to the link finder
    $source_complete = "";   // Complete source (only filled when NOT streaming to file)
    $document_received_completely = true;
    $stop_receving = false;

    while ($stop_receving == false) {
        stream_set_timeout($this->socket, $this->socketReadTimeout);

        // Read from socket.
        // The @ is there to suppress the pointless "SSL fatal protocol error" warning.
        $line_read = @fread($this->socket, 1024);

        // Check socket-status
        $status = stream_get_meta_data($this->socket);

        // Check for EOF
        if ($status["eof"] == true) {
            $stop_receving = true;
        }

        if ($status["timed_out"] == true) {
            // Socket timed out
            $stop_receving = true;
            $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
            $error_string = "Socket-stream timed out (timeout set to " . $this->socketReadTimeout . " sec).";
            $document_received_completely = false;
        } else {
            $length_read = strlen($line_read);

            $source_portion .= $line_read;
            $bytes_received += $length_read;
            $this->global_traffic_count += $length_read;

            // Stream to file or store source in memory
            if ($stream_to_file == true) {
                @fwrite($fp, $line_read);
            } else {
                $source_complete .= $line_read;
            }
        }

        // Check if content-length stated in the header is reached.
        // NOTE(review): this is a loose comparison; if content_length can be null when the header
        // is absent, it also equals 0 received bytes and stops the loop - confirm against the header class.
        if ($this->lastResponseHeader->content_length == $bytes_received) {
            $stop_receving = true;
        }

        // Check if contentsize-limit is reached (a limit of 0 or less disables the check)
        if ($this->content_size_limit > 0 && $this->content_size_limit <= $bytes_received) {
            $stop_receving = true;
        }

        // Find links in portion of the source, either when enough data is buffered or when done
        if (strlen($source_portion) >= 100000 || $stop_receving == true) {
            if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
                // Link finding is neither content retrieval nor data transfer, so pause both benchmarks
                PHPCrawlerBenchmark::stop("retreiving_content");
                PHPCrawlerBenchmark::stop("data_transfer_time");

                $this->LinkFinder->findLinksInHTMLChunk($source_portion);

                // Keep only the tail of the buffer; presumably so that a link split across
                // two portions is not missed by the next search.
                $source_portion = substr($source_portion, -1500);

                PHPCrawlerBenchmark::start("retreiving_content");
                PHPCrawlerBenchmark::start("data_transfer_time", true);
            }
        }
    }

    if ($stream_to_file == true) {
        @fclose($fp);
    }

    PHPCrawlerBenchmark::stop("retreiving_content");
    PHPCrawlerBenchmark::stop("data_transfer_time");

    // Publish the transfer time of this request and reset the benchmark for the next one
    $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
    PHPCrawlerBenchmark::reset("data_transfer_time");

    return $source_complete;
}