コード例 #1
0
 /**
  * Reads the response-content chunk by chunk and passes it on to the link-finder.
  *
  * The content is either collected in memory and returned, or streamed directly to the
  * temporary file ($this->tmpFile). Gzip-encoded content is decoded once the document is complete.
  *
  * NOTE: The default value of $stream_to_file has no practical effect because required
  *       parameters follow it; callers always have to pass all four arguments.
  *
  * @param bool   $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
  *                               this method will not return the content as a string.
  * @param int    &$error_code    Error-code by reference if an error occurred.
  * @param string &$error_string  Error-string by reference.
  * @param bool   &$document_received_completely Flag indicating whether the content was received completely, passed by reference.
  *
  * @return string The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
  */
 protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely)
 {
     $this->content_bytes_received = 0;

     // If content should be streamed to file, the temporary file has to be writable
     $fp = null;
     if ($stream_to_file == true) {
         $fp = @fopen($this->tmpFile, "w");
         if ($fp === false) {
             $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
             $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
             // Nothing was read at all, so don't leave the flag in an undefined state
             $document_received_completely = false;
             return "";
         }
     }

     // Init
     $source_portion = "";          // Not yet link-searched part of the content
     $source_complete = "";         // Complete content (stays empty when streaming to file)
     $document_received_completely = true;
     $document_completed = false;
     $gzip_encoded_content = null;  // NULL until the first chunk was inspected

     // Resume data-transfer-time benchmark
     PHPCrawlerBenchmark::start("data_transfer_time");

     while ($document_completed == false) {
         // Get chunk from content
         $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
         $source_portion .= $content_chunk;

         // Check if content is gzip-encoded (check only first chunk)
         if ($gzip_encoded_content === null) {
             $gzip_encoded_content = (bool) PHPCrawlerUtils::isGzipEncoded($content_chunk);
         }

         // Stream to file or store source in memory
         if ($stream_to_file == true) {
             @fwrite($fp, $content_chunk);
         } else {
             $source_complete .= $content_chunk;
         }

         // Decode gzip-encoded content when done with document.
         // $source_portion still holds the complete raw document at this point in both modes, because
         // portion-wise link-finding (and the truncation that comes with it) is skipped for gzip-encoded
         // and for streamed content. Decoding $source_complete instead would decode an empty string
         // in stream-mode, so no links would ever be found in gzip-encoded streamed documents.
         if ($document_completed == true && $gzip_encoded_content == true) {
             $source_portion = PHPCrawlerUtils::decodeGZipContent($source_portion);
             if ($stream_to_file == false) {
                 $source_complete = $source_portion;
             }
         }

         // Find links in portion of the source: either a big enough plain in-memory portion
         // was collected, or the document is complete
         $portion_ready = ($gzip_encoded_content == false && $stream_to_file == false && strlen($source_portion) >= 200000);
         if ($portion_ready || $document_completed == true) {
             if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
                 // Link-finding must not count as data-transfer-time
                 PHPCrawlerBenchmark::stop("data_transfer_time");
                 $this->LinkFinder->findLinksInHTMLChunk($source_portion);
                 // Keep the tail so links spanning two portions are not missed
                 $source_portion = substr($source_portion, -1500);
                 PHPCrawlerBenchmark::start("data_transfer_time");
             }
         }
     }

     if ($stream_to_file == true) {
         @fclose($fp);
     }

     // Stop data-transfer-time benchmark
     PHPCrawlerBenchmark::stop("data_transfer_time");
     $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");

     return $source_complete;
 }
コード例 #2
0
ファイル: PHPCrawler.class.php プロジェクト: luoxun/SensysPHP
 /**
  * Returns summarizing report-information about the crawling-process after it has finished.
  *
  * The report is built from the current crawler-status ($this->crawlerStatus), the elapsed time of the
  * "crawling_process" benchmark and, where available, the peak memory-usage of the PHP-process.
  *
  * @return PHPCrawlerProcessReport PHPCrawlerProcessReport-object containing process-summary-information
  * @section 1 Basic settings
  */
 public function getProcessReport()
 {
     $status = $this->crawlerStatus;
     $report = new PHPCrawlerProcessReport();

     // Plain counters taken over from the crawler-status
     $report->links_followed = $status->links_followed;
     $report->files_received = $status->documents_received;
     $report->bytes_received = $status->bytes_received;

     // Runtime and throughput (throughput only if some runtime was measured, avoids division by zero)
     $report->process_runtime = PHPCrawlerBenchmark::getElapsedTime("crawling_process");
     if ($report->process_runtime > 0) {
         $report->data_throughput = $report->bytes_received / $report->process_runtime;
     }

     // Abort-reason and the report-flag that belongs to it; flags are only ever set to TRUE here
     $report->abort_reason = $status->abort_reason;
     $flags_by_reason = array(
         array(PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED, "traffic_limit_reached"),
         array(PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED, "file_limit_reached"),
         array(PHPCrawlerAbortReasons::ABORTREASON_USERABORT, "user_abort"),
     );
     foreach ($flags_by_reason as $pair) {
         if ($status->abort_reason == $pair[0]) {
             $report->{$pair[1]} = true;
         }
     }

     // Peak memory-usage (function is not available in every PHP-build)
     if (function_exists("memory_get_peak_usage")) {
         $report->memory_peak_usage = memory_get_peak_usage(true);
     }

     return $report;
 }
コード例 #3
0
 /**
  * Reads the response-content from the socket.
  *
  * The content is read in blocks of up to 1024 bytes until EOF, a socket-timeout, the content-length
  * stated in the response-header or the content-size-limit is reached. Received data is either
  * collected in memory and returned, or streamed directly to the temporary file ($this->tmpFile).
  * Portions of the content are handed to the link-finder while reading.
  *
  * NOTE: The default value of $stream_to_file has no practical effect because required
  *       parameters follow it; callers always have to pass all five arguments.
  *
  * @param bool   $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
  *                               this method will not return the content as a string.
  * @param int    &$error_code    Error-code by reference if an error occurred.
  * @param string &$error_string  Error-string by reference.
  * @param bool   &$document_received_completely Flag indicating whether the content was received completely, passed by reference.
  * @param int    &$bytes_received Number of bytes received, passed by reference.
  * @return string The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
  */
 protected function readResponseContent($stream_to_file = false, &$error_code, &$error_string, &$document_received_completely, &$bytes_received)
 {
     // Initialize the by-reference results first so they are defined on every return-path
     $bytes_received = 0;
     $document_received_completely = true;

     PHPCrawlerBenchmark::start("retreiving_content");
     PHPCrawlerBenchmark::start("data_transfer_time", true);

     // If content should be streamed to file
     $fp = null;
     if ($stream_to_file == true) {
         $fp = @fopen($this->tmpFile, "w");
         if ($fp === false) {
             $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
             $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
             $document_received_completely = false;

             // Finish the benchmarks the same way the regular return-path does,
             // otherwise they would be left running after this early return
             PHPCrawlerBenchmark::stop("retreiving_content");
             PHPCrawlerBenchmark::stop("data_transfer_time");
             $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
             PHPCrawlerBenchmark::reset("data_transfer_time");

             return "";
         }
     }

     // Init
     $source_portion = "";   // Not yet link-searched part of the content
     $source_complete = "";  // Complete content (stays empty when streaming to file)
     $stop_receiving = false;

     while ($stop_receiving == false) {
         stream_set_timeout($this->socket, $this->socketReadTimeout);

         // Read from socket.
         // The @ is there to suppress the pointless "SSL fatal protocol error" warning.
         $line_read = @fread($this->socket, 1024);

         // Check socket-status
         $status = stream_get_meta_data($this->socket);

         // Check for EOF
         if ($status["eof"] == true) {
             $stop_receiving = true;
         }

         if ($status["timed_out"] == true) {
             // Socket timed out, data of this read is discarded
             $stop_receiving = true;
             $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
             $error_string = "Socket-stream timed out (timeout set to " . $this->socketReadTimeout . " sec).";
             $document_received_completely = false;
         } else {
             $chunk_length = strlen($line_read);
             $source_portion .= $line_read;
             $bytes_received += $chunk_length;
             $this->global_traffic_count += $chunk_length;

             // Stream to file or store source in memory
             if ($stream_to_file == true) {
                 @fwrite($fp, $line_read);
             } else {
                 $source_complete .= $line_read;
             }
         }

         // Check if content-length stated in the header is reached
         if ($this->lastResponseHeader->content_length == $bytes_received) {
             $stop_receiving = true;
         }

         // Check if contentsize-limit is reached
         if ($this->content_size_limit > 0 && $this->content_size_limit <= $bytes_received) {
             $stop_receiving = true;
         }

         // Find links in portion of the source
         if (strlen($source_portion) >= 100000 || $stop_receiving == true) {
             if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
                 // Link-finding must not count as retrieving- or data-transfer-time
                 PHPCrawlerBenchmark::stop("retreiving_content");
                 PHPCrawlerBenchmark::stop("data_transfer_time");
                 $this->LinkFinder->findLinksInHTMLChunk($source_portion);
                 // Keep the tail so links spanning two portions are not missed
                 $source_portion = substr($source_portion, -1500);
                 PHPCrawlerBenchmark::start("retreiving_content");
                 PHPCrawlerBenchmark::start("data_transfer_time", true);
             }
         }
     }

     if ($stream_to_file == true) {
         @fclose($fp);
     }

     PHPCrawlerBenchmark::stop("retreiving_content");
     PHPCrawlerBenchmark::stop("data_transfer_time");
     $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
     PHPCrawlerBenchmark::reset("data_transfer_time");

     return $source_complete;
 }