Ejemplo n.º 1
0
 /**
  * Returns a report summarizing the crawling-process after it has finished.
  *
  * The report is assembled from the current crawler-status: the basic counters (links followed,
  * documents and bytes received), the process-runtime and data-throughput, the abort-reason together
  * with its convenience flags, the peak memory-usage and the averaged benchmark values.
  * Values that would require a division by zero are simply not set on the report.
  *
  * @return PHPCrawlerProcessReport PHPCrawlerProcessReport-object containing process-summary-information
  * @section 1 Basic settings
  */
 public function getProcessReport()
 {
     $status = $this->crawlerStatus;
     $summary = new PHPCrawlerProcessReport();

     // Basic counters taken over from the crawler-status
     $summary->links_followed = $status->links_followed;
     $summary->files_received = $status->documents_received;
     $summary->bytes_received = $status->bytes_received;

     // Runtime of the whole process and the resulting throughput (bytes per unit of runtime)
     $summary->process_runtime = PHPCrawlerBenchmark::getElapsedTime("crawling_process");
     if ($summary->process_runtime > 0) {
         $summary->data_throughput = $summary->bytes_received / $summary->process_runtime;
     }

     // Abort-reason and the flag belonging to it (a flag is only touched if its reason matches)
     $summary->abort_reason = $status->abort_reason;
     $flags_by_reason = array(
         array(PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED, "traffic_limit_reached"),
         array(PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED, "file_limit_reached"),
         array(PHPCrawlerAbortReasons::ABORTREASON_USERABORT, "user_abort"),
     );
     foreach ($flags_by_reason as $pair) {
         $reason = $pair[0];
         $flag_name = $pair[1];
         if ($status->abort_reason == $reason) {
             $summary->$flag_name = true;
         }
     }

     // Peak memory-usage (only if the function is available in the running PHP-build)
     if (function_exists("memory_get_peak_usage")) {
         $summary->memory_peak_usage = memory_get_peak_usage(true);
     }

     // Benchmark: average time needed to connect to a server
     $connects = $status->sum_server_connects;
     if ($connects > 0) {
         $summary->avg_server_connect_time = $status->sum_server_connect_time / $connects;
     }

     // Benchmark: average time a server needed to respond
     $responses = $status->sum_server_responses;
     if ($responses > 0) {
         $summary->avg_server_response_time = $status->sum_server_response_time / $responses;
     }

     // Benchmark: average data-transfer-rate (unbuffered bytes read per unit of transfer-time)
     $transfer_time = $status->sum_data_transfer_time;
     if ($transfer_time > 0) {
         $summary->avg_proc_data_transfer_rate = $status->unbuffered_bytes_read / $transfer_time;
     }

     return $summary;
 }
Ejemplo n.º 2
0
 /**
  * Reads the response-content.
  *
  * Chunks are fetched through readResponseContentChunk() until that method flags the document as completed.
  * Every chunk is either written to the temporary file or appended to the in-memory source. If the
  * content-type of the last response-header matches one of the link-search content-types, the buffered
  * portion of the source is passed to the LinkFinder whenever the buffer is filled or the document is completed.
  * The time spent in this method (excluding the link-search) is stored in $this->data_transfer_time.
  *
  * @param bool   $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
  *                               this method will not return the content as a string.
  *                               This argument has to be passed by every caller since required
  *                               by-reference parameters follow it (a default value could never be used).
  * @param int    &$error_code    Error-code by reference if an error occurred.
  * @param string &$error_string  Error-string by reference
  * @param bool   &$document_received_completely Flag indicating whether the content was received completely,
  *                               passed by reference. Set to FALSE if the temporary file couldn't be opened.
  *
  * @return string  The response-content/source. May be empty if an error occurred or data was streamed to the tmp-file.
  */
 protected function readResponseContent($stream_to_file, &$error_code, &$error_string, &$document_received_completely)
 {
     $this->content_bytes_received = 0;

     // If content should be streamed to file, open the temporary file first
     $fp = null;
     if ($stream_to_file == true) {
         $fp = @fopen($this->tmpFile, "w");
         if ($fp === false) {
             $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
             $error_string = "Couldn't open the temporary file " . $this->tmpFile . " for writing.";
             // Nothing was read at all, so don't leave the by-reference flag unassigned for the caller
             $document_received_completely = false;
             return "";
         }
     }

     // Init
     $source_portion = "";   // Part of the source that wasn't searched for links yet
     $source_complete = "";  // Complete source (stays empty when streaming to file)
     $document_received_completely = true;
     $document_completed = false;
     $gzip_encoded_content = null; // NULL = not detected yet

     // Resume data-transfer-time benchmark
     PHPCrawlerBenchmark::start("data_transfer_time");

     while ($document_completed == false) {
         // Get chunk from content
         $content_chunk = $this->readResponseContentChunk($document_completed, $error_code, $error_string, $document_received_completely);
         $source_portion .= $content_chunk;

         // Check if content is gzip-encoded (check only first chunk)
         if ($gzip_encoded_content === null) {
             $gzip_encoded_content = (bool) PHPCrawlerEncodingUtils::isGzipEncoded($content_chunk);
         }

         // Stream to file or store source in memory
         if ($stream_to_file == true) {
             @fwrite($fp, $content_chunk);
         } else {
             $source_complete .= $content_chunk;
         }

         // Decode gzip-encoded content when done with document.
         // NOTE(review): when streaming to file, $source_complete is never filled, so an empty string
         // gets decoded here and no links are found in gzip-encoded content - confirm that this is intended.
         if ($document_completed == true && $gzip_encoded_content == true) {
             $source_complete = $source_portion = PHPCrawlerEncodingUtils::decodeGZipContent($source_complete);
         }

         // Find links in portion of the source: either the buffer of a plain (non-gzip, non-streamed)
         // document is filled, or the document is completed (then the remaining portion gets processed).
         $buffer_filled = ($gzip_encoded_content == false
             && $stream_to_file == false
             && strlen($source_portion) >= $this->content_buffer_size);

         if ($buffer_filled || $document_completed == true) {
             if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types)) {
                 // The link-search shouldn't count as data-transfer-time
                 PHPCrawlerBenchmark::stop("data_transfer_time");

                 $this->LinkFinder->findLinksInHTMLChunk($source_portion);

                 // Keep the tail of the portion so links spanning two portions can still be found
                 if ($this->source_overlap_size > 0) {
                     $source_portion = substr($source_portion, -$this->source_overlap_size);
                 } else {
                     $source_portion = "";
                 }

                 PHPCrawlerBenchmark::start("data_transfer_time");
             }
         }
     }

     if ($stream_to_file == true) {
         @fclose($fp);
     }

     // Stop data-transfer-time benchmark
     PHPCrawlerBenchmark::stop("data_transfer_time");
     $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");

     return $source_complete;
 }