예제 #1
0
 /**
  * Load the XML document into a DOM and return it. Errors are appended to
  * the $report parameter.
  *
  * The export may contain invalid UTF-8 byte sequences. If the first parse
  * fails with a message containing "Input is not proper UTF-8", the file is
  * streamed through iconv in 64 KiB blocks, invalid bytes are dropped, the
  * result is written next to the original as "<filename>-filtered.xml", and
  * that copy is parsed instead.
  *
  * Any other parse error is logged on the deposit, appended to $report, and
  * null is returned. Errors raised while parsing the filtered copy are not
  * caught here and propagate to the caller.
  *
  * NOTE(review): DOMDocument::load() reports parse problems as warnings, so
  * the catch below presumably relies on an error handler that converts
  * warnings to exceptions - confirm.
  *
  * @param Deposit $deposit  deposit that receives the error log entry
  * @param string  $filename path of the XML file to load
  * @param string  $report   report text, appended to by reference
  *
  * @return DOMDocument|null the parsed document, or null if the file could
  *                          not be parsed or filtered
  */
 private function loadXml(Deposit $deposit, $filename, &$report)
 {
     $dom = new DOMDocument();
     try {
         $dom->load($filename, LIBXML_COMPACT | LIBXML_PARSEHUGE);
     } catch (Exception $ex) {
         if (strpos($ex->getMessage(), 'Input is not proper UTF-8') === false) {
             $deposit->addErrorLog('XML file ' . basename($filename) . ' is not parseable: ' . $ex->getMessage());
             $report .= $ex->getMessage();
             $report .= "\nCannot validate XML.\n";

             return null;
         }
         // The XML files can be arbitrarily large, so stream them, filter
         // the stream, and write to disk. The result may not fit in memory.
         $filteredFilename = "{$filename}-filtered.xml";
         $changes = $this->filterInvalidUtf8($filename, $filteredFilename);
         if ($changes === null) {
             $deposit->addErrorLog('XML file ' . basename($filename) . ' contains invalid UTF-8 characters which could not be removed.');
             $report .= basename($filename) . " contains invalid UTF-8 characters which could not be removed.\n";
             $report .= "Cannot validate XML.\n";

             return null;
         }
         $report .= basename($filename) . " contains {$changes} invalid UTF-8 characters, which have been removed with " . ICONV_IMPL . ' version ' . ICONV_VERSION . ' in PHP ' . PHP_VERSION . "\n";
         $report .= basename($filteredFilename) . " will be validated.\n";
         $dom->load($filteredFilename, LIBXML_COMPACT | LIBXML_PARSEHUGE);
     }

     return $dom;
 }

 /**
  * Copy $source to $target in 64 KiB blocks, dropping bytes that are not
  * valid UTF-8. Both file handles are always closed before returning.
  *
  * @param string $source path of the file to read
  * @param string $target path of the filtered copy to write
  *
  * @return int|null number of bytes removed, or null if a file could not be
  *                  opened, read, converted or written
  */
 private function filterInvalidUtf8($source, $target)
 {
     $in = fopen($source, 'rb');
     if ($in === false) {
         return null;
     }
     $out = false;
     try {
         $out = fopen($target, 'wb');
         if ($out === false) {
             return null;
         }
         $blockSize = 64 * 1024;
         $changes = 0;
         $carry = '';
         do {
             $chunk = fread($in, $blockSize);
             if ($chunk === false) {
                 return null;
             }
             // An empty read is treated as end of input so the loop cannot spin.
             $atEnd = $chunk === '' || feof($in);
             $buffer = $carry . $chunk;
             $carry = '';
             // A block boundary can fall inside a multi-byte character. Hold
             // the incomplete tail back so it is converted together with the
             // bytes that complete it, instead of being treated as invalid.
             $tail = $this->incompleteUtf8TailLength($buffer);
             if ($tail > 0) {
                 if ($atEnd) {
                     // Truncated character at the very end: nothing will
                     // complete it, so it is dropped and counted.
                     $changes += $tail;
                 } else {
                     $carry = substr($buffer, -$tail);
                 }
                 $buffer = substr($buffer, 0, -$tail);
             }
             if ($buffer === '') {
                 continue;
             }
             $filtered = iconv('UTF-8', 'UTF-8//IGNORE', $buffer);
             if ($filtered === false) {
                 return null;
             }
             $changes += strlen($buffer) - strlen($filtered);
             if (fwrite($out, $filtered) === false) {
                 return null;
             }
         } while (!$atEnd);

         return $changes;
     } finally {
         fclose($in);
         if ($out !== false) {
             fclose($out);
         }
     }
 }

 /**
  * Count the bytes at the end of $buffer that begin a UTF-8 sequence which
  * is not finished within the buffer.
  *
  * @param string $buffer raw bytes
  *
  * @return int 0 if the buffer does not end in an unfinished sequence,
  *             otherwise the length (1-3) of the unfinished tail
  */
 private function incompleteUtf8TailLength($buffer)
 {
     $length = strlen($buffer);
     $limit = min(3, $length);
     for ($i = 1; $i <= $limit; $i++) {
         $byte = ord($buffer[$length - $i]);
         if (($byte & 0xC0) === 0x80) {
             // Continuation byte: keep looking backwards for its lead byte.
             continue;
         }
         if ($byte < 0xC0) {
             // ASCII byte: the buffer ends on a character boundary.
             return 0;
         }
         // Lead byte: work out how many bytes its sequence should have.
         $expected = 2;
         if ($byte >= 0xF0) {
             $expected = 4;
         } elseif ($byte >= 0xE0) {
             $expected = 3;
         }

         return $expected > $i ? $i : 0;
     }

     // No lead byte in the last three bytes: either a finished four-byte
     // character or stray continuation bytes, both left for iconv.
     return 0;
 }
예제 #2
0
 /**
  * Send an HTTP HEAD request to the deposit's URL and compare the reported
  * Content-Length with the size recorded on the deposit.
  *
  * A mismatch larger than self::FILE_SIZE_THRESHOLD (relative to the
  * reported size) is recorded on the deposit and logged as a warning; it
  * does not stop the harvest.
  *
  * @param Deposit $deposit deposit whose URL and expected size are checked
  *
  * @throws Exception        if the response status is not 200, or the
  *                          Content-Length header is missing, not numeric
  *                          or not positive
  * @throws RequestException rethrown after logging if the HEAD request fails
  */
 protected function checkSize(Deposit $deposit)
 {
     $client = $this->getClient();
     try {
         $head = $client->head($deposit->getUrl());
         if ($head->getStatusCode() !== 200) {
             throw new Exception("HTTP HEAD request cannot check file size: HTTP {$head->getStatusCode()} - {$head->getReasonPhrase()} - {$deposit->getUrl()}");
         }
         $size = $head->getHeader('Content-Length');
         if (is_array($size)) {
             // Some HTTP message implementations return header values as a
             // list; use the first value, or treat an empty list as missing.
             $size = count($size) > 0 ? reset($size) : null;
         }
         if ($size !== null) {
             $size = trim((string) $size);
         }
         if ($size === null || $size === '') {
             throw new Exception("HTTP HEAD response does not include file size - {$deposit->getUrl()}");
         }
         // A zero or non-numeric size would break the ratio below (division
         // by zero or a non-numeric operand), so reject it explicitly.
         if (!is_numeric($size) || $size <= 0) {
             throw new Exception("HTTP HEAD response reports an unusable file size '{$size}' - {$deposit->getUrl()}");
         }
         $reportedSize = $size + 0;
         // NOTE(review): presumably the deposit stores its size in units of
         // 1000 bytes - confirm against Deposit::getSize().
         $expectedSize = $deposit->getSize() * 1000;
         if (abs($expectedSize - $reportedSize) / $reportedSize > self::FILE_SIZE_THRESHOLD) {
             $deposit->addErrorLog("Expected file size {$expectedSize} is not close to reported size {$size}");
             $this->logger->warning("Harvest - {$deposit->getUrl()} - Expected file size {$expectedSize} is not close to reported size {$size}");
         }
     } catch (RequestException $e) {
         $response = $e->getResponse();
         if ($response !== null) {
             $this->logger->critical($response->getStatusCode() . ' ' . $response->getReasonPhrase());
         } else {
             $this->logger->critical($e->getMessage());
         }
         throw $e;
     }
 }
예제 #3
0
 /**
  * Parse an XML file into a DOM document, noting any problems in $report.
  *
  * When the file cannot be parsed because its input is not proper UTF-8,
  * the sibling file "<filename>-filtered.xml" is parsed instead. This
  * method does not create that file. Any other parse error is logged on
  * the deposit, appended to $report, and nothing is returned.
  *
  * @param Deposit $deposit  deposit that receives the error log entry
  * @param string  $filename path of the XML file to load
  * @param string  $report   report text, appended to by reference
  *
  * @return DOMDocument|null the parsed document, or null when the file is
  *                          not parseable
  */
 private function loadXml(Deposit $deposit, $filename, &$report)
 {
     $flags = LIBXML_COMPACT | LIBXML_PARSEHUGE;
     $document = new DOMDocument();
     try {
         $document->load($filename, $flags);
     } catch (Exception $error) {
         $message = $error->getMessage();
         $name = basename($filename);
         $badEncoding = strpos($message, 'Input is not proper UTF-8') !== false;
         if (!$badEncoding) {
             // Not an encoding problem: record it and give up on this file.
             $deposit->addErrorLog("XML file {$name} is not parseable, and cannot be scanned for viruses: {$message}");
             $report .= "{$message}\nCannot scan for viruses.\n";

             return null;
         }
         // Encoding problem: fall back to the filtered copy of the file.
         $filteredFilename = $filename . '-filtered.xml';
         $report .= "{$name} contains invalid UTF-8 characters and will not be scanned for viruses.\n";
         $report .= basename($filteredFilename) . " will be scanned for viruses instead.\n";
         $document->load($filteredFilename, $flags);
     }

     return $document;
 }