/**
 * Load the XML document into a DOM and return it. Errors are appended to
 * the $report parameter.
 *
 * For reasons beyond anyone's apparent control, the export may contain
 * invalid UTF-8 characters. If the file cannot be parsed as XML, the
 * function will attempt to filter out invalid UTF-8 characters and then
 * try to load the XML again from "{$filename}-filtered.xml".
 *
 * Other errors in the XML, beyond the bad UTF-8, will not be tolerated.
 *
 * NOTE(review): DOMDocument::load() reports parse problems as warnings by
 * default, so the catch below presumably relies on an error handler that
 * converts warnings into exceptions - confirm against the application
 * configuration.
 *
 * @param Deposit $deposit  receives an error log entry if the XML is not parseable
 * @param string  $filename path to the XML file
 * @param string  $report   human readable report, appended to by reference
 *
 * @return DOMDocument|null the parsed document, or null when the file is
 *                          unparseable for a reason other than bad UTF-8
 *
 * @throws Exception if the file cannot be filtered, or if the filtered
 *                   file still cannot be parsed
 */
private function loadXml(Deposit $deposit, $filename, &$report)
{
    $dom = new DOMDocument();
    try {
        $dom->load($filename, LIBXML_COMPACT | LIBXML_PARSEHUGE);
    } catch (Exception $ex) {
        if (strpos($ex->getMessage(), 'Input is not proper UTF-8') === false) {
            $deposit->addErrorLog('XML file ' . basename($filename) . ' is not parseable: ' . $ex->getMessage());
            $report .= $ex->getMessage();
            $report .= "\nCannot validate XML.\n";

            return null;
        }

        // The XML files can be arbitrarily large, so stream them, filter
        // the stream, and write to disk. The result may not fit in memory.
        $filteredFilename = "{$filename}-filtered.xml";
        $changes = $this->filterInvalidUtf8($filename, $filteredFilename);

        $report .= basename($filename) . " contains {$changes} invalid UTF-8 characters, which have been removed with "
            . ICONV_IMPL . ' version ' . ICONV_VERSION . ' in PHP ' . PHP_VERSION . "\n";
        $report .= basename($filteredFilename) . " will be validated.\n";
        $dom->load($filteredFilename, LIBXML_COMPACT | LIBXML_PARSEHUGE);
    }

    return $dom;
}

/**
 * Copy $source to $destination in 64k blocks, dropping bytes that are not
 * valid UTF-8.
 *
 * A multi-byte character that is cut in half by a block boundary is held
 * back and filtered together with the next block. Without that, iconv
 * would see two invalid fragments and remove a perfectly valid character.
 *
 * @param string $source      path of the file to read
 * @param string $destination path of the file to write
 *
 * @return int number of bytes removed
 *
 * @throws Exception if either file cannot be opened
 */
private function filterInvalidUtf8($source, $destination)
{
    $in = fopen($source, 'rb');
    if ($in === false) {
        throw new Exception("Cannot open {$source} for reading.");
    }
    $out = fopen($destination, 'wb');
    if ($out === false) {
        fclose($in);

        throw new Exception("Cannot open {$destination} for writing.");
    }

    $blockSize = 64 * 1024; // 64k blocks
    $changes = 0;
    $pending = '';

    try {
        do {
            $chunk = fread($in, $blockSize);
            // Compare explicitly: a block containing only "0" is falsy
            // but is still data.
            $atEnd = ($chunk === false || $chunk === '');
            $buffer = $atEnd ? $pending : $pending . $chunk;
            $pending = '';

            if (!$atEnd) {
                $tail = $this->incompleteUtf8TailLength($buffer);
                if ($tail > 0) {
                    $pending = substr($buffer, -$tail);
                    $buffer = substr($buffer, 0, -$tail);
                }
            }

            if ($buffer !== '') {
                $filtered = iconv('UTF-8', 'UTF-8//IGNORE', $buffer);
                $changes += strlen($buffer) - strlen($filtered);
                fwrite($out, $filtered);
            }
        } while (!$atEnd);
    } finally {
        fclose($in);
        fclose($out);
    }

    return $changes;
}

/**
 * Count the bytes at the end of $buffer that start a multi-byte UTF-8
 * sequence which the buffer is too short to complete.
 *
 * @param string $buffer raw bytes
 *
 * @return int 0 when the buffer does not end in a truncated sequence,
 *             otherwise the number of trailing bytes (1 to 3) to hold back
 */
private function incompleteUtf8TailLength($buffer)
{
    $length = strlen($buffer);
    for ($i = 1; $i <= 3 && $i <= $length; $i++) {
        $byte = ord($buffer[$length - $i]);
        if (($byte & 0xC0) === 0x80) {
            // Continuation byte: keep walking back to find the lead byte.
            continue;
        }
        if ($byte >= 0xF0) {
            $expected = 4;
        } elseif ($byte >= 0xE0) {
            $expected = 3;
        } elseif ($byte >= 0xC0) {
            $expected = 2;
        } else {
            $expected = 1;
        }

        return $expected > $i ? $i : 0;
    }

    return 0;
}
/**
 * Send an HTTP HEAD request to the deposit's host to get an estimate of
 * the download size, and compare it with the size recorded on the deposit.
 *
 * A size that differs from the expected size by more than
 * self::FILE_SIZE_THRESHOLD (as a fraction of the reported size) is logged
 * on the deposit and as a warning; it does not stop the harvest.
 *
 * @param Deposit $deposit
 *
 * @throws Exception if the response status is not 200, or if the
 *                   Content-Length header is missing or not numeric.
 *                   Request exceptions from the HTTP client are logged
 *                   and rethrown.
 */
protected function checkSize(Deposit $deposit)
{
    $client = $this->getClient();
    try {
        $head = $client->head($deposit->getUrl());
        if ($head->getStatusCode() !== 200) {
            throw new Exception("HTTP HEAD request cannot check file size: HTTP {$head->getStatusCode()} - {$head->getReasonPhrase()} - {$deposit->getUrl()}");
        }

        $size = $head->getHeader('Content-Length');
        // PSR-7 style responses return header values as an array, older
        // clients return a string. Accept both - TODO confirm which HTTP
        // client version this class is wired to.
        if (is_array($size)) {
            $size = count($size) > 0 ? reset($size) : null;
        }
        if ($size === null || $size === '') {
            throw new Exception("HTTP HEAD response does not include file size - {$deposit->getUrl()}");
        }
        if (!is_numeric($size)) {
            throw new Exception("HTTP HEAD response includes an invalid file size '{$size}' - {$deposit->getUrl()}");
        }

        $reportedSize = $size + 0;
        // The deposit size is multiplied by 1000 to compare it with the
        // byte count in Content-Length - presumably it is stored in
        // units of 1000 bytes; verify against the Deposit entity.
        $expectedSize = $deposit->getSize() * 1000;
        $difference = abs($expectedSize - $reportedSize);

        // A reported size of zero cannot be used as a divisor; any
        // non-zero expectation is then simply a mismatch.
        if ($reportedSize > 0) {
            $mismatch = $difference / $reportedSize > self::FILE_SIZE_THRESHOLD;
        } else {
            $mismatch = $difference > 0;
        }

        if ($mismatch) {
            $deposit->addErrorLog("Expected file size {$expectedSize} is not close to reported size {$size}");
            $this->logger->warning("Harvest - {$deposit->getUrl()} - Expected file size {$expectedSize} is not close to reported size {$size}");
        }
    } catch (RequestException $e) {
        $response = $e->getResponse();
        if ($response !== null) {
            $this->logger->critical($response->getStatusCode() . ' ' . $response->getReasonPhrase());
        } else {
            $this->logger->critical($e->getMessage());
        }

        throw $e;
    }
}
/**
 * Parse an XML file into a DOM so that it can be scanned for viruses.
 *
 * When parsing fails because of invalid UTF-8, the already existing
 * "{$filename}-filtered.xml" file is parsed instead and the substitution
 * is noted in $report. Any other parse failure is recorded on the deposit
 * and in $report, and nothing is returned.
 *
 * @param Deposit $deposit  receives an error log entry on unrecoverable failure
 * @param string  $filename path to the XML file
 * @param string  $report   report text, appended to by reference
 *
 * @return DOMDocument the parsed document; null when the file cannot be parsed
 */
private function loadXml(Deposit $deposit, $filename, &$report)
{
    $options = LIBXML_COMPACT | LIBXML_PARSEHUGE;
    $document = new DOMDocument();

    try {
        $document->load($filename, $options);

        return $document;
    } catch (Exception $error) {
        $message = $error->getMessage();
    }

    $name = basename($filename);

    // Anything other than a UTF-8 problem is fatal for this file.
    if (false === strpos($message, 'Input is not proper UTF-8')) {
        $deposit->addErrorLog('XML file ' . $name . ' is not parseable, and cannot be scanned for viruses: ' . $message);
        $report .= $message;
        $report .= "\nCannot scan for viruses.\n";

        return null;
    }

    // Fall back to the filtered copy of the file.
    $filteredPath = "{$filename}-filtered.xml";
    $report .= $name . " contains invalid UTF-8 characters and will not be scanned for viruses.\n";
    $report .= basename($filteredPath) . " will be scanned for viruses instead.\n";
    $document->load($filteredPath, $options);

    return $document;
}