/** * @dataProvider dataProviderForTestUnescape * * @param string $stringToUnescape * @param string $expectedUnescapedString * @return void */ public function testUnescape($stringToUnescape, $expectedUnescapedString) { /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ $escaper = new \Box\Spout\Common\Escaper\XLSX(); $unescapedString = $escaper->unescape($stringToUnescape); $this->assertEquals($expectedUnescapedString, $unescapedString, 'Incorrect escaped string'); }
/** * Builds an in-memory array containing all the shared strings of the sheet. * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'. * It is then accessed by the sheet data, via the string index in the built table. * * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx * * The XML file can be really big with sheets containing a lot of data. That is why * we need to use a XML reader that provides streaming like the XMLReader library. * Please note that SimpleXML does not provide such a functionality but since it is faster * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose. * * @return void * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read */ public function extractSharedStrings() { $xmlReader = new XMLReader(); $sharedStringIndex = 0; $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) { throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } try { $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); while ($xmlReader->read() && $xmlReader->name !== 'si') { // do nothing until a 'si' tag is reached } while ($xmlReader->name === 'si') { $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader); $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); // removes nodes that should not be read, like the pronunciation of the Kanji characters $cleanNode = $this->removeSuperfluousTextNodes($node); // find all text nodes 't'; there can be multiple if the cell contains formatting $textNodes = $cleanNode->xpath('//ns:t'); $textValue = ''; foreach ($textNodes as $textNode) { if ($this->shouldPreserveWhitespace($textNode)) { $textValue .= $textNode->__toString(); } else { $textValue .= trim($textNode->__toString()); } } $unescapedTextValue = $escaper->unescape($textValue); $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex); $sharedStringIndex++; // jump to the next 'si' tag $xmlReader->next('si'); } } catch (XMLProcessingException $exception) { throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]"); } $this->cachingStrategy->closeCache(); $xmlReader->close(); }
/** * Creates the "workbook.xml" file under the "xl" folder * * @param Worksheet[] $worksheets * @return FileSystemHelper */ public function createWorkbookFile($worksheets) { $workbookXmlFileContents = <<<EOD <?xml version="1.0" encoding="UTF-8" standalone="yes"?> <workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"> <sheets> EOD; $escaper = new \Box\Spout\Common\Escaper\XLSX(); /** @var Worksheet $worksheet */ foreach ($worksheets as $worksheet) { $worksheetName = $worksheet->getExternalSheet()->getName(); $worksheetId = $worksheet->getId(); $workbookXmlFileContents .= ' <sheet name="' . $escaper->escape($worksheetName) . '" sheetId="' . $worksheetId . '" r:id="rIdSheet' . $worksheetId . '"/>' . PHP_EOL; } $workbookXmlFileContents .= <<<EOD </sheets> </workbook> EOD; $this->createFileWithContents($this->xlFolder, self::WORKBOOK_XML_FILE_NAME, $workbookXmlFileContents); return $this; }
/** * @dataProvider dataProviderForTestUnescape * * @param string $stringToUnescape * @param string $expectedUnescapedString * @return void */ public function testUnescape($stringToUnescape, $expectedUnescapedString) { $escaper = new \Box\Spout\Common\Escaper\XLSX(); $unescapedString = $escaper->unescape($stringToUnescape); $this->assertEquals($expectedUnescapedString, $unescapedString, 'Incorrect escaped string'); }
/** * Returns an instance of a sheet, given the path of its data XML file. * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet. * Then we look at "xl/worbook.xml" to find the sheet entry associated to the found ID. * The entry contains the ID and name of the sheet. * * If this piece of data can't be found by parsing the different XML files, the ID will default * to the sheet index, based on order in [Content_Types].xml. Similarly, the sheet's name will * default to the data sheet XML file name ("xl/worksheets/sheet2.xml" => "sheet2"). * * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml * @param int $sheetIndexZeroBased Index of the sheet, based on order in [Content_Types].xml (zero-based) * @return \Box\Spout\Reader\Sheet Sheet instance */ protected function getSheet($sheetDataXMLFilePath, $sheetIndexZeroBased) { $sheetId = $sheetIndexZeroBased + 1; $sheetName = $this->getDefaultSheetName($sheetDataXMLFilePath); /* * In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" * In workbook.xml.rels, it is only "worksheets/sheet1.xml" */ $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/'); // find the node associated to the given file path $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]'); if (count($relationshipNodes) === 1) { $relationshipNode = $relationshipNodes[0]; $sheetId = (string) $relationshipNode->attributes()->Id; $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $sheetId . '"]'); if (count($sheetNodes) === 1) { $sheetNode = $sheetNodes[0]; $sheetId = (int) $sheetNode->attributes()->sheetId; $escapedSheetName = (string) $sheetNode->attributes()->name; $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sheetName = $escaper->unescape($escapedSheetName); } } return new Sheet($sheetId, $sheetIndexZeroBased, $sheetName); }
/** * Returns an instance of a sheet, given the path of its data XML file. * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet. * Then we look at "xl/worbook.xml" to find the sheet entry associated to the found ID. * The entry contains the ID and name of the sheet. * * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml * @return \Box\Spout\Reader\XLSX\Sheet Sheet instance */ protected function getSheetFromXML($sheetDataXMLFilePath) { // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml" // In workbook.xml.rels, it is only "worksheets/sheet1.xml" $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/'); // find the node associated to the given file path $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement(); $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]'); $relationshipNode = $relationshipNodes[0]; $relationshipSheetId = $relationshipNode->getAttribute('Id'); $workbookXMLElement = $this->getWorkbookXMLAsXMLElement(); $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $relationshipSheetId . '"]'); $sheetNode = $sheetNodes[0]; $escapedSheetName = $sheetNode->getAttribute('name'); $sheetIdOneBased = $sheetNode->getAttribute('sheetId'); $sheetIndexZeroBased = $sheetIdOneBased - 1; /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */ $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sheetName = $escaper->unescape($escapedSheetName); return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $sheetIndexZeroBased, $sheetName); }
/** * Builds an in-memory array containing all the shared strings of the worksheet. * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'. * It is then accessed by the worksheet data, via the string index in the built table. * * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx * * The XML file can be really big with worksheets containing a lot of data. That is why * we need to use a XML reader that provides streaming like the XMLReader library. * Please note that SimpleXML does not provide such a functionality but since it is faster * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose. * * @return void * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read */ public function extractSharedStrings() { $xmlReader = new \XMLReader(); $sharedStringIndex = 0; $this->tempFilePointer = null; $escaper = new \Box\Spout\Common\Escaper\XLSX(); $sharedStringsFilePath = $this->getSharedStringsFilePath(); if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT | LIBXML_NONET) === false) { throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".'); } while ($xmlReader->read() && $xmlReader->name !== 'si') { // do nothing until a 'si' tag is reached } while ($xmlReader->name === 'si') { $node = new \SimpleXMLElement($xmlReader->readOuterXml()); $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML); // removes nodes that should not be read, like the pronunciation of the Kanji characters $cleanNode = $this->removeSuperfluousTextNodes($node); // find all text nodes 't'; there can be multiple if the cell contains formatting $textNodes = $cleanNode->xpath('//ns:t'); $textValue = ''; foreach ($textNodes as $textNode) { if ($this->shouldPreserveWhitespace($textNode)) { $textValue .= $textNode->__toString(); } else { $textValue .= trim($textNode->__toString()); } } $unescapedTextValue = $escaper->unescape($textValue); // The shared string retrieval logic expects each cell data to be on one line only // Encoding the line feed character allows to preserve this assumption $lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue); $this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex); $sharedStringIndex++; // jump to the next 'si' tag $xmlReader->next('si'); } // close pointer to the last temp file that was written if ($this->tempFilePointer) { fclose($this->tempFilePointer); } $xmlReader->close(); }