Beispiel #1
0
 /**
  * @dataProvider dataProviderForTestUnescape
  *
  * @param string $stringToUnescape
  * @param string $expectedUnescapedString
  * @return void
  */
 public function testUnescape($stringToUnescape, $expectedUnescapedString)
 {
     /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
     $escaper = new \Box\Spout\Common\Escaper\XLSX();
     $unescapedString = $escaper->unescape($stringToUnescape);
     $this->assertEquals($expectedUnescapedString, $unescapedString, 'Incorrect escaped string');
 }
Beispiel #2
0
 /**
  * Builds an in-memory array containing all the shared strings of the sheet.
  * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
  * It is then accessed by the sheet data, via the string index in the built table.
  *
  * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
  *
  * The XML file can be really big with sheets containing a lot of data. That is why
  * we need to use a XML reader that provides streaming like the XMLReader library.
  * Please note that SimpleXML does not provide such a functionality but since it is faster
  * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
  *
  * @return void
  * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
  */
 public function extractSharedStrings()
 {
     $xmlReader = new XMLReader();
     $sharedStringIndex = 0;
     $escaper = new \Box\Spout\Common\Escaper\XLSX();
     $sharedStringsFilePath = $this->getSharedStringsFilePath();
     if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NONET) === false) {
         throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
     }
     try {
         $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
         $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
         while ($xmlReader->read() && $xmlReader->name !== 'si') {
             // do nothing until a 'si' tag is reached
         }
         while ($xmlReader->name === 'si') {
             $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
             $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
             // removes nodes that should not be read, like the pronunciation of the Kanji characters
             $cleanNode = $this->removeSuperfluousTextNodes($node);
             // find all text nodes 't'; there can be multiple if the cell contains formatting
             $textNodes = $cleanNode->xpath('//ns:t');
             $textValue = '';
             foreach ($textNodes as $textNode) {
                 if ($this->shouldPreserveWhitespace($textNode)) {
                     $textValue .= $textNode->__toString();
                 } else {
                     $textValue .= trim($textNode->__toString());
                 }
             }
             $unescapedTextValue = $escaper->unescape($textValue);
             $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
             $sharedStringIndex++;
             // jump to the next 'si' tag
             $xmlReader->next('si');
         }
     } catch (XMLProcessingException $exception) {
         throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
     }
     $this->cachingStrategy->closeCache();
     $xmlReader->close();
 }
Beispiel #3
0
    /**
     * Creates the "workbook.xml" file under the "xl" folder
     *
     * @param Worksheet[] $worksheets
     * @return FileSystemHelper
     */
    public function createWorkbookFile($worksheets)
    {
        $workbookXmlFileContents = <<<EOD
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
    <sheets>

EOD;
        $escaper = new \Box\Spout\Common\Escaper\XLSX();
        /** @var Worksheet $worksheet */
        foreach ($worksheets as $worksheet) {
            $worksheetName = $worksheet->getExternalSheet()->getName();
            $worksheetId = $worksheet->getId();
            $workbookXmlFileContents .= '        <sheet name="' . $escaper->escape($worksheetName) . '" sheetId="' . $worksheetId . '" r:id="rIdSheet' . $worksheetId . '"/>' . PHP_EOL;
        }
        $workbookXmlFileContents .= <<<EOD
    </sheets>
</workbook>
EOD;
        $this->createFileWithContents($this->xlFolder, self::WORKBOOK_XML_FILE_NAME, $workbookXmlFileContents);
        return $this;
    }
Beispiel #4
0
 /**
  * @dataProvider dataProviderForTestUnescape
  *
  * @param string $stringToUnescape
  * @param string $expectedUnescapedString
  * @return void
  */
 public function testUnescape($stringToUnescape, $expectedUnescapedString)
 {
     $escaper = new \Box\Spout\Common\Escaper\XLSX();
     $unescapedString = $escaper->unescape($stringToUnescape);
     $this->assertEquals($expectedUnescapedString, $unescapedString, 'Incorrect escaped string');
 }
Beispiel #5
0
 /**
  * Returns an instance of a sheet, given the path of its data XML file.
  * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet.
  * Then we look at "xl/worbook.xml" to find the sheet entry associated to the found ID.
  * The entry contains the ID and name of the sheet.
  *
  * If this piece of data can't be found by parsing the different XML files, the ID will default
  * to the sheet index, based on order in [Content_Types].xml. Similarly, the sheet's name will
  * default to the data sheet XML file name ("xl/worksheets/sheet2.xml" => "sheet2").
  *
  * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
  * @param int $sheetIndexZeroBased Index of the sheet, based on order in [Content_Types].xml (zero-based)
  * @return \Box\Spout\Reader\Sheet Sheet instance
  */
 protected function getSheet($sheetDataXMLFilePath, $sheetIndexZeroBased)
 {
     $sheetId = $sheetIndexZeroBased + 1;
     $sheetName = $this->getDefaultSheetName($sheetDataXMLFilePath);
     /*
      * In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
      * In workbook.xml.rels, it is only "worksheets/sheet1.xml"
      */
     $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/');
     // find the node associated to the given file path
     $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement();
     $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]');
     if (count($relationshipNodes) === 1) {
         $relationshipNode = $relationshipNodes[0];
         $sheetId = (string) $relationshipNode->attributes()->Id;
         $workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
         $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $sheetId . '"]');
         if (count($sheetNodes) === 1) {
             $sheetNode = $sheetNodes[0];
             $sheetId = (int) $sheetNode->attributes()->sheetId;
             $escapedSheetName = (string) $sheetNode->attributes()->name;
             $escaper = new \Box\Spout\Common\Escaper\XLSX();
             $sheetName = $escaper->unescape($escapedSheetName);
         }
     }
     return new Sheet($sheetId, $sheetIndexZeroBased, $sheetName);
 }
Beispiel #6
0
 /**
  * Returns an instance of a sheet, given the path of its data XML file.
  * We first look at "xl/_rels/workbook.xml.rels" to find the relationship ID of the sheet.
  * Then we look at "xl/worbook.xml" to find the sheet entry associated to the found ID.
  * The entry contains the ID and name of the sheet.
  *
  * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
  * @return \Box\Spout\Reader\XLSX\Sheet Sheet instance
  */
 protected function getSheetFromXML($sheetDataXMLFilePath)
 {
     // In [Content_Types].xml, the path is "/xl/worksheets/sheet1.xml"
     // In workbook.xml.rels, it is only "worksheets/sheet1.xml"
     $sheetDataXMLFilePathInWorkbookXMLRels = ltrim($sheetDataXMLFilePath, '/xl/');
     // find the node associated to the given file path
     $workbookXMLResElement = $this->getWorkbookXMLRelsAsXMLElement();
     $relationshipNodes = $workbookXMLResElement->xpath('//ns:Relationship[@Target="' . $sheetDataXMLFilePathInWorkbookXMLRels . '"]');
     $relationshipNode = $relationshipNodes[0];
     $relationshipSheetId = $relationshipNode->getAttribute('Id');
     $workbookXMLElement = $this->getWorkbookXMLAsXMLElement();
     $sheetNodes = $workbookXMLElement->xpath('//ns:sheet[@r:id="' . $relationshipSheetId . '"]');
     $sheetNode = $sheetNodes[0];
     $escapedSheetName = $sheetNode->getAttribute('name');
     $sheetIdOneBased = $sheetNode->getAttribute('sheetId');
     $sheetIndexZeroBased = $sheetIdOneBased - 1;
     /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
     $escaper = new \Box\Spout\Common\Escaper\XLSX();
     $sheetName = $escaper->unescape($escapedSheetName);
     return new Sheet($this->filePath, $sheetDataXMLFilePath, $this->sharedStringsHelper, $sheetIndexZeroBased, $sheetName);
 }
 /**
  * Builds an in-memory array containing all the shared strings of the worksheet.
  * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
  * It is then accessed by the worksheet data, via the string index in the built table.
  *
  * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
  *
  * The XML file can be really big with worksheets containing a lot of data. That is why
  * we need to use a XML reader that provides streaming like the XMLReader library.
  * Please note that SimpleXML does not provide such a functionality but since it is faster
  * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
  *
  * @return void
  * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
  */
 public function extractSharedStrings()
 {
     $xmlReader = new \XMLReader();
     $sharedStringIndex = 0;
     $this->tempFilePointer = null;
     $escaper = new \Box\Spout\Common\Escaper\XLSX();
     $sharedStringsFilePath = $this->getSharedStringsFilePath();
     if ($xmlReader->open($sharedStringsFilePath, null, LIBXML_NOENT | LIBXML_NONET) === false) {
         throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
     }
     while ($xmlReader->read() && $xmlReader->name !== 'si') {
         // do nothing until a 'si' tag is reached
     }
     while ($xmlReader->name === 'si') {
         $node = new \SimpleXMLElement($xmlReader->readOuterXml());
         $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
         // removes nodes that should not be read, like the pronunciation of the Kanji characters
         $cleanNode = $this->removeSuperfluousTextNodes($node);
         // find all text nodes 't'; there can be multiple if the cell contains formatting
         $textNodes = $cleanNode->xpath('//ns:t');
         $textValue = '';
         foreach ($textNodes as $textNode) {
             if ($this->shouldPreserveWhitespace($textNode)) {
                 $textValue .= $textNode->__toString();
             } else {
                 $textValue .= trim($textNode->__toString());
             }
         }
         $unescapedTextValue = $escaper->unescape($textValue);
         // The shared string retrieval logic expects each cell data to be on one line only
         // Encoding the line feed character allows to preserve this assumption
         $lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);
         $this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
         $sharedStringIndex++;
         // jump to the next 'si' tag
         $xmlReader->next('si');
     }
     // close pointer to the last temp file that was written
     if ($this->tempFilePointer) {
         fclose($this->tempFilePointer);
     }
     $xmlReader->close();
 }