Example #1
0
 /**
  * Checks the properties of a field created through Field::UnStored().
  *
  * The assertions pin a boost of 1, the 'UTF-8' encoding, the
  * binary/indexed/stored/tokenized flags and the name/value passed in.
  */
 public function testUnStored()
 {
     $field = Document\Field::UnStored('field', 'value');

     // PHPUnit expects assertEquals($expected, $actual); keeping that order
     // makes the "expected X but got Y" failure message read correctly.
     $this->assertEquals(1, $field->boost);
     $this->assertEquals('UTF-8', $field->encoding);
     $this->assertEquals(false, $field->isBinary);
     $this->assertEquals(true, $field->isIndexed);
     $this->assertEquals(false, $field->isStored);
     $this->assertEquals(true, $field->isTokenized);
     $this->assertEquals('field', $field->name);
     $this->assertEquals('value', $field->value);
 }
Example #2
0
 /**
  * Object constructor
  *
  * Opens the .pptx package, gathers the text (a:t elements) of every slide
  * followed by the text of that slide's notes, reads the package meta data
  * and registers the 'filename', 'body', meta data and 'title' fields.
  *
  * Slides or notes parts that cannot be read from the archive are skipped.
  *
  * @param string  $fileName     Path of the .pptx file to read
  * @param boolean $storeContent If true the body is added with Field::Text(),
  *                              otherwise with Field::UnStored()
  * @throws \Zend\Search\Lucene\Exception\ExtensionNotLoadedException
  * @throws \Zend\Search\Lucene\Exception\RuntimeException
  */
 private function __construct($fileName, $storeContent)
 {
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     }
     // Document data holders
     $slides = array();
     $slideNotes = array();
     $documentBody = array();
     // Open AbstractOpenXML package
     $package = new \ZipArchive();
     if ($package->open($fileName) !== true) {
         // open() reports failure through its return value; an archive that
         // failed to open must not be read from.
         throw new RuntimeException('Invalid archive or corrupted .pptx file.');
     }
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     if ($relationsXml === false) {
         // Release the archive handle before bailing out
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .pptx file.');
     }
     $relations = simplexml_load_string($relationsXml);
     $relationships = ($relations !== false) ? $relations->Relationship : array();
     foreach ($relationships as $rel) {
         if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             continue;
         }
         // Found office document! Search for slides...
         $documentTarget = (string) $rel["Target"];
         $documentDir = dirname($documentTarget);
         $slideRelations = simplexml_load_string((string) $package->getFromName(
             $this->absoluteZipPath($documentDir . "/_rels/" . basename($documentTarget) . ".rels")
         ));
         if ($slideRelations !== false) {
             foreach ($slideRelations->Relationship as $slideRel) {
                 if ($slideRel["Type"] != self::SCHEMA_SLIDERELATION) {
                     continue;
                 }
                 // Found slide!
                 $slideTarget = (string) $slideRel["Target"];
                 $slideDir = $documentDir . "/" . dirname($slideTarget);
                 // Relationship ids look like "rId12"; the numeric part is used as sort key
                 $slideKey = str_replace('rId', '', (string) $slideRel["Id"]);
                 $slide = simplexml_load_string((string) $package->getFromName(
                     $this->absoluteZipPath($slideDir . "/" . basename($slideTarget))
                 ));
                 if ($slide === false) {
                     // Missing or malformed slide part: nothing to index for it
                     continue;
                 }
                 $slides[$slideKey] = $slide;
                 // Search for slide notes
                 $slideNotesRelations = simplexml_load_string((string) $package->getFromName(
                     $this->absoluteZipPath($slideDir . "/_rels/" . basename($slideTarget) . ".rels")
                 ));
                 if ($slideNotesRelations === false) {
                     continue;
                 }
                 foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                     if ($slideNoteRel["Type"] != self::SCHEMA_SLIDENOTESRELATION) {
                         continue;
                     }
                     // Found slide notes!
                     $noteTarget = (string) $slideNoteRel["Target"];
                     $slideNote = simplexml_load_string((string) $package->getFromName(
                         $this->absoluteZipPath($slideDir . "/" . dirname($noteTarget) . "/" . basename($noteTarget))
                     ));
                     if ($slideNote !== false) {
                         $slideNotes[$slideKey] = $slideNote;
                     }
                     // Only the first notes relationship of a slide is considered
                     break;
                 }
             }
         }
         // Only the first office document relationship is processed
         break;
     }
     // Sort slides
     ksort($slides);
     ksort($slideNotes);
     // Extract contents from slides
     foreach ($slides as $slideKey => $slide) {
         // The slide's own text comes first, then the text of its notes (if any)
         $parts = array($slide);
         if (isset($slideNotes[$slideKey])) {
             $parts[] = $slideNotes[$slideKey];
         }
         foreach ($parts as $part) {
             // Register namespaces
             $part->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
             $part->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);
             // Fetch all text
             $textElements = $part->xpath('//a:t');
             if (!$textElements) {
                 // xpath() failed or matched nothing
                 continue;
             }
             foreach ($textElements as $textElement) {
                 $documentBody[] = (string) $textElement;
             }
         }
     }
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     $package->close();
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents
     if ($storeContent) {
         $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
     }
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     }
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
     }
 }
Example #3
0
 /**
  * Object constructor
  *
  * Opens the .docx package, concatenates the text runs (w:t) of every body
  * paragraph - line breaks (w:br) and paragraph ends become a single space -
  * reads the package meta data and registers the 'filename', 'body',
  * meta data and 'title' fields.
  *
  * @param string  $fileName     Path of the .docx file to read
  * @param boolean $storeContent If true the body is added with Field::Text(),
  *                              otherwise with Field::UnStored()
  * @throws \Zend\Search\Lucene\Exception\ExtensionNotLoadedException
  * @throws \Zend\Search\Lucene\Exception\RuntimeException if the archive cannot
  *         be opened, has no relations part, or its main document part cannot be parsed
  */
 private function __construct($fileName, $storeContent)
 {
     if (!class_exists('ZipArchive', false)) {
         throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
     }
     // Document data holders
     $documentBody = array();
     // Open AbstractOpenXML package
     $package = new \ZipArchive();
     if ($package->open($fileName) !== true) {
         // open() reports failure through its return value; an archive that
         // failed to open must not be read from.
         throw new RuntimeException('Invalid archive or corrupted .docx file.');
     }
     // Read relations and search for officeDocument
     $relationsXml = $package->getFromName('_rels/.rels');
     if ($relationsXml === false) {
         // Release the archive handle before bailing out
         $package->close();
         throw new RuntimeException('Invalid archive or corrupted .docx file.');
     }
     $relations = simplexml_load_string($relationsXml);
     $relationships = ($relations !== false) ? $relations->Relationship : array();
     foreach ($relationships as $rel) {
         if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
             continue;
         }
         // Found office document! Read in contents...
         $documentTarget = (string) $rel['Target'];
         $contents = simplexml_load_string((string) $package->getFromName(
             $this->absoluteZipPath(dirname($documentTarget) . '/' . basename($documentTarget))
         ));
         if ($contents === false) {
             // The relations point at a part that is missing or not well-formed;
             // report it instead of calling methods on a non-object.
             $package->close();
             throw new RuntimeException('Invalid archive or corrupted .docx file.');
         }
         $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);
         $paragraphs = $contents->xpath('//w:body/w:p');
         if (!$paragraphs) {
             // xpath() failed or the body has no paragraphs
             break;
         }
         foreach ($paragraphs as $paragraph) {
             $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
             if ($runs === false) {
                 // Paragraph doesn't contain any text or breaks
                 continue;
             }
             foreach ($runs as $run) {
                 // A break element contributes a space, a text element its content
                 $documentBody[] = ($run->getName() == 'br') ? ' ' : (string) $run;
             }
             // Add space after each paragraph. So they are not bound together.
             $documentBody[] = ' ';
         }
         // Only the first office document relationship is processed
         break;
     }
     // Read core properties
     $coreProperties = $this->extractMetaData($package);
     // Close file
     $package->close();
     // Store filename
     $this->addField(Field::Text('filename', $fileName, 'UTF-8'));
     // Store contents (pieces already carry their own separators, hence the empty glue)
     if ($storeContent) {
         $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
     }
     // Store meta data properties
     foreach ($coreProperties as $key => $value) {
         $this->addField(Field::Text($key, $value, 'UTF-8'));
     }
     // Store title (if not present in meta data)
     if (!isset($coreProperties['title'])) {
         $this->addField(Field::Text('title', $fileName, 'UTF-8'));
     }
 }
Example #4
0
File: HTML.php Project: rikaix/zf2
 /**
  * Object constructor
  *
  * Parses the HTML into $this->_doc, then registers a 'title' field, one field
  * per named <meta> tag and a 'body' field, and collects the href values of
  * <a>/<area> elements into $this->_links and of <link> elements in the head
  * into $this->_headerLinks.
  *
  * @param string  $data            HTML string (may be an HTML fragment), or the name
  *                                 of the file to read it from when $isFile is true
  * @param boolean $isFile          If true, $data is read with file_get_contents()
  * @param boolean $storeContent    If true the body is added with Field::Text(),
  *                                 otherwise with Field::UnStored()
  * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
  */
 private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
 {
     $this->_doc = new \DOMDocument();
     $this->_doc->substituteEntities = true;
     $htmlData = $isFile ? file_get_contents($data) : $data;
     if ($htmlData === false) {
         // Unreadable file: continue with empty content, which ends up as an
         // empty document through the fragment branch below.
         $htmlData = '';
     }
     if ($htmlData !== '') {
         // Empty input is not handed to loadHTML(); it is handled as an
         // (empty) fragment below because the encoding stays unrecognized.
         @$this->_doc->loadHTML($htmlData);
     }
     if ($this->_doc->encoding === null) {
         // Document encoding is not recognized
         /** @todo improve HTML vs HTML fragment recognition */
         // The pattern also accepts an <html> tag carrying attributes (e.g. lang="en"),
         // so such documents are not mistaken for fragments.
         if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
             // It's an HTML document
             // Add additional HEAD section and recognize document
             $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
             @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset)) . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>' . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
             // Remove additional HEAD section
             $xpath = new \DOMXPath($this->_doc);
             $head = $xpath->query('/html/head')->item(0);
             if ($head !== null) {
                 $head->parentNode->removeChild($head);
             }
         } else {
             // It's an HTML fragment
             @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>' . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData) . '</body></html>');
         }
     }
     /** @todo Add correction of wrong HTML encoding recognition processing
      * The case is:
      * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
      * even $this->_doc->encoding demonstrates another recognized encoding
      */
     $xpath = new \DOMXPath($this->_doc);

     // Title
     $docTitle = '';
     foreach ($xpath->query('/html/head/title') as $titleNode) {
         // title should always have only one entry, but we process all nodeset entries
         $docTitle .= $titleNode->nodeValue . ' ';
     }
     $this->addField(Field::Text('title', $docTitle, 'UTF-8'));

     // Named meta tags become fields of their own
     foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
         $this->addField(Field::Text($metaNode->getAttribute('name'), $metaNode->getAttribute('content'), 'UTF-8'));
     }

     // Body text
     $docBody = '';
     foreach ($xpath->query('/html/body') as $bodyNode) {
         // body should always have only one entry, but we process all nodeset entries
         $this->_retrieveNodeText($bodyNode, $docBody);
     }
     if ($storeContent) {
         $this->addField(Field::Text('body', $docBody, 'UTF-8'));
     } else {
         $this->addField(Field::UnStored('body', $docBody, 'UTF-8'));
     }

     // Document links: <a> elements first, then <area> elements
     foreach (array('a', 'area') as $tagName) {
         foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
             $href = $linkNode->getAttribute('href');
             if ($href === '') {
                 continue;
             }
             if (self::$_excludeNoFollowLinks && strtolower($linkNode->getAttribute('rel')) == 'nofollow') {
                 // rel="nofollow" links are left out when the exclusion flag is set
                 continue;
             }
             $this->_links[] = $href;
         }
     }
     $this->_links = array_unique($this->_links);

     // Header links (<link> elements inside <head>)
     foreach ($xpath->query('/html/head/link') as $linkNode) {
         $href = $linkNode->getAttribute('href');
         if ($href !== '') {
             $this->_headerLinks[] = $href;
         }
     }
     $this->_headerLinks = array_unique($this->_headerLinks);
 }
Example #5
0
    /**
     * Object constructor
     *
     * Opens the .xlsx package, loads the shared string table and every
     * worksheet referenced by the workbook relations, collects one value per
     * cell (in worksheet relationship-id order), reads the package meta data
     * and registers the 'filename', 'body', meta data and 'title' fields.
     *
     * Worksheet parts that cannot be read from the archive are skipped.
     *
     * @param string  $fileName     Path of the .xlsx file to read
     * @param boolean $storeContent If true the body is added with Field::Text(),
     *                              otherwise with Field::UnStored()
     * @throws \Zend\Search\Lucene\Exception\ExtensionNotLoadedException
     * @throws \Zend\Search\Lucene\Exception\RuntimeException if the archive cannot
     *         be opened, has no relations part, or its workbook relations cannot be parsed
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            throw new ExtensionNotLoadedException(
                'MS Office documents processing functionality requires Zip extension to be loaded'
            );
        }

        // Document data holders
        $sharedStrings = array();
        $worksheets = array();
        $documentBody = array();

        // Open AbstractOpenXML package
        $package = new \ZipArchive();
        if ($package->open($fileName) !== true) {
            // open() reports failure through its return value; an archive that
            // failed to open must not be read from.
            throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
        }

        // Read relations and search for officeDocument
        $relationsXml = $package->getFromName('_rels/.rels');
        if ($relationsXml === false) {
            // Release the archive handle before bailing out
            $package->close();
            throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
        }
        $relations = simplexml_load_string($relationsXml);
        $relationships = ($relations !== false) ? $relations->Relationship : array();
        foreach ($relationships as $rel) {
            if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read relations for workbook...
            $workbookTarget = (string) $rel["Target"];
            $workbookDir = dirname($workbookTarget);
            $workbookRelations = simplexml_load_string((string) $package->getFromName(
                $this->absoluteZipPath($workbookDir . "/_rels/" . basename($workbookTarget) . ".rels")
            ));
            if ($workbookRelations === false) {
                // The workbook relations part is missing or not well-formed;
                // report it instead of calling methods on a non-object.
                $package->close();
                throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
            }
            $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP);

            // Read shared strings (the relationship is absent when the workbook has none)
            $sharedStringsRelations = $workbookRelations->xpath(
                "rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']"
            );
            if (!empty($sharedStringsRelations)) {
                $sharedStringsPath = (string) $sharedStringsRelations[0]['Target'];
                $xmlStrings = simplexml_load_string((string) $package->getFromName(
                    $this->absoluteZipPath($workbookDir . "/" . $sharedStringsPath)
                ));
                if ($xmlStrings !== false && isset($xmlStrings->si)) {
                    foreach ($xmlStrings->si as $val) {
                        if (isset($val->t)) {
                            $sharedStrings[] = (string) $val->t;
                        } elseif (isset($val->r)) {
                            $sharedStrings[] = $this->_parseRichText($val);
                        } else {
                            // Cells refer to shared strings by position, so an item
                            // without text must still occupy its slot.
                            $sharedStrings[] = '';
                        }
                    }
                }
            }

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ($workbookRelation["Type"] != self::SCHEMA_WORKSHEETRELATION) {
                    continue;
                }
                $worksheetTarget = (string) $workbookRelation["Target"];
                $worksheet = simplexml_load_string((string) $package->getFromName(
                    $this->absoluteZipPath(
                        $workbookDir . "/" . dirname($worksheetTarget) . "/" . basename($worksheetTarget)
                    )
                ));
                if ($worksheet === false) {
                    // Missing or malformed worksheet part: nothing to index for it
                    continue;
                }
                // Relationship ids look like "rId3"; the numeric part is used as sort key
                $worksheets[str_replace('rId', '', (string) $workbookRelation["Id"])] = $worksheet;
            }

            // Only the first office document relationship is processed
            break;
        }

        // Sort worksheets
        ksort($worksheets);

        // Extract contents from worksheets
        foreach ($worksheets as $worksheet) {
            foreach ($worksheet->sheetData->row as $row) {
                foreach ($row->c as $c) {
                    // Determine data type
                    $dataType = (string) $c["t"];
                    switch ($dataType) {
                        case "s":
                            // Value is a shared string: <v> holds its index in the table
                            $value = '';
                            if ((string) $c->v != '') {
                                $index = (int) (string) $c->v;
                                if (isset($sharedStrings[$index])) {
                                    $value = $sharedStrings[$index];
                                }
                            }
                            break;

                        case "b":
                            // Value is boolean
                            $value = (string) $c->v;
                            if ($value == '0') {
                                $value = false;
                            } elseif ($value == '1') {
                                $value = true;
                            } else {
                                $value = (bool) $c->v;
                            }
                            break;

                        case "inlineStr":
                            // Value is rich text inline
                            $value = $this->_parseRichText($c->is);
                            break;

                        case "e":
                            // Value is an error message (empty string when <v> is empty)
                            $value = (string) $c->v;
                            break;

                        default:
                            // Value is a string
                            $value = (string) $c->v;

                            // Numeric values are normalised through int/float, so that
                            // e.g. "1.50" is emitted as 1.5 and "2.0" as 2
                            if (is_numeric($value)) {
                                if ($value == (int) $value) {
                                    $value = (int) $value;
                                } elseif ($value == (float) $value) {
                                    $value = (float) $value;
                                }
                            }
                    }

                    $documentBody[] = $value;
                }
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Field::Text('title', $fileName, 'UTF-8'));
        }
    }