Esempio n. 1
0
 /**
  * Used to extract the title, description and links from
  * a docx file consisting of xml data.
  *
  * @param string $page docx(zip) contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     // DESCRIPTION, LANG and LINKS are assigned on every path below, so the
     // result is always an array.
     $summary = array();
     $zip = new PartialZipArchive($page);

     // Title: taken from the core-properties part, if it is present and
     // parses.
     $buf = $zip->getFromName("docProps/core.xml");
     if ($buf) {
         $dom = self::dom($buf);
         if ($dom !== false) {
             $summary[self::TITLE] = self::title($dom);
         }
     }

     // Description and language: taken from the main document part.
     // A part that is missing OR fails to parse gets the placeholder text;
     // an unparsable part is presumably the result of a truncated download.
     $buf = $zip->getFromName("word/document.xml");
     $dom = $buf ? self::dom($buf) : false;
     if ($dom !== false) {
         $summary[self::DESCRIPTION] = self::docText($dom);
         $summary[self::LANG] = guessLocaleFromString(
             $summary[self::DESCRIPTION], 'en-US');
     } else {
         $summary[self::DESCRIPTION] =
             "Did not download word/document.xml portion of docx file";
         $summary[self::LANG] = 'en-US';
     }

     // Links: taken from the document relationships part; an absent or
     // unparsable part yields an empty link list.
     $buf = $zip->getFromName("word/_rels/document.xml.rels");
     $dom = $buf ? self::dom($buf) : false;
     if ($dom !== false) {
         $summary[self::LINKS] = self::links($dom, $url);
     } else {
         $summary[self::LINKS] = array();
     }

     return $summary;
 }
Esempio n. 2
0
 /**
  * Used to extract the title, description and links from
  * a pptx file consisting of xml data.
  *
  * @param string $page pptx(zip) contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     // DESCRIPTION and LINKS are always assigned below, so the result is
     // always an array. TITLE and LANG are only set when they can be found.
     $summary = array();
     $zip = new PartialZipArchive($page);

     // Title: taken from the core-properties part, if it is present and
     // parses.
     $buf = $zip->getFromName("docProps/core.xml");
     if ($buf) {
         $dom = self::dom($buf);
         if ($dom !== false) {
             $summary[self::TITLE] = self::title($dom);
         }
     }

     // Slide count: read from app.xml when it is present and parses.
     // Otherwise (e.g. a partial download of the pptx file) use the number
     // of files in the archive as an upper bound.
     $buf = $zip->getFromName("docProps/app.xml");
     $dom = $buf ? self::dom($buf) : false;
     if ($dom !== false) {
         $num_slides = self::numSlides($dom);
     } else {
         $num_slides = $zip->numFiles();
     }

     $summary[self::DESCRIPTION] = "";
     $summary[self::LINKS] = array();
     $lang = NULL;
     for ($i = 1; $i <= $num_slides; $i++) {
         $buf = $zip->getFromName("ppt/slides/slide" . $i . ".xml");
         if (!$buf) {
             continue;
         }
         $dom = self::dom($buf);
         if ($dom === false) {
             // Unparsable slide: nothing can be extracted from it.
             continue;
         }

         // A slide's text is appended only if the combined length stays
         // below the maximum; otherwise that slide's text is skipped whole
         // (not truncated) and later, shorter slides may still be added.
         $description = self::slideText($dom);
         if (strlen($summary[self::DESCRIPTION]) + strlen($description) <
             self::$max_description_len) {
             $summary[self::DESCRIPTION] .= $description;
         }

         // The language comes from the first slide that reports one.
         if (!$lang) {
             $lang = self::lang($dom);
             if ($lang) {
                 $summary[self::LANG] = $lang;
             }
         }

         // Links are accumulated across all slides.
         $summary[self::LINKS] = array_merge($summary[self::LINKS],
             self::links($dom, $url));
     }

     return $summary;
 }
Esempio n. 3
0
 /**
  * Used to extract the title, description and links from
  * an xlsx file.
  *
  * @param string $page contents of xlsx file in zip format
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     // LINKS is always assigned below, so the result is always an array.
     // TITLE, DESCRIPTION and LANG are only set when their source part is
     // available.
     $summary = array();
     $zip = new PartialZipArchive($page);

     // Number of worksheets to scan for links; stays 0 when app.xml is
     // missing or unparsable, in which case no worksheet is examined.
     $file_count = 0;

     // Title and sheet count both come from app.xml.
     $buf = $zip->getFromName("docProps/app.xml");
     if ($buf) {
         $dom = self::dom($buf);
         if ($dom !== false) {
             $summary[self::TITLE] = self::title($dom);
             $file_count = self::sheetCount($dom);
         }
     }

     // Description comes from the shared strings part. The language is
     // computed whenever that part exists; if it could not be parsed there
     // is no description, and null is passed as the text (previously this
     // read an unset array key to get the same null).
     $buf = $zip->getFromName("xl/sharedStrings.xml");
     if ($buf) {
         $description = NULL;
         $dom = self::dom($buf);
         if ($dom !== false) {
             $description = self::description($dom);
             $summary[self::DESCRIPTION] = $description;
         }
         $summary[self::LANG] = self::calculateLang($description, $url);
     }

     // Links are gathered from each worksheet's relationships part.
     $summary[self::LINKS] = array();
     for ($i = 1; $i <= $file_count; $i++) {
         $buf = $zip->getFromName(
             "xl/worksheets/_rels/sheet" . $i . ".xml.rels");
         if (!$buf) {
             continue;
         }
         $dom = self::dom($buf);
         if ($dom !== false) {
             $summary[self::LINKS] = array_merge($summary[self::LINKS],
                 self::links($dom, $url));
         }
     }

     return $summary;
 }
Esempio n. 4
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of ebook publication data.
  *
  * @param string $page epub contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     // The dot is escaped so only a real file extension matches; the
     // unescaped form matched any character before "opf"/"html".
     $opf_pattern = '/\.opf$/i';
     $html_pattern = '/\.x?html$/i';

     // Dublin Core elements collected from the OPF package file, keyed by
     // element name. If an element occurs more than once the last one wins.
     $meta = array(
         "dc:language" => '',
         "dc:title" => '',
         "dc:creator" => '',
         "dc:identifier" => '',
         "dc:publisher" => '',
         "dc:date" => '',
         "dc:subject" => '',
     );
     $htmlcontent = '';

     $zip = new PartialZipArchive($page);
     $num_files = $zip->numFiles();
     for ($i = 0; $i < $num_files; $i++) {
         $name = $zip->getNameIndex($i);
         if (preg_match($opf_pattern, $name)) {
             $opf_data = $zip->getFromName($name);
             if (!$opf_data) {
                 // Entry could not be read; nothing to parse.
                 continue;
             }
             $opf_summary = $this->xmlToObject($opf_data);
             // Only the nodes two levels below the root are inspected,
             // with at most MAX_DOM_LEVEL + 1 nodes per level.
             for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                 for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                     if (!isset($opf_summary->children[$m]->children[$n])) {
                         continue;
                     }
                     $child = $opf_summary->children[$m]->children[$n];
                     if (!isset($child->name, $child->content) ||
                         !is_string($child->name)) {
                         continue;
                     }
                     if (isset($meta[$child->name])) {
                         $meta[$child->name] = $child->content;
                     }
                 }
             }
         } elseif (preg_match($html_pattern, $name)) {
             $html = new HtmlProcessor();
             $html_summary = $html->process($zip->getFromName($name), $url);
             // NOTE(review): 't' is presumably the title key of the HTML
             // summary -- confirm against HtmlProcessor.
             if (isset($html_summary['t'])) {
                 $htmlcontent .= $html_summary['t'];
             }
         }
     }

     $epub_title = $meta["dc:title"];
     $epub_author = $meta["dc:creator"];
     $epub_language = $meta["dc:language"];
     $epub_unique_identifier = $meta["dc:identifier"];
     $epub_publisher = $meta["dc:publisher"];
     $epub_date = $meta["dc:date"];
     $epub_subject = $meta["dc:subject"];

     // The description is the available metadata, in a fixed order,
     // followed by the text gathered from the html/xhtml files.
     $desc = '';
     if ($epub_title != '') {
         $desc = " {$epub_title} .";
     }
     if ($epub_author != '') {
         $desc .= " {$epub_author} ";
     }
     if ($epub_language != '') {
         $desc .= " {$epub_language} ";
     }
     if ($epub_unique_identifier != '') {
         $desc .= " URN-" . $epub_unique_identifier . ".";
     }
     if ($epub_publisher != '') {
         $desc .= " {$epub_publisher} ";
     }
     if ($epub_date != '') {
         $desc .= " {$epub_date} ";
     }
     if ($epub_subject != '') {
         $desc .= " {$epub_subject} ";
     }
     $desc .= $htmlcontent;

     // Restrict the description to the maximum description length
     // (measured in bytes).
     if (strlen($desc) > self::$max_description_len) {
         $desc = substr($desc, 0, self::$max_description_len);
     }

     $summary = array();
     $summary[self::TITLE] = $epub_title;
     $summary[self::DESCRIPTION] = $desc;
     $summary[self::LANG] = $epub_language;
     // No links are extracted from the epub; as before, LINKS is a
     // one-element array holding an empty string.
     $summary[self::LINKS] = array(0 => '');
     $summary[self::PAGE] = $page;
     return $summary;
 }