Esempio n. 1
0
 /**
  * Builds a page summary (title, description, language, links) from the
  * raw contents of an EPUB publication.
  *
  * The contents are opened as a zip archive. Dublin Core metadata is read
  * from every member whose name ends in .opf, and for every member whose
  * name ends in .html or .xhtml the 't' entry of the summary returned by
  * HtmlProcessor is appended to the description.
  *
  * @param string $page epub contents
  * @param string $url the url where the page contents came from; it is
  *    forwarded to HtmlProcessor for each HTML member
  *
  * @return array a summary of the contents of the page, holding the keys
  *    self::TITLE, self::DESCRIPTION, self::LANG, self::LINKS and
  *    self::PAGE
  */
 function process($page, $url)
 {
     // The dot is escaped so only genuine extensions match; an unescaped
     // "." would also accept names such as "fooopf" or "nothtml".
     $opf_pattern = '/\.opf$/i';
     $html_pattern = '/\.x?html$/i';
     // Dublin Core element name => last value seen in a package document.
     $metadata = array(
         'dc:language' => '',
         'dc:title' => '',
         'dc:creator' => '',
         'dc:identifier' => '',
         'dc:publisher' => '',
         'dc:date' => '',
         'dc:subject' => '',
     );
     $html_content = '';
     $zip = new PartialZipArchive($page);
     $num_files = $zip->numFiles();
     for ($i = 0; $i < $num_files; $i++) {
         $name = $zip->getNameIndex($i);
         if (preg_match($opf_pattern, $name)) {
             $opf_summary = $this->xmlToObject($zip->getFromName($name));
             // Only the first MAX_DOM_LEVEL + 1 children at each of the
             // two levels are inspected, as in the original scan.
             for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                 if (!isset($opf_summary->children[$m])) {
                     continue;
                 }
                 $section = $opf_summary->children[$m];
                 for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                     if (!isset($section->children[$n])) {
                         continue;
                     }
                     $child = $section->children[$n];
                     // Skip nodes without a usable name instead of
                     // dereferencing a missing property.
                     if (!isset($child->name) || !is_string($child->name)
                         || !isset($metadata[$child->name])) {
                         continue;
                     }
                     $metadata[$child->name] = isset($child->content) ?
                         $child->content : '';
                 }
             }
         } elseif (preg_match($html_pattern, $name)) {
             $html = new HtmlProcessor();
             $html_summary = $html->process($zip->getFromName($name), $url);
             // Ignore summaries that carry no 't' entry.
             if (isset($html_summary['t'])) {
                 $html_content .= $html_summary['t'];
             }
         }
     }
     $epub_title = $metadata['dc:title'];
     $epub_language = $metadata['dc:language'];
     $desc = '';
     if ($epub_title != '') {
         $desc = " {$epub_title} .";
     }
     foreach (array('dc:creator', 'dc:language') as $key) {
         if ($metadata[$key] != '') {
             $desc .= " {$metadata[$key]} ";
         }
     }
     if ($metadata['dc:identifier'] != '') {
         $desc .= " URN-" . $metadata['dc:identifier'] . ".";
     }
     foreach (array('dc:publisher', 'dc:date', 'dc:subject') as $key) {
         if ($metadata[$key] != '') {
             $desc .= " {$metadata[$key]} ";
         }
     }
     $desc .= $html_content;
     // Cap the description at the maximum description length. strlen and
     // substr count bytes, so the cut is made at a byte offset.
     if (strlen($desc) > self::$max_description_len) {
         $desc = substr($desc, 0, self::$max_description_len);
     }
     $summary = array();
     $summary[self::TITLE] = $epub_title;
     $summary[self::DESCRIPTION] = $desc;
     $summary[self::LANG] = $epub_language;
     // No links are collected here; the single empty entry is kept so the
     // returned shape is the same as before.
     $summary[self::LINKS] = array('');
     $summary[self::PAGE] = $page;
     return $summary;
 }