Exemple #1
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of gopher page data.
  *
  * The gopher menu is rewritten line by line into a small HTML document:
  * runs of informational ('i') lines are grouped into a single <div>,
  * selectable items of a recognised type become <a> links, and anything
  * else is emitted as plain text in its own <div>. That HTML is then
  * passed to the parent processor's process() method, whose result is
  * returned unchanged.
  *
  * @param string $page gopher contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page, as produced by
  *    parent::process() for the generated HTML
  *
  */
 function process($page, $url)
 {
     $lines = explode("\r\n", $page);
     $out_page = "<html><title></title><body>";
     // '@' is a placeholder that never equals a type handled below
     $old_type = "@";
     $okay_types = array("0", "1", "3", "4", "5", "6", "9", "g", "h", "I");
     foreach ($lines as $line) {
         if (!isset($line[0])) {
             continue;
         }
         // the first character of a menu line is its item type
         $type = $line[0];
         if ($type != $old_type) {
             if ($type == 'i') {
                 // start of a run of informational lines
                 $out_page .= "<div>";
             } else if ($old_type == 'i') {
                 // a run of informational lines just ended
                 $out_page .= "</div>";
             }
         }
         // remember the type so runs of 'i' lines share a single <div>
         $old_type = $type;
         $line_parts = explode("\t", substr($line, 1));
         /* The page is untrusted crawl data, so it is escaped before being
            placed in markup. ISO-8859-1 is used only so every byte is
            accepted as-is and just & < > " ' are rewritten; this keeps
            multi-byte text intact whatever the page's real encoding is.
            NOTE(review): assumes parent::process() parses this markup
            with an entity-decoding HTML parser -- confirm.
          */
         $text = htmlspecialchars($line_parts[0], ENT_QUOTES, 'ISO-8859-1');
         if ($type == 'i') {
             $out_page .= $text . "\n";
             continue;
         }
         if (!in_array($type, $okay_types, true) || count($line_parts) != 4) {
             // unknown item type or malformed line: keep only its text
             $out_page .= "<div>{$text}</div>";
             continue;
         }
         // remaining fields are: selector path, host, port
         $path = $line_parts[1];
         $host = $line_parts[2];
         $port = trim($line_parts[3]);
         if (strncmp($path, "URL:", 4) === 0) {
             // selector carries a complete non-gopher url
             $link = substr($path, 4);
         } else {
             // 70 is the default gopher port, so it is left out of the url
             $port_string = ($port !== "70") ? ":{$port}" : "";
             $link = "gopher://{$host}{$port_string}/{$type}{$path}";
         }
         $link = htmlspecialchars($link, ENT_QUOTES, 'ISO-8859-1');
         $out_page .= "<div><a href='{$link}'>{$text}</a></div>";
     }
     if ($old_type == 'i') {
         // page ended inside a run of informational lines
         $out_page .= "</div>";
     }
     $out_page .= "</body></html>";
     return parent::process($out_page, $url);
 }
Exemple #2
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of ebook publication data.
  *
  * The archive is walked entry by entry. Entries whose name ends in .opf
  * are converted with xmlToObject() and searched, two levels deep, for
  * Dublin Core elements (dc:title, dc:creator, dc:language,
  * dc:identifier, dc:publisher, dc:date, dc:subject); when an element
  * occurs more than once the last occurrence wins. Entries whose name
  * ends in .html or .xhtml are run through an HtmlProcessor and the 't'
  * field of each result is appended to the description. The description
  * is truncated to self::$max_description_len bytes.
  *
  * @param string $page epub contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page with the
  *    self::TITLE, self::DESCRIPTION, self::LANG, self::LINKS and
  *    self::PAGE fields set
  *
  */
 function process($page, $url)
 {
     // the dot is escaped so that only a real file extension matches
     $opf_pattern = '/\.opf$/i';
     $html_pattern = '/\.x?html$/i';
     $epub_url = array(0 => '');
     // Dublin Core element name => content found in the package file
     $metadata = array(
         "dc:title" => '',
         "dc:creator" => '',
         "dc:language" => '',
         "dc:identifier" => '',
         "dc:publisher" => '',
         "dc:date" => '',
         "dc:subject" => '',
     );
     $htmlcontent = '';
     // Open a zip archive
     $zip = new PartialZipArchive($page);
     $num_files = $zip->numFiles();
     for ($i = 0; $i < $num_files; $i++) {
         // get the content file names of .epub document
         $filename = $zip->getNameIndex($i);
         if (preg_match($opf_pattern, $filename)) {
             // Get the file data from zipped folder
             $opf_data = $zip->getFromName($filename);
             $opf_summary = $this->xmlToObject($opf_data);
             // only the first MAX_DOM_LEVEL + 1 nodes of each level are read
             for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                 for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                     if (!isset($opf_summary->children[$m]->children[$n])) {
                         continue;
                     }
                     $child = $opf_summary->children[$m]->children[$n];
                     if (!isset($child->name, $child->content) ||
                         !is_string($child->name)) {
                         continue;
                     }
                     if (isset($metadata[$child->name])) {
                         $metadata[$child->name] = $child->content;
                     }
                 }
             }
         } else if (preg_match($html_pattern, $filename)) {
             $html = new HtmlProcessor();
             $html_data = $zip->getFromName($filename);
             $html_summary = $html->process($html_data, $url);
             /* NOTE(review): 't' is presumably the value of one of the
                summary field constants; it is kept as a literal because
                which one cannot be told from here -- confirm.
              */
             if (isset($html_summary['t'])) {
                 $htmlcontent .= $html_summary['t'];
             }
         }
     }
     $epub_title = $metadata["dc:title"];
     $epub_language = $metadata["dc:language"];
     $desc = '';
     if ($epub_title !== '') {
         $desc = " {$epub_title} .";
     }
     if ($metadata["dc:creator"] !== '') {
         $desc .= " {$metadata["dc:creator"]} ";
     }
     if ($epub_language !== '') {
         $desc .= " {$epub_language} ";
     }
     if ($metadata["dc:identifier"] !== '') {
         $desc .= " URN-" . $metadata["dc:identifier"] . ".";
     }
     foreach (array("dc:publisher", "dc:date", "dc:subject") as $field) {
         if ($metadata[$field] !== '') {
             $desc .= " {$metadata[$field]} ";
         }
     }
     $desc .= $htmlcontent;
     // restrict the length of the description to maximum description
     // length; strlen/substr make this a limit in bytes
     if (strlen($desc) > self::$max_description_len) {
         $desc = substr($desc, 0, self::$max_description_len);
     }
     $summary = array();
     $summary[self::TITLE] = $epub_title;
     $summary[self::DESCRIPTION] = $desc;
     $summary[self::LANG] = $epub_language;
     $summary[self::LINKS] = $epub_url;
     $summary[self::PAGE] = $page;
     return $summary;
 }