/**
 * Used to extract the title, description and links from
 * a string consisting of gopher page data.
 *
 * The gopher menu is first rewritten as a small HTML document: runs of
 * informational ('i') lines are wrapped together in a single <div>, each
 * recognised menu item becomes a <div> holding an anchor, and anything
 * else becomes a plain <div>. That document is then passed to the parent
 * processor, which produces the actual summary.
 *
 * @param string $page gopher contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    $lines = explode("\r\n", $page);
    $out_page = "<html><title></title><body>";
    $old_type = "@";
    $okay_types = array("0", "1", "3", "4", "5", "6", "9", "g", "h", "I");
    foreach ($lines as $line) {
        if (!isset($line[0])) {
            continue;
        }
        $type = $line[0];
        if ($type !== $old_type) {
            if ($type === 'i') {
                // start of a run of informational lines
                $out_page .= "<div>";
            } else if ($old_type === 'i') {
                // the run of informational lines has just ended
                $out_page .= "</div>";
            }
        }
        // remember the type so the next iteration can detect a transition
        $old_type = $type;
        $line_parts = explode("\t", substr($line, 1));
        /* The menu text comes from a remote server, so it is escaped
           before being placed in markup. A single-byte charset is named
           so that bytes >= 0x80 pass through untouched whatever the
           page's real encoding is; only & < > " ' are rewritten.
         */
        $display = htmlspecialchars($line_parts[0], ENT_QUOTES,
            'ISO-8859-1');
        if ($type === 'i') {
            $out_page .= $display . "\n";
            continue;
        }
        if (!in_array($type, $okay_types, true) ||
            count($line_parts) != 4) {
            $out_page .= "<div>{$display}</div>";
            continue;
        }
        $path = $line_parts[1];
        $host = $line_parts[2];
        $port = $line_parts[3];
        if (substr($path, 0, 4) == "URL:") {
            // selector carries a complete non-gopher URL
            $link = substr($path, 4);
        } else {
            // loose comparison is deliberate: "070" still counts as 70
            $port_string = ($port != "70") ? ":{$port}" : "";
            $link = "gopher://{$host}{$port_string}/{$type}{$path}";
        }
        $link = htmlspecialchars($link, ENT_QUOTES, 'ISO-8859-1');
        $out_page .= "<div><a href='{$link}'>{$display}</a></div>";
    }
    if ($old_type === 'i') {
        // input ended inside a run of informational lines
        $out_page .= "</div>";
    }
    $out_page .= "</body></html>";
    return parent::process($out_page, $url);
}
/**
 * Used to extract the title, description and links from
 * a string consisting of ebook publication data.
 *
 * The page is opened as a zip archive. Dublin Core fields are read from
 * any entry whose name ends in .opf, and every entry whose name ends in
 * .html or .xhtml is run through an HtmlProcessor. The description is the
 * metadata fields followed by the text collected from the HTML entries,
 * cut to at most self::$max_description_len bytes.
 *
 * @param string $page epub contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    // the dots are escaped so only real file extensions match
    $opf_pattern = '/\.opf$/i';
    $html_pattern = '/\.x?html$/i';
    $epub_url = array('');
    // OPF element name => last content seen for that element
    $dc_fields = array(
        "dc:language" => '',
        "dc:title" => '',
        "dc:creator" => '',
        "dc:identifier" => '',
        "dc:publisher" => '',
        "dc:date" => '',
        "dc:subject" => '',
    );
    $htmlcontent = '';
    // Open a zip archive
    $zip = new PartialZipArchive($page);
    $num_files = $zip->numFiles();
    for ($i = 0; $i < $num_files; $i++) {
        // name of the i-th entry of the .epub document
        $entry_name = $zip->getNameIndex($i);
        if (preg_match($opf_pattern, $entry_name)) {
            // Get the file data from zipped folder
            $opf_data = $zip->getFromName($entry_name);
            $opf_summary = $this->xmlToObject($opf_data);
            /* Only grandchildren of the root, at indices up to
               MAX_DOM_LEVEL in each direction, are examined.
             */
            for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                    if (!isset($opf_summary->children[$m]->children[$n])) {
                        continue;
                    }
                    $child = $opf_summary->children[$m]->children[$n];
                    if (!isset($child->name) || !is_string($child->name)
                        || !array_key_exists($child->name, $dc_fields)) {
                        continue;
                    }
                    $dc_fields[$child->name] = isset($child->content) ?
                        $child->content : '';
                }
            }
        } else if (preg_match($html_pattern, $entry_name)) {
            $html = new HtmlProcessor();
            $html_data = $zip->getFromName($entry_name);
            $html_summary = $html->process($html_data, $url);
            /* NOTE(review): 't' is presumably the title key of the
               HtmlProcessor summary -- confirm that the title rather
               than the description is what should be collected here.
             */
            if (isset($html_summary['t'])) {
                $htmlcontent .= $html_summary['t'];
            }
        }
    }
    $epub_title = $dc_fields["dc:title"];
    $epub_author = $dc_fields["dc:creator"];
    $epub_language = $dc_fields["dc:language"];
    $epub_unique_identifier = $dc_fields["dc:identifier"];
    $epub_publisher = $dc_fields["dc:publisher"];
    $epub_date = $dc_fields["dc:date"];
    $epub_subject = $dc_fields["dc:subject"];
    $desc = '';
    if ($epub_title != '') {
        $desc = " {$epub_title} .";
    }
    if ($epub_author != '') {
        $desc .= " {$epub_author} ";
    }
    if ($epub_language != '') {
        $desc .= " {$epub_language} ";
    }
    if ($epub_unique_identifier != '') {
        $desc .= " URN-" . $epub_unique_identifier . ".";
    }
    if ($epub_publisher != '') {
        $desc .= " {$epub_publisher} ";
    }
    if ($epub_date != '') {
        $desc .= " {$epub_date} ";
    }
    if ($epub_subject != '') {
        $desc .= " {$epub_subject} ";
    }
    $desc .= $htmlcontent;
    // restrict the description to the maximum description length (bytes)
    if (strlen($desc) > self::$max_description_len) {
        $desc = substr($desc, 0, self::$max_description_len);
    }
    $summary = array();
    $summary[self::TITLE] = $epub_title;
    $summary[self::DESCRIPTION] = $desc;
    $summary[self::LANG] = $epub_language;
    $summary[self::LINKS] = $epub_url;
    $summary[self::PAGE] = $page;
    return $summary;
}