/**
 * Used to extract the title, description and links from
 * a string consisting of gopher page data.
 *
 * The gopher menu is first rewritten as a small HTML document and that
 * document is then passed to the parent processor:
 *  - a run of consecutive informational ('i') lines is wrapped in one <div>,
 *    one line of text per menu line;
 *  - a line whose item type is in the accepted list and which has exactly
 *    four tab separated fields (text, selector, host, port) becomes a
 *    <div> containing a link;
 *  - every other non-empty line is emitted as plain text in its own <div>.
 *
 * Text and link targets are HTML-escaped before being inserted, since they
 * come straight from the fetched page.
 *
 * @param string $page gopher contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 *
 */
function process($page, $url)
{
    // Accept bare LF / CR line endings as well as the CRLF that was
    // originally required, so such pages are not treated as one line.
    $lines = preg_split('/\r\n|\r|\n/', $page);
    if ($lines === false) {
        $lines = explode("\r\n", $page);
    }
    $okay_types = array("0", "1", "3", "4", "5", "6", "9", "g", "h", "I");
    // Byte-wise replacement table: independent of the page's character
    // encoding, so non-UTF-8 menus are passed through untouched.
    $html_escapes = array(
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
        "'" => '&#039;'
    );
    $out_page = "<html><title></title><body>";
    $old_type = "@";
    foreach ($lines as $line) {
        if (!isset($line[0])) {
            continue;
        }
        $type = $line[0];
        // Open a <div> when a run of info lines starts, close it when
        // the run ends.
        if ($type === 'i' && $old_type !== 'i') {
            $out_page .= "<div>";
        } else if ($type !== 'i' && $old_type === 'i') {
            $out_page .= "</div>";
        }
        // Remember the type so the next line can detect a transition
        // (this assignment was missing, so runs were never detected).
        $old_type = $type;
        $line_parts = explode("\t", (string)substr($line, 1));
        $text = strtr($line_parts[0], $html_escapes);
        if ($type === 'i') {
            $out_page .= $text . "\n";
            continue;
        }
        if (!in_array($type, $okay_types, true) || count($line_parts) != 4) {
            $out_page .= "<div>{$text}</div>";
            continue;
        }
        $path = $line_parts[1];
        $host = $line_parts[2];
        $port = $line_parts[3];
        if (strncmp($path, "URL:", 4) === 0) {
            // Selector carries a complete external URL after the prefix
            $link = substr($path, 4);
        } else {
            // Loose comparison kept on purpose: numerically equal spellings
            // of the default port (e.g. "070") still count as 70.
            $port_string = ($port != "70") ? ":{$port}" : "";
            $link = "gopher://{$host}{$port_string}/{$type}{$path}";
        }
        $out_page .= "<div><a href='" . strtr($link, $html_escapes) . "'>" .
            "{$text}</a></div>";
    }
    if ($old_type === 'i') {
        // The menu ended inside a run of info lines; close its <div>
        $out_page .= "</div>";
    }
    $out_page .= "</body></html>";
    return parent::process($out_page, $url);
}
/**
 * This method is called by a PageProcessor in its handle() method
 * just after it has processed a web page. This method allows
 * an indexing plugin to do additional processing on the page
 * such as adding sub-documents, before the page summary is
 * handed back to the fetcher. For the recipe plugin a sub-document
 * will be the title of the recipe. The description will consist
 * of the ingredients of the recipe. Ingredients will be separated by
 * ||
 *
 * One sub-document is produced per node matched by the recipe marker
 * query. The ingredient query is evaluated against the whole document,
 * so every sub-document of a page carries the same ingredient list.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array consisting of a sequence of subdoc arrays found
 *     on the given page. Each subdoc array has a self::TITLE and
 *     a self::DESCRIPTION. The title is NULL when no title node exists
 *     at the sub-document's position. NULL is returned instead of an
 *     array when the page could not be turned into a DOM.
 */
function pageProcessing($page, $url)
{
    crawlLog("...Using recipe plugin to check for recipes!");
    $without_scripts = preg_replace('@<script[^>]*?>.*?</script>@si', ' ',
        $page);
    if ($without_scripts !== NULL) {
        // preg_replace yields NULL on a PCRE error; in that case keep the
        // unstripped page rather than parsing an empty document
        $page = $without_scripts;
    }
    // Put a space after every tag so text of adjacent elements stays apart
    $page = str_replace('>', '> ', $page);
    $dom = HtmlProcessor::dom($page);
    if (!$dom) {
        return NULL;
    }
    $xpath = new DOMXPath($dom);
    $recipes_per_page = $xpath->evaluate(
        "/html//ul[@class = 'ingredient-wrap'] |\n"
        . " /html//*[@class = 'pod ingredients'] |\n"
        . " /html//*[@id='recipe_title'] |\n"
        . " /html//div[@class = 'rcp-head clrfix']|\n"
        . " /html//h1[@class = 'fn recipeDetailHeading']");
    $subdocs_description = array();
    if (is_object($recipes_per_page) && $recipes_per_page->length != 0) {
        $recipes_count = $recipes_per_page->length;
        $titles = $xpath->evaluate(
            "/html//*[@id = 'itemTitle']|\n"
            . " /html//h1[@class = 'fn'] |\n"
            . " /html//*[@id='recipe_title'] |\n"
            . " /html//div[@class ='rcp-head clrfix']/h1 |\n"
            . " /html//h1[@class = 'fn recipeDetailHeading']");
        // The ingredient query does not depend on the recipe index, so it
        // is evaluated once instead of once per recipe.
        $ingredients = $xpath->evaluate(
            "/html//ul[@class = 'ingredient-wrap']/li |\n"
            . " /html//li[@class = 'ingredient']|\n"
            . " /html//*[@class = 'ingredients']/*|\n"
            . " /html//*[@itemprop='ingredients']\n ");
        $ingredient_texts = array();
        if (is_object($ingredients)) {
            foreach ($ingredients as $ingredient) {
                $content = trim($ingredient->textContent);
                // Compare against "" explicitly: empty() would also drop
                // an ingredient whose text is "0"
                if ($content !== "") {
                    $ingredient_texts[] = $content;
                }
            }
        }
        $ingredients_result = "";
        if (count($ingredient_texts) > 0) {
            // implode() puts "||" strictly between ingredients; the former
            // end() call on the node list could not identify the last one
            $ingredients_result = mb_ereg_replace("(\\s)+", " ",
                implode("||", $ingredient_texts));
        }
        for ($i = 0; $i < $recipes_count; $i++) {
            // There can be fewer title nodes than recipe markers
            $title_node = is_object($titles) ? $titles->item($i) : NULL;
            $subdocs_description[] = array(
                self::TITLE => $title_node ? $title_node->textContent : NULL,
                self::DESCRIPTION => $ingredients_result
            );
        }
    }
    $num_recipes = count($subdocs_description);
    crawlLog("...{$num_recipes} found.");
    return $subdocs_description;
}
/**
 * Used to extract the title, description and links from
 * a string consisting of ebook publication data.
 *
 * The string is opened as a zip archive. Entries whose name ends in .opf
 * are parsed for the dc:language, dc:title, dc:creator and dc:identifier
 * metadata elements (if an element occurs several times the last one
 * wins). Entries whose name ends in .html or .xhtml are run through an
 * HtmlProcessor and the 't' entry of each resulting summary is appended
 * to the description. The description is the metadata followed by that
 * text, cut to at most self::$max_description_len bytes.
 *
 * @param string $page epub contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page with the keys
 *     self::TITLE, self::DESCRIPTION, self::LANG, self::LINKS and
 *     self::PAGE
 *
 */
function process($page, $url)
{
    // The dot is escaped so only real file extensions match; the former
    // patterns accepted any character in front of "opf" / "html".
    $opf_pattern = '/\.opf$/i';
    $html_pattern = '/\.x?html$/i';
    $epub_url = array('');
    // Metadata element name => value found so far
    $metadata = array(
        "dc:language" => '',
        "dc:title" => '',
        "dc:creator" => '',
        "dc:identifier" => ''
    );
    // NOTE(review): nothing below ever assigns these three, so their
    // branches in the description are currently inert. Confirm whether
    // dc:publisher / dc:date / dc:subject were meant to be read as well.
    $epub_publisher = '';
    $epub_date = '';
    $epub_subject = '';
    $htmlcontent = '';
    // Open a zip archive
    $zip = new PartialZipArchive($page);
    $num_files = $zip->numFiles();
    for ($i = 0; $i < $num_files; $i++) {
        // get the content file names of .epub document
        $filename = $zip->getNameIndex($i);
        if (preg_match($opf_pattern, $filename)) {
            // Get the file data from zipped folder
            $opf_data = $zip->getFromName($filename);
            $opf_summary = $this->xmlToObject($opf_data);
            // Only the first two levels below the root are inspected, and
            // at most MAX_DOM_LEVEL + 1 children on each of them.
            for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                if (!isset($opf_summary->children[$m])) {
                    continue;
                }
                $section = $opf_summary->children[$m];
                for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                    if (!isset($section->children[$n])) {
                        continue;
                    }
                    $child = $section->children[$n];
                    if (!isset($child->name) || !is_string($child->name)
                        || !isset($metadata[$child->name])) {
                        continue;
                    }
                    $metadata[$child->name] = isset($child->content) ?
                        $child->content : '';
                }
            }
        } else if (preg_match($html_pattern, $filename)) {
            $html = new HtmlProcessor();
            $html_data = $zip->getFromName($filename);
            $html_summary = $html->process($html_data, $url);
            // NOTE(review): 't' is a hard-coded summary key; presumably it
            // matches one of the summary constants -- confirm.
            if (isset($html_summary['t'])) {
                $htmlcontent .= $html_summary['t'];
            }
        }
    }
    $epub_language = $metadata["dc:language"];
    $epub_title = $metadata["dc:title"];
    $epub_author = $metadata["dc:creator"];
    $epub_unique_identifier = $metadata["dc:identifier"];
    $desc = '';
    if ($epub_title != '') {
        $desc = " {$epub_title} .";
    }
    if ($epub_author != '') {
        $desc .= " {$epub_author} ";
    }
    if ($epub_language != '') {
        $desc .= " {$epub_language} ";
    }
    if ($epub_unique_identifier != '') {
        $desc .= " URN-" . $epub_unique_identifier . ".";
    }
    if ($epub_publisher != '') {
        $desc .= " {$epub_publisher} ";
    }
    if ($epub_date != '') {
        $desc .= " {$epub_date} ";
    }
    if ($epub_subject != '') {
        $desc .= " {$epub_subject} ";
    }
    $desc .= $htmlcontent;
    //restrict the length of the description to maximum description length
    if (strlen($desc) > self::$max_description_len) {
        // mb_strcut keeps the byte limit but backs up to a character
        // boundary so a multi-byte character is never cut in half
        // (assumes the description is UTF-8 -- TODO confirm)
        $desc = mb_strcut($desc, 0, self::$max_description_len, 'UTF-8');
    }
    return array(
        self::TITLE => $epub_title,
        self::DESCRIPTION => $desc,
        self::LANG => $epub_language,
        self::LINKS => $epub_url,
        self::PAGE => $page
    );
}