Beispiel #1
0
 /**
  * Converts a gopher menu into a small HTML document and hands that
  * document to the parent processor, whose summary is returned.
  *
  * The page is split on CRLF and each non-empty line is read as
  *     <one type character><text>TAB<selector>TAB<host>TAB<port>
  * and handled as follows:
  *  - type 'i' (informational) lines are copied as text; each run of
  *    consecutive 'i' lines is wrapped in a single <div>;
  *  - lines whose type is in $okay_types and which have exactly four tab
  *    separated fields become <div><a href='...'>text</a></div>. A
  *    selector starting with "URL:" is used as the link itself, otherwise
  *    a gopher:// url is built from host, port (omitted when "70"),
  *    type and selector;
  *  - any other line is emitted as text in its own <div>.
  *
  * All text and link values are HTML escaped before being written.
  *
  * @param string $page gopher contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page, as produced by
  *    parent::process() on the generated HTML
  *
  */
 function process($page, $url)
 {
     $lines = explode("\r\n", $page);
     $out_page = "<html><title></title><body>";
     // "@" is a start sentinel: it is neither 'i' nor a supported type
     $old_type = "@";
     $okay_types = array("0", "1", "3", "4", "5", "6", "9", "g", "h", "I");
     // Escaping is done byte-wise with strtr() rather than with
     // htmlspecialchars() so that pages in any ASCII compatible encoding
     // pass through unchanged apart from the five special characters.
     $html_escapes = array(
         '&' => '&amp;',
         '<' => '&lt;',
         '>' => '&gt;',
         '"' => '&quot;',
         "'" => '&#039;'
     );
     foreach ($lines as $line) {
         if (!isset($line[0])) {
             continue;
         }
         $type = $line[0];
         if ($type != $old_type) {
             if ($type == 'i') {
                 $out_page .= "<div>";
             } else if ($old_type == 'i') {
                 $out_page .= "</div>";
             }
         }
         // remember the type so runs of 'i' lines share one <div>
         $old_type = $type;
         $line_parts = explode("\t", substr($line, 1));
         $text = strtr($line_parts[0], $html_escapes);
         if ($type == 'i') {
             $out_page .= $text . "\n";
         } else if (in_array($type, $okay_types, true)
             && count($line_parts) == 4) {
             $path = $line_parts[1];
             $host = $line_parts[2];
             $port = $line_parts[3];
             if (substr($path, 0, 4) == "URL:") {
                 $link = substr($path, 4);
             } else {
                 // 70 is the port left implicit in generated gopher urls
                 $port_string = ($port != "70") ? ":{$port}" : "";
                 $link = "gopher://{$host}{$port_string}/{$type}{$path}";
             }
             $link = strtr($link, $html_escapes);
             $out_page .= "<div><a href='{$link}'>{$text}</a></div>";
         } else {
             $out_page .= "<div>{$text}</div>";
         }
     }
     if ($old_type == 'i') {
         // the page ended inside a run of informational lines
         $out_page .= "</div>";
     }
     $out_page .= "</body></html>";
     return parent::process($out_page, $url);
 }
Beispiel #2
0
 /**
  * This method is called by a PageProcessor in its handle() method
  * just after it has processed a web page. This method allows
  * an indexing plugin to do additional processing on the page
  * such as adding sub-documents, before the page summary is
  * handed back to the fetcher. For the recipe plugin a sub-document
  * will be the title of the recipe. The description will consist
  * of the ingredients of the recipe. Ingredients are separated by
  * || (there is no trailing separator).
  *
  * Script elements are stripped and a space is added after every '>'
  * before the page is parsed. One sub-document is produced per node
  * matched by the recipe marker query. The i-th sub-document takes its
  * title from the i-th node matched by the title query. The ingredient
  * query is page wide, so every sub-document of a page carries the same
  * description.
  *
  * @param string $page web-page contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array consisting of a sequence of subdoc arrays found
  *     on the given page. Each subdoc array has a self::TITLE and
  *     a self::DESCRIPTION. self::TITLE is null when the page has fewer
  *     title nodes than recipe markers. NULL is returned instead of an
  *     array when the page could not be turned into a DOM.
  */
 function pageProcessing($page, $url)
 {
     crawlLog("...Using recipe plugin to check for recipes!");
     $page = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
     // a space after each tag keeps the text of adjacent elements apart
     $page = preg_replace('/>/', '> ', $page);
     $dom = HtmlProcessor::dom($page);
     if ($dom == NULL) {
         return NULL;
     }
     $xpath = new DOMXPath($dom);
     $recipes_per_page = $xpath->evaluate("/html//ul[@class = 'ingredient-wrap'] |\n            /html//*[@class = 'pod ingredients'] |\n            /html//*[@id='recipe_title'] |\n            /html//div[@class = 'rcp-head clrfix']|\n            /html//h1[@class = 'fn recipeDetailHeading']");
     $subdocs_description = array();
     if (is_object($recipes_per_page) && $recipes_per_page->length != 0) {
         $recipes_count = $recipes_per_page->length;
         $titles = $xpath->evaluate("/html//*[@id = 'itemTitle']|\n               /html//h1[@class = 'fn'] |\n               /html//*[@id='recipe_title'] |\n               /html//div[@class ='rcp-head clrfix']/h1 |\n               /html//h1[@class = 'fn recipeDetailHeading']");
         // The ingredient query does not depend on which recipe is being
         // handled, so it is evaluated once rather than once per recipe.
         $ingredients = $xpath->evaluate("/html//ul[@class = 'ingredient-wrap']/li |\n                    /html//li[@class = 'ingredient']|\n                    /html//*[@class = 'ingredients']/*|\n                    /html//*[@itemprop='ingredients']\n                    ");
         $ingredient_texts = array();
         if (is_object($ingredients)) {
             foreach ($ingredients as $ingredient) {
                 $content = trim($ingredient->textContent);
                 if (!empty($content)) {
                     $ingredient_texts[] = $content;
                 }
             }
         }
         // implode() puts the separator only between ingredients, and the
         // replace collapses each whitespace run to one space
         $ingredients_result = mb_ereg_replace("(\\s)+", " ",
             implode("||", $ingredient_texts));
         for ($i = 0; $i < $recipes_count; $i++) {
             // item() returns null for an index past the end of the list
             $title_node = is_object($titles) ? $titles->item($i) : null;
             $recipe = array();
             $recipe[self::TITLE] = ($title_node === null) ? null :
                 $title_node->textContent;
             $recipe[self::DESCRIPTION] = $ingredients_result;
             $subdocs_description[] = $recipe;
         }
     }
     $num_recipes = count($subdocs_description);
     crawlLog("...{$num_recipes} found.");
     return $subdocs_description;
 }
Beispiel #3
0
 /**
  * Used to extract the title, description and links from
  * a string consisting of ebook publication data.
  *
  * The data is opened as a zip archive and every entry is examined:
  *  - entries whose name ends in ".opf" are parsed with xmlToObject()
  *    and searched, two levels below the root, for the Dublin Core
  *    elements dc:title, dc:creator, dc:language, dc:identifier,
  *    dc:publisher, dc:date and dc:subject. If an element occurs more
  *    than once the last occurrence is used;
  *  - entries whose name ends in ".html" or ".xhtml" are run through
  *    an HtmlProcessor and the 't' field of each result is appended to
  *    the description.
  *
  * The description is the metadata followed by the collected html
  * text, cut to at most self::$max_description_len bytes.
  *
  * @param string $page epub contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page with fields
  *    self::TITLE, self::DESCRIPTION, self::LANG, self::LINKS and
  *    self::PAGE
  *
  */
 function process($page, $url)
 {
     // the dot is escaped so only a real file extension matches
     $opf_pattern = '/\.opf$/i';
     $html_pattern = '/\.x?html$/i';
     // metadata element name => content found so far
     $meta = array(
         'dc:title' => '',
         'dc:creator' => '',
         'dc:language' => '',
         'dc:identifier' => '',
         'dc:publisher' => '',
         'dc:date' => '',
         'dc:subject' => ''
     );
     $htmlcontent = '';
     // Open a zip archive
     $zip = new PartialZipArchive($page);
     $num_files = $zip->numFiles();
     for ($i = 0; $i < $num_files; $i++) {
         // get the content file names of .epub document
         $filename = $zip->getNameIndex($i);
         if (preg_match($opf_pattern, $filename)) {
             // Get the file data from zipped folder
             $opf_data = $zip->getFromName($filename);
             $opf_summary = $this->xmlToObject($opf_data);
             for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                 for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                     if (!isset($opf_summary->children[$m]->children[$n])) {
                         continue;
                     }
                     $child = $opf_summary->children[$m]->children[$n];
                     if (isset($child->name, $child->content)
                         && is_string($child->name)
                         && array_key_exists($child->name, $meta)) {
                         $meta[$child->name] = $child->content;
                     }
                 }
             }
         } else if (preg_match($html_pattern, $filename)) {
             $html = new HtmlProcessor();
             $html_data = $zip->getFromName($filename);
             $html_summary = $html->process($html_data, $url);
             // NOTE(review): 't' is presumably the value of one of the
             // summary field constants (self::DESCRIPTION?) - confirm.
             // The literal is kept so the same field is read as before.
             if (isset($html_summary['t'])) {
                 $htmlcontent .= $html_summary['t'];
             }
         }
     }
     $epub_title = $meta['dc:title'];
     $epub_language = $meta['dc:language'];
     $desc = '';
     if ($epub_title != '') {
         $desc = " {$epub_title} .";
     }
     if ($meta['dc:creator'] != '') {
         $desc .= " {$meta['dc:creator']} ";
     }
     if ($epub_language != '') {
         $desc .= " {$epub_language} ";
     }
     if ($meta['dc:identifier'] != '') {
         $desc .= " URN-" . $meta['dc:identifier'] . ".";
     }
     if ($meta['dc:publisher'] != '') {
         $desc .= " {$meta['dc:publisher']} ";
     }
     if ($meta['dc:date'] != '') {
         $desc .= " {$meta['dc:date']} ";
     }
     if ($meta['dc:subject'] != '') {
         $desc .= " {$meta['dc:subject']} ";
     }
     $desc .= $htmlcontent;
     //restrict the length of the description to maximum description length
     if (strlen($desc) > self::$max_description_len) {
         $desc = substr($desc, 0, self::$max_description_len);
     }
     $summary = array();
     $summary[self::TITLE] = $epub_title;
     $summary[self::DESCRIPTION] = $desc;
     $summary[self::LANG] = $epub_language;
     // no links are extracted; a single empty entry is returned as before
     $summary[self::LINKS] = array('');
     $summary[self::PAGE] = $page;
     return $summary;
 }