PHP HtmlProcessor примеры использования

Язык программирования: PHP

Класс/Тип: HtmlProcessor

Примеров на hotexamples.com: 3

PHP HtmlProcessor - 3 примера найдено. Это лучшие примеры PHP кода для HtmlProcessor, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

process(2)

dom(1)

Пример #1

Показать файл

Файл: gopher_processor.php Проект: yakar/yioop

 /**
  * Used to extract the title, description and links from
  * a string consisting of gopher page data.
  *
  * @param string $page gopher contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     // The page is split on CRLF; on each line the first character is
     // treated as the item type and the rest as tab-separated fields
     // (display text, selector path, host, port).
     $lines = explode("\r\n", $page);
     $out_page = "<html><title></title><body>";
     // "@" is a sentinel meaning "no typed line has been seen yet"
     $old_type = "@";
     $okay_types = array("0", "1", "3", "4", "5", "6", "9", "g", "h", "I");
     // The fetched page is untrusted, so everything interpolated into the
     // generated HTML is escaped; ENT_SUBSTITUTE keeps text with invalid
     // byte sequences from being turned into an empty string.
     $escape_flags = ENT_QUOTES | ENT_SUBSTITUTE;
     foreach ($lines as $line) {
         if (!isset($line[0])) {
             continue;
         }
         $type = $line[0];
         // A run of consecutive 'i' (informational) lines is wrapped in a
         // single <div>: open it when a run starts, close it when it ends.
         if ($type !== $old_type) {
             if ($type === 'i') {
                 $out_page .= "<div>";
             } else if ($old_type === 'i') {
                 $out_page .= "</div>";
             }
         }
         // Remember the type so the next line can detect a run boundary
         // (without this the comparison above is against "@" forever).
         $old_type = $type;
         $line_parts = explode("\t", substr($line, 1));
         $text = htmlspecialchars($line_parts[0], $escape_flags, 'UTF-8');
         if ($type === 'i') {
             $out_page .= $text . "\n";
             continue;
         }
         if (in_array($type, $okay_types, true) && count($line_parts) == 4) {
             $path = $line_parts[1];
             $host = $line_parts[2];
             $port = $line_parts[3];
             if (strncmp($path, "URL:", 4) === 0) {
                 // selector carries a complete external url
                 $link = substr($path, 4);
             } else {
                 // port 70 is left out of the generated gopher url
                 $port_string = ($port != "70") ? ":{$port}" : "";
                 $link = "gopher://{$host}{$port_string}/{$type}{$path}";
             }
             $link = htmlspecialchars($link, $escape_flags, 'UTF-8');
             $out_page .= "<div><a href='{$link}'>{$text}</a></div>";
         } else {
             // unknown item type or malformed line: keep only its text
             $out_page .= "<div>{$text}</div>";
         }
     }
     // close a run of informational lines that reaches the end of the page
     if ($old_type === 'i') {
         $out_page .= "</div>";
     }
     $out_page .= "</body></html>";
     // hand the synthesized HTML to the parent processor for summarizing
     return parent::process($out_page, $url);
 }

Пример #2

Показать файл

Файл: recipe_plugin.php Проект: yakar/yioop

 /**
  * This method is called by a PageProcessor in its handle() method
  * just after it has processed a web page. This method allows
  * an indexing plugin to do additional processing on the page
  * such as adding sub-documents, before the page summary is
  * handed back to the fetcher. For the recipe plugin a sub-document
  * will be the title of the recipe. The description will consist
  * of the ingredients of the recipe. Ingredients will be separated by
  * ||
  *
  * @param string $page web-page contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array consisting of a sequence of subdoc arrays found
  *     on the given page. Each subdoc array has a self::TITLE and
  *     a self::DESCRIPTION
  */
 function pageProcessing($page, $url)
 {
     crawlLog("...Using recipe plugin to check for recipes!");
     // Drop script blocks and put a space after every '>' so that text of
     // adjacent elements does not run together in textContent.
     $page = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
     $page = preg_replace('/>/', '> ', $page);
     $dom = HtmlProcessor::dom($page);
     // loose comparison on purpose: bail out on any empty/failed result
     if ($dom == NULL) {
         return NULL;
     }
     $xpath = new DOMXPath($dom);
     $recipes_per_page = $xpath->evaluate("/html//ul[@class = 'ingredient-wrap'] |\n            /html//*[@class = 'pod ingredients'] |\n            /html//*[@id='recipe_title'] |\n            /html//div[@class = 'rcp-head clrfix']|\n            /html//h1[@class = 'fn recipeDetailHeading']");
     $subdocs_description = array();
     if (is_object($recipes_per_page) && $recipes_per_page->length != 0) {
         $recipes_count = $recipes_per_page->length;
         $titles = $xpath->evaluate("/html//*[@id = 'itemTitle']|\n               /html//h1[@class = 'fn'] |\n               /html//*[@id='recipe_title'] |\n               /html//div[@class ='rcp-head clrfix']/h1 |\n               /html//h1[@class = 'fn recipeDetailHeading']");
         // The ingredient query does not depend on the recipe index, so it
         // is evaluated once and its result shared by every sub-document.
         $ingredients = $xpath->evaluate("/html//ul[@class = 'ingredient-wrap']/li |\n                    /html//li[@class = 'ingredient']|\n                    /html//*[@class = 'ingredients']/*|\n                    /html//*[@itemprop='ingredients']\n                    ");
         $ingredient_texts = array();
         if (is_object($ingredients)) {
             foreach ($ingredients as $ingredient) {
                 $content = trim($ingredient->textContent);
                 if ($content !== "") {
                     $ingredient_texts[] = $content;
                 }
             }
         }
         $ingredients_result = "";
         if (count($ingredient_texts) > 0) {
             // implode puts "||" only between ingredients, never after the
             // last one; whitespace runs are then collapsed to one space
             $ingredients_result = mb_ereg_replace("(\\s)+", " ",
                 implode("||", $ingredient_texts));
         }
         $has_titles = is_object($titles);
         for ($i = 0; $i < $recipes_count; $i++) {
             // there may be fewer title nodes than matched recipe nodes;
             // item() returns null for an index that is out of range
             $title_node = $has_titles ? $titles->item($i) : null;
             $recipe = array();
             $recipe[self::TITLE] = ($title_node !== null) ?
                 $title_node->textContent : "";
             $recipe[self::DESCRIPTION] = $ingredients_result;
             $subdocs_description[] = $recipe;
         }
     }
     $num_recipes = count($subdocs_description);
     crawlLog("...{$num_recipes} found.");
     return $subdocs_description;
 }

Пример #3

Показать файл

Файл: epub_processor.php Проект: yakar/yioop

 /**
  * Used to extract the title, description and links from
  * a string consisting of ebook publication data.
  *
  * @param string $page epub contents
  * @param string $url the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     // The dot is escaped so only a real file extension matches (the
     // unescaped form also matched names that merely ended in "opf"/"html").
     $opf_pattern = '/\.opf$/i';
     // covers both .html and .xhtml content documents
     $html_pattern = '/\.x?html$/i';
     $epub_url = array('');
     $epub_language = '';
     $epub_title = '';
     $epub_unique_identifier = '';
     $epub_author = '';
     $htmlcontent = '';
     // Open the epub (a zip archive) from the in-memory page contents
     $zip = new PartialZipArchive($page);
     $num_files = $zip->numFiles();
     for ($i = 0; $i < $num_files; $i++) {
         // name of the i-th entry inside the archive
         $filename = $zip->getNameIndex($i);
         if (preg_match($opf_pattern, $filename)) {
             // the .opf package file carries the publication metadata
             $opf_summary = $this->xmlToObject($zip->getFromName($filename));
             // scan the first two levels of the parsed tree for the
             // Dublin Core elements of interest
             for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                 for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                     if (!isset($opf_summary->children[$m]->children[$n])) {
                         continue;
                     }
                     $child = $opf_summary->children[$m]->children[$n];
                     if (!isset($child->name)) {
                         continue;
                     }
                     $content = isset($child->content) ? $child->content : '';
                     switch ($child->name) {
                         case "dc:language":
                             $epub_language = $content;
                             break;
                         case "dc:title":
                             $epub_title = $content;
                             break;
                         case "dc:creator":
                             $epub_author = $content;
                             break;
                         case "dc:identifier":
                             $epub_unique_identifier = $content;
                             break;
                     }
                 }
             }
         } else if (preg_match($html_pattern, $filename)) {
             $html = new HtmlProcessor();
             $html_summary = $html->process($zip->getFromName($filename),
                 $url);
             // NOTE(review): 't' is presumably the same key as
             // self::DESCRIPTION -- confirm against the constants class
             if (isset($html_summary['t'])) {
                 $htmlcontent .= $html_summary['t'];
             }
         }
     }
     // Assemble the description: metadata first, then the text gathered
     // from the html content documents.
     $desc = '';
     if ($epub_title != '') {
         $desc = " {$epub_title} .";
     }
     if ($epub_author != '') {
         $desc .= " {$epub_author} ";
     }
     if ($epub_language != '') {
         $desc .= " {$epub_language} ";
     }
     if ($epub_unique_identifier != '') {
         $desc .= " URN-" . $epub_unique_identifier . ".";
     }
     $desc .= $htmlcontent;
     // restrict the length of the description to maximum description length
     if (strlen($desc) > self::$max_description_len) {
         $desc = substr($desc, 0, self::$max_description_len);
     }
     $summary = array();
     $summary[self::TITLE] = $epub_title;
     $summary[self::DESCRIPTION] = $desc;
     $summary[self::LANG] = $epub_language;
     $summary[self::LINKS] = $epub_url;
     $summary[self::PAGE] = $page;
     return $summary;
 }