PHP HTMLParser::parse примеры использования

Язык программирования: PHP

Класс/Тип: HTMLParser

Метод/Функция: parse

Примеров на hotexamples.com: 5

PHP HTMLParser::parse - 5 примеров найдено. Это лучшие примеры PHP кода для HTMLParser::parse, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

parse(5)

innerHTML(5)

StripLinks(5)

StripTags(4)

AddTagDepth(3)

loadHTML(3)

startString(3)

RemoveTagDepth(3)

getPriceListForProduct(2)

setURL(2)

getHTML(2)

getHostUrl(2)

optHTMLLink(2)

StripAds(1)

setRule(1)

setRoot(1)

setGenericParent(1)

StripStyles(1)

StripBlankLinks(1)

StripScripts(1)

getImageHolder(1)

GetFormDetails(1)

getField(1)

getBodyHolder(1)

dump(1)

getHeader(1)

Пример #1

Показать файл

Файл: Extract.php Проект: mizterp/PHP-Web-Article-Extractor

 /**
  *	Runs the article-extraction pipeline over a raw HTML page.
  *
  *	The page is parsed into blocks, the blocks are passed through every
  *	filter and merger below (order matters: each stage works on what the
  *	previous stages left behind), and the given source is stored on the result.
  *
  *	@param  string  $rawHTMLPage the raw HTML from which to extract an article
  *	@param  string  $source      value assigned to the result's `source` property (defaults to "")
  *	@return Article extraction result
  */
 public static function extractFromHTML($rawHTMLPage, $source = "")
 {
     // Stage 0: turn the raw markup into blocks
     $htmlParser = new HTMLParser();
     $document = $htmlParser->parse($rawHTMLPage);

     // Clean article title
     Filters\TitleFilter::filter($document);
     // Article 'end' points, found via syntactic terminators
     Filters\EndBlockFilter::filter($document);
     // Word count / link density classification (machine-learning derived algorithm)
     Filters\NumberOfWordsFilter::filter($document);
     // Blocks that come after the content
     Filters\PostcontentFilter::filter($document);

     // Join blocks that sit close to each other
     Mergers\CloseBlockMerger::merge($document);

     // Drop blocks that are not content
     Filters\NonContentFilter::filter($document);
     // The largest block is marked as 'content'
     Filters\LargestBlockFilter::filter($document);
     // Blocks between the title and the main content count as content too
     Filters\BetweenTitleAndContentFilter::filter($document);
     // Post-extraction cleanup: removes now irrelevant blocks and sets the full title
     Filters\PostextractionFilter::filter($document);
     // Line-by-line pass removing non-content
     Filters\LineFilter::filter($document);
     // Document language
     Filters\LanguageFilter::filter($document);
     // Keywords of the article document
     Filters\KeywordFilter::filter($document);

     $document->source = $source;

     return $document;
 }

Пример #2

Показать файл

Файл: Compiler.php Проект: icybee/patron

 /**
  * Compiles a template: the template is parsed into an HTML tree with
  * Engine::PREFIX, and that tree is handed to parse_html_tree().
  *
  * Errors reported by the parser are rethrown as \Exception carrying the
  * formatted parser message.
  *
  * @param string $template
  *
  * @return array
  */
 public function __invoke($template)
 {
     // Error handler given to the parser: format the message and throw.
     $throw_on_error = function ($message, array $args) {
         throw new \Exception(\ICanBoogie\format($message, $args));
     };

     $options = [HTMLParser::T_ERROR_HANDLER => $throw_on_error];
     $html_parser = new HTMLParser($options);
     $html_tree = $html_parser->parse($template, Engine::PREFIX);

     return $this->parse_html_tree($html_tree);
 }

Пример #3

Показать файл

Файл: CrawlOutgoingHTMLParser.php Проект: kba/rssscrpr

 /**
  * Crawls outgoing links level by level and appends the <body> of every page
  * reached on the last level to the DOM of the given session.
  *
  * `$this->xpathOutgoing` is a comma-separated list of XPath expressions, one
  * per level of hierarchy. For each level, every URL of the current level is
  * fetched and parsed, the level's XPath is evaluated against it, and the text
  * content of each matched node (made absolute) becomes a URL of the next level.
  *
  * @param Session $session session whose `url` seeds the crawl and whose `dom`
  *                         receives the collected <body> elements
  */
 public function parse(Session $session)
 {
     if (!$this->xpathOutgoing) {
         Utils::throw400("Must set 'xpathOutgoing'");
     }
     // the list of xpaths to find outgoing links, ordered by level of hierarchy
     $xpathOutgoingList = preg_split("/\\s*,\\s*/", $this->xpathOutgoing);
     // Let the HTMLParser parse, so we have a DOM
     parent::parse($session);
     // The urls to iterate through in this level of hierarchy
     $crawlUrls = array($session->url);
     // Step through the outgoing link xpaths (keys of preg_split() are 0..n-1, i.e. the level)
     foreach ($xpathOutgoingList as $level => $thisLevelXpath) {
         $nextLevelUrls = array();
         foreach ($crawlUrls as $url) {
             // create a session
             $subsession = new Session($url);
             // create a fetcher and fetch
             $fetcher = new CachingHttpFetcher();
             $fetcher->fetch($subsession);
             // create a non-crawling HTMLParser and parse
             $parser = new HTMLParser();
             $parser->parse($subsession);
             // Query for URLs of pages to further recurse
             $outLinkNodes = $subsession->xpath->query($thisLevelXpath);
             // NOTE(review): throw400() is called without `throw` at the top of this
             // method but with `throw` here — confirm whether it throws by itself
             // or returns an exception, and make the call sites agree.
             if ($outLinkNodes === false) {
                 throw Utils::throw400("Xpath query '{$thisLevelXpath}' failed for '{$url}' [Level: {$level}]");
             } elseif ($outLinkNodes->length === 0) {
                 throw Utils::throw400("No results for query '{$thisLevelXpath}' failed for '{$url}' [Level: {$level}]");
             }
             foreach ($outLinkNodes as $outLinkNode) {
                 $nextLevelUrls[] = $subsession->ensureAbsoluteUrl($outLinkNode->textContent);
             }
         }
         $crawlUrls = $nextLevelUrls;
     }
     // Concatenate all the <body> elements into the original document
     foreach ($crawlUrls as $url) {
         // create a session
         $subsession = new Session($url);
         // create a fetcher and fetch
         $fetcher = new CachingHttpFetcher();
         $fetcher->fetch($subsession);
         // create a non-crawling HTMLParser and parse
         $parser = new HTMLParser();
         $parser->parse($subsession);
         // item(0) is null when the fetched page has no <body>; importNode() cannot
         // take null, so such pages are skipped instead of aborting the whole crawl.
         $body = $subsession->dom->getElementsByTagName('body')->item(0);
         if ($body === null) {
             continue;
         }
         $newBody = $session->dom->importNode($body, true);
         $session->dom->documentElement->appendChild($newBody);
     }
     // NOTE(review): hard-coded dump path, looks like a debugging leftover — confirm
     // whether anything relies on this file before removing it.
     $session->dom->save('/tmp/test3.html');
 }

Пример #4

Показать файл

Файл: HTMLIncludeFile.php5 Проект: BackupTheBerlios/freeform-frmwrk

  /**
   * Resolves the file to include, parses it and attaches the parsed tree
   * as the only child of this node.
   *
   * The file name comes from the 'name' attribute, or, when that is empty,
   * from the document variable named by the 'key' attribute. A 'name' is
   * resolved to a resource path inside a package: the package given by the
   * 'package' attribute, otherwise the package of the class named by the
   * request's 'action' parameter, otherwise the package of the
   * 'action.default' property of the 'freeform' package (or 'freeform'
   * itself when that property is empty).
   *
   * Returns self::PROCESS_BODY when the file was parsed and attached,
   * self::SKIP_BODY otherwise.
   */
  function onOpen() {
    $this->removeAll();
    $document = $this->getDocument();

    $name = $this->getAttribute('name');
    if(!$name) {
      // No explicit name: take it from the document variable referenced by 'key'
      $name = $document->getVariable($this->getAttribute('key'));
    } else {
      $package = $this->getAttribute('package');
      if($package) {
        // NOTE(review): this prints the package attribute to the output; looks like
        // leftover debugging, kept because it is observable behavior.
        echo $package;
        $package = Package::getPackageByName($package);
      } else {
        // Try to find current package (where calling action defined)
        $action = $document->getResponce()->getRequest()->getParameter('action');
        if($action) {
          $package = Package::getPackageByName(Package::getPackageNameForClass($action));
        } else {
          $package = Package::getPackageByName('freeform');
          $action = $package->getProperty('action.default');
          if($action) {
            $package = Package::getPackageByName(Package::getPackageNameForClass($action));
          }
        }
      }
      if($package) {
        $name = $package->getResourcePath($name);
      }
    }

    if(!$name) {
      return self::SKIP_BODY;
    }

    $parser = new HTMLParser($this->getDocument());
    $root = $parser->parse($name);
    if(!$root) {
      return self::SKIP_BODY;
    }

    $root->setExposed(false);
    $this->addNode($root);
    return self::PROCESS_BODY;
  }

Пример #5

Показать файл

Файл: HTMLScraping.class.php Проект: diggin-sandbox/mirror-htmlscraping-20090114

 /**
  * Return array contains formatted XHTML string
  * created from the responded HTML of the given URL.
  * array[code] => HTTP status code
  * array[headers] => HTTP headers
  * array[body] => formatted XHTML string made from the entity body
  * Throw exception if error.
  *
  * @param  string  $url                 absolute http:// or https:// URL
  * @param  integer $cache_lifetime      cache lifetime passed to Cache_Lite; caching is used only
  *                                      when this is > 0 and $this->cacheDir is set
  * @param  boolean $conditional_request re-validate a cached entry using its
  *                                      Last-Modified / ETag headers
  * @param  array   $headers             extra request headers (also part of the cache id)
  * @param  array   $post                POST fields (also part of the cache id)
  * @return array
  * @throws Exception on an invalid URL, an unexpected status code or content type,
  *                   an empty or non-markup body, or a failed encoding detection/conversion
  */
 public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
 {
     /*
      * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
      */
     if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
         throw new Exception("Not a valid or fully qualified HTTP URL.");
     }
     $data = false;
     $cache_lifetime = (int) $cache_lifetime;
     // `&&`, not `and`: `=` binds tighter than `and`, which used to drop the
     // lifetime check from the assigned value.
     $use_cache = !empty($this->cacheDir) && $cache_lifetime > 0;
     if ($use_cache) {
         $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
         $params = array();
         foreach ($headers as $key => $value) {
             if (!empty($value)) {
                 $params[] = urlencode($key) . '=' . urlencode($value);
             }
         }
         foreach ($post as $key => $value) {
             $params[] = urlencode($key) . '=' . urlencode($value);
         }
         $cache_id = "{$url}?" . implode('&', $params);
         if (false !== ($data = $cache->get($cache_id))) {
             // NOTE(review): unserialize() is only safe while the cache directory
             // cannot be written by untrusted parties.
             $data = unserialize($data);
         }
     }
     /*
      * Access to the URL if not cached
      * or if the cache has either Last-Modified or Etag header
      * and conditional request is specified.
      * A conditional request is pointless only when the cache has neither header.
      */
     if ($conditional_request && !isset($data['headers']['last-modified']) && !isset($data['headers']['etag'])) {
         $conditional_request = false;
     }
     if (!$data || $conditional_request) {
         if (isset($data['headers']['last-modified']) && empty($headers['last-modified'])) {
             $headers['last-modified'] = $data['headers']['last-modified'];
         }
         if (isset($data['headers']['etag']) && empty($headers['etag'])) {
             $headers['etag'] = $data['headers']['etag'];
         }
         try {
             $response = $this->getHttpResponse($url, $headers, $post);
         } catch (Exception $e) {
             // With a cached copy at hand a failed request is tolerated
             // and the cached data is returned below.
             if (!$data) {
                 throw $e;
             }
         }
         /*
          * Use cache if the responded HTTP status code is 304.
          * If 200, format the responded HTML of the given URL to XHTML.
          */
         if (!$data || (isset($response['code']) && $response['code'] != 304)) {
             $data =& $response;
             /*
              * If status code was 200 and Content-Type was not (X)HTML,
              * the status code was forcibly altered to 204.
              * @see HTTP_Request_Listener_Extended->update().
              */
             if ($data['code'] != 200 && $data['code'] != 204) {
                 throw new Exception("Responded HTTP Status Code is {$data['code']}.");
             } elseif (isset($data['headers']['content-type']) && !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                 throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
             } elseif (empty($data['body'])) {
                 throw new Exception("Responded entity body is empty.");
             } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                 throw new Exception("Responded entity body does not contain a markup symbol.");
             } elseif (false !== strpos($matches[0], "\0")) {
                 // The needle must be the NUL byte; an empty needle matches at
                 // offset 0 on PHP 8 and would reject every document.
                 throw new Exception("Responded entity body contains NULL.");
             }
             /*
              * Remove BOM and NULLs.
              */
             $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
             $data['body'] = str_replace("\0", '', $data['body']);
             /*
              * Initialize the backups.
              */
             $this->backup = array();
             $this->backup_count = 0;
             /*
              * Removing SCRIPT and STYLE is recommended.
              * An alternative would be to keep the tags and wrap their content
              * in CDATA sections, but then JavaScript such as document.write
              * is still not compliant with XHTML/XML.
              */
             $tags = array('script', 'style');
             foreach ($tags as $tag) {
                 $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
             }
             /*
              * Backup CDATA sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);
             /*
              * Comment section must not contain two or more adjacent hyphens.
              * (Anonymous functions are used because create_function() no longer
              * exists in PHP 8.)
              */
             $data['body'] = preg_replace_callback('/<!--(.*?)-->/si', function ($matches) {
                 return "<!-- " . preg_replace("/-{2,}/", "-", $matches[1]) . " -->";
             }, $data['body']);
             /*
              * Backup comment sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);
             /*
              * Process tags that is potentially dangerous for XML parsers.
              */
             $data['body'] = preg_replace_callback('/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si', function ($matches) {
                 return $matches[1] . str_replace("<", "&lt;", $matches[2]) . $matches[3];
             }, $data['body']);
             $data['body'] = preg_replace_callback('/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si', function ($matches) {
                 return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
             }, $data['body']);
             $data['body'] = preg_replace_callback('/<plaintext\\b[^>]*?>(.*)$/si', function ($matches) {
                 return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
             }, $data['body']);
             /*
              * Remove DTD declarations, wrongly placed comments etc.
              * This must be done before removing DOCTYPE.
              */
             $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
             /*
              * XML and DOCTYPE declaration will be replaced.
              */
             $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
             $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);
             if (preg_match('/^\\s*$/s', $data['body'])) {
                 throw new Exception('The entity body became empty after preprocessing.');
             }
             /*
              * Detect character encoding and convert to UTF-8.
              * The Content-Type header wins; otherwise the first
              * <meta http-equiv="content-type"> with a content attribute is used.
              */
             $encoding = false;
             if (isset($data['headers']['content-type'])) {
                 $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
             }
             if (!$encoding && preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                 foreach ($matches[0] as $value) {
                     if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' && false !== ($encoding = $this->getAttribute('content', $value))) {
                         $encoding = $this->getCharsetFromCType($encoding);
                         break;
                     }
                 }
             }
             /*
              * Use mbstring to convert character encoding if available.
              * Otherwise use iconv (iconv may try to detect character encoding automatically).
              * Do not trust the declared encoding and do conversion even if UTF-8.
              */
             if (extension_loaded('mbstring')) {
                 if (!$encoding) {
                     @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                     if (false === ($encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body'])))) {
                         throw new Exception('Failed detecting character encoding.');
                     }
                 }
                 // Converts the response and the backed-up CDATA/comment sections in place.
                 @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
             } else {
                 if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                     throw new Exception('Failed converting character encoding.');
                 }
                 foreach ($this->backup as $key => $value) {
                     if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                         throw new Exception('Failed converting character encoding.');
                     }
                 }
             }
             /*
              * Restore CDATAs and comments.
              */
             for ($i = 0; $i < $this->backup_count; $i++) {
                 $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
             }
             /*
              * Use Tidy to format HTML if available.
              * Otherwise, use HTMLParser class (is slower and consumes much memory).
              */
             if (extension_loaded('tidy')) {
                 $tidy = new tidy();
                 $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                 $tidy->cleanRepair();
                 $data['body'] = $tidy->html();
             } else {
                 require_once 'HTMLParser.class.php';
                 $parser = new HTMLParser();
                 $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                 $parser->setRule($format_rule);
                 $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                 $parser->setGenericParent('body');
                 $parser->parse($data['body']);
                 $data['body'] = $parser->dump();
             }
             /*
              * Valid XHTML DOCTYPE declaration (with DTD URI) is required
              * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
              */
             $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
             $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
             $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
             $data['body'] = "{$declarations}{$data['body']}";
             if ($use_cache) {
                 $cache->save(serialize($data), $cache_id);
             }
         }
     }
     return $data;
 }