Exemplo n.º 1
0
 /**
  * Runs the article-extraction pipeline over a raw HTML page.
  *
  * The page is parsed into blocks and the result is then handed, in a fixed
  * order, to each filter/merger stage listed in the body. The stages operate
  * on the same article object, so their order must not be changed.
  *
  * @param  string  $rawHTMLPage the raw HTML from which to extract an article
  * @param  string  $source      value stored on the result's `source` property (defaults to "")
  * @return Article extraction result
  */
 public static function extractFromHTML($rawHTMLPage, $source = "")
 {
     // Stage 0: turn the HTML into blocks
     $htmlParser = new HTMLParser();
     $extracted = $htmlParser->parse($rawHTMLPage);

     // Clean article title
     Filters\TitleFilter::filter($extracted);

     // Article 'end' points, found through syntactic terminators
     Filters\EndBlockFilter::filter($extracted);

     // Word count / link density classification (machine-learned algorithm)
     Filters\NumberOfWordsFilter::filter($extracted);

     // Blocks located after the content
     Filters\PostcontentFilter::filter($extracted);

     // Join blocks that sit close to each other
     Mergers\CloseBlockMerger::merge($extracted);

     // Drop blocks that are not content
     Filters\NonContentFilter::filter($extracted);

     // The largest block is marked as 'content'
     Filters\LargestBlockFilter::filter($extracted);

     // Blocks between the title and the main content count as content too
     Filters\BetweenTitleAndContentFilter::filter($extracted);

     // Cleanup after extraction: removes now irrelevant blocks, sets full title
     Filters\PostextractionFilter::filter($extracted);

     // Per-line pass removing non-content lines
     Filters\LineFilter::filter($extracted);

     // Document language
     Filters\LanguageFilter::filter($extracted);

     // Keywords of the article document
     Filters\KeywordFilter::filter($extracted);

     $extracted->source = $source;

     return $extracted;
 }
Exemplo n.º 2
0
 /**
  * Parses a template string into an HTML tree and passes that tree to
  * parse_html_tree().
  *
  * The parser is configured with an error handler that formats the reported
  * message with its arguments and rethrows it as an exception.
  *
  * @param string $template
  *
  * @return array
  *
  * @throws \Exception when the parser invokes its error handler
  */
 public function __invoke($template)
 {
     $raise_parser_error = function ($message, array $args) {
         throw new \Exception(\ICanBoogie\format($message, $args));
     };

     $parser_options = [HTMLParser::T_ERROR_HANDLER => $raise_parser_error];
     $html_parser = new HTMLParser($parser_options);
     $html_tree = $html_parser->parse($template, Engine::PREFIX);

     return $this->parse_html_tree($html_tree);
 }
Exemplo n.º 3
0
 /**
  * Crawls a hierarchy of pages and appends the <body> of every page found at
  * the last level to the session's document.
  *
  * 'xpathOutgoing' is a comma-separated list of XPath expressions, one per
  * hierarchy level. Starting from the session URL, each level's pages are
  * fetched and parsed, and the level's XPath is evaluated to collect the URLs
  * of the next level. The pages reached after the last level are fetched and
  * their <body> elements are imported into the original document.
  *
  * @param Session $session session whose URL is the crawl root and whose DOM receives the imported bodies
  */
 public function parse(Session $session)
 {
     if (!$this->xpathOutgoing) {
         Utils::throw400("Must set 'xpathOutgoing'");
     }
     // the list of xpaths to find outgoing links, ordered by level of hierarchy
     $xpathOutgoingList = preg_split("/\\s*,\\s*/", $this->xpathOutgoing);
     // Let the HTMLParser parse, so we have a DOM
     parent::parse($session);
     // The urls to iterate through in this level of hierarchy
     $crawlUrls = array($session->url);
     // Step through the outgoing link xpaths; $i is the hierarchy level
     foreach ($xpathOutgoingList as $i => $thisLevelXpath) {
         $nextLevelUrls = array();
         foreach ($crawlUrls as $url) {
             // create a session
             $subsession = new Session($url);
             // create a fetcher and fetch
             $fetcher = new CachingHttpFetcher();
             $fetcher->fetch($subsession);
             // create a non-crawling HTMLParser and parse
             $parser = new HTMLParser();
             $parser->parse($subsession);
             // Query for URLs of pages to further recurse
             $outLinkNodes = $subsession->xpath->query($thisLevelXpath);
             if ($outLinkNodes === false) {
                 throw Utils::throw400("Xpath query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
             } elseif ($outLinkNodes->length === 0) {
                 throw Utils::throw400("No results for query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
             }
             foreach ($outLinkNodes as $outLinkNode) {
                 $nextLevelUrls[] = $subsession->ensureAbsoluteUrl($outLinkNode->textContent);
             }
         }
         $crawlUrls = $nextLevelUrls;
     }
     // Concatenate all the <body> elements into the original document
     foreach ($crawlUrls as $url) {
         // create a session
         $subsession = new Session($url);
         // create a fetcher and fetch
         $fetcher = new CachingHttpFetcher();
         $fetcher->fetch($subsession);
         // create a non-crawling HTMLParser and parse
         $parser = new HTMLParser();
         $parser->parse($subsession);
         // item(0) is null when the page has no <body>; importNode() would
         // then fail with an opaque type error, so report it explicitly.
         $body = $subsession->dom->getElementsByTagName('body')->item(0);
         if ($body === null) {
             throw Utils::throw400("No <body> element found for '{$url}'");
         }
         $newBody = $session->dom->importNode($body, true);
         $session->dom->documentElement->appendChild($newBody);
     }
     // NOTE(review): hard-coded dump of the merged document; looks like
     // leftover debug output - confirm nothing relies on it before removing.
     $session->dom->save('/tmp/test3.html');
 }
  /**
   * Resolves the resource this tag refers to, parses it and attaches the
   * parsed node as a child of this tag.
   *
   * Resource name resolution:
   *  - without a 'name' attribute, the name is read from the document
   *    variable named by the 'key' attribute;
   *  - with a 'name' attribute, the name is resolved to a resource path
   *    through a package: the one named by the 'package' attribute, else the
   *    package of the request's 'action' parameter, else the package of the
   *    'freeform' package's 'action.default' property (or 'freeform' itself
   *    when that property is not set).
   *
   * @return mixed self::PROCESS_BODY when a parsed node was added,
   *               self::SKIP_BODY otherwise
   */
  function onOpen() {
    $this->removeAll();
    $d = $this->getDocument();

    $name = $this->getAttribute('name');
    if(!$name) {
      $name = $d->getVariable($this->getAttribute('key'));
    } else {
      $pkg = $this->getAttribute('package');
      if($pkg) {
        // Explicit package attribute. (A stray debug `echo` of the attribute
        // used to leak the package name into the output here.)
        $pkg = Package::getPackageByName($pkg);
      } else {
        // Try to find current package (where calling action defined)
        $a = $d->getResponce()->getRequest()->getParameter('action');
        if($a) {
          $pkg = Package::getPackageByName(Package::getPackageNameForClass($a));
        } else {
          $pkg = Package::getPackageByName('freeform');
          if($a = $pkg->getProperty('action.default')) {
            $pkg = Package::getPackageByName(Package::getPackageNameForClass($a));
          }
        }
      }
      if($pkg) {
        $name = $pkg->getResourcePath($name);
      }
    }

    if(!$name) {
      return self::SKIP_BODY;
    }

    $p = new HTMLParser($d);
    $r = $p->parse($name);
    if(!$r) {
      return self::SKIP_BODY;
    }

    $r->setExposed(false);
    $this->addNode($r);
    return self::PROCESS_BODY;
  }
 /**
  * Fetches a URL and returns the response with its entity body converted to
  * UTF-8 encoded XHTML.
  *
  * The returned array contains:
  *   array[code]    => HTTP status code
  *   array[headers] => HTTP headers
  *   array[body]    => formatted XHTML string made from the entity body
  *
  * When caching is enabled, a cached result is returned without contacting
  * the server unless a conditional request is asked for and the cached entry
  * carries a Last-Modified or ETag header. If the request fails while a
  * cached entry exists, the cached entry is returned.
  *
  * @param  string  $url                 fully qualified http(s) URL
  * @param  integer $cache_lifetime      passed to Cache_Lite as 'lifeTime'; the cache is used only when this is > 0 and $this->cacheDir is set
  * @param  boolean $conditional_request revalidate a cached entry using its Last-Modified / ETag headers
  * @param  array   $headers             request headers handed to getHttpResponse(); also part of the cache id
  * @param  array   $post                POST parameters handed to getHttpResponse(); also part of the cache id
  * @return array
  * @throws Exception on an invalid URL, an unusable response (status, content type,
  *                   empty or non-markup body, NUL inside the first tag) or a failed
  *                   character encoding detection/conversion
  */
 public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
 {
     /*
      * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
      */
     if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
         throw new Exception("Not a valid or fully qualified HTTP URL.");
     }
     $data = false;
     $cache_lifetime = (int) $cache_lifetime;
     /*
      * `&&` instead of `and`: `=` binds tighter than `and`, so with `and`
      * the lifetime check was never part of the assigned value.
      */
     $use_cache = (!empty($this->cacheDir) && $cache_lifetime > 0);
     if ($use_cache) {
         $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
         $params = array();
         foreach ($headers as $key => $value) {
             if (!empty($value)) {
                 $params[] = urlencode($key) . '=' . urlencode($value);
             }
         }
         foreach ($post as $key => $value) {
             $params[] = urlencode($key) . '=' . urlencode($value);
         }
         $cache_id = "{$url}?" . implode('&', $params);
         if (false !== ($data = $cache->get($cache_id))) {
             $data = unserialize($data);
         }
     }
     /*
      * Access to the URL if not cached
      * or if the cache has either Last-Modified or Etag header
      * and conditional request is specified.
      * A conditional request is pointless only when the cached entry has
      * neither validator, hence `&&` between the two checks.
      */
     if ($conditional_request
         && !isset($data['headers']['last-modified'])
         && !isset($data['headers']['etag'])
     ) {
         $conditional_request = false;
     }
     if (!$data || $conditional_request) {
         /*
          * Forward the cached validators unless the caller supplied its own.
          */
         if (isset($data['headers']['last-modified']) && empty($headers['last-modified'])) {
             $headers['last-modified'] = $data['headers']['last-modified'];
         }
         if (isset($data['headers']['etag']) && empty($headers['etag'])) {
             $headers['etag'] = $data['headers']['etag'];
         }
         try {
             $response = $this->getHttpResponse($url, $headers, $post);
         } catch (Exception $e) {
             /*
              * A failed request is tolerated only when a cached entry
              * can be served instead.
              */
             if (!$data) {
                 throw $e;
             }
         }
         /*
          * Use cache if the responded HTTP status code is 304.
          * If 200, format the responded HTML of the given URL to XHTML.
          */
         if (!$data || (isset($response['code']) && $response['code'] != 304)) {
             $data =& $response;
             /*
              * If status code was 200 and Content-Type was not (X)HTML,
              * the status code was forcibly altered to 204.
              * @see HTTP_Request_Listener_Extended->update().
              */
             if ($data['code'] != 200 && $data['code'] != 204) {
                 throw new Exception("Responded HTTP Status Code is {$data['code']}.");
             } elseif (isset($data['headers']['content-type']) && !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                 throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
             } elseif (empty($data['body'])) {
                 throw new Exception("Responded entity body is empty.");
             } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                 throw new Exception("Responded entity body does not contain a markup symbol.");
             } elseif (false !== strpos($matches[0], "\0")) {
                 /*
                  * The needle must be the NUL byte; an empty needle matches
                  * every string on PHP 8 and made this branch always throw.
                  */
                 throw new Exception("Responded entity body contains NULL.");
             }
             /*
              * Remove BOM and NULLs.
              */
             $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
             $data['body'] = str_replace("\0", '', $data['body']);
             /*
              * Initialize the backups.
              */
             $this->backup = array();
             $this->backup_count = 0;
             /*
              * Removing SCRIPT and STYLE is recommended.
              * An alternative would be to wrap the content of these tags in
              * CDATA sections, but some JavaScript methods such as
              * document.write are not compliant with XHTML/XML.
              */
             $tags = array('script', 'style');
             foreach ($tags as $tag) {
                 $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
             }
             /*
              * Backup CDATA sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);
             /*
              * Comment section must not contain two or more adjacent hyphens.
              * (Closures replace create_function(), which no longer exists
              * in PHP 8.)
              */
             $data['body'] = preg_replace_callback('/<!--(.*?)-->/si', function ($matches) {
                 return "<!-- " . preg_replace("/-{2,}/", "-", $matches[1]) . " -->";
             }, $data['body']);
             /*
              * Backup comment sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);
             /*
              * Process tags that is potentially dangerous for XML parsers.
              */
             $data['body'] = preg_replace_callback('/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si', function ($matches) {
                 return $matches[1] . str_replace("<", "&lt;", $matches[2]) . $matches[3];
             }, $data['body']);
             $wrap_in_pre = function ($matches) {
                 return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
             };
             $data['body'] = preg_replace_callback('/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si', $wrap_in_pre, $data['body']);
             $data['body'] = preg_replace_callback('/<plaintext\\b[^>]*?>(.*)$/si', $wrap_in_pre, $data['body']);
             /*
              * Remove DTD declarations, wrongly placed comments etc.
              * This must be done before removing DOCTYPE.
              */
             $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
             /*
              * XML and DOCTYPE declaration will be replaced.
              */
             $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
             $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);
             if (preg_match('/^\\s*$/s', $data['body'])) {
                 throw new Exception('The entity body became empty after preprocessing.');
             }
             /*
              * Detect character encoding and convert to UTF-8.
              * The Content-Type header wins; otherwise the first
              * <meta http-equiv="content-type"> with a content attribute.
              */
             $encoding = false;
             if (isset($data['headers']['content-type'])) {
                 $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
             }
             if (!$encoding && preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                 foreach ($matches[0] as $value) {
                     if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' && false !== ($encoding = $this->getAttribute('content', $value))) {
                         $encoding = $this->getCharsetFromCType($encoding);
                         break;
                     }
                 }
             }
             /*
              * Use mbstring to convert character encoding if available.
              * Otherwise use iconv (iconv may try to detect character encoding automatically).
              * Do not trust the declared encoding and do conversion even if UTF-8.
              */
             if (extension_loaded('mbstring')) {
                 if (!$encoding) {
                     @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                     /*
                      * Check the detection result before asking for its
                      * MIME name; a failed detection yields false.
                      */
                     $detected = @mb_detect_encoding($data['body']);
                     if (false === $detected || false === ($encoding = @mb_preferred_mime_name($detected))) {
                         throw new Exception('Failed detecting character encoding.');
                     }
                 }
                 @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
             } else {
                 if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                     throw new Exception('Failed converting character encoding.');
                 }
                 foreach ($this->backup as $key => $value) {
                     if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                         throw new Exception('Failed converting character encoding.');
                     }
                 }
             }
             /*
              * Restore CDATAs and comments.
              */
             for ($i = 0; $i < $this->backup_count; $i++) {
                 $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
             }
             /*
              * Use Tidy to format HTML if available.
              * Otherwise, use HTMLParser class (is slower and consumes much memory).
              */
             if (extension_loaded('tidy')) {
                 $tidy = new tidy();
                 $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                 $tidy->cleanRepair();
                 $data['body'] = $tidy->html();
             } else {
                 require_once 'HTMLParser.class.php';
                 $parser = new HTMLParser();
                 $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                 $parser->setRule($format_rule);
                 $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                 $parser->setGenericParent('body');
                 $parser->parse($data['body']);
                 $data['body'] = $parser->dump();
             }
             /*
              * Valid XHTML DOCTYPE declaration (with DTD URI) is required
              * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
              */
             $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
             $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
             $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
             $data['body'] = "{$declarations}{$data['body']}";
             if ($use_cache) {
                 $cache->save(serialize($data), $cache_id);
             }
         }
     }
     return $data;
 }