/**
 * Runs the complete article-extraction pipeline over a raw HTML page.
 *
 * The page is first parsed into blocks by HTMLParser; the resulting article
 * is then passed, in the order listed below, through every filter and merger
 * stage. Finally the given source is stored on the article.
 *
 * @param string $rawHTMLPage the raw HTML from which to extract an article
 * @param string $source      value assigned to the article's `source` property
 * @return Article extraction result
 */
public static function extractFromHTML($rawHTMLPage, $source = "")
{
    // Turn the HTML into a block-based article
    $htmlParser = new HTMLParser();
    $extracted = $htmlParser->parse($rawHTMLPage);

    // Clean article title
    Filters\TitleFilter::filter($extracted);

    // Article 'end' points, discovered via syntactic terminators
    Filters\EndBlockFilter::filter($extracted);

    // Word count / link density filtering (algorithm from machine learning)
    Filters\NumberOfWordsFilter::filter($extracted);

    // Blocks that come after the content
    Filters\PostcontentFilter::filter($extracted);

    // Merge blocks that are close to each other
    Mergers\CloseBlockMerger::merge($extracted);

    // Drop blocks that are not content
    Filters\NonContentFilter::filter($extracted);

    // The largest block is marked as 'content'
    Filters\LargestBlockFilter::filter($extracted);

    // Blocks between the title and the main content count as content too
    Filters\BetweenTitleAndContentFilter::filter($extracted);

    // Post-extraction cleanup: removes now irrelevant blocks, sets full title
    Filters\PostextractionFilter::filter($extracted);

    // Line-by-line scan removing non-content on a per-line basis
    Filters\LineFilter::filter($extracted);

    // Document language
    Filters\LanguageFilter::filter($extracted);

    // Keywords of the article document
    Filters\KeywordFilter::filter($extracted);

    $extracted->source = $source;

    return $extracted;
}
/**
 * Parses a template with HTMLParser (using Engine::PREFIX) and returns the
 * result of parse_html_tree() applied to the parsed tree.
 *
 * @param string $template
 *
 * @return array
 */
public function __invoke($template)
{
    // Registered as the parser's T_ERROR_HANDLER: turns a reported message
    // and its arguments into an exception carrying the formatted text.
    $raise = function ($message, array $args) {
        throw new \Exception(\ICanBoogie\format($message, $args));
    };

    $options = [HTMLParser::T_ERROR_HANDLER => $raise];
    $html_parser = new HTMLParser($options);
    $html_tree = $html_parser->parse($template, Engine::PREFIX);

    return $this->parse_html_tree($html_tree);
}
/**
 * Follows one or more levels of outgoing links starting at the session's URL
 * and appends the <body> of every page reached at the last level to the
 * session's document element.
 *
 * `$this->xpathOutgoing` is a comma-separated list of XPath expressions, one
 * per level of hierarchy. For each level, every page of the current level is
 * fetched and parsed, the level's XPath is evaluated on it, and the text
 * content of each matching node (made absolute) becomes a URL of the next
 * level.
 *
 * Failures (missing configuration, a failing or empty XPath query, a page
 * without <body>) are reported through Utils::throw400().
 *
 * @param Session $session session whose `url` seeds the crawl and whose `dom`
 *                         receives the imported <body> elements
 */
public function parse(Session $session)
{
    if (!$this->xpathOutgoing) {
        Utils::throw400("Must set 'xpathOutgoing'");
    }

    // the list of xpaths to find outgoing links, ordered by level of hierarchy
    $xpathOutgoingList = preg_split("/\\s*,\\s*/", $this->xpathOutgoing);

    // Let the HTMLParser parse, so we have a DOM
    parent::parse($session);

    // The urls to iterate through in this level of hierarchy
    $crawlUrls = array($session->url);

    // Step through the outgoing link xpaths; $i is the zero-based level
    foreach ($xpathOutgoingList as $i => $thisLevelXpath) {
        $nextLevelUrls = array();

        foreach ($crawlUrls as $url) {
            $subsession = $this->fetchAndParseSubsession($url);

            // Query for URLs of pages to further recurse
            $outLinkNodes = $subsession->xpath->query($thisLevelXpath);
            if ($outLinkNodes === false) {
                throw Utils::throw400("Xpath query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
            }
            if ($outLinkNodes->length === 0) {
                throw Utils::throw400("No results for query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
            }

            foreach ($outLinkNodes as $outLinkNode) {
                $nextLevelUrls[] = $subsession->ensureAbsoluteUrl($outLinkNode->textContent);
            }
        }

        $crawlUrls = $nextLevelUrls;
    }

    // Concatenate all the <body> elements into the original document
    foreach ($crawlUrls as $url) {
        $subsession = $this->fetchAndParseSubsession($url);

        // item(0) is null when the page has no <body>; importNode() cannot
        // take null, so report it explicitly instead of failing obscurely.
        $body = $subsession->dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            throw Utils::throw400("No <body> element found in '{$url}'");
        }

        $newBody = $session->dom->importNode($body, true);
        $session->dom->documentElement->appendChild($newBody);
    }
}

/**
 * Fetches the given URL into a fresh Session and parses it.
 *
 * @param string $url URL of the page to load
 * @return Session the fetched and parsed sub-session
 */
private function fetchAndParseSubsession($url)
{
    $subsession = new Session($url);

    $fetcher = new CachingHttpFetcher();
    $fetcher->fetch($subsession);

    // a plain (non-crawling) HTMLParser, not this class
    $parser = new HTMLParser();
    $parser->parse($subsession);

    return $subsession;
}
/**
 * Resolves a resource name from this tag's attributes, parses it with
 * HTMLParser and adds the parsed result as a node of this tag.
 *
 * Name resolution:
 *  - without a `name` attribute, the name is the document variable named by
 *    the `key` attribute;
 *  - with a `name` attribute, a package is looked up (the `package`
 *    attribute if present, otherwise the package of the request's `action`
 *    parameter, otherwise the package of the 'freeform' package's
 *    `action.default` property, falling back to 'freeform' itself) and, if
 *    one is found, the name is mapped through its getResourcePath().
 *
 * @return mixed self::PROCESS_BODY when a node was parsed and added,
 *               self::SKIP_BODY otherwise
 */
function onOpen()
{
    $this->removeAll();
    $d = $this->getDocument();

    $name = $this->getAttribute('name');
    if (!$name) {
        $name = $d->getVariable($this->getAttribute('key'));
    } else {
        $pkg = $this->getAttribute('package');
        if ($pkg) {
            $pkg = Package::getPackageByName($pkg);
        } else {
            // Try to find current package (where calling action defined)
            $a = $d->getResponce()->getRequest()->getParameter('action');
            if ($a) {
                $pkg = Package::getPackageByName(Package::getPackageNameForClass($a));
            } else {
                $pkg = Package::getPackageByName('freeform');
                // Guard the lookup result before dereferencing it
                if ($pkg && ($a = $pkg->getProperty('action.default'))) {
                    $pkg = Package::getPackageByName(Package::getPackageNameForClass($a));
                }
            }
        }

        if ($pkg) {
            $name = $pkg->getResourcePath($name);
        }
    }

    if (!$name) {
        return self::SKIP_BODY;
    }

    $p = new HTMLParser($d);
    $r = $p->parse($name);
    if (!$r) {
        return self::SKIP_BODY;
    }

    $r->setExposed(false);
    $this->addNode($r);

    return self::PROCESS_BODY;
}
/**
 * Return array contains formatted XHTML string
 * created from the responded HTML of the given URL.
 *  array[code]    => HTTP status code
 *  array[headers] => HTTP headers
 *  array[body]    => formatted XHTML string made from the entity body
 * Throw exception if error.
 *
 * When a cache directory is configured and $cache_lifetime is positive, the
 * result is read from / written to a Cache_Lite cache keyed by the URL plus
 * the request headers and POST fields. If the request fails while a cached
 * entry exists, the cached entry is returned.
 *
 * @param string  $url                 fully qualified http(s) URL
 * @param integer $cache_lifetime      cache lifetime; 0 disables caching
 * @param boolean $conditional_request revalidate a cached entry using its
 *                                     Last-Modified / ETag headers
 * @param array   $headers             extra request headers
 * @param array   $post                POST fields
 * @return array
 * @throws Exception on an invalid URL, request failure without cache, an
 *                   unexpected status code or content type, an empty or
 *                   non-markup body, or a character encoding failure
 */
public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
{
    /*
     * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
     */
    if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
        throw new Exception("Not a valid or fully qualified HTTP URL.");
    }

    $data = false;
    $cache_lifetime = (int) $cache_lifetime;
    // Parenthesised on purpose: "=" binds tighter than "and", so the
    // unparenthesised form silently ignored $cache_lifetime.
    $use_cache = (!empty($this->cacheDir) && $cache_lifetime > 0);
    if ($use_cache) {
        $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
        $params = array();
        foreach ($headers as $key => $value) {
            if (!empty($value)) {
                $params[] = urlencode($key) . '=' . urlencode($value);
            }
        }
        foreach ($post as $key => $value) {
            $params[] = urlencode($key) . '=' . urlencode($value);
        }
        $cache_id = "{$url}?" . implode('&', $params);
        if (false !== ($data = $cache->get($cache_id))) {
            // NOTE(review): unserialize() is only safe as long as the cache
            // directory cannot be written by untrusted parties.
            $data = unserialize($data);
        }
    }

    /*
     * Access to the URL if not cached
     * or if the cache has either Last-Modified or Etag header
     * and conditional request is specified.
     */
    if ($conditional_request
        and !isset($data['headers']['last-modified'])
        and !isset($data['headers']['etag'])
    ) {
        // No validator at all in the cached entry: nothing to revalidate with.
        $conditional_request = false;
    }

    if (!$data or $conditional_request) {
        // NOTE(review): the cached validators are passed on under their
        // response-header names; presumably getHttpResponse() maps them to
        // If-Modified-Since / If-None-Match - confirm.
        if (isset($data['headers']['last-modified']) and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
            $headers['last-modified'] = $data['headers']['last-modified'];
        }
        if (isset($data['headers']['etag']) and (!isset($headers['etag']) or empty($headers['etag']))) {
            $headers['etag'] = $data['headers']['etag'];
        }

        // Stays null when the request fails but a cached entry can be used.
        $response = null;
        try {
            $response = $this->getHttpResponse($url, $headers, $post);
        } catch (Exception $e) {
            if (!$data) {
                throw $e;
            }
        }

        /*
         * Use cache if the responded HTTP status code is 304.
         * If 200, format the responded HTML of the given URL to XHTML.
         */
        if (!$data or isset($response['code']) and $response['code'] != 304) {
            $data = $response;

            /*
             * If status code was 200 and Content-Type was not (X)HTML,
             * the status code was forcibly altered to 204.
             * @see HTTP_Request_Listener_Extended->update().
             */
            if ($data['code'] != 200 and $data['code'] != 204) {
                throw new Exception("Responded HTTP Status Code is {$data['code']}.");
            } elseif (isset($data['headers']['content-type']) and !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
            } elseif (empty($data['body'])) {
                throw new Exception("Responded entity body is empty.");
            } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                throw new Exception("Responded entity body does not contain a markup symbol.");
            } elseif (false !== strpos($matches[0], "\0")) {
                // The needle must be an explicit NUL escape: an empty needle
                // matches at offset 0 on PHP 8 and would always throw here.
                throw new Exception("Responded entity body contains NULL.");
            }

            /*
             * Remove BOM and NULLs.
             */
            $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
            $data['body'] = str_replace("\0", '', $data['body']);

            /*
             * Initialize the backups.
             */
            $this->backup = array();
            $this->backup_count = 0;

            /*
             * Removing SCRIPT and STYLE is recommended.
             * An alternative is to encapsulate the content of the tags in
             * CDATA instead; if doing so, be aware that some JavaScript
             * methods such as document.write are not compliant with XHTML/XML.
             */
            $tags = array('script', 'style');
            foreach ($tags as $tag) {
                $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
            }

            /*
             * Backup CDATA sections for later process.
             */
            $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);

            /*
             * Comment section must not contain two or more adjacent hyphens.
             */
            $data['body'] = preg_replace_callback(
                '/<!--(.*?)-->/si',
                function ($matches) {
                    return '<!-- ' . preg_replace('/-{2,}/', '-', $matches[1]) . ' -->';
                },
                $data['body']
            );

            /*
             * Backup comment sections for later process.
             */
            $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);

            /*
             * Process tags that is potentially dangerous for XML parsers:
             * their raw text content must not be read as markup, so every
             * "<" inside is turned into the &lt; entity.
             */
            $data['body'] = preg_replace_callback(
                '/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si',
                function ($matches) {
                    return $matches[1] . str_replace('<', '&lt;', $matches[2]) . $matches[3];
                },
                $data['body']
            );
            $data['body'] = preg_replace_callback(
                '/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si',
                function ($matches) {
                    return '<pre>' . str_replace('<', '&lt;', $matches[1]) . '</pre>';
                },
                $data['body']
            );
            $data['body'] = preg_replace_callback(
                '/<plaintext\\b[^>]*?>(.*)$/si',
                function ($matches) {
                    return '<pre>' . str_replace('<', '&lt;', $matches[1]) . '</pre>';
                },
                $data['body']
            );

            /*
             * Remove DTD declarations, wrongly placed comments etc.
             * This must be done before removing DOCTYPE.
             */
            $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);

            /*
             * XML and DOCTYPE declaration will be replaced.
             */
            $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
            $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);
            if (preg_match('/^\\s*$/s', $data['body'])) {
                throw new Exception('The entity body became empty after preprocessing.');
            }

            /*
             * Detect character encoding and convert to UTF-8.
             * The Content-Type header wins; otherwise the first
             * <meta http-equiv="content-type"> is consulted.
             */
            $encoding = false;
            if (isset($data['headers']['content-type'])) {
                $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
            }
            if (!$encoding and preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                foreach ($matches[0] as $value) {
                    if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
                        and false !== ($encoding = $this->getAttribute('content', $value))
                    ) {
                        $encoding = $this->getCharsetFromCType($encoding);
                        break;
                    }
                }
            }

            /*
             * Use mbstring to convert character encoding if available.
             * Otherwise use iconv (iconv may try to detect character encoding automatically).
             * Do not trust the declared encoding and do conversion even if UTF-8.
             */
            if (extension_loaded('mbstring')) {
                if (!$encoding) {
                    @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                    // Check the detection result first: handing false on to
                    // mb_preferred_mime_name() is an error on current PHP.
                    $detected = @mb_detect_encoding($data['body']);
                    if (false === $detected or false === ($encoding = @mb_preferred_mime_name($detected))) {
                        throw new Exception('Failed detecting character encoding.');
                    }
                }
                @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
            } else {
                if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                    throw new Exception('Failed converting character encoding.');
                }
                foreach ($this->backup as $key => $value) {
                    if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                        throw new Exception('Failed converting character encoding.');
                    }
                }
            }

            /*
             * Restore CDATAs and comments.
             * Newest backup first: a comment backed up after the CDATA pass
             * may itself contain a CDATA placeholder, which only becomes
             * visible in the body once that comment has been restored.
             */
            for ($i = $this->backup_count - 1; $i >= 0; $i--) {
                $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
            }

            /*
             * Use Tidy to format HTML if available.
             * Otherwise, use HTMLParser class (is slower and consumes much memory).
             */
            if (extension_loaded('tidy')) {
                $tidy = new tidy();
                $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                $tidy->cleanRepair();
                $data['body'] = $tidy->html();
            } else {
                require_once 'HTMLParser.class.php';
                $parser = new HTMLParser();
                $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                $parser->setRule($format_rule);
                $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                $parser->setGenericParent('body');
                $parser->parse($data['body']);
                $data['body'] = $parser->dump();
            }

            /*
             * Valid XHTML DOCTYPE declaration (with DTD URI) is required
             * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
             */
            $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
            $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
            $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
            $data['body'] = "{$declarations}{$data['body']}";

            if ($use_cache) {
                $cache->save(serialize($data), $cache_id);
            }
        }
    }

    return $data;
}