/**
 * Build a cleaned-up HTML rendering of this instance's body.
 *
 * The body is passed through four stages in order: Readability extraction,
 * strip_tags(), HTMLPurifier and HTML Tidy. The outcome is stored on
 * $this->_readable, and any later call that finds a truthy stored value
 * returns it directly without looking at the arguments again.
 *
 * @param string $allowed_tags    Tag whitelist handed to strip_tags().
 * @param array  $allowed_classes Value stored under HTMLPurifier's Attr / AllowedClasses setting.
 * @return mixed The stored result of tidy_repair_string().
 */
function readable($allowed_tags = '<p><a><h1><h2><h3>', $allowed_classes = [])
{
    // Reuse the result of an earlier run on this instance.
    if ($this->_readable) {
        return $this->_readable;
    }

    // Workaround kept from the original author: BuzzFeed templates contain
    // this literal fragment, which was reported to confuse PHP's DOMDocument.
    $markup = str_replace("+ '</div>'", '', $this->body);

    // Stage 1: Readability extraction.
    $extractor = new Readability($markup);
    $extractor->init();
    $markup = $extractor->getContent()->innerHTML;

    // Stage 2: drop every tag that is not whitelisted.
    $markup = strip_tags($markup, $allowed_tags);

    // Stage 3: HTMLPurifier.
    $purifierConfig = HTMLPurifier_Config::createDefault();
    $purifierConfig->set('Attr', 'AllowedClasses', $allowed_classes);
    $purifier = new HTMLPurifier($purifierConfig);
    $markup = $purifier->purify($markup);

    // Stage 4: HTML Tidy. The result is stored so other methods on the
    // instance can reuse it without recomputing.
    $tidyOptions = ['bare' => true, 'show-body-only' => true, 'wrap' => 0];
    $this->_readable = tidy_repair_string($markup, $tidyOptions, 'UTF8');

    return $this->_readable;
}
/**
 * Fetch a page, extract its article and write the result to the cache.
 *
 * Sets $this->tidyAvailable (when the tidy extension is present),
 * $this->baseUrl, $this->title and $this->content as side effects.
 *
 * @param string $url Address of the page to fetch.
 * @return void
 */
private function readFromServer($url)
{
    // Record whether the tidy extension can be used.
    if (function_exists('tidy_parse_string')) {
        $this->tidyAvailable = true;
    }

    $parts = parse_url($url);
    $this->baseUrl = $parts['scheme'] . "://" . $parts['host'];

    $rawHtml = $this->tidyClean($this->fetchContent($url));

    $extractor = new Readability($rawHtml, $url);
    $extractor->init();
    $this->title = $extractor->getTitle()->textContent;

    // A second tidy pass is needed, otherwise DOMDocument will not work
    // properly on the extracted fragment (per the original author).
    $articleHtml = $this->tidyClean($extractor->getContent()->innerHTML);

    // Relative URLs are rewritten via DOMDocument inside fixRelativeUrls().
    $this->content = $this->fixRelativeUrls($articleHtml);

    $payload = array('title' => $this->title, 'content' => $this->content);
    $this->writeCache(serialize($payload));
}
/**
 * Download a page, run it through Readability and populate this object.
 *
 * Fills $this->metadata['title'] and $this->metadata['author'] when they are
 * not already set, then stores the processed images and the final HTML in
 * $this->images and $this->text. Terminates the script (printing the article
 * markup) when DOMDocument cannot load the wrapped article.
 *
 * @param string $url Page address; "http://" is prepended when no http(s) scheme is present.
 */
public function __construct($url)
{
    $hasScheme = preg_match('!^https?://!i', $url);
    if (!$hasScheme) {
        $url = 'http://' . $url;
    }

    $rawData = Http::Request($url);
    $html = mb_convert_encoding($rawData, "UTF-8", "UTF-8,ISO-8859-1,ASCII");

    $extractor = new Readability($html, $url);
    $extractor->init();

    if (!isset($this->metadata["title"])) {
        $plainTitle = strip_tags($extractor->getTitle()->innerHTML);
        $this->metadata["title"] = CharacterEntities::convert($plainTitle);
    }

    // Fall back to the host name as the author.
    if (!isset($this->metadata["author"])) {
        $urlParts = parse_url($url);
        $this->metadata["author"] = $urlParts["host"];
    }

    // Wrap the extracted fragment in a full document that declares UTF-8,
    // adding a <body> only when the fragment does not already start with one.
    $documentHead = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
    $article = $extractor->getContent()->innerHTML;
    if (substr($article, 0, 5) == "<body") {
        $article = $documentHead . $article . "</html>";
    } else {
        $article = $documentHead . "<body>" . $article . "</body></html>";
    }

    $doc = new DOMDocument();
    $loaded = @$doc->loadHTML($article);
    if (!$loaded) {
        die($article);
    }
    $doc->normalizeDocument();

    $this->images = $this->handleImages($doc, $url);
    $this->text = $doc->saveHTML();
}
/**
 * Collect the readable content of a URL unless it was collected before.
 *
 * Looks the URL up in the {$tablepre}collection_content table; when a row
 * exists a notice is echoed and nothing is fetched. Otherwise the page is
 * downloaded, its charset is sniffed from the markup and the page is handed
 * to Readability.
 *
 * @param string $href URL to collect.
 * @return array|null The Readability result when its 'content' is longer than
 *                    1000 bytes; null in every other case.
 */
function run_work($href)
{
    global $db2, $tablepre;

    // The URL is untrusted input, so it must not reach the query verbatim.
    // NOTE(review): $db2's own escaping API is not visible from here, so
    // addslashes() is used as a stop-gap; switch to the driver's escaping or
    // to prepared statements if $db2 offers them.
    $escapedHref = addslashes($href);
    $sql = "select * from {$tablepre}collection_content where url = '{$escapedHref}'";
    $row = $db2->fetch_array($db2->query($sql));
    if ($row) {
        // Escape the URL before echoing it into HTML.
        $shownHref = htmlspecialchars($href, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "<font color = 'red'>" . $shownHref . "已收集过</font><br>";
        return null;
    }

    $from = get_url_contents($href);
    if (!$from) {
        return null;
    }

    // Without a detectable charset declaration the page is skipped. empty()
    // also covers the case where the regex did not match at all, which
    // previously read an undefined index.
    preg_match('/charset=\\"?(.*?)\\"/si', $from, $charset);
    if (empty($charset[1])) {
        return null;
    }

    $da = new Readability($from, $charset[1]);
    $data = $da->getContent();

    // Keep only the part of the title before the first "_" or "-".
    if (preg_match('/(.*?)[_-].*?/si', $data['title'], $title)) {
        $data['title'] = $title[1];
    }

    return strlen($data['content']) > 1000 ? $data : null;
}
/**
 * Queue consumer callback: fetch an article URL, extract it with Readability
 * and store the extracted title/content on the matching `articles` row.
 *
 * The message body is expected to be JSON with `article_id` and `url`
 * members. The message is acknowledged in every case, including malformed
 * payloads and failed downloads.
 *
 * @param object $msg Message exposing ->body and ->delivery_info
 *                    ('channel' and 'delivery_tag').
 * @return void
 */
function process_message($msg)
{
    global $useragent;

    $body = $msg->body;
    print "{$body}\n";

    $channel = $msg->delivery_info['channel'];
    $deliveryTag = $msg->delivery_info['delivery_tag'];

    $blob = json_decode($body);
    if (!is_object($blob) || !isset($blob->article_id, $blob->url)) {
        // Malformed payload: there is nothing to fetch. Acknowledge it, as
        // the original flow acknowledged every message.
        $channel->basic_ack($deliveryTag);
        return;
    }
    $id = $blob->article_id;
    $url = $blob->url;

    $crl = curl_init();
    curl_setopt($crl, CURLOPT_URL, $url);
    curl_setopt($crl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($crl, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($crl, CURLOPT_MAXCONNECTS, 30);
    curl_setopt($crl, CURLOPT_TIMEOUT, 10);
    curl_setopt($crl, CURLOPT_USERAGENT, $useragent);
    $html = curl_exec($crl);
    curl_close($crl);

    // curl_exec() returns false on failure; skip extraction in that case.
    if ($html !== false) {
        // NOTE(review): search and replacement are the same character as the
        // file stands, so this is a no-op; it looks like an encoding fix whose
        // original bytes were lost - confirm against version control.
        $html = str_replace('’', '’', $html);

        $r = new Readability($html);
        $rData = $r->getContent();
        $rtitle = addslashes($rData['title']);
        $rcontent = addslashes($rData['content']);

        if ($rcontent != "") {
            // The id comes from the message payload, so escape it like the
            // other interpolated values.
            $safeId = addslashes($id);
            mysql_query("update articles set readability_title = '{$rtitle}', readability_content = '{$rcontent}' where id = '{$safeId}'");
        }
    }

    $channel->basic_ack($deliveryTag);
}
/**
 * Download a page and extract its title and main content with Readability.
 *
 * @param string $url Address to fetch; replaced by the client's final URI
 *                    after the request (i.e. after redirections).
 * @return array|false|null array('title' => ..., 'content' => ...) on
 *                          success, false when no Readability class can be
 *                          loaded, null when Readability's init() fails.
 */
function grabContent($url)
{
    global $prefs;

    $tikilib = TikiLib::lib('tiki');
    $client = $tikilib->get_http_client($url);
    $response = $tikilib->http_perform_request($client);

    // Use the URL the client ended up at after redirections.
    $url = (string) $client->getUri();

    // PHP Readability expects UTF-8 input; content in another encoding should
    // be converted first (iconv() or mb_convert_encoding()). Input is cleaned
    // through $this->tidy() because PHP's default HTML parser often produces
    // strange output on untidy markup.
    $html = $this->tidy($response->getBody());

    // Load the Readability library from the configured path, if it exists.
    $libraryPath = $prefs['page_content_fetch_readability'];
    if (is_file($libraryPath)) {
        require_once $libraryPath;
    }
    if (!class_exists('Readability')) {
        return false;
    }

    $readability = new Readability($html, $url);
    if (!$readability->init()) {
        return null;
    }

    $content = $this->tidy($readability->getContent()->innerHTML);
    $content = $this->replacePaths($content, $url);

    return array(
        'title' => $readability->getTitle()->textContent,
        'content' => $content,
    );
}
/**
 * Extract an article from a page using php-readability.
 *
 * @param string $url Address of the page to download.
 * @return array array('title' => ..., 'content' => ...) taken from the
 *               Readability result.
 */
function getArticle($url)
{
    // require_once rather than require: a plain require re-declares the
    // Readability class on the second call and aborts with a fatal error.
    require_once 'class.Readability.php';

    $html = file_get_contents($url);

    // The input charset is passed explicitly; utf-8 is also the library default.
    $html_input_charset = 'utf-8';
    $readability = new Readability($html, $html_input_charset);
    $extracted = $readability->getContent();

    return array(
        'title' => $extracted['title'],
        'content' => $extracted['content'],
    );
}
/**
 * Replace a cached item's description with the full article from its link.
 *
 * When Readability succeeds, the extracted HTML is stored on the item and
 * persisted through the cache; otherwise the item is returned untouched.
 *
 * @param mixed $itemId Identifier understood by $this->cache->getItem().
 * @return object The (possibly updated) cache item.
 */
public function downloadItem($itemId)
{
    $item = $this->cache->getItem($itemId);

    $pageHtml = file_get_contents($item->link);
    $reader = new Readability($pageHtml, $item->link);

    if (!$reader->init()) {
        return $item;
    }

    $item->description = $reader->articleContent->innerHTML;
    // Persist the downloaded content to the cache, just in case.
    $this->cache->saveItemDescription($item);

    return $item;
}
/**
 * Extract title and content from already-downloaded HTML.
 *
 * The page URL is taken from the response's Location header when present,
 * otherwise from the "readerUrl" option; it determines $this->basePath and
 * $this->baseUrl, which are set as side effects.
 *
 * @param string $html Raw page markup.
 * @return array|null array('title' => ..., 'content' => ...), or null when
 *                    the input is empty or Readability's init() fails.
 */
public function parseData($html)
{
    if (strlen($html) == 0) {
        return null;
    }

    // Record whether the tidy extension can be used.
    if (function_exists('tidy_parse_string')) {
        $this->tidyAvailable = true;
    }

    $headers = $this->response->getHeaders();
    $url = isset($headers['Location']) ? $headers["Location"] : $this->getOption("readerUrl");

    $urlArray = parse_url($url);
    $this->basePath = isset($urlArray['path']) ? dirname($urlArray['path']) : "";
    $this->baseUrl = $urlArray['scheme'] . "://" . $urlArray['host'];

    $readability = new Readability($this->tidyClean($html), $url);
    if (!$readability->init()) {
        return null;
    }

    $title = $readability->getTitle()->textContent;
    $content = $readability->getContent()->innerHTML;

    // One more tidy pass is needed, otherwise DOMDocument will not work
    // properly on the fragment (per the original author).
    $content = $this->tidyClean($content);
    $content = $this->removeEmptyTags($content);

    // Relative URLs are fixed via DOMDocument inside fixRelativeUrls().
    $content = $this->fixRelativeUrls($content);

    // With tidy available, look for a charset in the parsed head and convert
    // the content to utf-8 when a different one is declared.
    if ($this->tidyAvailable) {
        $tidy = tidy_parse_string($content, array(), 'utf8');
        $head = $tidy->head();
        $charset = $this->findCharset($head->value);
        if (!empty($charset) && $charset != "utf-8") {
            $content = mb_convert_encoding($content, "utf-8", $charset);
        }
    }

    return array('title' => $title, 'content' => $content);
}
/**
 * Scrape every listing not yet marked as scraped.
 *
 * For each listing the Craigslist page is downloaded, the street address is
 * read from the page, the address (or, failing that, the listing's
 * neighborhood) is geocoded through the Google Geocoding API, and the row is
 * updated with street, description and coordinates. Exceptions are logged
 * per listing and do not stop the loop.
 *
 * @return void
 */
function perform()
{
    $q = DB::query('SELECT link, neighborhood FROM listings WHERE scraped != TRUE', PDO::FETCH_ASSOC);
    $ps = DB::prepare('UPDATE listings SET scraped=TRUE, street=:street, description=:description, lat=:lat, lng=:lng WHERE link=:link');

    foreach ($q as $listing) {
        try {
            $body = Guzzle::get('http://newyork.craigslist.org' . $listing['link'])->getBody();
            $crawler = new Crawler($body);
            $readability = new Readability($body);

            $street = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
            $streetText = $street->count() ? $street->text() : null;

            // Addresses contain spaces, commas, '#', '&' and similar, so the
            // value must be URL-encoded before it goes into the query string.
            $address = $streetText !== null ? $streetText : $listing['neighborhood'];
            $url = 'http://maps.googleapis.com/maps/api/geocode/json?address=' . rawurlencode((string) $address);

            $json = json_decode(Guzzle::get($url)->getBody(), true);
            $loc = isset($json['results'][0]) ? $json['results'][0]['geometry']['location'] : null;

            $ps->execute([
                ':link' => $listing['link'],
                ':lat' => isset($loc['lat']) ? $loc['lat'] : null,
                ':lng' => isset($loc['lng']) ? $loc['lng'] : null,
                ':street' => $streetText,
                // Plain-text description: tidy the extracted HTML, strip tags, trim.
                ':description' => $readability->init()
                    ? trim(strip_tags(tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8')))
                    : null,
            ]);
        } catch (Exception $e) {
            Logger::error($e->getMessage(), $ps->errorinfo());
        }
    }
}
/**
 * Download a feed, update its stored metadata and store every item with its
 * full article content.
 *
 * For each item the linked page is fetched, cleaned with tidy and passed to
 * Readability; when extraction fails the feed's own item content is used.
 *
 * @param OutputInterface $output Console output for progress and errors.
 * @param mixed           $id     Primary key of the feed row.
 * @param string          $url    Feed URL.
 * @return mixed Result of writeErrors() when the feed cannot be parsed,
 *               otherwise nothing.
 */
protected function absorb(OutputInterface $output, $id, $url)
{
    $config = new Config();
    $config->setClientUserAgent('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:11.0) Gecko/20100101 Firefox/11.0');

    $reader = new Reader($config);
    $reader->download($url);

    $parser = $reader->getParser();
    if ($parser === false) {
        return $this->writeErrors($output);
    }

    $feed = $parser->execute();
    if ($feed === false) {
        return $this->writeErrors($output);
    }

    $feedData = [
        'lang' => $feed->getLanguage(),
        'title' => $feed->getTitle(),
        'lastUpdate' => $this->formatDateForMySQL($feed->getDate()),
    ];
    $this->feedRepository->updateByPk($feedData, $id);

    foreach ($feed->items as $item) {
        $itemUrl = $item->getUrl();
        $output->writeln('+ ' . $item->title);

        // Clean the downloaded page before handing it to Readability.
        $page = tidy_parse_string(file_get_contents($itemUrl), array(), 'UTF8');
        $page->cleanRepair();

        $readability = new \Readability($page->value, $itemUrl);
        if ($readability->init()) {
            $tidyOptions = array('indent' => true, 'show-body-only' => true);
            $pretty = tidy_parse_string($readability->getContent()->innerHTML, $tidyOptions, 'UTF8');
            $pretty->cleanRepair();
            $content = $pretty->value;
        } else {
            $output->writeln('unable to get full content');
            $content = $item->getContent();
        }

        $postData = [
            'feedId' => $id,
            'remoteId' => $item->getId(),
            'title' => $item->getTitle(),
            'url' => $itemUrl,
            'pubDate' => $this->formatDateForMySQL($item->getDate()),
            'content' => $content,
            'author' => $item->getAuthor(),
        ];
        $this->postRepository->add($postData, true);
    }
}
/**
 * Replace an article's content with the Readability extraction of its link.
 *
 * On success $article['content'] is overwritten and $article['plugin_data']
 * is prefixed with "feedmod,<owner uid>:". On failure the article is left
 * untouched.
 *
 * @param array $article Article record, modified in place; reads 'link',
 *                       'plugin_data' and (when present) 'owner_uid'.
 * @param array $config  Filter configuration; 'footnote_links' toggles
 *                       Readability's link-to-footnote conversion.
 * @return void
 */
function filter_article_readability(&$article, $config)
{
    require_once 'readability/Readability.php';

    $link = trim($article['link']);
    $html = $this->load_url($link, $config);

    $readability = new Readability($html, $link);
    $readability->debug = false;
    $readability->convertLinksToFootnotes = isset($config['footnote_links']) && $config['footnote_links'];

    if ($readability->init()) {
        // $owner_uid was never defined in this function, so the prefix always
        // came out as "feedmod,:" (with an undefined-variable notice).
        // NOTE(review): assumes the owner id travels on the article record as
        // 'owner_uid' - confirm against the caller. When the key is absent the
        // prefix stays "feedmod,:" exactly as before.
        $owner_uid = isset($article['owner_uid']) ? $article['owner_uid'] : '';

        $article['content'] = $readability->getContent()->innerHTML;
        $article['plugin_data'] = "feedmod,{$owner_uid}:" . $article['plugin_data'];
    }
}
/**
 * Parse a news page into title, content, search text, thumbnail and date.
 *
 * Content comes from tryContentDetect() when that yields something,
 * otherwise from Readability. Any Exception raised along the way is reported
 * through the 'error' key of the returned array instead of propagating.
 *
 * @param string       $html   Raw page markup.
 * @param string       $url    Page address, passed to Readability.
 * @param Sources|null $source Optional source; stored on $this->source when given.
 * @return array Keys 'title', 'content', 'searchContent', 'thumb', 'date'
 *               on success; empty when no searchable text remains; 'error'
 *               set on failure.
 */
public function parse($html, $url, Sources $source = null)
{
    if ($source) {
        $this->source = $source;
    }

    try {
        $parsedNews = array();

        $html = $this->stripTagWithContent($html, "script");
        $content = $this->tryContentDetect($this->processExcludeElements($html));

        $readability = new \Readability($html, $url);
        $readability->debug = false;
        $readability->convertLinksToFootnotes = false;
        $result = $readability->init();

        // Neither detector produced anything usable.
        if (!$result && !$content) {
            throw new Exception('Looks like we couldn\'t find the content. :(');
        }

        $title = $this->processTitleStopWords($readability->getTitle()->textContent);

        if (!$content) {
            $content = $readability->getContent()->innerHTML;
        }
        $content = $this->processContentStopWords($content);
        $content = preg_replace('/\\n/', ' ', $content);
        $content = strip_tags($content, "<p><div><img><span><br><ul><li><embed><iframe>");
        $content = $this->fixUrls($content);
        $content = $this->processExcludeElements($content);

        $date = $this->processPublishDate($html);

        // Build the search text: tags removed, only Cyrillic/Latin letters and
        // spaces kept, whitespace collapsed, then encoded as HTML entities.
        $searchContent = trim(strip_tags($content));
        if ($searchContent) {
            $searchContent = preg_replace('/\\n/', ' ', $searchContent);
            $searchContent = preg_replace("/[^а-яa-z ]/ui", "", $searchContent);
            $searchContent = preg_replace('/\\s+/', ' ', $searchContent);
            $searchContent = mb_convert_encoding($searchContent, 'HTML-ENTITIES', "UTF-8");

            $parsedNews['title'] = $title;
            $parsedNews['content'] = $content;
            $parsedNews['searchContent'] = $searchContent;
            $parsedNews['thumb'] = $this->detectThumb($html, $content);
            $parsedNews['date'] = $date;
        }
    } catch (Exception $e) {
        $parsedNews['error'] = $e->getMessage();
    }

    return $parsedNews;
}
/**
 * Fetch a URL and return the Readability-extracted article HTML.
 *
 * When curl is usable, a header-only request is made first and anything that
 * is not text/html is rejected. Documents of 65535 * 4 characters or more
 * are skipped. Link and image URLs in the result are rewritten against $url
 * via rewrite_relative_url().
 *
 * @param string $url Address of the page.
 * @return string|false Extracted HTML, or false when nothing could be extracted.
 */
public function extract_content($url)
{
    if (!class_exists("Readability")) {
        require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
    }

    $curlUsable = !defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir");
    if ($curlUsable) {
        // Header-only probe so the content type is known before downloading.
        $probe = curl_init($url);
        curl_setopt($probe, CURLOPT_TIMEOUT, 5);
        curl_setopt($probe, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($probe, CURLOPT_HEADER, true);
        curl_setopt($probe, CURLOPT_NOBODY, true);
        curl_setopt($probe, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($probe, CURLOPT_USERAGENT, SELF_USER_AGENT);
        @curl_exec($probe);

        $content_type = curl_getinfo($probe, CURLINFO_CONTENT_TYPE);
        if (strpos($content_type, "text/html") === FALSE) {
            return false;
        }
    }

    $tmp = fetch_file_contents($url);
    if (!$tmp || mb_strlen($tmp) >= 65535 * 4) {
        return false;
    }

    $tmpdoc = new DOMDocument("1.0", "UTF-8");
    if (!$tmpdoc->loadHTML($tmp)) {
        return false;
    }

    // For documents not detected as utf-8, drop every <meta> element and
    // re-serialise before handing the markup to Readability.
    if (strtolower($tmpdoc->encoding) != 'utf-8') {
        $metaXpath = new DOMXPath($tmpdoc);
        foreach ($metaXpath->query("//meta") as $meta) {
            $meta->parentNode->removeChild($meta);
        }
        $tmp = $tmpdoc->saveHTML();
    }

    $r = new Readability($tmp, $url);
    if (!$r->init()) {
        return false;
    }

    // Make link and image URLs absolute relative to the page address.
    $resultXpath = new DOMXPath($r->dom);
    foreach ($resultXpath->query('(//a[@href]|//img[@src])') as $node) {
        foreach (array("href", "src") as $attribute) {
            if ($node->hasAttribute($attribute)) {
                $node->setAttribute($attribute, rewrite_relative_url($url, $node->getAttribute($attribute)));
            }
        }
    }

    return $r->articleContent->innerHTML;
}
} else { $handle = curl_init(); curl_setopt_array($handle, array(CURLOPT_USERAGENT => USER_AGENT, CURLOPT_FOLLOWLOCATION => true, CURLOPT_HEADER => false, CURLOPT_HTTPGET => true, CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 30, CURLOPT_URL => $request_url)); $source = curl_exec($handle); curl_close($handle); // Write request data into cache file. @file_put_contents($request_url_cache_file, $source); } // 判断编码 //if (!$charset = mb_detect_encoding($source)) { //} preg_match("/charset=([\\w|\\-]+);?/", $source, $match); $charset = isset($match[1]) ? $match[1] : 'utf-8'; /** * 获取 HTML 内容后,解析主体内容 */ $Readability = new Readability($source, $charset); $Data = $Readability->getContent(); switch ($output_type) { case 'json': header("Content-type: text/json;charset=utf-8"); $Data['url'] = $request_url; echo json_encode($Data); break; case 'html': default: header("Content-type: text/html;charset=utf-8"); $title = $Data['title']; $content = $Data['content']; include 'template/reader.html'; }
/**
 * Download a page and return its Readability-extracted content.
 *
 * The page is fetched with curl, falling back to file_get_contents() when
 * curl is missing or the transfer fails. The markup is converted to UTF-8
 * using the first "charset=" declaration found in it (default utf-8),
 * cleaned with tidy when available, and passed to Readability.
 *
 * @param string $request_url Address of the page.
 * @return string|null Extracted HTML, or null when the page could not be
 *                     fetched or an Exception occurred.
 */
private function get_full_post($request_url)
{
    try {
        // curl signals failure by returning false, not by throwing, so the
        // fallback has to be triggered by checking the result.
        $source = false;
        if (function_exists('curl_init')) {
            $handle = curl_init();
            curl_setopt_array($handle, array(
                CURLOPT_USERAGENT => "Tiny Tiny RSS",
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_HEADER => false,
                CURLOPT_HTTPGET => true,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => 30,
                CURLOPT_URL => $request_url,
            ));
            $source = curl_exec($handle);
            curl_close($handle);
        }
        if ($source === false) {
            $source = file_get_contents($request_url);
        }
        if ($source === false) {
            return null;
        }

        // Encoding fix contributed by itohsnap:
        // https://github.com/itohsnap/ttrss_fullpost/commit/815e163b724fbfb426eff43bde6c3aa744a22ae5
        preg_match("/charset=([\\w|\\-]+);?/", $source, $match);
        $charset = isset($match[1]) ? $match[1] : 'utf-8';
        $source = mb_convert_encoding($source, 'UTF-8', $charset);

        // Clean the input with tidy, if it exists.
        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($source, array(), 'UTF8');
            $tidy->cleanRepair();
            $source = $tidy->value;
        }

        require_once 'Readability.php';
        $readability = new Readability($source);
        $readability->debug = false;
        $readability->convertLinksToFootnotes = false;
        $readability->init();
        $content = $readability->getContent()->innerHTML;

        // Tidy the extracted fragment for output as well.
        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
            $tidy->cleanRepair();
            $content = $tidy->value;
        }

        return $content;
    } catch (Exception $e) {
        // Best effort: callers get null when the full text cannot be
        // retrieved. The original read an undefined $Data here.
        return null;
    }
}
<?php require_once '../Readability.php'; header('Content-Type: text/plain; charset=utf-8'); // get latest Medialens alert // (change this URL to whatever you'd like to test) $url = 'http://medialens.org/alerts/index.php'; $html = file_get_contents($url); // Note: PHP Readability expects UTF-8 encoded content. // If your content is not UTF-8 encoded, convert it // first before passing it to PHP Readability. // Both iconv() and mb_convert_encoding() can do this. // give it to Readability $readability = new Readability($html, $url); // print debug output? // useful to compare against Arc90's original JS version - // simply click the bookmarklet with FireBug's console window open $readability->debug = false; // convert links to footnotes? $readability->convertLinksToFootnotes = true; // process it $result = $readability->init(); // does it look like we found what we wanted? if ($result) { echo "== Title =====================================\n"; echo $readability->getTitle()->textContent, "\n\n"; echo "== Body ======================================\n"; $content = $readability->getContent()->innerHTML; // if we've got Tidy, let's clean it up for output if (function_exists('tidy_parse_string')) { $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
/**
 * Article filter for reddit.com/r/ feeds.
 *
 * Optionally marks an article for catch-up when another entry older than one
 * day already links to the same target ("enable_content_dupcheck"), inlines
 * media through inline_stuff(), and - when nothing was inlined, Readability
 * is enabled and the article text is at most 150 characters - prepends the
 * Readability extraction of the page behind the "[link]" anchor.
 *
 * @param array $article Article record; reads 'link', 'content',
 *                       'guid_hashed' and 'owner_uid'.
 * @return array The (possibly modified) article.
 */
function hook_article_filter($article)
{
    if (strpos($article["link"], "reddit.com/r/") !== FALSE) {
        $doc = new DOMDocument();
        @$doc->loadHTML($article["content"]);
        $xpath = new DOMXPath($doc);

        // The "[link]" anchor is needed by both the duplicate check and the
        // Readability step. It used to be looked up only inside the duplicate
        // check, which left it undefined - and Readability silently disabled -
        // whenever that option was off.
        $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);

        if ($this->host->get($this, "enable_content_dupcheck")) {
            if ($content_link) {
                $content_href = db_escape_string($content_link->getAttribute("href"));
                $entry_guid = db_escape_string($article["guid_hashed"]);
                $owner_uid = $article["owner_uid"];

                if (DB_TYPE == "pgsql") {
                    $interval_qpart = "date_entered < NOW() - INTERVAL '1 day'";
                } else {
                    $interval_qpart = "date_entered < DATE_SUB(NOW(), INTERVAL 1 DAY)";
                }

                $result = db_query("SELECT COUNT(id) AS cid\n\t\t\t\t\t\tFROM ttrss_entries, ttrss_user_entries WHERE\n\t\t\t\t\t\t\tref_id = id AND\n\t\t\t\t\t\t\t{$interval_qpart} AND\n\t\t\t\t\t\t\tguid != '{$entry_guid}' AND\n\t\t\t\t\t\t\towner_uid = '{$owner_uid}' AND\n\t\t\t\t\t\t\tcontent LIKE '%href=\"{$content_href}\">[link]%'");

                if ($result) {
                    $num_found = db_fetch_result($result, 0, "cid");
                    if ($num_found > 0) {
                        $article["force_catchup"] = true;
                    }
                }
            }
        }

        $found = $this->inline_stuff($article, $doc, $xpath);

        if (!defined('NO_CURL') && function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150) {
            if (!class_exists("Readability")) {
                require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
            }

            $link_href = $content_link ? $content_link->getAttribute("href") : "";

            if ($content_link && strpos($link_href, "twitter.com") === FALSE && strpos($link_href, "youtube.com") === FALSE && strpos($link_href, "reddit.com") === FALSE) {
                /* link may lead to a huge video file or whatever, so the
                   content type is checked before trying to parse it, which
                   pretty much requires curl */
                $ch = curl_init($link_href);
                curl_setopt($ch, CURLOPT_TIMEOUT, 5);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($ch, CURLOPT_HEADER, true);
                curl_setopt($ch, CURLOPT_NOBODY, true);
                curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir"));
                curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
                @curl_exec($ch);
                $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

                if ($content_type && strpos($content_type, "text/html") !== FALSE) {
                    $tmp = fetch_file_contents($link_href);

                    // Skip documents of 65535 * 4 characters or more.
                    if ($tmp && mb_strlen($tmp) < 65535 * 4) {
                        $r = new Readability($tmp, $link_href);

                        if ($r->init()) {
                            // Make link and image URLs absolute relative to the target page.
                            $tmpxpath = new DOMXPath($r->dom);
                            $entries = $tmpxpath->query('(//a[@href]|//img[@src])');

                            foreach ($entries as $entry) {
                                if ($entry->hasAttribute("href")) {
                                    $entry->setAttribute("href", rewrite_relative_url($link_href, $entry->getAttribute("href")));
                                }
                                if ($entry->hasAttribute("src")) {
                                    $entry->setAttribute("src", rewrite_relative_url($link_href, $entry->getAttribute("src")));
                                }
                            }

                            // The extracted content is deliberately not run
                            // through inline_stuff() again: the original
                            // author found the inliner unfit for arbitrary
                            // web content (it broke Wikipedia pages, etc).
                            $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                        }
                    }
                }
            }
        }

        // Serialise the DOM back only when inline_stuff() reported a change.
        $node = $doc->getElementsByTagName('body')->item(0);
        if ($node && $found) {
            $article["content"] = $doc->saveXML($node);
        }
    }

    return $article;
}
/**
 * Article filter for reddit.com/r/ feeds.
 *
 * Inlines media through inline_stuff(); when nothing was inlined,
 * Readability is enabled and the article text is at most 150 characters, the
 * page behind the "[link]" anchor is fetched and its Readability extraction
 * is prepended to the article content.
 *
 * @param array $article Article record; reads 'link' and 'content'.
 * @return array The (possibly modified) article.
 */
function hook_article_filter($article)
{
    if (strpos($article["link"], "reddit.com/r/") === FALSE) {
        return $article;
    }

    $doc = new DOMDocument();
    @$doc->loadHTML($article["content"]);
    $xpath = new DOMXPath($doc);

    $found = $this->inline_stuff($article, $doc, $xpath);

    $tryReadability = function_exists("curl_init")
        && !$found
        && $this->host->get($this, "enable_readability")
        && mb_strlen(strip_tags($article["content"])) <= 150;

    if ($tryReadability) {
        if (!class_exists("Readability")) {
            require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
        }

        $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);
        $target = $content_link ? $content_link->getAttribute("href") : "";

        $isArticleLink = $content_link
            && strpos($target, "twitter.com") === FALSE
            && strpos($target, "youtube.com") === FALSE
            && strpos($target, "reddit.com") === FALSE;

        if ($isArticleLink) {
            /* the link may lead to a huge video file or whatever, so the
               content type is checked before trying to parse it, which
               pretty much requires curl */
            $probe = curl_init($target);
            curl_setopt($probe, CURLOPT_TIMEOUT, 5);
            curl_setopt($probe, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($probe, CURLOPT_HEADER, true);
            curl_setopt($probe, CURLOPT_NOBODY, true);
            curl_setopt($probe, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
            curl_setopt($probe, CURLOPT_USERAGENT, SELF_USER_AGENT);
            @curl_exec($probe);
            $content_type = curl_getinfo($probe, CURLINFO_CONTENT_TYPE);

            $isHtml = $content_type && strpos($content_type, "text/html") !== FALSE;
            $tmp = $isHtml ? fetch_file_contents($target) : null;

            if ($tmp) {
                $r = new Readability($tmp, $target);

                if ($r->init()) {
                    // Make link and image URLs absolute relative to the target page.
                    $resultXpath = new DOMXPath($r->dom);
                    foreach ($resultXpath->query('(//a[@href]|//img[@src])') as $node) {
                        if ($node->hasAttribute("href")) {
                            $node->setAttribute("href", rewrite_relative_url($target, $node->getAttribute("href")));
                        }
                        if ($node->hasAttribute("src")) {
                            $node->setAttribute("src", rewrite_relative_url($target, $node->getAttribute("src")));
                        }
                    }

                    // The extracted content is deliberately not run through
                    // inline_stuff() again: the original author found the
                    // inliner unfit for arbitrary web content.
                    $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                }
            }
        }
    }

    // Serialise the DOM back only when inline_stuff() reported a change.
    $body = $doc->getElementsByTagName('body')->item(0);
    if ($body && $found) {
        $article["content"] = $doc->saveXML($body);
    }

    return $article;
}
/**
 * Runs a URL through Readability and hands back the stripped content.
 *
 * The page is fetched with wp_remote_get(), cleaned with tidy when
 * available, extracted with Readability, normalised with WordPress
 * formatting helpers and finally has its image sources rewritten to
 * absolute URLs (images that cannot be reached are removed).
 *
 * @since 1.7
 * @see http://www.keyvan.net/2010/08/php-readability/
 * @param string $url Address of the page.
 * @return string|false Extracted markup; the string 'error-secured' when
 *                      the HTTP request fails; false when the response is
 *                      empty, Readability fails, or the result is shorter
 *                      than 120 bytes.
 */
public static function readability_object($url)
{
    set_time_limit(0);
    $url = pf_de_https($url);
    // Decode HTML-escaped ampersands in the URL. As found, this call replaced
    // '&' with '&' - a no-op - so decoding '&amp;' is the presumed intent.
    $url = str_replace('&amp;', '&', $url);

    // wp_remote_get() instead of file_get_contents() (change from Boone).
    $request = wp_remote_get($url, array('timeout' => '30'));
    if (is_wp_error($request)) {
        return 'error-secured';
    }
    if (empty($request['body'])) {
        return false;
    }
    $html = $request['body'];

    // Clean up the input with tidy when it exists.
    if (function_exists('tidy_parse_string')) {
        $tidy = tidy_parse_string($html, array(), 'UTF8');
        $tidy->cleanRepair();
        $html = $tidy->value;
    }

    $readability = new Readability($html, $url);
    $readability->debug = false;
    $readability->convertLinksToFootnotes = false;

    if ($readability->init()) {
        $content = $readability->getContent()->innerHTML;

        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
            $tidy->cleanRepair();
            $content = $tidy->value;
        }

        $content = balanceTags($content, true);
        $content = ent2ncr($content);
        $content = convert_chars($content);

        $domRotated = 0;
        $dom = new domDocument('1.0', 'utf-8');
        $dom->preserveWhiteSpace = true;
        $dom->substituteEntities = true;
        $dom->resolveExternals = true;
        $dom->loadXML('<fullContent>' . $content . '</fullContent>');

        // getElementsByTagName() returns a live list; removing a node while
        // iterating it makes the loop skip the following element. Iterate
        // over a snapshot instead.
        $images = iterator_to_array($dom->getElementsByTagName('img'));
        foreach ($images as $image) {
            $img = $image->getAttribute('src');

            // NOTE(review): strpos() returns false when 'http' is absent and
            // false != 0 is false, so the second test only matches sources
            // containing 'http' past position 0 - confirm whether "does not
            // start with http" was intended.
            if (strpos($img, '/') === 0 || strpos($img, 'http') != 0) {
                $urlArray = parse_url($url);
                if (strpos($img, 'http') != 0) {
                    $urlBase = 'http://' . $urlArray['host'] . '/';
                } else {
                    $urlBase = 'http://' . $urlArray['host'];
                }

                // Try the host root first, then the page URL; drop the image
                // when neither answers a HEAD request.
                if (!is_wp_error(wp_remote_head($urlBase . $img))) {
                    $image->setAttribute('src', $urlBase . $img);
                    $domRotated++;
                } elseif (!is_wp_error(wp_remote_head($url . $img))) {
                    $image->setAttribute('src', $url . $img);
                    $domRotated++;
                } else {
                    $image->parentNode->removeChild($image);
                    $domRotated++;
                }
            }
        }

        // Re-serialise only when the DOM was changed, then blank out the XML
        // declaration that saveXML() adds.
        if ($domRotated > 0) {
            $content = $dom->saveXML();
            $rel = '(<\\?xml version="1\\.0" encoding="utf-8"\\?>)';
            $content = preg_replace("/" . $rel . "/is", ' ', $content);
            $rel = '(<\\?xml version="1\\.0"\\?>)';
            $content = preg_replace("/" . $rel . "/is", ' ', $content);
        }

        // Anything shorter than 120 bytes is treated as a failed extraction.
        if (120 > strlen($content)) {
            $content = false;
        }
    } else {
        // Readability could not get the content; send back false to loop with.
        $content = false;
    }

    if ($content != false) {
        $contentObj = new pf_htmlchecker($content);
        $content = $contentObj->closetags($content);
    }

    return $content;
}
/**
 * Run markup through Readability and return whatever getContent() yields.
 *
 * @param string $html Markup to process.
 * @return mixed Result of Readability::getContent().
 */
public function readability($html)
{
    return (new Readability($html))->getContent();
}
/**
 * Article filter that replaces an article's content with the
 * Readability extraction of its link, for feeds that have this enabled.
 *
 * When curl is available a header-only request is made first and anything
 * that is not text/html is left untouched. Link and image URLs in the
 * extracted content are rewritten against the article link.
 *
 * @param array $article Article record; reads 'feed' => 'id' and 'link'.
 * @return array The (possibly modified) article.
 */
function hook_article_filter($article)
{
    $enabled_feeds = $this->host->get($this, "enabled_feeds");

    // The stored setting may not be an array (for example when nothing was
    // saved yet); array_search() on a non-array does not return FALSE on
    // older PHP versions, which let every article through.
    if (!is_array($enabled_feeds)) {
        return $article;
    }
    if (array_search($article["feed"]["id"], $enabled_feeds) === FALSE) {
        return $article;
    }

    if (!class_exists("Readability")) {
        require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
    }

    if (function_exists("curl_init")) {
        // Header-only probe so the content type is known before downloading.
        $ch = curl_init($article["link"]);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_NOBODY, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
        curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
        @curl_exec($ch);
        $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

        if (strpos($content_type, "text/html") === FALSE) {
            return $article;
        }
    }

    $tmp = fetch_file_contents($article["link"]);
    if (!$tmp) {
        return $article;
    }

    $tmpdoc = new DOMDocument("1.0", "UTF-8");
    if (!$tmpdoc->loadHTML($tmp)) {
        return $article;
    }

    // For documents not detected as utf-8, drop every <meta> element and
    // re-serialise before handing the markup to Readability. The comparison
    // is case-insensitive, matching extract_content(); a declared "utf-8"
    // used to be treated as a foreign encoding.
    if (strtolower((string) $tmpdoc->encoding) != 'utf-8') {
        $tmpxpath = new DOMXPath($tmpdoc);
        foreach ($tmpxpath->query("//meta") as $elem) {
            $elem->parentNode->removeChild($elem);
        }
        $tmp = $tmpdoc->saveHTML();
    }

    $r = new Readability($tmp, $article["link"]);
    if ($r->init()) {
        // Make link and image URLs absolute relative to the article link.
        $tmpxpath = new DOMXPath($r->dom);
        $entries = $tmpxpath->query('(//a[@href]|//img[@src])');

        foreach ($entries as $entry) {
            if ($entry->hasAttribute("href")) {
                $entry->setAttribute("href", rewrite_relative_url($article["link"], $entry->getAttribute("href")));
            }
            if ($entry->hasAttribute("src")) {
                $entry->setAttribute("src", rewrite_relative_url($article["link"], $entry->getAttribute("src")));
            }
        }

        $article["content"] = $r->articleContent->innerHTML;
    }

    return $article;
}
// check site config for single page URL - fetch it if found if ($single_page_response = getSinglePage($item, $html, $effective_url)) { $html = $single_page_response['body']; // remove strange things $html = str_replace('</[>', '', $html); $html = convert_to_utf8($html, $single_page_response['headers']); $effective_url = $single_page_response['effective_url']; unset($single_page_response); } $extract_result = $extractor->process($html, $effective_url); $readability = $extractor->readability; $content_block = $extract_result ? $extractor->getContent() : null; //echo "content_block: [" . $content_block->innerHTML . "] \n\n"; $title = $extract_result ? $extractor->getTitle() : ''; } else { $readability = new Readability($html, $effective_url); // content block is entire document (for now...) $content_block = $readability->dom; //echo $content_block->innerHTML; //TODO: get title $title = ''; } //echo "[" . $content_block . "]" ; } // use extracted title for both feed and item title if we're using single-item dummy feed if ($isDummyFeed) { $output->setTitle($title); $newitem->setTitle($title); } if ($do_content_extraction) { if ($extract_pattern && isset($content_block)) {
echo htmlspecialchars($pageTitle); ?> " id="pageTitle" /> <input type="hidden" name="key" value="<?php echo htmlspecialchars($_SESSION['secureKey']); ?> " id="key" /> </form> <?php } else { if ($page == "complete") { ?> <div id="complete"> <p> A link to this page has been sent to <?php echo Readability::kindleAsLinks($to); ?> </p> <p> Thanks for using Readability. </p> <div> <img src="http://localhost/readability/images/footer-thanks.png" alt="Readability" /> </div> </div> <?php } } ?> </div> </body>
/**
 * Imports the items of a remote feed as blog entries.
 *
 * For every feed item that is not yet recorded in the FeedHistory table a
 * history row is written first, then a blog row is built from the item
 * (optionally replaced by the Readability extraction of the linked page)
 * and stored.
 *
 * @param object $feedObj  Feed record; reads id, url, params, author and the item_* settings.
 * @param int    $maxItems Maximum number of items to store; when falsy the
 *                         "feedamount" feed parameter is used, and a falsy
 *                         result means "no limit".
 * @return int Number of blog entries stored.
 */
function import($feedObj, $maxItems = 0)
{
    jimport('simplepie.simplepie');

    $config = EasyBlogHelper::getConfig();
    $itemMigrated = 0;
    $isDomSupported = false;
    $defaultAllowedHTML = '<img>,<a>,<br>,<table>,<tbody>,<th>,<tr>,<td>,<div>,<span>,<p>,<h1>,<h2>,<h3>,<h4>,<h5>,<h6>';

    // Readability is only loaded (and full-text fetching only attempted)
    // when the DOM extension is present.
    if (class_exists('DomDocument')) {
        $isDomSupported = true;
        require_once EBLOG_CLASSES . DIRECTORY_SEPARATOR . 'readability' . DIRECTORY_SEPARATOR . 'Readability.php';
    }

    $params = EasyBlogHelper::getRegistry($feedObj->params);
    $maxItems = $maxItems ? $maxItems : $params->get('feedamount', 0);
    $feedURL = $feedObj->url;

    require_once EBLOG_HELPERS . DIRECTORY_SEPARATOR . 'connectors.php';

    $connector = new EasyBlogConnectorsHelper();
    $connector->addUrl($feedURL);
    $connector->execute();

    // Make sure nothing precedes the XML declaration, and add one when the
    // feed came without it.
    $content = $this->_dropTextBefore($connector->getResult($feedURL), '<?xml version');

    if (strpos($content, '<?xml version') === false) {
        $content = '<?xml version="1.0" encoding="utf-8"?>' . $content;
    }

    $parser = new SimplePie();
    $parser->strip_htmltags(false);
    $parser->set_raw_data($content);
    $parser->init();

    $items = $parser->get_items();

    if (empty($items)) {
        return $itemMigrated;
    }

    $myCnt = 0;

    foreach ($items as $item) {
        // Best effort: give every item its own time budget.
        @ini_set('max_execution_time', 180);

        if (!empty($maxItems) && $myCnt == $maxItems) {
            break;
        }

        // Item timestamp minus the offset reported by get_date('Z').
        $timezoneSec = $item->get_date('Z');
        $itemdate = $item->get_date('U');
        $itemdate = $itemdate - $timezoneSec;
        $mydate = date('Y-m-d H:i:s', $itemdate);

        $feedUid = $item->get_id();
        $feedPath = $item->get_link();

        $feedHistory = EasyBlogHelper::getTable('FeedHistory');

        if ($feedHistory->isExists($feedObj->id, $feedUid)) {
            continue;
        }

        // Log the feed item up front so it is not processed again on a later run.
        $date = EasyBlogHelper::getDate();
        $newHistory = EasyBlogHelper::getTable('FeedHistory');
        $newHistory->feed_id = $feedObj->id;
        $newHistory->uid = $feedUid;
        $newHistory->created = $date->toMySQL();
        $newHistory->store();
        $newHistoryId = $newHistory->id;

        // Defaults taken from the feed configuration.
        $blogObj = new stdClass();
        $blogObj->category_id = $feedObj->item_category;
        $blogObj->published = $feedObj->item_published;
        $blogObj->frontpage = $feedObj->item_frontpage;
        $blogObj->created_by = $feedObj->item_creator;
        $blogObj->allowcomment = $config->get('main_comment', 1);
        $blogObj->subscription = $config->get('main_subscription', 1);
        $blogObj->issitewide = '1';

        $text = $item->get_content();

        // @rule: Append copyright text
        $blogObj->copyrights = $params->get('copyrights', '');

        if ($feedObj->item_get_fulltext && $isDomSupported) {
            $feedItemUrl = urldecode($item->get_link());

            $fiConnector = new EasyBlogConnectorsHelper();
            $fiConnector->addUrl($feedItemUrl);
            $fiConnector->execute();

            // Make sure nothing precedes the opening <html tag.
            $fiContent = $this->_dropTextBefore($fiConnector->getResult($feedItemUrl), '<html');

            if (!empty($fiContent)) {
                $fiContent = EasyBlogHelper::getHelper('string')->forceUTF8($fiContent);

                $readability = new Readability($fiContent);
                $readability->debug = false;
                $readability->convertLinksToFootnotes = false;

                if ($readability->init()) {
                    $fullText = $readability->getContent()->innerHTML;
                    $fullText = EasyBlogFeedsHelper::tidyContent($fullText);

                    // Keep the feed-supplied text when the extraction still
                    // contains a doctype, i.e. looks like a whole page.
                    if (stristr(html_entity_decode($fullText), '<!DOCTYPE html') === false) {
                        $text = $this->_processRelLinktoAbs($fullText, $feedPath);
                    }
                }
            }
        }

        // Strip un-allowed HTML tags.
        $text = strip_tags($text, $params->get('allowed', $defaultAllowedHTML));

        // Append original source link into article if necessary
        if ($params->get('sourceLinks')) {
            JFactory::getLanguage()->load('com_easyblog', JPATH_ROOT);
            $text .= '<div><a href="' . $item->get_link() . '" target="_blank">' . JText::_('COM_EASYBLOG_FEEDS_ORIGINAL_LINK') . '</a></div>';
        }

        // Append an author credit to the text: the author name when there is
        // one, otherwise the second space-separated token of the e-mail
        // field with any parentheses removed.
        if ($feedObj->author) {
            $feedAuthor = $item->get_author();

            if (!empty($feedAuthor)) {
                $authorName = $feedAuthor->get_name();
                $authorEmail = $feedAuthor->get_email();

                if (!empty($authorName)) {
                    $text .= '<div>' . JText::sprintf('COM_EASYBLOG_FEEDS_ORIGINAL_AUTHOR', $authorName) . '</div>';
                } elseif (!empty($authorEmail)) {
                    $authorArr = explode(' ', $authorEmail);

                    if (isset($authorArr[1])) {
                        $authorName = str_replace(array('(', ')'), '', $authorArr[1]);
                        $text .= '<div>' . JText::sprintf('COM_EASYBLOG_FEEDS_ORIGINAL_AUTHOR', $authorName) . '</div>';
                    }
                }
            }
        }

        if ($feedObj->item_content == 'intro') {
            $blogObj->intro = $text;
        } else {
            $blogObj->content = $text;
        }

        $blogObj->created = $mydate;
        $blogObj->modified = $mydate;
        $blogObj->title = $item->get_title();

        if (empty($blogObj->title)) {
            $blogObj->title = $this->_getTitleFromLink($item->get_link());
        }

        $blogObj->title = EasyBlogStringHelper::unhtmlentities($blogObj->title);
        $blogObj->permalink = EasyBlogHelper::getPermalink($blogObj->title);
        $blogObj->publish_up = $mydate;
        $blogObj->isnew = !$feedObj->item_published;

        $blog = EasyBlogHelper::getTable('blog');
        $blog->bind($blogObj);

        if ($feedObj->item_published) {
            $blog->notify();
        }

        if (!$blog->store()) {
            continue;
        }

        $myCnt++;

        // Update the history row with the id of the stored blog entry.
        if (!empty($newHistoryId)) {
            $tmpHistory = EasyBlogHelper::getTable('FeedHistory');
            $tmpHistory->load($newHistoryId);
            $tmpHistory->post_id = $blog->id;
            $tmpHistory->store();
        }

        $itemMigrated++;

        if ($feedObj->item_published) {
            // Insert activity here.
            EasyBlogHelper::addJomSocialActivityBlog($blog, true, true);

            // Determines if admin wants to auto post this item to the social sites.
            if ($params->get('autopost')) {
                $allowed = array(EBLOG_OAUTH_LINKEDIN, EBLOG_OAUTH_FACEBOOK, EBLOG_OAUTH_TWITTER);

                // @rule: Process centralized options first
                // See if there are any global postings enabled.
                $blog->autopost($allowed, $allowed);
            }
        }
    }

    return $itemMigrated;
}

/**
 * Removes everything that precedes the first case-insensitive occurrence of
 * $marker and normalises that occurrence to the exact spelling of $marker.
 *
 * Gives the same result as
 * preg_replace('/(.*?)' . preg_quote($marker) . '/is', $marker, $content, 1)
 * but cannot fail on large input: preg_replace() returns null once a PCRE
 * limit is hit, which would silently discard the whole document.
 *
 * @param mixed  $content Raw downloaded data; cast to string.
 * @param string $marker  ASCII marker to look for.
 * @return string $content unchanged (as a string) when the marker is absent.
 */
private function _dropTextBefore($content, $marker)
{
    $content = (string) $content;
    $position = stripos($content, $marker);

    if ($position === false) {
        return $content;
    }

    return $marker . substr($content, $position + strlen($marker));
}
<?php
// Fetches the document named by the "src" query parameter, runs it through
// Readability and prints the extracted title and content.

require 'php-readability/lib/Readability.inc.php';

$src = (isset($_GET['src']) && is_string($_GET['src'])) ? $_GET['src'] : '';

// "src" is client-controlled: without this check file_get_contents() would
// happily read local files or arbitrary stream wrappers (php://, file://, ...).
// NOTE(review): http(s) URLs pointing at internal hosts are still reachable;
// add a host allow-list if this script is exposed publicly.
if (!preg_match('!^https?://!i', $src)) {
    http_response_code(400);
    exit('Invalid or missing "src" parameter: an http(s) URL is required.');
}

$html = file_get_contents($src);

if ($html === false) {
    http_response_code(502);
    exit('Unable to fetch the requested document.');
}

$r = new Readability($html);
$rData = $r->getContent();

// The title comes from the remote page, so escape it; double_encode is off
// so that entities already present in the title are not encoded twice.
echo "<h1>" . htmlspecialchars((string) $rData['title'], ENT_QUOTES, 'UTF-8', false) . "</h1>";

// The content is emitted as HTML on purpose.
// NOTE(review): it is remote markup - sanitise it if the output is served
// from an origin that holds user sessions.
echo $rData['content'];
/**
 * Pulls the home timeline of one user and appends every tweet that contains
 * a link to that user's "links" array.
 *
 * The user document is looked up by `_id`. When the API reports the OAuth
 * authentication error the user document is removed. Otherwise `last_tweet`
 * is set to the id of the first returned tweet, and the returned tweets are
 * walked from the last element to the first. Once the running link count
 * exceeds 50, one entry is popped from the front of `links` per added link.
 *
 * Diagnostic output is echoed when the "debug" GET parameter is set.
 *
 * @param string $_id Identifier passed to the MongoId constructor.
 * @return void
 */
public function parse_tweets($_id)
{
    $mongo_id = new MongoId($_id);
    $user = $this->db->users->findOne(array('_id' => $mongo_id));
    $debug = $this->input->get('debug');

    // Unknown id: there is nothing to update.
    if (!$user) {
        return;
    }

    if ($debug) {
        echo 'Updating ' . $user['name'] . "\n";
        var_dump($user);
    }

    $this->load->library('twitter');
    $this->twitter->oauth('', '', $user['oauth_token'], $user['oauth_token_secret']);

    // Only ask for tweets newer than the last one seen when a usable,
    // non-zero numeric id has been stored.
    $criteria = array('count' => 50);

    if (isset($user['last_tweet']) && is_numeric($user['last_tweet']) && $user['last_tweet'] != '0') {
        $criteria['since_id'] = $user['last_tweet'];
    }

    $data = $this->twitter->call('statuses/home_timeline', $criteria);

    if ($debug) {
        var_dump($data);
    }

    if (!is_array($data)) {
        return;
    }

    if (array_key_exists('error', $data)) {
        if ($data['error'] == 'Could not authenticate with OAuth.') {
            $this->db->users->remove(array('_id' => $mongo_id), array('justOne' => true));

            if ($debug) {
                echo 'Deleting user' . "\n";
            }
        }

        if ($debug) {
            echo "ERROR: " . $data['error'] . "\n";
        }

        return;
    }

    $current_link_count = isset($user['links']) ? count($user['links']) : 0;

    if (count($data) >= 1) {
        $this->db->users->update(
            array('_id' => $mongo_id),
            array('$set' => array('last_tweet' => (string) $data[0]['id']))
        );
    }

    // Walk from the last element down to index 0 inclusive. The previous
    // "$x > 0" bound skipped $data[0] even though last_tweet had just been
    // advanced to its id, so that tweet was never stored.
    for ($x = count($data) - 1; $x >= 0; $x--) {
        $tweet = $data[$x];

        if (!preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $tweet['text'], $matches)) {
            continue;
        }

        $doc = $tweet;
        $doc['link'] = $matches[0];
        $doc['owner_id'] = $user['id'];
        $doc['tweet'] = $tweet['text'];
        $doc['created_at'] = $tweet['created_at'];
        $doc['tweet_id'] = (string) $tweet['id'];
        $doc['from_user'] = $tweet['user'];

        // Full-article extraction is switched off by the hard-coded "false".
        if (false && isset($user['full_article']) && $user['full_article']) {
            require_once 'lib/Readability.php';

            $url = $matches[0];
            $html = @file_get_contents($url);

            if ($html) {
                $readability = new Readability($html, $url);

                if ($readability->init()) {
                    $doc['article_title'] = trim($readability->getTitle()->textContent);
                    $doc['article_body'] = trim($readability->getContent()->innerHTML);
                }
            }
        }

        $current_link_count++;
        $this->db->users->update(
            array('_id' => $user['_id']),
            array('$push' => array('links' => $doc))
        );

        // $pop with -1 removes the first element of the array.
        if ($current_link_count > 50) {
            $this->db->users->update(
                array('_id' => $user['_id']),
                array('$pop' => array('links' => -1))
            );
        }
    }
}
// (change this URL to whatever you'd like to test) //$url = 'http://alkhaleejonline.net/articles/1434390719340805900'; $html = file_get_contents($url); // PHP Readability works with UTF-8 encoded content. // If $html is not UTF-8 encoded, use iconv() or // mb_convert_encoding() to convert to UTF-8. // If we've got Tidy, let's clean up input. // This step is highly recommended - PHP's default HTML parser // often does a terrible job and results in strange output. if (function_exists('tidy_parse_string')) { $tidy = tidy_parse_string($html, array(), 'UTF8'); $tidy->cleanRepair(); $html = $tidy->value; } // give it to Readability $readability = new Readability($html, $url); // print debug output? // useful to compare against Arc90's original JS version - // simply click the bookmarklet with FireBug's // console window open $readability->debug = false; // convert links to footnotes? $readability->convertLinksToFootnotes = true; // process it $result = $readability->init(); // does it look like we found what we wanted? if ($result) { //echo "== Title ===============================\n"; echo $readability->getTitle()->textContent, "\n\n"; exit; // echo "== Body ===============================\n";
/**
 * Builds the instance by handing the first element of $args to the parent
 * constructor.
 *
 * @param array $args Argument list; only index 0 is read.
 */
public function __construct($args)
{
    $firstArgument = $args[0];

    parent::__construct($firstArgument);
}
echo $pageTitle; ?> " id="pageTitle" /> <input type="hidden" name="key" value="<?php echo $_SESSION['secureKey']; ?> " id="key" /> </form> <?php } else { if ($page == "complete") { ?> <div id="complete"> <p> A link to this page has been sent to <?php echo Readability::emailAsLinks($to); ?> </p> <p> Thanks for using Readability. </p> <div> <img src="http://lab.arc90.com/experiments/readability/images/footer-thanks.png" alt="Readability" /> </div> </div> <?php } } ?> </div> </body>