Esempio n. 1
 /**
  * Reduce the stored page body to a cleaned-up, readable HTML fragment.
  *
  * Pipeline: Readability extraction -> strip_tags() -> HTMLPurifier -> HTML Tidy.
  * The result is cached on the instance ($this->_readable) so later callers on the
  * same instance do not recompute it.
  *
  * @param string $allowed_tags    Tag whitelist handed to strip_tags().
  * @param array  $allowed_classes Values for HTMLPurifier's Attr.AllowedClasses directive.
  * @return string|false The cleaned markup (whatever tidy_repair_string() returned).
  */
 function readable($allowed_tags = '<p><a><h1><h2><h3>', $allowed_classes = [])
 {
     // If this function has run before on this instance then return the saved text.
     if ($this->_readable) {
         return $this->_readable;
     }

     // BuzzFeed's templates embed a JavaScript string concatenation that trips up
     // DOMDocument; dropping that fragment is the only purpose of this replacement.
     $html = str_replace("+ '</div>'", '', $this->body);

     // Step 1. Readability
     $r = new Readability($html);
     $html = $r->getContent()->innerHTML;

     // Step 2. strip_tags
     $html = strip_tags($html, $allowed_tags);

     // Step 3. HTMLPurifier
     // Use the dotted "Namespace.Directive" key: the three-argument
     // set('Attr', 'AllowedClasses', ...) form is deprecated and raises E_USER_NOTICE.
     $config = HTMLPurifier_Config::createDefault();
     $config->set('Attr.AllowedClasses', $allowed_classes);
     $html = (new HTMLPurifier($config))->purify($html);

     // Step 4. HTML Tidy
     $html = tidy_repair_string($html, ['bare' => true, 'show-body-only' => true, 'wrap' => 0], 'UTF8');

     // Save the result on the instance before returning so that other methods
     // (e.g. $this->terms()) can reuse it without recomputing.
     $this->_readable = $html;

     return $this->_readable;
 }
Esempio n. 2
 /**
  * Download a page and reduce it to its main article using Readability.
  *
  * @param string $url Address of the page to fetch.
  * @return array|false|null array('title' => ..., 'content' => ...) on success,
  *                          false when no Readability class can be loaded,
  *                          null when Readability's init() reports failure.
  */
 function grabContent($url)
 {
     global $prefs;

     $lib = TikiLib::lib('tiki');
     $httpClient = $lib->get_http_client($url);
     $httpResponse = $lib->http_perform_request($httpClient);

     // Continue with the address the client ended up at after redirections.
     $url = (string) $httpClient->getUri();

     // Note: PHP Readability expects UTF-8 encoded content; convert beforehand
     // (iconv() or mb_convert_encoding()) if the page uses another encoding.
     // The input is tidied first because PHP's default HTML parser often
     // produces strange output on messy markup.
     $markup = $this->tidy($httpResponse->getBody());

     // Load the Readability library from the configured path, if that file exists.
     $libraryFile = $prefs['page_content_fetch_readability'];
     if (is_file($libraryFile)) {
         require_once $libraryFile;
     }
     if (!class_exists('Readability')) {
         return false;
     }

     $extractor = new Readability($markup, $url);
     if (!$extractor->init()) {
         return;
     }

     $body = $this->tidy($extractor->getContent()->innerHTML);
     $body = $this->replacePaths($body, $url);

     return array('title' => $extractor->getTitle()->textContent, 'content' => $body);
 }
Esempio n. 3
 /**
  * Fetch a page, extract its article with Readability and store the result.
  *
  * Fills $this->metadata["title"] / ["author"] when they are not already set,
  * and sets $this->images and $this->text from the extracted article.
  * Terminates the script (die) if DOMDocument cannot load the article markup.
  *
  * @param string $url Page address; "http://" is prepended when no http(s) scheme is present.
  */
 public function __construct($url)
 {
     $url = preg_match('!^https?://!i', $url) ? $url : 'http://' . $url;

     $raw = Http::Request($url);
     // Normalise to UTF-8, detecting among UTF-8, ISO-8859-1 and ASCII.
     $markup = mb_convert_encoding($raw, "UTF-8", "UTF-8,ISO-8859-1,ASCII");
     $r = new Readability($markup, $url);

     if (!isset($this->metadata["title"])) {
         $rawTitle = strip_tags($r->getTitle()->innerHTML);
         $this->metadata["title"] = CharacterEntities::convert($rawTitle);
     }
     if (!isset($this->metadata["author"])) {
         // Fall back to the host name as the author.
         $parts = parse_url($url);
         $this->metadata["author"] = $parts["host"];
     }

     // Wrap the extracted fragment in a full document that declares UTF-8;
     // a <body> wrapper is only added when the fragment does not bring its own.
     $article = $r->getContent()->innerHTML;
     $head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
     if (strncmp($article, "<body", 5) === 0) {
         $article = $head . $article . "</html>";
     } else {
         $article = $head . "<body>" . $article . "</body></html>";
     }

     $doc = new DOMDocument();
     if (!@$doc->loadHTML($article)) {
         die($article);
     }

     $this->images = $this->handleImages($doc, $url);
     $this->text = $doc->saveHTML();
 }
Esempio n. 4
 // Fetches $url, cleans it with tidyClean(), runs it through Readability and stores
 // the extracted title and content on the instance ($this->title, $this->content).
 private function readFromServer($url)
     // check tidy function available
     if (function_exists('tidy_parse_string')) {
         $this->tidyAvailable = true;
     // Keep "scheme://host" of the requested URL as the base URL.
     $urlArray = parse_url($url);
     $this->baseUrl = $urlArray['scheme'] . "://" . $urlArray['host'];
     $html = $this->fetchContent($url);
     $html = $this->tidyClean($html);
     // NOTE(review): no Readability::init() call is visible in this excerpt before the
     // title/content are read - confirm against the full source.
     $readability = new Readability($html, $url);
     $this->title = $readability->getTitle()->textContent;
     $content = $readability->getContent()->innerHTML;
      * still need one more tidy clean, otherwise domdocument will not work properly
     $content = $this->tidyClean($content);
      * use domdocument to fix relative urls
     $this->content = $this->fixRelativeUrls($content);
     // NOTE(review): $article is assigned but not returned in the visible code - a
     // trailing return may be missing from this excerpt.
     $article = array('title' => $this->title, 'content' => $this->content);
Esempio n. 5
 /**
  * Load a cached item and, when Readability can extract the linked page,
  * replace the item's description with the extracted article markup.
  *
  * @param mixed $itemId Identifier passed to $this->cache->getItem().
  * @return mixed The item object (with or without an updated description).
  */
 public function downloadItem($itemId)
 {
     $item = $this->cache->getItem($itemId);

     $reader = new Readability(file_get_contents($item->link), $item->link);
     $extracted = $reader->init();
     if ($extracted) {
         $item->description = $reader->articleContent->innerHTML;
         // save the downloaded content to cache, just in case
     }

     return $item;
 }
 /**
  * Turn downloaded page markup into a title/content pair using Readability.
  *
  * The page URL is taken from the response's Location header when present,
  * otherwise from the "readerUrl" option; $this->basePath and $this->baseUrl
  * are derived from it.
  *
  * @param string $html Raw page markup.
  * @return array|null array('title' => ..., 'content' => ...), or null when $html
  *                    is empty or Readability's init() reports failure.
  */
 public function parseData($html)
 {
     if (strlen($html) == 0) {
         return null;
     }

     // Remember whether the tidy extension is available.
     if (function_exists('tidy_parse_string')) {
         $this->tidyAvailable = true;
     }

     $responseHeaders = $this->response->getHeaders();
     $url = isset($responseHeaders['Location'])
         ? $responseHeaders["Location"]
         : $this->getOption("readerUrl");

     $parts = parse_url($url);
     $this->basePath = isset($parts['path']) ? dirname($parts['path']) : "";
     $this->baseUrl = $parts['scheme'] . "://" . $parts['host'];

     $extractor = new Readability($this->tidyClean($html), $url);
     if (!$extractor->init()) {
         return null;
     }

     $title = $extractor->getTitle()->textContent;

     // One more tidy pass is still needed, otherwise DOMDocument will not work properly.
     $content = $this->tidyClean($extractor->getContent()->innerHTML);
     $content = $this->removeEmptyTags($content);

     // Use DOMDocument to fix relative URLs.
     $content = $this->fixRelativeUrls($content);

     // With tidy support, look up the charset via the document head and
     // convert the content to UTF-8 when it reports something else.
     if ($this->tidyAvailable) {
         $head = tidy_parse_string($content, array(), 'utf8')->head();
         $charset = $this->findCharset($head->value);
         if (!empty($charset) && $charset != "utf-8") {
             $content = mb_convert_encoding($content, "utf-8", $charset);
         }
     }

     return array('title' => $title, 'content' => $content);
 }
Esempio n. 7
 // Scrapes every listing not yet marked as scraped and writes street, description and
 // coordinates back through the prepared UPDATE statement.
 // The excerpt contains two variants: a batched one (Guzzle::sendAll with a 'complete'
 // callback) followed by a sequential foreach over the same result set.
 function perform()
     $q = DB::query('SELECT link, neighborhood FROM listings WHERE scraped != TRUE', PDO::FETCH_ASSOC);
     $ps = DB::prepare('UPDATE listings SET scraped=TRUE, street=:street, description=:description, lat=:lat, lng=:lng WHERE link=:link');
     // Batched variant: build one GET request per listing and handle each response in the callback.
     // NOTE(review): the URL prefix concatenated with the link is an empty string in this excerpt.
     Guzzle::sendAll(array_map(function ($listing) {
       return Guzzle::createRequest('GET', '' . $listing['link']);
     }, iterator_to_array($q)), ['complete' => function ($event) use($ps) {
       try {        
         $body = $event->getResponse()->getBody();
         $crawler = new Crawler($body);
         $readability = new Readability($body);
         $street = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
           // NOTE(review): the call that should wrap these named parameters (presumably
           // $ps->execute([...])) is not present in this excerpt - confirm against the full source.
           ':link' => parse_url($event->getRequest()->getUrl())['path'],
           ':lat'  => null,
           ':lng'  => null,
           ':street' => $street->count() ? $street->text() : null,
           ':description' => $readability->init() ? trim(strip_tags(tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8'))) : null    
       } catch (Exception $e) {
         Logger::error($e->getMessage(), $ps->errorinfo());
     // Sequential variant: fetch each listing, look up coordinates, then run the UPDATE.
     foreach ($q as $listing) {
         try {
             $body = Guzzle::get('' . $listing['link'])->getBody();
             $crawler = new Crawler($body);
             $readability = new Readability($body);
             $street = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
             // Look up the street text when found, otherwise the listing's neighborhood.
             // The response is read as JSON and results[0].geometry.location supplies lat/lng.
             $url = '' . ($street->count() ? $street->text() : $listing['neighborhood']);
             $json = json_decode(Guzzle::get($url)->getBody(), true);
             $loc = isset($json['results'][0]) ? $json['results'][0]['geometry']['location'] : null;
             // Description is the tidy-parsed, tag-stripped Readability content, or null when init() fails.
             $ps->execute([':link' => $listing['link'], ':lat' => isset($loc['lat']) ? $loc['lat'] : null, ':lng' => isset($loc['lng']) ? $loc['lng'] : null, ':street' => $street->count() ? $street->text() : null, ':description' => $readability->init() ? trim(strip_tags(tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8'))) : null]);
         } catch (Exception $e) {
             Logger::error($e->getMessage(), $ps->errorinfo());
Esempio n. 8
 // Parses a feed, updates the stored feed record identified by $id, then stores one
 // post per feed item, preferring the full article extracted by Readability over the
 // content shipped in the feed.
 // NOTE(review): the $url parameter is never read before the loop overwrites it, and no
 // feed download call is visible - a statement may be missing from this excerpt.
 protected function absorb(OutputInterface $output, $id, $url)
     $config = new Config();
     $config->setClientUserAgent('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:11.0) Gecko/20100101 Firefox/11.0');
     $reader = new Reader($config);
     $parser = $reader->getParser();
     if ($parser === false) {
         return $this->writeErrors($output);
     $feed = $parser->execute();
     if ($feed === false) {
         return $this->writeErrors($output);
     // Refresh the feed's own metadata.
     $data = ['lang' => $feed->getLanguage(), 'title' => $feed->getTitle(), 'lastUpdate' => $this->formatDateForMySQL($feed->getDate())];
     $this->feedRepository->updateByPk($data, $id);
     foreach ($feed->items as $item) {
         $url = $item->getUrl();
         $output->writeln('+ ' . $item->title);
         // Download the linked page and tidy it before handing it to Readability.
         $fullContent = file_get_contents($url);
         $tidy = tidy_parse_string($fullContent, array(), 'UTF8');
         $html = $tidy->value;
         $readability = new \Readability($html, $url);
         $result = $readability->init();
         if ($result) {
             $content = $readability->getContent()->innerHTML;
             $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
             $content = $tidy->value;
         } else {
             // Extraction failed: fall back to the content carried by the feed item.
             $output->writeln('unable to get full content');
             $content = $item->getContent();
         $data = ['feedId' => $id, 'remoteId' => $item->getId(), 'title' => $item->getTitle(), 'url' => $url, 'pubDate' => $this->formatDateForMySQL($item->getDate()), 'content' => $content, 'author' => $item->getAuthor()];
         $this->postRepository->add($data, true);
Esempio n. 9
 // Loads the user with the given Mongo id, pulls their Twitter home timeline and pushes
 // every tweet containing a URL into the user's "links" array.
 // NOTE(review): two of the debug-only `if` statements below have no visible body in this
 // excerpt; statements may have been lost.
 public function parse_tweets($_id)
     $mongo_id = new MongoId($_id);
     $user = $this->db->users->findOne(array('_id' => $mongo_id));
     if ($this->input->get('debug')) {
         echo 'Updating ' . $user['name'] . "\n";
     if ($this->input->get('debug')) {
     $auth = $this->twitter->oauth('', '', $user['oauth_token'], $user['oauth_token_secret']);
     // Without a usable numeric last_tweet id, fetch the latest 50; otherwise only newer tweets.
     if (!array_key_exists('last_tweet', $user) || !isset($user['last_tweet']) || !strlen($user['last_tweet']) || $user['last_tweet'] == '0' || !is_numeric($user['last_tweet'])) {
         $criteria = array('count' => 50);
     } else {
         $criteria = array('count' => 50, 'since_id' => $user['last_tweet']);
     $data = $this->twitter->call('statuses/home_timeline', $criteria);
     if ($this->input->get('debug')) {
     // Link count as loaded at the start; it is not updated while links are pushed below.
     if (array_key_exists('links', $user)) {
         $current_link_count = count($user['links']);
     } else {
         $current_link_count = 0;
     if (is_array($data)) {
         if (array_key_exists('error', $data)) {
             // An OAuth authentication failure removes the user document.
             if ($data['error'] == 'Could not authenticate with OAuth.') {
                 $this->db->users->remove(array('_id' => $mongo_id), array('justOne' => true));
                 if ($this->input->get('debug')) {
                     echo 'Deleting user' . "\n";
             if ($this->input->get('debug')) {
                 echo "ERROR: " . $data['error'] . "\n";
         // Remember the id of the first returned tweet as the new high-water mark.
         if (count($data) >= 1) {
             $this->db->users->update(array('_id' => $mongo_id), array('$set' => array('last_tweet' => (string) $data[0]['id'])));
         // Walk the results from the last index down.
         // NOTE(review): the condition `$x > 0` never visits $data[0], the same tweet whose id
         // was just stored as last_tweet - looks like an off-by-one; confirm intent.
         for ($x = count($data) - 1; $x > 0; $x--) {
             $tweet = $data[$x];
             if (preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $tweet['text'], $matches)) {
                 $doc = $tweet;
                 $doc['link'] = $matches[0];
                 $doc['owner_id'] = $user['id'];
                 $doc['tweet'] = $tweet['text'];
                 $doc['created_at'] = $tweet['created_at'];
                 $doc['tweet_id'] = (string) $tweet['id'];
                 $doc['from_user'] = $tweet['user'];
                 // The leading `false &&` disables this full-article branch entirely.
                 if (false && isset($user['full_article']) && $user['full_article']) {
                     require_once 'lib/Readability.php';
                     $url = $matches[0];
                     $html = @file_get_contents($url);
                     if ($html) {
                         $readability = new Readability($html, $url);
                         $result = $readability->init();
                         if ($result) {
                             $doc['article_title'] = trim($readability->getTitle()->textContent);
                             $doc['article_body'] = trim($readability->getContent()->innerHTML);
                 $this->db->users->update(array('_id' => $user['_id']), array('$push' => array('links' => $doc)));
                 // When the user already had more than 50 links, $pop with -1 drops one
                 // from the front of the array for each link pushed.
                 if ($current_link_count > 50) {
                     $this->db->users->update(array('_id' => $user['_id']), array('$pop' => array('links' => -1)));
         //echo "  Done\n";
     } else {
         //echo '  Failed'."\n";
     $data = null;
     $user = null;
Esempio n. 10
 /**
  * Extract title, content, search text, thumbnail and date from a news page.
  *
  * Content comes from the source-specific detector (tryContentDetect) when it
  * finds something, otherwise from Readability.
  *
  * @param string       $html   Raw page markup.
  * @param string       $url    Page address, passed to Readability.
  * @param Sources|null $source Optional source; stored on the instance when given.
  * @return array Keys title, content, searchContent, thumb and date on success;
  *               an 'error' key with the exception message on failure; empty when
  *               the extracted content contains no text after stripping tags.
  */
 public function parse($html, $url, Sources $source = null)
 {
     if ($source) {
         $this->source = $source;
     }

     $parsedNews = array();
     try {
         $html = $this->stripTagWithContent($html, "script");
         $htmlToDetect = $this->processExcludeElements($html);
         $content = $this->tryContentDetect($htmlToDetect);

         $readability = new \Readability($html, $url);
         $readability->debug = false;
         $readability->convertLinksToFootnotes = false;
         $result = $readability->init();

         if (!$result && !$content) {
             throw new Exception('Looks like we couldn\'t find the content. :(');
         }

         $title = $readability->getTitle()->textContent;
         $title = $this->processTitleStopWords($title);

         // Only fall back to Readability's content when the detector found nothing.
         if (!$content) {
             $content = $readability->getContent()->innerHTML;
         }
         $content = $this->processContentStopWords($content);
         $content = preg_replace('/\\n/', ' ', $content);
         $content = strip_tags($content, "<p><div><img><span><br><ul><li><embed><iframe>");
         $content = $this->fixUrls($content);
         $content = $this->processExcludeElements($content);
         $date = $this->processPublishDate($html);

         if ($searchContent = trim(strip_tags($content))) {
             // Keep only Cyrillic/Latin letters and spaces, then collapse whitespace runs.
             $searchContent = preg_replace('/\\n/', ' ', $searchContent);
             $searchContent = preg_replace("/[^а-яa-z ]/ui", "", $searchContent);
             $searchContent = preg_replace('/\\s+/', ' ', $searchContent);
             // Encode every non-ASCII character as a decimal numeric entity.
             // mb_convert_encoding(..., 'HTML-ENTITIES', ...) is deprecated since PHP 8.2;
             // for the letters-and-spaces text left at this point this call yields the same output.
             $searchContent = mb_encode_numericentity($searchContent, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");

             $parsedNews['title'] = $title;
             $parsedNews['content'] = $content;
             $parsedNews['searchContent'] = $searchContent;
             $parsedNews['thumb'] = $this->detectThumb($html, $content);
             $parsedNews['date'] = $date;
         }
     } catch (Exception $e) {
         $parsedNews['error'] = $e->getMessage();
     }

     return $parsedNews;
 }
 // Fragment: picks the DOM node ($content_block) that holds the article, either through
 // Readability's automatic detection or by treating the whole document as the block,
 // then optionally narrows it with a custom XPath pattern.
 // Relies on $html, $response, $auto_extract, $tidy_config, $effective_url, $links and
 // $extract_pattern being defined before this point (not visible in this excerpt).
 $html = convert_to_utf8($html, $response['headers']);
 if ($auto_extract) {
     // Run through Tidy (if it exists).
     // This fixes problems with some sites which would otherwise
     // trouble DOMDocument's HTML parsing. (Although sometimes it fails
     // to return anything, so it's a bit of tradeoff.)
     if (function_exists('tidy_parse_string')) {
         $tidy = tidy_parse_string($html, $tidy_config, 'UTF8');
         $html = $tidy->value;
     $readability = new Readability($html, $effective_url);
     if ($links == 'footnotes') {
         $readability->convertLinksToFootnotes = true;
     $extract_result = $readability->init();
     // content block is detected element
     $content_block = $readability->getContent();
 } else {
     // No auto-extraction: Readability is constructed without calling init(), and its
     // $dom property is used directly.
     $readability = new Readability($html, $effective_url);
     // content block is entire document (for now...)
     $content_block = $readability->dom;
 if ($extract_pattern) {
     // Evaluate the custom pattern relative to the current content block.
     $xpath = new DOMXPath($readability->dom);
     $elems = @$xpath->query($extract_pattern, $content_block);
     // check if our custom extraction pattern matched
     if ($elems && $elems->length > 0) {
         $extract_result = true;
         // get the first matched element
         $content_block = $elems->item(0);
Esempio n. 12
 // Article filter hook: for feeds listed under the "enabled_feeds" setting, replaces the
 // article content with the Readability extraction of the linked page. Returns the
 // article unchanged whenever a precondition fails.
 function hook_article_filter($article)
     $enabled_feeds = $this->host->get($this, "enabled_feeds");
     $key = array_search($article["feed"]["id"], $enabled_feeds);
     if ($key === FALSE) {
         return $article;
     if (!class_exists("Readability")) {
         require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
     // With curl available, send a header-only request (CURLOPT_NOBODY) first and bail out
     // unless the reported content type contains text/html.
     if (function_exists("curl_init")) {
         $ch = curl_init($article["link"]);
         curl_setopt($ch, CURLOPT_TIMEOUT, 5);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, true);
         curl_setopt($ch, CURLOPT_NOBODY, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
         curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
         @($result = curl_exec($ch));
         $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
         if (strpos($content_type, "text/html") === FALSE) {
             return $article;
     $tmp = fetch_file_contents($article["link"]);
     if ($tmp) {
         $tmpdoc = new DOMDocument("1.0", "UTF-8");
         if (!$tmpdoc->loadHTML($tmp)) {
             return $article;
         // When DOMDocument reports a non-UTF-8 encoding, the markup is re-serialised.
         if ($tmpdoc->encoding != 'UTF-8') {
             $tmpxpath = new DOMXPath($tmpdoc);
             // NOTE(review): this loop over the <meta> elements has no visible body in this
             // excerpt - a statement may be missing.
             foreach ($tmpxpath->query("//meta") as $elem) {
             $tmp = $tmpdoc->saveHTML();
         $r = new Readability($tmp, $article["link"]);
         if ($r->init()) {
             // Rewrite every link and image source via rewrite_relative_url(), using the
             // article link as base.
             $tmpxpath = new DOMXPath($r->dom);
             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
             foreach ($entries as $entry) {
                 if ($entry->hasAttribute("href")) {
                     $entry->setAttribute("href", rewrite_relative_url($article["link"], $entry->getAttribute("href")));
                 if ($entry->hasAttribute("src")) {
                     $entry->setAttribute("src", rewrite_relative_url($article["link"], $entry->getAttribute("src")));
             $article["content"] = $r->articleContent->innerHTML;
     return $article;
Esempio n. 13
 // Fetches $url and returns the article markup extracted by Readability, or false when
 // the page is not HTML, cannot be fetched or parsed, is too large, or extraction fails.
 public function extract_content($url)
     if (!class_exists("Readability")) {
         require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
     // When curl may be used, send a header-only request (CURLOPT_NOBODY) first and give up
     // unless the reported content type contains text/html.
     if (!defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir")) {
         $ch = curl_init($url);
         curl_setopt($ch, CURLOPT_TIMEOUT, 5);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, true);
         curl_setopt($ch, CURLOPT_NOBODY, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
         curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
         @($result = curl_exec($ch));
         $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
         if (strpos($content_type, "text/html") === FALSE) {
             return false;
     $tmp = fetch_file_contents($url);
     // Only documents shorter than 65535 * 4 characters (per mb_strlen) are processed.
     if ($tmp && mb_strlen($tmp) < 65535 * 4) {
         $tmpdoc = new DOMDocument("1.0", "UTF-8");
         if (!$tmpdoc->loadHTML($tmp)) {
             return false;
         // When DOMDocument reports a non-UTF-8 encoding, the markup is re-serialised.
         if (strtolower($tmpdoc->encoding) != 'utf-8') {
             $tmpxpath = new DOMXPath($tmpdoc);
             // NOTE(review): this loop over the <meta> elements has no visible body in this
             // excerpt - a statement may be missing.
             foreach ($tmpxpath->query("//meta") as $elem) {
             $tmp = $tmpdoc->saveHTML();
         $r = new Readability($tmp, $url);
         if ($r->init()) {
             // Rewrite every link and image source via rewrite_relative_url(), using $url as base.
             $tmpxpath = new DOMXPath($r->dom);
             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
             foreach ($entries as $entry) {
                 if ($entry->hasAttribute("href")) {
                     $entry->setAttribute("href", rewrite_relative_url($url, $entry->getAttribute("href")));
                 if ($entry->hasAttribute("src")) {
                     $entry->setAttribute("src", rewrite_relative_url($url, $entry->getAttribute("src")));
             return $r->articleContent->innerHTML;
     return false;
// Example script: download a page, run it through PHP Readability and print the
// extracted title and body.
$url = '';
$html = file_get_contents($url);

// Note: PHP Readability expects UTF-8 encoded content. Convert anything else
// first - both iconv() and mb_convert_encoding() can do this.
$readability = new Readability($html, $url);

// Debug output is useful for comparing against Arc90's original JS version
// (click the bookmarklet with FireBug's console window open); off here.
$readability->debug = false;
// Turn links into footnotes.
$readability->convertLinksToFootnotes = true;

// Process the document.
$result = $readability->init();

if (!$result) {
    // Nothing that looks like article content was found.
    echo 'Looks like we couldn\'t find the content. :(';
} else {
    echo "== Title =====================================\n";
    echo $readability->getTitle()->textContent, "\n\n";
    echo "== Body ======================================\n";

    $content = $readability->getContent()->innerHTML;
    // Clean the output up with Tidy when the extension is available.
    if (function_exists('tidy_parse_string')) {
        $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
        $content = $tidy->value;
    }
    echo $content;
}
Esempio n. 15
 /**
  * Download a page and return its main content as extracted by Readability.
  *
  * The page is fetched with curl when available, falling back to
  * file_get_contents() when curl is missing or the transfer fails. The source is
  * converted to UTF-8 using the first "charset=" it declares (UTF-8 assumed
  * otherwise) and is tidied before and after extraction when the tidy
  * extension is loaded.
  *
  * @param string $request_url Address of the article to fetch.
  * @return string|null Extracted HTML, or null when the page could not be
  *                     downloaded or an exception was raised along the way.
  */
 private function get_full_post($request_url)
 {
     // Defined up front so the failure paths return null instead of reading an
     // undefined variable.
     $content = null;

     try {
         $source = false;

         // curl_exec() reports failure by returning false (it does not throw),
         // so the fallback has to be triggered explicitly.
         if (function_exists('curl_init')) {
             $handle = curl_init();
             curl_setopt_array($handle, array(
                 CURLOPT_USERAGENT => "Tiny Tiny RSS",
                 CURLOPT_FOLLOWLOCATION => true,
                 CURLOPT_HEADER => false,
                 CURLOPT_HTTPGET => true,
                 CURLOPT_RETURNTRANSFER => true,
                 CURLOPT_TIMEOUT => 30,
                 CURLOPT_URL => $request_url,
             ));
             $source = curl_exec($handle);
             curl_close($handle);
         }
         if ($source === false) {
             $source = file_get_contents($request_url);
         }
         if ($source === false) {
             // Nothing was downloaded, so there is nothing to extract.
             return null;
         }

         // fix encoding -> done by itohsnap:
         preg_match("/charset=([\\w|\\-]+);?/", $source, $match);
         $charset = isset($match[1]) ? $match[1] : 'utf-8';
         $source = mb_convert_encoding($source, 'UTF-8', $charset);

         // Clean with tidy, if it exists.
         if (function_exists('tidy_parse_string')) {
             $tidy = tidy_parse_string($source, array(), 'UTF8');
             $source = $tidy->value;
         }

         // Extract the text.
         require_once 'Readability.php';
         $readability = new Readability($source);
         $readability->debug = false;
         $readability->convertLinksToFootnotes = false;
         $readability->init();
         $content = $readability->getContent()->innerHTML;

         // If we've got Tidy, clean the result up for output.
         if (function_exists('tidy_parse_string')) {
             $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
             $content = $tidy->value;
         }
     } catch (Exception $e) {
         // Best effort: when the full text cannot be grabbed, return whatever
         // $content holds at that point (null unless extraction had already succeeded).
     }

     return $content;
 }
Esempio n. 16
 // Article filter hook: tries to inline content via inline_stuff(); when that finds nothing
 // and the article text is short (at most 150 characters after strip_tags), it prepends
 // the Readability extraction of the page behind the article's "[link]" anchor.
 // NOTE(review): the strpos() needles here are empty strings and no statement loading
 // $article["content"] into $doc is visible - parts of this excerpt appear to be missing.
 function hook_article_filter($article)
     if (strpos($article["link"], "") !== FALSE) {
         $doc = new DOMDocument();
         $xpath = new DOMXPath($doc);
         $found = $this->inline_stuff($article, $doc, $xpath);
         if (function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150) {
             if (!class_exists("Readability")) {
                 require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
             // First anchor whose text contains "[link]".
             $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);
             if ($content_link && strpos($content_link->getAttribute("href"), "") === FALSE && strpos($content_link->getAttribute("href"), "") === FALSE && strpos($content_link->getAttribute("href"), "") === FALSE) {
                 /* link may lead to a huge video file or whatever, we need to check content type before trying to
                 			parse it which p much requires curl */
                 $ch = curl_init($content_link->getAttribute("href"));
                 curl_setopt($ch, CURLOPT_TIMEOUT, 5);
                 curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                 curl_setopt($ch, CURLOPT_HEADER, true);
                 curl_setopt($ch, CURLOPT_NOBODY, true);
                 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
                 curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
                 @($result = curl_exec($ch));
                 $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
                 // Only HTML responses are downloaded in full and handed to Readability.
                 if ($content_type && strpos($content_type, "text/html") !== FALSE) {
                     $tmp = fetch_file_contents($content_link->getAttribute("href"));
                     if ($tmp) {
                         $r = new Readability($tmp, $content_link->getAttribute("href"));
                         if ($r->init()) {
                             // Rewrite links and image sources via rewrite_relative_url(),
                             // using the linked page's address as base.
                             $tmpxpath = new DOMXPath($r->dom);
                             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
                             foreach ($entries as $entry) {
                                 if ($entry->hasAttribute("href")) {
                                     $entry->setAttribute("href", rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("href")));
                                 if ($entry->hasAttribute("src")) {
                                     $entry->setAttribute("src", rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("src")));
                             // Extracted article first, then a rule, then the original content.
                             $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                             // prob not a very good idea (breaks wikipedia pages, etc) -
                             // inliner currently is not really fit for any random web content
                             //$doc = new DOMDocument();
                             //$xpath = new DOMXPath($doc);
                             //$found = $this->inline_stuff($article, $doc, $xpath);
         // When inline_stuff() reported a match, the content becomes the serialised <body> of $doc.
         $node = $doc->getElementsByTagName('body')->item(0);
         if ($node && $found) {
             $article["content"] = $doc->saveXML($node);
     return $article;
Esempio n. 17
 // Article filter hook: optionally flags articles whose "[link]" target already appears in
 // older entries (sets force_catchup), tries to inline content via inline_stuff(), and for
 // short articles prepends the Readability extraction of the linked page.
 // NOTE(review): the strpos() needles here are empty strings and no statement loading
 // $article["content"] into $doc is visible - parts of this excerpt appear to be missing.
 function hook_article_filter($article)
     if (strpos($article["link"], "") !== FALSE) {
         $doc = new DOMDocument();
         $xpath = new DOMXPath($doc);
         if ($this->host->get($this, "enable_content_dupcheck")) {
             // First anchor whose text contains "[link]".
             $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);
             if ($content_link) {
                 $content_href = db_escape_string($content_link->getAttribute("href"));
                 $entry_guid = db_escape_string($article["guid_hashed"]);
                 $owner_uid = $article["owner_uid"];
                 // "Entered more than a day ago" clause in the dialect of the configured database.
                 if (DB_TYPE == "pgsql") {
                     $interval_qpart = "date_entered < NOW() - INTERVAL '1 day'";
                 } else {
                     $interval_qpart = "date_entered < DATE_SUB(NOW(), INTERVAL 1 DAY)";
                 // Count this owner's other entries whose content carries the same [link] href.
                 // NOTE(review): the query is built by string interpolation; $owner_uid is not
                 // passed through db_escape_string() - confirm it is always a trusted integer.
                 $result = db_query("SELECT COUNT(id) AS cid\n\t\t\t\t\t\tFROM ttrss_entries, ttrss_user_entries WHERE\n\t\t\t\t\t\t\tref_id = id AND\n\t\t\t\t\t\t\t{$interval_qpart} AND\n\t\t\t\t\t\t\tguid != '{$entry_guid}' AND\n\t\t\t\t\t\t\towner_uid = '{$owner_uid}' AND\n\t\t\t\t\t\t\tcontent LIKE '%href=\"{$content_href}\">[link]%'");
                 if ($result) {
                     $num_found = db_fetch_result($result, 0, "cid");
                     if ($num_found > 0) {
                         $article["force_catchup"] = true;
         $found = $this->inline_stuff($article, $doc, $xpath);
         if (!defined('NO_CURL') && function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150) {
             if (!class_exists("Readability")) {
                 require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
             // NOTE(review): $content_link is only assigned inside the enable_content_dupcheck
             // branch above, so it is undefined here when that setting is off - confirm.
             if ($content_link && strpos($content_link->getAttribute("href"), "") === FALSE && strpos($content_link->getAttribute("href"), "") === FALSE && strpos($content_link->getAttribute("href"), "") === FALSE) {
                 /* link may lead to a huge video file or whatever, we need to check content type before trying to
                 			parse it which p much requires curl */
                 $ch = curl_init($content_link->getAttribute("href"));
                 curl_setopt($ch, CURLOPT_TIMEOUT, 5);
                 curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                 curl_setopt($ch, CURLOPT_HEADER, true);
                 curl_setopt($ch, CURLOPT_NOBODY, true);
                 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir"));
                 curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
                 @($result = curl_exec($ch));
                 $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
                 // Only HTML responses are downloaded in full and handed to Readability.
                 if ($content_type && strpos($content_type, "text/html") !== FALSE) {
                     $tmp = fetch_file_contents($content_link->getAttribute("href"));
                     //_debug("tmplen: " . mb_strlen($tmp));
                     // Only documents shorter than 65535 * 4 characters (per mb_strlen) are processed.
                     if ($tmp && mb_strlen($tmp) < 65535 * 4) {
                         $r = new Readability($tmp, $content_link->getAttribute("href"));
                         if ($r->init()) {
                             // Rewrite links and image sources via rewrite_relative_url(),
                             // using the linked page's address as base.
                             $tmpxpath = new DOMXPath($r->dom);
                             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
                             foreach ($entries as $entry) {
                                 if ($entry->hasAttribute("href")) {
                                     $entry->setAttribute("href", rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("href")));
                                 if ($entry->hasAttribute("src")) {
                                     $entry->setAttribute("src", rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("src")));
                             // Extracted article first, then a rule, then the original content.
                             $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                             // prob not a very good idea (breaks wikipedia pages, etc) -
                             // inliner currently is not really fit for any random web content
                             //$doc = new DOMDocument();
                             //$xpath = new DOMXPath($doc);
                             //$found = $this->inline_stuff($article, $doc, $xpath);
         // When inline_stuff() reported a match, the content becomes the serialised <body> of $doc.
         $node = $doc->getElementsByTagName('body')->item(0);
         if ($node && $found) {
             $article["content"] = $doc->saveXML($node);
     return $article;
Esempio n. 18
 /**
  * Replace an article's content with the Readability extraction of its link.
  *
  * On success the article (taken by reference) gets new 'content' and its
  * 'plugin_data' is prefixed with a "feedmod,<owner uid>:" marker. Nothing is
  * changed when Readability's init() reports failure.
  *
  * @param array $article Article record, modified in place; reads 'link',
  *                       'plugin_data' and 'owner_uid'.
  * @param array $config  Feed configuration; a truthy 'footnote_links' entry
  *                       enables link-to-footnote conversion.
  * @return void
  */
 function filter_article_readability(&$article, $config)
 {
     require_once 'readability/Readability.php';

     $link = trim($article['link']);
     $html = $this->load_url($link, $config);

     $readability = new Readability($html, $link);
     $readability->debug = false;
     $readability->convertLinksToFootnotes = !empty($config['footnote_links']);

     if ($readability->init()) {
         $article['content'] = $readability->getContent()->innerHTML;

         // $owner_uid was interpolated here without ever being defined in this
         // function; take it from the article record. The empty-string fallback
         // keeps the old "feedmod,:" output when the key is absent.
         $owner_uid = isset($article['owner_uid']) ? $article['owner_uid'] : '';
         $article['plugin_data'] = "feedmod,{$owner_uid}:" . $article['plugin_data'];
     }
 }
Esempio n. 19
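  * Fetches a URL, runs the response body through Readability and returns the stripped content.
  * @since 1.7
  * @param string $url URL of the page to fetch.
  * @return string|false Extracted HTML, the string 'error-secured' when the request fails, or false when no usable content is extracted.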
 public static function readability_object($url)
     // Fetches $url with wp_remote_get(), runs the response body through Readability and returns the
     // extracted HTML. Returns the string 'error-secured' when the request yields a WP_Error, and
     // false when the body is empty, Readability's init() fails, or the result is under 120 bytes.
     // NOTE(review): pf_de_https() is defined elsewhere; its exact effect on the URL is not visible here.
     $url = pf_de_https($url);
     // Replace HTML-encoded ampersands with literal ones before requesting the URL.
     $url = str_replace('&amp;', '&', $url);
     //print_r($url); print_r(' - Readability<br />');
     // change from Boone - use wp_remote_get() instead of file_get_contents()
     $request = wp_remote_get($url, array('timeout' => '30'));
     // Request-level failure: hand back a sentinel string instead of content.
     if (is_wp_error($request)) {
         $content = 'error-secured';
         //print_r($request); die();
         return $content;
     // An empty response body is reported as false.
     if (!empty($request['body'])) {
         $html = $request['body'];
     } else {
         $content = false;
         return $content;
     // Check whether tidy exists so the input can be cleaned up first.
     if (function_exists('tidy_parse_string')) {
         $tidy = tidy_parse_string($html, array(), 'UTF8');
         $html = $tidy->value;
     // give it to Readability
     $readability = new Readability($html, $url);
     // print debug output?
     // useful to compare against Arc90's original JS version -
     // simply click the bookmarklet with FireBug's
     // console window open
     $readability->debug = false;
     // convert links to footnotes?
     $readability->convertLinksToFootnotes = false;
     // process it
     $result = $readability->init();
     if ($result) {
         $content = $readability->getContent()->innerHTML;
         //$content = $contentOut->innerHTML;
         // If tidy is available, use it again on the extracted fragment (body only, indented).
         if (function_exists('tidy_parse_string')) {
             $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
             $content = $tidy->value;
         // Normalise the markup with WordPress helpers before it is parsed as XML below.
         $content = balanceTags($content, true);
         $content = ent2ncr($content);
         $content = convert_chars($content);
         // Gates the saveXML() step further down.
         // NOTE(review): no visible line increments $domRotated, so as shown that step never runs -
         // confirm whether increments were lost from this listing.
         $domRotated = 0;
         $dom = new domDocument('1.0', 'utf-8');
         $dom->preserveWhiteSpace = true;
         $dom->substituteEntities = true;
         // NOTE(review): resolveExternals is enabled while parsing remotely fetched content -
         // confirm this cannot be abused to load external entities.
         $dom->resolveExternals = true;
         // The fragment is wrapped in a synthetic <fullContent> root so it can be loaded as XML.
         $dom->loadXML('<fullContent>' . $content . '</fullContent>');
         $images = $dom->getElementsByTagName('img');
         // Try to turn non-absolute image sources into absolute URLs based on the page's host.
         foreach ($images as $image) {
             $img = $image->getAttribute('src');
             // strpos() yields false when 'http' is absent and `false != 0` is false, so the second
             // test only matches when 'http' occurs at an offset greater than 0.
             // NOTE(review): relative paths without a leading slash (e.g. "img/a.png") therefore
             // skip this branch - confirm whether `!== 0` was intended.
             if (strpos($img, '/') === 0 || strpos($img, 'http') != 0) {
                 $urlArray = parse_url($url);
                 // Same loose comparison as above; it decides whether a '/' separator is appended.
                 // NOTE(review): the scheme is hard-coded to http:// regardless of the page's scheme.
                 if (strpos($img, 'http') != 0) {
                     $urlBase = 'http://' . $urlArray['host'] . '/';
                 } else {
                     $urlBase = 'http://' . $urlArray['host'];
                 // Each candidate is probed with a HEAD request: host-based first, then the page URL
                 // with the source appended.
                 if (!is_wp_error(wp_remote_head($urlBase . $img))) {
                     $image->setAttribute('src', $urlBase . $img);
                 } elseif (!is_wp_error(wp_remote_head($url . $img))) {
                     $image->setAttribute('src', $url . $img);
                 // NOTE(review): this else branch has no visible body.
                 } else {
         // Serialise the modified DOM back to a string and blank out the XML declaration it adds.
         if ($domRotated > 0) {
             $content = $dom->saveXML();
             $rel = '(<\\?xml version="1\\.0" encoding="utf-8"\\?>)';
             $content = preg_replace("/" . $rel . "/is", ' ', $content);
             $rel = '(<\\?xml version="1\\.0"\\?>)';
             $content = preg_replace("/" . $rel . "/is", ' ', $content);
         // Results shorter than 120 bytes are treated as a failed extraction.
         if (120 > strlen($content)) {
             $content = false;
         // Leftover debugging aids, kept disabled:
         #			$content = stripslashes($content);
         # print_r($content);
         #				var_dump($content); die();
         // this will also output doctype and comments at top level
         #			$content = "";
         #			foreach($dom->childNodes as $node){
         #				$content .= $dom->saveXML($node)."\n";
         #			}
     } else {
         # If Readability can't get the content, send back a FALSE to loop with.
         $content = false;
         # and let's throw up an error via AJAX as well, so we know what's going on.
         //print_r($url . ' fails Readability.<br />');
     // Loose comparison: besides false this also skips '' and '0'.
     if ($content != false) {
         // Final pass through pf_htmlchecker::closetags() (defined elsewhere).
         $contentObj = new pf_htmlchecker($content);
         $content = $contentObj->closetags($content);
     return $content;
Esempio n. 20
 function import($feedObj, $maxItems = 0)
     // Imports the items of the feed described by $feedObj as blog entries.
     // $feedObj supplies the feed URL, its params and the per-item defaults (category, publish state,
     // creator, ...). When $maxItems is empty, the 'feedamount' feed parameter is used instead.
     // Returns $itemMigrated, which is initialised to 0.
     // NOTE(review): several branch bodies below are empty and no visible line increments
     // $itemMigrated - confirm against the complete source before relying on these comments.
     $config = EasyBlogHelper::getConfig();
     $itemMigrated = 0;
     $isDomSupported = false;
     // Fallback tag whitelist for strip_tags() when the feed has no 'allowed' parameter.
     $defaultAllowedHTML = '<img>,<a>,<br>,<table>,<tbody>,<th>,<tr>,<td>,<div>,<span>,<p>,<h1>,<h2>,<h3>,<h4>,<h5>,<h6>';
     // Readability is only loaded when the DOM extension's DomDocument class is available.
     if (class_exists('DomDocument')) {
         $isDomSupported = true;
         require_once EBLOG_CLASSES . DIRECTORY_SEPARATOR . 'readability' . DIRECTORY_SEPARATOR . 'Readability.php';
     $params = EasyBlogHelper::getRegistry($feedObj->params);
     $maxItems = $maxItems ? $maxItems : $params->get('feedamount', 0);
     $feedURL = $feedObj->url;
     require_once EBLOG_HELPERS . DIRECTORY_SEPARATOR . 'connectors.php';
     $connector = new EasyBlogConnectorsHelper();
     $content = $connector->getResult($feedURL);
     // Ensure there is no leading text before the XML declaration: everything up to the first
     // "<?xml version" is dropped.
     //$pattern	= '/(.*?)(?=<\?xml)/ims';
     $pattern = '/(.*?)<\\?xml version/is';
     $replacement = '<?xml version';
     $content = preg_replace($pattern, $replacement, $content, 1);
     if (strpos($content, '<?xml version') === false) {
         // The XML header looks to be missing from the content, so add one manually.
         $content = '<?xml version="1.0" encoding="utf-8"?>' . $content;
     // NOTE(review): no visible line passes $content to the parser before get_items() - confirm.
     $parser = new SimplePie();
     $items = '';
     $items = $parser->get_items();
     if (count($items) > 0) {
         // Process each feed item and insert it as a blog entry.
         $myCnt = 0;
         foreach ($items as $item) {
             // Reset the execution time limit for every item; failures are suppressed.
             @ini_set('max_execution_time', 180);
             // NOTE(review): this item-limit check has no visible body, and $myCnt is never incremented here.
             if (!empty($maxItems) && $myCnt == $maxItems) {
             // Subtract the item's timezone offset from its timestamp before formatting.
             // NOTE(review): 'Z' and 'U' look like PHP date() format characters (offset in seconds /
             // Unix time) - confirm against SimplePie's get_date().
             $timezoneSec = $item->get_date('Z');
             $itemdate = $item->get_date('U');
             $itemdate = $itemdate - $timezoneSec;
             $mydate = date('Y-m-d H:i:s', $itemdate);
             $feedUid = $item->get_id();
             $feedPath = $item->get_link();
             $feedHistory = EasyBlogHelper::getTable('FeedHistory');
             $newHistoryId = '';
             // NOTE(review): the branch for already-imported items has no visible body.
             if ($feedHistory->isExists($feedObj->id, $feedUid)) {
             } else {
                 // Log the feed item so that it will not be processed again in future.
                 $date = EasyBlogHelper::getDate();
                 $newHistory = EasyBlogHelper::getTable('FeedHistory');
                 $newHistory->feed_id = $feedObj->id;
                 $newHistory->uid = $feedUid;
                 $newHistory->created = $date->toMySQL();
                 // NOTE(review): the id is read without a visible store() call on $newHistory.
                 $newHistoryId = $newHistory->id;
             $blogObj = new stdClass();
             // Apply the defaults configured for this feed in the backend.
             $blogObj->category_id = $feedObj->item_category;
             $blogObj->published = $feedObj->item_published;
             $blogObj->frontpage = $feedObj->item_frontpage;
             $blogObj->created_by = $feedObj->item_creator;
             $blogObj->allowcomment = $config->get('main_comment', 1);
             $blogObj->subscription = $config->get('main_subscription', 1);
             $blogObj->issitewide = '1';
             // Start from the content shipped in the feed; it may be replaced by the full text below.
             $text = $item->get_content();
             // @rule: Append copyright text
             $blogObj->copyrights = $params->get('copyrights', '');
             // Optionally fetch the linked page and extract its full text with Readability.
             if ($feedObj->item_get_fulltext && $isDomSupported) {
                 $feedItemUrl = urldecode($item->get_link());
                 $fiConnector = new EasyBlogConnectorsHelper();
                 $fiContent = $fiConnector->getResult($feedItemUrl);
                 // Ensure there is no leading text before the opening <html tag.
                 $pattern = '/(.*?)<html/is';
                 $replacement = '<html';
                 $fiContent = preg_replace($pattern, $replacement, $fiContent, 1);
                 if (!empty($fiContent)) {
                     $fiContent = EasyBlogHelper::getHelper('string')->forceUTF8($fiContent);
                     $readability = new Readability($fiContent);
                     $readability->debug = false;
                     $readability->convertLinksToFootnotes = false;
                     $result = $readability->init();
                     if ($result) {
                         // Note: this reuses (and overwrites) the $content variable that held the raw feed.
                         $content = $readability->getContent()->innerHTML;
                         //$content	= EasyBlogHelper::getHelper( 'string' )->fixUTF8( $content );
                         $content = EasyBlogFeedsHelper::tidyContent($content);
                         // The extraction replaces the feed text only when, after entity decoding,
                         // it does not contain a "<!DOCTYPE html" marker.
                         if (stristr(html_entity_decode($content), '<!DOCTYPE html') === false) {
                             $text = $content;
                             // Rewrite relative links using the item's link as the base (helper defined elsewhere).
                             $text = $this->_processRelLinktoAbs($text, $feedPath);
             // Strip HTML tags that are not allowed.
             $text = strip_tags($text, $params->get('allowed', $defaultAllowedHTML));
             // Append original source link into article if necessary
             if ($params->get('sourceLinks')) {
                 JFactory::getLanguage()->load('com_easyblog', JPATH_ROOT);
                 // NOTE(review): the item link is concatenated into the href without escaping - confirm it is trusted.
                 $text .= '<div><a href="' . $item->get_link() . '" target="_blank">' . JText::_('COM_EASYBLOG_FEEDS_ORIGINAL_LINK') . '</a></div>';
             // Optionally credit the original author at the end of the text.
             if ($feedObj->author) {
                 $feedAuthor = $item->get_author();
                 if (!empty($feedAuthor)) {
                     $authorName = $feedAuthor->get_name();
                     $authorEmail = $feedAuthor->get_email();
                     if (!empty($authorName)) {
                         // The author line is appended to the article text.
                         $text .= '<div>' . JText::sprintf('COM_EASYBLOG_FEEDS_ORIGINAL_AUTHOR', $authorName) . '</div>';
                     } else {
                         // No name given: take the second space-separated token of the email field
                         // and strip parentheses from it, e.g. "a@b.c (Name)" gives "Name".
                         if (!empty($authorEmail)) {
                             $authorArr = explode(' ', $authorEmail);
                             if (isset($authorArr[1])) {
                                 $authorName = $authorArr[1];
                                 $authorName = str_replace(array('(', ')'), '', $authorName);
                                 $text .= '<div>' . JText::sprintf('COM_EASYBLOG_FEEDS_ORIGINAL_AUTHOR', $authorName) . '</div>';
             // The feed setting decides whether the text is stored as intro or as main content.
             if ($feedObj->item_content == 'intro') {
                 $blogObj->intro = $text;
             } else {
                 $blogObj->content = $text;
             // Created, modified and publish_up all use the adjusted item date.
             // $creationDate is not read again in the visible code.
             $creationDate = $mydate;
             $blogObj->created = $mydate;
             $blogObj->modified = $mydate;
             $blogObj->title = $item->get_title();
             // Fall back to a title derived from the item link when the feed provides none.
             if (empty($blogObj->title)) {
                 $blogObj->title = $this->_getTitleFromLink($item->get_link());
             $blogObj->title = EasyBlogStringHelper::unhtmlentities($blogObj->title);
             $blogObj->permalink = EasyBlogHelper::getPermalink($blogObj->title);
             $blogObj->publish_up = $mydate;
             // isnew is true exactly when the item is not published on import.
             $blogObj->isnew = !$feedObj->item_published ? true : false;
             // NOTE(review): no visible line copies $blogObj into $blog before store() - confirm.
             $blog = EasyBlogHelper::getTable('blog');
             // NOTE(review): this check has no visible body.
             if ($feedObj->item_published) {
             if ($blog->store()) {
                 // Update the history record with the new blog id.
                 // NOTE(review): no visible load/store call on $tmpHistory.
                 if (!empty($newHistoryId)) {
                     $tmpHistory = EasyBlogHelper::getTable('FeedHistory');
                     $tmpHistory->post_id = $blog->id;
                 if ($feedObj->item_published) {
                     // Insert the activity stream entry here.
                     EasyBlogHelper::addJomSocialActivityBlog($blog, true, true);
                     // Determines if admin wants to auto post this item to the social sites.
                     if ($params->get('autopost')) {
                         $allowed = array(EBLOG_OAUTH_LINKEDIN, EBLOG_OAUTH_FACEBOOK, EBLOG_OAUTH_TWITTER);
                         // @rule: Process centralized options first
                         // See if there are any global postings enabled.
                         $blog->autopost($allowed, $allowed);
             //end if
     return $itemMigrated;