Example #1
0
 /**
  * Return a sanitised, "readable" HTML rendering of this instance's body.
  *
  * Pipeline: Readability extraction -> strip_tags() -> HTMLPurifier -> HTML Tidy.
  * The result is stored on the instance so later calls (and other methods
  * such as terms()) can reuse it without recomputing.
  *
  * NOTE: the cached value is not keyed by the arguments, so a second call
  * with different arguments returns the first call's result.
  *
  * @param string $allowed_tags    Tag whitelist handed to strip_tags().
  * @param array  $allowed_classes Value for HTMLPurifier's Attr.AllowedClasses directive.
  * @return string Cleaned HTML.
  */
 function readable($allowed_tags = '<p><a><h1><h2><h3>', $allowed_classes = [])
 {
     // If this function has run before on this instance then return the saved text
     if ($this->_readable) {
         return $this->_readable;
     }
     $html = $this->body;
     // Workaround: BuzzFeed's inline JS templates contain the fragment
     // "+ '</div>'", which trips up PHP's DOMDocument. Strip it before parsing.
     // This line can be removed once their markup no longer contains it.
     $html = str_replace("+ '</div>'", '', $html);
     // Step 1. Readability
     $r = new Readability($html);
     $r->init();
     // Presumably getContent() yields nothing when no article node was found;
     // fall back to an empty string instead of dereferencing a non-object.
     $node = $r->getContent();
     $html = $node ? $node->innerHTML : '';
     // Step 2. strip_tags
     $html = strip_tags($html, $allowed_tags);
     // Step 3. HTMLPurifier (dotted directive name; the separate
     // namespace/key form is the deprecated API)
     $config = HTMLPurifier_Config::createDefault();
     $config->set('Attr.AllowedClasses', $allowed_classes);
     $html = (new HTMLPurifier($config))->purify($html);
     // Step 4. HTML Tidy - keep the purified markup if tidy reports failure
     $tidied = tidy_repair_string($html, ['bare' => true, 'show-body-only' => true, 'wrap' => 0], 'UTF8');
     if ($tidied !== false) {
         $html = $tidied;
     }
     // Save the result to the instance before returning
     $this->_readable = $html;
     return $this->_readable;
 }
Example #2
0
 /**
  * Download a page, extract its title and main content with Readability,
  * store both on the instance and write them to the cache.
  *
  * @param string $url Address of the page to read.
  */
 private function readFromServer($url)
 {
     // Record that the tidy extension is usable
     if (function_exists('tidy_parse_string')) {
         $this->tidyAvailable = true;
     }
     $parts = parse_url($url);
     $this->baseUrl = "{$parts['scheme']}://{$parts['host']}";
     $rawHtml = $this->tidyClean($this->fetchContent($url));
     $extractor = new Readability($rawHtml, $url);
     $extractor->init();
     $this->title = $extractor->getTitle()->textContent;
     // A second tidy pass is required, otherwise DOMDocument will not
     // handle the extracted fragment properly.
     $extracted = $this->tidyClean($extractor->getContent()->innerHTML);
     // Relative URLs are rewritten via DOMDocument.
     $this->content = $this->fixRelativeUrls($extracted);
     $this->writeCache(serialize(array('title' => $this->title, 'content' => $this->content)));
 }
Example #3
0
 /**
  * Fetch a web page, extract its main article with Readability and load the
  * result into a DOM document.
  *
  * Fills $this->metadata['title'] / ['author'] when they are not already set,
  * then stores the processed images in $this->images and the serialised
  * article HTML in $this->text.
  *
  * @param string $url Page address; "http://" is prepended when no scheme is given.
  * @throws \RuntimeException When the extracted article cannot be parsed as HTML
  *                           (previously the script was terminated with die()).
  */
 public function __construct($url)
 {
     if (!preg_match('!^https?://!i', $url)) {
         $url = 'http://' . $url;
     }
     $data = Http::Request($url);
     $html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");
     $r = new Readability($html, $url);
     $r->init();
     if (!isset($this->metadata["title"])) {
         $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));
     }
     if (!isset($this->metadata["author"])) {
         // parse_url() yields null/false when the host is missing or the URL
         // is malformed; normalise both to null instead of raising a notice.
         $this->metadata["author"] = parse_url($url, PHP_URL_HOST) ?: null;
     }
     $article = $r->getContent()->innerHTML;
     // Wrap the fragment in a full document that declares UTF-8 so that
     // DOMDocument does not guess the encoding.
     if (substr($article, 0, 5) == "<body") {
         $article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>" . $article . "</html>";
     } else {
         $article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head><body>" . $article . "</body></html>";
     }
     $doc = new DOMDocument();
     // Collect libxml parse warnings internally rather than silencing with "@".
     $previous = libxml_use_internal_errors(true);
     $loaded = $doc->loadHTML($article);
     libxml_clear_errors();
     libxml_use_internal_errors($previous);
     if (!$loaded) {
         // Do not echo the article back to the client; report the failure instead.
         throw new \RuntimeException('Unable to parse extracted article HTML for ' . $url);
     }
     $doc->normalizeDocument();
     $this->images = $this->handleImages($doc, $url);
     $this->text = $doc->saveHTML();
 }
Example #4
0
/**
 * Fetch a page and extract its article, unless the URL was collected before.
 *
 * @param string $href URL of the page to collect.
 * @return array|null The Readability result (with a shortened 'title') when the
 *                    extracted 'content' is longer than 1000 bytes; null when the
 *                    URL is already stored, the download fails, no charset
 *                    declaration is found, or the content is too short.
 */
function run_work($href)
{
    global $db2, $tablepre;
    // NOTE(review): $href is interpolated into the SQL text as-is. Confirm the
    // caller escapes it, or switch to whatever escaping/binding $db2 offers.
    $sql = "select * from {$tablepre}collection_content where url = '{$href}'";
    $query = $db2->query($sql);
    $row = $db2->fetch_array($query);
    if ($row) {
        // Escape the URL before echoing it into HTML.
        echo "<font color = 'red'>" . htmlspecialchars($href, ENT_QUOTES) . "已收集过</font><br>";
        return null;
    }
    $from = get_url_contents($href);
    if (!$from) {
        return null;
    }
    // Without a charset declaration $charset[1] does not exist; bail out
    // instead of reading an undefined index.
    if (!preg_match('/charset=\\"?(.*?)\\"/si', $from, $charset) || !$charset[1]) {
        return null;
    }
    $da = new Readability($from, $charset[1]);
    $data = $da->getContent();
    // Keep only the part of the title before the first "_" or "-".
    if (preg_match('/(.*?)[_-].*?/si', $data['title'], $title)) {
        $data['title'] = $title[1];
    }
    if (strlen($data['content']) > 1000) {
        return $data;
    }
    return null;
}
/**
 * Queue consumer: download the article named in the message, extract it with
 * Readability and store title/content on the matching `articles` row.
 *
 * The message is always acknowledged, including when the payload is malformed,
 * the download fails, or no content could be extracted.
 *
 * NOTE(review): relies on the legacy mysql_* extension (removed in PHP 7) and
 * on a connection opened elsewhere.
 *
 * @param object $msg Message exposing ->body (JSON with article_id and url)
 *                    and ->delivery_info['channel'] / ['delivery_tag'].
 */
function process_message($msg)
{
    global $useragent;
    $body = $msg->body;
    print "{$body}\n";
    $blob = json_decode($body);
    // Skip payloads that are not the expected JSON object.
    if (is_object($blob) && isset($blob->article_id, $blob->url)) {
        $crl = curl_init();
        curl_setopt($crl, CURLOPT_URL, $blob->url);
        curl_setopt($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($crl, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($crl, CURLOPT_MAXCONNECTS, 30);
        curl_setopt($crl, CURLOPT_TIMEOUT, 10);
        curl_setopt($crl, CURLOPT_USERAGENT, $useragent);
        $html = curl_exec($crl);
        curl_close($crl);
        // curl_exec() returns false on transfer errors; nothing to parse then.
        if ($html !== false) {
            $html = str_replace('&rsquo;', '&#8217;', $html);
            $r = new Readability($html);
            $rData = $r->getContent();
            // Escape every interpolated value, including the id, with the
            // connection-aware escaper instead of addslashes().
            $rtitle = mysql_real_escape_string((string) $rData['title']);
            $rcontent = mysql_real_escape_string((string) $rData['content']);
            $id = mysql_real_escape_string((string) $blob->article_id);
            if ($rcontent != "") {
                mysql_query("update articles set readability_title = '{$rtitle}', readability_content = '{$rcontent}' where id = '{$id}'");
            }
        }
    }
    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
Example #6
0
 /**
  * Download a page and extract its readable title and content.
  *
  * PHP Readability expects UTF-8 input; content in another encoding should be
  * converted (iconv() / mb_convert_encoding()) before it reaches this point.
  *
  * @param string $url Page to fetch.
  * @return array|false|null array('title' => ..., 'content' => ...) on success,
  *                          false when the Readability class is unavailable,
  *                          null when Readability's init() reports failure.
  */
 function grabContent($url)
 {
     global $prefs;
     $lib = TikiLib::lib('tiki');
     $httpClient = $lib->get_http_client($url);
     $httpResponse = $lib->http_perform_request($httpClient);
     // From here on work with the URL reached after redirections
     $url = (string) $httpClient->getUri();
     // Clean the input first - PHP's default HTML parser often produces
     // strange output on untidy markup.
     $html = $this->tidy($httpResponse->getBody());
     // Load the Readability library from the configured location, if present
     $libraryPath = $prefs['page_content_fetch_readability'];
     if (is_file($libraryPath)) {
         require_once $libraryPath;
     }
     if (!class_exists('Readability')) {
         return false;
     }
     $readability = new Readability($html, $url);
     if (!$readability->init()) {
         return;
     }
     $content = $this->replacePaths($this->tidy($readability->getContent()->innerHTML), $url);
     return array('title' => $readability->getTitle()->textContent, 'content' => $content);
 }
 /**
  * Extract article from a page using php-readability.
  *
  * @param string $url Page to download.
  * @return array array('title' => ..., 'content' => ...) taken from Readability's result.
  */
 function getArticle($url)
 {
     // require_once: a plain require re-declares the class (fatal error) when
     // this function is called more than once per request.
     require_once 'class.Readability.php';
     $html = file_get_contents($url);
     // default charset is utf-8
     $html_input_charset = 'utf-8';
     $Readability = new Readability($html, $html_input_charset);
     $ReadabilityData = $Readability->getContent();
     $results = array('title' => $ReadabilityData['title'], 'content' => $ReadabilityData['content']);
     return $results;
 }
Example #8
0
 /**
  * Replace a cached item's description with the full article downloaded from
  * the item's link.
  *
  * @param mixed $itemId Identifier understood by the cache's getItem().
  * @return mixed The item; its description is replaced (and saved to the cache)
  *               only when the page was downloaded and Readability's init() succeeded.
  */
 public function downloadItem($itemId)
 {
     $item = $this->cache->getItem($itemId);
     $html = file_get_contents($item->link);
     // file_get_contents() returns false on failure; do not hand a
     // non-string or an empty document to the HTML parser.
     if ($html === false || $html === '') {
         return $item;
     }
     $reader = new Readability($html, $item->link);
     if ($reader->init()) {
         $item->description = $reader->articleContent->innerHTML;
         // save the downloaded content to cache, just in case
         $this->cache->saveItemDescription($item);
     }
     return $item;
 }
 /**
  * Extract the article title and content from an HTML page.
  *
  * The page URL is taken from the response's Location header when present,
  * otherwise from the "readerUrl" option; it is used to set $this->basePath
  * and $this->baseUrl before relative URLs are fixed.
  *
  * @param string $html Raw page markup.
  * @return array|null array('title' => ..., 'content' => ...), or null when
  *                    $html is empty or Readability's init() reports failure.
  */
 public function parseData($html)
 {
     if (strlen($html) == 0) {
         return null;
     }
     // check tidy function available
     if (function_exists('tidy_parse_string')) {
         $this->tidyAvailable = true;
     }
     $headers = $this->response->getHeaders();
     if (isset($headers['Location'])) {
         $url = $headers["Location"];
     } else {
         $url = $this->getOption("readerUrl");
     }
     $urlArray = parse_url($url);
     if (isset($urlArray['path'])) {
         $this->basePath = dirname($urlArray['path']);
     } else {
         $this->basePath = "";
     }
     // parse_url() may omit scheme/host (or return false); read them defensively.
     $scheme = isset($urlArray['scheme']) ? $urlArray['scheme'] : '';
     $host = isset($urlArray['host']) ? $urlArray['host'] : '';
     $this->baseUrl = $scheme . "://" . $host;
     $html = $this->tidyClean($html);
     $readability = new Readability($html, $url);
     $article = null;
     if ($readability->init()) {
         $title = $readability->getTitle()->textContent;
         $content = $readability->getContent()->innerHTML;
         // still need one more tidy clean, otherwise domdocument will not work properly
         $content = $this->tidyClean($content);
         $content = $this->removeEmptyTags($content);
         // use domdocument to fix relative urls
         $content = $this->fixRelativeUrls($content);
         // if there is tidy support, then detect target source code from meta content
         if ($this->tidyAvailable) {
             $tidy = tidy_parse_string($content, array(), 'utf8');
             // head() can be null when the parsed tree has no <head> node.
             $head = $tidy ? $tidy->head() : null;
             $charset = $head ? $this->findCharset($head->value) : null;
             // Charset names are case-insensitive ("UTF-8" equals "utf-8").
             if (!empty($charset) && strcasecmp($charset, "utf-8") !== 0) {
                 $content = mb_convert_encoding($content, "utf-8", $charset);
             }
         }
         $article = array('title' => $title, 'content' => $content);
     }
     return $article;
 }
Example #10
0
 /**
  * Scrape every listing not yet marked as scraped: download the page, pull out
  * the street address and a plain-text description, geocode the address (or the
  * neighborhood when no address is present) and store the result.
  *
  * Failures for a single listing are logged and do not stop the loop.
  * (A commented-out concurrent variant built on Guzzle::sendAll previously
  * lived here; it was dead code and has been dropped.)
  */
 function perform()
 {
     $q = DB::query('SELECT link, neighborhood FROM listings WHERE scraped != TRUE', PDO::FETCH_ASSOC);
     $ps = DB::prepare('UPDATE listings SET scraped=TRUE, street=:street, description=:description, lat=:lat, lng=:lng WHERE link=:link');
     foreach ($q as $listing) {
         try {
             $body = Guzzle::get('http://newyork.craigslist.org' . $listing['link'])->getBody();
             $crawler = new Crawler($body);
             $readability = new Readability($body);
             $street = $crawler->filter('.mapAndAttrs > .mapbox > div.mapaddress');
             $streetText = $street->count() ? $street->text() : null;
             // Addresses contain spaces, "#", "&" and similar characters, so
             // they must be URL-encoded before being placed in the query string.
             $address = $streetText !== null ? $streetText : $listing['neighborhood'];
             $url = 'http://maps.googleapis.com/maps/api/geocode/json?address=' . urlencode((string) $address);
             $json = json_decode(Guzzle::get($url)->getBody(), true);
             $loc = isset($json['results'][0]) ? $json['results'][0]['geometry']['location'] : null;
             // Plain-text description: tidy the extracted HTML, then strip the tags.
             $description = $readability->init()
                 ? trim(strip_tags((string) tidy_parse_string($readability->getContent()->innerHTML, [], 'UTF8')))
                 : null;
             $ps->execute([
                 ':link' => $listing['link'],
                 ':lat' => isset($loc['lat']) ? $loc['lat'] : null,
                 ':lng' => isset($loc['lng']) ? $loc['lng'] : null,
                 ':street' => $streetText,
                 ':description' => $description,
             ]);
         } catch (Exception $e) {
             Logger::error($e->getMessage(), $ps->errorinfo());
         }
     }
 }
Example #11
0
 /**
  * Download and parse a feed, update the stored feed record and add one post
  * per feed item, using the full article text when it can be extracted.
  *
  * For each item the linked page is downloaded, cleaned with tidy and run
  * through Readability. When the download or the extraction fails, the
  * content supplied by the feed itself is stored instead.
  *
  * @param OutputInterface $output Console output for progress and error lines.
  * @param mixed           $id     Primary key of the feed record.
  * @param string          $url    Feed URL.
  * @return mixed Result of writeErrors() when the feed cannot be read or parsed;
  *               otherwise nothing.
  */
 protected function absorb(OutputInterface $output, $id, $url)
 {
     $config = new Config();
     $config->setClientUserAgent('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:11.0) Gecko/20100101 Firefox/11.0');
     $reader = new Reader($config);
     $reader->download($url);
     $parser = $reader->getParser();
     if ($parser === false) {
         return $this->writeErrors($output);
     }
     $feed = $parser->execute();
     if ($feed === false) {
         return $this->writeErrors($output);
     }
     $data = ['lang' => $feed->getLanguage(), 'title' => $feed->getTitle(), 'lastUpdate' => $this->formatDateForMySQL($feed->getDate())];
     $this->feedRepository->updateByPk($data, $id);
     foreach ($feed->items as $item) {
         // Separate name so the feed URL parameter is not overwritten.
         $itemUrl = $item->getUrl();
         $output->writeln('+ ' . $item->title);
         $content = null;
         $extracted = false;
         $fullContent = file_get_contents($itemUrl);
         // file_get_contents() returns false when the page cannot be fetched;
         // only attempt extraction when there is markup to work with.
         if ($fullContent !== false && $fullContent !== '') {
             $tidy = tidy_parse_string($fullContent, array(), 'UTF8');
             $tidy->cleanRepair();
             $readability = new \Readability($tidy->value, $itemUrl);
             if ($readability->init()) {
                 $tidy = tidy_parse_string($readability->getContent()->innerHTML, array('indent' => true, 'show-body-only' => true), 'UTF8');
                 $tidy->cleanRepair();
                 $content = $tidy->value;
                 $extracted = true;
             }
         }
         if (!$extracted) {
             $output->writeln('unable to get full content');
             $content = $item->getContent();
         }
         $data = ['feedId' => $id, 'remoteId' => $item->getId(), 'title' => $item->getTitle(), 'url' => $itemUrl, 'pubDate' => $this->formatDateForMySQL($item->getDate()), 'content' => $content, 'author' => $item->getAuthor()];
         $this->postRepository->add($data, true);
     }
 }
Example #12
0
 /**
  * Replace an article's content with the Readability extraction of the page
  * behind its link, and tag the article's plugin_data with "feedmod,<uid>:".
  *
  * The article is modified in place and only when Readability's init() succeeds.
  *
  * @param array $article Article record (passed by reference); reads 'link',
  *                       'plugin_data' and 'owner_uid'.
  * @param array $config  Filter settings; 'footnote_links' toggles conversion of
  *                       links to footnotes.
  */
 function filter_article_readability(&$article, $config)
 {
     require_once 'readability/Readability.php';
     $link = trim($article['link']);
     $html = $this->load_url($link, $config);
     $readability = new Readability($html, $link);
     $readability->debug = false;
     $readability->convertLinksToFootnotes = isset($config['footnote_links']) && $config['footnote_links'];
     $result = $readability->init();
     if ($result) {
         // $owner_uid was never defined in this scope, so the marker came out
         // as "feedmod,:". Presumably the id is meant to come from the
         // article record - TODO confirm against the caller.
         $owner_uid = isset($article['owner_uid']) ? $article['owner_uid'] : '';
         $article['content'] = $readability->getContent()->innerHTML;
         $article['plugin_data'] = "feedmod,{$owner_uid}:" . $article['plugin_data'];
     }
 }
Example #13
0
 /**
  * Turn a raw news page into a structured record.
  *
  * Content comes from the custom detector when it finds something, otherwise
  * from Readability. On success the result holds 'title', 'content',
  * 'searchContent', 'thumb' and 'date'; when an exception is raised the
  * message is stored under 'error'. The result stays empty when the cleaned
  * content has no text.
  *
  * @param string       $html   Page markup.
  * @param string       $url    Page address, handed to Readability.
  * @param Sources|null $source Optional source; stored on the instance when given.
  * @return array
  */
 public function parse($html, $url, Sources $source = null)
 {
     if ($source) {
         $this->source = $source;
     }
     $parsedNews = array();
     try {
         $html = $this->stripTagWithContent($html, "script");
         $detected = $this->tryContentDetect($this->processExcludeElements($html));
         $readability = new \Readability($html, $url);
         $readability->debug = false;
         $readability->convertLinksToFootnotes = false;
         $found = $readability->init();
         // Neither extractor produced anything usable
         if (!$found && !$detected) {
             throw new Exception('Looks like we couldn\'t find the content. :(');
         }
         $title = $this->processTitleStopWords($readability->getTitle()->textContent);
         // Prefer the custom detector's result over Readability's
         $content = $detected ? $detected : $readability->getContent()->innerHTML;
         $content = $this->processContentStopWords($content);
         $content = preg_replace('/\\n/', ' ', $content);
         $content = strip_tags($content, "<p><div><img><span><br><ul><li><embed><iframe>");
         $content = $this->fixUrls($content);
         $content = $this->processExcludeElements($content);
         $date = $this->processPublishDate($html);
         $plain = trim(strip_tags($content));
         if ($plain) {
             // Searchable text: letters and spaces only, whitespace collapsed,
             // then encoded as HTML entities.
             $plain = preg_replace('/\\n/', ' ', $plain);
             $plain = preg_replace("/[^а-яa-z ]/ui", "", $plain);
             $plain = preg_replace('/\\s+/', ' ', $plain);
             $plain = mb_convert_encoding($plain, 'HTML-ENTITIES', "UTF-8");
             $parsedNews['title'] = $title;
             $parsedNews['content'] = $content;
             $parsedNews['searchContent'] = $plain;
             $parsedNews['thumb'] = $this->detectThumb($html, $content);
             $parsedNews['date'] = $date;
         }
     } catch (Exception $e) {
         $parsedNews['error'] = $e->getMessage();
     }
     return $parsedNews;
 }
Example #14
0
 /**
  * Download a page and return the HTML of its main article, with link and
  * image URLs rewritten relative to $url.
  *
  * @param string $url Page to extract.
  * @return string|false Article HTML, or false when the page is not text/html
  *                      (checked only when curl can be used), cannot be
  *                      fetched, is too large, or cannot be parsed/extracted.
  */
 public function extract_content($url)
 {
     if (!class_exists("Readability")) {
         require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
     }
     // Header-only request to check the content type before downloading
     $can_probe = !defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir");
     if ($can_probe) {
         $ch = curl_init($url);
         curl_setopt($ch, CURLOPT_TIMEOUT, 5);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, true);
         curl_setopt($ch, CURLOPT_NOBODY, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
         curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
         @curl_exec($ch);
         $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
         if (strpos($content_type, "text/html") === FALSE) {
             return false;
         }
     }
     $tmp = fetch_file_contents($url);
     // Nothing fetched, or the document is too large to process
     if (!$tmp || mb_strlen($tmp) >= 65535 * 4) {
         return false;
     }
     $tmpdoc = new DOMDocument("1.0", "UTF-8");
     if (!$tmpdoc->loadHTML($tmp)) {
         return false;
     }
     // For documents not detected as UTF-8, drop every <meta> element and
     // re-serialise before handing the markup to Readability
     if (strtolower($tmpdoc->encoding) != 'utf-8') {
         $metaxpath = new DOMXPath($tmpdoc);
         foreach ($metaxpath->query("//meta") as $meta) {
             $meta->parentNode->removeChild($meta);
         }
         $tmp = $tmpdoc->saveHTML();
     }
     $r = new Readability($tmp, $url);
     if (!$r->init()) {
         return false;
     }
     // Rewrite link/image targets relative to the page URL
     $linkxpath = new DOMXPath($r->dom);
     foreach ($linkxpath->query('(//a[@href]|//img[@src])') as $entry) {
         foreach (array("href", "src") as $attr) {
             if ($entry->hasAttribute($attr)) {
                 $entry->setAttribute($attr, rewrite_relative_url($url, $entry->getAttribute($attr)));
             }
         }
     }
     return $r->articleContent->innerHTML;
 }
Example #15
0
} else {
    $handle = curl_init();
    curl_setopt_array($handle, array(CURLOPT_USERAGENT => USER_AGENT, CURLOPT_FOLLOWLOCATION => true, CURLOPT_HEADER => false, CURLOPT_HTTPGET => true, CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 30, CURLOPT_URL => $request_url));
    $source = curl_exec($handle);
    curl_close($handle);
    // Write request data into cache file.
    @file_put_contents($request_url_cache_file, $source);
}
// Detect the encoding: take the first "charset=..." declaration found in the
// fetched source and fall back to utf-8 when there is none.
//if (!$charset = mb_detect_encoding($source)) {
//}
preg_match("/charset=([\\w|\\-]+);?/", $source, $match);
$charset = isset($match[1]) ? $match[1] : 'utf-8';
/**
 * With the HTML content fetched, extract the main body content.
 */
$Readability = new Readability($source, $charset);
$Data = $Readability->getContent();
// Emit the result as JSON, or render it as HTML (the default for any other
// $output_type value).
switch ($output_type) {
    case 'json':
        header("Content-type: text/json;charset=utf-8");
        $Data['url'] = $request_url;
        echo json_encode($Data);
        break;
    case 'html':
    default:
        header("Content-type: text/html;charset=utf-8");
        // Presumably the included template reads $title and $content - verify
        // against template/reader.html.
        $title = $Data['title'];
        $content = $Data['content'];
        include 'template/reader.html';
}
Example #16
0
 /**
  * Download a page and return the HTML of its main article.
  *
  * The page is fetched with curl when available, falling back to
  * file_get_contents() when curl is missing or the transfer fails. The source
  * is converted to UTF-8 using the first charset declaration found in it,
  * optionally cleaned with tidy, and run through Readability.
  *
  * @param string $request_url Page to fetch.
  * @return string|null Extracted article HTML, or null when the page could not
  *                     be fetched or processed.
  */
 private function get_full_post($request_url)
 {
     $content = null;
     try {
         // curl_exec() signals failure by returning false, not by throwing,
         // so the fallback has to test the return value.
         $source = false;
         if (function_exists('curl_init')) {
             $handle = curl_init();
             curl_setopt_array($handle, array(CURLOPT_USERAGENT => "Tiny Tiny RSS", CURLOPT_FOLLOWLOCATION => true, CURLOPT_HEADER => false, CURLOPT_HTTPGET => true, CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 30, CURLOPT_URL => $request_url));
             $source = curl_exec($handle);
             curl_close($handle);
         }
         if ($source === false) {
             $source = file_get_contents($request_url);
         }
         if ($source === false || $source === '') {
             return null;
         }
         // fix encoding -> done by itohsnap: https://github.com/itohsnap/ttrss_fullpost/commit/815e163b724fbfb426eff43bde6c3aa744a22ae5
         preg_match("/charset=([\\w|\\-]+);?/", $source, $match);
         $charset = isset($match[1]) ? $match[1] : 'utf-8';
         $source = mb_convert_encoding($source, 'UTF-8', $charset);
         // Clean with tidy, if exists
         if (function_exists('tidy_parse_string')) {
             $tidy = tidy_parse_string($source, array(), 'UTF8');
             $tidy->cleanRepair();
             $source = $tidy->value;
         }
         // get the Text
         require_once 'Readability.php';
         $readability = new Readability($source);
         $readability->debug = false;
         $readability->convertLinksToFootnotes = false;
         $readability->init();
         // Presumably getContent() yields nothing when no article node was
         // found; leave $content as null in that case.
         $node = $readability->getContent();
         if ($node) {
             $content = $node->innerHTML;
             // if we've got Tidy, let's clean it up for output
             if (function_exists('tidy_parse_string')) {
                 $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
                 $tidy->cleanRepair();
                 $content = $tidy->value;
             }
         }
     } catch (Exception $e) {
         // Best effort: when extraction throws, report "no content".
         $content = null;
     }
     return $content;
 }
<?php

require_once '../Readability.php';
header('Content-Type: text/plain; charset=utf-8');
// get latest Medialens alert
// (change this URL to whatever you'd like to test)
$url = 'http://medialens.org/alerts/index.php';
$html = file_get_contents($url);
// Note: PHP Readability expects UTF-8 encoded content.
// If your content is not UTF-8 encoded, convert it
// first before passing it to PHP Readability.
// Both iconv() and mb_convert_encoding() can do this.
// give it to Readability
$readability = new Readability($html, $url);
// print debug output?
// useful to compare against Arc90's original JS version -
// simply click the bookmarklet with FireBug's console window open
$readability->debug = false;
// convert links to footnotes?
$readability->convertLinksToFootnotes = true;
// process it
$result = $readability->init();
// does it look like we found what we wanted?
if ($result) {
    echo "== Title =====================================\n";
    echo $readability->getTitle()->textContent, "\n\n";
    echo "== Body ======================================\n";
    $content = $readability->getContent()->innerHTML;
    // if we've got Tidy, let's clean it up for output
    if (function_exists('tidy_parse_string')) {
        $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
Example #18
0
 /**
  * Article filter for reddit.com/r/ feed entries.
  *
  * 1. Optional duplicate check: when another entry of the same owner, older
  *    than one day, already contains the same "[link]" target, the article is
  *    flagged with force_catchup.
  * 2. Inlines media via inline_stuff().
  * 3. When nothing was inlined, Readability is enabled and the entry text is
  *    short (<= 150 characters), the linked page is fetched, its main content
  *    extracted and prepended to the article.
  *
  * @param array $article Article record; reads 'link', 'content', 'guid_hashed'
  *                       and 'owner_uid'.
  * @return array The (possibly modified) article.
  */
 function hook_article_filter($article)
 {
     if (strpos($article["link"], "reddit.com/r/") === FALSE) {
         return $article;
     }
     $doc = new DOMDocument();
     @$doc->loadHTML($article["content"]);
     $xpath = new DOMXPath($doc);
     // Resolve the "[link]" anchor up front. It used to be looked up only
     // inside the duplicate check, which left it undefined - and silently
     // disabled the Readability step - whenever that check was turned off.
     $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);
     $link_href = $content_link ? $content_link->getAttribute("href") : "";
     if ($this->host->get($this, "enable_content_dupcheck") && $content_link) {
         $content_href = db_escape_string($link_href);
         $entry_guid = db_escape_string($article["guid_hashed"]);
         $owner_uid = $article["owner_uid"];
         if (DB_TYPE == "pgsql") {
             $interval_qpart = "date_entered < NOW() - INTERVAL '1 day'";
         } else {
             $interval_qpart = "date_entered < DATE_SUB(NOW(), INTERVAL 1 DAY)";
         }
         $result = db_query("SELECT COUNT(id) AS cid\n\t\t\t\t\t\tFROM ttrss_entries, ttrss_user_entries WHERE\n\t\t\t\t\t\t\tref_id = id AND\n\t\t\t\t\t\t\t{$interval_qpart} AND\n\t\t\t\t\t\t\tguid != '{$entry_guid}' AND\n\t\t\t\t\t\t\towner_uid = '{$owner_uid}' AND\n\t\t\t\t\t\t\tcontent LIKE '%href=\"{$content_href}\">[link]%'");
         if ($result) {
             $num_found = db_fetch_result($result, 0, "cid");
             if ($num_found > 0) {
                 $article["force_catchup"] = true;
             }
         }
     }
     $found = $this->inline_stuff($article, $doc, $xpath);
     $wants_readability = !defined('NO_CURL') && function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150;
     if ($wants_readability) {
         if (!class_exists("Readability")) {
             require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
         }
         // Links to twitter, youtube and reddit itself are not extracted.
         $is_external = $content_link && strpos($link_href, "twitter.com") === FALSE && strpos($link_href, "youtube.com") === FALSE && strpos($link_href, "reddit.com") === FALSE;
         if ($is_external) {
             /* link may lead to a huge video file or whatever, we need to check content type before trying to
                parse it which pretty much requires curl */
             $ch = curl_init($link_href);
             curl_setopt($ch, CURLOPT_TIMEOUT, 5);
             curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
             curl_setopt($ch, CURLOPT_HEADER, true);
             curl_setopt($ch, CURLOPT_NOBODY, true);
             curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir"));
             curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
             @curl_exec($ch);
             $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
             if ($content_type && strpos($content_type, "text/html") !== FALSE) {
                 $tmp = fetch_file_contents($link_href);
                 // Skip empty responses and documents that are too large
                 if ($tmp && mb_strlen($tmp) < 65535 * 4) {
                     $r = new Readability($tmp, $link_href);
                     if ($r->init()) {
                         // Rewrite link/image targets relative to the linked page
                         $tmpxpath = new DOMXPath($r->dom);
                         $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
                         foreach ($entries as $entry) {
                             if ($entry->hasAttribute("href")) {
                                 $entry->setAttribute("href", rewrite_relative_url($link_href, $entry->getAttribute("href")));
                             }
                             if ($entry->hasAttribute("src")) {
                                 $entry->setAttribute("src", rewrite_relative_url($link_href, $entry->getAttribute("src")));
                             }
                         }
                         // The inliner is deliberately not run on the extracted
                         // content: it is not fit for arbitrary web pages
                         // (breaks wikipedia pages, etc).
                         $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                     }
                 }
             }
         }
     }
     // When something was inlined, serialise the modified DOM back into the article
     $node = $doc->getElementsByTagName('body')->item(0);
     if ($node && $found) {
         $article["content"] = $doc->saveXML($node);
     }
     return $article;
 }
Example #19
0
 /**
  * Article filter for reddit.com/r/ feed entries.
  *
  * Inlines media via inline_stuff(). When nothing was inlined, Readability is
  * enabled and the entry text is short (<= 150 characters), the page behind
  * the "[link]" anchor is fetched, its main content extracted and prepended
  * to the article.
  *
  * @param array $article Article record; reads 'link' and 'content'.
  * @return array The (possibly modified) article.
  */
 function hook_article_filter($article)
 {
     if (strpos($article["link"], "reddit.com/r/") === FALSE) {
         return $article;
     }
     $doc = new DOMDocument();
     @$doc->loadHTML($article["content"]);
     $xpath = new DOMXPath($doc);
     $found = $this->inline_stuff($article, $doc, $xpath);
     $try_readability = function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150;
     if ($try_readability) {
         if (!class_exists("Readability")) {
             require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
         }
         $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);
         $target = $content_link ? $content_link->getAttribute("href") : null;
         // Links to twitter, youtube and reddit itself are left alone
         $is_article_link = $content_link && strpos($target, "twitter.com") === FALSE && strpos($target, "youtube.com") === FALSE && strpos($target, "reddit.com") === FALSE;
         if ($is_article_link) {
             /* The link may lead to a huge video file or similar, so the content
                type is checked with a header-only request before anything is
                downloaded and parsed - which pretty much requires curl */
             $ch = curl_init($target);
             curl_setopt($ch, CURLOPT_TIMEOUT, 5);
             curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
             curl_setopt($ch, CURLOPT_HEADER, true);
             curl_setopt($ch, CURLOPT_NOBODY, true);
             curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
             curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
             @curl_exec($ch);
             $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
             $is_html = $content_type && strpos($content_type, "text/html") !== FALSE;
             $tmp = $is_html ? fetch_file_contents($target) : false;
             if ($tmp) {
                 $r = new Readability($tmp, $target);
                 if ($r->init()) {
                     // Rewrite link/image targets relative to the linked page
                     $tmpxpath = new DOMXPath($r->dom);
                     foreach ($tmpxpath->query('(//a[@href]|//img[@src])') as $entry) {
                         foreach (array("href", "src") as $attr) {
                             if ($entry->hasAttribute($attr)) {
                                 $entry->setAttribute($attr, rewrite_relative_url($target, $entry->getAttribute($attr)));
                             }
                         }
                     }
                     // The inliner is not re-run on the extracted content: it
                     // is not fit for arbitrary web pages (breaks wikipedia
                     // pages, etc).
                     $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                 }
             }
         }
     }
     // When something was inlined, serialise the modified DOM back into the article
     $node = $doc->getElementsByTagName('body')->item(0);
     if ($node && $found) {
         $article["content"] = $doc->saveXML($node);
     }
     return $article;
 }
Example #20
0
 /**
  * Fetches a URL, runs the page through Readability and hands back the stripped content.
  *
  * After extraction, every <img> whose src is not already absolute is rebuilt
  * against the page's host (falling back to the page URL itself). Images for
  * which neither rebuilt address can be requested are removed from the markup.
  *
  * @since 1.7
  * @see http://www.keyvan.net/2010/08/php-readability/
  * @param string $url Address of the page to extract.
  * @return string|false The extracted markup after pf_htmlchecker::closetags();
  *                      the literal string 'error-secured' when the HTTP request
  *                      yields a WP_Error; false when the response body is empty,
  *                      Readability fails, or the result is shorter than 120 bytes.
  */
 public static function readability_object($url)
 {
     set_time_limit(0);
     $url = pf_de_https($url);
     $url = str_replace('&amp;', '&', $url);
     // change from Boone - use wp_remote_get() instead of file_get_contents()
     $request = wp_remote_get($url, array('timeout' => '30'));
     if (is_wp_error($request)) {
         return 'error-secured';
     }
     if (empty($request['body'])) {
         return false;
     }
     $html = $request['body'];
     // Check if tidy exists to clean up the input.
     if (function_exists('tidy_parse_string')) {
         $tidy = tidy_parse_string($html, array(), 'UTF8');
         $tidy->cleanRepair();
         $html = $tidy->value;
     }
     // give it to Readability
     $readability = new Readability($html, $url);
     // print debug output?
     // useful to compare against Arc90's original JS version -
     // simply click the bookmarklet with FireBug's
     // console window open
     $readability->debug = false;
     // convert links to footnotes?
     $readability->convertLinksToFootnotes = false;
     // process it
     $result = $readability->init();
     if ($result) {
         $content = $readability->getContent()->innerHTML;
         // If we've got tidy, let's use it.
         if (function_exists('tidy_parse_string')) {
             $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
             $tidy->cleanRepair();
             $content = $tidy->value;
         }
         $content = balanceTags($content, true);
         $content = ent2ncr($content);
         $content = convert_chars($content);
         $domRotated = 0;
         $dom = new domDocument('1.0', 'utf-8');
         $dom->preserveWhiteSpace = true;
         // NOTE(review): entity substitution and external resolution are switched
         // on while parsing markup that came from a remote page - confirm this
         // is acceptable for untrusted input.
         $dom->substituteEntities = true;
         $dom->resolveExternals = true;
         $dom->loadXML('<fullContent>' . $content . '</fullContent>');
         // The page URL does not change per image, so split it once.
         $urlArray = parse_url($url);
         $host = is_array($urlArray) && isset($urlArray['host']) ? $urlArray['host'] : '';
         // Iterate over a snapshot: getElementsByTagName() returns a live list, so
         // removing an <img> while walking it directly would skip the following one.
         foreach (iterator_to_array($dom->getElementsByTagName('img')) as $image) {
             $img = $image->getAttribute('src');
             // Nothing to rebuild when the src is empty, carries its own scheme
             // (http:, https:, data:, ...) or is protocol-relative (//host/...).
             // The previous test, strpos($img, 'http') != 0, was false whenever
             // strpos() returned false, so plain relative paths were never fixed.
             if ($img === '' || preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $img)) {
                 continue;
             }
             // Root-relative paths bring their own leading slash; others need one.
             $urlBase = 'http://' . $host . (strpos($img, '/') === 0 ? '' : '/');
             if (!is_wp_error(wp_remote_head($urlBase . $img))) {
                 $image->setAttribute('src', $urlBase . $img);
             } elseif (!is_wp_error(wp_remote_head($url . $img))) {
                 $image->setAttribute('src', $url . $img);
             } else {
                 $image->parentNode->removeChild($image);
             }
             $domRotated++;
         }
         if ($domRotated > 0) {
             // Only re-serialise when something changed, then strip the XML
             // declaration that saveXML() puts in front of the markup.
             $content = $dom->saveXML();
             $rel = '(<\\?xml version="1\\.0" encoding="utf-8"\\?>)';
             $content = preg_replace("/" . $rel . "/is", ' ', $content);
             $rel = '(<\\?xml version="1\\.0"\\?>)';
             $content = preg_replace("/" . $rel . "/is", ' ', $content);
         }
         // Anything shorter than 120 bytes is treated as a failed extraction.
         if (120 > strlen($content)) {
             $content = false;
         }
     } else {
         # If Readability can't get the content, send back a FALSE to loop with.
         $content = false;
     }
     if ($content != false) {
         $contentObj = new pf_htmlchecker($content);
         $content = $contentObj->closetags($content);
     }
     return $content;
 }
Example #21
0
 /**
  * Builds a Readability instance around the given markup and returns its content.
  *
  * @param mixed $html Value passed unchanged to the Readability constructor.
  * @return mixed Whatever Readability::getContent() returns.
  */
 public function readability($html)
 {
     return (new Readability($html))->getContent();
 }
Example #22
0
 /**
  * Replaces an article's content with the Readability extraction of its linked page.
  *
  * The article is returned untouched when its feed is not in the stored
  * "enabled_feeds" list, when a cURL HEAD-style probe reports a content type
  * other than text/html, when the page cannot be fetched or parsed, or when
  * Readability fails to initialise.
  *
  * @param array $article Article record; reads $article["feed"]["id"] and
  *                       $article["link"], and may overwrite $article["content"].
  * @return array The (possibly modified) article.
  */
 function hook_article_filter($article)
 {
     $enabled_feeds = $this->host->get($this, "enabled_feeds");
     // A missing/non-array setting means no feed is enabled. Without this guard
     // array_search() would not return FALSE for a non-array haystack, so the
     // filter would either run for every feed or raise a TypeError.
     if (!is_array($enabled_feeds)) {
         return $article;
     }
     $key = array_search($article["feed"]["id"], $enabled_feeds);
     if ($key === FALSE) {
         return $article;
     }
     if (!class_exists("Readability")) {
         require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
     }
     if (function_exists("curl_init")) {
         // Header-only request: used solely to learn the content type before
         // downloading the whole page.
         $ch = curl_init($article["link"]);
         curl_setopt($ch, CURLOPT_TIMEOUT, 5);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, true);
         curl_setopt($ch, CURLOPT_NOBODY, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
         curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
         @curl_exec($ch);
         $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
         // Cast first: curl_getinfo() may not return a string when the request failed.
         if (strpos((string) $content_type, "text/html") === FALSE) {
             return $article;
         }
     }
     $tmp = fetch_file_contents($article["link"]);
     if ($tmp) {
         $tmpdoc = new DOMDocument("1.0", "UTF-8");
         if (!$tmpdoc->loadHTML($tmp)) {
             return $article;
         }
         if ($tmpdoc->encoding != 'UTF-8') {
             // Strip every <meta> element and re-serialise the document before
             // handing it to Readability.
             $tmpxpath = new DOMXPath($tmpdoc);
             foreach ($tmpxpath->query("//meta") as $elem) {
                 $elem->parentNode->removeChild($elem);
             }
             $tmp = $tmpdoc->saveHTML();
         }
         $r = new Readability($tmp, $article["link"]);
         if ($r->init()) {
             // Rewrite link and image addresses relative to the article URL.
             $tmpxpath = new DOMXPath($r->dom);
             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
             foreach ($entries as $entry) {
                 if ($entry->hasAttribute("href")) {
                     $entry->setAttribute("href", rewrite_relative_url($article["link"], $entry->getAttribute("href")));
                 }
                 if ($entry->hasAttribute("src")) {
                     $entry->setAttribute("src", rewrite_relative_url($article["link"], $entry->getAttribute("src")));
                 }
             }
             $article["content"] = $r->articleContent->innerHTML;
         }
     }
     return $article;
 }
         // check site config for single page URL - fetch it if found
         if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
             $html = $single_page_response['body'];
             // remove strange things
             $html = str_replace('</[>', '', $html);
             $html = convert_to_utf8($html, $single_page_response['headers']);
             $effective_url = $single_page_response['effective_url'];
             unset($single_page_response);
         }
         $extract_result = $extractor->process($html, $effective_url);
         $readability = $extractor->readability;
         $content_block = $extract_result ? $extractor->getContent() : null;
         //echo "content_block: [" . $content_block->innerHTML . "] \n\n";
         $title = $extract_result ? $extractor->getTitle() : '';
     } else {
         $readability = new Readability($html, $effective_url);
         // content block is entire document (for now...)
         $content_block = $readability->dom;
         //echo $content_block->innerHTML;
         //TODO: get title
         $title = '';
     }
     //echo "[" . $content_block . "]" ;
 }
 // use extracted title for both feed and item title if we're using single-item dummy feed
 if ($isDummyFeed) {
     $output->setTitle($title);
     $newitem->setTitle($title);
 }
 if ($do_content_extraction) {
     if ($extract_pattern && isset($content_block)) {
Example #24
0
    echo htmlspecialchars($pageTitle);
    ?>
" id="pageTitle" />
                <input type="hidden" name="key" value="<?php 
    echo htmlspecialchars($_SESSION['secureKey']);
    ?>
" id="key" />
            </form>
            <?php 
} else {
    if ($page == "complete") {
        ?>
            <div id="complete">
                <p>
                    A link to this page has been sent to <?php 
        echo Readability::kindleAsLinks($to);
        ?>
                </p>
                <p>
                    Thanks for using Readability.
                </p>
                <div>
                    <img src="http://localhost/readability/images/footer-thanks.png" alt="Readability" />
                </div>
            </div>
            <?php 
    }
}
?>
        </div>
    </body>
Example #25
0
 /**
  * Imports the items of a feed as blog entries.
  *
  * Downloads and parses the feed, skips items already recorded in the
  * FeedHistory table, optionally replaces each item's text with the
  * Readability extraction of the linked page, and stores one blog row per item.
  *
  * @param object $feedObj  Feed record; reads url, params, id, author and the
  *                         item_* settings (category, published, frontpage,
  *                         creator, get_fulltext, content).
  * @param int    $maxItems Maximum number of items to store; when falsy the
  *                         feed's 'feedamount' parameter is used, and an empty
  *                         value there means no limit.
  * @return int Number of blog entries that were stored.
  */
 function import($feedObj, $maxItems = 0)
 {
     jimport('simplepie.simplepie');
     $config = EasyBlogHelper::getConfig();
     $itemMigrated = 0;
     $isDomSupported = false;
     $defaultAllowedHTML = '<img>,<a>,<br>,<table>,<tbody>,<th>,<tr>,<td>,<div>,<span>,<p>,<h1>,<h2>,<h3>,<h4>,<h5>,<h6>';
     // Readability is only loaded (and full-text fetching only attempted) when
     // the DomDocument class is available.
     if (class_exists('DomDocument')) {
         $isDomSupported = true;
         require_once EBLOG_CLASSES . DIRECTORY_SEPARATOR . 'readability' . DIRECTORY_SEPARATOR . 'Readability.php';
     }
     $params = EasyBlogHelper::getRegistry($feedObj->params);
     $maxItems = $maxItems ? $maxItems : $params->get('feedamount', 0);
     $feedURL = $feedObj->url;
     require_once EBLOG_HELPERS . DIRECTORY_SEPARATOR . 'connectors.php';
     $connector = new EasyBlogConnectorsHelper();
     $connector->addUrl($feedURL);
     $connector->execute();
     $content = $connector->getResult($feedURL);
     // Drop anything that precedes the XML declaration so the document starts with it.
     //$pattern	= '/(.*?)(?=<\?xml)/ims';
     $pattern = '/(.*?)<\\?xml version/is';
     $replacement = '<?xml version';
     $content = preg_replace($pattern, $replacement, $content, 1);
     if (strpos($content, '<?xml version') === false) {
         // The content has no XML declaration at all; prepend one manually.
         $content = '<?xml version="1.0" encoding="utf-8"?>' . $content;
     }
     $parser = new SimplePie();
     $parser->strip_htmltags(false);
     $parser->set_raw_data($content);
     $parser->init();
     $items = '';
     $items = $parser->get_items();
     if (count($items) > 0) {
         // Process and insert each item.
         // $myCnt counts stored items and is compared against $maxItems.
         $myCnt = 0;
         foreach ($items as $item) {
             @ini_set('max_execution_time', 180);
             if (!empty($maxItems) && $myCnt == $maxItems) {
                 break;
             }
             // Subtract the item's 'Z' offset (seconds) from its 'U' timestamp
             // before formatting the creation date.
             $timezoneSec = $item->get_date('Z');
             $itemdate = $item->get_date('U');
             $itemdate = $itemdate - $timezoneSec;
             $mydate = date('Y-m-d H:i:s', $itemdate);
             $feedUid = $item->get_id();
             $feedPath = $item->get_link();
             $feedHistory = EasyBlogHelper::getTable('FeedHistory');
             $newHistoryId = '';
             if ($feedHistory->isExists($feedObj->id, $feedUid)) {
                 continue;
             } else {
                 // Log the feed item so that it is not processed again in future.
                 // Note the history row is written before the blog entry is stored.
                 $date = EasyBlogHelper::getDate();
                 $newHistory = EasyBlogHelper::getTable('FeedHistory');
                 $newHistory->feed_id = $feedObj->id;
                 $newHistory->uid = $feedUid;
                 $newHistory->created = $date->toMySQL();
                 $newHistory->store();
                 $newHistoryId = $newHistory->id;
             }
             $blogObj = new stdClass();
             // Set the defaults from the feed configuration made in the backend.
             $blogObj->category_id = $feedObj->item_category;
             $blogObj->published = $feedObj->item_published;
             $blogObj->frontpage = $feedObj->item_frontpage;
             $blogObj->created_by = $feedObj->item_creator;
             $blogObj->allowcomment = $config->get('main_comment', 1);
             $blogObj->subscription = $config->get('main_subscription', 1);
             $blogObj->issitewide = '1';
             // Default text is the feed item's own content; it may be replaced
             // below by the full article.
             $text = $item->get_content();
             // @rule: Append copyright text
             $blogObj->copyrights = $params->get('copyrights', '');
             if ($feedObj->item_get_fulltext && $isDomSupported) {
                 // Fetch the linked page and try to extract the full article from it.
                 $feedItemUrl = urldecode($item->get_link());
                 $fiConnector = new EasyBlogConnectorsHelper();
                 $fiConnector->addUrl($feedItemUrl);
                 $fiConnector->execute();
                 $fiContent = $fiConnector->getResult($feedItemUrl);
                 // Drop anything that precedes the opening <html tag.
                 $pattern = '/(.*?)<html/is';
                 $replacement = '<html';
                 $fiContent = preg_replace($pattern, $replacement, $fiContent, 1);
                 if (!empty($fiContent)) {
                     $fiContent = EasyBlogHelper::getHelper('string')->forceUTF8($fiContent);
                     $readability = new Readability($fiContent);
                     $readability->debug = false;
                     $readability->convertLinksToFootnotes = false;
                     $result = $readability->init();
                     if ($result) {
                         $content = $readability->getContent()->innerHTML;
                         //$content	= EasyBlogHelper::getHelper( 'string' )->fixUTF8( $content );
                         $content = EasyBlogFeedsHelper::tidyContent($content);
                         // Use the extraction only when it does not contain a
                         // '<!DOCTYPE html' marker after entity decoding;
                         // otherwise keep the feed's own text.
                         if (stristr(html_entity_decode($content), '<!DOCTYPE html') === false) {
                             $text = $content;
                             $text = $this->_processRelLinktoAbs($text, $feedPath);
                         }
                     }
                 }
             }
             // Strip HTML tags that are not in the allowed list.
             $text = strip_tags($text, $params->get('allowed', $defaultAllowedHTML));
             // Append original source link into article if necessary
             if ($params->get('sourceLinks')) {
                 JFactory::getLanguage()->load('com_easyblog', JPATH_ROOT);
                 $text .= '<div><a href="' . $item->get_link() . '" target="_blank">' . JText::_('COM_EASYBLOG_FEEDS_ORIGINAL_LINK') . '</a></div>';
             }
             if ($feedObj->author) {
                 $feedAuthor = $item->get_author();
                 if (!empty($feedAuthor)) {
                     $authorName = $feedAuthor->get_name();
                     $authorEmail = $feedAuthor->get_email();
                     if (!empty($authorName)) {
                         // Append the original author line to the text.
                         $text .= '<div>' . JText::sprintf('COM_EASYBLOG_FEEDS_ORIGINAL_AUTHOR', $authorName) . '</div>';
                     } else {
                         if (!empty($authorEmail)) {
                             // No name given: take the second space-separated
                             // token of the e-mail field, minus parentheses, as
                             // the name (presumably an "address (Name)" form -
                             // TODO confirm).
                             $authorArr = explode(' ', $authorEmail);
                             if (isset($authorArr[1])) {
                                 $authorName = $authorArr[1];
                                 $authorName = str_replace(array('(', ')'), '', $authorName);
                                 $text .= '<div>' . JText::sprintf('COM_EASYBLOG_FEEDS_ORIGINAL_AUTHOR', $authorName) . '</div>';
                             }
                         }
                     }
                 }
             }
             // The feed setting decides whether the text becomes the intro or the content.
             if ($feedObj->item_content == 'intro') {
                 $blogObj->intro = $text;
             } else {
                 $blogObj->content = $text;
             }
             $creationDate = $mydate;
             $blogObj->created = $mydate;
             $blogObj->modified = $mydate;
             $blogObj->title = $item->get_title();
             if (empty($blogObj->title)) {
                 $blogObj->title = $this->_getTitleFromLink($item->get_link());
             }
             $blogObj->title = EasyBlogStringHelper::unhtmlentities($blogObj->title);
             $blogObj->permalink = EasyBlogHelper::getPermalink($blogObj->title);
             $blogObj->publish_up = $mydate;
             $blogObj->isnew = !$feedObj->item_published ? true : false;
             $blog = EasyBlogHelper::getTable('blog');
             $blog->bind($blogObj);
             // NOTE(review): notify() runs before store() - confirm that order is intended.
             if ($feedObj->item_published) {
                 $blog->notify();
             }
             if ($blog->store()) {
                 $myCnt++;
                 // Update the history row with the id of the new blog entry.
                 if (!empty($newHistoryId)) {
                     $tmpHistory = EasyBlogHelper::getTable('FeedHistory');
                     $tmpHistory->load($newHistoryId);
                     $tmpHistory->post_id = $blog->id;
                     $tmpHistory->store();
                 }
                 $itemMigrated++;
                 if ($feedObj->item_published) {
                     // Insert the activity entry here.
                     EasyBlogHelper::addJomSocialActivityBlog($blog, true, true);
                     // Determines if admin wants to auto post this item to the social sites.
                     if ($params->get('autopost')) {
                         $allowed = array(EBLOG_OAUTH_LINKEDIN, EBLOG_OAUTH_FACEBOOK, EBLOG_OAUTH_TWITTER);
                         // @rule: Process centralized options first
                         // See if there are any global postings enabled.
                         $blog->autopost($allowed, $allowed);
                     }
                 }
             }
             //end if
         }
     }
     return $itemMigrated;
 }
Example #26
0
<?php

// Demo endpoint: fetches the page named by the "src" query parameter, runs it
// through Readability and prints the extracted title and content.
require 'php-readability/lib/Readability.inc.php';

// "src" is caller-controlled. Passing it straight to file_get_contents() would
// also read local files and other stream wrappers, so accept web URLs only.
// NOTE(review): this still lets a caller make the server request arbitrary
// http(s) hosts - restrict further if the endpoint is publicly reachable.
$src = isset($_GET['src']) && is_string($_GET['src']) ? $_GET['src'] : '';
$scheme = strtolower((string) parse_url($src, PHP_URL_SCHEME));
if ($scheme !== 'http' && $scheme !== 'https') {
    http_response_code(400);
    exit('The "src" parameter must be an http(s) URL.');
}
$html = file_get_contents($src);
if ($html === false) {
    http_response_code(502);
    exit('Unable to fetch the requested page.');
}
$r = new Readability($html);
$rData = $r->getContent();
// Escape the title for HTML output; existing entities are left as they are
// (double_encode = false) in case the library already encoded them.
echo "<h1>" . htmlspecialchars($rData['title'], ENT_QUOTES, 'UTF-8', false) . "</h1>";
// NOTE(review): the content is markup extracted from a remote page and is
// emitted unsanitised - confirm that is acceptable here.
echo $rData['content'];
Example #27
0
 /**
  * Pulls a user's Twitter home timeline and stores every tweet that contains a link.
  *
  * Requests up to 50 tweets (only those newer than the user's saved last_tweet
  * when one is recorded), records the newest tweet id as last_tweet, and pushes
  * each link-bearing tweet onto the user's "links" array, oldest first. Once
  * the running count exceeds 50, one entry is popped from the front per push.
  * If Twitter reports an OAuth authentication failure the user document is removed.
  *
  * @param string $_id Identifier of the user document (passed to MongoId).
  * @return void
  */
 public function parse_tweets($_id)
 {
     $debug = $this->input->get('debug');
     $mongo_id = new MongoId($_id);
     $user = $this->db->users->findOne(array('_id' => $mongo_id));
     // Nothing to do when no such user document exists.
     if (!$user) {
         return;
     }
     if ($debug) {
         echo 'Updating ' . $user['name'] . "\n";
         var_dump($user);
     }
     $this->load->library('twitter');
     $auth = $this->twitter->oauth('', '', $user['oauth_token'], $user['oauth_token_secret']);
     // Only pass since_id when a usable numeric last_tweet has been recorded.
     if (!array_key_exists('last_tweet', $user) || !isset($user['last_tweet']) || !strlen($user['last_tweet']) || $user['last_tweet'] == '0' || !is_numeric($user['last_tweet'])) {
         $criteria = array('count' => 50);
     } else {
         $criteria = array('count' => 50, 'since_id' => $user['last_tweet']);
     }
     $data = $this->twitter->call('statuses/home_timeline', $criteria);
     if ($debug) {
         var_dump($data);
     }
     if (array_key_exists('links', $user)) {
         $current_link_count = count($user['links']);
     } else {
         $current_link_count = 0;
     }
     if (is_array($data)) {
         if (array_key_exists('error', $data)) {
             if ($data['error'] == 'Could not authenticate with OAuth.') {
                 $this->db->users->remove(array('_id' => $mongo_id), array('justOne' => true));
                 if ($debug) {
                     echo 'Deleting user' . "\n";
                 }
             }
             if ($debug) {
                 echo "ERROR: " . $data['error'] . "\n";
             }
             return;
         }
         if (count($data) >= 1) {
             $this->db->users->update(array('_id' => $mongo_id), array('$set' => array('last_tweet' => (string) $data[0]['id'])));
         }
         // Walk from the last element down to and including index 0. The loop
         // used to stop at index 1, so the tweet whose id was just saved as
         // last_tweet was never stored and could not be fetched again.
         for ($x = count($data) - 1; $x >= 0; $x--) {
             $tweet = $data[$x];
             if (preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $tweet['text'], $matches)) {
                 $doc = $tweet;
                 $doc['link'] = $matches[0];
                 $doc['owner_id'] = $user['id'];
                 $doc['tweet'] = $tweet['text'];
                 $doc['created_at'] = $tweet['created_at'];
                 $doc['tweet_id'] = (string) $tweet['id'];
                 $doc['from_user'] = $tweet['user'];
                 // Full-article extraction is switched off by the leading "false &&".
                 if (false && isset($user['full_article']) && $user['full_article']) {
                     require_once 'lib/Readability.php';
                     $url = $matches[0];
                     $html = @file_get_contents($url);
                     if ($html) {
                         $readability = new Readability($html, $url);
                         $result = $readability->init();
                         if ($result) {
                             $doc['article_title'] = trim($readability->getTitle()->textContent);
                             $doc['article_body'] = trim($readability->getContent()->innerHTML);
                         }
                     }
                 }
                 $current_link_count++;
                 $this->db->users->update(array('_id' => $user['_id']), array('$push' => array('links' => $doc)));
                 // Keep the list capped: pop from the front once it holds more than 50.
                 if ($current_link_count > 50) {
                     $this->db->users->update(array('_id' => $user['_id']), array('$pop' => array('links' => -1)));
                 }
             }
         }
     }
     $data = null;
     $user = null;
 }
Example #28
0
File: index.php Project: arh922/ain
// (change this URL to whatever you'd like to test)
//$url = 'http://alkhaleejonline.net/articles/1434390719340805900';
$html = file_get_contents($url);
// PHP Readability works with UTF-8 encoded content.
// If $html is not UTF-8 encoded, use iconv() or
// mb_convert_encoding() to convert to UTF-8.
// If we've got Tidy, let's clean up input.
// This step is highly recommended - PHP's default HTML parser
// often does a terrible job and results in strange output.
if (function_exists('tidy_parse_string')) {
    $tidy = tidy_parse_string($html, array(), 'UTF8');
    $tidy->cleanRepair();
    $html = $tidy->value;
}
// give it to Readability
$readability = new Readability($html, $url);
// print debug output?
// useful to compare against Arc90's original JS version -
// simply click the bookmarklet with FireBug's
// console window open
$readability->debug = false;
// convert links to footnotes?
$readability->convertLinksToFootnotes = true;
// process it
$result = $readability->init();
// does it look like we found what we wanted?
if ($result) {
    //echo "== Title ===============================\n";
    echo $readability->getTitle()->textContent, "\n\n";
    exit;
    //  echo "== Body ===============================\n";
Example #29
0
 /**
  * Forwards the first element of $args to the parent constructor.
  *
  * @param mixed $args Indexable value; only index 0 is read.
  */
 public function __construct($args)
 {
     $firstArgument = $args[0];
     parent::__construct($firstArgument);
 }
Example #30
0
    echo $pageTitle;
    ?>
" id="pageTitle" />
                <input type="hidden" name="key" value="<?php 
    echo $_SESSION['secureKey'];
    ?>
" id="key" />
            </form>
            <?php 
} else {
    if ($page == "complete") {
        ?>
            <div id="complete">
                <p>
                    A link to this page has been sent to <?php 
        echo Readability::emailAsLinks($to);
        ?>
                </p>
                <p>
                    Thanks for using Readability.
                </p>
                <div>
                    <img src="http://lab.arc90.com/experiments/readability/images/footer-thanks.png" alt="Readability" />
                </div>
            </div>
            <?php 
    }
}
?>
        </div>
    </body>