Example #1
0
 /**
  * Fetch the page at $url, extract its article with Readability and cache it.
  *
  * Sets $this->baseUrl, $this->title and $this->content, then passes the
  * serialized title/content pair to writeCache().
  *
  * @param string $url Address of the page to read.
  */
 private function readFromServer($url)
 {
     // remember whether the tidy extension can be used
     if (function_exists('tidy_parse_string')) {
         $this->tidyAvailable = true;
     }

     $parts = parse_url($url);
     $this->baseUrl = $parts['scheme'] . "://" . $parts['host'];

     $page = $this->tidyClean($this->fetchContent($url));

     $reader = new Readability($page, $url);
     $reader->init();
     $this->title = $reader->getTitle()->textContent;

     // The extracted markup gets a second tidy pass (otherwise DOMDocument
     // will not work properly) before relative URLs are fixed.
     $extracted = $this->tidyClean($reader->getContent()->innerHTML);
     $this->content = $this->fixRelativeUrls($extracted);

     $this->writeCache(serialize(array(
         'title' => $this->title,
         'content' => $this->content,
     )));
 }
Example #2
0
 /**
  * Build the object from a web page: request it, extract the article with
  * Readability and keep the resulting HTML plus the result of handleImages().
  *
  * Metadata entries "title" and "author" are only filled in when they are
  * not already set. The script terminates (die) when DOMDocument cannot load
  * the extracted article.
  *
  * @param string $url Page address; "http://" is prepended when it does not
  *                    start with http:// or https://.
  */
 public function __construct($url)
 {
     $hasScheme = preg_match('!^https?://!i', $url);
     if (!$hasScheme) {
         $url = 'http://' . $url;
     }

     $raw = Http::Request($url);
     $html = mb_convert_encoding($raw, "UTF-8", "UTF-8,ISO-8859-1,ASCII");

     $reader = new Readability($html, $url);
     $reader->init();

     if (!isset($this->metadata["title"])) {
         $rawTitle = $reader->getTitle()->innerHTML;
         $this->metadata["title"] = CharacterEntities::convert(strip_tags($rawTitle));
     }
     if (!isset($this->metadata["author"])) {
         // fall back to the host name of the page as author
         $urlParts = parse_url($url);
         $this->metadata["author"] = $urlParts["host"];
     }

     // wrap the extracted fragment into a complete UTF-8 declared document
     $body = $reader->getContent()->innerHTML;
     $head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
     if (substr($body, 0, 5) == "<body") {
         $page = $head . $body . "</html>";
     } else {
         $page = $head . "<body>" . $body . "</body></html>";
     }

     $doc = new DOMDocument();
     if (!@$doc->loadHTML($page)) {
         die($page);
     }
     $doc->normalizeDocument();

     $this->images = $this->handleImages($doc, $url);
     $this->text = $doc->saveHTML();
 }
Example #3
0
 /**
  * Request a remote page and extract its main article using PHP Readability.
  *
  * The Readability library is loaded from the file named by the
  * 'page_content_fetch_readability' preference when that file exists.
  *
  * @param string $url Page to request; replaced by the client's URI after
  *                    the request (i.e. after redirections).
  * @return array|false|null array('title' => ..., 'content' => ...) on success,
  *                          false when the Readability class is not available,
  *                          null when Readability::init() reports failure.
  */
 function grabContent($url)
 {
     global $prefs;

     $tikilib = TikiLib::lib('tiki');
     $client = $tikilib->get_http_client($url);
     $response = $tikilib->http_perform_request($client);

     // Obtain the URL after redirections
     $url = (string) $client->getUri();

     // Note: PHP Readability expects UTF-8 encoded content; convert first
     // (iconv() or mb_convert_encoding()) if the source is not UTF-8.
     // Cleaning the input with tidy() is highly recommended because PHP's
     // default HTML parser often produces strange output otherwise.
     $html = $this->tidy($response->getBody());

     $libraryFile = $prefs['page_content_fetch_readability'];
     if (is_file($libraryFile)) {
         require_once $libraryFile;
     }
     if (!class_exists('Readability')) {
         return false;
     }

     $readability = new Readability($html, $url);
     if (!$readability->init()) {
         return;
     }

     $cleaned = $this->tidy($readability->getContent()->innerHTML);
     $content = $this->replacePaths($cleaned, $url);

     return array(
         'title' => $readability->getTitle()->textContent,
         'content' => $content,
     );
 }
 /**
  * Extract the readable article from an HTML document.
  *
  * The page URL is taken from the response's "Location" header when present,
  * otherwise from the "readerUrl" option. $this->basePath and $this->baseUrl
  * are set from that URL as a side effect.
  *
  * @param string $html HTML of the page.
  * @return array|null array('title' => ..., 'content' => ...), or null when
  *                    $html is empty or Readability::init() reports failure.
  */
 public function parseData($html)
 {
     if (strlen($html) == 0) {
         return null;
     }
     // check tidy function available
     if (function_exists('tidy_parse_string')) {
         $this->tidyAvailable = true;
     }
     $headers = $this->response->getHeaders();
     if (isset($headers['Location'])) {
         $url = $headers["Location"];
     } else {
         $url = $this->getOption("readerUrl");
     }
     $urlArray = parse_url($url);
     if (isset($urlArray['path'])) {
         $this->basePath = dirname($urlArray['path']);
     } else {
         $this->basePath = "";
     }
     $this->baseUrl = $urlArray['scheme'] . "://" . $urlArray['host'];
     $html = $this->tidyClean($html);
     $readability = new Readability($html, $url);
     if (!$readability->init()) {
         return null;
     }
     $title = $readability->getTitle()->textContent;
     $content = $readability->getContent()->innerHTML;
     /**
      * still need one more tidy clean, otherwise domdocument will not work properly
      */
     $content = $this->tidyClean($content);
     $content = $this->removeEmptyTags($content);
     /**
      * use domdocument to fix relative urls
      */
     $content = $this->fixRelativeUrls($content);
     /**
      * if there is tidy support, then detect target source code from meta content
      */
     if ($this->tidyAvailable) {
         $tidy = tidy_parse_string($content, array(), 'utf8');
         // tidy_parse_string() returns false on failure and head() may return
         // null; skip the charset detection instead of failing on either.
         $head = $tidy ? $tidy->head() : null;
         if ($head !== null) {
             $charset = $this->findCharset($head->value);
             // compare case-insensitively so "UTF-8" is not needlessly re-encoded
             if (!empty($charset) && strcasecmp($charset, "utf-8") != 0) {
                 $content = mb_convert_encoding($content, "utf-8", $charset);
             }
         }
     }
     return array('title' => $title, 'content' => $content);
 }
Example #5
0
 /**
  * Parse a news page into title, content, search text, thumbnail and date.
  *
  * Content comes from tryContentDetect() when it yields something, otherwise
  * from Readability. Exceptions raised while parsing are caught and their
  * message is reported under the 'error' key of the returned array.
  *
  * @param string       $html   HTML of the page.
  * @param string       $url    Address of the page, handed to Readability.
  * @param Sources|null $source Optional source; stored in $this->source when given.
  * @return array Keys 'title', 'content', 'searchContent', 'thumb' and 'date'
  *               when text was found; possibly empty; 'error' on failure.
  */
 public function parse($html, $url, Sources $source = null)
 {
     if ($source) {
         $this->source = $source;
     }
     try {
         $parsedNews = array();
         $html = $this->stripTagWithContent($html, "script");
         $detected = $this->tryContentDetect($this->processExcludeElements($html));

         $reader = new \Readability($html, $url);
         $reader->debug = false;
         $reader->convertLinksToFootnotes = false;
         $found = $reader->init();

         // neither Readability nor the custom detection produced anything
         if (!$found && !$detected) {
             throw new Exception('Looks like we couldn\'t find the content. :(');
         }

         $title = $this->processTitleStopWords($reader->getTitle()->textContent);

         // prefer the custom detection result, fall back to Readability
         $content = $detected ? $detected : $reader->getContent()->innerHTML;
         $content = $this->processContentStopWords($content);
         $content = preg_replace('/\\n/', ' ', $content);
         $content = strip_tags($content, "<p><div><img><span><br><ul><li><embed><iframe>");
         $content = $this->fixUrls($content);
         $content = $this->processExcludeElements($content);
         $date = $this->processPublishDate($html);

         $plainText = trim(strip_tags($content));
         if ($plainText) {
             // reduce the text to letters and single spaces for searching
             $plainText = preg_replace('/\\n/', ' ', $plainText);
             $plainText = preg_replace("/[^а-яa-z ]/ui", "", $plainText);
             $plainText = preg_replace('/\\s+/', ' ', $plainText);
             $plainText = mb_convert_encoding($plainText, 'HTML-ENTITIES', "UTF-8");

             $parsedNews['title'] = $title;
             $parsedNews['content'] = $content;
             $parsedNews['searchContent'] = $plainText;
             $parsedNews['thumb'] = $this->detectThumb($html, $content);
             $parsedNews['date'] = $date;
         }
     } catch (Exception $e) {
         $parsedNews['error'] = $e->getMessage();
     }
     return $parsedNews;
 }
// Load the page to analyse.
// Note: PHP Readability expects UTF-8 encoded content. If the page is not
// UTF-8 encoded, convert it first (iconv() or mb_convert_encoding()).
$html = file_get_contents($url);

$readability = new Readability($html, $url);
// Debug output is useful to compare against Arc90's original JS version
// (click the bookmarklet with FireBug's console window open); off here.
$readability->debug = false;
// Links are converted to footnotes.
$readability->convertLinksToFootnotes = true;

// Run the extraction and report what was found.
$result = $readability->init();
if (!$result) {
    echo 'Looks like we couldn\'t find the content. :(';
} else {
    echo "== Title =====================================\n";
    echo $readability->getTitle()->textContent;
    echo "\n\n";
    echo "== Body ======================================\n";
    $content = $readability->getContent()->innerHTML;
    // When Tidy is available, clean the markup up before printing it.
    if (function_exists('tidy_parse_string')) {
        $tidyOptions = array('indent' => true, 'show-body-only' => true);
        $tidy = tidy_parse_string($content, $tidyOptions, 'UTF8');
        $tidy->cleanRepair();
        $content = $tidy->value;
    }
    echo $content;
}
         $content_block = $elems->item(0);
         // clean it up
         $readability->removeScripts($content_block);
         $readability->prepArticle($content_block);
     } else {
         if ($exclude_on_fail) {
             die('Sorry, could not extract content');
         }
         $content_block = $readability->dom->createElement('p', 'Sorry, could not extract content');
     }
 }
 $readability->clean($content_block, 'select');
 if ($options->rewrite_relative_urls) {
     makeAbsolute($effective_url, $content_block);
 }
 $title = $readability->getTitle()->textContent;
 if ($extract_pattern) {
     // get outerHTML
     $content = $content_block->ownerDocument->saveXML($content_block);
 } else {
     $content = $content_block->innerHTML;
 }
 if ($links == 'remove') {
     $content = preg_replace('!</?a[^>]*>!', '', $content);
 }
 if (!$valid_key) {
     $content = $options->message_to_prepend . $content;
     $content .= $options->message_to_append;
 } else {
     $content = $options->message_to_prepend_with_key . $content;
     $content .= $options->message_to_append_with_key;
Example #8
0
 /**
  * Fetch new tweets from a user's home timeline and store those with links.
  *
  * Looks the user up by id, requests up to 50 timeline entries (only those
  * after the stored 'last_tweet' when that value is usable), records the id
  * of the first returned entry as the new 'last_tweet' and pushes every tweet
  * whose text contains an http(s) URL onto the user's 'links' list. Once more
  * than 50 links are counted, one entry is popped per push.
  *
  * The user record is removed when Twitter answers
  * 'Could not authenticate with OAuth.'.
  *
  * @param string $_id Id of the user document, passed to MongoId.
  * @return void
  */
 public function parse_tweets($_id)
 {
     $debug = $this->input->get('debug');
     $mongo_id = new MongoId($_id);
     $user = $this->db->users->findOne(array('_id' => $mongo_id));
     if (!is_array($user)) {
         // no such user: nothing to update
         return;
     }
     if ($debug) {
         echo 'Updating ' . $user['name'] . "\n";
         var_dump($user);
     }

     $this->load->library('twitter');
     $this->twitter->oauth('', '', $user['oauth_token'], $user['oauth_token_secret']);

     // only ask for tweets after last_tweet when it is a non-zero numeric id
     $criteria = array('count' => 50);
     if (isset($user['last_tweet']) && is_numeric($user['last_tweet']) && $user['last_tweet'] != '0') {
         $criteria['since_id'] = $user['last_tweet'];
     }
     $data = $this->twitter->call('statuses/home_timeline', $criteria);
     if ($debug) {
         var_dump($data);
     }
     if (!is_array($data)) {
         return;
     }

     if (array_key_exists('error', $data)) {
         if ($data['error'] == 'Could not authenticate with OAuth.') {
             $this->db->users->remove(array('_id' => $mongo_id), array('justOne' => true));
             if ($debug) {
                 echo 'Deleting user' . "\n";
             }
         }
         if ($debug) {
             echo "ERROR: " . $data['error'] . "\n";
         }
         return;
     }

     $current_link_count = array_key_exists('links', $user) ? count($user['links']) : 0;

     if (count($data) >= 1) {
         $this->db->users->update(array('_id' => $mongo_id), array('$set' => array('last_tweet' => (string) $data[0]['id'])));
     }

     // Walk the result from its last entry to its first. Index 0 must be
     // included: its id was just stored as last_tweet, so it is not requested
     // again via since_id and its link would otherwise never be stored.
     for ($x = count($data) - 1; $x >= 0; $x--) {
         $tweet = $data[$x];
         if (!preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $tweet['text'], $matches)) {
             continue;
         }
         $doc = $tweet;
         $doc['link'] = $matches[0];
         $doc['owner_id'] = $user['id'];
         $doc['tweet'] = $tweet['text'];
         $doc['created_at'] = $tweet['created_at'];
         $doc['tweet_id'] = (string) $tweet['id'];
         $doc['from_user'] = $tweet['user'];
         // Full-article extraction is switched off by the leading "false".
         if (false && isset($user['full_article']) && $user['full_article']) {
             require_once 'lib/Readability.php';
             $url = $matches[0];
             $html = @file_get_contents($url);
             if ($html) {
                 $readability = new Readability($html, $url);
                 $result = $readability->init();
                 if ($result) {
                     $doc['article_title'] = trim($readability->getTitle()->textContent);
                     $doc['article_body'] = trim($readability->getContent()->innerHTML);
                 }
             }
         }
         $current_link_count++;
         $this->db->users->update(array('_id' => $user['_id']), array('$push' => array('links' => $doc)));
         if ($current_link_count > 50) {
             // keep the list bounded: pop one entry for each push beyond 50
             $this->db->users->update(array('_id' => $user['_id']), array('$pop' => array('links' => -1)));
         }
     }
 }