/**
 * Download a page, extract its readable article and cache the result.
 *
 * Sets $this->baseUrl (scheme + host of $url), $this->title and
 * $this->content, then hands a serialized array('title', 'content')
 * to writeCache().
 *
 * @param string $url Address of the page to read.
 */
private function readFromServer($url)
{
    // The flag is only ever raised here; it is left untouched when tidy is missing.
    if (function_exists('tidy_parse_string')) {
        $this->tidyAvailable = true;
    }

    // Must be set before the relative URLs are fixed further down.
    $parts = parse_url($url);
    $this->baseUrl = "{$parts['scheme']}://{$parts['host']}";

    // Raw page -> tidy -> Readability.
    $extractor = new Readability($this->tidyClean($this->fetchContent($url)), $url);
    $extractor->init();

    $this->title = $extractor->getTitle()->textContent;

    // The extracted markup gets a second tidy pass, otherwise DOMDocument
    // will not work properly when the relative URLs are rewritten.
    $this->content = $this->fixRelativeUrls(
        $this->tidyClean($extractor->getContent()->innerHTML)
    );

    $this->writeCache(serialize(array(
        'title' => $this->title,
        'content' => $this->content,
    )));
}
/**
 * Build the object from a remote article.
 *
 * Requests the page, converts it to UTF-8, runs Readability on it and
 * fills $this->metadata ("title", "author"), $this->images and $this->text.
 *
 * @param string $url Page address; "http://" is prepended when no
 *                    http/https scheme is present.
 */
public function __construct($url)
{
    $hasScheme = preg_match('!^https?://!i', $url);
    if (!$hasScheme) {
        $url = 'http://' . $url;
    }

    $raw = Http::Request($url);
    $page = mb_convert_encoding($raw, "UTF-8", "UTF-8,ISO-8859-1,ASCII");

    $reader = new Readability($page, $url);
    $reader->init();

    // Metadata that is already set is left alone.
    if (!isset($this->metadata["title"])) {
        $rawTitle = strip_tags($reader->getTitle()->innerHTML);
        $this->metadata["title"] = CharacterEntities::convert($rawTitle);
    }
    if (!isset($this->metadata["author"])) {
        $urlParts = parse_url($url);
        $this->metadata["author"] = $urlParts["host"];
    }

    // Wrap the extracted markup in a full document that declares UTF-8;
    // a <body> element is only added when the markup does not start with one.
    $extracted = $reader->getContent()->innerHTML;
    $startsWithBody = substr($extracted, 0, 5) == "<body";
    if ($startsWithBody) {
        $article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>" . $extracted . "</html>";
    } else {
        $article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head><body>" . $extracted . "</body></html>";
    }

    $doc = new DOMDocument();
    // Parser warnings are suppressed; a failed load prints the markup and stops the script.
    $loaded = @$doc->loadHTML($article);
    if (!$loaded) {
        die($article);
    }
    $doc->normalizeDocument();

    $this->images = $this->handleImages($doc, $url);
    $this->text = $doc->saveHTML();
}
/**
 * Fetch a page and extract its main article with PHP Readability.
 *
 * The page is requested through TikiLib's HTTP client, passed through
 * $this->tidy(), and given to Readability. The Readability class is loaded
 * from the file named by $prefs['page_content_fetch_readability'] when that
 * file exists.
 *
 * Note: PHP Readability expects UTF-8 encoded content. This method does not
 * convert the encoding itself.
 *
 * @param string $url Address of the page to fetch.
 *
 * @return array|false array('title' => ..., 'content' => ...) on success;
 *                     false when the Readability class is unavailable or
 *                     when Readability could not find the content.
 */
function grabContent($url)
{
    global $prefs;

    $tikilib = TikiLib::lib('tiki');
    $client = $tikilib->get_http_client($url);
    $response = $tikilib->http_perform_request($client);

    // Obtain the URL after redirections
    $url = (string) $client->getUri();
    $html = $response->getBody();

    // Clean up the input first - PHP's default HTML parser often doesn't do
    // a great job on raw markup and results in strange output.
    $html = $this->tidy($html);

    // Load Readability from the configured location, if there is one.
    if (is_file($prefs['page_content_fetch_readability'])) {
        require_once $prefs['page_content_fetch_readability'];
    }
    if (!class_exists('Readability')) {
        return false;
    }

    $readability = new Readability($html, $url);
    if (!$readability->init()) {
        // Previously this path fell off the end of the function and returned
        // null; report failure the same way as the missing-class case above.
        return false;
    }

    $content = $this->tidy($readability->getContent()->innerHTML);
    $content = $this->replacePaths($content, $url);

    return array(
        'title' => $readability->getTitle()->textContent,
        'content' => $content,
    );
}
/**
 * Turn a fetched HTML page into an array('title' => ..., 'content' => ...).
 *
 * The page URL is taken from the response's "Location" header when present,
 * otherwise from the "readerUrl" option. Also sets $this->basePath and
 * $this->baseUrl from that URL.
 *
 * @param string $html Markup of the fetched page.
 *
 * @return array|null The article, or null when $html is empty or Readability
 *                    fails to initialise.
 */
public function parseData($html)
{
    if (strlen($html) == 0) {
        return null;
    }

    // The flag is only ever raised here; it is left untouched when tidy is missing.
    if (function_exists('tidy_parse_string')) {
        $this->tidyAvailable = true;
    }

    $responseHeaders = $this->response->getHeaders();
    $url = isset($responseHeaders['Location'])
        ? $responseHeaders["Location"]
        : $this->getOption("readerUrl");

    $parts = parse_url($url);
    $this->basePath = isset($parts['path']) ? dirname($parts['path']) : "";
    $this->baseUrl = $parts['scheme'] . "://" . $parts['host'];

    $readability = new Readability($this->tidyClean($html), $url);
    if (!$readability->init()) {
        return null;
    }

    $title = $readability->getTitle()->textContent;

    // Second tidy pass (otherwise DOMDocument will not work properly),
    // then drop empty tags, then let DOMDocument fix the relative URLs.
    $body = $this->fixRelativeUrls(
        $this->removeEmptyTags(
            $this->tidyClean($readability->getContent()->innerHTML)
        )
    );

    // With tidy support, look for a charset in the document head and
    // re-encode to utf-8 when a different one is found.
    if ($this->tidyAvailable) {
        $headNode = tidy_parse_string($body, array(), 'utf8')->head();
        $declared = $this->findCharset($headNode->value);
        $alreadyUtf8 = empty($declared) || $declared == "utf-8";
        if (!$alreadyUtf8) {
            $body = mb_convert_encoding($body, "utf-8", $declared);
        }
    }

    return array('title' => $title, 'content' => $body);
}
/**
 * Extract a news item from a page.
 *
 * Content comes from tryContentDetect() when it yields something, otherwise
 * from Readability. Any Exception caught on the way is reported through the
 * 'error' key instead of being propagated.
 *
 * @param string       $html   Markup of the page.
 * @param string       $url    Address of the page, passed to Readability.
 * @param Sources|null $source When given, stored in $this->source.
 *
 * @return array Empty when the cleaned content has no text; otherwise the
 *               keys 'title', 'content', 'searchContent', 'thumb', 'date'.
 *               Has an 'error' key when an Exception was caught.
 */
public function parse($html, $url, Sources $source = null)
{
    if ($source) {
        $this->source = $source;
    }

    $parsedNews = array();

    try {
        $html = $this->stripTagWithContent($html, "script");
        $content = $this->tryContentDetect($this->processExcludeElements($html));

        $readability = new \Readability($html, $url);
        $readability->debug = false;
        $readability->convertLinksToFootnotes = false;
        $found = $readability->init();

        // Neither Readability nor the custom detector produced anything.
        if (!$found && !$content) {
            throw new Exception('Looks like we couldn\'t find the content. :(');
        }

        $title = $this->processTitleStopWords($readability->getTitle()->textContent);

        // The custom detector wins; Readability is the fallback.
        if (!$content) {
            $content = $readability->getContent()->innerHTML;
        }

        $content = $this->processContentStopWords($content);
        $content = preg_replace('/\\n/', ' ', $content);
        $content = strip_tags($content, "<p><div><img><span><br><ul><li><embed><iframe>");
        $content = $this->processExcludeElements($this->fixUrls($content));

        $date = $this->processPublishDate($html);

        // Plain-text version of the content, used to build the search text.
        $searchContent = trim(strip_tags($content));
        if ($searchContent) {
            $searchContent = preg_replace('/\\n/', ' ', $searchContent);
            // Keep only Cyrillic/Latin letters and spaces, then collapse whitespace.
            $searchContent = preg_replace("/[^а-яa-z ]/ui", "", $searchContent);
            $searchContent = preg_replace('/\\s+/', ' ', $searchContent);
            $searchContent = mb_convert_encoding($searchContent, 'HTML-ENTITIES', "UTF-8");

            // Filled key by key so that whatever was set before a failure is still returned.
            $parsedNews['title'] = $title;
            $parsedNews['content'] = $content;
            $parsedNews['searchContent'] = $searchContent;
            $parsedNews['thumb'] = $this->detectThumb($html, $content);
            $parsedNews['date'] = $date;
        }
    } catch (Exception $e) {
        $parsedNews['error'] = $e->getMessage();
    }

    return $parsedNews;
}
// Fetch the page.
// Note: PHP Readability expects UTF-8 encoded content. If the page is not
// UTF-8 encoded, convert it first (iconv() and mb_convert_encoding() can
// both do this) before passing it to PHP Readability.
$html = file_get_contents($url);

$readability = new Readability($html, $url);
// Debug output is useful to compare against Arc90's original JS version;
// it is switched off here.
$readability->debug = false;
// Links are converted to footnotes.
$readability->convertLinksToFootnotes = true;

$found = $readability->init();

if (!$found) {
    echo 'Looks like we couldn\'t find the content. :(';
} else {
    echo "== Title =====================================\n";
    echo $readability->getTitle()->textContent;
    echo "\n\n";
    echo "== Body ======================================\n";

    $body = $readability->getContent()->innerHTML;

    // When Tidy is available, pretty-print the extracted markup for output.
    if (function_exists('tidy_parse_string')) {
        $tidyOptions = array('indent' => true, 'show-body-only' => true);
        $tidy = tidy_parse_string($body, $tidyOptions, 'UTF8');
        $tidy->cleanRepair();
        $body = $tidy->value;
    }

    echo $body;
}
$content_block = $elems->item(0); // clean it up $readability->removeScripts($content_block); $readability->prepArticle($content_block); } else { if ($exclude_on_fail) { die('Sorry, could not extract content'); } $content_block = $readability->dom->createElement('p', 'Sorry, could not extract content'); } } $readability->clean($content_block, 'select'); if ($options->rewrite_relative_urls) { makeAbsolute($effective_url, $content_block); } $title = $readability->getTitle()->textContent; if ($extract_pattern) { // get outerHTML $content = $content_block->ownerDocument->saveXML($content_block); } else { $content = $content_block->innerHTML; } if ($links == 'remove') { $content = preg_replace('!</?a[^>]*>!', '', $content); } if (!$valid_key) { $content = $options->message_to_prepend . $content; $content .= $options->message_to_append; } else { $content = $options->message_to_prepend_with_key . $content; $content .= $options->message_to_append_with_key;
/**
 * Fetch a user's Twitter home timeline and store every tweet that contains a link.
 *
 * Requests up to 50 tweets (only those after the stored last_tweet id when
 * one is present), records the id of the first returned tweet as the new
 * last_tweet, and pushes each link-bearing tweet onto the user's "links"
 * array, popping one entry from the front whenever the running count
 * exceeds 50.
 *
 * When the API answers 'Could not authenticate with OAuth.' the user
 * document is removed. Passing ?debug prints progress information.
 *
 * @param string $_id Id of the user document, as accepted by MongoId.
 *
 * @return void
 */
public function parse_tweets($_id)
{
    $debug = $this->input->get('debug');

    $mongo_id = new MongoId($_id);
    $user = $this->db->users->findOne(array('_id' => $mongo_id));

    // Nothing to do when the lookup did not yield a user document.
    if (!is_array($user)) {
        if ($debug) {
            echo 'User ' . $_id . ' not found' . "\n";
        }
        return;
    }

    if ($debug) {
        echo 'Updating ' . $user['name'] . "\n";
        var_dump($user);
    }

    $this->load->library('twitter');
    // The return value is not used; presumably this call hands the user's
    // tokens to the twitter library for the request below - verify against
    // the library.
    $this->twitter->oauth('', '', $user['oauth_token'], $user['oauth_token_secret']);

    // Only ask for tweets newer than the stored id when that id is a
    // non-zero number; otherwise fetch the latest 50.
    $criteria = array('count' => 50);
    $lastTweet = isset($user['last_tweet']) ? $user['last_tweet'] : null;
    if (is_numeric($lastTweet) && $lastTweet != '0') {
        $criteria['since_id'] = $lastTweet;
    }

    $data = $this->twitter->call('statuses/home_timeline', $criteria);
    if ($debug) {
        var_dump($data);
    }

    if (!is_array($data)) {
        return;
    }

    if (array_key_exists('error', $data)) {
        if ($data['error'] == 'Could not authenticate with OAuth.') {
            $this->db->users->remove(array('_id' => $mongo_id), array('justOne' => true));
            if ($debug) {
                echo 'Deleting user' . "\n";
            }
        }
        if ($debug) {
            echo "ERROR: " . $data['error'] . "\n";
        }
        return;
    }

    // Remember the first returned tweet so the next run starts after it.
    if (isset($data[0]['id'])) {
        $this->db->users->update(
            array('_id' => $mongo_id),
            array('$set' => array('last_tweet' => (string) $data[0]['id']))
        );
    }

    $current_link_count = array_key_exists('links', $user) ? count($user['links']) : 0;

    // Walk from the last element to the first. The condition must include
    // index 0: that tweet's id was just stored as last_tweet, so it will
    // never be returned again and would otherwise be lost.
    for ($x = count($data) - 1; $x >= 0; $x--) {
        if (!isset($data[$x]['text'])) {
            continue;
        }
        $tweet = $data[$x];

        if (!preg_match('@(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?)@', $tweet['text'], $matches)) {
            continue;
        }

        $doc = $tweet;
        $doc['link'] = $matches[0];
        $doc['owner_id'] = $user['id'];
        $doc['tweet'] = $tweet['text'];
        $doc['created_at'] = $tweet['created_at'];
        $doc['tweet_id'] = (string) $tweet['id'];
        $doc['from_user'] = $tweet['user'];

        // Full-article extraction is switched off by the hard-coded `false`.
        // NOTE(review): enabling it would fetch URLs taken straight from
        // tweet text - review before turning it on.
        if (false && isset($user['full_article']) && $user['full_article']) {
            require_once 'lib/Readability.php';
            $url = $matches[0];
            $html = @file_get_contents($url);
            if ($html) {
                $readability = new Readability($html, $url);
                $result = $readability->init();
                if ($result) {
                    $doc['article_title'] = trim($readability->getTitle()->textContent);
                    $doc['article_body'] = trim($readability->getContent()->innerHTML);
                }
            }
        }

        $current_link_count++;
        $this->db->users->update(
            array('_id' => $user['_id']),
            array('$push' => array('links' => $doc))
        );

        // Over the cap of 50: pop one entry from the front of the array.
        if ($current_link_count > 50) {
            $this->db->users->update(
                array('_id' => $user['_id']),
                array('$pop' => array('links' => -1))
            );
        }
    }
}