/**
 * Article filter for dilbert.com entries: replaces the article content with
 * the comic strip image taken from the linked page.
 *
 * Articles whose GUID does not contain "dilbert.com" are returned untouched.
 * Articles already marked in "plugin_data" for this owner are not fetched
 * again; their stored content is reused when present.
 *
 * @param array $article Article data; reads "owner_uid", "guid", "link",
 *                       "plugin_data" and, if set, "stored"["content"].
 * @return array The article, with "content" and "plugin_data" updated when a
 *               strip image was found.
 */
function hook_article_filter($article) {
	$owner_uid = $article["owner_uid"];

	if (strpos($article["guid"], "dilbert.com") === FALSE) {
		return $article;
	}

	$marker = "dilbert,{$owner_uid}:";

	if (strpos($article["plugin_data"], $marker) !== FALSE) {
		// already processed for this owner: keep what was stored earlier
		if (isset($article["stored"]["content"])) {
			$article["content"] = $article["stored"]["content"];
		}

		return $article;
	}

	$html = fetch_file_contents($article["link"]);

	// A failed or empty fetch leaves nothing to parse. DOMDocument::loadHTML()
	// throws ValueError on an empty string as of PHP 8, which @ cannot silence.
	if (!is_string($html) || $html === '') {
		return $article;
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($html);

	$xpath = new DOMXPath($doc);
	$entries = $xpath->query('(//img[@src])'); // we might also check for img[@class='strip'] I guess...

	$matches = array();
	$basenode = false;

	foreach ($entries as $entry) {
		if (preg_match("/dyn\\/str_strip\\/.*zoom\\.gif\$/", $entry->getAttribute("src"), $matches)) {
			// only the matched part of the src is kept and made absolute
			$entry->setAttribute("src", rewrite_relative_url("http://dilbert.com/", $matches[0]));
			$basenode = $entry;
			break;
		}
	}

	if ($basenode) {
		$article["content"] = $doc->saveXML($basenode);
		$article["plugin_data"] = $marker . $article["plugin_data"];
	}

	return $article;
}
/**
 * Returns the URL of the first usable <link> element below this item.
 *
 * A <link> is usable when it has an href and either no rel attribute or a rel
 * of "alternate" or "standout". The href is resolved against the nearest
 * xml:base found on the link or its ancestors.
 *
 * @return mixed Result of rewrite_relative_url() for the first usable link;
 *               nothing (null) when no link qualifies.
 */
function get_link() {
	foreach ($this->elem->getElementsByTagName("link") as $candidate) {
		if (!$candidate || !$candidate->hasAttribute("href")) {
			continue;
		}

		if ($candidate->hasAttribute("rel")) {
			$rel = $candidate->getAttribute("rel");

			if ($rel != "alternate" && $rel != "standout") {
				continue;
			}
		}

		$xml_base = $this->xpath->evaluate(
			"string(ancestor-or-self::*[@xml:base][1]/@xml:base)",
			$candidate
		);

		return rewrite_relative_url($xml_base, $candidate->getAttribute("href"));
	}
}
/**
 * Downloads the images referenced by an HTML fragment into CACHE_DIR/images
 * and points every cached <img> at image.php.
 *
 * Each image is stored as <sha1 of its absolute URL>.png. Only payloads larger
 * than 1024 bytes are written. An <img> is rewritten only when its cache file
 * exists after the download attempt.
 *
 * @param string $html     HTML fragment to process.
 * @param string $site_url Base URL used to resolve relative image sources.
 * @param bool   $debug    When true, logs each image through _debug().
 * @return string The serialized <body> element of the processed fragment, or
 *                $html unchanged when the fragment produced no <body>.
 */
function cache_images($html, $site_url, $debug) {
	$cache_dir = CACHE_DIR . "/images";

	libxml_use_internal_errors(true);

	// forces libxml to treat the fragment as UTF-8
	$charset_hack = '<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> </head>';

	$doc = new DOMDocument();
	$doc->loadHTML($charset_hack . $html);
	$xpath = new DOMXPath($doc);

	$entries = $xpath->query('(//img[@src])');

	foreach ($entries as $entry) {
		$src = rewrite_relative_url($site_url, $entry->getAttribute('src'));
		$local_filename = $cache_dir . "/" . sha1($src) . ".png";

		if ($debug) {
			_debug("cache_images: downloading: {$src} to {$local_filename}");
		}

		if (!file_exists($local_filename)) {
			$file_content = fetch_file_contents($src);

			if ($file_content && strlen($file_content) > 1024) {
				file_put_contents($local_filename, $file_content);
			}
		}

		if (file_exists($local_filename)) {
			// base64 output may contain "+", "/" and "=". A bare "+" in a query
			// string is decoded as a space, so the value must be URL-encoded.
			$entry->setAttribute('src', SELF_URL_PATH . '/image.php?url=' .
				urlencode(base64_encode($src)));
		}
	}

	$node = $doc->getElementsByTagName('body')->item(0);

	// saveXML(null) would dump the whole document, injected <head> included
	if (!$node) {
		return $html;
	}

	return $doc->saveXML($node);
}
/**
 * Try to determine the favicon URL for a feed.
 * adapted from wordpress favicon plugin by Jeff Minard (http://thecodepro.com/)
 * http://dev.wp-plugins.org/file/favatars/trunk/favatars.php
 *
 * The page is fetched and its <head> searched for a link with
 * rel="shortcut icon" or rel="icon". When the page cannot be fetched or has no
 * such link, "/favicon.ico" resolved against the URL is returned instead.
 *
 * @param string $url A feed or page URL
 * @access public
 * @return mixed The favicon URL as produced by rewrite_relative_url().
 */
function get_favicon_url($url) {
	$favicon_url = false;

	if ($html = @fetch_file_contents($url)) {
		libxml_use_internal_errors(true);

		$doc = new DOMDocument();
		$doc->loadHTML($html);
		$xpath = new DOMXPath($doc);

		// Only a <base> that actually carries an href may replace the URL.
		// <base target="..."> alone must not reset it to an empty string.
		$base = $xpath->query('/html/head/base[@href]');
		foreach ($base as $b) {
			$base_href = $b->getAttribute("href");

			if (trim($base_href) !== '') {
				$url = $base_href;
			}

			break;
		}

		// first matching icon link wins
		$entries = $xpath->query('/html/head/link[@rel="shortcut icon" or @rel="icon"]');
		foreach ($entries as $entry) {
			$favicon_url = rewrite_relative_url($url, $entry->getAttribute("href"));
			break;
		}
	}

	if (!$favicon_url) {
		$favicon_url = rewrite_relative_url($url, "/favicon.ico");
	}

	return $favicon_url;
}
/**
 * Downloads the images of an article into this object's cache directory.
 *
 * Files are named "<article_id>-<sha1 of absolute image URL>.png" under
 * $this->cache_dir. Images that already have a cache file are not fetched
 * again.
 *
 * @param string $content    Article HTML.
 * @param string $site_url   Base URL used to resolve relative image sources.
 * @param mixed  $owner_uid  Not used by this method.
 * @param mixed  $article_id Article id, used as the cache file name prefix.
 * @return bool True when the article has no images, or when at least one image
 *              is cached (already present or written now); false otherwise.
 */
function cache_article_images($content, $site_url, $owner_uid, $article_id) {
	libxml_use_internal_errors(true);

	// forces libxml to treat the fragment as UTF-8
	$charset_hack = '<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> </head>';

	$doc = new DOMDocument();
	$doc->loadHTML($charset_hack . $content);
	$xpath = new DOMXPath($doc);

	$entries = $xpath->query('(//img[@src])');

	$success = false;
	$has_images = false;

	foreach ($entries as $entry) {
		$has_images = true;

		$src = rewrite_relative_url($site_url, $entry->getAttribute('src'));
		$local_filename = $this->cache_dir . $article_id . "-" . sha1($src) . ".png";

		if (file_exists($local_filename)) {
			$success = true;
			continue;
		}

		$file_content = fetch_file_contents($src);

		// a download only counts once it has actually reached the disk
		if ($file_content && file_put_contents($local_filename, $file_content) !== false) {
			$success = true;
		}
	}

	return $success || !$has_images;
}
/**
 * Collects the feeds advertised by an HTML page.
 *
 * Looks at /html/head/link[@rel="alternate"] elements that have an href and
 * resolves each href against the directory part of the page URL.
 *
 * @param string $url     URL the content was loaded from.
 * @param string $content HTML source of the page.
 * @return array Map of feed URL => title. The link's "type" attribute is used
 *               as the title when the "title" attribute is empty.
 */
function get_feeds_from_html($url, $content) {
	$url = fix_url($url);
	$baseUrl = substr($url, 0, strrpos($url, '/') + 1);

	$feedUrls = array();

	// Nothing to parse. DOMDocument::loadHTML() throws ValueError on an empty
	// string as of PHP 8 and only warns on older versions.
	if (!is_string($content) || $content === '') {
		return $feedUrls;
	}

	libxml_use_internal_errors(true);

	$doc = new DOMDocument();
	$doc->loadHTML($content);
	$xpath = new DOMXPath($doc);

	$entries = $xpath->query('/html/head/link[@rel="alternate"]');

	foreach ($entries as $entry) {
		if (!$entry->hasAttribute('href')) {
			continue;
		}

		$title = $entry->getAttribute('title');

		if ($title == '') {
			$title = $entry->getAttribute('type');
		}

		$feedUrl = rewrite_relative_url($baseUrl, $entry->getAttribute('href'));
		$feedUrls[$feedUrl] = $title;
	}

	return $feedUrls;
}
/**
 * Fetches a page and extracts its main content with the Readability library
 * (lib/readability/Readability.php).
 *
 * When curl is usable, a header-only request is made first and anything whose
 * content type is not text/html is rejected. Pages of 65535 * 4 characters or
 * more are not processed. Link and image URLs in the extracted content are
 * made absolute against $url.
 *
 * @param string $url Page URL.
 * @return mixed Extracted article HTML, or false when the page is rejected,
 *               cannot be fetched or parsed, or Readability finds nothing.
 */
public function extract_content($url) {
	if (!class_exists("Readability")) {
		require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
	}

	if (!defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir")) {
		// header-only probe: only the content type is of interest here
		$ch = curl_init($url);
		curl_setopt($ch, CURLOPT_TIMEOUT, 5);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_HEADER, true);
		curl_setopt($ch, CURLOPT_NOBODY, true);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
		curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);

		@curl_exec($ch);
		$content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

		// the content type is empty (null/false) when the probe failed or the
		// server sent none; treat that like any other non-HTML answer
		if (!$content_type || strpos($content_type, "text/html") === FALSE) {
			return false;
		}
	}

	$tmp = fetch_file_contents($url);

	if (!$tmp || mb_strlen($tmp) >= 65535 * 4) {
		return false;
	}

	$tmpdoc = new DOMDocument("1.0", "UTF-8");

	// keep markup errors of real-world pages out of the warning log, then
	// put libxml error handling back the way it was
	$previous_setting = libxml_use_internal_errors(true);
	$loaded = $tmpdoc->loadHTML($tmp);
	libxml_clear_errors();
	libxml_use_internal_errors($previous_setting);

	if (!$loaded) {
		return false;
	}

	if (strtolower((string) $tmpdoc->encoding) != 'utf-8') {
		// drop every <meta> element and re-serialize the document before it
		// is handed to Readability
		$tmpxpath = new DOMXPath($tmpdoc);

		foreach ($tmpxpath->query("//meta") as $elem) {
			$elem->parentNode->removeChild($elem);
		}

		$tmp = $tmpdoc->saveHTML();
	}

	$r = new Readability($tmp, $url);

	if (!$r->init()) {
		return false;
	}

	$tmpxpath = new DOMXPath($r->dom);
	$entries = $tmpxpath->query('(//a[@href]|//img[@src])');

	foreach ($entries as $entry) {
		if ($entry->hasAttribute("href")) {
			$entry->setAttribute("href",
				rewrite_relative_url($url, $entry->getAttribute("href")));
		}

		if ($entry->hasAttribute("src")) {
			$entry->setAttribute("src",
				rewrite_relative_url($url, $entry->getAttribute("src")));
		}
	}

	return $r->articleContent->innerHTML;
}
/**
 * Article filter for reddit.com/r/ entries.
 *
 * Runs the article through $this->inline_stuff(). When nothing was inlined,
 * curl is available, the "enable_readability" setting is on and the article
 * text is at most 150 characters, the page behind the "[link]" anchor is
 * fetched and its Readability extract is prepended to the article content.
 * Links to twitter.com, youtube.com and reddit.com are never fetched.
 *
 * @param array $article Article data; reads "link" and "content".
 * @return array The article, with "content" replaced when something was
 *               inlined or extracted.
 */
function hook_article_filter($article) {
	if (strpos($article["link"], "reddit.com/r/") === FALSE) {
		return $article;
	}

	$doc = new DOMDocument();

	// loadHTML() throws ValueError on an empty string as of PHP 8, which @
	// cannot silence; an empty article simply leaves the document empty
	if ((string) $article["content"] !== "") {
		@$doc->loadHTML($article["content"]);
	}

	$xpath = new DOMXPath($doc);

	$found = $this->inline_stuff($article, $doc, $xpath);

	$try_readability = function_exists("curl_init") && !$found &&
		$this->host->get($this, "enable_readability") &&
		mb_strlen(strip_tags($article["content"])) <= 150;

	if ($try_readability) {
		if (!class_exists("Readability")) {
			require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
		}

		$content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);

		if ($content_link &&
				strpos($content_link->getAttribute("href"), "twitter.com") === FALSE &&
				strpos($content_link->getAttribute("href"), "youtube.com") === FALSE &&
				strpos($content_link->getAttribute("href"), "reddit.com") === FALSE) {

			$link_href = $content_link->getAttribute("href");

			/* link may lead to a huge video file or whatever, we need to check content type before trying to
			parse it which p much requires curl */

			$ch = curl_init($link_href);
			curl_setopt($ch, CURLOPT_TIMEOUT, 5);
			curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
			curl_setopt($ch, CURLOPT_HEADER, true);
			curl_setopt($ch, CURLOPT_NOBODY, true);
			curl_setopt($ch, CURLOPT_FOLLOWLOCATION,
				!ini_get("safe_mode") && !ini_get("open_basedir"));
			curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);

			@curl_exec($ch);
			$content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

			if ($content_type && strpos($content_type, "text/html") !== FALSE) {
				$tmp = fetch_file_contents($link_href);

				if ($tmp) {
					$r = new Readability($tmp, $link_href);

					if ($r->init()) {
						$tmpxpath = new DOMXPath($r->dom);
						$entries = $tmpxpath->query('(//a[@href]|//img[@src])');

						// make every link and image URL of the extract absolute
						foreach ($entries as $entry) {
							if ($entry->hasAttribute("href")) {
								$entry->setAttribute("href",
									rewrite_relative_url($link_href, $entry->getAttribute("href")));
							}

							if ($entry->hasAttribute("src")) {
								$entry->setAttribute("src",
									rewrite_relative_url($link_href, $entry->getAttribute("src")));
							}
						}

						$article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];

						// The extract is deliberately not run through inline_stuff()
						// again: prob not a very good idea (breaks wikipedia pages, etc) -
						// inliner currently is not really fit for any random web content
					}
				}
			}
		}
	}

	$node = $doc->getElementsByTagName('body')->item(0);

	// the parsed document replaces the content only when something was inlined
	if ($node && $found) {
		$article["content"] = $doc->saveXML($node);
	}

	return $article;
}
/**
 * Article filter for reddit.com/r/ entries, with optional duplicate detection.
 *
 * With the "enable_content_dupcheck" setting on, the article gets
 * "force_catchup" set when another entry of the same owner, entered more than
 * a day ago, already contains the same "[link]" target.
 *
 * The article is then run through $this->inline_stuff(). When nothing was
 * inlined, curl is usable, the "enable_readability" setting is on and the
 * article text is at most 150 characters, the page behind the "[link]" anchor
 * is fetched (if smaller than 65535 * 4 characters) and its Readability
 * extract is prepended to the content. Links to twitter.com, youtube.com and
 * reddit.com are never fetched.
 *
 * @param array $article Article data; reads "link", "content", "guid_hashed"
 *                       and "owner_uid".
 * @return array The article, possibly with "content" replaced and
 *               "force_catchup" set.
 */
function hook_article_filter($article) {
	if (strpos($article["link"], "reddit.com/r/") === FALSE) {
		return $article;
	}

	$doc = new DOMDocument();

	// loadHTML() throws ValueError on an empty string as of PHP 8, which @
	// cannot silence; an empty article simply leaves the document empty
	if ((string) $article["content"] !== "") {
		@$doc->loadHTML($article["content"]);
	}

	$xpath = new DOMXPath($doc);

	// Looked up unconditionally: the readability step below needs it even
	// when the duplicate check is switched off.
	$content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);

	if ($this->host->get($this, "enable_content_dupcheck") && $content_link) {
		$content_href = db_escape_string($content_link->getAttribute("href"));
		$entry_guid = db_escape_string($article["guid_hashed"]);
		$owner_uid = $article["owner_uid"];

		if (DB_TYPE == "pgsql") {
			$interval_qpart = "date_entered < NOW() - INTERVAL '1 day'";
		} else {
			$interval_qpart = "date_entered < DATE_SUB(NOW(), INTERVAL 1 DAY)";
		}

		$result = db_query("SELECT COUNT(id) AS cid\n\t\t\t\t\t\tFROM ttrss_entries, ttrss_user_entries WHERE\n\t\t\t\t\t\t\tref_id = id AND\n\t\t\t\t\t\t\t{$interval_qpart} AND\n\t\t\t\t\t\t\tguid != '{$entry_guid}' AND\n\t\t\t\t\t\t\towner_uid = '{$owner_uid}' AND\n\t\t\t\t\t\t\tcontent LIKE '%href=\"{$content_href}\">[link]%'");

		if ($result) {
			$num_found = db_fetch_result($result, 0, "cid");

			if ($num_found > 0) {
				$article["force_catchup"] = true;
			}
		}
	}

	$found = $this->inline_stuff($article, $doc, $xpath);

	$try_readability = !defined('NO_CURL') && function_exists("curl_init") && !$found &&
		$this->host->get($this, "enable_readability") &&
		mb_strlen(strip_tags($article["content"])) <= 150;

	if ($try_readability) {
		if (!class_exists("Readability")) {
			require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
		}

		if ($content_link &&
				strpos($content_link->getAttribute("href"), "twitter.com") === FALSE &&
				strpos($content_link->getAttribute("href"), "youtube.com") === FALSE &&
				strpos($content_link->getAttribute("href"), "reddit.com") === FALSE) {

			$link_href = $content_link->getAttribute("href");

			/* link may lead to a huge video file or whatever, we need to check content type before trying to
			parse it which p much requires curl */

			$ch = curl_init($link_href);
			curl_setopt($ch, CURLOPT_TIMEOUT, 5);
			curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
			curl_setopt($ch, CURLOPT_HEADER, true);
			curl_setopt($ch, CURLOPT_NOBODY, true);
			curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir"));
			curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);

			@curl_exec($ch);
			$content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

			if ($content_type && strpos($content_type, "text/html") !== FALSE) {
				$tmp = fetch_file_contents($link_href);

				if ($tmp && mb_strlen($tmp) < 65535 * 4) {
					$r = new Readability($tmp, $link_href);

					if ($r->init()) {
						$tmpxpath = new DOMXPath($r->dom);
						$entries = $tmpxpath->query('(//a[@href]|//img[@src])');

						// make every link and image URL of the extract absolute
						foreach ($entries as $entry) {
							if ($entry->hasAttribute("href")) {
								$entry->setAttribute("href",
									rewrite_relative_url($link_href, $entry->getAttribute("href")));
							}

							if ($entry->hasAttribute("src")) {
								$entry->setAttribute("src",
									rewrite_relative_url($link_href, $entry->getAttribute("src")));
							}
						}

						$article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];

						// The extract is deliberately not run through inline_stuff()
						// again: prob not a very good idea (breaks wikipedia pages, etc) -
						// inliner currently is not really fit for any random web content
					}
				}
			}
		}
	}

	$node = $doc->getElementsByTagName('body')->item(0);

	// the parsed document replaces the content only when something was inlined
	if ($node && $found) {
		$article["content"] = $doc->saveXML($node);
	}

	return $article;
}
/**
 * Pre-fetches the images referenced by an HTML fragment into CACHE_DIR/images.
 *
 * Every <img src> is resolved against $site_url and stored as
 * "<sha1 of absolute URL>.png" unless that file already exists. Payloads that
 * are not larger than _MIN_CACHE_IMAGE_SIZE bytes are not written. The HTML
 * itself is not modified and nothing is returned.
 *
 * @param string $html     HTML fragment to scan.
 * @param string $site_url Base URL used to resolve relative image sources.
 * @param bool   $debug    When true, logs each image through _debug().
 */
function cache_images($html, $site_url, $debug) {
	libxml_use_internal_errors(true);

	// forces libxml to treat the fragment as UTF-8
	$utf8_head = '<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> </head>';

	$dom = new DOMDocument();
	$dom->loadHTML($utf8_head . $html);

	$finder = new DOMXPath($dom);

	foreach ($finder->query('(//img[@src])') as $img) {
		if (!$img->hasAttribute('src')) {
			continue;
		}

		$image_url = rewrite_relative_url($site_url, $img->getAttribute('src'));
		$cache_path = CACHE_DIR . "/images/" . sha1($image_url) . ".png";

		if ($debug) {
			_debug("cache_images: downloading: {$image_url} to {$cache_path}");
		}

		// already cached, nothing to download
		if (file_exists($cache_path)) {
			continue;
		}

		$payload = fetch_file_contents($image_url);

		if ($payload && strlen($payload) > _MIN_CACHE_IMAGE_SIZE) {
			file_put_contents($cache_path, $payload);
		}
	}
}
/**
 * Cleans up an HTML fragment for display.
 *
 * - text without any "href=" is first passed through $this->rewrite_urls()
 * - when $site_url is given, link and image URLs are made absolute against it
 * - every <a href> gets target="_blank"
 * - every <iframe> gets sandbox="allow-scripts allow-same-origin"
 * - on* attributes and id/style/class attributes are removed from all elements
 *
 * @param string $str      HTML fragment.
 * @param mixed  $site_url Base URL for relative links, or false to leave URLs as is.
 * @return string The processed document serialized with saveHTML() (doctype
 *                removed), or '' when the input is empty after trimming.
 */
function my_sanitize($str, $site_url = false) {
	$html = trim($str);

	if (!$html) {
		return '';
	}

	if (strpos($html, "href=") === false) {
		$html = $this->rewrite_urls($html);
	}

	$html = trim($html);

	if (!$html) {
		return '';
	}

	// forces libxml to treat the fragment as UTF-8
	$utf8_head = '<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> </head>';

	libxml_use_internal_errors(true);

	$dom = new DOMDocument();
	$dom->loadHTML($utf8_head . $html);

	$finder = new DOMXPath($dom);

	foreach ($finder->query('(//a[@href]|//img[@src])') as $node) {
		if ($site_url) {
			foreach (array('href', 'src') as $url_attr) {
				if ($node->hasAttribute($url_attr)) {
					$node->setAttribute($url_attr,
						rewrite_relative_url($site_url, $node->getAttribute($url_attr)));
				}
			}
		}

		if (strtolower($node->nodeName) == "a") {
			$node->setAttribute("target", "_blank");
		}
	}

	foreach ($finder->query('//iframe') as $frame) {
		$frame->setAttribute('sandbox', 'allow-scripts allow-same-origin');
	}

	$disallowed_attributes = array('id', 'style', 'class');

	foreach ($finder->query('//*') as $element) {
		if (!$element->hasAttributes()) {
			continue;
		}

		// collect first, remove afterwards: the attribute map must not
		// change while it is being iterated
		$doomed = array();

		foreach ($element->attributes as $attribute) {
			$is_event_handler = strpos($attribute->nodeName, 'on') === 0; // onclick and other on* attributes

			if ($is_event_handler || in_array($attribute->nodeName, $disallowed_attributes)) {
				$doomed[] = $attribute;
			}
		}

		foreach ($doomed as $attribute) {
			$element->removeAttributeNode($attribute);
		}
	}

	$dom->removeChild($dom->firstChild); // remove doctype

	return $dom->saveHTML();
}
/**
 * rewrite_relative_url() given an absolute URL: the expected result is that
 * same URL, regardless of the base URL passed along.
 */
public function testRewriteRelativeUrlAbsoluteUrl() {
	$base_url = 'http://tt-rss.org/foo/';
	$absolute_url = 'http://example.org/bar/';

	$rewritten = rewrite_relative_url($base_url, $absolute_url);

	$this->assertEquals('http://example.org/bar/', $rewritten);
}
/**
 * Article filter that replaces the content of articles from enabled feeds
 * with the Readability extract of the linked page.
 *
 * The feed id is looked up in the "enabled_feeds" setting of this plugin.
 * When curl is available, a header-only request is made first and articles
 * whose link is not served as text/html are left alone. Link and image URLs
 * in the extract are made absolute against the article link.
 *
 * @param array $article Article data; reads "feed"["id"] and "link".
 * @return array The article, with "content" replaced when extraction succeeded.
 */
function hook_article_filter($article) {
	$enabled_feeds = $this->host->get($this, "enabled_feeds");

	// A non-array value means there is no list to search. array_search() would
	// then return null on PHP < 8, which passes the === FALSE check and filters
	// every feed; on PHP 8 it throws TypeError.
	if (!is_array($enabled_feeds)) {
		return $article;
	}

	$key = array_search($article["feed"]["id"], $enabled_feeds);

	if ($key === FALSE) {
		return $article;
	}

	if (!class_exists("Readability")) {
		require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
	}

	if (function_exists("curl_init")) {
		// header-only probe: only the content type is of interest here
		$ch = curl_init($article["link"]);
		curl_setopt($ch, CURLOPT_TIMEOUT, 5);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_HEADER, true);
		curl_setopt($ch, CURLOPT_NOBODY, true);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION,
			!ini_get("safe_mode") && !ini_get("open_basedir"));
		curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);

		@curl_exec($ch);
		$content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

		// the content type is empty (null/false) when the probe failed or the
		// server sent none; treat that like any other non-HTML answer
		if (!$content_type || strpos($content_type, "text/html") === FALSE) {
			return $article;
		}
	}

	$tmp = fetch_file_contents($article["link"]);

	if (!$tmp) {
		return $article;
	}

	$tmpdoc = new DOMDocument("1.0", "UTF-8");

	// keep markup errors of real-world pages out of the warning log, then
	// put libxml error handling back the way it was
	$previous_setting = libxml_use_internal_errors(true);
	$loaded = $tmpdoc->loadHTML($tmp);
	libxml_clear_errors();
	libxml_use_internal_errors($previous_setting);

	if (!$loaded) {
		return $article;
	}

	// compared case-insensitively, so "utf-8" is recognised as well
	if (strtolower((string) $tmpdoc->encoding) != 'utf-8') {
		// drop every <meta> element and re-serialize the document before it
		// is handed to Readability
		$tmpxpath = new DOMXPath($tmpdoc);

		foreach ($tmpxpath->query("//meta") as $elem) {
			$elem->parentNode->removeChild($elem);
		}

		$tmp = $tmpdoc->saveHTML();
	}

	$r = new Readability($tmp, $article["link"]);

	if ($r->init()) {
		$tmpxpath = new DOMXPath($r->dom);
		$entries = $tmpxpath->query('(//a[@href]|//img[@src])');

		foreach ($entries as $entry) {
			if ($entry->hasAttribute("href")) {
				$entry->setAttribute("href",
					rewrite_relative_url($article["link"], $entry->getAttribute("href")));
			}

			if ($entry->hasAttribute("src")) {
				$entry->setAttribute("src",
					rewrite_relative_url($article["link"], $entry->getAttribute("src")));
			}
		}

		$article["content"] = $r->articleContent->innerHTML;
	}

	return $article;
}