/**
 * Article filter that swaps a Dilbert feed entry's content for the strip image.
 *
 * Only articles whose GUID contains "dilbert.com" are touched. An article that
 * does not yet carry the "dilbert,<owner_uid>:" marker in its plugin data has
 * its linked page fetched; the first <img> whose src ends in a zoomed strip
 * GIF path is made absolute against http://dilbert.com/ and becomes the new
 * content, and the marker is prepended to the plugin data. An article that
 * already carries the marker gets its stored content back when one exists.
 *
 * @param array $article Article data; reads "owner_uid", "guid", "plugin_data",
 *                       "link" and, when present, "stored"["content"].
 * @return array The article, modified or untouched.
 */
function hook_article_filter($article)
 {
     $owner_uid = $article["owner_uid"];
     $marker = "dilbert,{$owner_uid}:";

     if (strpos($article["guid"], "dilbert.com") === FALSE) {
         return $article;
     }

     if (strpos($article["plugin_data"], $marker) !== FALSE) {
         // Already processed for this owner: reuse what was stored earlier.
         if (isset($article["stored"]["content"])) {
             $article["content"] = $article["stored"]["content"];
         }
         return $article;
     }

     // A failed download leaves nothing to parse. DOMDocument::loadHTML()
     // rejects an empty source (ValueError on PHP 8, which "@" cannot
     // silence), so the article is returned untouched instead.
     $html = fetch_file_contents($article["link"]);
     if (!is_string($html) || $html === '') {
         return $article;
     }

     $doc = new DOMDocument();
     // Real-world markup is rarely valid; parser warnings are suppressed.
     if (!@$doc->loadHTML($html)) {
         return $article;
     }

     $xpath = new DOMXPath($doc);
     $strip = null;
     // we might also check for img[@class='strip'] I guess...
     foreach ($xpath->query('(//img[@src])') as $img) {
         $matches = array();
         if (preg_match("/dyn\\/str_strip\\/.*zoom\\.gif\$/", $img->getAttribute("src"), $matches)) {
             // $matches[0] is the part of src starting at "dyn/str_strip/".
             $img->setAttribute("src", rewrite_relative_url("http://dilbert.com/", $matches[0]));
             $strip = $img;
             break;
         }
     }

     if ($strip) {
         $article["content"] = $doc->saveXML($strip);
         $article["plugin_data"] = $marker . $article["plugin_data"];
     }

     return $article;
 }
 /**
  * Return the absolute URL of the first usable <link> child of this element.
  *
  * A <link> is usable when it has an href and either no rel attribute or a
  * rel of "alternate" or "standout". The href is resolved against the nearest
  * xml:base found on the link or one of its ancestors (an empty string when
  * none is present). Nothing is returned when no link qualifies.
  *
  * @return mixed Result of rewrite_relative_url(), or null when no link matched.
  */
 function get_link()
 {
     foreach ($this->elem->getElementsByTagName("link") as $candidate) {
         if (!$candidate || !$candidate->hasAttribute("href")) {
             continue;
         }

         $rel_accepted = !$candidate->hasAttribute("rel");
         if (!$rel_accepted) {
             $rel_accepted = $candidate->getAttribute("rel") == "alternate"
                 || $candidate->getAttribute("rel") == "standout";
         }
         if (!$rel_accepted) {
             continue;
         }

         $base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $candidate);

         return rewrite_relative_url($base, $candidate->getAttribute("href"));
     }
 }
/**
 * Cache the images referenced by an HTML fragment and rewrite cached ones.
 *
 * Every <img src> is resolved against $site_url and mapped to
 * CACHE_DIR/images/<sha1 of URL>.png. Missing files are downloaded and stored
 * only when the payload is larger than 1024 bytes. Images that exist in the
 * cache afterwards get their src pointed at image.php with the base64-encoded
 * original URL.
 *
 * @param string $html     HTML fragment to process.
 * @param string $site_url Base URL used to resolve relative image sources.
 * @param bool   $debug    When truthy, each image is reported through _debug().
 * @return string The serialized <body> element of the processed document.
 */
function cache_images($html, $site_url, $debug)
{
    $cache_dir = CACHE_DIR . "/images";

    libxml_use_internal_errors(true);

    // Prepending a UTF-8 meta tag makes the HTML parser treat the fragment as UTF-8.
    $charset_hack = "<head>\n\t\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n\t\t</head>";

    $doc = new DOMDocument();
    $doc->loadHTML($charset_hack . $html);

    $finder = new DOMXPath($doc);

    foreach ($finder->query('(//img[@src])') as $img) {
        if (!$img->hasAttribute('src')) {
            continue;
        }

        $src = rewrite_relative_url($site_url, $img->getAttribute('src'));
        $local_filename = $cache_dir . "/" . sha1($src) . ".png";

        if ($debug) {
            _debug("cache_images: downloading: {$src} to {$local_filename}");
        }

        if (!file_exists($local_filename)) {
            $payload = fetch_file_contents($src);
            if ($payload && strlen($payload) > 1024) {
                file_put_contents($local_filename, $payload);
            }
        }

        // Checked again on purpose: the download above may have just created it.
        if (file_exists($local_filename)) {
            $img->setAttribute('src', SELF_URL_PATH . '/image.php?url=' . base64_encode($src));
        }
    }

    $body = $doc->getElementsByTagName('body')->item(0);

    return $doc->saveXML($body);
}
예제 #4
0
/**
 * Try to determine the favicon URL for a feed.
 * adapted from wordpress favicon plugin by Jeff Minard (http://thecodepro.com/)
 * http://dev.wp-plugins.org/file/favatars/trunk/favatars.php
 *
 * The page at $url is fetched and its <head> is searched for a
 * <link rel="shortcut icon"> or <link rel="icon">. A <base href> in the head
 * is honoured when resolving the icon. When the page cannot be fetched or
 * declares no icon, "/favicon.ico" resolved against the page URL is returned.
 *
 * @param string $url A feed or page URL
 * @access public
 * @return mixed The favicon URL as produced by rewrite_relative_url().
 */
function get_favicon_url($url)
{
    $favicon_url = false;

    if ($html = @fetch_file_contents($url)) {
        libxml_use_internal_errors(true);

        $doc = new DOMDocument();
        $doc->loadHTML($html);
        $xpath = new DOMXPath($doc);

        // Only a <base> that actually carries an href may change the base URL.
        // <base target="_blank"> and friends have none, and taking their empty
        // href used to wipe out $url for everything resolved below.
        foreach ($xpath->query('/html/head/base[@href]') as $b) {
            $base_href = trim($b->getAttribute("href"));
            if ($base_href !== '') {
                // The base href may itself be relative; resolve it against the page URL.
                $url = rewrite_relative_url($url, $base_href);
                break;
            }
        }

        // Iterating directly avoids count() on a DOMNodeList; the first icon with a usable href wins.
        foreach ($xpath->query('/html/head/link[@rel="shortcut icon" or @rel="icon"]') as $entry) {
            $icon_href = trim($entry->getAttribute("href"));
            if ($icon_href !== '') {
                $favicon_url = rewrite_relative_url($url, $icon_href);
                break;
            }
        }
    }

    if (!$favicon_url) {
        $favicon_url = rewrite_relative_url($url, "/favicon.ico");
    }

    return $favicon_url;
}
예제 #5
0
파일: init.php 프로젝트: zamentur/ttrss_ynh
    /**
     * Download the images referenced by an article into this object's cache directory.
     *
     * Each <img src> is resolved against $site_url and stored as
     * <cache_dir><article_id>-<sha1 of URL>.png unless that file already exists.
     * $owner_uid is accepted but not used by this method.
     *
     * @param string $content    Article HTML.
     * @param string $site_url   Base URL used to resolve relative image sources.
     * @param mixed  $owner_uid  Unused here.
     * @param mixed  $article_id Prefix for the cached file names.
     * @return bool True when at least one image is present in the cache (already
     *              there or fetched with a non-empty payload), or when the
     *              content has no images at all.
     */
    function cache_article_images($content, $site_url, $owner_uid, $article_id)
    {
        libxml_use_internal_errors(true);

        // Prepending a UTF-8 meta tag makes the HTML parser treat the content as UTF-8.
        $charset_hack = "<head>\n\t\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n\t\t</head>";

        $doc = new DOMDocument();
        $doc->loadHTML($charset_hack . $content);
        $finder = new DOMXPath($doc);

        $any_cached = false;
        $saw_image = false;

        foreach ($finder->query('(//img[@src])') as $img) {
            if (!$img->hasAttribute('src')) {
                continue;
            }

            $saw_image = true;
            $src = rewrite_relative_url($site_url, $img->getAttribute('src'));
            $target = $this->cache_dir . $article_id . "-" . sha1($src) . ".png";

            if (file_exists($target)) {
                $any_cached = true;
                continue;
            }

            $payload = fetch_file_contents($src);
            if ($payload && strlen($payload) > 0) {
                file_put_contents($target, $payload);
                $any_cached = true;
            }
        }

        if ($any_cached) {
            return true;
        }

        return !$saw_image;
    }
예제 #6
0
/**
 * Collect the alternate links advertised in the <head> of an HTML page.
 *
 * Every /html/head/link[@rel="alternate"] with an href is resolved against
 * the directory part of the (fixed-up) page URL. The link's title is used as
 * its label, falling back to its type attribute when the title is empty.
 *
 * @param string $url     URL the page was loaded from.
 * @param string $content HTML of the page.
 * @return array Map of resolved URL => label; later links overwrite earlier
 *               ones that resolve to the same URL.
 */
function get_feeds_from_html($url, $content)
{
    $url = fix_url($url);
    // Everything up to and including the last slash.
    $baseUrl = substr($url, 0, strrpos($url, '/') + 1);

    libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    $doc->loadHTML($content);
    $finder = new DOMXPath($doc);

    $feedUrls = array();

    foreach ($finder->query('/html/head/link[@rel="alternate"]') as $link) {
        if (!$link->hasAttribute('href')) {
            continue;
        }

        $label = $link->getAttribute('title');
        $label = $label == '' ? $link->getAttribute('type') : $label;

        $resolved = rewrite_relative_url($baseUrl, $link->getAttribute('href'));
        $feedUrls[$resolved] = $label;
    }

    return $feedUrls;
}
예제 #7
0
파일: init.php 프로젝트: kucrut/tt-rss
 /**
  * Fetch a page and return the article markup Readability extracts from it.
  *
  * When curl is usable (NO_CURL undefined, curl_init() available and no
  * open_basedir restriction) a header-only request is made first and the
  * page is rejected unless its content type contains "text/html". Pages of
  * 65535 * 4 characters or more are not processed. If the parsed document
  * does not report a UTF-8 encoding, its <meta> elements are stripped and
  * the document is re-serialized before being handed to Readability.
  * Links and image sources in the result are made absolute against $url.
  *
  * @param string $url Page to extract.
  * @return mixed The innerHTML of Readability's article content, or false
  *               when the page is rejected or extraction fails.
  */
 public function extract_content($url)
 {
     if (!class_exists("Readability")) {
         require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
     }

     $can_probe = !defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir");

     if ($can_probe) {
         // Header-only request: learn the content type without downloading the body.
         $ch = curl_init($url);
         curl_setopt($ch, CURLOPT_TIMEOUT, 5);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, true);
         curl_setopt($ch, CURLOPT_NOBODY, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
         curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
         @curl_exec($ch);

         $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
         if (strpos($content_type, "text/html") === FALSE) {
             return false;
         }
     }

     $page = fetch_file_contents($url);
     if (!$page || mb_strlen($page) >= 65535 * 4) {
         return false;
     }

     $page_doc = new DOMDocument("1.0", "UTF-8");
     if (!$page_doc->loadHTML($page)) {
         return false;
     }

     if (strtolower($page_doc->encoding) != 'utf-8') {
         $meta_finder = new DOMXPath($page_doc);
         foreach ($meta_finder->query("//meta") as $meta) {
             $meta->parentNode->removeChild($meta);
         }
         $page = $page_doc->saveHTML();
     }

     $r = new Readability($page, $url);
     if (!$r->init()) {
         return false;
     }

     // Make every link target and image source absolute.
     $link_finder = new DOMXPath($r->dom);
     foreach ($link_finder->query('(//a[@href]|//img[@src])') as $node) {
         foreach (array("href", "src") as $attr_name) {
             if ($node->hasAttribute($attr_name)) {
                 $node->setAttribute($attr_name, rewrite_relative_url($url, $node->getAttribute($attr_name)));
             }
         }
     }

     return $r->articleContent->innerHTML;
 }
예제 #8
0
 /**
  * Article filter for reddit entries (links containing "reddit.com/r/").
  *
  * The article content is parsed and handed to $this->inline_stuff(). When
  * that reports nothing found, curl is available, the "enable_readability"
  * plugin setting is on and the article text is at most 150 characters, the
  * page behind the "[link]" anchor is fetched and run through Readability;
  * the extracted markup is prepended to the article content.
  *
  * @param array $article Article data; reads "link" and "content".
  * @return array The article, with "content" possibly replaced or extended.
  */
 function hook_article_filter($article)
 {
     if (strpos($article["link"], "reddit.com/r/") !== FALSE) {
         $doc = new DOMDocument();
         // Parser warnings about malformed markup are suppressed.
         @$doc->loadHTML($article["content"]);
         $xpath = new DOMXPath($doc);
         // NOTE(review): presumably inline_stuff() edits $doc in place and returns whether it changed anything - confirm against its definition.
         $found = $this->inline_stuff($article, $doc, $xpath);
         // Readability fallback: only for short articles where nothing was inlined.
         if (function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150) {
             if (!class_exists("Readability")) {
                 require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
             }
             // First anchor whose text contains "[link]".
             $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);
             // Links to twitter, youtube and reddit itself are skipped.
             if ($content_link && strpos($content_link->getAttribute("href"), "twitter.com") === FALSE && strpos($content_link->getAttribute("href"), "youtube.com") === FALSE && strpos($content_link->getAttribute("href"), "reddit.com") === FALSE) {
                 /* The link may lead to a huge video file or similar, so the content type is
                    checked before trying to parse it, which pretty much requires curl. */
                 $ch = curl_init($content_link->getAttribute("href"));
                 curl_setopt($ch, CURLOPT_TIMEOUT, 5);
                 curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                 curl_setopt($ch, CURLOPT_HEADER, true);
                 // Header-only request; the body is fetched separately below.
                 curl_setopt($ch, CURLOPT_NOBODY, true);
                 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
                 curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
                 @($result = curl_exec($ch));
                 $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
                 if ($content_type && strpos($content_type, "text/html") !== FALSE) {
                     $tmp = fetch_file_contents($content_link->getAttribute("href"));
                     if ($tmp) {
                         $r = new Readability($tmp, $content_link->getAttribute("href"));
                         if ($r->init()) {
                             // Make links and image sources in the extracted markup absolute.
                             $tmpxpath = new DOMXPath($r->dom);
                             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
                             foreach ($entries as $entry) {
                                 if ($entry->hasAttribute("href")) {
                                     $entry->setAttribute("href", rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("href")));
                                 }
                                 if ($entry->hasAttribute("src")) {
                                     $entry->setAttribute("src", rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("src")));
                                 }
                             }
                             // Extracted article first, then a rule, then the original reddit content.
                             $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                             // prob not a very good idea (breaks wikipedia pages, etc) -
                             // inliner currently is not really fit for any random web content
                             //$doc = new DOMDocument();
                             //@$doc->loadHTML($article["content"]);
                             //$xpath = new DOMXPath($doc);
                             //$found = $this->inline_stuff($article, $doc, $xpath);
                         }
                     }
                 }
             }
         }
         // The parsed document replaces the content only when inline_stuff() reported a hit.
         $node = $doc->getElementsByTagName('body')->item(0);
         if ($node && $found) {
             $article["content"] = $doc->saveXML($node);
         }
     }
     return $article;
 }
예제 #9
0
파일: init.php 프로젝트: kucrut/tt-rss
 /**
  * Article filter for reddit entries (links containing "reddit.com/r/").
  *
  * Steps, in order:
  *  1. With the "enable_content_dupcheck" setting on, the article is flagged
  *     with "force_catchup" when another entry of the same owner, entered
  *     more than a day ago, already points at the same "[link]" target.
  *  2. The parsed content is handed to $this->inline_stuff().
  *  3. When that reports nothing found, curl is usable, the
  *     "enable_readability" setting is on and the article text is at most
  *     150 characters, the page behind the "[link]" anchor is fetched
  *     (text/html only, under 65535 * 4 characters) and run through
  *     Readability; the extracted markup is prepended to the content.
  *
  * @param array $article Article data; reads "link", "content",
  *                       "guid_hashed" and "owner_uid".
  * @return array The article, possibly with "content" changed and
  *               "force_catchup" set.
  */
 function hook_article_filter($article)
 {
     if (strpos($article["link"], "reddit.com/r/") !== FALSE) {
         $doc = new DOMDocument();
         // DOMDocument::loadHTML() rejects an empty source (ValueError on
         // PHP 8); an empty article simply leaves the document empty.
         if ((string) $article["content"] !== '') {
             @$doc->loadHTML($article["content"]);
         }
         $xpath = new DOMXPath($doc);

         // Looked up unconditionally: both the duplicate check and the
         // readability fallback need it. It used to be assigned only inside
         // the duplicate-check branch, so the fallback never ran when that
         // setting was off.
         $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0);

         if ($this->host->get($this, "enable_content_dupcheck")) {
             if ($content_link) {
                 $content_href = db_escape_string($content_link->getAttribute("href"));
                 $entry_guid = db_escape_string($article["guid_hashed"]);
                 $owner_uid = $article["owner_uid"];
                 if (DB_TYPE == "pgsql") {
                     $interval_qpart = "date_entered < NOW() - INTERVAL '1 day'";
                 } else {
                     $interval_qpart = "date_entered < DATE_SUB(NOW(), INTERVAL 1 DAY)";
                 }
                 $result = db_query("SELECT COUNT(id) AS cid\n\t\t\t\t\t\tFROM ttrss_entries, ttrss_user_entries WHERE\n\t\t\t\t\t\t\tref_id = id AND\n\t\t\t\t\t\t\t{$interval_qpart} AND\n\t\t\t\t\t\t\tguid != '{$entry_guid}' AND\n\t\t\t\t\t\t\towner_uid = '{$owner_uid}' AND\n\t\t\t\t\t\t\tcontent LIKE '%href=\"{$content_href}\">[link]%'");
                 if ($result) {
                     $num_found = db_fetch_result($result, 0, "cid");
                     if ($num_found > 0) {
                         $article["force_catchup"] = true;
                     }
                 }
             }
         }

         $found = $this->inline_stuff($article, $doc, $xpath);

         if (!defined('NO_CURL') && function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150) {
             if (!class_exists("Readability")) {
                 require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
             }
             // Links to twitter, youtube and reddit itself are skipped.
             if ($content_link && strpos($content_link->getAttribute("href"), "twitter.com") === FALSE && strpos($content_link->getAttribute("href"), "youtube.com") === FALSE && strpos($content_link->getAttribute("href"), "reddit.com") === FALSE) {
                 $target_url = $content_link->getAttribute("href");

                 /* The link may lead to a huge video file or similar, so the content type is
                    checked before trying to parse it, which pretty much requires curl. */
                 $ch = curl_init($target_url);
                 curl_setopt($ch, CURLOPT_TIMEOUT, 5);
                 curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                 curl_setopt($ch, CURLOPT_HEADER, true);
                 curl_setopt($ch, CURLOPT_NOBODY, true);
                 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir"));
                 curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
                 @curl_exec($ch);
                 $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

                 if ($content_type && strpos($content_type, "text/html") !== FALSE) {
                     $tmp = fetch_file_contents($target_url);
                     if ($tmp && mb_strlen($tmp) < 65535 * 4) {
                         $r = new Readability($tmp, $target_url);
                         if ($r->init()) {
                             // Make links and image sources in the extracted markup absolute.
                             $tmpxpath = new DOMXPath($r->dom);
                             $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
                             foreach ($entries as $entry) {
                                 if ($entry->hasAttribute("href")) {
                                     $entry->setAttribute("href", rewrite_relative_url($target_url, $entry->getAttribute("href")));
                                 }
                                 if ($entry->hasAttribute("src")) {
                                     $entry->setAttribute("src", rewrite_relative_url($target_url, $entry->getAttribute("src")));
                                 }
                             }
                             // Extracted article first, then a rule, then the original reddit content.
                             // The result is deliberately not run through inline_stuff() again:
                             // the inliner is not fit for arbitrary web content (breaks wikipedia pages, etc).
                             $article["content"] = $r->articleContent->innerHTML . "<hr/>" . $article["content"];
                         }
                     }
                 }
             }
         }

         // The parsed document replaces the content only when inline_stuff() reported a hit.
         $node = $doc->getElementsByTagName('body')->item(0);
         if ($node && $found) {
             $article["content"] = $doc->saveXML($node);
         }
     }
     return $article;
 }
예제 #10
0
/**
 * Download the images referenced by an HTML fragment into the image cache.
 *
 * Every <img src> is resolved against $site_url and mapped to
 * CACHE_DIR/images/<sha1 of URL>.png. A file that is not cached yet is
 * fetched and written only when its payload is larger than
 * _MIN_CACHE_IMAGE_SIZE bytes. The HTML itself is not modified and nothing
 * is returned.
 *
 * @param string $html     HTML fragment to scan.
 * @param string $site_url Base URL used to resolve relative image sources.
 * @param bool   $debug    When truthy, each image is reported through _debug().
 * @return void
 */
function cache_images($html, $site_url, $debug)
{
    libxml_use_internal_errors(true);

    // Prepending a UTF-8 meta tag makes the HTML parser treat the fragment as UTF-8.
    $charset_hack = "<head>\n\t\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n\t\t</head>";

    $doc = new DOMDocument();
    $doc->loadHTML($charset_hack . $html);
    $finder = new DOMXPath($doc);

    foreach ($finder->query('(//img[@src])') as $img) {
        if (!$img->hasAttribute('src')) {
            continue;
        }

        $src = rewrite_relative_url($site_url, $img->getAttribute('src'));
        $local_filename = CACHE_DIR . "/images/" . sha1($src) . ".png";

        if ($debug) {
            _debug("cache_images: downloading: {$src} to {$local_filename}");
        }

        if (file_exists($local_filename)) {
            continue;
        }

        $payload = fetch_file_contents($src);
        if ($payload && strlen($payload) > _MIN_CACHE_IMAGE_SIZE) {
            file_put_contents($local_filename, $payload);
        }
    }
}
예제 #11
0
    /**
     * Clean up an HTML fragment for display.
     *
     * - Input that trims to something falsy yields ''.
     * - Text without any "href=" is first passed through $this->rewrite_urls().
     * - With a $site_url, href/src of <a> and <img> are resolved against it.
     * - Every <a href> gets target="_blank".
     * - Every <iframe> gets sandbox="allow-scripts allow-same-origin".
     * - Attributes whose name starts with "on", and id/style/class, are removed
     *   from all elements.
     *
     * @param string       $str      HTML fragment.
     * @param string|false $site_url Base URL for relative links, or false to leave them alone.
     * @return string The serialized document without its first child (the doctype), or ''.
     */
    function my_sanitize($str, $site_url = false)
    {
        $res = trim($str);
        if (!$res) {
            return '';
        }

        if (strpos($res, "href=") === false) {
            $res = $this->rewrite_urls($res);
        }

        // Prepending a UTF-8 meta tag makes the HTML parser treat the fragment as UTF-8.
        $charset_hack = "<head>\n\t\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n\t\t</head>";

        $res = trim($res);
        if (!$res) {
            return '';
        }

        libxml_use_internal_errors(true);

        $doc = new DOMDocument();
        $doc->loadHTML($charset_hack . $res);
        $finder = new DOMXPath($doc);

        foreach ($finder->query('(//a[@href]|//img[@src])') as $node) {
            if ($site_url) {
                foreach (array('href', 'src') as $url_attr) {
                    if ($node->hasAttribute($url_attr)) {
                        $node->setAttribute($url_attr, rewrite_relative_url($site_url, $node->getAttribute($url_attr)));
                    }
                }
            }

            if (strtolower($node->nodeName) == "a") {
                $node->setAttribute("target", "_blank");
            }
        }

        foreach ($finder->query('//iframe') as $frame) {
            $frame->setAttribute('sandbox', 'allow-scripts allow-same-origin');
        }

        $disallowed_attributes = array('id', 'style', 'class');

        foreach ($finder->query('//*') as $element) {
            if (!$element->hasAttributes()) {
                continue;
            }

            // Collect first, remove afterwards, so the attribute map is not
            // modified while it is being iterated.
            $doomed = array();
            foreach ($element->attributes as $attr) {
                $is_handler = strpos($attr->nodeName, 'on') === 0; // onclick and other on* attributes
                if ($is_handler || in_array($attr->nodeName, $disallowed_attributes)) {
                    $doomed[] = $attr;
                }
            }

            foreach ($doomed as $attr) {
                $element->removeAttributeNode($attr);
            }
        }

        // remove doctype
        $doc->removeChild($doc->firstChild);

        return $doc->saveHTML();
    }
예제 #12
0
 /**
  * rewrite_relative_url() must hand back an already absolute URL unchanged,
  * whatever the base URL is.
  */
 public function testRewriteRelativeUrlAbsoluteUrl()
 {
     $base = 'http://tt-rss.org/foo/';
     $absolute = 'http://example.org/bar/';

     $this->assertEquals($absolute, rewrite_relative_url($base, $absolute));
 }
예제 #13
0
 /**
  * Article filter that replaces an article's content with the Readability
  * extraction of the page it links to, for feeds listed in the
  * "enabled_feeds" plugin setting.
  *
  * When curl is available a header-only request is made first and the
  * article is left alone unless the content type contains "text/html".
  * If the parsed page does not report "UTF-8" as its encoding, its <meta>
  * elements are stripped and the document re-serialized before extraction.
  * Links and image sources in the extracted markup are made absolute
  * against the article link.
  *
  * @param array $article Article data; reads "feed"["id"] and "link".
  * @return array The article, with "content" replaced on success.
  */
 function hook_article_filter($article)
 {
     $enabled_feeds = $this->host->get($this, "enabled_feeds");

     // The setting is not guaranteed to hold an array (e.g. nothing stored
     // yet); array_search() on a non-array raises a TypeError on PHP 8.
     if (!is_array($enabled_feeds)) {
         return $article;
     }

     // Loose comparison on purpose, as before: ids may be stored as strings.
     if (array_search($article["feed"]["id"], $enabled_feeds) === FALSE) {
         return $article;
     }

     if (!class_exists("Readability")) {
         require_once dirname(dirname(__DIR__)) . "/lib/readability/Readability.php";
     }

     if (function_exists("curl_init")) {
         // Header-only request: learn the content type without downloading the body.
         $ch = curl_init($article["link"]);
         curl_setopt($ch, CURLOPT_TIMEOUT, 5);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, true);
         curl_setopt($ch, CURLOPT_NOBODY, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("safe_mode") && !ini_get("open_basedir"));
         curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT);
         @curl_exec($ch);

         // curl reports null/false when the type is unknown or the request
         // failed; the cast keeps strpos() from receiving a non-string.
         $content_type = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
         if (strpos($content_type, "text/html") === FALSE) {
             return $article;
         }
     }

     $tmp = fetch_file_contents($article["link"]);
     if (!$tmp) {
         return $article;
     }

     $tmpdoc = new DOMDocument("1.0", "UTF-8");
     if (!$tmpdoc->loadHTML($tmp)) {
         return $article;
     }

     if ($tmpdoc->encoding != 'UTF-8') {
         $tmpxpath = new DOMXPath($tmpdoc);
         foreach ($tmpxpath->query("//meta") as $elem) {
             $elem->parentNode->removeChild($elem);
         }
         $tmp = $tmpdoc->saveHTML();
     }

     $r = new Readability($tmp, $article["link"]);
     if ($r->init()) {
         // Make links and image sources in the extracted markup absolute.
         $tmpxpath = new DOMXPath($r->dom);
         $entries = $tmpxpath->query('(//a[@href]|//img[@src])');
         foreach ($entries as $entry) {
             if ($entry->hasAttribute("href")) {
                 $entry->setAttribute("href", rewrite_relative_url($article["link"], $entry->getAttribute("href")));
             }
             if ($entry->hasAttribute("src")) {
                 $entry->setAttribute("src", rewrite_relative_url($article["link"], $entry->getAttribute("src")));
             }
         }
         $article["content"] = $r->articleContent->innerHTML;
     }

     return $article;
 }