change_encoding() public method

public change_encoding ( $data, $input, $output )
Example #1
0
 /**
  * Detect XML encoding, as per XML 1.0 Appendix F.1
  *
  * The leading bytes of $data are checked first for a byte order mark and
  * then for the five characters that open an XML declaration, spelled out in
  * UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE and ASCII-compatible form. When a
  * declaration is found and parses, the encoding it names is listed before
  * the encoding family implied by the byte pattern. If nothing matches,
  * UTF-8 is returned as the only candidate.
  *
  * All byte signatures are written as \x escapes so the source stays plain
  * ASCII and the patterns cannot be damaged by editors or transcoding.
  *
  * @todo Add support for EBCDIC
  * @param string $data XML data
  * @param SimplePie_Registry $registry Class registry
  * @return array Possible encodings
  */
 public static function xml_encoding($data, $registry)
 {
     $encoding = array();
     // UTF-32 Big Endian BOM
     if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") {
         $encoding[] = 'UTF-32BE';
     // UTF-32 Little Endian BOM
     } elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00") {
         $encoding[] = 'UTF-32LE';
     // UTF-16 Big Endian BOM
     } elseif (substr($data, 0, 2) === "\xFE\xFF") {
         $encoding[] = 'UTF-16BE';
     // UTF-16 Little Endian BOM
     } elseif (substr($data, 0, 2) === "\xFF\xFE") {
         $encoding[] = 'UTF-16LE';
     // UTF-8 BOM
     } elseif (substr($data, 0, 3) === "\xEF\xBB\xBF") {
         $encoding[] = 'UTF-8';
     // UTF-32 Big Endian Without BOM (declaration opener is 5 chars * 4 bytes = 20 bytes)
     } elseif (substr($data, 0, 20) === "\x00\x00\x00\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C") {
         if ($pos = strpos($data, "\x00\x00\x00\x3F\x00\x00\x00\x3E")) {
             $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32BE', 'UTF-8')));
             if ($parser->parse()) {
                 $encoding[] = $parser->encoding;
             }
         }
         $encoding[] = 'UTF-32BE';
     // UTF-32 Little Endian Without BOM
     } elseif (substr($data, 0, 20) === "\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C\x00\x00\x00") {
         if ($pos = strpos($data, "\x3F\x00\x00\x00\x3E\x00\x00\x00")) {
             $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32LE', 'UTF-8')));
             if ($parser->parse()) {
                 $encoding[] = $parser->encoding;
             }
         }
         $encoding[] = 'UTF-32LE';
     // UTF-16 Big Endian Without BOM (declaration opener is 5 chars * 2 bytes = 10 bytes)
     } elseif (substr($data, 0, 10) === "\x00\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C") {
         if ($pos = strpos($data, "\x00\x3F\x00\x3E")) {
             // The declaration body starts right after the 10-byte opener;
             // starting at 20 would drop its first five characters and
             // overrun the terminator.
             $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 10, $pos - 10), 'UTF-16BE', 'UTF-8')));
             if ($parser->parse()) {
                 $encoding[] = $parser->encoding;
             }
         }
         $encoding[] = 'UTF-16BE';
     // UTF-16 Little Endian Without BOM
     } elseif (substr($data, 0, 10) === "\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C\x00") {
         if ($pos = strpos($data, "\x3F\x00\x3E\x00")) {
             // Same 10-byte opener as the big endian case above.
             $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 10, $pos - 10), 'UTF-16LE', 'UTF-8')));
             if ($parser->parse()) {
                 $encoding[] = $parser->encoding;
             }
         }
         $encoding[] = 'UTF-16LE';
     // US-ASCII (or superset)
     } elseif (substr($data, 0, 5) === "\x3C\x3F\x78\x6D\x6C") {
         if ($pos = strpos($data, "\x3F\x3E")) {
             $parser = $registry->create('XML_Declaration_Parser', array(substr($data, 5, $pos - 5)));
             if ($parser->parse()) {
                 $encoding[] = $parser->encoding;
             }
         }
         $encoding[] = 'UTF-8';
     // Fallback to UTF-8
     } else {
         $encoding[] = 'UTF-8';
     }
     return $encoding;
 }
/**
 * Convert an HTTP response body to UTF-8.
 *
 * The source encoding is looked up in this order: the charset of the last
 * Content-Type header, an XML declaration at the start of the body, a
 * <meta http-equiv="Content-Type"> tag, and finally any <meta> tag carrying
 * a charset attribute. Only the first 50000 bytes of the body are searched.
 * When no encoding is found the body is returned as-is and treated as UTF-8.
 *
 * @param string $html Response body
 * @param string|array|null $header Raw response headers (an array is joined with newlines)
 * @return string The body, converted with SimplePie_Misc::change_encoding() when the detected encoding is not UTF-8
 */
function convert_to_utf8($html, $header = null)
{
    // With neither a body nor headers there is nothing to detect or convert.
    if (!$html && !$header) {
        return $html;
    }

    $encoding = null;
    if (is_array($header)) {
        $header = implode("\n", $header);
    }

    // Step 1: charset from the HTTP Content-Type header.
    if ($header && preg_match_all('/^Content-Type:\\s+([^;]+)(?:;\\s*charset=["\']?([^;"\'\\n]*))?/im', $header, $header_hits, PREG_SET_ORDER)) {
        // Several header sets may be present after redirects; the last one wins.
        $last_hit = end($header_hits);
        if (isset($last_hit[2])) {
            $encoding = trim($last_hit[2], "\"' \r\n\v\t");
        }
    } else {
        // error parsing the response
        debug('Could not find Content-Type header in HTTP response');
    }

    // Step 2: look inside the document when the header gave nothing usable.
    // Some sites send bogus charset values such as 'none'
    // (e.g. http://facta.co.jp/blog/archives/20111026001026.html).
    // TODO: check to see if encoding is supported (can we convert it?)
    if (!$encoding || $encoding === 'none') {
        // Only the first 50000 characters are searched; some pages declare the
        // charset very late, see for example
        // http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
        // TODO: improve this so it looks at smaller chunks first
        $html_head = substr($html, 0, 50000);
        if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $hit)) {
            $encoding = trim($hit[1], '"\'');
        } elseif (preg_match('/<meta\\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\\s*charset=["\']?([^;"\'>]+)/i', $html_head, $hit)) {
            $encoding = trim($hit[1]);
        } elseif (preg_match_all('/<meta\\s+([^>]+)>/i', $html_head, $hit)) {
            // Take the first <meta> tag that mentions a charset at all.
            foreach ($hit[1] as $meta_attributes) {
                if (preg_match('/charset=["\']?([^"\']+)/i', $meta_attributes, $charset_hit)) {
                    $encoding = trim($charset_hit[1]);
                    break;
                }
            }
        }
    }

    // trim is important here!
    if ($encoding !== null) {
        $encoding = trim($encoding);
    }

    // Step 3: for unknown or ISO-8859-1 input, turn the Windows-1252 "smart"
    // punctuation bytes (as produced by MS Word) into HTML entities.
    if (!$encoding || strtolower($encoding) === 'iso-8859-1') {
        $smart_punctuation = array(
            chr(130) => '&sbquo;',  // Single Low-9 Quotation Mark
            chr(131) => '&fnof;',   // Latin Small Letter F With Hook
            chr(132) => '&bdquo;',  // Double Low-9 Quotation Mark
            chr(133) => '&hellip;', // Horizontal Ellipsis
            chr(134) => '&dagger;', // Dagger
            chr(135) => '&Dagger;', // Double Dagger
            chr(136) => '&circ;',   // Modifier Letter Circumflex Accent
            chr(137) => '&permil;', // Per Mille Sign
            chr(138) => '&Scaron;', // Latin Capital Letter S With Caron
            chr(139) => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
            chr(140) => '&OElig;',  // Latin Capital Ligature OE
            chr(145) => '&lsquo;',  // Left Single Quotation Mark
            chr(146) => '&rsquo;',  // Right Single Quotation Mark
            chr(147) => '&ldquo;',  // Left Double Quotation Mark
            chr(148) => '&rdquo;',  // Right Double Quotation Mark
            chr(149) => '&bull;',   // Bullet
            chr(150) => '&ndash;',  // En Dash
            chr(151) => '&mdash;',  // Em Dash
            chr(152) => '&tilde;',  // Small Tilde
            chr(153) => '&trade;',  // Trade Mark Sign
            chr(154) => '&scaron;', // Latin Small Letter S With Caron
            chr(155) => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
            chr(156) => '&oelig;',  // Latin Small Ligature OE
            chr(159) => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
        );
        $html = strtr($html, $smart_punctuation);
    }

    // Step 4: convert, unless the document is (assumed to be) UTF-8 already.
    if (!$encoding) {
        debug('No character encoding found, so treating as UTF-8');
        return $html;
    }

    debug('Character encoding: ' . $encoding);
    if (strtolower($encoding) !== 'utf-8') {
        debug('Converting to UTF-8');
        $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
    }

    return $html;
}
Example #3
0
 /**
  * Load and parse the configured feed (or feeds).
  *
  * Three modes, chosen from the object's state:
  *  - feed_url or raw_data is set: fetch (or take) the data, optionally via
  *    the cache and feed autodiscovery, then try each candidate encoding
  *    until the parser accepts the document.
  *  - otherwise, multifeed_url is non-empty: clone this object once per URL
  *    and initialise each clone.
  *  - otherwise: nothing to do.
  *
  * When xml_dump is set, the raw data is echoed with a text/xml header and
  * the script exits instead of returning.
  *
  * @return bool True when usable data is available (fresh parse, valid cache,
  *              or stale cached data kept after a failed fetch); false on
  *              unmet requirements or any failure. In multifeed mode, true
  *              when at least one feed initialised successfully.
  */
 public function init()
 {
     // Check absolute bare minimum requirements.
     if (function_exists('version_compare') && version_compare(PHP_VERSION, '5.0', '<') || !extension_loaded('xml') || !extension_loaded('pcre')) {
         return false;
     } elseif (!extension_loaded('xmlreader')) {
         // Without xmlreader, probe the XML extension once per request: parse a
         // tiny document containing an entity and check that a value comes back.
         static $xml_is_sane = null;
         if ($xml_is_sane === null) {
             $parser_check = xml_parser_create();
             xml_parse_into_struct($parser_check, '<foo>&amp;</foo>', $values);
             xml_parser_free($parser_check);
             $xml_is_sane = isset($values[0]['value']);
         }
         if (!$xml_is_sane) {
             return false;
         }
     }
     // Pass whatever was set with config options over to the sanitizer.
     $this->sanitize->pass_cache_data($this->cache, $this->cache_location, $this->cache_name_function, $this->cache_class);
     $this->sanitize->pass_file_data($this->file_class, $this->timeout, $this->useragent, $this->force_fsockopen);
     if ($this->feed_url !== null || $this->raw_data !== null) {
         // Reset any state left over from a previous run.
         $this->error = null;
         $this->data = array();
         $this->multifeed_objects = array();
         $cache = false;
         if ($this->feed_url !== null) {
             $parsed_feed_url = SimplePie_Misc::parse_url($this->feed_url);
             // Decide whether to enable caching
             if ($this->cache && $parsed_feed_url['scheme'] !== '') {
                 $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, call_user_func($this->cache_name_function, $this->feed_url), 'spc');
             }
             // If it's enabled and we don't want an XML dump, use the cache
             if ($cache && !$this->xml_dump) {
                 // Load the Cache
                 $this->data = $cache->load();
                 if (!empty($this->data)) {
                     // If the cache is for an outdated build of SimplePie
                     if (!isset($this->data['build']) || $this->data['build'] !== SIMPLEPIE_BUILD) {
                         $cache->unlink();
                         $this->data = array();
                     } elseif (isset($this->data['url']) && $this->data['url'] !== $this->feed_url) {
                         // The cache entry belongs to a different URL: ignore it
                         // and run uncached for this request.
                         $cache = false;
                         $this->data = array();
                     } elseif (isset($this->data['feed_url'])) {
                         // The entry is an autodiscovery record pointing at the real feed URL.
                         // If the autodiscovery cache is still valid use it.
                         if ($cache->mtime() + $this->autodiscovery_cache_duration > time()) {
                             // Do not need to do feed autodiscovery yet.
                             if ($this->data['feed_url'] === $this->data['url']) {
                                 // A record that points at itself is useless; drop it.
                                 $cache->unlink();
                                 $this->data = array();
                             } else {
                                 // Restart initialisation against the discovered feed URL.
                                 $this->set_feed_url($this->data['feed_url']);
                                 return $this->init();
                             }
                         }
                     } elseif ($cache->mtime() + $this->cache_duration < time()) {
                         // The cached feed has expired: try a conditional request.
                         // If we have last-modified and/or etag set
                         if (isset($this->data['headers']['last-modified']) || isset($this->data['headers']['etag'])) {
                             $headers = array('Accept' => 'application/atom+xml, application/rss+xml, application/rdf+xml;q=0.9, application/xml;q=0.8, text/xml;q=0.8, text/html;q=0.7, unknown/unknown;q=0.1, application/unknown;q=0.1, */*;q=0.1');
                             if (isset($this->data['headers']['last-modified'])) {
                                 $headers['if-modified-since'] = $this->data['headers']['last-modified'];
                             }
                             if (isset($this->data['headers']['etag'])) {
                                 $headers['if-none-match'] = $this->data['headers']['etag'];
                             }
                             // Note: the conditional request uses a tenth of the configured timeout.
                             $file = new $this->file_class($this->feed_url, $this->timeout / 10, 5, $headers, $this->useragent, $this->force_fsockopen);
                             if ($file->success) {
                                 if ($file->status_code === 304) {
                                     // Not modified: refresh the cache timestamp and keep the cached data.
                                     $cache->touch();
                                     return true;
                                 } else {
                                     $headers = $file->headers;
                                 }
                             } else {
                                 // Forget the failed attempt so a normal fetch is made below.
                                 unset($file);
                             }
                         }
                     } else {
                         // Cache entry is current: use the data loaded above.
                         return true;
                     }
                 } else {
                     // Empty cache entry: remove it.
                     $cache->unlink();
                     $this->data = array();
                 }
             }
             // If we don't already have the file (it'll only exist if we've opened it to check if the cache has been modified), open it.
             if (!isset($file)) {
                 if (is_a($this->file, 'SimplePie_File') && $this->file->url === $this->feed_url) {
                     $file =& $this->file;
                 } else {
                     $headers = array('Accept' => 'application/atom+xml, application/rss+xml, application/rdf+xml;q=0.9, application/xml;q=0.8, text/xml;q=0.8, text/html;q=0.7, unknown/unknown;q=0.1, application/unknown;q=0.1, */*;q=0.1');
                     $file = new $this->file_class($this->feed_url, $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen);
                 }
             }
             // If the file connection has an error, set SimplePie::error to that and quit
             // NOTE(review): `===` binds tighter than `&` in PHP, so the first operand
             // below is evaluated as `$file->method & (SIMPLEPIE_FILE_SOURCE_REMOTE === 0)`
             // rather than `($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0`.
             // Confirm the intended meaning before adding parentheses, since that
             // would change which failed fetches are reported here.
             if (!$file->success && !($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300))) {
                 $this->error = $file->error;
                 // Stale cached data (loaded above) still counts as success.
                 if (!empty($this->data)) {
                     return true;
                 } else {
                     return false;
                 }
             }
             if (!$this->force_feed) {
                 // Check if the supplied URL is a feed, if it isn't, look for it.
                 $locate = new $this->locator_class($file, $this->timeout, $this->useragent, $this->file_class, $this->max_checked_feeds, $this->content_type_sniffer_class);
                 if (!$locate->is_feed($file)) {
                     // We need to unset this so that if SimplePie::set_file() has been called that object is untouched
                     unset($file);
                     if ($file = $locate->find($this->autodiscovery, $this->all_discovered_feeds)) {
                         if ($cache) {
                             // Store an autodiscovery record under the original URL, then
                             // switch to a cache entry keyed by the discovered feed URL.
                             $this->data = array('url' => $this->feed_url, 'feed_url' => $file->url, 'build' => SIMPLEPIE_BUILD);
                             if (!$cache->save($this)) {
                                 trigger_error("{$this->cache_location} is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                             }
                             $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, call_user_func($this->cache_name_function, $file->url), 'spc');
                         }
                         $this->feed_url = $file->url;
                     } else {
                         $this->error = "A feed could not be found at {$this->feed_url}. A feed with an invalid mime type may fall victim to this error, or " . SIMPLEPIE_NAME . " was unable to auto-discover it.. Use force_feed() if you are certain this URL is a real feed.";
                         SimplePie_Misc::error($this->error, E_USER_NOTICE, __FILE__, __LINE__);
                         return false;
                     }
                 }
                 $locate = null;
             }
             $headers = $file->headers;
             $data = $file->body;
             $sniffer = new $this->content_type_sniffer_class($file);
             $sniffed = $sniffer->get_type();
         } else {
             // No URL: parse the raw data supplied by the caller. $headers and
             // $sniffed stay unset on this path.
             $data = $this->raw_data;
         }
         // Set up array of possible encodings
         $encodings = array();
         // First check to see if input has been overridden.
         if ($this->input_encoding !== false) {
             $encodings[] = $this->input_encoding;
         }
         $application_types = array('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity');
         $text_types = array('text/xml', 'text/xml-external-parsed-entity');
         // RFC 3023 (only applies to sniffed content)
         if (isset($sniffed)) {
             if (in_array($sniffed, $application_types) || substr($sniffed, 0, 12) === 'application/' && substr($sniffed, -4) === '+xml') {
                 // application/*xml: header charset, then in-document detection, then UTF-8.
                 if (isset($headers['content-type']) && preg_match('/;\\x20?charset=([^;]*)/i', $headers['content-type'], $charset)) {
                     $encodings[] = strtoupper($charset[1]);
                 }
                 $encodings = array_merge($encodings, SimplePie_Misc::xml_encoding($data));
                 $encodings[] = 'UTF-8';
             } elseif (in_array($sniffed, $text_types) || substr($sniffed, 0, 5) === 'text/' && substr($sniffed, -4) === '+xml') {
                 // text/*xml: header charset, then US-ASCII.
                 if (isset($headers['content-type']) && preg_match('/;\\x20?charset=([^;]*)/i', $headers['content-type'], $charset)) {
                     $encodings[] = $charset[1];
                 }
                 $encodings[] = 'US-ASCII';
             } elseif (substr($sniffed, 0, 5) === 'text/') {
                 $encodings[] = 'US-ASCII';
             }
         }
         // Fallback to XML 1.0 Appendix F.1/UTF-8/ISO-8859-1
         $encodings = array_merge($encodings, SimplePie_Misc::xml_encoding($data));
         $encodings[] = 'UTF-8';
         $encodings[] = 'ISO-8859-1';
         // There's no point in trying an encoding twice
         $encodings = array_unique($encodings);
         // If we want the XML, just output that with the most likely encoding and quit
         if ($this->xml_dump) {
             header('Content-type: text/xml; charset=' . $encodings[0]);
             echo $data;
             exit;
         }
         // Loop through each possible encoding, till we return something, or run out of possibilities
         foreach ($encodings as $encoding) {
             // Change the encoding to UTF-8 (as we always use UTF-8 internally)
             if ($utf8_data = SimplePie_Misc::change_encoding($data, $encoding, 'UTF-8')) {
                 // Create new parser
                 $parser = new $this->parser_class();
                 // If it's parsed fine
                 if ($parser->parse($utf8_data, 'UTF-8')) {
                     $this->data = $parser->get_data();
                     // The document parsed as XML; now check it is a recognised feed type.
                     if ($this->get_type() & ~SIMPLEPIE_TYPE_NONE) {
                         if (isset($headers)) {
                             $this->data['headers'] = $headers;
                         }
                         $this->data['build'] = SIMPLEPIE_BUILD;
                         // Cache the file if caching is enabled
                         if ($cache && !$cache->save($this)) {
                             trigger_error("{$this->cache_location} is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                         }
                         return true;
                     } else {
                         $this->error = "A feed could not be found at {$this->feed_url}. This does not appear to be a valid RSS or Atom feed.";
                         SimplePie_Misc::error($this->error, E_USER_NOTICE, __FILE__, __LINE__);
                         return false;
                     }
                 }
             }
         }
         // Every candidate encoding failed. $parser is set only if at least one
         // conversion to UTF-8 succeeded, so its absence means conversion itself failed.
         if (isset($parser)) {
             // We have an error, just set SimplePie_Misc::error to it and quit
             $this->error = sprintf('This XML document is invalid, likely due to invalid characters. XML error: %s at line %d, column %d', $parser->get_error_string(), $parser->get_current_line(), $parser->get_current_column());
         } else {
             $this->error = 'The data could not be converted to UTF-8. You MUST have either the iconv or mbstring extension installed. Upgrading to PHP 5.x (which includes iconv) is highly recommended.';
         }
         SimplePie_Misc::error($this->error, E_USER_NOTICE, __FILE__, __LINE__);
         return false;
     } elseif (!empty($this->multifeed_url)) {
         // Multifeed mode: one clone of this object per URL.
         $i = 0;
         $success = 0;
         $this->multifeed_objects = array();
         foreach ($this->multifeed_url as $url) {
             $this->multifeed_objects[$i] = clone $this;
             $this->multifeed_objects[$i]->set_feed_url($url);
             // Bitwise OR accumulates "at least one feed succeeded".
             $success |= $this->multifeed_objects[$i]->init();
             $i++;
         }
         return (bool) $success;
     } else {
         return false;
     }
 }
Example #4
0
 /**
  * Sanitize a piece of feed content according to its construct type.
  *
  * Depending on the SIMPLEPIE_CONSTRUCT_* bits in $type the data is base64
  * decoded, has its wrapping XHTML div removed or normalised, is stripped of
  * comments and of the configured tags/attributes, has its URLs rewritten,
  * has its images cached and repointed at the image handler, is resolved
  * against $base (IRI) and is HTML-escaped (TEXT/IRI). Finally it is
  * converted from UTF-8 to the configured output encoding.
  *
  * Empty data is returned untouched unless the IRI bit is set.
  *
  * @param string $data Content to sanitize (treated as UTF-8)
  * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
  * @param string $base Base URL used when resolving relative URLs
  * @return string Sanitized content
  */
 function sanitize($data, $type, $base = '')
 {
     $data = trim($data);
     if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI) {
         // "Maybe HTML": treat as HTML when an entity or a closing tag is present.
         if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML) {
             if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\\/[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data)) {
                 $type |= SIMPLEPIE_CONSTRUCT_HTML;
             } else {
                 $type |= SIMPLEPIE_CONSTRUCT_TEXT;
             }
         }
         if ($type & SIMPLEPIE_CONSTRUCT_BASE64) {
             $data = base64_decode($data);
         }
         if ($type & SIMPLEPIE_CONSTRUCT_XHTML) {
             if ($this->remove_div) {
                 // Drop the wrapping <div ...> ... </div> entirely.
                 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                 $data = preg_replace('/<\\/div>$/', '', $data);
             } else {
                 // Keep the wrapper but discard its attributes.
                 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
             }
         }
         if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) {
             // Strip comments
             if ($this->strip_comments) {
                 $data = SimplePie_Misc::strip_comments($data);
             }
             // Strip out HTML tags and attributes that might cause various security problems.
             // Based on recommendations by Mark Pilgrim at:
             // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
             if ($this->strip_htmltags) {
                 foreach ($this->strip_htmltags as $tag) {
                     $pcre = "/<({$tag})" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\\/{$tag}" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\\/)?>)/siU';
                     // Repeat until no match remains, so nested occurrences are removed too.
                     while (preg_match($pcre, $data)) {
                         $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data);
                     }
                 }
             }
             if ($this->strip_attributes) {
                 foreach ($this->strip_attributes as $attrib) {
                     $data = preg_replace('/(<[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\\s*=\\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x22\\x27\\x3E][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/', '\\1\\2\\3>', $data);
                 }
             }
             // Replace relative URLs
             $this->base = $base;
             foreach ($this->replace_url_attributes as $element => $attributes) {
                 $data = $this->replace_urls($data, $element, $attributes);
             }
             // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
             if (isset($this->image_handler) && (string) $this->image_handler !== '' && $this->enable_cache) {
                 $images = SimplePie_Misc::get_element('img', $data);
                 foreach ($images as $img) {
                     if (isset($img['attribs']['src']['data'])) {
                         $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']);
                         $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi');
                         if ($cache->load()) {
                             // Already cached: just repoint the tag at the image handler.
                             $img['attribs']['src']['data'] = $this->image_handler . $image_url;
                             $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                         } else {
                             // Plain assignment: "=& new" is a parse error on PHP 7 and later,
                             // and objects are handled by reference-like handles anyway.
                             $file = new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen);
                             if ($file->success && ($file->status_code == 200 || $file->status_code > 206 && $file->status_code < 300)) {
                                 if ($cache->save(array('headers' => $file->headers, 'body' => $file->body))) {
                                     $img['attribs']['src']['data'] = $this->image_handler . $image_url;
                                     $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                                 } else {
                                     trigger_error("{$cache->name} is not writeable", E_USER_WARNING);
                                 }
                             }
                         }
                     }
                 }
             }
             // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data
             $data = trim($data);
         }
         if ($type & SIMPLEPIE_CONSTRUCT_IRI) {
             $data = SimplePie_Misc::absolutize_url($data, $base);
         }
         if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) {
             $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
         }
         if ($this->output_encoding != 'UTF-8') {
             $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding);
         }
     }
     return $data;
 }
 /**
  * Sanitize a piece of feed content according to its construct type.
  *
  * Depending on the SIMPLEPIE_CONSTRUCT_* bits in $type the data is base64
  * decoded, has its wrapping XHTML div removed or normalised, is stripped of
  * comments and of the configured tags/attributes, has its URLs rewritten,
  * has its images cached and repointed at the image handler, is resolved
  * against $base (IRI) and is HTML-escaped (TEXT/IRI). Finally it is
  * converted from UTF-8 to the configured output encoding.
  *
  * Empty data is returned untouched unless the IRI bit is set.
  *
  * @param string $data Content to sanitize (treated as UTF-8)
  * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
  * @param string $base Base URL used when resolving relative URLs
  * @return string Sanitized content
  */
 public function sanitize($data, $type, $base = '')
 {
     $data = trim($data);
     if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI) {
         // "Maybe HTML": treat as HTML when an entity or a closing tag is present.
         if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML) {
             if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\\/[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data)) {
                 $type |= SIMPLEPIE_CONSTRUCT_HTML;
             } else {
                 $type |= SIMPLEPIE_CONSTRUCT_TEXT;
             }
         }
         if ($type & SIMPLEPIE_CONSTRUCT_BASE64) {
             $data = base64_decode($data);
         }
         if ($type & SIMPLEPIE_CONSTRUCT_XHTML) {
             if ($this->remove_div) {
                 // Drop the wrapping <div ...> ... </div> entirely.
                 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                 $data = preg_replace('/<\\/div>$/', '', $data);
             } else {
                 // Keep the wrapper but discard its attributes.
                 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
             }
         }
         if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) {
             // Strip comments
             if ($this->strip_comments) {
                 $data = SimplePie_Misc::strip_comments($data);
             }
             // Strip the configured tags; repeat until no match remains so that
             // nested occurrences are removed too.
             if ($this->strip_htmltags) {
                 foreach ($this->strip_htmltags as $tag) {
                     $pcre = "/<({$tag})" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\\/{$tag}" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\\/)?>)/siU';
                     while (preg_match($pcre, $data)) {
                         $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data);
                     }
                 }
             }
             // Strip the configured attributes from every tag.
             if ($this->strip_attributes) {
                 foreach ($this->strip_attributes as $attrib) {
                     $data = preg_replace('/(<[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\\s*=\\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x22\\x27\\x3E][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/', '\\1\\2\\3>', $data);
                 }
             }
             // Replace relative URLs
             $this->base = $base;
             foreach ($this->replace_url_attributes as $element => $attributes) {
                 $data = $this->replace_urls($data, $element, $attributes);
             }
             // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
             if (isset($this->image_handler) && (string) $this->image_handler !== '' && $this->enable_cache) {
                 $images = SimplePie_Misc::get_element('img', $data);
                 foreach ($images as $img) {
                     if (isset($img['attribs']['src']['data'])) {
                         $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']);
                         $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi');
                         if ($cache->load()) {
                             // Already cached: just repoint the tag at the image handler.
                             $img['attribs']['src']['data'] = $this->image_handler . $image_url;
                             $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                         } else {
                             $file = new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen);
                             // Accept the image when it was loaded successfully and is either
                             // from a non-remote source or came with an acceptable HTTP status.
                             // The bit test is parenthesised because `===` binds tighter than
                             // `&`; without it the "non-remote" test could never be true.
                             if ($file->success && (($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300))) {
                                 if ($cache->save(array('headers' => $file->headers, 'body' => $file->body))) {
                                     $img['attribs']['src']['data'] = $this->image_handler . $image_url;
                                     $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                                 } else {
                                     trigger_error("{$this->cache_location} is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                                 }
                             }
                         }
                     }
                 }
             }
             // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data
             $data = trim($data);
         }
         if ($type & SIMPLEPIE_CONSTRUCT_IRI) {
             $data = SimplePie_Misc::absolutize_url($data, $base);
         }
         if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) {
             $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
         }
         if ($this->output_encoding !== 'UTF-8') {
             $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding);
         }
     }
     return $data;
 }
Example #6
0
 /**
  * change_encoding() must return false for the input encoding 'TESTENC'
  * (presumably chosen because no converter knows it).
  */
 public function test_nonexistant()
 {
     $converted = SimplePie_Misc::change_encoding('', 'TESTENC', 'UTF-8');

     $this->assertFalse($converted);
 }
Example #7
0
	/**
	 * Detect XML encoding, as per XML 1.0 Appendix F.1
	 *
	 * The leading bytes of $data are checked first for a byte order mark and
	 * then for the five characters that open an XML declaration, spelled out
	 * in UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE and ASCII-compatible form.
	 * When a declaration is found and parses, the encoding it names is listed
	 * before the encoding family implied by the byte pattern. If nothing
	 * matches, UTF-8 is returned as the only candidate.
	 *
	 * @todo Add support for EBCDIC
	 * @param string $data XML data
	 * @return array Possible encodings
	 */
	public static function xml_encoding($data)
	{
		$encoding = array();
		// UTF-32 Big Endian BOM
		if (substr($data, 0, 4) === "\x00\x00\xFE\xFF")
		{
			$encoding[] = 'UTF-32BE';
		}
		// UTF-32 Little Endian BOM
		elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00")
		{
			$encoding[] = 'UTF-32LE';
		}
		// UTF-16 Big Endian BOM
		elseif (substr($data, 0, 2) === "\xFE\xFF")
		{
			$encoding[] = 'UTF-16BE';
		}
		// UTF-16 Little Endian BOM
		elseif (substr($data, 0, 2) === "\xFF\xFE")
		{
			$encoding[] = 'UTF-16LE';
		}
		// UTF-8 BOM
		elseif (substr($data, 0, 3) === "\xEF\xBB\xBF")
		{
			$encoding[] = 'UTF-8';
		}
		// UTF-32 Big Endian Without BOM (declaration opener is 5 chars * 4 bytes = 20 bytes)
		elseif (substr($data, 0, 20) === "\x00\x00\x00\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C")
		{
			if ($pos = strpos($data, "\x00\x00\x00\x3F\x00\x00\x00\x3E"))
			{
				$parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32BE', 'UTF-8'));
				if ($parser->parse())
				{
					$encoding[] = $parser->encoding;
				}
			}
			$encoding[] = 'UTF-32BE';
		}
		// UTF-32 Little Endian Without BOM
		elseif (substr($data, 0, 20) === "\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C\x00\x00\x00")
		{
			if ($pos = strpos($data, "\x3F\x00\x00\x00\x3E\x00\x00\x00"))
			{
				$parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32LE', 'UTF-8'));
				if ($parser->parse())
				{
					$encoding[] = $parser->encoding;
				}
			}
			$encoding[] = 'UTF-32LE';
		}
		// UTF-16 Big Endian Without BOM (declaration opener is 5 chars * 2 bytes = 10 bytes)
		elseif (substr($data, 0, 10) === "\x00\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C")
		{
			if ($pos = strpos($data, "\x00\x3F\x00\x3E"))
			{
				// The declaration body starts right after the 10-byte opener;
				// starting at 20 would drop its first five characters and
				// overrun the terminator.
				$parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 10, $pos - 10), 'UTF-16BE', 'UTF-8'));
				if ($parser->parse())
				{
					$encoding[] = $parser->encoding;
				}
			}
			$encoding[] = 'UTF-16BE';
		}
		// UTF-16 Little Endian Without BOM
		elseif (substr($data, 0, 10) === "\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C\x00")
		{
			if ($pos = strpos($data, "\x3F\x00\x3E\x00"))
			{
				// Same 10-byte opener as the big endian case above.
				$parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 10, $pos - 10), 'UTF-16LE', 'UTF-8'));
				if ($parser->parse())
				{
					$encoding[] = $parser->encoding;
				}
			}
			$encoding[] = 'UTF-16LE';
		}
		// US-ASCII (or superset)
		elseif (substr($data, 0, 5) === "\x3C\x3F\x78\x6D\x6C")
		{
			if ($pos = strpos($data, "\x3F\x3E"))
			{
				$parser = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5));
				if ($parser->parse())
				{
					$encoding[] = $parser->encoding;
				}
			}
			$encoding[] = 'UTF-8';
		}
		// Fallback to UTF-8
		else
		{
			$encoding[] = 'UTF-8';
		}
		return $encoding;
	}
 /**
  * preg_replace_callback() handler that decodes one HTML entity.
  *
  * The entity is first decoded as ISO-8859-1 and converted to the input
  * encoding. If that leaves it unchanged, numeric entities are expanded via
  * replace_num_entity() and the result is converted from UTF-8 instead.
  * Results are memoised in $this->cached_entities, keyed by the matched text.
  *
  * @param array $data Match array; only element 0 (the full match) is used
  * @return mixed Whatever SimplePie_Misc::change_encoding() produced for the entity
  */
 function do_entites_decode($data)
 {
     if (isset($this->cached_entities[$data[0]])) {
         return $this->cached_entities[$data[0]];
     } else {
         // The charset is passed explicitly because the result is treated as
         // ISO-8859-1 on the next step. html_entity_decode() defaults to UTF-8
         // since PHP 5.4, which would make this conversion double-encode.
         $return = SimplePie_Misc::change_encoding(html_entity_decode($data[0], ENT_QUOTES, 'ISO-8859-1'), 'ISO-8859-1', $this->input_encoding);
         if ($return == $data[0]) {
             // Not decodable as ISO-8859-1: fall back to expanding numeric entities.
             $return = SimplePie_Misc::change_encoding(preg_replace_callback('/&#([x]?[0-9a-f]+);/mi', array(&$this, 'replace_num_entity'), $data[0]), 'UTF-8', $this->input_encoding);
         }
         $this->cached_entities[$data[0]] = $return;
         return $return;
     }
 }
/**
 * Convert a fetched document to UTF-8.
 *
 * The source encoding is looked up, in order, in:
 *   1. the charset parameter of the last Content-Type line in $header
 *      (the last one is used so that redirects do not win over the final response),
 *   2. the encoding attribute of an XML declaration at the very start of $html,
 *   3. an http-equiv="Content-Type" meta tag in $html.
 * If no encoding is found, or it is UTF-8, $html is returned untouched.
 * Otherwise the document is passed through SimplePie_Misc::change_encoding().
 *
 * @param string            $html   Document body
 * @param string|array|null $header Raw response headers, as one string or an array of lines
 * @return mixed $html unchanged, or the result of SimplePie_Misc::change_encoding()
 */
function rpf_convert_to_utf8($html, $header = null)
{
    if (!$html && !$header) {
        return $html;
    }

    $encoding = null;

    if (is_array($header)) {
        $header = implode("\n", $header);
    }

    // The media type and charset are both confined to a single header line
    // ([^...\r\n]) so that a "; charset=" belonging to a later, unrelated
    // header cannot be picked up, and an optional opening quote is accepted
    // so that charset="utf-8" is understood.
    if ($header && preg_match_all('/^Content-Type:\\s+([^;\\r\\n]+)(?:;\\s*charset=["\']?([^;"\'\\r\\n]*))?/im', $header, $matches, PREG_SET_ORDER)) {
        // get last matched element (in case of redirects)
        $last = end($matches);
        // TODO: avoid conversion when the media type ($last[1]) is not a
        // feed/XML/HTML type or when the charset is not a known encoding.
        if (isset($last[2])) {
            $encoding = trim($last[2], "\"' \t\r\n");
        }
    }

    if (!$encoding) {
        if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
            $encoding = trim($match[1], '"\'');
        } elseif (preg_match('/<meta\\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\\s*charset=([^;"\'>]+)/i', $html, $match)) {
            if (isset($match[1])) {
                $encoding = trim($match[1]);
            }
        }
    }

    // No declared encoding is treated as UTF-8, i.e. nothing to convert.
    if (!$encoding || strtolower($encoding) === 'utf-8') {
        return $html;
    }

    if (strtolower($encoding) === 'iso-8859-1') {
        // Replace MS Word smart quotes and the other bytes in the 0x80-0x9F
        // range with HTML entities before converting.
        $trans = array(
            "\x82" => '&sbquo;',  // Single Low-9 Quotation Mark
            "\x83" => '&fnof;',   // Latin Small Letter F With Hook
            "\x84" => '&bdquo;',  // Double Low-9 Quotation Mark
            "\x85" => '&hellip;', // Horizontal Ellipsis
            "\x86" => '&dagger;', // Dagger
            "\x87" => '&Dagger;', // Double Dagger
            "\x88" => '&circ;',   // Modifier Letter Circumflex Accent
            "\x89" => '&permil;', // Per Mille Sign
            "\x8A" => '&Scaron;', // Latin Capital Letter S With Caron
            "\x8B" => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
            "\x8C" => '&OElig;',  // Latin Capital Ligature OE
            "\x91" => '&lsquo;',  // Left Single Quotation Mark
            "\x92" => '&rsquo;',  // Right Single Quotation Mark
            "\x93" => '&ldquo;',  // Left Double Quotation Mark
            "\x94" => '&rdquo;',  // Right Double Quotation Mark
            "\x95" => '&bull;',   // Bullet
            "\x96" => '&ndash;',  // En Dash
            "\x97" => '&mdash;',  // Em Dash
            "\x98" => '&tilde;',  // Small Tilde
            "\x99" => '&trade;',  // Trade Mark Sign
            "\x9A" => '&scaron;', // Latin Small Letter S With Caron
            "\x9B" => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
            "\x9C" => '&oelig;',  // Latin Small Ligature OE
            "\x9F" => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
        );
        $html = strtr($html, $trans);
    }

    if (!class_exists('SimplePie_Misc')) {
        require_once RPFINC . 'simplepie.class.php';
    }

    return SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
}
/**
 * Convert a fetched document to UTF-8.
 *
 * The source encoding is looked up, in order, in:
 *   1. the charset parameter of the last Content-Type line in $header
 *      (the last one is used so that redirects do not win over the final response),
 *   2. the encoding attribute of an XML declaration at the very start of $html,
 *   3. an http-equiv="Content-Type" meta tag in $html.
 * If no encoding is found, or it is UTF-8, $html is returned untouched.
 * Otherwise the document is passed through SimplePie_Misc::change_encoding().
 *
 * @param string            $html   Document body
 * @param string|array|null $header Raw response headers, as one string or an array of lines
 * @return mixed $html unchanged, or the result of SimplePie_Misc::change_encoding()
 */
function convert_to_utf8($html, $header = null)
{
    if (!$html && !$header) {
        return $html;
    }

    $encoding = null;

    if (is_array($header)) {
        $header = implode("\n", $header);
    }

    // The media type and charset are both confined to a single header line
    // ([^...\r\n]) so that a "; charset=" belonging to a later, unrelated
    // header cannot be picked up and a CR from a CRLF line ending cannot end
    // up inside the charset name.
    if ($header && preg_match_all('/^Content-Type:\\s+([^;\\r\\n]+)(?:;\\s*charset=["\']?([^;"\'\\r\\n]*))?/im', $header, $matches, PREG_SET_ORDER)) {
        // get last matched element (in case of redirects)
        $last = end($matches);
        if (isset($last[2])) {
            $encoding = trim($last[2], "\"' \t\r\n");
        }
    }

    if (!$encoding) {
        if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
            $encoding = trim($match[1], '"\'');
        } elseif (preg_match('/<meta\\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) {
            if (isset($match[1])) {
                $encoding = trim($match[1]);
            }
        }
    }

    // No declared encoding is treated as UTF-8, i.e. nothing to convert.
    if (!$encoding || strtolower($encoding) === 'utf-8') {
        return $html;
    }

    if (strtolower($encoding) === 'iso-8859-1') {
        // Replace MS Word smart quotes and the other bytes in the 0x80-0x9F
        // range with HTML entities before converting.
        $trans = array(
            "\x82" => '&sbquo;',  // Single Low-9 Quotation Mark
            "\x83" => '&fnof;',   // Latin Small Letter F With Hook
            "\x84" => '&bdquo;',  // Double Low-9 Quotation Mark
            "\x85" => '&hellip;', // Horizontal Ellipsis
            "\x86" => '&dagger;', // Dagger
            "\x87" => '&Dagger;', // Double Dagger
            "\x88" => '&circ;',   // Modifier Letter Circumflex Accent
            "\x89" => '&permil;', // Per Mille Sign
            "\x8A" => '&Scaron;', // Latin Capital Letter S With Caron
            "\x8B" => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
            "\x8C" => '&OElig;',  // Latin Capital Ligature OE
            "\x91" => '&lsquo;',  // Left Single Quotation Mark
            "\x92" => '&rsquo;',  // Right Single Quotation Mark
            "\x93" => '&ldquo;',  // Left Double Quotation Mark
            "\x94" => '&rdquo;',  // Right Double Quotation Mark
            "\x95" => '&bull;',   // Bullet
            "\x96" => '&ndash;',  // En Dash
            "\x97" => '&mdash;',  // Em Dash
            "\x98" => '&tilde;',  // Small Tilde
            "\x99" => '&trade;',  // Trade Mark Sign
            "\x9A" => '&scaron;', // Latin Small Letter S With Caron
            "\x9B" => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
            "\x9C" => '&oelig;',  // Latin Small Ligature OE
            "\x9F" => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
        );
        $html = strtr($html, $trans);
    }

    return SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
}
Example #11
0
 /**
  * Sanitise a piece of feed data according to its construct type.
  *
  * Depending on the bits set in $type the data is base64-decoded, has its
  * XHTML wrapper div removed or normalised, is stripped of comments, unwanted
  * tags and attributes, has its URL attributes rewritten against $base, has
  * its images routed through the image handler, is resolved as an IRI, and/or
  * is HTML-escaped. Finally it is converted from UTF-8 to
  * $this->output_encoding when that differs.
  *
  * @param string $data Raw data to sanitise
  * @param int    $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
  * @param string $base Base URL used when resolving relative URLs
  * @return string The sanitised data (the trimmed input when it is empty and not an IRI)
  */
 function sanitize($data, $type, $base = '')
 {
     $data = trim($data);
     // An empty IRI is still processed so that it can resolve to the base URL.
     if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI) {
         if ($type & SIMPLEPIE_CONSTRUCT_BASE64) {
             $data = base64_decode($data);
         }
         if ($type & SIMPLEPIE_CONSTRUCT_XHTML) {
             if ($this->remove_div) {
                 // Drop the wrapping <div ...> ... </div> entirely.
                 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                 $data = preg_replace('/<\\/div>$/', '', $data);
             } else {
                 // Keep the wrapper but discard its attributes.
                 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
             }
         }
         if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) {
             // Strip comments
             if ($this->strip_comments) {
                 $data = SimplePie_Misc::strip_comments($data);
             }
             // Strip out HTML tags and attributes that might cause various security problems.
             // Based on recommendations by Mark Pilgrim at:
             // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
             if ($this->strip_htmltags) {
                 foreach ($this->strip_htmltags as $tag) {
                     $pcre = "/<({$tag})" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\\/{$tag}" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\\/)?>)/siU';
                     // Repeat until stable: removing one element can expose another match.
                     while (preg_match($pcre, $data)) {
                         $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data);
                     }
                 }
             }
             if ($this->strip_attributes) {
                 foreach ($this->strip_attributes as $attrib) {
                     // Three passes: double-quoted, single-quoted, then unquoted values.
                     $data = preg_replace('/ ' . trim($attrib) . '=("|&quot;)(\\w|\\s|=|-|:|;|\\/|\\.|\\?|&|,|#|!|\\(|\\)|\'|&apos;|<|>|\\+|{|})*("|&quot;)/i', '', $data);
                     $data = preg_replace('/ ' . trim($attrib) . '=(\'|&apos;)(\\w|\\s|=|-|:|;|\\/|\\.|\\?|&|,|#|!|\\(|\\)|"|&quot;|<|>|\\+|{|})*(\'|&apos;)/i', '', $data);
                     $data = preg_replace('/ ' . trim($attrib) . '=(\\w|\\s|=|-|:|;|\\/|\\.|\\?|&|,|#|!|\\(|\\)|\\+|{|})*/i', '', $data);
                 }
             }
             // Replace relative URLs
             $this->base = $base;
             foreach ($this->replace_url_attributes as $element => $attribute) {
                 // Skip element/attribute pairs that have just been stripped anyway.
                 if ((!is_array($this->strip_htmltags) || !in_array($element, $this->strip_htmltags)) && (!is_array($this->strip_attributes) || !in_array($attribute, $this->strip_attributes))) {
                     $data = $this->replace_urls($data, $element, $attribute);
                 }
             }
             // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
             if (isset($this->image_handler) && !empty($this->image_handler) && $this->enable_cache) {
                 $images = SimplePie_Misc::get_element('img', $data);
                 foreach ($images as $img) {
                     if (!empty($img['attribs']['src']['data'])) {
                         $image_url = $img['attribs']['src']['data'];
                         // Plain "= new": "=& new" is a parse error from PHP 7.0 onwards
                         // and objects are already passed by handle since PHP 5.
                         $cache = new $this->cache_class($this->cache_location, call_user_func($this->cache_name_function, $image_url), 'spi');
                         if ($cache->load()) {
                             $img['attribs']['src']['data'] = $this->image_handler . rawurlencode($img['attribs']['src']['data']);
                             $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                         } else {
                             // REMOTE_ADDR is absent outside a web request (e.g. CLI/cron),
                             // so only forward it when it exists.
                             $request_headers = isset($_SERVER['REMOTE_ADDR']) ? array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']) : array();
                             $file = new $this->file_class($image_url, $this->timeout, 5, $request_headers, $this->useragent, $this->force_fsockopen);
                             if ($file->success && ($file->status_code == 200 || $file->status_code > 206 && $file->status_code < 300)) {
                                 if (!$cache->save(array('headers' => $file->headers, 'body' => $file->body))) {
                                     trigger_error("{$cache->name} is not writeable", E_USER_WARNING);
                                 }
                                 $img['attribs']['src']['data'] = $this->image_handler . rawurlencode($img['attribs']['src']['data']);
                                 $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                             }
                         }
                     }
                 }
             }
             // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data
             $data = trim($data);
         }
         if ($type & SIMPLEPIE_CONSTRUCT_IRI) {
             $data = SimplePie_Misc::absolutize_url($data, $base);
         }
         if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) {
             $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
         }
         if ($this->output_encoding != 'UTF-8') {
             $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding);
         }
     }
     return $data;
 }