/** * Detect XML encoding, as per XML 1.0 Appendix F.1 * * @todo Add support for EBCDIC * @param string $data XML data * @param SimplePie_Registry $registry Class registry * @return array Possible encodings */ public static function xml_encoding($data, $registry) { // UTF-32 Big Endian BOM if (substr($data, 0, 4) === "��") { $encoding[] = 'UTF-32BE'; } elseif (substr($data, 0, 4) === "��") { $encoding[] = 'UTF-32LE'; } elseif (substr($data, 0, 2) === "��") { $encoding[] = 'UTF-16BE'; } elseif (substr($data, 0, 2) === "��") { $encoding[] = 'UTF-16LE'; } elseif (substr($data, 0, 3) === "") { $encoding[] = 'UTF-8'; } elseif (substr($data, 0, 20) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32BE', 'UTF-8'))); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32BE'; } elseif (substr($data, 0, 20) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32LE', 'UTF-8'))); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32LE'; } elseif (substr($data, 0, 10) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16BE', 'UTF-8'))); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16BE'; } elseif (substr($data, 0, 10) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = $registry->create('XML_Declaration_Parser', array(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16LE', 'UTF-8'))); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16LE'; } elseif (substr($data, 0, 5) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = $registry->create('XML_Declaration_Parser', array(substr($data, 5, $pos - 5))); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-8'; } else { $encoding[] = 'UTF-8'; } return $encoding; }
function convert_to_utf8($html, $header = null) { $encoding = null; if ($html || $header) { if (is_array($header)) { $header = implode("\n", $header); } if (!$header || !preg_match_all('/^Content-Type:\\s+([^;]+)(?:;\\s*charset=["\']?([^;"\'\\n]*))?/im', $header, $match, PREG_SET_ORDER)) { // error parsing the response debug('Could not find Content-Type header in HTTP response'); } else { $match = end($match); // get last matched element (in case of redirects) if (isset($match[2])) { $encoding = trim($match[2], "\"' \r\n\v\t"); } } // TODO: check to see if encoding is supported (can we convert it?) // If it's not, result will be empty string. // For now we'll check for invalid encoding types returned by some sites, e.g. 'none' // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html if (!$encoding || $encoding == 'none') { // search for encoding in HTML - only look at the first 50000 characters // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html // TODO: improve this so it looks at smaller chunks first $html_head = substr($html, 0, 50000); if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) { $encoding = trim($match[1], '"\''); } elseif (preg_match('/<meta\\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) { $encoding = trim($match[1]); } elseif (preg_match_all('/<meta\\s+([^>]+)>/i', $html_head, $match)) { foreach ($match[1] as $_test) { if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) { $encoding = trim($_m[1]); break; } } } } if (isset($encoding)) { $encoding = trim($encoding); } // trim is important here! if (!$encoding || strtolower($encoding) == 'iso-8859-1') { // replace MS Word smart qutoes $trans = array(); $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark $trans[chr(133)] = '…'; // Horizontal Ellipsis $trans[chr(134)] = '†'; // Dagger $trans[chr(135)] = '‡'; // Double Dagger $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent $trans[chr(137)] = '‰'; // Per Mille Sign $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE $trans[chr(145)] = '‘'; // Left Single Quotation Mark $trans[chr(146)] = '’'; // Right Single Quotation Mark $trans[chr(147)] = '“'; // Left Double Quotation Mark $trans[chr(148)] = '”'; // Right Double Quotation Mark $trans[chr(149)] = '•'; // Bullet $trans[chr(150)] = '–'; // En Dash $trans[chr(151)] = '—'; // Em Dash $trans[chr(152)] = '˜'; // Small Tilde $trans[chr(153)] = '™'; // Trade Mark Sign $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark $trans[chr(156)] = 'œ'; // Latin Small Ligature OE $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis $html = strtr($html, $trans); } if (!$encoding) { debug('No character encoding found, so treating as UTF-8'); $encoding = 'utf-8'; } else { debug('Character encoding: ' . $encoding); if (strtolower($encoding) != 'utf-8') { debug('Converting to UTF-8'); $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); } } } return $html; }
public function init() { // Check absolute bare minimum requirements. if (function_exists('version_compare') && version_compare(PHP_VERSION, '5.0', '<') || !extension_loaded('xml') || !extension_loaded('pcre')) { return false; } elseif (!extension_loaded('xmlreader')) { static $xml_is_sane = null; if ($xml_is_sane === null) { $parser_check = xml_parser_create(); xml_parse_into_struct($parser_check, '<foo>&</foo>', $values); xml_parser_free($parser_check); $xml_is_sane = isset($values[0]['value']); } if (!$xml_is_sane) { return false; } } // Pass whatever was set with config options over to the sanitizer. $this->sanitize->pass_cache_data($this->cache, $this->cache_location, $this->cache_name_function, $this->cache_class); $this->sanitize->pass_file_data($this->file_class, $this->timeout, $this->useragent, $this->force_fsockopen); if ($this->feed_url !== null || $this->raw_data !== null) { $this->error = null; $this->data = array(); $this->multifeed_objects = array(); $cache = false; if ($this->feed_url !== null) { $parsed_feed_url = SimplePie_Misc::parse_url($this->feed_url); // Decide whether to enable caching if ($this->cache && $parsed_feed_url['scheme'] !== '') { $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, call_user_func($this->cache_name_function, $this->feed_url), 'spc'); } // If it's enabled and we don't want an XML dump, use the cache if ($cache && !$this->xml_dump) { // Load the Cache $this->data = $cache->load(); if (!empty($this->data)) { // If the cache is for an outdated build of SimplePie if (!isset($this->data['build']) || $this->data['build'] !== SIMPLEPIE_BUILD) { $cache->unlink(); $this->data = array(); } elseif (isset($this->data['url']) && $this->data['url'] !== $this->feed_url) { $cache = false; $this->data = array(); } elseif (isset($this->data['feed_url'])) { // If the autodiscovery cache is still valid use it. if ($cache->mtime() + $this->autodiscovery_cache_duration > time()) { // Do not need to do feed autodiscovery yet. if ($this->data['feed_url'] === $this->data['url']) { $cache->unlink(); $this->data = array(); } else { $this->set_feed_url($this->data['feed_url']); return $this->init(); } } } elseif ($cache->mtime() + $this->cache_duration < time()) { // If we have last-modified and/or etag set if (isset($this->data['headers']['last-modified']) || isset($this->data['headers']['etag'])) { $headers = array('Accept' => 'application/atom+xml, application/rss+xml, application/rdf+xml;q=0.9, application/xml;q=0.8, text/xml;q=0.8, text/html;q=0.7, unknown/unknown;q=0.1, application/unknown;q=0.1, */*;q=0.1'); if (isset($this->data['headers']['last-modified'])) { $headers['if-modified-since'] = $this->data['headers']['last-modified']; } if (isset($this->data['headers']['etag'])) { $headers['if-none-match'] = $this->data['headers']['etag']; } $file = new $this->file_class($this->feed_url, $this->timeout / 10, 5, $headers, $this->useragent, $this->force_fsockopen); if ($file->success) { if ($file->status_code === 304) { $cache->touch(); return true; } else { $headers = $file->headers; } } else { unset($file); } } } else { return true; } } else { $cache->unlink(); $this->data = array(); } } // If we don't already have the file (it'll only exist if we've opened it to check if the cache has been modified), open it. if (!isset($file)) { if (is_a($this->file, 'SimplePie_File') && $this->file->url === $this->feed_url) { $file =& $this->file; } else { $headers = array('Accept' => 'application/atom+xml, application/rss+xml, application/rdf+xml;q=0.9, application/xml;q=0.8, text/xml;q=0.8, text/html;q=0.7, unknown/unknown;q=0.1, application/unknown;q=0.1, */*;q=0.1'); $file = new $this->file_class($this->feed_url, $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen); } } // If the file connection has an error, set SimplePie::error to that and quit if (!$file->success && !($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300))) { $this->error = $file->error; if (!empty($this->data)) { return true; } else { return false; } } if (!$this->force_feed) { // Check if the supplied URL is a feed, if it isn't, look for it. $locate = new $this->locator_class($file, $this->timeout, $this->useragent, $this->file_class, $this->max_checked_feeds, $this->content_type_sniffer_class); if (!$locate->is_feed($file)) { // We need to unset this so that if SimplePie::set_file() has been called that object is untouched unset($file); if ($file = $locate->find($this->autodiscovery, $this->all_discovered_feeds)) { if ($cache) { $this->data = array('url' => $this->feed_url, 'feed_url' => $file->url, 'build' => SIMPLEPIE_BUILD); if (!$cache->save($this)) { trigger_error("{$this->cache_location} is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING); } $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, call_user_func($this->cache_name_function, $file->url), 'spc'); } $this->feed_url = $file->url; } else { $this->error = "A feed could not be found at {$this->feed_url}. A feed with an invalid mime type may fall victim to this error, or " . SIMPLEPIE_NAME . " was unable to auto-discover it.. Use force_feed() if you are certain this URL is a real feed."; SimplePie_Misc::error($this->error, E_USER_NOTICE, __FILE__, __LINE__); return false; } } $locate = null; } $headers = $file->headers; $data = $file->body; $sniffer = new $this->content_type_sniffer_class($file); $sniffed = $sniffer->get_type(); } else { $data = $this->raw_data; } // Set up array of possible encodings $encodings = array(); // First check to see if input has been overridden. if ($this->input_encoding !== false) { $encodings[] = $this->input_encoding; } $application_types = array('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity'); $text_types = array('text/xml', 'text/xml-external-parsed-entity'); // RFC 3023 (only applies to sniffed content) if (isset($sniffed)) { if (in_array($sniffed, $application_types) || substr($sniffed, 0, 12) === 'application/' && substr($sniffed, -4) === '+xml') { if (isset($headers['content-type']) && preg_match('/;\\x20?charset=([^;]*)/i', $headers['content-type'], $charset)) { $encodings[] = strtoupper($charset[1]); } $encodings = array_merge($encodings, SimplePie_Misc::xml_encoding($data)); $encodings[] = 'UTF-8'; } elseif (in_array($sniffed, $text_types) || substr($sniffed, 0, 5) === 'text/' && substr($sniffed, -4) === '+xml') { if (isset($headers['content-type']) && preg_match('/;\\x20?charset=([^;]*)/i', $headers['content-type'], $charset)) { $encodings[] = $charset[1]; } $encodings[] = 'US-ASCII'; } elseif (substr($sniffed, 0, 5) === 'text/') { $encodings[] = 'US-ASCII'; } } // Fallback to XML 1.0 Appendix F.1/UTF-8/ISO-8859-1 $encodings = array_merge($encodings, SimplePie_Misc::xml_encoding($data)); $encodings[] = 'UTF-8'; $encodings[] = 'ISO-8859-1'; // There's no point in trying an encoding twice $encodings = array_unique($encodings); // If we want the XML, just output that with the most likely encoding and quit if ($this->xml_dump) { header('Content-type: text/xml; charset=' . $encodings[0]); echo $data; exit; } // Loop through each possible encoding, till we return something, or run out of possibilities foreach ($encodings as $encoding) { // Change the encoding to UTF-8 (as we always use UTF-8 internally) if ($utf8_data = SimplePie_Misc::change_encoding($data, $encoding, 'UTF-8')) { // Create new parser $parser = new $this->parser_class(); // If it's parsed fine if ($parser->parse($utf8_data, 'UTF-8')) { $this->data = $parser->get_data(); if ($this->get_type() & ~SIMPLEPIE_TYPE_NONE) { if (isset($headers)) { $this->data['headers'] = $headers; } $this->data['build'] = SIMPLEPIE_BUILD; // Cache the file if caching is enabled if ($cache && !$cache->save($this)) { trigger_error("{$this->cache_location} is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING); } return true; } else { $this->error = "A feed could not be found at {$this->feed_url}. This does not appear to be a valid RSS or Atom feed."; SimplePie_Misc::error($this->error, E_USER_NOTICE, __FILE__, __LINE__); return false; } } } } if (isset($parser)) { // We have an error, just set SimplePie_Misc::error to it and quit $this->error = sprintf('This XML document is invalid, likely due to invalid characters. XML error: %s at line %d, column %d', $parser->get_error_string(), $parser->get_current_line(), $parser->get_current_column()); } else { $this->error = 'The data could not be converted to UTF-8. You MUST have either the iconv or mbstring extension installed. Upgrading to PHP 5.x (which includes iconv) is highly recommended.'; } SimplePie_Misc::error($this->error, E_USER_NOTICE, __FILE__, __LINE__); return false; } elseif (!empty($this->multifeed_url)) { $i = 0; $success = 0; $this->multifeed_objects = array(); foreach ($this->multifeed_url as $url) { $this->multifeed_objects[$i] = clone $this; $this->multifeed_objects[$i]->set_feed_url($url); $success |= $this->multifeed_objects[$i]->init(); $i++; } return (bool) $success; } else { return false; } }
function sanitize($data, $type, $base = '') { $data = trim($data); if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI) { if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML) { if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\\/[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data)) { $type |= SIMPLEPIE_CONSTRUCT_HTML; } else { $type |= SIMPLEPIE_CONSTRUCT_TEXT; } } if ($type & SIMPLEPIE_CONSTRUCT_BASE64) { $data = base64_decode($data); } if ($type & SIMPLEPIE_CONSTRUCT_XHTML) { if ($this->remove_div) { $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data); $data = preg_replace('/<\\/div>$/', '', $data); } else { $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data); } } if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) { // Strip comments if ($this->strip_comments) { $data = SimplePie_Misc::strip_comments($data); } // Strip out HTML tags and attributes that might cause various security problems. // Based on recommendations by Mark Pilgrim at: // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely if ($this->strip_htmltags) { foreach ($this->strip_htmltags as $tag) { $pcre = "/<({$tag})" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\\/{$tag}" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\\/)?>)/siU'; while (preg_match($pcre, $data)) { $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data); } } } if ($this->strip_attributes) { foreach ($this->strip_attributes as $attrib) { $data = preg_replace('/(<[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\\s*=\\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x22\\x27\\x3E][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/', '\\1\\2\\3>', $data); } } // Replace relative URLs $this->base = $base; foreach ($this->replace_url_attributes as $element => $attributes) { $data = $this->replace_urls($data, $element, $attributes); } // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags. if (isset($this->image_handler) && (string) $this->image_handler !== '' && $this->enable_cache) { $images = SimplePie_Misc::get_element('img', $data); foreach ($images as $img) { if (isset($img['attribs']['src']['data'])) { $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']); $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi'); if ($cache->load()) { $img['attribs']['src']['data'] = $this->image_handler . $image_url; $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data); } else { $file =& new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen); $headers = $file->headers; if ($file->success && ($file->status_code == 200 || $file->status_code > 206 && $file->status_code < 300)) { if ($cache->save(array('headers' => $file->headers, 'body' => $file->body))) { $img['attribs']['src']['data'] = $this->image_handler . $image_url; $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data); } else { trigger_error("{$cache->name} is not writeable", E_USER_WARNING); } } } } } } // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data $data = trim($data); } if ($type & SIMPLEPIE_CONSTRUCT_IRI) { $data = SimplePie_Misc::absolutize_url($data, $base); } if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) { $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8'); } if ($this->output_encoding != 'UTF-8') { $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding); } } return $data; }
public function sanitize($data, $type, $base = '') { $data = trim($data); if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI) { if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML) { if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\\/[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data)) { $type |= SIMPLEPIE_CONSTRUCT_HTML; } else { $type |= SIMPLEPIE_CONSTRUCT_TEXT; } } if ($type & SIMPLEPIE_CONSTRUCT_BASE64) { $data = base64_decode($data); } if ($type & SIMPLEPIE_CONSTRUCT_XHTML) { if ($this->remove_div) { $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data); $data = preg_replace('/<\\/div>$/', '', $data); } else { $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data); } } if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) { if ($this->strip_comments) { $data = SimplePie_Misc::strip_comments($data); } if ($this->strip_htmltags) { foreach ($this->strip_htmltags as $tag) { $pcre = "/<({$tag})" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\\/{$tag}" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\\/)?>)/siU'; while (preg_match($pcre, $data)) { $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data); } } } if ($this->strip_attributes) { foreach ($this->strip_attributes as $attrib) { $data = preg_replace('/(<[A-Za-z][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x2F\\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\\s*=\\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x22\\x27\\x3E][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20\\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/', '\\1\\2\\3>', $data); } } $this->base = $base; foreach ($this->replace_url_attributes as $element => $attributes) { $data = $this->replace_urls($data, $element, $attributes); } if (isset($this->image_handler) && (string) $this->image_handler !== '' && $this->enable_cache) { $images = SimplePie_Misc::get_element('img', $data); foreach ($images as $img) { if (isset($img['attribs']['src']['data'])) { $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']); $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi'); if ($cache->load()) { $img['attribs']['src']['data'] = $this->image_handler . $image_url; $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data); } else { $file = new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen); $headers = $file->headers; if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300))) { if ($cache->save(array('headers' => $file->headers, 'body' => $file->body))) { $img['attribs']['src']['data'] = $this->image_handler . $image_url; $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data); } else { trigger_error("{$this->cache_location} is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING); } } } } } } $data = trim($data); } if ($type & SIMPLEPIE_CONSTRUCT_IRI) { $data = SimplePie_Misc::absolutize_url($data, $base); } if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) { $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8'); } if ($this->output_encoding !== 'UTF-8') { $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding); } } return $data; }
public function test_nonexistant() { $this->assertFalse(SimplePie_Misc::change_encoding('', 'TESTENC', 'UTF-8')); }
/** * Detect XML encoding, as per XML 1.0 Appendix F.1 * * @todo Add support for EBCDIC * @param string $data XML data * @return array Possible encodings */ public static function xml_encoding($data) { // UTF-32 Big Endian BOM if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { $encoding[] = 'UTF-32BE'; } // UTF-32 Little Endian BOM elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { $encoding[] = 'UTF-32LE'; } // UTF-16 Big Endian BOM elseif (substr($data, 0, 2) === "\xFE\xFF") { $encoding[] = 'UTF-16BE'; } // UTF-16 Little Endian BOM elseif (substr($data, 0, 2) === "\xFF\xFE") { $encoding[] = 'UTF-16LE'; } // UTF-8 BOM elseif (substr($data, 0, 3) === "\xEF\xBB\xBF") { $encoding[] = 'UTF-8'; } // UTF-32 Big Endian Without BOM elseif (substr($data, 0, 20) === "\x00\x00\x00\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C") { if ($pos = strpos($data, "\x00\x00\x00\x3F\x00\x00\x00\x3E")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32BE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32BE'; } // UTF-32 Little Endian Without BOM elseif (substr($data, 0, 20) === "\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C\x00\x00\x00") { if ($pos = strpos($data, "\x3F\x00\x00\x00\x3E\x00\x00\x00")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32LE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32LE'; } // UTF-16 Big Endian Without BOM elseif (substr($data, 0, 10) === "\x00\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C") { if ($pos = strpos($data, "\x00\x3F\x00\x3E")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16BE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16BE'; } // UTF-16 Little Endian Without BOM elseif (substr($data, 0, 10) === "\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C\x00") { if ($pos = strpos($data, "\x3F\x00\x3E\x00")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16LE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16LE'; } // US-ASCII (or superset) elseif (substr($data, 0, 5) === "\x3C\x3F\x78\x6D\x6C") { if ($pos = strpos($data, "\x3F\x3E")) { $parser = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5)); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-8'; } // Fallback to UTF-8 else { $encoding[] = 'UTF-8'; } return $encoding; }
function do_entites_decode($data) { if (isset($this->cached_entities[$data[0]])) { return $this->cached_entities[$data[0]]; } else { $return = SimplePie_Misc::change_encoding(html_entity_decode($data[0], ENT_QUOTES), 'ISO-8859-1', $this->input_encoding); if ($return == $data[0]) { $return = SimplePie_Misc::change_encoding(preg_replace_callback('/&#([x]?[0-9a-f]+);/mi', array(&$this, 'replace_num_entity'), $data[0]), 'UTF-8', $this->input_encoding); } $this->cached_entities[$data[0]] = $return; return $return; } }
function rpf_convert_to_utf8($html, $header = null) { $accept = array('type' => array('application/rss+xml', 'application/xml', 'application/rdf+xml', 'text/xml', 'text/html'), 'charset' => array_diff(mb_list_encodings(), array('pass', 'auto', 'wchar', 'byte2be', 'byte2le', 'byte4be', 'byte4le', 'BASE64', 'UUENCODE', 'HTML-ENTITIES', 'Quoted-Printable', '7bit', '8bit'))); $encoding = null; if ($html || $header) { if (is_array($header)) { $header = implode("\n", $header); } if (!$header || !preg_match_all('/^Content-Type:\\s+([^;]+)(?:;\\s*charset=([^;"\'\\n]*))?/im', $header, $match, PREG_SET_ORDER)) { // error parsing the response } else { $match = end($match); // get last matched element (in case of redirects) if (!in_array(strtolower($match[1]), $accept['type'])) { // type not accepted // TODO: avoid conversion } if (isset($match[2])) { $encoding = trim($match[2], '"\''); } } if (!$encoding) { if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) { $encoding = trim($match[1], '"\''); } elseif (preg_match('/<meta\\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\\s*charset=([^;"\'>]+)/i', $html, $match)) { if (isset($match[1])) { $encoding = trim($match[1]); } } } if (!$encoding) { $encoding = 'utf-8'; } else { if (!in_array($encoding, array_map('strtolower', $accept['charset']))) { // encoding not accepted // TODO: avoid conversion } if (strtolower($encoding) != 'utf-8') { if (strtolower($encoding) == 'iso-8859-1') { // replace MS Word smart qutoes $trans = array(); $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark $trans[chr(133)] = '…'; // Horizontal Ellipsis $trans[chr(134)] = '†'; // Dagger $trans[chr(135)] = '‡'; // Double Dagger $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent $trans[chr(137)] = '‰'; // Per Mille Sign $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE $trans[chr(145)] = '‘'; // Left Single Quotation Mark $trans[chr(146)] = '’'; // Right Single Quotation Mark $trans[chr(147)] = '“'; // Left Double Quotation Mark $trans[chr(148)] = '”'; // Right Double Quotation Mark $trans[chr(149)] = '•'; // Bullet $trans[chr(150)] = '–'; // En Dash $trans[chr(151)] = '—'; // Em Dash $trans[chr(152)] = '˜'; // Small Tilde $trans[chr(153)] = '™'; // Trade Mark Sign $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark $trans[chr(156)] = 'œ'; // Latin Small Ligature OE $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis $html = strtr($html, $trans); } if (!class_exists('SimplePie_Misc')) { require_once RPFINC . 'simplepie.class.php'; } $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); /* if (function_exists('iconv')) { // iconv appears to handle certain character encodings better than mb_convert_encoding $html = iconv($encoding, 'utf-8', $html); } else { $html = mb_convert_encoding($html, 'utf-8', $encoding); } */ } } } return $html; }
function convert_to_utf8($html, $header = null) { $encoding = null; if ($html || $header) { if (is_array($header)) { $header = implode("\n", $header); } if (!$header || !preg_match_all('/^Content-Type:\\s+([^;]+)(?:;\\s*charset=["\']?([^;"\'\\n]*))?/im', $header, $match, PREG_SET_ORDER)) { // error parsing the response } else { $match = end($match); // get last matched element (in case of redirects) if (isset($match[2])) { $encoding = trim($match[2], '"\''); } } if (!$encoding) { if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) { $encoding = trim($match[1], '"\''); } elseif (preg_match('/<meta\\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) { if (isset($match[1])) { $encoding = trim($match[1]); } } } if (!$encoding) { $encoding = 'utf-8'; } else { if (strtolower($encoding) != 'utf-8') { if (strtolower($encoding) == 'iso-8859-1') { // replace MS Word smart qutoes $trans = array(); $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark $trans[chr(133)] = '…'; // Horizontal Ellipsis $trans[chr(134)] = '†'; // Dagger $trans[chr(135)] = '‡'; // Double Dagger $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent $trans[chr(137)] = '‰'; // Per Mille Sign $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE $trans[chr(145)] = '‘'; // Left Single Quotation Mark $trans[chr(146)] = '’'; // Right Single Quotation Mark $trans[chr(147)] = '“'; // Left Double Quotation Mark $trans[chr(148)] = '”'; // Right Double Quotation Mark $trans[chr(149)] = '•'; // Bullet $trans[chr(150)] = '–'; // En Dash $trans[chr(151)] = '—'; // Em Dash $trans[chr(152)] = '˜'; // Small Tilde $trans[chr(153)] = '™'; // Trade Mark Sign $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark $trans[chr(156)] = 'œ'; // Latin Small Ligature OE $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis $html = strtr($html, $trans); } $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); /* if (function_exists('iconv')) { // iconv appears to handle certain character encodings better than mb_convert_encoding $html = iconv($encoding, 'utf-8', $html); } else { $html = mb_convert_encoding($html, 'utf-8', $encoding); } */ } } } return $html; }
function sanitize($data, $type, $base = '') { $data = trim($data); if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI) { if ($type & SIMPLEPIE_CONSTRUCT_BASE64) { $data = base64_decode($data); } if ($type & SIMPLEPIE_CONSTRUCT_XHTML) { if ($this->remove_div) { $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data); $data = preg_replace('/<\\/div>$/', '', $data); } else { $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data); } } if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML)) { // Strip comments if ($this->strip_comments) { $data = SimplePie_Misc::strip_comments($data); } // Strip out HTML tags and attributes that might cause various security problems. // Based on recommendations by Mark Pilgrim at: // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely if ($this->strip_htmltags) { foreach ($this->strip_htmltags as $tag) { $pcre = "/<({$tag})" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\\/{$tag}" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\\/)?>)/siU'; while (preg_match($pcre, $data)) { $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data); } } } if ($this->strip_attributes) { foreach ($this->strip_attributes as $attrib) { $data = preg_replace('/ ' . trim($attrib) . '=("|")(\\w|\\s|=|-|:|;|\\/|\\.|\\?|&|,|#|!|\\(|\\)|\'|'|<|>|\\+|{|})*("|")/i', '', $data); $data = preg_replace('/ ' . trim($attrib) . '=(\'|')(\\w|\\s|=|-|:|;|\\/|\\.|\\?|&|,|#|!|\\(|\\)|"|"|<|>|\\+|{|})*(\'|')/i', '', $data); $data = preg_replace('/ ' . trim($attrib) . '=(\\w|\\s|=|-|:|;|\\/|\\.|\\?|&|,|#|!|\\(|\\)|\\+|{|})*/i', '', $data); } } // Replace relative URLs $this->base = $base; foreach ($this->replace_url_attributes as $element => $attribute) { if ((!is_array($this->strip_htmltags) || !in_array($element, $this->strip_htmltags)) && (!is_array($this->strip_attributes) || !in_array($attribute, $this->strip_attributes))) { $data = $this->replace_urls($data, $element, $attribute); } } // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags. if (isset($this->image_handler) && !empty($this->image_handler) && $this->enable_cache) { $images = SimplePie_Misc::get_element('img', $data); foreach ($images as $img) { if (!empty($img['attribs']['src']['data'])) { $image_url = $img['attribs']['src']['data']; $cache =& new $this->cache_class($this->cache_location, call_user_func($this->cache_name_function, $image_url), 'spi'); if ($cache->load()) { $img['attribs']['src']['data'] = $this->image_handler . rawurlencode($img['attribs']['src']['data']); $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data); } else { $file =& new $this->file_class($image_url, $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen); $headers = $file->headers; if ($file->success && ($file->status_code == 200 || $file->status_code > 206 && $file->status_code < 300)) { if (!$cache->save(array('headers' => $file->headers, 'body' => $file->body))) { trigger_error("{$cache->name} is not writeable", E_USER_WARNING); } $img['attribs']['src']['data'] = $this->image_handler . rawurlencode($img['attribs']['src']['data']); $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data); } } } } } // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data $data = trim($data); } if ($type & SIMPLEPIE_CONSTRUCT_IRI) { $data = SimplePie_Misc::absolutize_url($data, $base); } if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI)) { $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8'); } if ($this->output_encoding != 'UTF-8') { $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding); } } return $data; }