function parse(&$data, $encoding) { // Use UTF-8 if we get passed US-ASCII, as every US-ASCII character is a UTF-8 character if (strtoupper($encoding) == 'US-ASCII') { $this->encoding = 'UTF-8'; } else { $this->encoding = $encoding; } // Strip BOM: // UTF-32 Big Endian BOM if (substr($data, 0, 4) === "��") { $data = substr($data, 4); } elseif (substr($data, 0, 4) === "��") { $data = substr($data, 4); } elseif (substr($data, 0, 2) === "��") { $data = substr($data, 2); } elseif (substr($data, 0, 2) === "��") { $data = substr($data, 2); } elseif (substr($data, 0, 3) === "") { $data = substr($data, 3); } if (substr($data, 0, 5) === '<?xml' && strspn(substr($data, 5, 1), "\t\n\r ") && ($pos = strpos($data, '?>')) !== false) { $declaration = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5)); if ($declaration->parse()) { $data = substr($data, $pos + 2); $data = '<?xml version="' . $declaration->version . '" encoding="' . $encoding . '" standalone="' . ($declaration->standalone ? 'yes' : 'no') . '"?>' . $data; } else { $this->error_string = 'SimplePie bug! Please report this!'; return false; } } // Work around libxml bug $data = str_replace('<', '<', $data); $data = str_replace('>', '>', $data); $data = str_replace('&', '&', $data); $data = str_replace(''', ''', $data); $data = str_replace('"', '"', $data); $return = true; // Create the parser $xml = xml_parser_create_ns($this->encoding, $this->separator); xml_parser_set_option($xml, XML_OPTION_SKIP_WHITE, 1); xml_parser_set_option($xml, XML_OPTION_CASE_FOLDING, 0); xml_set_object($xml, $this); xml_set_character_data_handler($xml, 'cdata'); xml_set_element_handler($xml, 'tag_open', 'tag_close'); // Parse! if (!xml_parse($xml, $data, true)) { $this->error_code = xml_get_error_code($xml); $this->error_string = xml_error_string($this->error_code); $return = false; } $this->current_line = xml_get_current_line_number($xml); $this->current_column = xml_get_current_column_number($xml); $this->current_byte = xml_get_current_byte_index($xml); xml_parser_free($xml); return $return; }
public function parse(&$data, $encoding) { if (strtoupper($encoding) === 'US-ASCII') { $this->encoding = 'UTF-8'; } else { $this->encoding = $encoding; } if (substr($data, 0, 4) === "��") { $data = substr($data, 4); } elseif (substr($data, 0, 4) === "��") { $data = substr($data, 4); } elseif (substr($data, 0, 2) === "��") { $data = substr($data, 2); } elseif (substr($data, 0, 2) === "��") { $data = substr($data, 2); } elseif (substr($data, 0, 3) === "") { $data = substr($data, 3); } if (substr($data, 0, 5) === '<?xml' && strspn(substr($data, 5, 1), "\t\n\r ") && ($pos = strpos($data, '?>')) !== false) { $declaration = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5)); if ($declaration->parse()) { $data = substr($data, $pos + 2); $data = '<?xml version="' . $declaration->version . '" encoding="' . $encoding . '" standalone="' . ($declaration->standalone ? 'yes' : 'no') . '"?>' . $data; } else { $this->error_string = 'SimplePie bug! Please report this!'; return false; } } $return = true; static $xml_is_sane = null; if ($xml_is_sane === null) { $parser_check = xml_parser_create(); xml_parse_into_struct($parser_check, '<foo>&</foo>', $values); xml_parser_free($parser_check); $xml_is_sane = isset($values[0]['value']); } if ($xml_is_sane) { $xml = xml_parser_create_ns($this->encoding, $this->separator); xml_parser_set_option($xml, XML_OPTION_SKIP_WHITE, 1); xml_parser_set_option($xml, XML_OPTION_CASE_FOLDING, 0); xml_set_object($xml, $this); xml_set_character_data_handler($xml, 'cdata'); xml_set_element_handler($xml, 'tag_open', 'tag_close'); if (!xml_parse($xml, $data, true)) { $this->error_code = xml_get_error_code($xml); $this->error_string = xml_error_string($this->error_code); $return = false; } $this->current_line = xml_get_current_line_number($xml); $this->current_column = xml_get_current_column_number($xml); $this->current_byte = xml_get_current_byte_index($xml); xml_parser_free($xml); return $return; } else { libxml_clear_errors(); $xml = new XMLReader(); $xml->xml($data); while (@$xml->read()) { switch ($xml->nodeType) { case constant('XMLReader::END_ELEMENT'): if ($xml->namespaceURI !== '') { $tagName = $xml->namespaceURI . $this->separator . $xml->localName; } else { $tagName = $xml->localName; } $this->tag_close(null, $tagName); break; case constant('XMLReader::ELEMENT'): $empty = $xml->isEmptyElement; if ($xml->namespaceURI !== '') { $tagName = $xml->namespaceURI . $this->separator . $xml->localName; } else { $tagName = $xml->localName; } $attributes = array(); while ($xml->moveToNextAttribute()) { if ($xml->namespaceURI !== '') { $attrName = $xml->namespaceURI . $this->separator . $xml->localName; } else { $attrName = $xml->localName; } $attributes[$attrName] = $xml->value; } $this->tag_open(null, $tagName, $attributes); if ($empty) { $this->tag_close(null, $tagName); } break; case constant('XMLReader::TEXT'): case constant('XMLReader::CDATA'): $this->cdata(null, $xml->value); break; } } if ($error = libxml_get_last_error()) { $this->error_code = $error->code; $this->error_string = $error->message; $this->current_line = $error->line; $this->current_column = $error->column; return false; } else { return true; } } }
/** * Detect XML encoding, as per XML 1.0 Appendix F.1 * * @todo Add support for EBCDIC * @param string $data XML data * @return array Possible encodings */ public static function xml_encoding($data) { // UTF-32 Big Endian BOM if (substr($data, 0, 4) === "��") { $encoding[] = 'UTF-32BE'; } elseif (substr($data, 0, 4) === "��") { $encoding[] = 'UTF-32LE'; } elseif (substr($data, 0, 2) === "��") { $encoding[] = 'UTF-16BE'; } elseif (substr($data, 0, 2) === "��") { $encoding[] = 'UTF-16LE'; } elseif (substr($data, 0, 3) === "") { $encoding[] = 'UTF-8'; } elseif (substr($data, 0, 20) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32BE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32BE'; } elseif (substr($data, 0, 20) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32LE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32LE'; } elseif (substr($data, 0, 10) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16BE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16BE'; } elseif (substr($data, 0, 10) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16LE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16LE'; } elseif (substr($data, 0, 5) === "<?xml") { if ($pos = strpos($data, "?>")) { $parser = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5)); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-8'; } else { $encoding[] = 'UTF-8'; } return $encoding; }
/** * Detect XML encoding, as per XML 1.0 Appendix F.1 * * @todo Add support for EBCDIC * @param string $data XML data * @return array Possible encodings */ public static function xml_encoding($data) { // UTF-32 Big Endian BOM if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { $encoding[] = 'UTF-32BE'; } // UTF-32 Little Endian BOM elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { $encoding[] = 'UTF-32LE'; } // UTF-16 Big Endian BOM elseif (substr($data, 0, 2) === "\xFE\xFF") { $encoding[] = 'UTF-16BE'; } // UTF-16 Little Endian BOM elseif (substr($data, 0, 2) === "\xFF\xFE") { $encoding[] = 'UTF-16LE'; } // UTF-8 BOM elseif (substr($data, 0, 3) === "\xEF\xBB\xBF") { $encoding[] = 'UTF-8'; } // UTF-32 Big Endian Without BOM elseif (substr($data, 0, 20) === "\x00\x00\x00\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C") { if ($pos = strpos($data, "\x00\x00\x00\x3F\x00\x00\x00\x3E")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32BE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32BE'; } // UTF-32 Little Endian Without BOM elseif (substr($data, 0, 20) === "\x3C\x00\x00\x00\x3F\x00\x00\x00\x78\x00\x00\x00\x6D\x00\x00\x00\x6C\x00\x00\x00") { if ($pos = strpos($data, "\x3F\x00\x00\x00\x3E\x00\x00\x00")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 20), 'UTF-32LE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-32LE'; } // UTF-16 Big Endian Without BOM elseif (substr($data, 0, 10) === "\x00\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C") { if ($pos = strpos($data, "\x00\x3F\x00\x3E")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16BE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16BE'; } // UTF-16 Little Endian Without BOM elseif (substr($data, 0, 10) === "\x3C\x00\x3F\x00\x78\x00\x6D\x00\x6C\x00") { if ($pos = strpos($data, "\x3F\x00\x3E\x00")) { $parser = new SimplePie_XML_Declaration_Parser(SimplePie_Misc::change_encoding(substr($data, 20, $pos - 10), 'UTF-16LE', 'UTF-8')); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-16LE'; } // US-ASCII (or superset) elseif (substr($data, 0, 5) === "\x3C\x3F\x78\x6D\x6C") { if ($pos = strpos($data, "\x3F\x3E")) { $parser = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5)); if ($parser->parse()) { $encoding[] = $parser->encoding; } } $encoding[] = 'UTF-8'; } // Fallback to UTF-8 else { $encoding[] = 'UTF-8'; } return $encoding; }
function parse(&$data, $encoding) { // Use UTF-8 if we get passed US-ASCII, as every US-ASCII character is a UTF-8 character if (strtoupper($encoding) == 'US-ASCII') { $this->encoding = 'UTF-8'; } else { $this->encoding = $encoding; } // Strip BOM: // UTF-32 Big Endian BOM if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { $data = substr($data, 4); } // UTF-32 Little Endian BOM elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { $data = substr($data, 4); } // UTF-16 Big Endian BOM elseif (substr($data, 0, 2) === "\xFE\xFF") { $data = substr($data, 2); } // UTF-16 Little Endian BOM elseif (substr($data, 0, 2) === "\xFF\xFE") { $data = substr($data, 2); } // UTF-8 BOM elseif (substr($data, 0, 3) === "\xEF\xBB\xBF") { $data = substr($data, 3); } if (substr($data, 0, 5) === '<?xml' && strspn(substr($data, 5, 1), "\x09\x0A\x0D\x20") && ($pos = strpos($data, '?>')) !== false) { $declaration = new SimplePie_XML_Declaration_Parser(substr($data, 5, $pos - 5)); if ($declaration->parse()) { $data = substr($data, $pos + 2); $data = '<?xml version="' . $declaration->version . '" encoding="' . $encoding . '" standalone="' . (($declaration->standalone) ? 'yes' : 'no') . '"?>' . $data; } else { $this->error_string = 'SimplePie bug! Please report this!'; return false; } } $return = true; // Create the parser $xml = xml_parser_create_ns($this->encoding, $this->separator); xml_parser_set_option($xml, XML_OPTION_SKIP_WHITE, 1); xml_parser_set_option($xml, XML_OPTION_CASE_FOLDING, 0); xml_set_object($xml, $this); xml_set_character_data_handler($xml, 'cdata'); xml_set_element_handler($xml, 'tag_open', 'tag_close'); // Parse! if (!xml_parse($xml, $data, true)) { $this->error_code = xml_get_error_code($xml); $this->error_string = xml_error_string($this->error_code); $return = false; } $this->current_line = xml_get_current_line_number($xml); $this->current_column = xml_get_current_column_number($xml); $this->current_byte = xml_get_current_byte_index($xml); xml_parser_free($xml); return $return; }