/**
 * Builds a DOM-like object tree from well-formed XML text.
 *
 * The parser is a hand-rolled tokenizer: the text is cut on "<" with
 * strtok() and every token is split on ">" into the tag part and the
 * character data that follows the tag. Comments, the DOCTYPE declaration and
 * processing instructions (the XML header) are removed first; "version" and
 * "encoding" found in the header are copied to the document object.
 *
 * The argument is taken by reference and the stripping steps assign to it,
 * so the caller's variable may be altered.
 *
 * @param string $strXMLText XML source text.
 * @return CDataXMLDocument|false The document, or false when the text does
 *                                not start with a tag or an end tag does not
 *                                match the currently open element.
 */
function __parse(&$strXMLText)
{
    // The five predefined XML entities and the characters they stand for.
    // "&amp;" is handled last so that "&amp;lt;" decodes to "&lt;", not "<".
    static $search = array("&gt;", "&lt;", "&apos;", "&quot;", "&amp;");
    static $replace = array(">", "<", "'", '"', "&");

    $oXMLDocument = new CDataXMLDocument();

    // strip comments
    // NOTE(review): kept as a reference assignment because the helper is not
    // visible here; if it does not return by reference this raises a notice.
    $strXMLText =& CDataXML::__stripComments($strXMLText);

    // strip the !DOCTYPE
    // The DOCTYPE declaration can contain an internal DTD in square brackets,
    // so try that form first and fall back to the plain form.
    $cnt = 0;
    $strXMLText = preg_replace("%<\\!DOCTYPE[^\\[>]*\\[.*?\\]>%is", "", $strXMLText, -1, $cnt);
    if ($cnt == 0) {
        $strXMLText = preg_replace("%<\\!DOCTYPE[^>]*>%is", "", $strXMLText);
    }

    // get document version and encoding from header
    preg_match_all("#<\\?(.*?)\\?>#i", $strXMLText, $arXMLHeader_tmp);
    foreach ($arXMLHeader_tmp[0] as $strXMLHeader_tmp) {
        preg_match_all("/([a-zA-Z:]+=\".*?\")/i", $strXMLHeader_tmp, $arXMLParam_tmp);
        foreach ($arXMLParam_tmp[0] as $strXMLParam_tmp) {
            if ($strXMLParam_tmp == '') {
                continue;
            }
            // 'name="value"' => array('name', 'value"'); drop the closing quote.
            $arXMLAttribute_tmp = explode("=\"", $strXMLParam_tmp);
            if ($arXMLAttribute_tmp[0] == "version") {
                $oXMLDocument->version = substr($arXMLAttribute_tmp[1], 0, -1);
            } elseif ($arXMLAttribute_tmp[0] == "encoding") {
                $oXMLDocument->encoding = substr($arXMLAttribute_tmp[1], 0, -1);
            }
        }
    }

    // strip header (plain assignment: a function result is not a variable
    // and cannot be bound by reference)
    $strXMLText = preg_replace("#<\\?.*?\\?>#", "", $strXMLText);

    $oXMLDocument->root =& $oXMLDocument->children;

    // Plain assignment on purpose: objects are handles, and a reference
    // alias would make every later "$currentNode = ..." overwrite
    // $oXMLDocument as well, so truncated input would return an inner node.
    /** @var CDataXMLNode $currentNode */
    $currentNode = $oXMLDocument;

    $tok = strtok($strXMLText, "<");
    $arTag = explode(">", $tok);
    if (count($arTag) < 2) {
        // There was whitespace before <, so make another try
        $tok = strtok("<");
        $arTag = explode(">", $tok);
        if (count($arTag) < 2) {
            // It's a broken XML
            return false;
        }
    }

    while ($tok !== false) {
        // $arTag[0] is the tag (name with attributes), $arTag[1] the text
        // between this tag and the next "<".
        $tagName = $arTag[0];
        $tagContent = isset($arTag[1]) ? $arTag[1] : "";

        if (strncmp($tagName, "/", 1) === 0) {
            // end tag </tagname>
            $tagName = substr($tagName, 1);

            // strip out namespace; nameSpace:Name
            if ($this->delete_ns) {
                $colonPos = strpos($tagName, ":");
                if ($colonPos > 0) {
                    $tagName = substr($tagName, $colonPos + 1);
                }
            }

            if ($currentNode->name != $tagName) {
                // Error parsing XML, unmatched tags $tagName
                return false;
            }

            $currentNode = $currentNode->_parent;

            // convert special chars
            if (!$this->TrimWhiteSpace || trim($tagContent) != "") {
                $currentNode->content = str_replace($search, $replace, $tagContent);
            }
        } elseif (strncmp($tagName, "![CDATA[", 8) === 0) {
            // CDATA may contain > and < chars, so the token has to be
            // reassembled until the "]]>" terminator is reached.
            $cdata = "";
            for ($i = 0, $c = count($arTag); $i < $c; $i++) {
                $cdata .= $arTag[$i] . ">";
                if (substr($cdata, -3) == "]]>") {
                    // Text after the terminator, up to the next "<".
                    $tagContent = isset($arTag[$i + 1]) ? $arTag[$i + 1] : "";
                    break;
                }
            }

            if (substr($cdata, -3) != "]]>") {
                // The section contains "<": undo the ">" appended by the loop,
                // restore the "<" consumed by strtok and read on.
                $cdata = substr($cdata, 0, -1) . "<";
                do {
                    $tok = strtok(">"); // unfortunately strtok eats > followed by >
                    $cdata .= $tok . ">";
                    // until end of string or end of cdata found
                } while ($tok !== false && substr($tok, -2) != "]]");

                // Whatever $arTag[1] held was part of the section itself, not
                // text following it; trailing text is picked up below.
                $tagContent = "";
            }

            // Cut off the leading "![CDATA[" and the trailing "]]>".
            $cdataSection = substr($cdata, 8, -3);

            // new CDATA node
            $subNode = new CDataXMLNode();
            $subNode->name = "cdata-section";
            $subNode->content = $cdataSection;

            $currentNode->children[] = $subNode;
            $currentNode->content .= $subNode->content;

            // convert special chars; appended so that the CDATA text added
            // just above is not discarded
            if (!$this->TrimWhiteSpace || trim($tagContent) != "") {
                $currentNode->content .= str_replace($search, $replace, $tagContent);
            }
        } else {
            // normal start tag: the name ends at the first space or newline.
            // A match at position 0 is deliberately treated like "not found".
            $firstSpaceEnd = strpos($tagName, " ");
            $firstNewlineEnd = strpos($tagName, "\n");
            if ($firstNewlineEnd != false && $firstSpaceEnd != false) {
                $tagNameEnd = min($firstSpaceEnd, $firstNewlineEnd);
            } elseif ($firstNewlineEnd != false) {
                $tagNameEnd = $firstNewlineEnd;
            } elseif ($firstSpaceEnd != false) {
                $tagNameEnd = $firstSpaceEnd;
            } else {
                $tagNameEnd = 0;
            }

            $justName = ($tagNameEnd > 0) ? substr($tagName, 0, $tagNameEnd) : $tagName;

            // strip out namespace; nameSpace:Name
            if ($this->delete_ns) {
                $colonPos = strpos($justName, ":");
                if ($colonPos > 0) {
                    $justName = substr($justName, $colonPos + 1);
                }
            }

            // remove trailing / from the name if exists
            $justName = rtrim($justName, "/");

            $subNode = new CDataXMLNode();
            $subNode->_parent = $currentNode;
            $subNode->name = $justName;

            // find attributes
            if ($tagNameEnd > 0) {
                $attributePart = substr($tagName, $tagNameEnd);
                $attr = CDataXML::__parseAttributes($attributePart);
                if ($attr != false) {
                    $subNode->attributes = $attr;
                }
            }

            // convert special chars
            if (!$this->TrimWhiteSpace || trim($tagContent) != "") {
                $subNode->content = str_replace($search, $replace, $tagContent);
            }

            $currentNode->children[] = $subNode;

            // A self-closing tag (<name/>) does not become the current node.
            if (substr($tagName, -1) != "/") {
                $currentNode = $subNode;
            }
        }

        // Next iteration
        $tok = strtok("<");
        $arTag = explode(">", $tok);

        // There was text before < just after a CDATA section, so the token
        // holds no tag: keep the text and make another try. $tagContent is
        // not applied again here, it was already handled in the CDATA branch.
        if (count($arTag) < 2 && strncmp($tagName, "![CDATA[", 8) === 0) {
            $currentNode->content .= $arTag[0];

            $tok = strtok("<");
            $arTag = explode(">", $tok);
        }
    }

    return $oXMLDocument;
}