/**
 * Parses a post as an HTML fragment, runs process() over the resulting
 * document and returns it serialized as HTML.
 *
 * @param string $postText  Raw post markup handed to HTML5_Parser::parseFragment().
 * @param string $name      Forwarded unchanged as the last argument of parseFragment().
 * @param bool   $noSmilies Copied into the global $postNoSmilies before parsing.
 *
 * @return string The serialized document, or "" when the fragment produced no nodes.
 */
function cleanUpPost($postText, $name = "", $noSmilies = false)
{
    global $filter_tags, $bbcode, $postNoSmilies;

    // Published through a global; must be set before the parser runs.
    $postNoSmilies = $noSmilies;

    require_once 'HTML5/Parser.php';

    $nodes = HTML5_Parser::parseFragment($postText, null, null, $filter_tags, $bbcode, $name);

    // DOMNodeList::item() returns null for an empty list; check it before
    // touching ownerDocument instead of dereferencing null.
    $firstNode = $nodes->item(0);
    if ($firstNode === null) {
        return "";
    }

    $document = $firstNode->ownerDocument;

    // The DOM tree is empty. Ignore it.
    if (!$document) {
        return "";
    }

    process($document);

    return $document->saveHTML();
}
/**
 * HTML5_Parser::parseFragment() must hand back a DOMNodeList for a
 * simple snippet mixing an element and trailing text.
 */
public function testParseFragment()
{
    $fragment = '<b>asdf</b> foo';
    $nodes = HTML5_Parser::parseFragment($fragment);

    $this->assertIsA($nodes, 'DOMNodeList');
}
/**
 * Replaces WordPress media elements in an article with the markup produced
 * by cc_wordpress_figure() and returns the re-serialized article body.
 *
 * Every <img>, <audio>, <video> and <object> element is inspected for a
 * "wp-<type>-<id>" class and a "size-<name>" class; the extracted id and
 * size are passed to cc_wordpress_figure(). When that call returns markup,
 * the first <figure> found in it replaces the original element.
 *
 * @param string $article Article HTML.
 *
 * @return string Serialized contents of the document's <body>.
 */
function cc_wordpress_article_filter($article)
{
    require_once 'lib/html5lib/Parser.php';

    // parseFragment() returns a DOMNodeList, which is awkward to edit in
    // place, so the article is parsed as a full document instead.
    $dom = HTML5_Parser::parse($article);

    $tagnames = array('img', 'audio', 'video', 'object');
    foreach ($tagnames as $tagname) {
        // getElementsByTagName() returns a live list: replacing nodes while
        // iterating it directly makes the loop skip elements. Iterate over a
        // snapshot instead.
        $elements = iterator_to_array($dom->getElementsByTagName($tagname), false);

        foreach ($elements as $element) {
            $class = $element->getAttribute('class');

            // relevant class name example: wp-image-18
            // A missing class yields null, which is what the unchecked
            // $matches[2] read used to produce (plus a notice).
            $id = null;
            if (preg_match('/wp-(image|audio|video|object)-([0-9]*)/', $class, $matches)) {
                $id = $matches[2];
            }

            // relevant class name example: size-medium
            // \S+ stops at the next whitespace so that following class names
            // ("size-medium wp-image-18") are not swallowed into the size.
            $size = null;
            if (preg_match('/size-(\S+)/', $class, $matches)) {
                $size = $matches[1];
            }

            // TODO: make cc_wordpress_figure() take and return a DOM fragment
            $figure_html = cc_wordpress_figure($id, $size, false);

            // only replace node if we actually got something
            if (!$figure_html) {
                continue;
            }

            $fragmentRoot = HTML5_Parser::parseFragment($figure_html)->item(0);
            if ($fragmentRoot === null) {
                continue;
            }

            $figure = $fragmentRoot->getElementsByTagName('figure')->item(0);
            if ($figure === null) {
                // The returned markup contains no <figure>; keep the original element.
                continue;
            }

            // a document context change is needed before appending the node
            $figure = $dom->importNode($figure, true);
            $element->parentNode->replaceChild($figure, $element);
        }
    }

    // hackish but reliable way to serialize the DOM
    // TODO: fix this mess
    $XML = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    $XML = str_replace('<body>', '', $XML);
    $XML = str_replace('</body>', '', $XML);

    // work around a bug regarding <style> elements including CSS '>' selectors:
    // the XML serializer escapes '>' as '&gt;', which breaks child selectors.
    // (Search and replacement were identical before, making this a no-op.)
    $XML = str_replace('&gt;', '>', $XML);

    // work around the IE bug that some elements are serialized with a null namespace
    $XML = str_replace(
        'embedNode.value = helperNode.innerHTML;',
        'embedNode.value = helperNode.innerHTML.replace(/<:/g,"<").replace(/<.:/g,"</");',
        $XML
    );

    return $XML;
}