/**
 * Truncate a given HTML fragment to the desired length (measured as character
 * count), additionally performing some cleanup.
 *
 * The fragment is parsed with DOMDocument, cleaned by Maid (tag/attribute
 * whitelisting), truncated by _collectNodesUpToLength(), and rendered back to
 * an HTML string without any of the internal wrapper elements.
 *
 * The libxml global settings touched by this function (entity loader,
 * internal error handling) are always restored before returning, even when
 * one of the helpers throws.
 *
 * @param string   $html          The HTML fragment to clean up
 * @param int|null $desiredLength The desired number of characters, or NULL to
 *                                do just the cleanup (but no truncating).
 * @param string   $ellipseStr    If non-empty, this string will be appended to
 *                                the last collected node when the document
 *                                gets truncated.
 * @param bool     $stripTags     If TRUE, remove *all* HTML tags. Otherwise,
 *                                keep a whitelisted 'safe' set.
 * @param bool     $nbsp          If TRUE, convert all whitespace runs to
 *                                non-breaking spaces ('&nbsp;' entities).
 *
 * @return string The cleaned (and possibly truncated) HTML fragment.
 */
function trimToHTML($html, $desiredLength = null, $ellipseStr = "…", $stripTags = false, $nbsp = false)
{
    // We use htmlmaid to clean up the HTML, but because we also have to step
    // through the DOM ourselves to perform the trimming, we do the DOM
    // loading ourselves rather than leave it to Maid.

    // Do not load external entities - this would be a security risk.
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0 (which
    // requires a libxml version that no longer loads external entities by
    // default), so only toggle it on older runtimes to avoid a deprecation
    // notice on every call.
    $toggleEntityLoader = PHP_VERSION_ID < 80000;
    $prevEntityLoaderDisabled = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    // Don't crash on invalid HTML, but recover gracefully.
    $prevInternalErrors = libxml_use_internal_errors(true);

    try {
        $doc = new \DOMDocument();

        // We need a bit of wrapping here to keep DOMDocument from adding
        // rogue nodes around our HTML. By doing it explicitly, we keep things
        // under control. The meta tag makes libxml treat the input as UTF-8.
        $doc->loadHTML(
            '<!DOCTYPE html><html>' .
            '<head><meta http-equiv="Content-type" content="text/html;charset=utf-8"/></head>' .
            '<body><div>' . $html . '</div></body>' .
            '</html>'
        );

        $options = array(
            'allowed-tags' => $stripTags
                ? array()
                : array('a', 'div', 'p', 'b', 'i', 'hr', 'br', 'strong', 'em'),
            'allowed-attribs' => array('href', 'src', 'id', 'class', 'style'),
        );
        $maid = new Maid($options);

        // html -> head -> (next sibling) body -> our explicit wrapper div.
        $wrapperDiv = $doc->documentElement->firstChild->nextSibling->firstChild;
        $cleanedNodes = $maid->clean($wrapperDiv);

        // To collect the cleaned nodes from a node list into a containing
        // node, we have to create yet another document, because cloning nodes
        // inside the same ownerDocument for some reason modifies our node
        // list.
        $cleanedDoc = new \DOMDocument();
        $cleanedNode = $cleanedDoc->createElement('div');
        $cleanedCount = $cleanedNodes->length;
        for ($i = 0; $i < $cleanedCount; ++$i) {
            $imported = $cleanedDoc->importNode($cleanedNodes->item($i), true);
            $cleanedNode->appendChild($imported);
        }

        // And now we create yet another document to collect our trimmed
        // nodes. Again, some wrapping is necessary here...
        $newDoc = new \DOMDocument();
        $newDoc->loadHTML('<html><body><div></div></body></html>');
        // html -> body -> div
        $newNode = $newDoc->documentElement->firstChild->firstChild;

        // Passed as a separate variable so the caller-visible $desiredLength
        // is never touched, whatever the helper does with its argument.
        $remaining = $desiredLength;
        _collectNodesUpToLength($cleanedNode, $newNode, $remaining, $ellipseStr);

        // Convert spaces inside text nodes to &nbsp;
        // This will actually insert the unicode non-breaking space, so we'll
        // have to massage our output at the HTML byte-string level later.
        if ($nbsp) {
            // Guard against an empty result tree: without these checks an
            // empty fragment dereferences null and hands null to the helper.
            $collected = $newNode->firstChild;
            $nbspTarget = $collected !== null ? $collected->firstChild : null;
            if ($nbspTarget !== null) {
                // NOTE(review): when the output nodes end up only one level
                // deep (see below), this targets just the first of them -
                // confirm against domSpacesToNBSP() whether that is intended.
                domSpacesToNBSP($nbspTarget);
            }
        }

        // This is some terrible shotgun hacking; for some reason, the above
        // code will sometimes put our desired nodes two levels deep, but in
        // other cases, it'll descend one less level. The proper solution
        // would be to sort out why this is, but for now, just detecting which
        // of the two happened seems to work well enough.
        if (isset($newNode->firstChild->firstChild->childNodes)) {
            $nodes = $newNode->firstChild->firstChild->childNodes;
        } elseif (isset($newNode->firstChild->childNodes)) {
            $nodes = $newNode->firstChild->childNodes;
        } else {
            $nodes = array();
        }

        // And now we convert our target nodes to HTML.
        // Because we don't want any of the wrapper nodes to appear in the
        // output, we have to convert them one by one and concatenate the
        // HTML.
        $result = '';
        foreach ($nodes as $node) {
            $result .= Maid::renderFragment($node);
        }

        if ($nbsp) {
            // Replace the raw UTF-8 no-break space (U+00A0, bytes C2 A0) with
            // the HTML entity, as promised by the $nbsp parameter docs.
            $result = str_replace("\xC2\xA0", '&nbsp;', $result);
        }

        return $result;
    } finally {
        // Restore previous libxml settings, also when a helper threw.
        if (!$prevInternalErrors) {
            // We were the ones who enabled error buffering, so the buffered
            // parse errors are ours to discard; otherwise they accumulate
            // across calls. If the caller had buffering enabled already, its
            // error list is left alone.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($prevInternalErrors);
        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($prevEntityLoaderDisabled);
        }
    }
}