/**
 * Prepares XHTML content for text difference comparison.
 *
 * The content is split into "lines" at runs of block-level tags. Every
 * transformation replaces characters one-for-one (with spaces or the `
 * marker) instead of deleting them, so the byte offset passed to each
 * ouwiki_line is also a valid offset into the original content.
 *
 * @param string $content XHTML content [NO SLASHES]
 * @return array Array of ouwiki_line objects, keyed consecutively from 1;
 *               lines for which is_empty() is true are left out
 */
function ouwiki_diff_html_to_lines($content) {
    // PHP preg_* functions don't report the start/end position of matches,
    // so instead of removing anything we overwrite it with same-length
    // padding. That keeps every position aligned with the original content.

    // Blank out script, object and style elements (they might contain
    // non-text outside tags). "/./" does not match newlines, so any newline
    // inside a match is kept as-is; the length is unchanged either way.
    // NOTE: the pattern needs a space after the tag name and has no /s
    // modifier, so attribute-less or multi-line elements are not matched.
    $content = preg_replace_callback(
        '^(<script .*?</script>)|(<object .*?</object>)|(<style .*?</style>)^i',
        function ($matches) {
            return preg_replace("/./", " ", $matches[0]);
        },
        $content
    );

    // Get rid of all ` symbols as we are going to use these for a marker later.
    $content = preg_replace('/[`]/', ' ', $content);

    // Put line breaks on block tags. Mark each line break with ` symbol.
    // Only bare tags (no attributes) are listed in the alternation.
    $blocktags = array('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'li');
    $alternatives = array();
    foreach ($blocktags as $blocktag) {
        $alternatives[] = "<{$blocktag}>|<\\/{$blocktag}>";
    }
    $taglist = implode('|', $alternatives);

    // A whole run of block tags (plus trailing whitespace) collapses into a
    // single marker: the first byte becomes ` and the rest become spaces.
    $content = preg_replace_callback(
        '/((' . $taglist . ')\\s*)+/i',
        function ($matches) {
            return "`" . preg_replace("/./", " ", substr($matches[0], 1));
        },
        $content
    );

    // Now go through splitting each line at the ` markers.
    $lines = array();
    $index = 1;
    $pos = 0;
    $length = strlen($content);
    while ($pos < $length) {
        $nextline = strpos($content, '`', $pos);
        if ($nextline === false) {
            // No more line breaks? Take content to end.
            $nextline = $length;
        }

        $linestr = substr($content, $pos, $nextline - $pos);
        $line = new ouwiki_line($linestr, $pos);
        if (!$line->is_empty()) {
            $lines[$index++] = $line;
        }

        // Skip over the marker itself.
        $pos = $nextline + 1;
    }

    return $lines;
}