/**
 * Escape everything in an XLIFF segment except the XLIFF inline tags, so that
 * markup typed directly into the segment no longer breaks the XML.
 *
 * Background: many developers put HTML tags straight into the XLIFF source
 * because
 *   1) XLIFF tag remapping is too complex for them, and
 *   2) Trados does not lock tags inside <source> that are written as escaped
 *      text (&lt;b&gt;), but is tolerant of raw HTML tags in <source>.
 *
 * In short, people typed
 *     <source>The <b>red</b> house</source>    or worse    <source>5 > 3</source>
 * instead of
 *     <source>The <g id="1">red</g> house</source>    and    <source>5 &gt; 3</source>
 *
 * What the function does:
 *   - Opening, closing and self-closing g, x, bx, ex, bpt, ept, ph, it and mrk
 *     tags (matched case-insensitively, with or without attributes) are copied
 *     to the output unchanged.
 *   - All other text is passed through htmlspecialchars() with ENT_QUOTES,
 *     UTF-8 and double_encode = false, i.e. <, >, &, " and ' are escaped while
 *     entities that are already present are left alone.
 *
 * Examples (input  ->  output):
 *     <g id="1">Hello</g>, 4 > 3      ->  <g id="1">Hello</g>, 4 &gt; 3
 *     <b>Hey</b> & <x id="1"/>        ->  &lt;b&gt;Hey&lt;/b&gt; &amp; <x id="1"/>
 *     <gap>Hey</gap>                  ->  &lt;gap&gt;Hey&lt;/gap&gt;
 *     4 &gt; 3                        ->  4 &gt; 3
 *
 * The text between tags is escaped in place, using the byte offsets reported
 * by preg_match_all(). No placeholder strings are inserted into the content,
 * so text that merely looks like a placeholder, or a match whose text contains
 * another tag, cannot be mangled.
 *
 * TODO: cover this method with a real unit test.
 *
 * @param string $content Raw segment content, possibly containing stray markup.
 *
 * @return string The content with non-XLIFF markup escaped. An empty string is
 *                returned when htmlspecialchars() rejects a piece of text (it
 *                yields '' for input that is not valid UTF-8 because no
 *                ENT_SUBSTITUTE / ENT_IGNORE flag is passed).
 */
public static function fix_non_well_formed_xml($content) {
    if (self::$find_xliff_tags_reg === null) {
        // Inline XLIFF tags that must never be escaped.
        $xliff_tags = array('g', 'x', 'bx', 'ex', 'bpt', 'ept', 'ph', 'it', 'mrk');

        // Anatomy of the pattern built below:
        //   '<' plus optional '/'   tag start, for opening and closing tags
        //   (g|x|bx|...)            one of the tag names listed above
        //   (\s[^>]*) optional      attributes; the mandatory leading white
        //                           space keeps "g" from matching "gblabla"
        //   optional '/' plus '>'   tag end, self-closing or not
        // Per http://www.w3.org/TR/REC-xml/#sec-starttags white space is not
        // allowed between '<' (or '</') and the tag name, nor inside the
        // self-closing end, but it may follow the tag name.
        self::$find_xliff_tags_reg = '#</?(' . implode('|', $xliff_tags) . ')(\\s[^>]*)?/?>#si';
    }

    // Each entry of $matches[0] becomes array(tag text, byte offset).
    $found = preg_match_all(self::$find_xliff_tags_reg, $content, $matches, PREG_OFFSET_CAPTURE);
    $tags = $found ? $matches[0] : array();

    // Sentinel: an empty "tag" at the very end lets the loop below also
    // escape the text that follows the last real tag.
    $tags[] = array('', strlen($content));

    $result = '';
    $cursor = 0;
    foreach ($tags as $match) {
        $tag = $match[0];
        $offset = $match[1];

        // Offsets are byte offsets, so byte-wise substr()/strlen() are the
        // right tools here. substr() is only called for a non-empty slice.
        if ($offset > $cursor) {
            $text = substr($content, $cursor, $offset - $cursor);
            $escaped = htmlspecialchars($text, ENT_QUOTES, 'UTF-8', false);

            // A non-empty slice that escapes to '' means htmlspecialchars()
            // rejected it; give up on the whole string instead of silently
            // dropping only this piece.
            if ($escaped === '') {
                return '';
            }

            $result .= $escaped;
        }

        $result .= $tag;
        $cursor = $offset + strlen($tag);
    }

    return $result;
}