/**
 * This function looks for the next tag.
 *
 * @param  $body   String where to look for the next tag.
 * @param  $offset Start looking from here.
 * @return false if no more tags exist in the body, or
 *         an array with the following members:
 *         - string with the name of the tag
 *         - array with attributes and their values (false when the tag
 *           ends right after its name, i.e. no attribute scan was done)
 *         - integer with tag type (1, 2, or 3)
 *         - integer where the tag starts (starting "<")
 *         - integer where the tag ends (ending ">")
 *         first three members will be false, if the tag is invalid.
 */
function sq_getnxtag($body, $offset)
{
    $bodylen = strlen($body);
    if ($offset > $bodylen) {
        return false;
    }
    $lt = sq_findnxstr($body, $offset, "<");
    if ($lt == $bodylen) {
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = sq_skipspace($body, $lt + 1);
    if ($pos >= $bodylen) {
        return array(false, false, false, $lt, $bodylen);
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah" />
     */
    $tagtype = false;
    switch (substr($body, $pos, 1)) {
        case '/':
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            /**
             * A comment or an SGML declaration. Either way it is reported
             * as an invalid tag spanning up to its terminator.
             */
            if (substr($body, $pos + 1, 2) == "--") {
                $gt = strpos($body, "-->", $pos);
                if ($gt === false) {
                    // Unterminated comment: swallow the rest of the body.
                    $gt = $bodylen;
                } else {
                    // Point at the ">" of the "-->" terminator.
                    $gt += 2;
                }
                return array(false, false, false, $lt, $gt);
            }
            $gt = sq_findnxstr($body, $pos, ">");
            return array(false, false, false, $lt, $gt);
        default:
            /**
             * Assume tagtype 1 for now. If it's type 3, we'll switch values
             * later.
             */
            $tagtype = 1;
            break;
    }

    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = sq_findnxreg($body, $pos, "[^\\w\\-_]");
    if ($regary == false) {
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match) {
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah" />. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == "/>") {
                $pos++;
                $tagtype = 3;
            } else {
                $gt = sq_findnxstr($body, $pos, ">");
                return array(false, false, false, $lt, $gt);
            }
            // Intentional fall-through: a valid "/>" ends the tag like ">".
        case '>':
            return array($tagname, false, $tagtype, $lt, $pos);
        default:
            /**
             * Check if it's whitespace
             */
            if (!preg_match('/\\s/', $match)) {
                /**
                 * This is an invalid tag! Look for the next closing ">".
                 */
                $gt = sq_findnxstr($body, $lt, ">");
                return array(false, false, false, $lt, $gt);
            }
            break;
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= $bodylen) {
        $pos = sq_skipspace($body, $pos);
        if ($pos == $bodylen) {
            /**
             * Non-closed tag.
             */
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag. \G anchors the match at $pos so the remainder
         * of the body does not have to be copied on every iteration.
         */
        $matches = array();
        if (preg_match('%\G(\s*)(>|/>)%s', $body, $matches, 0, $pos)) {
            /**
             * Yep. So we did.
             */
            $pos += strlen($matches[1]);
            if ($matches[2] == "/>") {
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot;" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         * attrname="yes".
         */
        $regary = sq_findnxreg($body, $pos, "[^:\\w\\-_]");
        if ($regary == false) {
            /**
             * Looks like body ended before the end of tag.
             */
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);

        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match) {
            case '/':
                /**
                 * This is an xhtml-style tag with a closing / at the
                 * end, like so: <img src="blah" />. Check if it's followed
                 * by the closing bracket. If not, then this tag is invalid
                 */
                if (substr($body, $pos, 2) == "/>") {
                    $pos++;
                    $tagtype = 3;
                } else {
                    $gt = sq_findnxstr($body, $pos, ">");
                    return array(false, false, false, $lt, $gt);
                }
                // Intentional fall-through: a valid "/>" ends the tag like ">".
            case '>':
                $attary[$attname] = '"yes"';
                return array($tagname, $attary, $tagtype, $lt, $pos);
            default:
                /**
                 * Skip whitespace and see what we arrive at.
                 */
                $pos = sq_skipspace($body, $pos);
                $char = substr($body, $pos, 1);
                /**
                 * Two things are valid here:
                 * '=' means this is attribute type 1 2 or 3.
                 * \w means this was attribute type 4.
                 * anything else we ignore and re-loop. End of tag and
                 * invalid stuff will be caught by our checks at the beginning
                 * of the loop.
                 */
                if ($char == "=") {
                    $pos = sq_skipspace($body, $pos + 1);
                    /**
                     * Here are 3 possibilities:
                     * "'"  attribute type 1
                     * '"'  attribute type 2
                     * everything else is the content of tag type 3
                     */
                    $quot = substr($body, $pos, 1);
                    if ($quot == "'" || $quot == '"') {
                        // Search for the matching (backslash-escaped in the
                        // pattern) quote character after the opening one.
                        $regary = sq_findnxreg($body, $pos + 1, "\\" . $quot);
                        if ($regary == false) {
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval) = $regary;
                        // Step over the closing quote.
                        $pos++;
                        $attary[$attname] = $quot . $attval . $quot;
                    } else {
                        /**
                         * These are hateful. Look for \s, or >.
                         */
                        $regary = sq_findnxreg($body, $pos, "[\\s>]");
                        if ($regary == false) {
                            return array(false, false, false, $lt, $bodylen);
                        }
                        list($pos, $attval) = $regary;
                        /**
                         * If it's ">" it will be caught at the top.
                         * Embedded double quotes are entity-encoded so the
                         * value can be safely wrapped in double quotes.
                         */
                        $attval = str_replace('"', '&quot;', $attval);
                        $attary[$attname] = '"' . $attval . '"';
                    }
                } elseif (preg_match("|[\\w/>]|", $char)) {
                    /**
                     * That was attribute type 4.
                     */
                    $attary[$attname] = '"yes"';
                } else {
                    /**
                     * An illegal character. Find next '>' and return.
                     */
                    $gt = sq_findnxstr($body, $pos, ">");
                    return array(false, false, false, $lt, $gt);
                }
                break;
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, $bodylen);
}
/**
 * This function edits the style definition to make them friendly and
 * usable in squirrelmail.
 *
 * The style content is whatever sq_findnxreg() reports between $pos and
 * the next closing </style> tag in $body.
 *
 * @param  $body    the text containing the style definition
 * @param  $pos     offset in $body passed to sq_findnxreg() as the place
 *                  to start looking for the closing </style> tag
 * @param  $message the message object (passed through to sq_cid2http)
 * @param  $id      the message id (passed through to sq_cid2http)
 * @param  $mailbox the message mailbox (passed through to sq_cid2http)
 * @return array with two members:
 *         - string with the edited style content, or false if no closing
 *           </style> tag was found
 *         - integer offset at which the caller should continue: the
 *           position reported by sq_findnxreg() plus the length of the
 *           matched closing tag, or strlen($body) if none was found
 */
function sq_fixstyle($body, $pos, $message, $id, $mailbox)
{
    global $view_unsafe_images;

    $ret = sq_findnxreg($body, $pos, '</\\s*style\\s*>');
    if ($ret == false) {
        return array(false, strlen($body));
    }
    $newpos = $ret[0] + strlen($ret[2]);
    $content = $ret[1];

    /**
     * First look for general BODY style declaration, which would be
     * like so:
     * body {background: blah-blah}
     * and change it to .bodyclass so we can just assign it to a <div>
     */
    $content = preg_replace("|body(\\s*\\{.*?\\})|si", ".bodyclass\\1", $content);
    $secremoveimg = '../images/' . _("sec_remove_eng.png");

    /**
     * Fix url('blah') declarations.
     */
    $content = preg_replace(
        "|url\\s*\\(\\s*([\\'\"])\\s*\\S+script\\s*:.*?([\\'\"])\\s*\\)|si",
        "url(\\1{$secremoveimg}\\2)",
        $content
    );

    /**
     * Fix url('https*://.*) declarations but only if $view_unsafe_images
     * is false.
     */
    if (!$view_unsafe_images) {
        $content = preg_replace(
            "|url\\s*\\(\\s*([\\'\"])\\s*https*:.*?([\\'\"])\\s*\\)|si",
            "url(\\1{$secremoveimg}\\2)",
            $content
        );
    }

    /**
     * Fix urls that refer to cid:
     */
    while (preg_match("|url\\s*\\(\\s*([\\'\"]\\s*cid:.*?[\\'\"])\\s*\\)|si", $content, $matches)) {
        $cidurl = $matches[1];
        $httpurl = sq_cid2http($message, $id, $cidurl, $mailbox);
        /**
         * $cidurl comes from the (untrusted) message, so quote it before
         * embedding it in a pattern; otherwise regex metacharacters or the
         * "|" delimiter could break the replacement and this loop would
         * never terminate.
         */
        $pattern = '|url\\s*\\(\\s*' . preg_quote($cidurl, '|') . '\\s*\\)|si';
        /**
         * Escape "\" and "$" so nothing in the generated url is treated
         * as a backreference by preg_replace().
         */
        $replacement = 'url(' . addcslashes($httpurl, '\\$') . ')';
        $fixed = preg_replace($pattern, $replacement, $content);
        if ($fixed === null || $fixed === $content) {
            /**
             * Regex failure or no progress: stop instead of looping
             * forever on the same match.
             */
            break;
        }
        $content = $fixed;
    }

    /**
     * Fix stupid css declarations which lead to vulnerabilities
     * in IE.
     */
    $match = array('/expression/i', '/behaviou*r/i', '/binding/i', '/include-source/i');
    $replace = array('idiocy', 'idiocy', 'idiocy', 'idiocy');
    $content = preg_replace($match, $replace, $content);

    return array($content, $newpos);
}