/**
 * Builds an attribute list from a string containing attributes.
 *
 * The input is consumed left to right by a three-state scanner:
 *   - state 0 expects an attribute name (letters and hyphens),
 *   - state 1 expects either "=" or whitespace (a valueless attribute),
 *   - state 2 expects a value: "double quoted", 'single quoted' or bare.
 *
 * Bare values are reported with double quotes added in the 'whole' entry.
 * Values of attributes listed in $uris are passed through
 * kses_bad_protocol() together with $allowed_protocols; all other values
 * (including "style", see MDL-2684) are kept verbatim.
 *
 * When the same attribute name occurs more than once, only the first
 * occurrence is kept (foo='bar' foo='baz' results in foo='bar'). The check
 * is done on the name exactly as written.
 *
 * Whenever the scanner cannot make progress, the remaining input is handed
 * to kses_html_error() and scanning resumes in state 0 with whatever that
 * function returns.
 *
 * @param string $attr Attribute list from HTML element to closing HTML element tag
 * @param array $allowed_protocols Allowed protocols to keep
 * @return array List of attributes after parsing, keyed by attribute name.
 *               Each entry is an array with the keys 'name', 'value',
 *               'whole' (the attribute as it should be written out) and
 *               'vless' ('y' for a valueless attribute, 'n' otherwise).
 */
function kses_hair($attr, $allowed_protocols) {
    $attrarr = array();
    $mode = 0;
    $attrname = '';
    $uris = array('xmlns', 'profile', 'href', 'src', 'cite', 'classid', 'codebase', 'data', 'usemap', 'longdesc', 'action');

    // Loop through the whole attribute list
    while (strlen($attr) != 0) {
        $working = 0; // Was the last operation successful?

        switch ($mode) {
            case 0: // attribute name, href for instance
                if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
                    $attrname = $match[1];
                    $working = $mode = 1;
                    // Drop exactly what was matched. The cast keeps $attr a
                    // string on old PHP versions where substr() returns
                    // false at the end of the input.
                    $attr = (string) substr($attr, strlen($match[0]));
                }
                break;

            case 1: // equals sign or valueless ("selected")
                if (preg_match('/^\\s*=\\s*/', $attr, $match)) {
                    $working = 1;
                    $mode = 2;
                    $attr = (string) substr($attr, strlen($match[0]));
                    break;
                }

                if (preg_match('/^\\s+/', $attr, $match)) {
                    $working = 1;
                    $mode = 0;
                    if (!array_key_exists($attrname, $attrarr)) {
                        $attrarr[$attrname] = array('name' => $attrname, 'value' => '', 'whole' => $attrname, 'vless' => 'y');
                    }
                    $attr = (string) substr($attr, strlen($match[0]));
                }
                break;

            case 2: // attribute value, a URL after href= for instance
                // $quote is the quote character used when writing the
                // attribute back out; it stays empty when no value matched.
                $quote = '';
                if (preg_match('%^"([^"]*)"(\\s+|/?$)%', $attr, $match)) {
                    $quote = '"';
                } else if (preg_match("%^'([^']*)'(\\s+|/?\$)%", $attr, $match)) {
                    $quote = "'";
                } else if (preg_match("%^([^\\s\"']+)(\\s+|/?\$)%", $attr, $match)) {
                    // We add quotes to conform to W3C's HTML spec.
                    $quote = '"';
                }

                if ($quote !== '') {
                    $thisval = $match[1];
                    // MDL-2684: only URI-carrying attributes are protocol
                    // filtered, so CSS in "style" is never mistaken for a
                    // protocol.
                    if (in_array(strtolower($attrname), $uris)) {
                        $thisval = kses_bad_protocol($thisval, $allowed_protocols);
                    }

                    if (!array_key_exists($attrname, $attrarr)) {
                        $attrarr[$attrname] = array('name' => $attrname, 'value' => $thisval, 'whole' => $attrname . '=' . $quote . $thisval . $quote, 'vless' => 'n');
                    }

                    $working = 1;
                    $mode = 0;
                    // Consume the full match, including an optional "/" that
                    // closes the tag, so the matcher and the consumer can
                    // never disagree about how much input was used.
                    $attr = (string) substr($attr, strlen($match[0]));
                }
                break;
        } // switch

        if ($working == 0) {
            // Not well formed: let kses_html_error() deal with the remaining
            // input and start over with a fresh attribute name.
            $attr = kses_html_error($attr);
            $mode = 0;
        }
    } // while

    if ($mode == 1 && !array_key_exists($attrname, $attrarr)) {
        // special case, for when the attribute list ends with a valueless
        // attribute like "selected"
        $attrarr[$attrname] = array('name' => $attrname, 'value' => '', 'whole' => $attrname, 'vless' => 'y');
    }

    return $attrarr;
}
function kses_hair($attr, $allowed_protocols) {
    # Splits an attribute string into a list of attribute records.
    #
    # Every record is an array with the keys 'name', 'value', 'whole' (the
    # attribute as it should be written out) and 'vless' ('y' when the
    # attribute has no value, 'n' otherwise). Records are appended in the
    # order the attributes are found. Every value is passed through
    # kses_bad_protocol() together with $allowed_protocols, and bare values
    # are written back with double quotes.

    $parsed = array();
    $state = 0;  # 0 = expect name, 1 = expect "=" or whitespace, 2 = expect value
    $name = '';

    while (strlen($attr) != 0) {
        $progressed = false;

        if ($state == 0) {
            # attribute name, href for instance
            if (preg_match('/^([-a-zA-Z]+)/', $attr, $hit)) {
                $name = $hit[1];
                $state = 1;
                $progressed = true;
                $attr = (string) substr($attr, strlen($hit[0]));
            }
        } elseif ($state == 1) {
            if (preg_match('/^\\s*=\\s*/', $attr, $hit)) {
                # equals sign: a value follows
                $state = 2;
                $progressed = true;
                $attr = (string) substr($attr, strlen($hit[0]));
            } elseif (preg_match('/^\\s+/', $attr, $hit)) {
                # whitespace: the attribute was valueless ("selected")
                $state = 0;
                $progressed = true;
                $parsed[] = array('name' => $name, 'value' => '', 'whole' => $name, 'vless' => 'y');
                $attr = (string) substr($attr, strlen($hit[0]));
            }
        } elseif ($state == 2) {
            # attribute value, a URL after href= for instance
            if (preg_match('/^"([^"]*)"(\\s+|$)/', $attr, $hit)) {
                $clean = kses_bad_protocol($hit[1], $allowed_protocols);
                $parsed[] = array('name' => $name, 'value' => $clean, 'whole' => "{$name}=\"{$clean}\"", 'vless' => 'n');
                $state = 0;
                $progressed = true;
                $attr = (string) substr($attr, strlen($hit[0]));
            } elseif (preg_match("/^'([^']*)'(\\s+|\$)/", $attr, $hit)) {
                $clean = kses_bad_protocol($hit[1], $allowed_protocols);
                $parsed[] = array('name' => $name, 'value' => $clean, 'whole' => "{$name}='{$clean}'", 'vless' => 'n');
                $state = 0;
                $progressed = true;
                $attr = (string) substr($attr, strlen($hit[0]));
            } elseif (preg_match("%^([^\\s\"']+)(\\s+|\$)%", $attr, $hit)) {
                $clean = kses_bad_protocol($hit[1], $allowed_protocols);
                # We add quotes to conform to W3C's HTML spec.
                $parsed[] = array('name' => $name, 'value' => $clean, 'whole' => "{$name}=\"{$clean}\"", 'vless' => 'n');
                $state = 0;
                $progressed = true;
                $attr = (string) substr($attr, strlen($hit[0]));
            }
        }

        if (!$progressed) {
            # nothing matched: hand the rest to kses_html_error() and start
            # over with a fresh attribute name
            $attr = kses_html_error($attr);
            $state = 0;
        }
    }

    if ($state == 1) {
        # special case, for when the attribute list ends with a valueless
        # attribute like "selected"
        $parsed[] = array('name' => $name, 'value' => '', 'whole' => $name, 'vless' => 'y');
    }

    return $parsed;
}
function kses_hair($attr, $allowed_protocols)
###############################################################################
# Parses an attribute list into an array of attribute records and tries to do
# the right thing even when the input is malformed. Each record holds 'name',
# 'value', 'whole' (the attribute as it should be written out) and 'vless'
# ('y' for a valueless attribute, 'n' otherwise). Values without quotes or
# apostrophes get double quotes added in 'whole'. Every value is passed through
# kses_bad_protocol() with $allowed_protocols before it is stored.
###############################################################################
{
    $attrarr = array();
    $mode = 0;      # 0: expect name, 1: expect "=" or whitespace, 2: expect value
    $attrname = '';

    # Each pass either recognises one token (and jumps straight to the next
    # pass with "continue") or falls through to the error handling at the
    # bottom of the loop.
    while (strlen($attr) != 0) {
        # attribute name, href for instance
        if ($mode == 0 && preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
            $attrname = $match[1];
            $mode = 1;
            $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
            continue;
        }

        if ($mode == 1) {
            # equals sign
            if (preg_match('/^\\s*=\\s*/', $attr)) {
                $mode = 2;
                $attr = preg_replace('/^\\s*=\\s*/', '', $attr);
                continue;
            }

            # valueless ("selected")
            if (preg_match('/^\\s+/', $attr)) {
                $mode = 0;
                $attrarr[] = array('name' => $attrname, 'value' => '', 'whole' => $attrname, 'vless' => 'y');
                $attr = preg_replace('/^\\s+/', '', $attr);
                continue;
            }
        }

        # attribute value, a URL after href= for instance
        if ($mode == 2) {
            # "value"
            if (preg_match('/^"([^"]*)"(\\s+|$)/', $attr, $match)) {
                $thisval = kses_bad_protocol($match[1], $allowed_protocols);
                $attrarr[] = array('name' => $attrname, 'value' => $thisval, 'whole' => "{$attrname}=\"{$thisval}\"", 'vless' => 'n');
                $mode = 0;
                $attr = preg_replace('/^"[^"]*"(\\s+|$)/', '', $attr);
                continue;
            }

            # 'value'
            if (preg_match("/^'([^']*)'(\\s+|\$)/", $attr, $match)) {
                $thisval = kses_bad_protocol($match[1], $allowed_protocols);
                $attrarr[] = array('name' => $attrname, 'value' => $thisval, 'whole' => "{$attrname}='{$thisval}'", 'vless' => 'n');
                $mode = 0;
                $attr = preg_replace("/^'[^']*'(\\s+|\$)/", '', $attr);
                continue;
            }

            # value
            if (preg_match("%^([^\\s\"']+)(\\s+|\$)%", $attr, $match)) {
                $thisval = kses_bad_protocol($match[1], $allowed_protocols);
                # We add quotes to conform to W3C's HTML spec.
                $attrarr[] = array('name' => $attrname, 'value' => $thisval, 'whole' => "{$attrname}=\"{$thisval}\"", 'vless' => 'n');
                $mode = 0;
                $attr = preg_replace("%^[^\\s\"']+(\\s+|\$)%", '', $attr);
                continue;
            }
        }

        # not well formed: pass the rest to kses_html_error() and try again
        # from the "expect name" state
        $attr = kses_html_error($attr);
        $mode = 0;
    }

    if ($mode == 1) {
        # special case, for when the attribute list ends with a valueless
        # attribute like "selected"
        $attrarr[] = array('name' => $attrname, 'value' => '', 'whole' => $attrname, 'vless' => 'y');
    }

    return $attrarr;
}