/**
 * Filter an HTML fragment against the element and attribute whitelists.
 *
 * The input is tokenized and every token is handled according to its type:
 * - text nodes have their entities decoded using the encoding returned by
 *   MultiByte::hab_encoding();
 * - opening and empty elements that are not in self::$whitelist_elements are
 *   replaced by a text node holding their string form; whitelisted elements
 *   keep only the attributes that pass the global ('*') or the per-element
 *   entry of self::$whitelist_attributes;
 * - closing tags of non-whitelisted elements are replaced by a text node;
 * - processing instructions, comments, CDATA sections, statements and any
 *   unknown token type are dropped.
 *
 * After serialization, an opening tag immediately followed by its own
 * closing tag is removed from the output.
 *
 * @todo TODO must build DOM to really properly remove offending elements
 * @todo TODO properly filter URLs
 *
 * @param string $str The HTML to filter
 * @return string The filtered HTML
 */
public static function filter_html_elements($str)
{
    $tokenizer = new HTMLTokenizer($str);
    $tokens = $tokenizer->parse();

    // filtered token stream
    $filtered = new HTMLTokenSet();
    // names of the disallowed, non-empty elements that are currently open
    $stack = array();

    foreach ($tokens as $node) {
        switch ($node['type']) {
            case HTMLTokenizer::NODE_TYPE_TEXT:
                $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, MultiByte::hab_encoding());
                break;

            case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
            case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
                $element = strtolower($node['name']);

                if (!in_array($element, self::$whitelist_elements, true)) {
                    // Only a real opening tag of a non-void element can be
                    // matched by a closing tag later on.
                    if ($node['type'] === HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN
                        && !in_array($element, self::$elements_empty, true)) {
                        array_push($stack, $node['name']);
                    }

                    // convert the node to text instead of removing it
                    $node = array(
                        'type' => HTMLTokenizer::NODE_TYPE_TEXT,
                        'name' => '#text',
                        'value' => HTMLTokenSet::token_to_string($node),
                        'attrs' => array(),
                    );
                    break;
                }

                // The element is allowed: keep an attribute only if it is
                // listed in the global or in the per-element whitelist and
                // its value validates against the listed type.
                foreach ($node['attrs'] as $k => $v) {
                    $attr = strtolower($k);
                    $attr_ok = false;

                    foreach (array('*', $element) as $scope) {
                        if (array_key_exists($scope, self::$whitelist_attributes)
                            && array_key_exists($attr, self::$whitelist_attributes[$scope])
                            && self::check_attr_value($attr, $v, self::$whitelist_attributes[$scope][$attr])) {
                            $attr_ok = true;
                            break;
                        }
                    }

                    if (!$attr_ok) {
                        unset($node['attrs'][$k]);
                    }
                }
                break;

            case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                $element = strtolower($node['name']);

                if (!in_array($element, self::$whitelist_elements, true)) {
                    // Pop the stack only when its top really is this element.
                    // An empty or mismatching stack is left untouched
                    // (Luke, use the DOM!).
                    $top = end($stack);
                    if ($top !== false && strtolower($top) === $element) {
                        array_pop($stack);
                    }

                    // convert the node to text instead of removing it
                    $node = array(
                        'type' => HTMLTokenizer::NODE_TYPE_TEXT,
                        'name' => '#text',
                        'value' => HTMLTokenSet::token_to_string($node),
                        'attrs' => array(),
                    );
                }
                break;

            case HTMLTokenizer::NODE_TYPE_PI:
            case HTMLTokenizer::NODE_TYPE_COMMENT:
            case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
            case HTMLTokenizer::NODE_TYPE_STATEMENT:
            default:
                // anything that is neither text nor an element is dropped
                $node = null;
                break;
        }

        if ($node !== null) {
            $filtered[] = $node;
        }
    }

    // rebuild our output string
    $html = (string) $filtered;

    // Remove elements that ended up empty (<x ...></x>).
    $stripped = preg_replace('#<([^>\\s]+)(?:\\s+[^>]+)?></\\1>#u', '', $html);

    // preg_replace() returns null on failure (for example when the subject is
    // not valid UTF-8 under the /u modifier). The markup has already been
    // filtered at this point, so fall back to it rather than returning null.
    return $stripped === null ? $html : $stripped;
}
/**
 * Filter an HTML fragment against the element and attribute whitelists.
 *
 * The input is tokenized and every token is handled according to its type:
 * - text nodes have their entities decoded as UTF-8;
 * - opening and empty elements that are not in self::$whitelist_elements are
 *   replaced by a text node holding their string form; whitelisted elements
 *   keep only the attributes that pass the global ('*') or the per-element
 *   entry of self::$whitelist_attributes;
 * - closing tags of non-whitelisted elements are replaced by a text node;
 * - processing instructions, comments, CDATA sections, statements and any
 *   unknown token type are dropped.
 *
 * After serialization, an opening tag immediately followed by its own
 * closing tag is removed from the output.
 *
 * @todo TODO must build DOM to really properly remove offending elements
 * @todo TODO properly filter URLs
 *
 * @param string $str The HTML to filter
 * @return string The filtered HTML
 */
public static function filter_html_elements($str)
{
    $tokenizer = new HTMLTokenizer($str);
    $tokens = $tokenizer->parse();

    // filtered token stream
    $filtered = new HTMLTokenSet();
    // names of the disallowed, non-empty elements that are currently open
    $stack = array();

    foreach ($tokens as $node) {
        switch ($node['type']) {
            case HTMLTokenizer::NODE_TYPE_TEXT:
                // XXX use blog charset setting
                $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, 'utf-8');
                break;

            case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
            // Self-closing elements must be filtered as well, otherwise they
            // would reach the output without any whitelist check.
            case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
                $element = strtolower($node['name']);

                if (!in_array($element, self::$whitelist_elements, true)) {
                    // Only a real opening tag of a non-void element can be
                    // matched by a closing tag later on.
                    if ($node['type'] === HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN
                        && !in_array($element, self::$elements_empty, true)) {
                        array_push($stack, $node['name']);
                    }

                    // convert the node to text instead of removing it
                    $node = array(
                        'type' => HTMLTokenizer::NODE_TYPE_TEXT,
                        'name' => '#text',
                        'value' => HTMLTokenSet::token_to_string($node),
                        'attrs' => array(),
                    );
                    break;
                }

                // The element is allowed: keep an attribute only if it is
                // listed in the global or in the per-element whitelist and
                // its value validates against the type listed in the SAME
                // scope, so the value type is never read from a missing
                // per-element entry.
                // NOTE(review): assumes the '*' entry maps attribute name to
                // value type exactly like the per-element entries - confirm
                // against the whitelist definition.
                foreach ($node['attrs'] as $k => $v) {
                    $attr = strtolower($k);
                    $attr_ok = false;

                    foreach (array('*', $element) as $scope) {
                        if (array_key_exists($scope, self::$whitelist_attributes)
                            && array_key_exists($attr, self::$whitelist_attributes[$scope])
                            && self::check_attr_value($attr, $v, self::$whitelist_attributes[$scope][$attr])) {
                            $attr_ok = true;
                            break;
                        }
                    }

                    if (!$attr_ok) {
                        unset($node['attrs'][$k]);
                    }
                }
                break;

            case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                $element = strtolower($node['name']);

                if (!in_array($element, self::$whitelist_elements, true)) {
                    // Pop the stack only when its top really is this element.
                    // An empty or mismatching stack is left untouched
                    // (Luke, use the DOM!).
                    $top = end($stack);
                    if ($top !== false && strtolower($top) === $element) {
                        array_pop($stack);
                    }

                    // convert the node to text instead of removing it
                    $node = array(
                        'type' => HTMLTokenizer::NODE_TYPE_TEXT,
                        'name' => '#text',
                        'value' => HTMLTokenSet::token_to_string($node),
                        'attrs' => array(),
                    );
                }
                break;

            case HTMLTokenizer::NODE_TYPE_PI:
            case HTMLTokenizer::NODE_TYPE_COMMENT:
            case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
            case HTMLTokenizer::NODE_TYPE_STATEMENT:
            default:
                // Fail closed: a token type this filter does not know about
                // is dropped instead of being copied to the output.
                $node = null;
                break;
        }

        if ($node !== null) {
            $filtered[] = $node;
        }
    }

    // rebuild our output string
    $html = (string) $filtered;

    // Remove elements that ended up empty (<x ...></x>).
    $stripped = preg_replace('@<([^>\\s]+)(?:\\s+[^>]+)?></\\1>@', '', $html);

    // preg_replace() returns null on failure. The markup has already been
    // filtered at this point, so fall back to it rather than returning null.
    return $stripped === null ? $html : $stripped;
}