Esempio n. 1
0
 /**
  * Filter an HTML string against the element and attribute whitelists.
  *
  * The input is tokenized with HTMLTokenizer and every token is handled by type:
  * - text: HTML entities in the value are decoded using MultiByte::hab_encoding();
  * - opening / empty elements: kept when the lowercased name is in self::$whitelist_elements,
  *   after removing every attribute that passes neither the global ('*') nor the
  *   per-element entry of self::$whitelist_attributes; otherwise the whole tag is
  *   replaced by a text token holding the tag's string form;
  * - closing elements: kept when whitelisted, otherwise replaced by a text token;
  * - processing instructions, comments, CDATA sections, statements and any
  *   unrecognized token type are dropped.
  *
  * Finally, elements left with nothing between their tags (`<x ...></x>`) are
  * stripped from the serialized result.
  *
  * @todo TODO must build DOM to really properly remove offending elements
  * @todo TODO properly filter URLs
  *
  * @param string $str The HTML to filter.
  * @return string The filtered HTML.
  */
 public static function filter_html_elements($str)
 {
     $tokenizer = new HTMLTokenizer($str);
     $tokens = $tokenizer->parse();

     // filter token stream
     $filtered = new HTMLTokenSet();
     // Names of rejected, non-empty elements that are still open. This is only
     // bookkeeping for now: nothing below reads it to decide what gets emitted.
     $stack = array();
     foreach ($tokens as $node) {
         switch ($node['type']) {
             case HTMLTokenizer::NODE_TYPE_TEXT:
                 $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, MultiByte::hab_encoding());
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
             case HTMLTokenizer::NODE_TYPE_ELEMENT_EMPTY:
                 $name = strtolower($node['name']);
                 // is this element allowed at all?
                 if (!in_array($name, self::$whitelist_elements, true)) {
                     if (!in_array($name, self::$elements_empty, true)) {
                         $stack[] = $node['name'];
                     }
                     // convert the node to text instead of removing it
                     // NOTE(review): presumably HTMLTokenSet escapes text tokens when serializing - confirm
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                 } else {
                     // per-element attribute rules, if this element has any
                     $element_rules = array_key_exists($name, self::$whitelist_attributes) ? self::$whitelist_attributes[$name] : array();
                     // check attributes
                     foreach ($node['attrs'] as $k => $v) {
                         $attr = strtolower($k);
                         $attr_ok = false;
                         // the attribute is in the global whitelist and validates
                         if (array_key_exists($attr, self::$whitelist_attributes['*']) && self::check_attr_value($attr, $v, self::$whitelist_attributes['*'][$attr])) {
                             $attr_ok = true;
                         }
                         // the attribute is in this element's whitelist and validates
                         if (array_key_exists($attr, $element_rules) && self::check_attr_value($attr, $v, $element_rules[$attr])) {
                             $attr_ok = true;
                         }
                         // if it wasn't in one of the whitelists or failed its check, remove it
                         if (!$attr_ok) {
                             unset($node['attrs'][$k]);
                         }
                     }
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                 $name = strtolower($node['name']);
                 if (!in_array($name, self::$whitelist_elements, true)) {
                     // Pop only when the innermost rejected element matches this close tag.
                     // Popping unconditionally and pushing back would feed null to strtolower()
                     // and leave a null entry on the stack when the stack is empty.
                     if (!empty($stack) && strtolower(end($stack)) === $name) {
                         array_pop($stack);
                     }
                     // convert the node to text instead of removing it
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_PI:
             case HTMLTokenizer::NODE_TYPE_COMMENT:
             case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
             case HTMLTokenizer::NODE_TYPE_STATEMENT:
             default:
                 // never emitted; unknown token types are dropped as well
                 $node = null;
                 break;
         }
         if ($node !== null) {
             $filtered[] = $node;
         }
     }

     // rebuild our output string
     $html = (string) $filtered;
     // strip elements that ended up with no content
     $cleaned = preg_replace('#<([^>\\s]+)(?:\\s+[^>]+)?></\\1>#u', '', $html);
     // preg_replace() yields null on a PCRE error (e.g. invalid UTF-8 with the /u modifier);
     // fall back to the already-filtered markup rather than returning null.
     return $cleaned === null ? $html : $cleaned;
 }
Esempio n. 2
0
 /**
  * Filter an HTML string against the element and attribute whitelists.
  *
  * The input is tokenized with HTMLTokenizer and every token is handled by type:
  * - text: HTML entities in the value are decoded as UTF-8;
  * - opening elements: kept when the lowercased name is in self::$whitelist_elements,
  *   after removing every attribute that is neither listed in
  *   self::$whitelist_attributes['*'] nor in the element's own entry, or that fails
  *   self::check_attr_value(); otherwise the whole tag is replaced by a text token
  *   holding the tag's string form;
  * - closing elements: kept when whitelisted, otherwise replaced by a text token;
  * - processing instructions, comments, CDATA sections, statements and any
  *   unrecognized token type are dropped.
  *
  * Finally, elements left with nothing between their tags (`<x ...></x>`) are
  * stripped from the serialized result.
  *
  * @todo TODO must build DOM to really properly remove offending elements
  * @todo TODO properly filter URLs
  *
  * @param string $str The HTML to filter.
  * @return string The filtered HTML.
  */
 public static function filter_html_elements($str)
 {
     $tokenizer = new HTMLTokenizer($str);
     $tokens = $tokenizer->parse();

     // filter token stream
     $filtered = new HTMLTokenSet();
     // Names of rejected, non-empty elements that are still open. This is only
     // bookkeeping for now: nothing below reads it to decide what gets emitted.
     $stack = array();
     foreach ($tokens as $node) {
         switch ($node['type']) {
             case HTMLTokenizer::NODE_TYPE_TEXT:
                 // XXX use blog charset setting
                 $node['value'] = html_entity_decode($node['value'], ENT_QUOTES, 'utf-8');
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
                 $name = strtolower($node['name']);
                 // is this element allowed at all?
                 if (!in_array($name, self::$whitelist_elements, true)) {
                     if (!in_array($name, self::$elements_empty, true)) {
                         $stack[] = $node['name'];
                     }
                     // convert the node to text instead of removing it
                     // NOTE(review): presumably HTMLTokenSet escapes text tokens when serializing - confirm
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                 } else {
                     $has_element_rules = array_key_exists($name, self::$whitelist_attributes);
                     // check attributes
                     foreach ($node['attrs'] as $k => $v) {
                         $attr = strtolower($k);
                         $has_rule = $has_element_rules && array_key_exists($attr, self::$whitelist_attributes[$name]);
                         $listed = $has_rule || in_array($attr, self::$whitelist_attributes['*'], true);
                         // A globally whitelisted attribute may have no per-element rule. Pass null in
                         // that case (the value the missing-key lookup used to evaluate to) instead of
                         // reading an undefined index.
                         $rule = $has_rule ? self::$whitelist_attributes[$name][$attr] : null;
                         $attr_ok = $listed && self::check_attr_value($attr, $v, $rule);
                         if (!$attr_ok) {
                             unset($node['attrs'][$k]);
                         }
                     }
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                 $name = strtolower($node['name']);
                 if (!in_array($name, self::$whitelist_elements, true)) {
                     // Pop only when the innermost rejected element matches this close tag.
                     // Popping unconditionally and pushing back would feed null to strtolower()
                     // and leave a null entry on the stack when the stack is empty.
                     if (!empty($stack) && strtolower(end($stack)) === $name) {
                         array_pop($stack);
                     }
                     // convert the node to text instead of removing it
                     $node = array('type' => HTMLTokenizer::NODE_TYPE_TEXT, 'name' => '#text', 'value' => HTMLTokenSet::token_to_string($node), 'attrs' => array());
                 }
                 break;
             case HTMLTokenizer::NODE_TYPE_PI:
             case HTMLTokenizer::NODE_TYPE_COMMENT:
             case HTMLTokenizer::NODE_TYPE_CDATA_SECTION:
             case HTMLTokenizer::NODE_TYPE_STATEMENT:
             default:
                 // Fail closed: a token type this filter does not know how to vet must not
                 // reach the output unfiltered.
                 $node = null;
                 break;
         }
         if ($node !== null) {
             $filtered[] = $node;
         }
     }

     // rebuild our output string
     $html = (string) $filtered;
     // strip elements that ended up with no content
     $cleaned = preg_replace('@<([^>\\s]+)(?:\\s+[^>]+)?></\\1>@', '', $html);
     // preg_replace() yields null on a PCRE error (e.g. backtrack limit reached);
     // fall back to the already-filtered markup rather than returning null.
     return $cleaned === null ? $html : $cleaned;
 }