Text tokens consist of regular parsed character data (PCDATA) and raw character data (from CDATA sections). Internally, their data is stored parsed, with all entities expanded. Surprisingly, the text token does have a "tag name", #PCDATA, which is how the DTD represents it in lists of permissible child nodes.
Inheritance: extends HTMLPurifier_Token
コード例 #1
0
 /**
  * Lexes an HTML string into a flat list of tokens (text, comment, start,
  * end and empty tags) by scanning for '<' and '>' with plain string
  * functions.
  *
  * While running, the current line and column are published to the context
  * under 'CurrentLine' and 'CurrentCol'; both entries are destroyed again
  * before returning. Line/column tracking is only performed when
  * Core.MaintainLineNumbers is truthy, or when it is null and
  * Core.CollectErrors is truthy.
  *
  * @param string $html HTML to tokenize
  * @param HTMLPurifier_Config $config Read for HTML.Trusted,
  *        Core.MaintainLineNumbers, Core.CollectErrors and
  *        Core.DirectLexLineNumberSyncInterval
  * @param HTMLPurifier_Context $context Receives 'CurrentLine'/'CurrentCol';
  *        'ErrorCollector' is fetched from it when Core.CollectErrors is on
  * @return array|HTMLPurifier_Token[]
  */
 public function tokenizeHTML($html, $config, $context)
 {
     // special normalization for script tags without any armor
     // our "armor" heuristic is a < sign any number of whitespaces after
     // the first script tag
     if ($config->get('HTML.Trusted')) {
         $html = preg_replace_callback('#(<script[^>]*>)(\\s*[^<].+?)(</script>)#si', array($this, 'scriptCallback'), $html);
     }
     $html = $this->normalize($html, $config, $context);
     // our location in the text
     $cursor = 0;
     // whether or not we're parsing the inside of a tag
     $inside_tag = false;
     // result array
     $array = array();
     // This is also treated to mean maintain *column* numbers too
     $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
     if ($maintain_line_numbers === null) {
         // automatically determine line numbering by checking
         // if error collection is on
         $maintain_line_numbers = $config->get('Core.CollectErrors');
     }
     if ($maintain_line_numbers) {
         $current_line = 1;
         $current_col = 0;
         $length = strlen($html);
     } else {
         $current_line = false;
         $current_col = false;
         $length = false;
     }
     $context->register('CurrentLine', $current_line);
     $context->register('CurrentCol', $current_col);
     $nl = "\n";
     // how often to manually recalculate. This will ALWAYS be right,
     // but it's pretty wasteful. Set to 0 to turn off
     $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
     $e = false;
     if ($config->get('Core.CollectErrors')) {
         $e =& $context->get('ErrorCollector');
     }
     // iteration counter, used to decide when to resynchronize line numbers
     $loops = 0;
     // Every branch below ends in either "continue" or "break", so the
     // loop never falls off the end of its body.
     while (++$loops) {
         // $cursor is either at the start of a token, or inside of
         // a tag (i.e. there was a < immediately before it), as indicated
         // by $inside_tag
         if ($maintain_line_numbers) {
             // $rcursor, however, is always at the start of a token.
             $rcursor = $cursor - (int) $inside_tag;
             // Column number is cheap, so we calculate it every round.
             // We're interested at the *end* of the newline string, so
             // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
             // from our "rcursor" position. The negative offset restricts
             // the reverse search to the part of $html up to $rcursor.
             $nl_pos = strrpos($html, $nl, $rcursor - $length);
             $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
             // recalculate lines
             if ($synchronize_interval && $cursor > 0 && $loops % $synchronize_interval === 0) {
                 // time to synchronize!
                 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
             }
         }
         $position_next_lt = strpos($html, '<', $cursor);
         $position_next_gt = strpos($html, '>', $cursor);
         // triggers on "<b>asdf</b>" but not "asdf <b></b>"
         // special case to set up context
         if ($position_next_lt === $cursor) {
             $inside_tag = true;
             $cursor++;
         }
         if (!$inside_tag && $position_next_lt !== false) {
             // We are not inside tag and there still is another tag to parse
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor, $position_next_lt - $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_lt + 1;
             $inside_tag = true;
             continue;
         } elseif (!$inside_tag) {
             // We are not inside tag but there are no more tags
             // If we're already at the end, break
             if ($cursor === strlen($html)) {
                 break;
             }
             // Create Text of rest of string
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             $array[] = $token;
             break;
         } elseif ($inside_tag && $position_next_gt !== false) {
             // We are in tag and it is well formed
             // Grab the internals of the tag
             $strlen_segment = $position_next_gt - $cursor;
             if ($strlen_segment < 1) {
                 // there's nothing to process!
                 // NOTE(review): this token is built but never appended to
                 // $array, and $inside_tag stays true. Behaviour is kept
                 // as-is; confirm whether dropping the token is intended.
                 $token = new HTMLPurifier_Token_Text('<');
                 $cursor++;
                 continue;
             }
             $segment = substr($html, $cursor, $strlen_segment);
             if ($segment === false) {
                 // somehow, we attempted to access beyond the end of
                 // the string, defense-in-depth, reported by Nate Abele
                 break;
             }
             // Check if it's a comment
             if (substr($segment, 0, 3) === '!--') {
                 // re-determine segment length, looking for -->
                 $position_comment_end = strpos($html, '-->', $cursor);
                 if ($position_comment_end === false) {
                     // uh oh, we have a comment that extends to
                     // infinity. Can't be helped: set comment
                     // end position to end of string
                     if ($e) {
                         $e->send(E_WARNING, 'Lexer: Unclosed comment');
                     }
                     $position_comment_end = strlen($html);
                     $end = true;
                 } else {
                     $end = false;
                 }
                 $strlen_segment = $position_comment_end - $cursor;
                 $segment = substr($html, $cursor, $strlen_segment);
                 // skip the leading "!--" to get the comment body
                 $token = new HTMLPurifier_Token_Comment(substr($segment, 3, $strlen_segment - 3));
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                 }
                 $array[] = $token;
                 // step over the closing "-->" only if there was one
                 $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                 $inside_tag = false;
                 continue;
             }
             // Check if it's an end tag
             $is_end_tag = strpos($segment, '/') === 0;
             if ($is_end_tag) {
                 $type = substr($segment, 1);
                 $token = new HTMLPurifier_Token_End($type);
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // Check leading character is alphabetic, if not, we may
             // have accidentally grabbed an emoticon. Translate into
             // text and go our merry way
             if (!ctype_alpha($segment[0])) {
                 // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                 if ($e) {
                     $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                 }
                 $token = new HTMLPurifier_Token_Text('<');
                 if ($maintain_line_numbers) {
                     // Record the position only. $cursor is deliberately
                     // not advanced in this branch, so the characters after
                     // the stray "<" are lexed again by the following
                     // iterations, which count their newlines. Adding them
                     // to $current_line here as well would count them twice.
                     $token->rawPosition($current_line, $current_col);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 continue;
             }
             // Check if it is explicitly self closing, if so, remove
             // trailing slash. Remember, we could have a tag like <br>, so
             // any later token processing scripts must convert improperly
             // classified EmptyTags from StartTags.
             $is_self_closing = strrpos($segment, '/') === $strlen_segment - 1;
             if ($is_self_closing) {
                 $strlen_segment--;
                 $segment = substr($segment, 0, $strlen_segment);
             }
             // Check if there are any attributes
             $position_first_space = strcspn($segment, $this->_whitespace);
             if ($position_first_space >= $strlen_segment) {
                 // no whitespace in the segment: the whole segment is the tag name
                 if ($is_self_closing) {
                     $token = new HTMLPurifier_Token_Empty($segment);
                 } else {
                     $token = new HTMLPurifier_Token_Start($segment);
                 }
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // Grab out all the data
             $type = substr($segment, 0, $position_first_space);
             $attribute_string = trim(substr($segment, $position_first_space));
             if ($attribute_string) {
                 $attr = $this->parseAttributeString($attribute_string, $config, $context);
             } else {
                 $attr = array();
             }
             if ($is_self_closing) {
                 $token = new HTMLPurifier_Token_Empty($type, $attr);
             } else {
                 $token = new HTMLPurifier_Token_Start($type, $attr);
             }
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_gt + 1;
             $inside_tag = false;
             continue;
         } else {
             // inside tag, but there's no ending > sign
             if ($e) {
                 $e->send(E_WARNING, 'Lexer: Missing gt');
             }
             // emit the rest of the input, "<" included, as text
             $token = new HTMLPurifier_Token_Text('<' . $this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             // no cursor scroll? Hmm...
             $array[] = $token;
             break;
         }
     }
     $context->destroy('CurrentLine');
     $context->destroy('CurrentCol');
     return $array;
 }
コード例 #2
0
 /**
  * Splits an HTML string into a flat list of tokens (text, comment, start,
  * end and empty tags) by scanning for '<' and '>' with plain string
  * functions.
  *
  * For the duration of the call the current line and column are exposed
  * through the context entries 'CurrentLine' and 'CurrentCol', which are
  * destroyed again before returning. Positions are tracked only when
  * Core.MaintainLineNumbers is truthy, or when it is null and
  * Core.CollectErrors is truthy.
  *
  * @param string $html HTML to tokenize
  * @param HTMLPurifier_Config $config Read for HTML.Trusted,
  *        Core.MaintainLineNumbers, Core.CollectErrors and
  *        Core.DirectLexLineNumberSyncInterval
  * @param HTMLPurifier_Context $context Receives 'CurrentLine'/'CurrentCol';
  *        'ErrorCollector' is fetched from it when Core.CollectErrors is on
  * @return array|HTMLPurifier_Token[]
  */
 public function tokenizeHTML($html, $config, $context)
 {
     // In trusted mode, pass the contents of <script> elements whose body
     // does not start with "<" through scriptCallback before lexing.
     if ($config->get('HTML.Trusted')) {
         $html = preg_replace_callback('#(<script[^>]*>)(\\s*[^<].+?)(</script>)#si', array($this, 'scriptCallback'), $html);
     }
     $html = $this->normalize($html, $config, $context);
     // byte offset of the scan position in $html
     $cursor = 0;
     // true when a "<" has just been consumed and we are reading a tag body
     $inside_tag = false;
     // collected tokens
     $array = array();
     // A null setting means "follow Core.CollectErrors".
     $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
     if ($maintain_line_numbers === null) {
         $maintain_line_numbers = $config->get('Core.CollectErrors');
     }
     if ($maintain_line_numbers) {
         $current_line = 1;
         $current_col = 0;
         $length = strlen($html);
     } else {
         $current_line = false;
         $current_col = false;
         $length = false;
     }
     $context->register('CurrentLine', $current_line);
     $context->register('CurrentCol', $current_col);
     $nl = "\n";
     // Recount lines from the start of the input every N iterations;
     // a falsy value disables the recount.
     $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
     $e = false;
     if ($config->get('Core.CollectErrors')) {
         $e =& $context->get('ErrorCollector');
     }
     // iteration counter, used for the periodic line resynchronization
     $loops = 0;
     // Every branch below ends in either "continue" or "break", so the
     // loop never falls off the end of its body.
     while (++$loops) {
         if ($maintain_line_numbers) {
             // Start of the current token: one byte back when the "<"
             // has already been consumed.
             $rcursor = $cursor - (int) $inside_tag;
             // Column = distance from the byte after the last newline.
             // The negative offset restricts the reverse search to the
             // part of $html up to $rcursor.
             $nl_pos = strrpos($html, $nl, $rcursor - $length);
             $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
             if ($synchronize_interval && $cursor > 0 && $loops % $synchronize_interval === 0) {
                 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
             }
         }
         $position_next_lt = strpos($html, '<', $cursor);
         $position_next_gt = strpos($html, '>', $cursor);
         // The cursor sits exactly on a "<": consume it and enter tag mode.
         if ($position_next_lt === $cursor) {
             $inside_tag = true;
             $cursor++;
         }
         if (!$inside_tag && $position_next_lt !== false) {
             // Text running up to the next "<".
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor, $position_next_lt - $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_lt + 1;
             $inside_tag = true;
             continue;
         } elseif (!$inside_tag) {
             // No more tags: the remainder, if any, is one last text token.
             if ($cursor === strlen($html)) {
                 break;
             }
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             $array[] = $token;
             break;
         } elseif ($inside_tag && $position_next_gt !== false) {
             // Inside a tag that has a closing ">": take what lies between.
             $strlen_segment = $position_next_gt - $cursor;
             if ($strlen_segment < 1) {
                 // Nothing between "<" and ">".
                 // NOTE(review): this token is built but never appended to
                 // $array, and $inside_tag stays true. Behaviour is kept
                 // as-is; confirm whether dropping the token is intended.
                 $token = new HTMLPurifier_Token_Text('<');
                 $cursor++;
                 continue;
             }
             $segment = substr($html, $cursor, $strlen_segment);
             if ($segment === false) {
                 // defensive: substr() reported an out-of-range access
                 break;
             }
             // Comment: "<!--" ... "-->"
             if (substr($segment, 0, 3) === '!--') {
                 // The first ">" may lie inside the comment, so look for
                 // the real terminator instead.
                 $position_comment_end = strpos($html, '-->', $cursor);
                 if ($position_comment_end === false) {
                     // Unterminated comment: it swallows the rest of the input.
                     if ($e) {
                         $e->send(E_WARNING, 'Lexer: Unclosed comment');
                     }
                     $position_comment_end = strlen($html);
                     $end = true;
                 } else {
                     $end = false;
                 }
                 $strlen_segment = $position_comment_end - $cursor;
                 $segment = substr($html, $cursor, $strlen_segment);
                 // skip the leading "!--" to get the comment body
                 $token = new HTMLPurifier_Token_Comment(substr($segment, 3, $strlen_segment - 3));
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                 }
                 $array[] = $token;
                 // step over the closing "-->" only if there was one
                 $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                 $inside_tag = false;
                 continue;
             }
             // End tag: segment starts with "/"
             $is_end_tag = strpos($segment, '/') === 0;
             if ($is_end_tag) {
                 $type = substr($segment, 1);
                 $token = new HTMLPurifier_Token_End($type);
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // A tag name must start with a letter; otherwise the "<" was
             // a literal less-than sign and is emitted as text.
             if (!ctype_alpha($segment[0])) {
                 if ($e) {
                     $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                 }
                 $token = new HTMLPurifier_Token_Text('<');
                 if ($maintain_line_numbers) {
                     // Record the position only. $cursor is deliberately
                     // not advanced in this branch, so the characters after
                     // the stray "<" are lexed again by the following
                     // iterations, which count their newlines. Adding them
                     // to $current_line here as well would count them twice.
                     $token->rawPosition($current_line, $current_col);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 continue;
             }
             // Explicit self-closing tag: strip the trailing "/".
             $is_self_closing = strrpos($segment, '/') === $strlen_segment - 1;
             if ($is_self_closing) {
                 $strlen_segment--;
                 $segment = substr($segment, 0, $strlen_segment);
             }
             // Length of the leading run without whitespace, i.e. the tag name.
             $position_first_space = strcspn($segment, $this->_whitespace);
             if ($position_first_space >= $strlen_segment) {
                 // No whitespace at all: a tag without attributes.
                 if ($is_self_closing) {
                     $token = new HTMLPurifier_Token_Empty($segment);
                 } else {
                     $token = new HTMLPurifier_Token_Start($segment);
                 }
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // Tag with (possibly empty) attribute text after the name.
             $type = substr($segment, 0, $position_first_space);
             $attribute_string = trim(substr($segment, $position_first_space));
             if ($attribute_string) {
                 $attr = $this->parseAttributeString($attribute_string, $config, $context);
             } else {
                 $attr = array();
             }
             if ($is_self_closing) {
                 $token = new HTMLPurifier_Token_Empty($type, $attr);
             } else {
                 $token = new HTMLPurifier_Token_Start($type, $attr);
             }
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_gt + 1;
             $inside_tag = false;
             continue;
         } else {
             // Inside a tag with no ">" left in the input: emit the rest,
             // "<" included, as text and stop.
             if ($e) {
                 $e->send(E_WARNING, 'Lexer: Missing gt');
             }
             $token = new HTMLPurifier_Token_Text('<' . $this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             $array[] = $token;
             break;
         }
     }
     $context->destroy('CurrentLine');
     $context->destroy('CurrentCol');
     return $array;
 }