Ejemplo n.º 1
0
 /**
  * Folds a flat sequence of tokens into a tree of nodes.
  *
  * A synthetic root node is created from the HTML definition's
  * info_parent value. Every non-end token is converted with toNode() and
  * appended to the children of the innermost node that is still open;
  * start tokens additionally become the new innermost open node. An end
  * token closes the innermost open node and copies its own col, line and
  * armor values onto that node as endCol, endLine and endArmor.
  *
  * Note that the input tokens are modified: skip and carryover are reset
  * on every token, and start is reset on end tokens.
  *
  * @param array $tokens Token objects, in document order
  * @param object $config Must expose getHTMLDefinition() (presumably an
  *                       HTMLPurifier_Config -- confirm against callers)
  * @param mixed $context Not used by this method
  * @return mixed The root node, with the whole tree hanging off it
  */
 public static function arborize($tokens, $config, $context)
 {
     $rootToken = new HTMLPurifier_Token_Start($config->getHTMLDefinition()->info_parent);
     // Nodes that have been opened but not yet closed; the last entry is
     // the innermost one. The root stays at index 0 throughout.
     $open = array($rootToken->toNode());
     foreach ($tokens as $token) {
         // [MUT] these writes change the caller's token objects
         $token->skip = null;
         $token->carryover = null;
         if (!($token instanceof HTMLPurifier_Token_End)) {
             // Anything that is not an end tag becomes a child of the
             // innermost open node.
             $child = $token->toNode();
             $open[count($open) - 1]->children[] = $child;
             if ($token instanceof HTMLPurifier_Token_Start) {
                 // A start tag stays open until its end tag shows up.
                 $open[] = $child;
             }
             continue;
         }
         // End tag: close the innermost open node and record where it ended.
         // [MUT]
         $token->start = null;
         $closed = array_pop($open);
         assert($closed->name === $token->name);
         assert(empty($token->attr));
         $closed->endCol = $token->col;
         $closed->endLine = $token->line;
         $closed->endArmor = $token->armor;
     }
     // Only the synthetic root should be left once all tags are balanced.
     assert(count($open) == 1);
     return $open[0];
 }
 /**
  * Lexes an HTML string into a flat list of token objects.
  *
  * The string is scanned left to right with a byte cursor. Runs of
  * character data become Text tokens, "<!-- ... -->" becomes a Comment
  * token, "</name>" becomes an End token, and "<name ...>" becomes a
  * Start token (or an Empty token when the tag ends in "/"). A "<" that
  * is not followed by an alphabetic character is emitted as literal text,
  * and a "<" with no closing ">" swallows the rest of the input as text.
  *
  * When line tracking is enabled (Core.MaintainLineNumbers, falling back
  * to Core.CollectErrors when that setting is null), each token is
  * stamped via rawPosition() with the line and column at which it starts.
  * While lexing, 'CurrentLine' and 'CurrentCol' are registered on the
  * context; both are destroyed again before returning.
  *
  * @param string $html
  * @param HTMLPurifier_Config $config
  * @param HTMLPurifier_Context $context
  * @return array|HTMLPurifier_Token[]
  */
 public function tokenizeHTML($html, $config, $context)
 {
     // special normalization for script tags without any armor
     // our "armor" heuristic is a < sign any number of whitespaces after
     // the first script tag
     if ($config->get('HTML.Trusted')) {
         $html = preg_replace_callback('#(<script[^>]*>)(\\s*[^<].+?)(</script>)#si', array($this, 'scriptCallback'), $html);
     }
     $html = $this->normalize($html, $config, $context);
     // our location in the text
     $cursor = 0;
     // whether or not we're parsing the inside of a tag
     $inside_tag = false;
     // result array
     $array = array();
     // This is also treated to mean maintain *column* numbers too
     $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
     if ($maintain_line_numbers === null) {
         // automatically determine line numbering by checking
         // if error collection is on
         $maintain_line_numbers = $config->get('Core.CollectErrors');
     }
     if ($maintain_line_numbers) {
         $current_line = 1;
         $current_col = 0;
         $length = strlen($html);
     } else {
         // position tracking is off: false is used as the "no value" marker
         $current_line = false;
         $current_col = false;
         $length = false;
     }
     $context->register('CurrentLine', $current_line);
     $context->register('CurrentCol', $current_col);
     $nl = "\n";
     // how often to manually recalculate. This will ALWAYS be right,
     // but it's pretty wasteful. Set to 0 to turn off
     $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
     // error collector, or false when error collection is disabled
     $e = false;
     if ($config->get('Core.CollectErrors')) {
         $e =& $context->get('ErrorCollector');
     }
     // for testing synchronization
     $loops = 0;
     // ++$loops is truthy from the first pass on, so the loop is left only
     // through the break statements below
     while (++$loops) {
         // $cursor is either at the start of a token, or inside of
         // a tag (i.e. there was a < immediately before it), as indicated
         // by $inside_tag
         if ($maintain_line_numbers) {
             // $rcursor, however, is always at the start of a token.
             $rcursor = $cursor - (int) $inside_tag;
             // Column number is cheap, so we calculate it every round.
             // We're interested at the *end* of the newline string, so
             // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
             // from our "rcursor" position.
             // $rcursor - $length is zero or negative, so strrpos() measures
             // its start offset from the end of the string (see the PHP
             // strrpos() documentation for negative offsets).
             $nl_pos = strrpos($html, $nl, $rcursor - $length);
             // a boolean result means no newline was found: the column is
             // then counted from the start of the string
             $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
             // recalculate lines
             if ($synchronize_interval && $cursor > 0 && $loops % $synchronize_interval === 0) {
                 // time to synchronize!
                 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
             }
         }
         $position_next_lt = strpos($html, '<', $cursor);
         $position_next_gt = strpos($html, '>', $cursor);
         // triggers on "<b>asdf</b>" but not "asdf <b></b>"
         // special case to set up context
         if ($position_next_lt === $cursor) {
             $inside_tag = true;
             $cursor++;
         }
         if (!$inside_tag && $position_next_lt !== false) {
             // We are not inside tag and there still is another tag to parse
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor, $position_next_lt - $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
             }
             $array[] = $token;
             // step over the '<' so the next round starts inside the tag
             $cursor = $position_next_lt + 1;
             $inside_tag = true;
             continue;
         } elseif (!$inside_tag) {
             // We are not inside tag but there are no more tags
             // If we're already at the end, break
             if ($cursor === strlen($html)) {
                 break;
             }
             // Create Text of rest of string
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             $array[] = $token;
             break;
         } elseif ($inside_tag && $position_next_gt !== false) {
             // We are in tag and it is well formed
             // Grab the internals of the tag
             $strlen_segment = $position_next_gt - $cursor;
             if ($strlen_segment < 1) {
                 // there's nothing to process!
                 // NOTE(review): this token is built but never appended to
                 // $array, and $inside_tag stays true for the next round.
                 // Confirm whether "<>" is meant to be dropped this way.
                 $token = new HTMLPurifier_Token_Text('<');
                 $cursor++;
                 continue;
             }
             $segment = substr($html, $cursor, $strlen_segment);
             if ($segment === false) {
                 // somehow, we attempted to access beyond the end of
                 // the string, defense-in-depth, reported by Nate Abele
                 break;
             }
             // Check if it's a comment
             if (substr($segment, 0, 3) === '!--') {
                 // re-determine segment length, looking for -->
                 $position_comment_end = strpos($html, '-->', $cursor);
                 if ($position_comment_end === false) {
                     // uh oh, we have a comment that extends to
                     // infinity. Can't be helped: set comment
                     // end position to end of string
                     if ($e) {
                         $e->send(E_WARNING, 'Lexer: Unclosed comment');
                     }
                     $position_comment_end = strlen($html);
                     $end = true;
                 } else {
                     $end = false;
                 }
                 $strlen_segment = $position_comment_end - $cursor;
                 $segment = substr($html, $cursor, $strlen_segment);
                 // drop the leading "!--" to get the comment body
                 $token = new HTMLPurifier_Token_Comment(substr($segment, 3, $strlen_segment - 3));
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                 }
                 $array[] = $token;
                 // skip the closing "-->" unless the comment ran to the end
                 $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                 $inside_tag = false;
                 continue;
             }
             // Check if it's an end tag
             $is_end_tag = strpos($segment, '/') === 0;
             if ($is_end_tag) {
                 // everything after the leading slash is taken as the tag name
                 $type = substr($segment, 1);
                 $token = new HTMLPurifier_Token_End($type);
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // Check leading character is alphabetic, if not, we may
             // have accidentally grabbed an emoticon. Translate into
             // text and go our merry way
             if (!ctype_alpha($segment[0])) {
                 // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                 if ($e) {
                     $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                 }
                 $token = new HTMLPurifier_Token_Text('<');
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 // $cursor is left just past the '<', so the remainder is
                 // lexed as ordinary text on the next round
                 $inside_tag = false;
                 continue;
             }
             // Check if it is explicitly self closing, if so, remove
             // trailing slash. Remember, we could have a tag like <br>, so
             // any later token processing scripts must convert improperly
             // classified EmptyTags from StartTags.
             $is_self_closing = strrpos($segment, '/') === $strlen_segment - 1;
             if ($is_self_closing) {
                 $strlen_segment--;
                 $segment = substr($segment, 0, $strlen_segment);
             }
             // Check if there are any attributes
             $position_first_space = strcspn($segment, $this->_whitespace);
             if ($position_first_space >= $strlen_segment) {
                 // no whitespace in the segment: the whole segment is the tag name
                 if ($is_self_closing) {
                     $token = new HTMLPurifier_Token_Empty($segment);
                 } else {
                     $token = new HTMLPurifier_Token_Start($segment);
                 }
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // Grab out all the data
             $type = substr($segment, 0, $position_first_space);
             $attribute_string = trim(substr($segment, $position_first_space));
             // NOTE(review): this is a truthiness test, so the attribute
             // string "0" also takes the empty-array branch
             if ($attribute_string) {
                 $attr = $this->parseAttributeString($attribute_string, $config, $context);
             } else {
                 $attr = array();
             }
             if ($is_self_closing) {
                 $token = new HTMLPurifier_Token_Empty($type, $attr);
             } else {
                 $token = new HTMLPurifier_Token_Start($type, $attr);
             }
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_gt + 1;
             $inside_tag = false;
             continue;
         } else {
             // inside tag, but there's no ending > sign
             if ($e) {
                 $e->send(E_WARNING, 'Lexer: Missing gt');
             }
             $token = new HTMLPurifier_Token_Text('<' . $this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             // no cursor scroll? Hmm...
             $array[] = $token;
             break;
         }
         // NOTE(review): unreachable -- every branch above ends in either
         // continue or break
         break;
     }
     $context->destroy('CurrentLine');
     $context->destroy('CurrentCol');
     return $array;
 }
 /**
  * Splits an HTML string into a flat array of token objects by scanning
  * for '<' and '>' with a byte cursor.
  *
  * Produces Text, Comment, End, Start and Empty tokens. If line tracking
  * is on (Core.MaintainLineNumbers, or Core.CollectErrors when the former
  * is null), rawPosition() is called on each token with the line and
  * column where it begins. 'CurrentLine' and 'CurrentCol' are registered
  * on the context for the duration of the call and destroyed at the end.
  *
  * @param string $html Markup to tokenize
  * @param object $config Configuration object; only get() is called on it here
  * @param object $context Context object; register(), get() and destroy() are used
  * @return array List of token objects in document order
  */
 public function tokenizeHTML($html, $config, $context)
 {
     // In trusted mode, <script> elements whose body does not begin with
     // '<' (after optional whitespace) are rewritten by scriptCallback()
     // before lexing starts.
     if ($config->get('HTML.Trusted')) {
         $html = preg_replace_callback('#(<script[^>]*>)(\\s*[^<].+?)(</script>)#si', array($this, 'scriptCallback'), $html);
     }
     $html = $this->normalize($html, $config, $context);
     // byte offset of the scan position in $html
     $cursor = 0;
     // true while $cursor sits just past a '<' that has not been resolved yet
     $inside_tag = false;
     // tokens collected so far
     $array = array();
     // tri-state setting: null means "follow Core.CollectErrors"
     $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
     if ($maintain_line_numbers === null) {
         $maintain_line_numbers = $config->get('Core.CollectErrors');
     }
     if ($maintain_line_numbers) {
         $current_line = 1;
         $current_col = 0;
         $length = strlen($html);
     } else {
         // tracking disabled: false stands in for "not tracked"
         $current_line = false;
         $current_col = false;
         $length = false;
     }
     $context->register('CurrentLine', $current_line);
     $context->register('CurrentCol', $current_col);
     $nl = "\n";
     // every this-many iterations the line number is recounted from the
     // start of the string; a falsy value disables the recount
     $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
     // error collector, or false when Core.CollectErrors is off
     $e = false;
     if ($config->get('Core.CollectErrors')) {
         $e =& $context->get('ErrorCollector');
     }
     // iteration counter; also serves as the (always truthy) loop condition,
     // so the loop terminates only via break
     $loops = 0;
     while (++$loops) {
         if ($maintain_line_numbers) {
             // start of the current token: one byte back when we are
             // already past the '<' of a tag
             $rcursor = $cursor - (int) $inside_tag;
             // $rcursor - $length is <= 0, which makes strrpos() take its
             // start offset relative to the end of the string
             $nl_pos = strrpos($html, $nl, $rcursor - $length);
             // column = distance from the byte after that newline; when no
             // newline was found (boolean result) count from offset 0
             $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
             if ($synchronize_interval && $cursor > 0 && $loops % $synchronize_interval === 0) {
                 // periodic full recount of newlines before $cursor
                 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
             }
         }
         $position_next_lt = strpos($html, '<', $cursor);
         $position_next_gt = strpos($html, '>', $cursor);
         // a '<' directly at the cursor: step over it and enter tag mode
         if ($position_next_lt === $cursor) {
             $inside_tag = true;
             $cursor++;
         }
         if (!$inside_tag && $position_next_lt !== false) {
             // text up to the next '<'
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor, $position_next_lt - $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_lt + 1;
             $inside_tag = true;
             continue;
         } elseif (!$inside_tag) {
             // no further '<' anywhere: emit the remainder (if any) and stop
             if ($cursor === strlen($html)) {
                 break;
             }
             $token = new HTMLPurifier_Token_Text($this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             $array[] = $token;
             break;
         } elseif ($inside_tag && $position_next_gt !== false) {
             // inside a tag that has a closing '>': $segment is the text
             // between the '<' and that '>'
             $strlen_segment = $position_next_gt - $cursor;
             if ($strlen_segment < 1) {
                 // empty segment, i.e. '>' directly at the cursor
                 // NOTE(review): $token is created but never added to
                 // $array, and $inside_tag remains true; confirm intent.
                 $token = new HTMLPurifier_Token_Text('<');
                 $cursor++;
                 continue;
             }
             $segment = substr($html, $cursor, $strlen_segment);
             if ($segment === false) {
                 // defensive: substr() reported a read past the end
                 break;
             }
             if (substr($segment, 0, 3) === '!--') {
                 // comment: the real end is the next "-->", not the next '>'
                 $position_comment_end = strpos($html, '-->', $cursor);
                 if ($position_comment_end === false) {
                     // unterminated comment: treat the rest of the input as
                     // the comment body
                     if ($e) {
                         $e->send(E_WARNING, 'Lexer: Unclosed comment');
                     }
                     $position_comment_end = strlen($html);
                     $end = true;
                 } else {
                     $end = false;
                 }
                 $strlen_segment = $position_comment_end - $cursor;
                 $segment = substr($html, $cursor, $strlen_segment);
                 // strip the leading "!--"
                 $token = new HTMLPurifier_Token_Comment(substr($segment, 3, $strlen_segment - 3));
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                 }
                 $array[] = $token;
                 // move past "-->" when it was actually present
                 $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                 $inside_tag = false;
                 continue;
             }
             // end tag: segment starts with '/'
             $is_end_tag = strpos($segment, '/') === 0;
             if ($is_end_tag) {
                 $type = substr($segment, 1);
                 $token = new HTMLPurifier_Token_End($type);
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // a tag name must start with a letter; otherwise the '<' is
             // treated as a stray character and emitted as text
             if (!ctype_alpha($segment[0])) {
                 if ($e) {
                     $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                 }
                 $token = new HTMLPurifier_Token_Text('<');
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 // cursor is not advanced: what follows the '<' is lexed as
                 // text on the next iteration
                 $inside_tag = false;
                 continue;
             }
             // trailing '/' marks an explicitly self-closing tag; strip it
             $is_self_closing = strrpos($segment, '/') === $strlen_segment - 1;
             if ($is_self_closing) {
                 $strlen_segment--;
                 $segment = substr($segment, 0, $strlen_segment);
             }
             // length of the leading run without whitespace = tag name length
             $position_first_space = strcspn($segment, $this->_whitespace);
             if ($position_first_space >= $strlen_segment) {
                 // no whitespace at all, so there are no attributes
                 if ($is_self_closing) {
                     $token = new HTMLPurifier_Token_Empty($segment);
                 } else {
                     $token = new HTMLPurifier_Token_Start($segment);
                 }
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
                 $inside_tag = false;
                 $cursor = $position_next_gt + 1;
                 continue;
             }
             // tag name, then everything after it as the raw attribute string
             $type = substr($segment, 0, $position_first_space);
             $attribute_string = trim(substr($segment, $position_first_space));
             // NOTE(review): truthiness check -- the string "0" is treated
             // like an empty attribute string here
             if ($attribute_string) {
                 $attr = $this->parseAttributeString($attribute_string, $config, $context);
             } else {
                 $attr = array();
             }
             if ($is_self_closing) {
                 $token = new HTMLPurifier_Token_Empty($type, $attr);
             } else {
                 $token = new HTMLPurifier_Token_Start($type, $attr);
             }
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
                 $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
             }
             $array[] = $token;
             $cursor = $position_next_gt + 1;
             $inside_tag = false;
             continue;
         } else {
             // inside a tag with no '>' left in the input: the rest of the
             // string becomes one text token, prefixed with the '<'
             if ($e) {
                 $e->send(E_WARNING, 'Lexer: Missing gt');
             }
             $token = new HTMLPurifier_Token_Text('<' . $this->parseData(substr($html, $cursor)));
             if ($maintain_line_numbers) {
                 $token->rawPosition($current_line, $current_col);
             }
             $array[] = $token;
             break;
         }
         // NOTE(review): never reached -- all branches above continue or break
         break;
     }
     $context->destroy('CurrentLine');
     $context->destroy('CurrentCol');
     return $array;
 }