/**
 * Lexes an HTML string into a flat list of tokens.
 *
 * The string is scanned left to right, alternating between runs of text
 * and the contents of `<...>` constructs. Those constructs become comment,
 * end, start or empty tokens; a `<` that cannot begin a tag is emitted as
 * literal text. When line tracking is enabled, every emitted token is
 * stamped with the line and column at which it starts.
 *
 * @param string $html
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return array|HTMLPurifier_Token[]
 */
public function tokenizeHTML($html, $config, $context)
{
    // special normalization for script tags without any armor
    // our "armor" heuristic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // This is also treated to mean maintain *column* numbers too
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    // NOTE(review): presumably register() binds these variables by
    // reference so other components can read the live position while
    // lexing is in progress - confirm against HTMLPurifier_Context.
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // for testing synchronization
    $loops = 0;

    while (++$loops) {
        // $cursor is either at the start of a token, or inside of
        // a tag (i.e. there was a < immediately before it), as indicated
        // by $inside_tag

        if ($maintain_line_numbers) {
            // $rcursor, however, is always at the start of a token.
            $rcursor = $cursor - (int) $inside_tag;

            // Column number is cheap, so we calculate it every round.
            // We're interested at the *end* of the newline string, so
            // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
            // from our "rcursor" position.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // recalculate lines
            if ($synchronize_interval && // synchronization is on
                $cursor > 0 && // cursor is further than zero
                $loops % $synchronize_interval === 0 // time to synchronize!
            ) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor, $position_next_lt - $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) {
                break;
            }
            // Create Text of rest of string
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // there's nothing to process!
                // NOTE(review): this token is built but never appended to
                // $array, and $inside_tag stays true after stepping over
                // the '>'. Left as-is to preserve existing lexing output;
                // confirm whether "<>" should be emitted as text.
                $token = new HTMLPurifier_Token_Text('<');
                $cursor++;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (substr($segment, 0, 3) === '!--') {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) {
                        $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    }
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                // skip the leading "!--" so only the comment body is kept
                $token = new HTMLPurifier_Token_Comment(
                    substr(
                        $segment,
                        3,
                        $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // step over the closing "-->" unless the comment ran to
                // the end of the input
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = strpos($segment, '/') === 0;
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alphabetic, if not, we may
            // have accidentally grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    // Only stamp the position. $cursor is deliberately not
                    // advanced in this branch, so everything after this
                    // '<' is lexed again on the next pass and its newlines
                    // are added to $current_line there; adding them here
                    // as well would count them twice and shift the line
                    // number of every later token.
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = strrpos($segment, '/') === $strlen_segment - 1;
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // no whitespace in the segment: the whole segment is the
                // tag name
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment,
                        $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string,
                    $config,
                    $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) {
                $e->send(E_WARNING, 'Lexer: Missing gt');
            }
            $token = new HTMLPurifier_Token_Text(
                '<' .
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            // no cursor scroll? Hmm...
            $array[] = $token;
            break;
        }
        // defensive: every branch above already continues or breaks
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
/**
 * Splits an HTML string into a sequence of text, comment, start, end and
 * empty-element tokens.
 *
 * Scanning is a single pass driven by the positions of the next `<` and
 * `>`. Malformed input is tolerated: a stray `<` becomes a text token, an
 * unterminated comment runs to the end of the input, and a tag without a
 * closing `>` is emitted as text. If line tracking is on (explicitly, or
 * implied by error collection), tokens carry their starting line/column.
 *
 * @param string $html
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return array|HTMLPurifier_Token[]
 */
public function tokenizeHTML($html, $config, $context)
{
    // In trusted mode, script bodies that do not start with '<' are passed
    // through scriptCallback before normalization.
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0;          // byte offset of the scan position
    $inside_tag = false;  // true when a '<' immediately precedes $cursor
    $array = array();     // tokens collected so far

    // Line tracking also implies column tracking. A null setting means
    // "follow Core.CollectErrors".
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
    if ($maintain_line_numbers === null) {
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    // NOTE(review): looks like the context holds these by reference for
    // the duration of the call (they are destroyed before returning) -
    // confirm against HTMLPurifier_Context::register().
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";
    // Every N loop passes the line number is recomputed from scratch;
    // a falsy interval disables that resynchronization.
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    // Error collector, or false when error collection is off.
    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // Pass counter; doubles as the (always truthy) loop condition.
    $loops = 0;

    while (++$loops) {
        if ($maintain_line_numbers) {
            // Start of the current token: step back over the '<' when
            // we are positioned just inside a tag.
            $rcursor = $cursor - (int) $inside_tag;

            // Column = distance from the character after the previous
            // newline (or from the start of the input if there is none).
            // The negative offset limits the backwards search to the part
            // of the string before $rcursor.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // Periodic full recount of the line number.
            if ($synchronize_interval &&
                $cursor > 0 &&
                $loops % $synchronize_interval === 0
            ) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // The cursor sits exactly on a '<': enter tag mode and step
        // past it.
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // Text run followed by another '<'.
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor, $position_next_lt - $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // No '<' remains: the rest of the input (if any) is text.
            if ($cursor === strlen($html)) {
                break;
            }
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // Inside a tag that has a closing '>'.
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // Empty "<>" construct.
                // NOTE(review): the token below is never added to $array
                // and $inside_tag remains true after skipping the '>'.
                // Behaviour kept unchanged; confirm whether this is
                // intended.
                $token = new HTMLPurifier_Token_Text('<');
                $cursor++;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // Defensive guard against reading past the end of input.
                break;
            }

            // Comment: "<!--" ... "-->"
            if (substr($segment, 0, 3) === '!--') {
                // The first '>' may be inside the comment, so locate the
                // real terminator.
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // Unterminated comment: treat it as running to the
                    // end of the input.
                    if ($e) {
                        $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    }
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                // Drop the leading "!--" from the stored comment body.
                $token = new HTMLPurifier_Token_Comment(
                    substr(
                        $segment,
                        3,
                        $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // Skip the three characters of "-->" when it was found.
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // End tag: segment starts with '/'.
            $is_end_tag = strpos($segment, '/') === 0;
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // A tag name must start with a letter; otherwise the '<' was
            // literal text (e.g. part of an emoticon).
            if (!ctype_alpha($segment[0])) {
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    // Record where the '<' is, but leave $current_line
                    // alone: $cursor does not move in this branch, so the
                    // following characters are processed on the next pass
                    // and their newlines are counted at that point.
                    // Counting them here too would double-count them.
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // Explicit self-closing form: strip the trailing '/'. Tags
            // such as <br> written without it are still reported as start
            // tags here.
            $is_self_closing = strrpos($segment, '/') === $strlen_segment - 1;
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Length of the leading run without whitespace, i.e. the tag
            // name.
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // No whitespace at all: a bare tag without attributes.
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Tag name followed by an attribute string.
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment,
                        $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string,
                    $config,
                    $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // Inside a tag with no closing '>' anywhere: emit the
            // remainder, with its '<' restored, as text and stop.
            if ($e) {
                $e->send(E_WARNING, 'Lexer: Missing gt');
            }
            $token = new HTMLPurifier_Token_Text(
                '<' .
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        }
        // Safety net: all branches above already continue or break.
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}