/**
 * Check the specified XHTML, and return the results.
 *
 * The markup is tokenised via _get_next_tag() and walked tag by tag while a stack of open tags is
 * maintained. All parser state lives in global variables, which are reset at the start of every call
 * (presumably so that _get_next_tag() and _xhtml_error() can read them; neither is defined in this function).
 *
 * @param  string       The XHTML to validate
 * @param  boolean      Whether to avoid checking for relational errors (false implies just a quick structural check, aka a 'well formed' check)
 * @param  boolean      Whether what is being validated is an HTML fragment, rather than a whole document
 * @param  boolean      Validate javascript
 * @param  boolean      Validate CSS
 * @param  boolean      Validate WCAG
 * @param  boolean      Validate for compatibility
 * @param  boolean      Validate external files
 * @param  boolean      Bring up messages about manual checks
 * @return map          Map with the keys 'level_ranges', 'tag_ranges', 'value_ranges' and 'errors' ('errors' is an empty list when nothing was found)
 */
function check_xhtml($out, $well_formed_only = false, $is_fragment = false, $validation_javascript = true, $validation_css = true, $validation_wcag = true, $validation_compat = true, $validation_ext_files = true, $validation_manual = false)
{
    global $XHTML_VALIDATOR_OFF, $WELL_FORMED_ONLY, $VALIDATION_JAVASCRIPT, $VALIDATION_CSS, $VALIDATION_WCAG, $VALIDATION_COMPAT, $VALIDATION_EXT_FILES, $VALIDATION_MANUAL;

    // While this is NULL, validation is active. Otherwise it is a nesting-depth counter (see the main loop).
    // NOTE(review): mixed() is defined elsewhere; the is_null() checks below presume it yields NULL - confirm.
    $XHTML_VALIDATOR_OFF = mixed();
    $WELL_FORMED_ONLY = $well_formed_only;
    $VALIDATION_JAVASCRIPT = $validation_javascript;
    $VALIDATION_CSS = $validation_css;
    $VALIDATION_WCAG = $validation_wcag;
    $VALIDATION_COMPAT = $validation_compat;
    $VALIDATION_EXT_FILES = $validation_ext_files;
    $VALIDATION_MANUAL = $validation_manual;

    global $IDS_SO_FAR;
    $IDS_SO_FAR = array();

    $content_start_stack = array();

    global $BLOCK_CONSTRAIN, $XML_CONSTRAIN, $LAST_TAG_ATTRIBUTES, $FOUND_DOCTYPE, $FOUND_DESCRIPTION, $FOUND_KEYWORDS, $FOUND_CONTENTTYPE, $THE_DOCTYPE, $TAGS_DEPRECATE_ALLOW, $URL_BASE, $PARENT_TAG, $TABS_SEEN, $KEYS_SEEN, $ANCHORS_SEEN, $ATT_STACK, $TAG_STACK, $POS, $LINENO, $LINESTART, $OUT, $T_POS, $PROHIBITIONS, $ONLY_PARENT, $ONLY_CHILDREN, $REQUIRE_ANCESTER, $LEN, $ANCESTER_BLOCK, $ANCESTER_INLINE, $POSSIBLY_EMPTY_TAGS, $MUST_SELFCLOSE_TAGS, $FOR_LABEL_IDS, $FOR_LABEL_IDS_2, $INPUT_TAG_IDS;
    global $TAG_RANGES, $VALUE_RANGES, $LAST_A_TAG, $A_LINKS, $XHTML_FORM_ENCODING;
    global $AREA_LINKS, $LAST_HEADING, $CRAWLED_URLS, $HYPERLINK_URLS, $EMBED_URLS, $THE_LANGUAGE, $PSPELL_LINK;

    // Reset all shared parser state
    $PSPELL_LINK = NULL;
    $THE_LANGUAGE = 'en';
    $THE_DOCTYPE = $is_fragment ? DOCTYPE_XHTML : DOCTYPE_HTML;
    $TAGS_DEPRECATE_ALLOW = true;
    $XML_CONSTRAIN = $is_fragment;
    $BLOCK_CONSTRAIN = false;
    $LINENO = 0;
    $LINESTART = 0;
    $HYPERLINK_URLS = array();
    $EMBED_URLS = array();
    $AREA_LINKS = array();
    $LAST_HEADING = 0;
    $FOUND_DOCTYPE = false;
    $FOUND_CONTENTTYPE = false;
    $FOUND_KEYWORDS = false;
    $FOUND_DESCRIPTION = false;
    $CRAWLED_URLS = array();
    $PARENT_TAG = '';
    $XHTML_FORM_ENCODING = '';
    $KEYS_SEEN = array();
    $TABS_SEEN = array();
    $TAG_RANGES = array();
    $VALUE_RANGES = array();
    $LAST_A_TAG = NULL;
    $ANCHORS_SEEN = array();
    $FOR_LABEL_IDS = array();
    $FOR_LABEL_IDS_2 = array();
    $INPUT_TAG_IDS = array();
    $TAG_STACK = array();
    $ATT_STACK = array();
    $ANCESTER_BLOCK = 0;
    $ANCESTER_INLINE = 0;
    $POS = 0;
    $OUT = $out;
    unset($out);
    $LEN = strlen($OUT);

    $level_ranges = array();
    $stack_size = 0;

    // Tags a whole document must contain; entries are removed as the tags are seen
    $to_find = array('html' => 1, 'head' => 1, 'title' => 1);

    // Tags that may occur at most once directly under any one parent. The live counters are saved/restored per stack level.
    $only_one_of_stack = array();
    $only_one_of_template = array('title' => 1, 'head' => 1, 'body' => 1, 'base' => 1, 'thead' => 1, 'tfoot' => 1);
    $only_one_of = $only_one_of_template;

    $A_LINKS = array();
    $previous = '';
    if (!isset($GLOBALS['MAIL_MODE'])) {
        $GLOBALS['MAIL_MODE'] = false;
    }

    $errors = array();

    $token = _get_next_tag();
    while (!is_null($token)) {
        // A non-empty array is a token that came with errors: [0] is the token (or NULL), [1] the error list
        if (is_array($token) && count($token) != 0) {
            if (is_null($XHTML_VALIDATOR_OFF)) {
                foreach ($token[1] as $error) {
                    $errors[] = _xhtml_error(
                        $error[0],
                        array_key_exists(1, $error) ? $error[1] : '',
                        array_key_exists(2, $error) ? $error[2] : '',
                        array_key_exists(3, $error) ? $error[3] : '',
                        array_key_exists('raw', $error) ? $error['raw'] : false,
                        array_key_exists('pos', $error) ? $error['pos'] : 0
                    );
                }
                if (is_null($token[0])) {
                    // No token could be recovered, so give up with what we have
                    return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
                }
            }
            $token = $token[0];
        }

        $basis_token = _get_tag_basis($token);

        // Open, close, or monotonic? A slash at offset 1 means a closing tag; no slash at all means a plain opening tag.
        $term = strpos($token, '/');

        // While validation is switched off we only track nesting depth, so that it is switched back on by
        // the closing tag that balances the point where it was switched off
        if (!is_null($XHTML_VALIDATOR_OFF)) {
            if ($term === false) {
                $XHTML_VALIDATOR_OFF++;
            } elseif ($term === 1) {
                if ($XHTML_VALIDATOR_OFF == 0) {
                    $XHTML_VALIDATOR_OFF = NULL;
                } else {
                    $XHTML_VALIDATOR_OFF--;
                }
            }
        }

        if ($term !== 1) {
            // Opening or self-closing tag

            if (isset($only_one_of[$basis_token])) {
                // "<= 0" rather than "== 0": the counter keeps decrementing, so a third (or later) duplicate must still be reported
                if ($only_one_of[$basis_token] <= 0) {
                    $errors[] = _xhtml_error('XHTML_ONLY_ONE_ALLOWED', $basis_token);
                }
                $only_one_of[$basis_token]--;
            }

            $level_ranges[] = array($stack_size, $T_POS, $POS);
            unset($to_find[$basis_token]);

            if (!$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                if (!$is_fragment && $stack_size == 0 && $basis_token != 'html') {
                    $errors[] = _xhtml_error('XHTML_BAD_ROOT');
                }

                if ($stack_size != 0) {
                    if (isset($ONLY_CHILDREN[$PARENT_TAG]) && !in_array($basis_token, $ONLY_CHILDREN[$PARENT_TAG])) {
                        $errors[] = _xhtml_error('XHTML_BAD_CHILD', $basis_token, $PARENT_TAG);
                    }

                    // Prohibitions apply to every ancestor, not just the direct parent
                    foreach ($TAG_STACK as $parent_tag) {
                        if (isset($PROHIBITIONS[$parent_tag]) && in_array($basis_token, $PROHIBITIONS[$parent_tag])) {
                            $errors[] = _xhtml_error('XHTML_PROHIBITION', $basis_token, $parent_tag);
                        }
                    }
                }

                if (isset($REQUIRE_ANCESTER[$basis_token]) && !$is_fragment) {
                    if (!in_array($REQUIRE_ANCESTER[$basis_token], $TAG_STACK)) {
                        $errors[] = _xhtml_error('XHTML_MISSING_ANCESTER', $basis_token, $REQUIRE_ANCESTER[$basis_token]);
                    }
                }

                if (isset($ONLY_PARENT[$basis_token])) {
                    if ($stack_size == 0) {
                        if (!$is_fragment) {
                            $errors[] = _xhtml_error('XHTML_BAD_PARENT', $basis_token, '/');
                        }
                    } elseif (!in_array($PARENT_TAG, $ONLY_PARENT[$basis_token])) {
                        $errors[] = _xhtml_error('XHTML_BAD_PARENT', $basis_token, $PARENT_TAG);
                    }
                }
            }

            // In order to ease validation, we tolerate must-self-close tags written as plain openers
            // (they are not pushed onto the stack), but of course mark them as errors in XML mode
            if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && $term === false && isset($MUST_SELFCLOSE_TAGS[$basis_token])) {
                if ($XML_CONSTRAIN) {
                    $errors[] = _xhtml_error('XHTML_NONEMPTY_TAG', $basis_token);
                }
            } elseif ($term === false) {
                // Plain opener: descend one level
                $PARENT_TAG = $basis_token;
                array_push($TAG_STACK, $basis_token);
                array_push($ATT_STACK, $LAST_TAG_ATTRIBUTES);
                array_push($content_start_stack, $POS);
                array_push($only_one_of_stack, $only_one_of);
                $only_one_of = $only_one_of_template;
                ++$stack_size;
            } else {
                // Self-closed tag: only acceptable in XML mode, and then only for tags that must self-close
                if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && (!$XML_CONSTRAIN || !isset($MUST_SELFCLOSE_TAGS[$basis_token]))) {
                    $errors[] = _xhtml_error('XHTML_CEMPTY_TAG', $basis_token);
                }
            }
        } else {
            // Closing tag. HTML allows implicit closing; we flag errors when we have to do it.
            do {
                // Case 3 (see the note further down): closing something that is not open at all; skip the lone closer
                if (!in_array($basis_token, $TAG_STACK)) {
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_NO_CLOSE_MATCH', $basis_token, $previous);
                    }
                    break;
                }

                $previous = array_pop($TAG_STACK);
                $PARENT_TAG = ($TAG_STACK == array()) ? '' : $TAG_STACK[count($TAG_STACK) - 1];
                $start_pos = array_pop($content_start_stack);
                array_pop($ATT_STACK);
                $only_one_of = array_pop($only_one_of_stack);

                // Defensive only: the in_array() check above means the stack cannot have been empty here
                if (is_null($previous)) {
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_MORE_CLOSE_THAN_OPEN', $basis_token);
                    }
                    break;
                }

                if ($basis_token != $previous) {
                    // This is really tricky, and totally XHTML-incompliant. There are three situations:
                    //  1) Overlapping tags, e.g. <i><b></i></b>. We really can't survive this, and it's very invalid;
                    //     we could only detect it if we broke support for the other two cases.
                    //  2) Implicit closing, e.g. <i><b></i>. We close everything implicitly until we find the matching tag.
                    //  3) Closing something that was never open. If it's a lone closing we can skip it (good e.g. <b></i></b>),
                    //     but if the tag is open somewhere as a parent we end up closing a whole load of tags
                    //     by rule (2) (bad e.g. <div><p></div></p></div>).
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_NO_CLOSE_MATCH', $basis_token, $previous);
                    }
                }

                if (!$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                    if (isset($MUST_SELFCLOSE_TAGS[$previous]) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XHTML_NONEMPTY_TAG', $previous);
                    }
                    if (!isset($MUST_SELFCLOSE_TAGS[$previous]) && !isset($POSSIBLY_EMPTY_TAGS[$previous]) && trim(substr($OUT, $start_pos, $T_POS - $start_pos)) == '') {
                        $errors[] = _xhtml_error('XHTML_EMPTY_TAG', $previous);
                    }
                }

                $stack_size--;
                $level_ranges[] = array($stack_size, $T_POS, $POS);

                if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && $previous == 'script') {
                    // Inline script must be wrapped in a comment or CDATA section, and must not contain a raw "</"
                    $tag_contents = substr($OUT, $start_pos, $T_POS - $start_pos);
                    $c_section = strpos($tag_contents, ']]>');
                    if (trim($tag_contents) != '' && strpos($tag_contents, '//-->') === false && strpos($tag_contents, '// -->') === false && $c_section === false) {
                        $errors[] = _xhtml_error('XHTML_SCRIPT_COMMENTING', $previous);
                    } elseif ($c_section === false && strpos($tag_contents, '<!--') !== false) {
                        if ($XML_CONSTRAIN) {
                            $errors[] = _xhtml_error('XHTML_CDATA');
                        }
                    }
                    if (strpos($tag_contents, '</') !== false) {
                        $errors[] = _xhtml_error('XML_JS_TAG_ESCAPE');
                    }
                }
            } while ($basis_token != $previous);
        }

        $token = _get_next_tag();
    }

    // Check we have everything closed
    if ($stack_size != 0) {
        if ($XML_CONSTRAIN) {
            $errors[] = _xhtml_error('XML_NO_CLOSE', array_pop($TAG_STACK));
        }
        return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
    }

    // Whole-document checks
    if (!$well_formed_only && !$is_fragment) {
        foreach (array_keys($to_find) as $tag) {
            $errors[] = _xhtml_error('XHTML_MISSING_TAG', $tag);
        }

        if (!$FOUND_DOCTYPE && !$GLOBALS['MAIL_MODE']) {
            $errors[] = _xhtml_error('XHTML_DOCTYPE');
        }
        if ($FOUND_DOCTYPE && $GLOBALS['MAIL_MODE']) {
            $errors[] = _xhtml_error('MAIL_DOCTYPE');
        }
        if (!$FOUND_CONTENTTYPE) {
            $errors[] = _xhtml_error('XHTML_CONTENTTYPE');
        }
        if (!$FOUND_KEYWORDS) {
            $errors[] = _xhtml_error('XHTML_KEYWORDS');
        }
        if (!$FOUND_DESCRIPTION) {
            $errors[] = _xhtml_error('XHTML_DESCRIPTION');
        }

        // Check that all area-links have a corresponding hyperlink
        foreach (array_keys($AREA_LINKS) as $id) {
            if (!in_array($id, $HYPERLINK_URLS)) {
                $errors[] = _xhtml_error('WCAG_AREA_EQUIV', $id);
            }
        }

        // Check that all labels apply to real input tags
        foreach (array_keys($FOR_LABEL_IDS_2) as $id) {
            if (!isset($INPUT_TAG_IDS[$id])) {
                $errors[] = _xhtml_error('XHTML_ID_UNBOUND', $id);
            }
        }
    }

    // Main spelling
    if (function_exists('pspell_new') && isset($GLOBALS['SPELLING'])) {
        // Blank out style and script blocks so their contents are not spell-checked
        $stripped = $OUT;
        foreach (array('#\\<style.*\\</style\\>#Umis', '#\\<script.*\\</script\\>#Umis') as $pattern) {
            $matches = array();
            $num_matches = preg_match_all($pattern, $stripped, $matches);
            for ($i = 0; $i < $num_matches; $i++) {
                $stripped = str_replace($matches[0][$i], str_repeat(' ', strlen($matches[0][$i])), $stripped);
            }
        }
        $stripped = @html_entity_decode(strip_tags($stripped), ENT_QUOTES, get_charset());

        $new_errors = validate_spelling($stripped);
        $misspellings = array();
        foreach ($new_errors as $error) {
            // Only report each misspelt word once
            if (array_key_exists($error[1], $misspellings)) {
                continue;
            }
            $misspellings[$error[1]] = 1;

            // Point the position globals at the first occurrence of the word before building the error.
            // The word may not occur verbatim in the raw markup (it was found in decoded text), in which case we fall back to the start.
            $found_at = strpos($OUT, $error[1]);
            $POS = ($found_at === false) ? 0 : $found_at;
            $before_word = substr($OUT, 0, $POS);
            $last_newline = strrpos($before_word, chr(10));
            $LINESTART = ($last_newline === false) ? 0 : $last_newline;
            // Number of line breaks before the word. Equal to the old "breaks before LINESTART, plus 1" whenever a
            // line break precedes the word, but gives 0 rather than 1 on the first line, so that the first two lines
            // no longer share a number.
            // NOTE(review): assumes $LINENO is 0-based, as its initialisation to 0 above suggests - confirm against _xhtml_error().
            $LINENO = substr_count($before_word, chr(10));

            $errors[] = _xhtml_error($error[0], $error[1]);
        }
    }

    // Only drops this function's local alias; the global itself is untouched
    unset($OUT);

    return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
}
/**
 * Function that 'fixes' HTML (or bad XHTML) enough for it to pass most basic structural validation.
 *
 * The markup is re-emitted tag by tag: missing container tags are inserted, unclosed tags are closed,
 * closing tags with no opener are dropped, and finally empty tags that may not be empty are removed.
 * Comments are stripped. E.g. 'test<a></a><br /><po></p><p></po>' is expected to give
 * 'test<a></a><br /><po><p></p></po>'.
 *
 * @param  string       The HTML string to convert to XHTML
 * @param  boolean      Whether to force a repair even if we aren't in XHTML mode
 * @return string       The converted string
 */
function xhtmlise_html($html, $definitely_want = false)
{
    if (!$definitely_want) {
        // One day, this will get removed and we'll ensure all our output is always XHTML.
        // But so far there's no point as IE doesn't support true XHTML.
        if (!($GLOBALS['SEMI_DEBUG_MODE'] && browser_matches('true_xhtml'))) {
            return $html;
        }
    }

    // Remember whether the input was marked as escaped, so that the mark can be carried over to the result
    $is_escaped = $GLOBALS['XSS_DETECT'] && ocp_is_escaped($html);

    $html = preg_replace('#<\\!--.*($|-->)#Us', '', $html); // Strip comments (including one left unterminated at the end)

    require_code('obfuscate');
    require_code('validation');

    global $XML_CONSTRAIN, $LAST_TAG_ATTRIBUTES, $POS, $OUT, $TAG_STACK, $INBETWEEN_TEXT, $LEN, $WELL_FORMED_ONLY, $MUST_SELFCLOSE_TAGS, $LINENO, $LINESTART;

    // Reset the shared parser state
    $POS = 0;
    $OUT = $html;
    $LEN = strlen($html);
    $TAG_STACK = array();
    $WELL_FORMED_ONLY = true;
    $LINENO = 0;
    $LINESTART = 0;
    $XML_CONSTRAIN = true;

    $new = '';

    $token = _get_next_tag();

    // If we actually have a partial tag right at the start (ie. we're breaking into some HTML at a bad point), drop it
    $ang_pos = strpos($INBETWEEN_TEXT, '>');
    if ($ang_pos !== false) {
        $INBETWEEN_TEXT = substr($INBETWEEN_TEXT, $ang_pos + 1);
    }
    $new .= fix_entities($INBETWEEN_TEXT);

    // Tags that satisfy the nesting requirement of an <li>
    $li_containers = array('ul', 'ol', 'dl', 'dd', 'dt', 'dir', 'menu');
    // Tags that need an enclosing <table>
    $table_children = array('tr', 'colgroup', 'col', 'tbody', 'tfoot', 'thead', 'caption');

    while (!is_null($token)) {
        // Tokens that came with errors are arrays; unwrap them, or skip ahead if nothing could be discerned
        while (is_array($token)) {
            if ($token[0] !== NULL) {
                $token = $token[0]; // We can at least discern something
            } else {
                $token = _get_next_tag(); // No, we need to just move on
            }
        }

        $basis_token = _get_tag_basis($token);
        if ($basis_token != '') {
            // Open, close, or monotonic? A slash at offset 1 means a closing tag; no slash at all means a plain opening tag.
            $term = strpos($token, '/');

            if ($term !== 1) {
                $is_container = ($term === false && !isset($MUST_SELFCLOSE_TAGS[$basis_token]));

                if ($is_container) {
                    // Fix nesting: insert any missing wrapper tags first
                    if ($basis_token == 'li') {
                        $has_list_parent = false;
                        foreach ($li_containers as $container) {
                            if (in_array($container, $TAG_STACK)) {
                                $has_list_parent = true;
                                break;
                            }
                        }
                        if (!$has_list_parent) {
                            array_push($TAG_STACK, 'ul');
                            $new .= '<ul>';
                        }
                    }
                    if (in_array($basis_token, $table_children) && !in_array('table', $TAG_STACK)) {
                        array_push($TAG_STACK, 'table');
                        $new .= '<table>';
                    }
                    if (($basis_token == 'td' || $basis_token == 'th') && !in_array('table', $TAG_STACK)) {
                        array_push($TAG_STACK, 'table');
                        $new .= '<table>';
                        array_push($TAG_STACK, 'tr');
                        $new .= '<tr>';
                    }
                    if ($basis_token == 'param' && !in_array('object', $TAG_STACK)) {
                        array_push($TAG_STACK, 'object');
                        $new .= '<object>';
                    }
                    if ($basis_token == 'option' && !in_array('select', $TAG_STACK)) {
                        array_push($TAG_STACK, 'select');
                        $new .= '<select>';
                    }
                    if ($basis_token == 'noembed' && !in_array('map', $TAG_STACK)) {
                        array_push($TAG_STACK, 'map');
                        $new .= '<map>';
                    }

                    array_push($TAG_STACK, $basis_token);
                }

                // Re-emit the tag with normalised attributes; anything that is not a container is self-closed
                $new .= '<' . $basis_token;
                foreach ($LAST_TAG_ATTRIBUTES as $key => $val) {
                    $new .= ' ' . $key . '="' . fix_entities($val) . '"';
                }
                $new .= $is_container ? '>' : ' />';
            } elseif (in_array($basis_token, $TAG_STACK)) {
                // Closing tag with an opener somewhere on the stack: implicitly close everything above the opener...
                do {
                    $previous = array_pop($TAG_STACK);
                    if ($basis_token != $previous) {
                        $new .= '</' . $previous . '>';
                    }
                } while ($basis_token != $previous);

                // ...and then put out the closer itself
                $new .= '</' . $basis_token . '>';
            }
            // Otherwise we're closing something that was never opened; we can't handle it, so it is dropped
        }

        $token = _get_next_tag();
        if (is_null($token)) {
            // If we actually have a partial tag right at the end (ie. we're breaking out of some HTML at a bad point), drop it
            $ang_pos = strpos($INBETWEEN_TEXT, '<');
            if ($ang_pos !== false) {
                $INBETWEEN_TEXT = substr($INBETWEEN_TEXT, 0, $ang_pos);
            }
        }
        $new .= fix_entities($INBETWEEN_TEXT);
    }

    // Check we have everything closed
    while (count($TAG_STACK) != 0) {
        $previous = array_pop($TAG_STACK);
        $new .= '</' . $previous . '>';
    }

    // Remove some empty tags that shouldn't be empty (e.g. table).
    // Removing one empty tag can leave its parent empty (e.g. <table><tr></tr></table>), and the parent may come
    // earlier in the list, so we repeat the sweep until nothing changes. Each change shortens the string, so this terminates.
    $may_not_be_empty = array('br', 'hr', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'dd', 'dt', 'dl', 'li', 'ol', 'ul', 'rbc', 'rtc', 'rb', 'rt', 'rp', 'abbr', 'acronym', 'cite', 'dfn', 'ruby', 'bdo', 'img', 'param', 'input', 'select', 'object', 'caption', 'label', 'base', 'body', 'col', 'colgroup', 'map', 'optgroup', 'option', 'legend', 'area', 'form');
    do {
        $before_sweep = $new;
        foreach ($may_not_be_empty as $t) {
            $pruned = preg_replace('#<' . $t . '(\\s[^>]*)?' . '>\\s*</' . $t . '>#', '', $new);
            if (!is_null($pruned)) { // NULL signals a PCRE failure; keep the markup we have rather than losing it all
                $new = $pruned;
            }
        }
    } while ($new !== $before_sweep);

    // These only drop this function's local aliases; the globals themselves are untouched
    unset($OUT);
    unset($TAG_STACK);

    if ($is_escaped) {
        ocp_mark_as_escaped($new);
    }

    return $new;
}