/**
 * Checks a tag's attributes.
 *
 * Walks every attribute of the tag and collects error tuples for: wrong-case names
 * (when XML constraints are on), placeholder values, unknown attributes,
 * over-long alt text, javascript: hrefs, values that do not match the
 * registered pattern, inline CSS problems and duplicated IDs.
 * As side effects it records the document language, crawled/hyperlink/embed
 * URLs and the IDs seen so far in the corresponding globals.
 *
 * @param  string  $tag        The name of the tag to check
 * @param  array   $attributes A map of attributes (name=>value) the tag has
 * @param  boolean $self_close Whether this is a self-closing tag (not used by this function)
 * @param  boolean $close      Whether this is a closing tag (not used by this function)
 * @return ?array  List of error tuples (NULL: none)
 */
function _check_attributes($tag, $attributes, $self_close, $close)
{
    global $PSPELL_LINK, $THE_LANGUAGE, $XML_CONSTRAIN, $TAGS_DEPRECATE_ALLOW, $THE_DOCTYPE, $HYPERLINK_URLS, $CRAWLED_URLS, $EMBED_URLS, $TAGS_INLINE, $TAGS_BLOCK, $TAGS_NORMAL, $TAGS_INLINE_DEPRECATED, $TAGS_BLOCK_DEPRECATED, $TAGS_NORMAL_DEPRECATED, $TAG_ATTRIBUTES, $TAG_ATTRIBUTES_DEPRECATED, $IDS_SO_FAR, $ANCESTER_BLOCK, $ANCESTER_INLINE, $EXPECTING_TAG, $OUT, $POS, $LAST_A_TAG, $TAG_ATTRIBUTES_REQUIRED;

    $errors = array();

    $stub = $tag . '.';
    foreach ($attributes as $attribute => $value) {
        // PHP stores numeric-looking array keys as integers; force a string so the
        // name comparisons below cannot loosely match (0 == 'lang' is true before PHP 8).
        $attribute = strval($attribute);

        $lattribute = strtolower($attribute);
        if ($lattribute !== $attribute) {
            if ($XML_CONSTRAIN) {
                $errors[] = array('XHTML_CASE_ATTRIBUTE', $tag, $attribute);
            }
            $attribute = $lattribute;
        }

        $key = $stub . $attribute;

        if ($attribute === 'lang' || $attribute === 'xml:lang') {
            $THE_LANGUAGE = $value;
        }

        if ($value == 'TODO' || strpos($value, 'Lorum ') !== false) {
            $errors[] = array('XHTML_PLACEHOLDER');
        }

        if (!isset($TAG_ATTRIBUTES[$key]) && !isset($TAG_ATTRIBUTES_REQUIRED[$key])) {
            // Attribute is not registered for this tag.
            // NOTE(review): the two guards below mean an unknown attribute is only reported
            // for a tag present in BOTH a current and a deprecated tag table - confirm this
            // is intended rather than a single "tag is unknown everywhere" test.
            if (!isset($TAGS_BLOCK[$tag]) && !isset($TAGS_INLINE[$tag]) && !isset($TAGS_NORMAL[$tag])) {
                continue;
            }
            if (!isset($TAGS_BLOCK_DEPRECATED[$tag]) && !isset($TAGS_INLINE_DEPRECATED[$tag]) && !isset($TAGS_NORMAL_DEPRECATED[$tag])) {
                continue;
            }
            // Namespaced attributes are let through
            if (strpos($attribute, ':') !== false) {
                continue;
            }
            //if ($tag=='embed') continue; // Hack, to allow rich media to work in multiple browsers
            $errors[] = array('XHTML_UNKNOWN_ATTRIBUTE', $tag, $attribute);
            continue;
        }

        // NOTE(review): the "deprecated" error is keyed off $TAG_ATTRIBUTES_REQUIRED while
        // $TAG_ATTRIBUTES_DEPRECATED is declared above but never read - verify which table is meant.
        if (isset($TAG_ATTRIBUTES_REQUIRED[$key])) {
            $errors[] = array($TAGS_DEPRECATE_ALLOW ? 'XHTML_DEPRECATED_ATTRIBUTE' : 'XHTML_UNKNOWN_ATTRIBUTE', $tag, $attribute);
        }
        if ($attribute === 'target' && ($THE_DOCTYPE == DOCTYPE_XHTML_STRICT || $THE_DOCTYPE == DOCTYPE_XHTML_NEW)) {
            $errors[] = array('XHTML_UNKNOWN_ATTRIBUTE', $tag, $attribute);
        }

        // Spell-check human-readable attributes (only if pspell is available and spelling was requested)
        $is_described_meta = $attribute === 'content'
            && array_key_exists('http-equiv', $attributes)
            && (strtolower($attributes['http-equiv']) == 'description' || strtolower($attributes['http-equiv']) == 'keywords');
        $is_readable_text = $attribute === 'alt' || $attribute === 'title' || $is_described_meta || $attribute === 'summary';
        if ($is_readable_text && function_exists('pspell_new') && isset($GLOBALS['SPELLING']) && $value != '') {
            $_value = @html_entity_decode($value, ENT_QUOTES, get_charset());
            $errors = array_merge($errors, validate_spelling($_value));
        }

        if ($attribute === 'alt' && $tag != 'input' && strlen(strip_tags($value)) > 150) {
            $errors[] = array('WCAG_ATTRIBUTE_TOO_LONG', $attribute);
        }

        // Record URLs for later crawling / link checks
        if ($attribute === 'href' || $attribute === 'src' || ($attribute === 'data' && $tag == 'object')) {
            $CRAWLED_URLS[] = @html_entity_decode($value, ENT_QUOTES, get_charset());
            if ($tag == 'a') {
                $HYPERLINK_URLS[] = @html_entity_decode($value, ENT_QUOTES, get_charset());
            }
        }

        // Record URLs of resources that get embedded into the page
        $is_embed_url = ($attribute === 'src' && ($tag == 'embed' || $tag == 'script' || $tag == 'iframe' || $tag == 'img'))
            || ($attribute === 'href' && $tag == 'link' && isset($attributes['rel']) && $attributes['rel'] == 'stylesheet')
            || ($attribute === 'data' && $tag == 'object')
            || ($attribute === 'code' && $tag == 'applet');
        if ($is_embed_url) {
            $EMBED_URLS[] = @html_entity_decode($value, ENT_QUOTES, get_charset());
        }

        if ($attribute === 'href' && strtolower(substr($value, 0, 11)) == 'javascript:') {
            $errors[] = array('XHTML_BAD_ATTRIBUTE_VALUE', $attribute, $value, 'no js href');
        }

        // Validate the value against the registered pattern. An attribute that is only
        // known through $TAG_ATTRIBUTES_REQUIRED has no pattern, so there is nothing to
        // match against (previously this read an undefined index and then reported a
        // bogus bad-value error against an empty pattern).
        if (isset($TAG_ATTRIBUTES[$key])) {
            $reg_exp = $TAG_ATTRIBUTES[$key];
            // '(.|\n)*' is the "anything goes" pattern, so skip the regexp work for it; a value of 'x' is always tolerated
            if ($reg_exp != '(.|\\n)*' && preg_match('#^' . $reg_exp . '$#', $value) == 0 && $value != 'x') {
                $errors[] = array('XHTML_BAD_ATTRIBUTE_VALUE', $attribute, $value, $reg_exp);
            }
        }

        if ($attribute === 'style' && $GLOBALS['VALIDATION_CSS']) {
            if (!function_exists('do_template') && strpos($value, '{') === false && strpos($value, 'float:') === false && strpos($value, ': none') === false && strpos($value, ': inline') === false && strpos($value, ': block') === false) {
                $errors[] = array('CSS_INLINE_STYLES');
            }
            $css_validity = _validate_css_class($value, 0);
            if (is_array($css_validity)) {
                // Some kind of error
                $errors = array_merge($errors, $css_validity);
            }
        }

        if ($attribute === 'id') {
            // IDs are tracked case-insensitively
            if (isset($IDS_SO_FAR[strtolower($value)])) {
                $errors[] = array('XHTML_DUPLICATED_ID', strval($value));
            }
            $IDS_SO_FAR[strtolower($value)] = 1;
        }
    }

    return $errors == array() ? NULL : $errors;
}
/**
 * Check the specified XHTML, and return the results.
 *
 * Resets the validator's global state, then pulls tags one at a time from
 * _get_next_tag(), maintaining a stack of open tags and collecting errors
 * (structure, parent/child rules, empty tags, script commenting, ...).
 * After the tag stream ends it runs document-level checks and, if pspell
 * is available and spelling was requested, a spell check of the text.
 *
 * @param  string  $out                   The XHTML to validate
 * @param  boolean $well_formed_only      Whether to avoid checking for relational errors (true implies just a quick structural check, aka a 'well formed' check)
 * @param  boolean $is_fragment           Whether what is being validated is an HTML fragment, rather than a whole document
 * @param  boolean $validation_javascript Validate javascript (copied to $VALIDATION_JAVASCRIPT)
 * @param  boolean $validation_css        Validate CSS (copied to $VALIDATION_CSS)
 * @param  boolean $validation_wcag       Validate WCAG (copied to $VALIDATION_WCAG)
 * @param  boolean $validation_compat     Validate for compatibility (copied to $VALIDATION_COMPAT)
 * @param  boolean $validation_ext_files  Validate external files (copied to $VALIDATION_EXT_FILES)
 * @param  boolean $validation_manual     Bring up messages about manual checks (copied to $VALIDATION_MANUAL)
 * @return array   Map with keys 'level_ranges', 'tag_ranges', 'value_ranges' and 'errors'
 */
function check_xhtml($out, $well_formed_only = false, $is_fragment = false, $validation_javascript = true, $validation_css = true, $validation_wcag = true, $validation_compat = true, $validation_ext_files = true, $validation_manual = false)
{
    global $XHTML_VALIDATOR_OFF, $WELL_FORMED_ONLY, $VALIDATION_JAVASCRIPT, $VALIDATION_CSS, $VALIDATION_WCAG, $VALIDATION_COMPAT, $VALIDATION_EXT_FILES, $VALIDATION_MANUAL, $UNDER_XMLNS;

    // NULL means "validator on"; an integer is the nesting depth inside a validator-off region (see the counter handling in the main loop)
    $XHTML_VALIDATOR_OFF = mixed();
    $WELL_FORMED_ONLY = $well_formed_only;
    if (!$WELL_FORMED_ONLY) {
        require_code('validation2');
    }
    $VALIDATION_JAVASCRIPT = $validation_javascript;
    $VALIDATION_CSS = $validation_css;
    $VALIDATION_WCAG = $validation_wcag;
    $VALIDATION_COMPAT = $validation_compat;
    $VALIDATION_EXT_FILES = $validation_ext_files;
    $VALIDATION_MANUAL = $validation_manual;

    global $IDS_SO_FAR;
    $IDS_SO_FAR = array();

    $content_start_stack = array();

    global $BLOCK_CONSTRAIN, $XML_CONSTRAIN, $LAST_TAG_ATTRIBUTES, $FOUND_DOCTYPE, $FOUND_DESCRIPTION, $FOUND_KEYWORDS, $FOUND_CONTENTTYPE, $THE_DOCTYPE, $TAGS_DEPRECATE_ALLOW, $URL_BASE, $PARENT_TAG, $TABS_SEEN, $KEYS_SEEN, $ANCHORS_SEEN, $ATT_STACK, $TAG_STACK, $POS, $LINENO, $LINESTART, $OUT, $T_POS, $PROHIBITIONS, $ONLY_PARENT, $ONLY_CHILDREN, $REQUIRE_ANCESTER, $LEN, $ANCESTER_BLOCK, $ANCESTER_INLINE, $POSSIBLY_EMPTY_TAGS, $MUST_SELFCLOSE_TAGS, $FOR_LABEL_IDS, $FOR_LABEL_IDS_2, $INPUT_TAG_IDS;
    global $TAG_RANGES, $VALUE_RANGES, $LAST_A_TAG, $A_LINKS, $XHTML_FORM_ENCODING;
    global $AREA_LINKS, $LAST_HEADING, $CRAWLED_URLS, $HYPERLINK_URLS, $EMBED_URLS, $THE_LANGUAGE, $PSPELL_LINK;
    global $TAGS_BLOCK, $TAGS_INLINE, $TAGS_NORMAL, $TAGS_BLOCK_DEPRECATED, $TAGS_INLINE_DEPRECATED, $TAGS_NORMAL_DEPRECATED;

    // Reset all global parser/validator state for this run
    $PSPELL_LINK = NULL;
    $THE_LANGUAGE = 'en';
    $THE_DOCTYPE = $is_fragment ? DOCTYPE_XHTML : DOCTYPE_HTML;
    $TAGS_DEPRECATE_ALLOW = true;
    $XML_CONSTRAIN = $is_fragment;
    $BLOCK_CONSTRAIN = false;
    $LINENO = 0;
    $LINESTART = 0;
    $HYPERLINK_URLS = array();
    $EMBED_URLS = array();
    $AREA_LINKS = array();
    $LAST_HEADING = 0;
    $FOUND_DOCTYPE = false;
    $FOUND_CONTENTTYPE = false;
    $FOUND_KEYWORDS = false;
    $FOUND_DESCRIPTION = false;
    $CRAWLED_URLS = array();
    $PARENT_TAG = '';
    $XHTML_FORM_ENCODING = '';
    $UNDER_XMLNS = false;
    $KEYS_SEEN = array();
    $TABS_SEEN = array();
    $TAG_RANGES = array();
    $VALUE_RANGES = array();
    $LAST_A_TAG = NULL;
    $ANCHORS_SEEN = array();
    $FOR_LABEL_IDS = array();
    $FOR_LABEL_IDS_2 = array();
    $INPUT_TAG_IDS = array();
    $TAG_STACK = array();
    $ATT_STACK = array();
    $ANCESTER_BLOCK = 0;
    $ANCESTER_INLINE = 0;
    $POS = 0;
    $OUT = $out;
    unset($out);
    $LEN = strlen($OUT);

    $level_ranges = array();
    $stack_size = 0;
    // Tags a whole document must contain; entries are removed as the tags are met
    $to_find = array('html' => 1, 'head' => 1, 'title' => 1);
    // Per-parent allowance counters for tags that may appear only once; saved/restored as tags open/close
    $only_one_of_stack = array();
    $only_one_of_template = array('title' => 1, 'head' => 1, 'body' => 1, 'base' => 1, 'thead' => 1, 'tfoot' => 1);
    $only_one_of = $only_one_of_template;
    $A_LINKS = array();
    $previous = '';
    if (!isset($GLOBALS['MAIL_MODE'])) {
        $GLOBALS['MAIL_MODE'] = false;
    }
    $errors = array();
    $bad_root = false;

    $token = _get_next_tag();
    while (!is_null($token)) {
        // echo $T_POS.'-'.$POS.' ('.$stack_size.')<br />';

        // A token may come wrapped as array(token, errors); unwrap it, recording the errors if the validator is on
        while (is_array($token) && count($token) != 0) {
            if (is_null($XHTML_VALIDATOR_OFF)) {
                foreach ($token[1] as $error) {
                    $errors[] = _xhtml_error($error[0], array_key_exists(1, $error) ? $error[1] : '', array_key_exists(2, $error) ? $error[2] : '', array_key_exists(3, $error) ? $error[3] : '', array_key_exists('raw', $error) ? $error['raw'] : false, array_key_exists('pos', $error) ? $error['pos'] : 0);
                }
            }
            // No tag came with the errors: parsing cannot continue. This must stop the run
            // regardless of the validator being on or off, otherwise the NULL token would
            // be processed below as if it were an opening tag.
            if (is_null($token[0])) {
                return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
            }
            $token = $token[0];
        }

        $basis_token = _get_tag_basis($token);

        // Open, close, or monotonic? (no '/' at all: opening; '/' at offset 1: closing; otherwise treated as self-closing)
        $term = strpos($token, '/');

        // Inside a validator-off region: keep count of nested opens/closes so we know when the region ends
        if (!is_null($XHTML_VALIDATOR_OFF)) {
            if ($term === false) {
                $XHTML_VALIDATOR_OFF++;
            } elseif ($term == 1) {
                if ($XHTML_VALIDATOR_OFF == 0) {
                    $XHTML_VALIDATOR_OFF = NULL;
                } else {
                    $XHTML_VALIDATOR_OFF--;
                }
            }
        }

        if ($term !== 1) {
            // Opening or self-closing tag

            if (isset($only_one_of[$basis_token])) {
                if ($only_one_of[$basis_token] == 0) {
                    $errors[] = _xhtml_error('XHTML_ONLY_ONE_ALLOWED', $basis_token);
                }
                $only_one_of[$basis_token]--;
            }

            // echo 'Push $basis_token<br />';
            $level_ranges[] = array($stack_size, $T_POS, $POS);
            if (isset($to_find[$basis_token])) {
                unset($to_find[$basis_token]);
            }

            // Relational checks
            if (!$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                if (!$is_fragment && $stack_size == 0 && $basis_token != 'html') {
                    $errors[] = _xhtml_error('XHTML_BAD_ROOT');
                    $bad_root = true;
                }

                if ($stack_size != 0) {
                    if (isset($ONLY_CHILDREN[$PARENT_TAG])) {
                        if (!in_array($basis_token, $ONLY_CHILDREN[$PARENT_TAG])) {
                            $errors[] = _xhtml_error('XHTML_BAD_CHILD', $basis_token, $PARENT_TAG);
                        }
                    }

                    /*if (isset($PROHIBITIONS[$PARENT_TAG])) { $prohibitions=$PROHIBITIONS[$PARENT_TAG]; if (in_array($basis_token,$prohibitions)) $errors[]=_xhtml_error('XHTML_PROHIBITION',$basis_token,$PARENT_TAG); }*/
                    // Prohibitions apply against every currently open ancestor, not just the direct parent
                    foreach ($TAG_STACK as $parent_tag) {
                        if (isset($PROHIBITIONS[$parent_tag])) {
                            $prohibitions = $PROHIBITIONS[$parent_tag];
                            if (in_array($basis_token, $prohibitions)) {
                                $errors[] = _xhtml_error('XHTML_PROHIBITION', $basis_token, $parent_tag);
                            }
                        }
                    }
                }

                if (isset($REQUIRE_ANCESTER[$basis_token]) && !$is_fragment) {
                    if (!in_array($REQUIRE_ANCESTER[$basis_token], $TAG_STACK)) {
                        $errors[] = _xhtml_error('XHTML_MISSING_ANCESTER', $basis_token, $REQUIRE_ANCESTER[$basis_token]);
                    }
                }

                if (isset($ONLY_PARENT[$basis_token])) {
                    if ($stack_size == 0) {
                        if (!$is_fragment) {
                            $errors[] = _xhtml_error('XHTML_BAD_PARENT', $basis_token, '/');
                        }
                    } else {
                        if (!in_array($PARENT_TAG, $ONLY_PARENT[$basis_token])) {
                            $errors[] = _xhtml_error('XHTML_BAD_PARENT', $basis_token, $PARENT_TAG);
                        }
                    }
                }
            }

            // In order to ease validation, we tolerate these in the parser (but of course, mark as errors)
            if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && $term === false && isset($MUST_SELFCLOSE_TAGS[$basis_token])) {
                // A must-self-close tag written as an opening tag: flag it but do not push it on the stack
                if ($XML_CONSTRAIN) {
                    $errors[] = _xhtml_error('XHTML_NONEMPTY_TAG', $basis_token);
                }
            } else {
                if ($term === false) {
                    // Real opening tag: push onto all the parallel stacks
                    $PARENT_TAG = $basis_token;
                    array_push($TAG_STACK, $basis_token);
                    array_push($ATT_STACK, $LAST_TAG_ATTRIBUTES);
                    array_push($content_start_stack, $POS);
                    array_push($only_one_of_stack, $only_one_of);
                    $only_one_of = $only_one_of_template;
                    ++$stack_size;
                } else {
                    // Self-closing form used
                    if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && (!$XML_CONSTRAIN || !isset($MUST_SELFCLOSE_TAGS[$basis_token]))) {
                        if (!$bad_root) {
                            $errors[] = _xhtml_error('XHTML_CEMPTY_TAG', $basis_token);
                        }
                    }
                }
            }
        } elseif ($term == 1) {
            // Closing tag.
            // HTML allows implicit closing. We will flag errors when we have to do it. See 1-2-3 note
            do {
                // For case 3 (see note below)
                if (!in_array($basis_token, $TAG_STACK)) {
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_NO_CLOSE_MATCH', $basis_token, $previous);
                    }
                    break;
                }

                $previous = array_pop($TAG_STACK);
                $PARENT_TAG = $TAG_STACK == array() ? '' : $TAG_STACK[count($TAG_STACK) - 1];
                $start_pos = array_pop($content_start_stack);
                array_pop($ATT_STACK);
                $only_one_of = array_pop($only_one_of_stack);

                if (is_null($previous)) {
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_MORE_CLOSE_THAN_OPEN', $basis_token);
                    }
                    break;
                }

                if ($basis_token != $previous) {
                    // This is really tricky, and totally XHTML-incompliant. There are three situations:
                    // 1) Overlapping tags. We really can't survive this, and it's very invalid. We could only detect it if we broke support for cases (1) and (2). e.g. <i><b></i></b>
                    // 2) Implicit closing. We close everything implicitly until we find the matching tag. E.g. <i><b></i>
                    // 3) Closing something that was never open. This is tricky - we can't survive it if it was opened somewhere as a parent, as we'd end up closing a whole load of tags by rule (2) - but if it's a lone closing, we can skip it. Good e.g. <b></i></b>. Bad e.g. <div><p></div></p></div>
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_NO_CLOSE_MATCH', $basis_token, $previous);
                    }
                }

                if (!$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                    if (isset($MUST_SELFCLOSE_TAGS[$previous]) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XHTML_NONEMPTY_TAG', $previous);
                    }

                    // A known tag with nothing but whitespace between its open and close
                    if (!isset($MUST_SELFCLOSE_TAGS[$previous]) && !isset($POSSIBLY_EMPTY_TAGS[$previous]) && trim(substr($OUT, $start_pos, $T_POS - $start_pos)) == '') {
                        if (isset($TAGS_BLOCK[$previous]) || isset($TAGS_INLINE[$previous]) || isset($TAGS_NORMAL[$previous]) || isset($TAGS_BLOCK_DEPRECATED[$previous]) || isset($TAGS_INLINE_DEPRECATED[$previous]) || isset($TAGS_NORMAL_DEPRECATED[$previous])) {
                            $errors[] = _xhtml_error('XHTML_EMPTY_TAG', $previous);
                        }
                    }
                }

                $stack_size--;
                $level_ranges[] = array($stack_size, $T_POS, $POS);
                // echo 'Popped $previous<br />';

                if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY) {
                    if ($previous == 'script') {
                        // Inline script must be wrapped in comment markers or a CDATA section, and must not contain '</'
                        $tag_contents = substr($OUT, $start_pos, $T_POS - $start_pos);
                        $c_section = strpos($tag_contents, ']]>');
                        if (trim($tag_contents) != '' && strpos($tag_contents, '//-->') === false && strpos($tag_contents, '// -->') === false && $c_section === false) {
                            $errors[] = _xhtml_error('XHTML_SCRIPT_COMMENTING', $previous);
                        } elseif ($c_section === false && strpos($tag_contents, '<!--') !== false) {
                            if ($XML_CONSTRAIN) {
                                $errors[] = _xhtml_error('XHTML_CDATA');
                            }
                        }
                        if (strpos($tag_contents, '</') !== false) {
                            $errors[] = _xhtml_error('XML_JS_TAG_ESCAPE');
                        }
                    }
                }
            } while ($basis_token != $previous);
        }
        /*else { $level_ranges[]=array($stack_size,$T_POS,$POS); // it's monitonic, so ignore }*/

        $token = _get_next_tag();
    }

    // Check we have everything closed
    if ($stack_size != 0) {
        if ($XML_CONSTRAIN) {
            $errors[] = _xhtml_error('XML_NO_CLOSE', array_pop($TAG_STACK));
        }
        return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
    }

    // Document-level checks
    if (!$well_formed_only) {
        if (!$is_fragment) {
            foreach (array_keys($to_find) as $tag) {
                $errors[] = _xhtml_error('XHTML_MISSING_TAG', $tag);
            }

            // Web pages need a doctype; in mail mode a doctype is itself an error
            if (!$FOUND_DOCTYPE && !$GLOBALS['MAIL_MODE']) {
                $errors[] = _xhtml_error('XHTML_DOCTYPE');
            }
            if ($FOUND_DOCTYPE && $GLOBALS['MAIL_MODE']) {
                $errors[] = _xhtml_error('MAIL_DOCTYPE');
            }
            if (!$FOUND_CONTENTTYPE) {
                $errors[] = _xhtml_error('XHTML_CONTENTTYPE');
            }
            //if (!$FOUND_KEYWORDS) $errors[]=_xhtml_error('XHTML_KEYWORDS');
            //if (!$FOUND_DESCRIPTION) $errors[]=_xhtml_error('XHTML_DESCRIPTION');
        }

        if (!$is_fragment) {
            // Check that all area-links have a corresponding hyperlink
            foreach (array_keys($AREA_LINKS) as $id) {
                if (!in_array($id, $HYPERLINK_URLS)) {
                    $errors[] = _xhtml_error('WCAG_AREA_EQUIV', $id);
                }
            }

            // Check that all labels apply to real input tags
            foreach (array_keys($FOR_LABEL_IDS_2) as $id) {
                if (!isset($INPUT_TAG_IDS[$id])) {
                    $errors[] = _xhtml_error('XHTML_ID_UNBOUND', $id);
                }
            }
        }
    }

    // Main spelling
    if (function_exists('pspell_new') && isset($GLOBALS['SPELLING'])) {
        // Blank out <style> and <script> blocks with same-length runs of spaces, so their contents are not spell-checked
        $stripped = $OUT;
        $matches = array();
        $num_matches = preg_match_all('#\\<style.*\\</style\\>#Umis', $stripped, $matches);
        for ($i = 0; $i < $num_matches; $i++) {
            $stripped = str_replace($matches[0][$i], str_repeat(' ', strlen($matches[0][$i])), $stripped);
        }
        $num_matches = preg_match_all('#\\<script.*\\</script\\>#Umis', $stripped, $matches);
        for ($i = 0; $i < $num_matches; $i++) {
            $stripped = str_replace($matches[0][$i], str_repeat(' ', strlen($matches[0][$i])), $stripped);
        }
        $stripped = @html_entity_decode(strip_tags($stripped), ENT_QUOTES, get_charset());

        require_code('validation2');
        $new_errors = validate_spelling($stripped);
        $misspellings = array();
        global $POS, $LINENO, $LINESTART;
        foreach ($new_errors as $error) {
            // Report each misspelt word once only
            if (array_key_exists($error[1], $misspellings)) {
                continue;
            }
            $misspellings[$error[1]] = 1;

            // Point the position globals at the first occurrence of the word before raising the error
            // (presumably _xhtml_error reads them to locate the error - confirm against its definition)
            $POS = strpos($OUT, $error[1]);
            $LINESTART = strrpos(substr($OUT, 0, $POS), chr(10));
            $LINENO = substr_count(substr($OUT, 0, $LINESTART), chr(10)) + 1;
            $errors[] = _xhtml_error($error[0], $error[1]);
        }
    }

    unset($OUT);

    return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
}