Beispiel #1
0
/**
 * Check the specified XHTML, and return the results.
 *
 * Resets the validator's global state, then reads tokens one at a time from _get_next_tag(),
 * keeping a stack of the currently open tags. Opening tags are checked against the
 * parent/child/ancestor rule tables; closing tags pop the stack (implicitly closing anything
 * left open inside them). After the token loop, document-level checks (required tags, doctype,
 * meta information, label/area bindings) and an optional spelling pass are run.
 *
 * @param  string			The XHTML to validate
 * @param  boolean		Whether to skip the relational checks (true means only the quick structural check runs, aka a 'well formed' check)
 * @param  boolean		Whether what is being validated is an HTML fragment, rather than a whole document
 * @param  boolean		Validate javascript
 * @param  boolean		Validate CSS
 * @param  boolean		Validate WCAG
 * @param  boolean		Validate for compatibility
 * @param  boolean		Validate external files
 * @param  boolean		Bring up messages about manual checks
 * @return map			Map with keys 'level_ranges', 'tag_ranges', 'value_ranges' and 'errors'. Every return path of this function returns this map; 'errors' is an empty array when nothing was flagged
 */
function check_xhtml($out, $well_formed_only = false, $is_fragment = false, $validation_javascript = true, $validation_css = true, $validation_wcag = true, $validation_compat = true, $validation_ext_files = true, $validation_manual = false)
{
    // Publish the requested options as globals. Only $WELL_FORMED_ONLY is read back in this function;
    // the others are presumably consumed by the helper functions -- TODO confirm
    global $XHTML_VALIDATOR_OFF, $WELL_FORMED_ONLY, $VALIDATION_JAVASCRIPT, $VALIDATION_CSS, $VALIDATION_WCAG, $VALIDATION_COMPAT, $VALIDATION_EXT_FILES, $VALIDATION_MANUAL;
    // NOTE(review): mixed() is defined elsewhere. The is_null() tests below treat NULL as "validator on"
    // and an integer as "validator off, at this nesting depth", so mixed() presumably yields NULL -- confirm
    $XHTML_VALIDATOR_OFF = mixed();
    $WELL_FORMED_ONLY = $well_formed_only;
    $VALIDATION_JAVASCRIPT = $validation_javascript;
    $VALIDATION_CSS = $validation_css;
    $VALIDATION_WCAG = $validation_wcag;
    $VALIDATION_COMPAT = $validation_compat;
    $VALIDATION_EXT_FILES = $validation_ext_files;
    $VALIDATION_MANUAL = $validation_manual;
    global $IDS_SO_FAR;
    $IDS_SO_FAR = array();
    // Parallel to $TAG_STACK: for each open tag, the offset in $OUT where its content starts
    $content_start_stack = array();
    global $BLOCK_CONSTRAIN, $XML_CONSTRAIN, $LAST_TAG_ATTRIBUTES, $FOUND_DOCTYPE, $FOUND_DESCRIPTION, $FOUND_KEYWORDS, $FOUND_CONTENTTYPE, $THE_DOCTYPE, $TAGS_DEPRECATE_ALLOW, $URL_BASE, $PARENT_TAG, $TABS_SEEN, $KEYS_SEEN, $ANCHORS_SEEN, $ATT_STACK, $TAG_STACK, $POS, $LINENO, $LINESTART, $OUT, $T_POS, $PROHIBITIONS, $ONLY_PARENT, $ONLY_CHILDREN, $REQUIRE_ANCESTER, $LEN, $ANCESTER_BLOCK, $ANCESTER_INLINE, $POSSIBLY_EMPTY_TAGS, $MUST_SELFCLOSE_TAGS, $FOR_LABEL_IDS, $FOR_LABEL_IDS_2, $INPUT_TAG_IDS;
    global $TAG_RANGES, $VALUE_RANGES, $LAST_A_TAG, $A_LINKS, $XHTML_FORM_ENCODING;
    global $AREA_LINKS, $LAST_HEADING, $CRAWLED_URLS, $HYPERLINK_URLS, $EMBED_URLS, $THE_LANGUAGE, $PSPELL_LINK;
    // Reset all per-run parser/validator state to its starting values
    $PSPELL_LINK = NULL;
    $THE_LANGUAGE = 'en';
    // Fragments start out as XHTML with XML constraints on; whole documents start out as HTML with them off
    $THE_DOCTYPE = $is_fragment ? DOCTYPE_XHTML : DOCTYPE_HTML;
    $TAGS_DEPRECATE_ALLOW = true;
    $XML_CONSTRAIN = $is_fragment;
    $BLOCK_CONSTRAIN = false;
    $LINENO = 0;
    $LINESTART = 0;
    $HYPERLINK_URLS = array();
    $EMBED_URLS = array();
    $AREA_LINKS = array();
    $LAST_HEADING = 0;
    $FOUND_DOCTYPE = false;
    $FOUND_CONTENTTYPE = false;
    $FOUND_KEYWORDS = false;
    $FOUND_DESCRIPTION = false;
    $CRAWLED_URLS = array();
    $PARENT_TAG = '';
    $XHTML_FORM_ENCODING = '';
    $KEYS_SEEN = array();
    $TABS_SEEN = array();
    $TAG_RANGES = array();
    $VALUE_RANGES = array();
    $LAST_A_TAG = NULL;
    $ANCHORS_SEEN = array();
    $FOR_LABEL_IDS = array();
    $FOR_LABEL_IDS_2 = array();
    $INPUT_TAG_IDS = array();
    $TAG_STACK = array();
    $ATT_STACK = array();
    $ANCESTER_BLOCK = 0;
    $ANCESTER_INLINE = 0;
    $POS = 0;
    // From here on the input is only accessed through the global $OUT
    $OUT = $out;
    unset($out);
    $LEN = strlen($OUT);
    // List of array(stack depth, $T_POS, $POS) entries, one per opened/closed tag; returned to the caller
    $level_ranges = array();
    // Number of entries pushed by this function and not yet popped (the nesting depth)
    $stack_size = 0;
    // Tags a whole document must contain; each is removed from this map when first seen
    $to_find = array('html' => 1, 'head' => 1, 'title' => 1);
    // Tags allowed only once per parent: $only_one_of holds the remaining allowance for the current
    // parent, and is saved/restored on $only_one_of_stack as tags are opened/closed
    $only_one_of_stack = array();
    $only_one_of_template = array('title' => 1, 'head' => 1, 'body' => 1, 'base' => 1, 'thead' => 1, 'tfoot' => 1);
    $only_one_of = $only_one_of_template;
    $A_LINKS = array();
    // The tag most recently popped from $TAG_STACK (used in close-mismatch error messages)
    $previous = '';
    // Default the mail-mode flag so the doctype checks further down can read it safely
    if (!isset($GLOBALS['MAIL_MODE'])) {
        $GLOBALS['MAIL_MODE'] = false;
    }
    $errors = array();
    // Main loop: one iteration per token; a NULL token ends the loop
    $token = _get_next_tag();
    while (!is_null($token)) {
        //		echo $T_POS.'-'.$POS.' ('.$stack_size.')<br />';
        // An array token is a pair: [0] is the tag text (or NULL), [1] is a list of error tuples
        if (is_array($token) && count($token) != 0) {
            if (is_null($XHTML_VALIDATOR_OFF)) {
                // Each error tuple is: [0] error code, optional [1]..[3] parameters, optional 'raw' flag and 'pos' offset
                foreach ($token[1] as $error) {
                    $errors[] = _xhtml_error($error[0], array_key_exists(1, $error) ? $error[1] : '', array_key_exists(2, $error) ? $error[2] : '', array_key_exists(3, $error) ? $error[3] : '', array_key_exists('raw', $error) ? $error['raw'] : false, array_key_exists('pos', $error) ? $error['pos'] : 0);
                }
                // No tag text came with the errors: stop here and return what has been collected so far
                if (is_null($token[0])) {
                    return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
                }
            }
            $token = $token[0];
        }
        $basis_token = _get_tag_basis($token);
        // Open, close, or monitonic?
        // $term is the offset of the first '/' in the token: false => opening tag, 1 => closing tag,
        // anything else => treated as self-closing (presumably tokens look like '<b>', '</b>', '<br />' -- confirm)
        $term = strpos($token, '/');
        // While the validator is switched off, $XHTML_VALIDATOR_OFF counts the nesting depth inside the
        // switched-off region: openers increment it, closers decrement it, and a closer at depth 0 switches
        // the validator back on (NULL). Nothing in this function switches it off; that presumably happens in a helper
        if (!is_null($XHTML_VALIDATOR_OFF)) {
            if ($term === false) {
                $XHTML_VALIDATOR_OFF++;
            } elseif ($term == 1) {
                if ($XHTML_VALIDATOR_OFF == 0) {
                    $XHTML_VALIDATOR_OFF = NULL;
                } else {
                    $XHTML_VALIDATOR_OFF--;
                }
            }
        }
        if ($term !== 1) {
            // Opening or self-closing tag
            if (isset($only_one_of[$basis_token])) {
                // The error fires only when the allowance is exactly 0 (i.e. on the second occurrence
                // under the current parent); the counter keeps going down after that
                if ($only_one_of[$basis_token] == 0) {
                    $errors[] = _xhtml_error('XHTML_ONLY_ONE_ALLOWED', $basis_token);
                }
                $only_one_of[$basis_token]--;
            }
            //			echo 'Push $basis_token<br />';
            $level_ranges[] = array($stack_size, $T_POS, $POS);
            if (isset($to_find[$basis_token])) {
                unset($to_find[$basis_token]);
            }
            // Relational checks: skipped for well-formed-only runs and while the validator is off
            if (!$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                // A whole document must have <html> as its outermost tag
                if (!$is_fragment && $stack_size == 0 && $basis_token != 'html') {
                    $errors[] = _xhtml_error('XHTML_BAD_ROOT');
                }
                if ($stack_size != 0) {
                    // The direct parent may restrict which children it accepts
                    if (isset($ONLY_CHILDREN[$PARENT_TAG])) {
                        if (!in_array($basis_token, $ONLY_CHILDREN[$PARENT_TAG])) {
                            $errors[] = _xhtml_error('XHTML_BAD_CHILD', $basis_token, $PARENT_TAG);
                        }
                    }
                    /*if (isset($PROHIBITIONS[$PARENT_TAG]))
                    		{
                    			$prohibitions=$PROHIBITIONS[$PARENT_TAG];
                    			if (in_array($basis_token,$prohibitions)) $errors[]=_xhtml_error('XHTML_PROHIBITION',$basis_token,$PARENT_TAG);
                    		}*/
                    // Prohibitions apply to every open ancestor, not just the direct parent
                    foreach ($TAG_STACK as $parent_tag) {
                        if (isset($PROHIBITIONS[$parent_tag])) {
                            $prohibitions = $PROHIBITIONS[$parent_tag];
                            if (in_array($basis_token, $prohibitions)) {
                                $errors[] = _xhtml_error('XHTML_PROHIBITION', $basis_token, $parent_tag);
                            }
                        }
                    }
                }
                // Some tags need a particular ancestor somewhere on the stack (not enforced for fragments)
                if (isset($REQUIRE_ANCESTER[$basis_token]) && !$is_fragment) {
                    if (!in_array($REQUIRE_ANCESTER[$basis_token], $TAG_STACK)) {
                        $errors[] = _xhtml_error('XHTML_MISSING_ANCESTER', $basis_token, $REQUIRE_ANCESTER[$basis_token]);
                    }
                }
                // Some tags are only valid directly under particular parents
                if (isset($ONLY_PARENT[$basis_token])) {
                    if ($stack_size == 0) {
                        // At the top level there is no parent at all; reported as parent '/' (tolerated for fragments)
                        if (!$is_fragment) {
                            $errors[] = _xhtml_error('XHTML_BAD_PARENT', $basis_token, '/');
                        }
                    } else {
                        if (!in_array($PARENT_TAG, $ONLY_PARENT[$basis_token])) {
                            $errors[] = _xhtml_error('XHTML_BAD_PARENT', $basis_token, $PARENT_TAG);
                        }
                    }
                }
            }
            // In order to ease validation, we tolerate these in the parser (but of course, mark as errors)
            // i.e. a must-self-close tag written as a plain opener is NOT pushed onto the stack, and is
            // only reported when XML constraints are on
            if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && $term === false && isset($MUST_SELFCLOSE_TAGS[$basis_token])) {
                if ($XML_CONSTRAIN) {
                    $errors[] = _xhtml_error('XHTML_NONEMPTY_TAG', $basis_token);
                }
            } else {
                if ($term === false) {
                    // A genuine opener: push it (and its attributes, content start offset and the
                    // parent's only-one-of allowances) and give the new level fresh allowances
                    $PARENT_TAG = $basis_token;
                    array_push($TAG_STACK, $basis_token);
                    array_push($ATT_STACK, $LAST_TAG_ATTRIBUTES);
                    array_push($content_start_stack, $POS);
                    array_push($only_one_of_stack, $only_one_of);
                    $only_one_of = $only_one_of_template;
                    ++$stack_size;
                } else {
                    // A self-closed tag: flagged unless XML constraints are on and the tag is one that must self-close.
                    // (The is_null($XHTML_VALIDATOR_OFF) test appears twice in this condition; the repeat is redundant)
                    if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && (!$XML_CONSTRAIN || !isset($MUST_SELFCLOSE_TAGS[$basis_token])) && is_null($XHTML_VALIDATOR_OFF)) {
                        $errors[] = _xhtml_error('XHTML_CEMPTY_TAG', $basis_token);
                    }
                }
            }
        } elseif ($term == 1) {
            // Closing tag.
            // HTML allows implicit closing. We will flag errors when we have to do it. See 1-2-3 note
            // Each pass of this loop pops one tag; it repeats until the popped tag is the one being closed
            do {
                // For case 3 (see note below)
                // The tag being closed is not open anywhere: skip the closer, leaving the stack untouched
                if (!in_array($basis_token, $TAG_STACK)) {
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_NO_CLOSE_MATCH', $basis_token, $previous);
                    }
                    break;
                }
                // Pop all the parallel stacks together and restore the parent's context
                $previous = array_pop($TAG_STACK);
                $PARENT_TAG = $TAG_STACK == array() ? '' : $TAG_STACK[count($TAG_STACK) - 1];
                $start_pos = array_pop($content_start_stack);
                array_pop($ATT_STACK);
                $only_one_of = array_pop($only_one_of_stack);
                // Defensive: array_pop() gives NULL on an empty stack, though the in_array() guard above
                // means the stack held at least the matching tag when we got here
                if (is_null($previous)) {
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_MORE_CLOSE_THAN_OPEN', $basis_token);
                    }
                    break;
                }
                if ($basis_token != $previous) {
                    // This is really tricky, and totally XHTML-incompliant. There are three situations:
                    // 1) Overlapping tags. We really can't survive this, and it's very invalid. We could only detect it if we broke support for cases (1) and (2). e.g. <i><b></i></b>
                    // 2) Implicit closing. We close everything implicitly until we find the matching tag. E.g. <i><b></i>
                    // 3) Closing something that was never open. This is tricky - we can't survive it if it was opened somewhere as a parent, as we'd end up closing a whole load of tags by rule (2) - but if it's a lone closing, we can skip it. Good e.g. <b></i></b>. Bad e.g. <div><p></div></p></div>
                    if (is_null($XHTML_VALIDATOR_OFF) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XML_NO_CLOSE_MATCH', $basis_token, $previous);
                    }
                }
                if (!$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                    // A must-self-close tag that was opened and then closed separately
                    if (isset($MUST_SELFCLOSE_TAGS[$previous]) && $XML_CONSTRAIN) {
                        $errors[] = _xhtml_error('XHTML_NONEMPTY_TAG', $previous);
                    }
                    // A tag with nothing but whitespace between its opener and this closer, where emptiness is not allowed
                    if (!isset($MUST_SELFCLOSE_TAGS[$previous]) && !isset($POSSIBLY_EMPTY_TAGS[$previous]) && trim(substr($OUT, $start_pos, $T_POS - $start_pos)) == '') {
                        $errors[] = _xhtml_error('XHTML_EMPTY_TAG', $previous);
                    }
                }
                $stack_size--;
                $level_ranges[] = array($stack_size, $T_POS, $POS);
                //			echo 'Popped $previous<br />';
                // (The is_null($XHTML_VALIDATOR_OFF) test appears twice in this condition; the repeat is redundant)
                if (is_null($XHTML_VALIDATOR_OFF) && !$WELL_FORMED_ONLY && is_null($XHTML_VALIDATOR_OFF)) {
                    if ($previous == 'script') {
                        // Inspect the raw script body for comment/CDATA wrapping and for unescaped '</'
                        $tag_contents = substr($OUT, $start_pos, $T_POS - $start_pos);
                        $c_section = strpos($tag_contents, ']]>');
                        if (trim($tag_contents) != '' && strpos($tag_contents, '//-->') === false && strpos($tag_contents, '// -->') === false && $c_section === false) {
                            // Non-empty script with neither a '//-->' style comment terminator nor a ']]>' CDATA terminator
                            $errors[] = _xhtml_error('XHTML_SCRIPT_COMMENTING', $previous);
                        } elseif ($c_section === false && strpos($tag_contents, '<!--') !== false) {
                            // Wrapped in an HTML comment rather than CDATA: only an error under XML constraints
                            if ($XML_CONSTRAIN) {
                                $errors[] = _xhtml_error('XHTML_CDATA');
                            }
                        }
                        if (strpos($tag_contents, '</') !== false) {
                            $errors[] = _xhtml_error('XML_JS_TAG_ESCAPE');
                        }
                    }
                }
            } while ($basis_token != $previous);
        }
        /*else
        		{
        			$level_ranges[]=array($stack_size,$T_POS,$POS);
        			// it's monitonic, so ignore
        		}*/
        $token = _get_next_tag();
    }
    // Check we have everything closed
    // If not, only the innermost unclosed tag is reported (and only under XML constraints), and we
    // return straight away: the document-level and spelling checks below are skipped
    if ($stack_size != 0) {
        if ($XML_CONSTRAIN) {
            $errors[] = _xhtml_error('XML_NO_CLOSE', array_pop($TAG_STACK));
        }
        return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
    }
    // Document-level checks. These read the $FOUND_* / link / label globals, which are reset above
    // but never set in this function -- presumably the tokeniser helpers fill them in (verify)
    if (!$well_formed_only) {
        if (!$is_fragment) {
            // Anything still left in $to_find was never seen
            foreach (array_keys($to_find) as $tag) {
                $errors[] = _xhtml_error('XHTML_MISSING_TAG', $tag);
            }
            // A doctype is required outside mail mode, and is itself an error in mail mode
            if (!$FOUND_DOCTYPE && !$GLOBALS['MAIL_MODE']) {
                $errors[] = _xhtml_error('XHTML_DOCTYPE');
            }
            if ($FOUND_DOCTYPE && $GLOBALS['MAIL_MODE']) {
                $errors[] = _xhtml_error('MAIL_DOCTYPE');
            }
            if (!$FOUND_CONTENTTYPE) {
                $errors[] = _xhtml_error('XHTML_CONTENTTYPE');
            }
            if (!$FOUND_KEYWORDS) {
                $errors[] = _xhtml_error('XHTML_KEYWORDS');
            }
            if (!$FOUND_DESCRIPTION) {
                $errors[] = _xhtml_error('XHTML_DESCRIPTION');
            }
        }
        if (!$is_fragment) {
            // Check that all area-links have a corresponding hyperlink
            foreach (array_keys($AREA_LINKS) as $id) {
                if (!in_array($id, $HYPERLINK_URLS)) {
                    $errors[] = _xhtml_error('WCAG_AREA_EQUIV', $id);
                }
            }
            // Check that all labels apply to real input tags
            foreach (array_keys($FOR_LABEL_IDS_2) as $id) {
                if (!isset($INPUT_TAG_IDS[$id])) {
                    $errors[] = _xhtml_error('XHTML_ID_UNBOUND', $id);
                }
            }
        }
    }
    // Main spelling
    // Runs only when the pspell extension is available and $GLOBALS['SPELLING'] is set
    // (note: it is not gated on $well_formed_only)
    if (function_exists('pspell_new') && isset($GLOBALS['SPELLING'])) {
        $stripped = $OUT;
        $matches = array();
        // Blank out <style> and <script> blocks with the same number of spaces, so that their
        // contents are not spell-checked
        $num_matches = preg_match_all('#\\<style.*\\</style\\>#Umis', $stripped, $matches);
        for ($i = 0; $i < $num_matches; $i++) {
            $stripped = str_replace($matches[0][$i], str_repeat(' ', strlen($matches[0][$i])), $stripped);
        }
        $num_matches = preg_match_all('#\\<script.*\\</script\\>#Umis', $stripped, $matches);
        for ($i = 0; $i < $num_matches; $i++) {
            $stripped = str_replace($matches[0][$i], str_repeat(' ', strlen($matches[0][$i])), $stripped);
        }
        // Reduce to plain text: remove the tags, then decode the entities
        $stripped = @html_entity_decode(strip_tags($stripped), ENT_QUOTES, get_charset());
        $new_errors = validate_spelling($stripped);
        // Words already reported, so that each one is only reported once
        $misspellings = array();
        global $POS, $LINENO, $LINESTART;
        // Each entry is used as: [0] error code, [1] the offending word
        foreach ($new_errors as $error) {
            if (array_key_exists($error[1], $misspellings)) {
                continue;
            }
            $misspellings[$error[1]] = 1;
            // Point the position globals at the word's first occurrence in the original markup before
            // building the error (presumably _xhtml_error() reads them -- confirm).
            // NOTE(review): strpos()/strrpos() return false when nothing is found; that case is not handled here
            $POS = strpos($OUT, $error[1]);
            $LINESTART = strrpos(substr($OUT, 0, $POS), chr(10));
            $LINENO = substr_count(substr($OUT, 0, $LINESTART), chr(10)) + 1;
            $errors[] = _xhtml_error($error[0], $error[1]);
        }
    }
    // NOTE(review): per PHP's unset() semantics, unsetting a variable imported with `global` only
    // removes this function's alias; $GLOBALS['OUT'] itself is left intact
    unset($OUT);
    return array('level_ranges' => $level_ranges, 'tag_ranges' => $TAG_RANGES, 'value_ranges' => $VALUE_RANGES, 'errors' => $errors);
}
Beispiel #2
0
/**
 * Function that 'fixes' HTML (or bad XHTML) enough for it to pass most basic structural validation.
 *
 * HTML comments are stripped, then the markup is re-tokenised with _get_next_tag() and written back
 * out tag by tag: attributes are re-quoted, text and attribute values go through fix_entities(),
 * missing container tags (e.g. a <ul> around a stray <li>) are inserted, unclosed tags are closed,
 * closers for tags that were never opened are dropped, and finally certain empty elements are removed.
 *
 * Unless $definitely_want is true, the input is returned untouched when we are not serving true XHTML
 * (i.e. unless $GLOBALS['SEMI_DEBUG_MODE'] is set and browser_matches('true_xhtml') holds).
 *
 * @param  string			The HTML string to convert to XHTML
 * @param  boolean		Whether to force a repair even if we aren't in XHTML mode
 * @return string			The converted string
 */
function xhtmlise_html($html, $definitely_want = false)
{
    // Tests...
    // echo xhtmlise_html('test<a></a><br /><po></p><p></po>'); // expect: test<a></a><br /><po><p></p></po>
    if (!$definitely_want) {
        if (!($GLOBALS['SEMI_DEBUG_MODE'] && browser_matches('true_xhtml'))) {
            return $html;
        }
        // One day, this will get removed and we'll ensure all our output is always XHTML. But so far there's no point as IE doesn't support true XHTML
    }

    // Remember whether the input was marked as escaped, so the result can be marked the same way
    $is_escaped = $GLOBALS['XSS_DETECT'] && ocp_is_escaped($html);

    // Strip comments (an unterminated comment is stripped through to the end of the string)
    $html = preg_replace('#<\\!--.*($|-->)#Us', '', $html);

    require_code('obfuscate');
    require_code('validation');

    // Prime the tokeniser's global state for a well-formedness-only pass over $html
    global $XML_CONSTRAIN, $LAST_TAG_ATTRIBUTES, $POS, $OUT, $TAG_STACK, $INBETWEEN_TEXT, $LEN, $WELL_FORMED_ONLY, $MUST_SELFCLOSE_TAGS, $LINENO, $LINESTART;
    $POS = 0;
    $OUT = $html;
    $LEN = strlen($html);
    $TAG_STACK = array();
    $WELL_FORMED_ONLY = true;
    $LINENO = 0;
    $LINESTART = 0;
    $XML_CONSTRAIN = true;

    // The repaired markup is accumulated here
    $new = '';

    // After each _get_next_tag() call, $INBETWEEN_TEXT holds the text that preceded the returned token
    $token = _get_next_tag();

    // If we actually have a partial tag right at the start (ie. we're breaking into some HTML at a bad point)
    $ang_pos = strpos($INBETWEEN_TEXT, '>');
    if ($ang_pos !== false) {
        $INBETWEEN_TEXT = substr($INBETWEEN_TEXT, $ang_pos + 1);
    }
    $new .= fix_entities($INBETWEEN_TEXT);

    while (!is_null($token)) {
        // An array token is a (tag, errors) pair. We only want the tag; when it is NULL nothing could be
        // discerned, so the token is skipped. Skipping falls through to the single fetch at the bottom of
        // the loop, so the text following a skipped token is still emitted (previously it was fetched in
        // an inner loop and thrown away, and a NULL result there was processed as if it were a tag).
        while (is_array($token)) {
            $token = isset($token[0]) ? $token[0] : NULL;
        }

        if (!is_null($token)) {
            $basis_token = _get_tag_basis($token);
            if ($basis_token != '') {
                // Open, close, or monitonic?
                // Offset of the first '/': false => opener, 1 => closer, anything else => self-closing
                $term = strpos($token, '/');
                if ($term !== 1) {
                    // Only a plain opener of a tag that is allowed content goes on the stack;
                    // everything else is written out self-closed
                    $is_container = ($term === false) && !isset($MUST_SELFCLOSE_TAGS[$basis_token]);

                    if ($is_container) {
                        // Fix nesting: open any container this tag needs but which is not currently open
                        if ($basis_token == 'li' && !in_array('ul', $TAG_STACK) && !in_array('ol', $TAG_STACK) && !in_array('dl', $TAG_STACK) && !in_array('dd', $TAG_STACK) && !in_array('dt', $TAG_STACK) && !in_array('dir', $TAG_STACK) && !in_array('menu', $TAG_STACK)) {
                            array_push($TAG_STACK, 'ul');
                            $new .= '<ul>';
                        }
                        if (($basis_token == 'tr' || $basis_token == 'colgroup' || $basis_token == 'col' || $basis_token == 'tbody' || $basis_token == 'tfoot' || $basis_token == 'thead' || $basis_token == 'caption') && !in_array('table', $TAG_STACK)) {
                            array_push($TAG_STACK, 'table');
                            $new .= '<table>';
                        }
                        if (($basis_token == 'td' || $basis_token == 'th') && !in_array('table', $TAG_STACK)) {
                            array_push($TAG_STACK, 'table');
                            $new .= '<table>';
                            array_push($TAG_STACK, 'tr');
                            $new .= '<tr>';
                        }
                        if ($basis_token == 'param' && !in_array('object', $TAG_STACK)) {
                            array_push($TAG_STACK, 'object');
                            $new .= '<object>';
                        }
                        if ($basis_token == 'option' && !in_array('select', $TAG_STACK)) {
                            array_push($TAG_STACK, 'select');
                            $new .= '<select>';
                        }
                        if ($basis_token == 'noembed' && !in_array('map', $TAG_STACK)) {
                            array_push($TAG_STACK, 'map');
                            $new .= '<map>';
                        }
                        array_push($TAG_STACK, $basis_token);
                    }

                    // Re-emit the tag with normalised, double-quoted attributes
                    $new .= '<' . $basis_token;
                    foreach ($LAST_TAG_ATTRIBUTES as $key => $val) {
                        $new .= ' ' . $key . '="' . fix_entities($val) . '"';
                    }
                    $new .= $is_container ? '>' : ' />';
                } elseif (in_array($basis_token, $TAG_STACK)) {
                    // Closer for something that is open: implicitly close everything opened inside it first
                    $previous = '';
                    do {
                        $previous = array_pop($TAG_STACK);
                        if ($basis_token != $previous) {
                            $new .= '</' . $previous . '>';
                        }
                        // We'll have to assume it should be implicitly closed
                    } while ($basis_token != $previous);
                    $new .= '</' . $basis_token . '>';
                    // Ok so we finally got an opener match and managed to put out our closer
                }
                // else: For case 3 - do nothing, we can't handle it because we're closing something that was never opened
            }
        }

        // The one and only fetch point of the loop, always followed by emitting the text before the token
        $token = _get_next_tag();
        if (is_null($token)) {
            // If we actually have a partial tag right at the end (ie. we're breaking out of some HTML at a bad point)
            $ang_pos = strpos($INBETWEEN_TEXT, '<');
            if ($ang_pos !== false) {
                $INBETWEEN_TEXT = substr($INBETWEEN_TEXT, 0, $ang_pos);
            }
        }
        $new .= fix_entities($INBETWEEN_TEXT);
    }

    // Check we have everything closed
    while (count($TAG_STACK) != 0) {
        $previous = array_pop($TAG_STACK);
        $new .= '</' . $previous . '>';
    }

    // Remove some empty tags that shouldn't be empty (e.g. table).
    // One pass per tag name, in list order; only the '<x ...></x>' form (optionally with whitespace
    // inside) is matched, so self-closed output such as '<br />' is not affected
    $may_not_be_empty = array('br', 'hr', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'dd', 'dt', 'dl', 'li', 'ol', 'ul', 'rbc', 'rtc', 'rb', 'rt', 'rp', 'abbr', 'acronym', 'cite', 'dfn', 'ruby', 'bdo', 'img', 'param', 'input', 'select', 'object', 'caption', 'label', 'base', 'body', 'col', 'colgroup', 'map', 'optgroup', 'option', 'legend', 'area', 'form');
    foreach ($may_not_be_empty as $t) {
        $new = preg_replace('#<' . $t . '(\\s[^>]*)?' . '>\\s*</' . $t . '>#', '', $new);
    }

    // Drop this function's aliases of the tokeniser globals
    unset($OUT);
    unset($TAG_STACK);

    if ($is_escaped) {
        ocp_mark_as_escaped($new);
    }
    return $new;
}