/**
  * Constructor
  *
  * {@internal This gets tested in _libs.misc.simpletest.php}}
  *
  * @param string Context
  * @param boolean Allow CSS tweaks?
  * @param boolean Allow IFrames?
  * @param boolean Allow Javascript?
  * @param boolean Allow Objects?
  * @param string Input encoding to use ('ISO-8859-1', 'UTF-8', 'US-ASCII' or '' for auto-detect)
  * @param string Message type for errors
  */
 function XHTML_Validator($context = 'posting', $allow_css_tweaks = false, $allow_iframes = false, $allow_javascript = false, $allow_objects = false, $encoding = NULL, $msg_type = 'error')
 {
     global $inc_path;

     // Load the DTD definitions into this method's local scope.
     // NOTE(review): the $allow_* arguments are not referenced below; presumably
     // the included file reads them and defines $allowed_tags,
     // $allowed_attributes, $comments_allowed_tags and
     // $comments_allowed_attributes -- confirm against _xhtml_dtd.inc.php.
     require $inc_path . 'xhtml_validator/_xhtml_dtd.inc.php';

     $this->context = $context;

     // Select the tag / attribute whitelists that apply to the context:
     if ($context == 'posting' || $context == 'xmlrpc_posting') {
         $this->tags =& $allowed_tags;
         $this->tagattrs =& $allowed_attributes;
     } elseif ($context == 'commenting') {
         $this->tags =& $comments_allowed_tags;
         $this->tagattrs =& $comments_allowed_attributes;
     } elseif ($context == 'head_extension') {
         // Fixed whitelist, independent of the DTD include:
         $this->tags = array(
             'body' => 'meta link style script',
             'meta' => '',
             'link' => '',
             'style' => '#PCDATA',
             'script' => '#PCDATA',
         );
         $this->tagattrs = array(
             'meta' => 'name content charset http-equiv',
             'link' => 'charset href hreflang media rel sizes type',
             'style' => 'media scoped type',
             'script' => 'async charset defer src type',
         );
     } else {
         debug_die('unknown context: ' . $context);
     }

     // Attributes whose values must be checked for a valid URI:
     $this->uri_attrs = array(
         'xmlns', 'profile', 'href', 'src', 'cite', 'classid',
         'codebase', 'data', 'archive', 'usemap', 'longdesc', 'action',
     );
     $this->allowed_uri_scheme = get_allowed_uri_schemes($context);
     $this->msg_type = $msg_type;

     if (empty($encoding)) {
         // No encoding requested: fall back to the global I/O charset.
         global $io_charset;
         $encoding = $io_charset;
     }

     // Normalize the case (we might get 'iso-8859-1' for example):
     $normalized_encoding = strtoupper($encoding);
     $this->encoding = $normalized_encoding;

     // Only these encodings are passed on to xml_parser_create(); for anything
     // else an empty string is passed, which requests auto-detection.
     $parser_encodings = array('ISO-8859-1', 'UTF-8', 'US-ASCII');
     $this->xml_parser_encoding = in_array($normalized_encoding, $parser_encodings) ? $normalized_encoding : '';

     // Reset the validation state:
     $this->last_checked_pos = 0;
     $this->error = false;

     // Create the parser and bind its callbacks to methods of this object:
     $this->parser = xml_parser_create($this->xml_parser_encoding);
     xml_set_object($this->parser, $this);
     // Methods to call when a start or end tag is encountered:
     xml_set_element_handler($this->parser, 'tag_open', 'tag_close');
     // Method to call for the actual character data:
     xml_set_character_data_handler($this->parser, 'cdata');
     // Keep element names exactly as written instead of upper-casing them:
     xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, false);
 }
Esempio n. 2
0
/**
 * Check the validity of a given URL
 *
 * Checks the URL structure, the URI schemes allowed for the context and
 * (optionally) the URL ban list. URL can be empty.
 *
 * Accepted forms:
 *  - '$code$' special replace codes (used in footers),
 *  - mailto:, clsid: and javascript: links, if the scheme is allowed for the context,
 *  - absolute scheme://host URLs with an allowed scheme,
 *  - relative URLs starting with "/" or "#" (refused for the "commenting",
 *    "download_src" and "http-https" contexts).
 *
 * The end-anchored patterns use the D (PCRE_DOLLAR_ENDONLY) modifier so that a
 * URL ending in a newline is refused instead of silently accepted.
 *
 * Note: We have a problem when trying to "antispam" a keyword which is already blacklisted
 * If that keyword appears in the URL... then the next page has a bad referer! :/
 *
 * {@internal This function gets tested in misc.funcs.simpletest.php.}}
 *
 * @param string Url to validate
 * @param string Context ("posting", "commenting", "download_src", "http-https")
 * @param boolean also do an antispam check on the url
 * @return mixed false (which means OK) or error message
 */
function validate_url($url, $context = 'posting', $antispam_check = true)
{
    global $Debuglog, $debug;
    if (empty($url)) {
        // Empty URL, no problem
        return false;
    }
    // Do not give verbose info for comments, unless debug is enabled.
    $verbose = $debug || $context != 'commenting';
    $allowed_uri_schemes = get_allowed_uri_schemes($context);
    // Validate URL structure
    if ($url[0] == '$') {
        // This is a 'special replace code' URL (used in footers)
        // NOTE(review): this pattern is not anchored, so any URL starting with '$'
        // that contains a $code$ token somewhere passes -- confirm this is intended.
        if (!preg_match('~\\$([a-z_]+)\\$~', $url)) {
            return T_('Invalid URL $code$ format');
        }
    } elseif (preg_match('~^\\w+:~', $url)) {
        // there's a scheme and therefore an absolute URL:
        if (substr($url, 0, 7) == 'mailto:') {
            // mailto:link
            if (!in_array('mailto', $allowed_uri_schemes)) {
                // Scheme not allowed
                $scheme = 'mailto:';
                $Debuglog->add('URI scheme «' . $scheme . '» not allowed!', 'error');
                return $verbose ? sprintf(T_('URI scheme "%s" not allowed.'), htmlspecialchars($scheme)) : T_('URI scheme not allowed.');
            }
            // D modifier: "$" must match at the very end, not before a trailing newline.
            if (!preg_match('~^(mailto):(.*?)(\\?.*)?$~D', $url, $match)) {
                return $verbose ? sprintf(T_('Invalid email link: %s.'), htmlspecialchars($url)) : T_('Invalid email link.');
            }
            if (!is_email($match[2])) {
                return $verbose ? sprintf(T_('Supplied email address (%s) is invalid.'), htmlspecialchars($match[2])) : T_('Invalid email address.');
            }
        } elseif (substr($url, 0, 6) == 'clsid:') {
            // clsid:link
            if (!in_array('clsid', $allowed_uri_schemes)) {
                // Scheme not allowed
                $scheme = 'clsid:';
                $Debuglog->add('URI scheme «' . $scheme . '» not allowed!', 'error');
                return $verbose ? sprintf(T_('URI scheme "%s" not allowed.'), htmlspecialchars($scheme)) : T_('URI scheme not allowed.');
            }
            // D modifier: refuse a class ID followed by a trailing newline.
            if (!preg_match('~^(clsid):([a-fA-F0-9\\-]+)$~D', $url)) {
                return T_('Invalid class ID format');
            }
        } elseif (substr($url, 0, 11) == 'javascript:') {
            // javascript:
            // Basically there could be anything here, so only the scheme is checked.
            if (!in_array('javascript', $allowed_uri_schemes)) {
                // Scheme not allowed
                $scheme = 'javascript:';
                $Debuglog->add('URI scheme «' . $scheme . '» not allowed!', 'error');
                return $verbose ? sprintf(T_('URI scheme "%s" not allowed.'), htmlspecialchars($scheme)) : T_('URI scheme not allowed.');
            }
        } else {
            // convert URL to IDN:
            $url = idna_encode($url);
            // D modifier: without it "$" would also match before a trailing newline,
            // which would let "http://host/\n" through despite the "no newlines" rule.
            if (!preg_match('~^           # start
                ([a-z][a-z0-9+.\\-]*)             # scheme
                ://                              # authorize absolute URLs only ( // not present in clsid: -- problem? ; mailto: handled above)
                (\\w+(:\\w+)?@)?                   # username or username and password (optional)
                ( localhost |
                        [a-z0-9]([a-z0-9\\-])*            # Don t allow anything too funky like entities
                        \\.                               # require at least 1 dot
                        [a-z0-9]([a-z0-9.\\-])+           # Don t allow anything too funky like entities
                )
                (:[0-9]+)?                       # optional port specification
                .*                               # allow anything in the path (including spaces - used in FileManager - but no newlines).
                $~ixD', $url, $match)) {
                // Cannot validate URL structure
                $Debuglog->add('URL «' . $url . '» does not match url pattern!', 'error');
                return $verbose ? sprintf(T_('Invalid URL format (%s).'), htmlspecialchars($url)) : T_('Invalid URL format.');
            }
            $scheme = strtolower($match[1]);
            if (!in_array($scheme, $allowed_uri_schemes)) {
                // Scheme not allowed
                $Debuglog->add('URI scheme «' . $scheme . '» not allowed!', 'error');
                return $verbose ? sprintf(T_('URI scheme "%s" not allowed.'), htmlspecialchars($scheme)) : T_('URI scheme not allowed.');
            }
        }
    } else {
        // URL is relative..
        if ($context == 'commenting' || $context == 'download_src' || $context == 'http-https') {
            // We do not allow relative URLs in comments and download urls
            return $verbose ? sprintf(T_('URL "%s" must be absolute.'), htmlspecialchars($url)) : T_('URL must be absolute.');
        }
        $char = substr($url, 0, 1);
        if ($char != '/' && $char != '#') {
            // must start with a slash or hash (for HTML anchors to the same page)
            return $verbose ? sprintf(T_('URL "%s" must be a full path starting with "/" or an anchor starting with "#".'), htmlspecialchars($url)) : T_('URL must be a full path starting with "/" or an anchor starting with "#".');
        }
    }
    if ($antispam_check) {
        // Search for blocked keywords:
        if ($block = antispam_check($url)) {
            // The keyword is escaped like every other value interpolated into
            // the messages built by this function.
            return $verbose ? sprintf(T_('URL "%s" not allowed: blacklisted word "%s".'), htmlspecialchars($url), htmlspecialchars($block)) : T_('URL not allowed');
        }
    }
    // OK
    return false;
}