/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given whitelist.
 *
 * Processing applied to each attribute, in order:
 * - XML namespace declarations (matching self::XMLNS_ATTRIBUTE_PATTERN) are
 *   kept unless their value matches self::EVIL_URI_PATTERN; no further
 *   checks are applied to them.
 * - Attributes that are neither on the whitelist nor an allowed "data-"
 *   attribute are dropped.
 * - "style" values are passed through Sanitizer::checkCss().
 * - "id" values are passed through Sanitizer::escapeId().
 * - ARIA id-reference lists are passed through
 *   Sanitizer::escapeIdReferenceList().
 * - RDFa / microdata attributes whose value matches self::EVIL_URI_PATTERN
 *   are dropped.
 * - "href" / "src" values that do not start with one of the protocols
 *   returned by wfUrlProtocols() are dropped.
 *
 * After the loop, itemtype/itemid/itemref are removed when no itemscope
 * attribute survived.
 *
 * @param array $attribs Map of attribute name => attribute value
 * @param array $whitelist List of allowed attribute names
 * @return array Map of attribute name => sanitized value; when a name is
 *   seen more than once the last accepted value wins
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateAttributes($attribs, $whitelist) {
    // Flip so that membership tests are a single isset() on the name.
    $whitelist = array_flip($whitelist);
    $hrefExp = '/^(' . wfUrlProtocols() . ')[^\\s]+$/';

    // Attributes whose value is a space-separated list of element ids.
    $idReferenceAttributes = [
        'aria-describedby',
        'aria-flowto',
        'aria-labelledby',
        'aria-owns',
    ];

    // RDFa and microdata properties allow URLs, URIs and/or CURIs.
    $uriLikeAttributes = [
        'rel',
        'rev',
        'about',
        'property',
        'resource',
        'datatype',
        'typeof',
        'itemid',
        'itemprop',
        'itemref',
        'itemscope',
        'itemtype',
    ];

    $out = [];
    foreach ($attribs as $attribute => $value) {
        # Allow XML namespace declaration to allow RDFa
        if (preg_match(self::XMLNS_ATTRIBUTE_PATTERN, $attribute)) {
            if (!preg_match(self::EVIL_URI_PATTERN, $value)) {
                $out[$attribute] = $value;
            }
            continue;
        }

        # Allow any attribute beginning with "data-"
        # However:
        # * data-ooui is reserved for ooui
        # * data-mw and data-parsoid are reserved for parsoid
        # * data-mw-<name here> is reserved for extensions (or core) if
        #   they need to communicate some data to the client and want to be
        #   sure that it isn't coming from an untrusted user.
        # * Ensure that the attribute is not namespaced by banning
        #   colons.
        if (!preg_match('/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute)
            && !isset($whitelist[$attribute])
        ) {
            continue;
        }

        # Strip javascript "expression" from stylesheets.
        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        // Strict comparison, consistent with the checks below: array keys
        // may be integers, and on PHP < 8 the loose form 0 == 'style' is true.
        if ($attribute === 'style') {
            $value = Sanitizer::checkCss($value);
        }

        # Escape HTML id attributes
        if ($attribute === 'id') {
            $value = Sanitizer::escapeId($value, 'noninitial');
        }

        # Escape HTML id reference lists
        if (in_array($attribute, $idReferenceAttributes, true)) {
            $value = Sanitizer::escapeIdReferenceList($value, 'noninitial');
        }

        // Check RDFa / microdata values for sanity.
        // Paranoia. Allow "simple" values but suppress javascript
        if (in_array($attribute, $uriLikeAttributes, true)
            && preg_match(self::EVIL_URI_PATTERN, $value)
        ) {
            continue;
        }

        # NOTE: even though elements using href/src are not allowed directly, supply
        #       validation code that can be used by tag hook handlers, etc
        if (($attribute === 'href' || $attribute === 'src')
            && !preg_match($hrefExp, $value)
        ) {
            // Drop any href or src attributes not using an allowed protocol.
            // NOTE: this also drops all relative URLs
            continue;
        }

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $out[$attribute] = $value;
    }

    # itemtype, itemid, itemref don't make sense without itemscope
    if (!array_key_exists('itemscope', $out)) {
        unset($out['itemtype'], $out['itemid'], $out['itemref']);
    }
    # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

    return $out;
}
/**
 * Test escapeIdReferenceList for consistency with escapeId.
 *
 * The reference list is expected to escape to the same string as escaping
 * the two ids individually and joining them with a single space.
 *
 * @dataProvider provideEscapeIdReferenceList
 * @covers Sanitizer::escapeIdReferenceList
 *
 * @param mixed $referenceList Value handed to Sanitizer::escapeIdReferenceList()
 * @param mixed $id1 First id, escaped on its own with Sanitizer::escapeId()
 * @param mixed $id2 Second id, escaped on its own with Sanitizer::escapeId()
 */
public function testEscapeIdReferenceList($referenceList, $id1, $id2) {
    // Reference value built from escapeId(); this is what the list escaper
    // must agree with.
    $expected = Sanitizer::escapeId($id1, 'noninitial')
        . ' '
        . Sanitizer::escapeId($id2, 'noninitial');

    // PHPUnit's signature is assertEquals($expected, $actual); passing them
    // in that order keeps failure messages pointing at the right side.
    $this->assertEquals(
        $expected,
        Sanitizer::escapeIdReferenceList($referenceList, 'noninitial')
    );
}