/**
 * Check one image row and repair its name when it is not in canonical form.
 *
 * The stored name is percent-decoded, run through the content language's
 * title-encoding check and UTF-8 cleaned; the result is turned into a Title
 * and its DB key is compared with the stored name.
 *
 * @param $row Object: result row; only img_name is read
 * @return mixed Whatever progress(1) returns when the row was handled;
 *               nothing when the row is already consistent
 */
function processPage($row) {
    global $wgContLang;

    $original = $row->img_name;
    if ($original == '') {
        // Legacy rows with an empty name: just delete them.
        $this->killRow($original);
        return $this->progress(1);
    }

    // Percent-codes first, then legacy (latin-1) encodings, then
    // non-normalized Unicode.
    $candidate = UtfNormal::cleanUp(
        $wgContLang->checkTitleEncoding(rawurldecode($original))
    );

    $title = Title::makeTitleSafe(NS_IMAGE, $candidate);
    if (is_null($title)) {
        $this->log("page {$original} ({$candidate}) is illegal.");
        $this->pokeFile($original, $this->buildSafeTitle($candidate));
        return $this->progress(1);
    }

    $dbKey = $title->getDbKey();
    if ($dbKey !== $original) {
        $this->log("page {$original} ({$dbKey}) doesn't match self.");
        $this->pokeFile($original, $dbKey);
        return $this->progress(1);
    }

    $this->progress(0);
}
/**
 * Build an XML element exactly like self::element(), after passing the
 * attribute values and the contents through UtfNormal::cleanUp() so that
 * no invalid UTF-8 reaches the output.
 *
 * @param $element String:
 * @param $attribs Array: Name=>value pairs. Values will be escaped.
 * @param $contents String: NULL to make an open tag only; '' for a contentless closed tag (default)
 * @return string
 */
public static function elementClean($element, $attribs = array(), $contents = '') {
    // Falsy inputs (empty array, '', null) are handed on untouched.
    $cleanAttribs = $attribs
        ? array_map(array('UtfNormal', 'cleanUp'), $attribs)
        : $attribs;
    $cleanContents = $contents
        ? UtfNormal::cleanUp($contents)
        : $contents;

    return self::element($element, $cleanAttribs, $cleanContents);
}
/**
 * Build an XML element exactly like self::element(), after passing the
 * attribute values and the contents through UtfNormal::cleanUp() so that
 * no invalid UTF-8 reaches the output. Only the normalization of the
 * contents is wrapped in a profiling section.
 *
 * @param $element String:
 * @param $attribs Array: Name=>value pairs. Values will be escaped.
 * @param $contents String: NULL to make an open tag only; '' for a contentless closed tag (default)
 * @return string
 */
public static function elementClean($element, $attribs = array(), $contents = '') {
    if ($attribs) {
        $attribs = array_map(array('UtfNormal', 'cleanUp'), $attribs);
    }

    if (!$contents) {
        // Nothing to normalize (null, '' or another falsy value).
        return self::element($element, $attribs, $contents);
    }

    $profileSection = __METHOD__ . '-norm';
    wfProfileIn($profileSection);
    $normalized = UtfNormal::cleanUp($contents);
    wfProfileOut($profileSection);

    return self::element($element, $attribs, $normalized);
}
/**
 * Reduce a string to printable ASCII.
 *
 * The string is first translated with the map from _gs_utf8_get_map()
 * (loaded once per request), then decomposed to NFD, and finally every
 * byte outside 0x20-0x7E is removed.
 *
 * @param string $str
 * @return string
 */
function gs_utf8_decompose_to_ascii($str) {
    static $map = null;
    if (!is_array($map)) {
        $map = _gs_utf8_get_map();
    }

    $decomposed = UtfNormal::toNFD(strtr($str, $map));

    # keep only "safe" ASCII: no control chars, newlines etc.
    return preg_replace('/[^\\x20-\\x7E]/', '', $decomposed);
}
/**
 * Check one watchlist row and remove it when it is invalid.
 *
 * A row is kept only if it belongs to a non-zero user and its title survives
 * a round trip through UtfNormal::cleanUp() and Title::newFromText().
 *
 * @param $row Object: result row; wl_user, wl_namespace and wl_title are read
 * @return mixed Whatever progress(1) returns when the row was removed;
 *               nothing when the row is fine
 */
function processPage($row) {
    $current = Title::makeTitle($row->wl_namespace, $row->wl_title);
    $roundTripped = Title::newFromText(
        UtfNormal::cleanUp($current->getPrefixedText())
    );

    $isValid = $row->wl_user != 0
        && !is_null($roundTripped)
        && $roundTripped->equals($current);

    if ($isValid) {
        $this->progress(0);
        return;
    }

    $this->log("invalid watch by {$row->wl_user} for ({$row->wl_namespace}, \"{$row->wl_title}\")");
    $this->removeWatch($row);
    return $this->progress(1);
}
/**
 * Build the HTML body of a feed item describing one change to a page.
 *
 * The result always starts with a paragraph holding the action text and the
 * formatted comment. If an anonymous user may read the page and it is not in
 * a negative namespace, a diff (or a link to it, or the text of a new page)
 * is appended.
 *
 * @param $title Title: page that was changed
 * @param $oldid Integer: old revision id; a falsy value means "new page"
 * @param $newid Integer: new revision id
 * @param $timestamp String: timestamp shown in the diff header
 * @param $comment String: edit summary
 * @param $actiontext String: optional text describing the action
 * @return string HTML
 */
public static function formatDiffRow($title, $oldid, $newid, $timestamp, $comment, $actiontext = '') {
    global $wgFeedDiffCutoff, $wgContLang, $wgUser;
    wfProfileIn(__FUNCTION__);

    $skin = $wgUser->getSkin();
    # log entries
    $completeText = '<p>' . implode(' ', array_filter(array($actiontext, $skin->formatComment($comment)))) . "</p>\n";

    // NOTE: Check permissions for anonymous users, not current user.
    //       No "privileged" version should end up in the cache.
    //       Most feed readers will not log in anyway.
    $anon = new User();
    $accErrors = $title->getUserPermissionsErrors('read', $anon, true);

    if ($title->getNamespace() >= 0 && !$accErrors) {
        if ($oldid) {
            wfProfileIn(__FUNCTION__ . "-dodiff");

            // Don't bother generating the diff if we won't be able to show it
            if ($wgFeedDiffCutoff > 0) {
                $de = new DifferenceEngine($title, $oldid, $newid);
                $diffText = $de->getDiff(wfMsg('previousrevision'), wfMsg('revisionasof', $wgContLang->timeanddate($timestamp)));
            }

            // The cutoff test must come first: when it is <= 0 no diff was
            // generated above and $diffText does not exist yet.
            if ($wgFeedDiffCutoff <= 0 || strlen($diffText) > $wgFeedDiffCutoff) {
                // Omit large diffs
                $diffLink = $title->escapeFullUrl('diff=' . $newid . '&oldid=' . $oldid);
                $diffText = '<a href="' . $diffLink . '">' . htmlspecialchars(wfMsgForContent('showdiff')) . '</a>';
            } elseif ($diffText === false) {
                // Error in diff engine, probably a missing revision
                $diffText = "<p>Can't load revision {$newid}</p>";
            } else {
                // Diff output fine, clean up any illegal UTF-8
                $diffText = UtfNormal::cleanUp($diffText);
                $diffText = self::applyDiffStyle($diffText);
            }

            wfProfileOut(__FUNCTION__ . "-dodiff");
        } else {
            // No old revision: show the full text of the new page instead.
            $rev = Revision::newFromId($newid);
            if (is_null($rev)) {
                $newtext = '';
            } else {
                $newtext = $rev->getText();
            }
            $diffText = '<p><b>' . wfMsg('newpage') . '</b></p>' . '<div>' . nl2br(htmlspecialchars($newtext)) . '</div>';
        }
        $completeText .= $diffText;
    }

    wfProfileOut(__FUNCTION__);
    return $completeText;
}
/**
 * Returns the normalized form of the given page title, using the normalization rules of the given site.
 * If the given title is a redirect, the redirect will be resolved and the redirect target is returned.
 *
 * @note : This actually makes an API request to the remote site, so beware that this function is slow and depends
 *         on an external service.
 *
 * @note : If MW_PHPUNIT_TEST is defined, the call to the external site is skipped, and the title
 *         is normalized using the local normalization rules as implemented by the Title class.
 *
 * @see Site::normalizePageName
 *
 * @since 1.21
 *
 * @param string $pageName
 *
 * @return string|bool The normalized title, or false if the title could not be
 *         normalized (invalid or missing title, failed request, bad response)
 * @throws MWException if $pageName is not a string
 */
public function normalizePageName($pageName) {
    // Check if we have strings as arguments.
    if (!is_string($pageName)) {
        throw new MWException('$pageName must be a string');
    }

    if (defined('MW_PHPUNIT_TEST')) {
        // If the code is under test, don't call out to other sites, just normalize locally.
        // Note: this may cause results to be inconsistent with the actual normalization used by the respective remote site!
        $t = Title::newFromText($pageName);
        if (!$t) {
            // Title::newFromText() yields no object for an invalid title;
            // report that the same way the remote path reports failure.
            wfDebugLog("MediaWikiSite", "local normalization failed, invalid page title: {$pageName}");
            return false;
        }
        return $t->getPrefixedText();
    }

    // Make sure the string is normalized into NFC (due to the bug 40017)
    // but do nothing to the whitespaces, that should work appropriately.
    // @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
    $pageName = UtfNormal::cleanUp($pageName);

    // Build the args for the specific call
    $args = array('action' => 'query', 'prop' => 'info', 'redirects' => true, 'converttitles' => true, 'format' => 'json', 'titles' => $pageName);
    $url = $this->getFileUrl('api.php') . '?' . wfArrayToCgi($args);

    // Go on call the external site
    //@todo: we need a good way to specify a timeout here.
    $ret = Http::get($url);

    if ($ret === false) {
        wfDebugLog("MediaWikiSite", "call to external site failed: {$url}");
        return false;
    }

    $data = FormatJson::decode($ret, true);
    if (!is_array($data)) {
        wfDebugLog("MediaWikiSite", "call to <{$url}> returned bad json: " . $ret);
        return false;
    }

    $page = static::extractPageRecord($data, $pageName);

    if (isset($page['missing'])) {
        wfDebugLog("MediaWikiSite", "call to <{$url}> returned a marker for a missing page title! " . $ret);
        return false;
    }
    if (isset($page['invalid'])) {
        wfDebugLog("MediaWikiSite", "call to <{$url}> returned a marker for an invalid page title! " . $ret);
        return false;
    }
    if (!isset($page['title'])) {
        wfDebugLog("MediaWikiSite", "call to <{$url}> did not return a page title! " . $ret);
        return false;
    }

    return $page['title'];
}
/**
 * Check one page row and move the page when its title is broken.
 *
 * The title is rebuilt from its prefixed text after UtfNormal::cleanUp().
 * A title that cannot be rebuilt is "illegal"; one that rebuilds to a
 * different title is "inconsistent".
 *
 * @param $row Object: result row; page_id, page_namespace and page_title are read
 * @return mixed Whatever progress(1) returns when the page was moved;
 *               nothing when the page is fine
 */
function processPage($row) {
    $current = Title::makeTitle($row->page_namespace, $row->page_title);
    $display = $current->getPrefixedText();
    $rebuilt = Title::newFromText(UtfNormal::cleanUp($display));

    if (is_null($rebuilt)) {
        $this->log("page {$row->page_id} ({$display}) is illegal.");
        $this->moveIllegalPage($row);
        return $this->progress(1);
    }

    if ($rebuilt->equals($current)) {
        $this->progress(0);
        return;
    }

    $this->log("page {$row->page_id} ({$display}) doesn't match self.");
    $this->moveInconsistentPage($row, $rebuilt);
    return $this->progress(1);
}
/**
 * Generate a random input text by concatenating randomly chosen entries of
 * $this->hairs.
 *
 * @param $max Integer|bool: upper bound for the number of entries; false
 *             (default) means $this->maxLength
 * @return string The text after UtfNormal::cleanUp()
 */
function makeInputText($max = false) {
    $upperBound = ($max === false) ? $this->maxLength : $max;
    $remaining = mt_rand($this->minLength, $upperBound);

    $text = '';
    while ($remaining-- > 0) {
        $text .= $this->hairs[mt_rand(0, count($this->hairs) - 1)];
    }

    // Send through the UTF-8 normaliser.
    // This resolves a few differences between the old preprocessor and the
    // XML-based one, which doesn't like illegals and converts line endings.
    // It's done by the MW UI, so it's a reasonably legitimate thing to do.
    return UtfNormal::cleanUp($text);
}
/**
 * Check one page row and move the page when its title is broken.
 *
 * A page is fine when its title can be rebuilt from the cleaned-up prefixed
 * text, the rebuilt title equals the stored one and canExist() is true.
 * Broken titles in NS_FILE whose file exists are only reported; everything
 * else is moved as either "illegal" or "inconsistent".
 *
 * @param $row Object: result row; page_id, page_namespace and page_title are read
 * @return mixed Whatever progress() returns
 */
function processPage($row) {
    $current = Title::makeTitle($row->page_namespace, $row->page_title);
    $display = $current->getPrefixedText();
    $title = Title::newFromText(UtfNormal::cleanUp($display));

    $isFine = !is_null($title) && $title->equals($current) && $title->canExist();
    if ($isFine) {
        return $this->progress(0); // all is fine
    }

    if ($row->page_namespace == NS_FILE && $this->fileExists($row->page_title)) {
        $this->log("file {$row->page_title} needs cleanup, please run cleanupImages.php.");
        return $this->progress(0);
    }

    if (is_null($title)) {
        $this->log("page {$row->page_id} ({$display}) is illegal.");
        $this->moveIllegalPage($row);
        return $this->progress(1);
    }

    $this->log("page {$row->page_id} ({$display}) doesn't match self.");
    $this->moveInconsistentPage($row, $title);
    return $this->progress(1);
}
/**
 * Handler for the {{#infoboxbuilder:}} parser function.
 *
 * Loads includes/lua/InfoboxBuilder.lua, wraps it in a Scribunto Lua module,
 * invokes its 'builder' function with a child frame made from the arguments
 * and returns the cleaned-up result. A ScribuntoException is turned into an
 * #iferror-compatible error element and recorded on the parser output.
 *
 * @param Parser $parser Parser object
 * @param PPFrame $frame PPFrame object
 * @param array $args Array of arguments passed from $frame object; index 0 is dropped
 * @return string A string returned by InfoboxBuilder.lua, or error HTML
 */
public static function parserFunctionHook(\Parser $parser, $frame, $args) {
    wfProfileIn(__METHOD__);

    try {
        // Add the registered SCSS with the default theme
        $parser->getOutput()->addModuleStyles('ext.wikia.InfoboxBuilder');

        $engine = \Scribunto::getParserEngine($parser);

        unset($args[0]);
        $childFrame = $frame->newChild($args, $parser->getTitle(), 1);

        $luaSource = file_get_contents(__DIR__ . '/includes/lua/InfoboxBuilder.lua');
        $module = new \Scribunto_LuaModule($engine, $luaSource, 'InfoboxBuilder');

        $output = \UtfNormal::cleanUp(strval($module->invoke('builder', $childFrame)));

        wfProfileOut(__METHOD__);
        return $output;
    } catch (\ScribuntoException $e) {
        $trace = $e->getScriptTraceHtml(array('msgOptions' => array('content')));

        $html = \Html::element('p', array(), $e->getMessage());
        if ($trace !== false) {
            $backtraceLabel = wfMessage('scribunto-common-backtrace')->inContentLanguage()->text();
            $html .= \Html::element('p', array(), $backtraceLabel) . $trace;
        }

        // First error on this output: register the hook and the tracking category.
        $out = $parser->getOutput();
        if (!isset($out->scribunto_errors)) {
            $out->addOutputHook('ScribuntoError');
            $out->scribunto_errors = array();
            $parser->addTrackingCategory('scribunto-common-error-category');
        }
        $out->scribunto_errors[] = $html;

        $id = 'mw-scribunto-error-' . (count($out->scribunto_errors) - 1);
        $parserError = wfMessage('scribunto-parser-error')->inContentLanguage()->text()
            . $parser->insertStripItem('<!--' . htmlspecialchars($e->getMessage()) . '-->');

        wfProfileOut(__METHOD__);

        // #iferror-compatible error element
        return "<strong class=\"error\"><span class=\"scribunto-error\" id=\"{$id}\">" . $parserError . "</span></strong>";
    }
}
/**
 * Return the video's metadata, PHP-serialized, with every field listed in
 * self::$metadataFieldsContainingName converted to NFC.
 *
 * @param stdClass $video Row object; img_metadata (image.img_metadata) is read
 * @return string
 */
protected static function getNormalizedMetadata($video) {
    $meta = unserialize($video->img_metadata);

    foreach (self::$metadataFieldsContainingName as $fieldName) {
        if (!isset($meta[$fieldName])) {
            continue;
        }
        $meta[$fieldName] = \UtfNormal::toNFC($meta[$fieldName]);
    }

    return serialize($meta);
}
/**
 * Convert a UTF-8 string to normal form C. When $wgAllUnicodeFixes is set,
 * the Arabic and Malayalam pair files are applied as well, which converts
 * certain backwards-compatible sequences to the modern Unicode equivalent.
 *
 * This is language-specific for performance reasons only.
 *
 * @param $s string
 *
 * @return string
 */
function normalize($s) {
    global $wgAllUnicodeFixes;

    $normalized = UtfNormal::cleanUp($s);
    if (!$wgAllUnicodeFixes) {
        return $normalized;
    }

    foreach (array('normalize-ar.ser', 'normalize-ml.ser') as $pairFile) {
        $normalized = $this->transformUsingPairFile($pairFile, $normalized);
    }
    return $normalized;
}
/**
 * Run UtfNormal::quickisNFCVerify() on one test line and record the outcome.
 *
 * The line passes when "verification left it unchanged" differs from
 * "$test >= 3", with that expectation flipped for tests listed in
 * $exceptions, and the text before the first '|' has $columns characters.
 *
 * @param $test Integer: test number
 * @param $line String: input line
 * @param &$total Integer: incremented for every line
 * @param &$success Integer: incremented when the line passes
 * @param &$failed Integer: incremented when the line fails
 * @param $columns Integer: expected length of the first column
 * @param $exceptions Array: test numbers with inverted expectation
 * @param $verbose Boolean: print passing lines too
 */
function testLine($test, $line, &$total, &$success, &$failed, $columns, $exceptions, $verbose) {
    $stripped = $line;
    UtfNormal::quickisNFCVerify($stripped);
    $same = $line == $stripped;

    // Length of the first column, falling back to the byte length when the
    // multibyte length comes out as zero.
    $firstColumn = substr($stripped, 0, strpos($stripped, '|'));
    $len = mb_strlen($firstColumn);
    if ($len == 0) {
        $len = strlen($firstColumn);
    }

    $ok = ($same xor ($test >= 3));
    $ok = ($ok xor in_array($test, $exceptions));
    $ok = $ok && ($columns == $len);

    $total++;
    if ($ok) {
        $success++;
    } else {
        $failed++;
    }

    if ($verbose || !$ok) {
        print str_replace("\n", "{$len}\n", $stripped);
    }
}
/**
 * Callback function for cleanUpUTF8(): cleans up a string value in place
 * and leaves every other type untouched.
 *
 * @param &$s mixed Value to clean up
 */
private static function cleanUp_helper(&$s) {
    if (is_string($s)) {
        $s = UtfNormal::cleanUp($s);
    }
}
/**
 * Converts a path consisting of object titles into a path consisting of tree
 * nodes. Titles are compared case-insensitively, in Unicode Normal Form C.
 *
 * Note: this function returns the same result as getNodePath,
 * but takes a title path as parameter.
 *
 * @access public
 * @param Array Path array with object titles.
 *              e.g. array('ILIAS','English','Course A')
 * @param ref_id Startnode of the relative path.
 *              Specify null, if the title path is an absolute path.
 *              Specify a ref id, if the title path is a relative
 *              path starting at this ref id.
 * @return array ordered path info (depth,parent,child,obj_id,type,title)
 *              or null, if the title path can not be converted into a node path.
 */
function getNodePathForTitlePath($titlePath, $a_startnode_id = null) {
    global $ilDB, $log;

    // An empty title path resolves to the start node itself, if there is one.
    if ($titlePath == null || count($titlePath) == 0) {
        return ($a_startnode_id == 0) ? null : $this->getNodePath($a_startnode_id);
    }

    if ($a_startnode_id != null && $a_startnode_id != 0) {
        // Relative path: begin with the node path leading to the start node.
        $nodePath = $this->getNodePath($a_startnode_id);
        $parent = $a_startnode_id;
    } else {
        // Absolute path: begin at the root of the tree.
        $nodePath = array();
        $parent = 0;
    }

    // Convert the title path into Unicode Normal Form C so it can be compared
    // with strings from the database.
    require_once 'include/Unicode/UtfNormal.php';
    include_once './Services/Utilities/classes/class.ilStr.php';

    $segmentCount = count($titlePath);
    $quotedTitles = array();
    for ($idx = 0; $idx < $segmentCount; $idx++) {
        $titlePath[$idx] = ilStr::strToLower(UtfNormal::toNFC($titlePath[$idx]));
        $quotedTitles[] = $ilDB->quote($titlePath[$idx], 'text');
    }
    $inClause = 'd.title IN (' . implode(',', $quotedTitles) . ')';

    // Fetch all rows that are potential path elements
    $joinClause = $this->table_obj_reference
        ? 'JOIN ' . $this->table_obj_reference . ' r ON t.child = r.' . $this->ref_pk . ' ' . 'JOIN ' . $this->table_obj_data . ' d ON r.' . $this->obj_pk . ' = d.' . $this->obj_pk
        : 'JOIN ' . $this->table_obj_data . ' d ON t.child = d.' . $this->obj_pk;

    // The ORDER BY clause ensures that, when several objects share a title,
    // the object with the oldest ref_id is always chosen. That way WebDAV
    // clients can keep working with the older object after a new object with
    // the same title has been added.
    $maxDepth = $segmentCount + count($nodePath);
    $q = 'SELECT t.depth, t.parent, t.child, d.' . $this->obj_pk . ' obj_id, d.type, d.title ' . 'FROM ' . $this->table_tree . ' t ' . $joinClause . ' ' . 'WHERE ' . $inClause . ' ' . 'AND t.depth <= ' . $maxDepth . ' ' . 'AND t.tree = 1 ' . 'ORDER BY t.depth, t.child ASC';

    $r = $ilDB->query($q);
    $candidates = array();
    while ($row = $r->fetchRow(DB_FETCHMODE_ASSOC)) {
        $row['title'] = UtfNormal::toNFC($row['title']);
        $row['ref_id'] = $row['child'];
        $candidates[] = $row;
    }

    // Walk the title path, picking for each segment the first fetched row
    // that hangs below the node found for the previous segment.
    for ($idx = 0; $idx < $segmentCount; $idx++) {
        $match = null;
        foreach ($candidates as $candidate) {
            if ($candidate['parent'] == $parent && ilStr::strToLower($candidate['title']) == $titlePath[$idx]) {
                // FIXME - We should test here, if the user has
                // 'visible' permission for the object.
                $match = $candidate;
                break;
            }
        }

        // Abort if no path element exists for the current depth
        if ($match === null) {
            return null;
        }

        $nodePath[] = $match;
        $parent = $match['child'];
    }

    return $nodePath;
}
/**
 * Return the original filename of the uploaded file, as reported by
 * the submitting user agent. HTML-style character entities are
 * interpreted and the result is run through UtfNormal::cleanUp(), in part
 * to deal with weird input from Safari with non-ASCII filenames.
 *
 * Other than this the name is not verified for being a safe filename.
 *
 * @param $key String:
 * @return string or NULL if no such file.
 */
function getFileName($key) {
    if (!isset($_FILES[$key])) {
        return null;
    }

    $reported = $_FILES[$key]['name'];

    # Safari sends filenames in HTML-encoded Unicode form D...
    # Horrid and evil! Let's try to make some kind of sense of it.
    $name = UtfNormal::cleanUp(Sanitizer::decodeCharReferences($reported));

    wfDebug("WebRequest::getFileName() '" . $reported . "' normalized to '{$name}'\n");
    return $name;
}
/**
 * Recursively normalizes UTF-8 strings in the given array.
 *
 * Non-array values go through $wgContLang->normalize() when the content
 * language is set, otherwise through UtfNormal::cleanUp().
 *
 * @param $data string|array
 * @return array|string cleaned-up version of the given
 * @private
 */
function normalizeUnicode($data) {
    global $wgContLang;

    if (!is_array($data)) {
        return isset($wgContLang)
            ? $wgContLang->normalize($data)
            : UtfNormal::cleanUp($data);
    }

    foreach (array_keys($data) as $key) {
        $data[$key] = $this->normalizeUnicode($data[$key]);
    }
    return $data;
}
/**
 * Get the sanitized title text converted to Unicode normal form C.
 *
 * @return string
 */
public function getNormalizedDestinationTitle() {
    $sanitized = $this->getSanitizedTitleText();

    return \UtfNormal::toNFC($sanitized);
}
/**
 * Helper function of a helper function to convert charset for iptc values.
 *
 * With a charset, the data is converted to UTF-8 with iconv and trimmed; a
 * failed conversion yields an empty string. Without a charset, data that
 * UtfNormal::quickIsNFCVerify() leaves unchanged is returned as is, and
 * anything else is converted as Windows-1252.
 *
 * @param string|array $data The IPTC string
 *        NOTE(review): the value is passed straight to iconv(), so this
 *        helper looks like it expects a string - confirm against the caller.
 * @param string $charset The charset
 *
 * @return string
 */
private static function convIPTCHelper($data, $charset) {
    if (!$charset) {
        // Treat as utf-8 if it is valid utf-8, otherwise pretend it is
        // windows-1252. Most of the time if there is no 1:90 tag, it is
        // either ascii, latin1, or utf-8.
        $untouched = $data;
        UtfNormal::quickIsNFCVerify($data); // makes $data valid utf-8
        if ($data === $untouched) {
            return $data; // validation didn't change $data
        }
        return self::convIPTCHelper($untouched, 'Windows-1252');
    }

    wfSuppressWarnings();
    $converted = iconv($charset, "UTF-8//IGNORE", $data);
    wfRestoreWarnings();

    if ($converted === false) {
        $converted = "";
        wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset {$charset} to utf-8");
    }

    return trim($converted);
}
/**
 * Preprocess some wikitext and return the document tree.
 * This is the ghost of Parser::replace_variables().
 *
 * @param string $text The text to parse
 * @param integer flags Bitwise combination of:
 *          Parser::PTD_FOR_INCLUSION    Handle <noinclude>/<includeonly> as if the text is being
 *                                     included. Default is to assume a direct page view.
 *
 * The generated DOM tree must depend only on the input text and the flags.
 * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 *
 * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 * change in the DOM tree for a given text, must be passed through the section identifier
 * in the section edit link and thus back to extractSections().
 *
 * Texts longer than $wgPreprocessorCacheThreshold have their preprocessor XML
 * stored in memcached for a day, prefixed with an 8-digit cache version.
 *
 * @throws MWException if the generated XML cannot be loaded even after
 *         running it through UtfNormal::cleanUp()
 * @private
 */
function preprocessToObj($text, $flags = 0) {
    wfProfileIn(__METHOD__);
    global $wgMemc, $wgPreprocessorCacheThreshold;

    $xml = false;
    // A threshold of false must not enable caching: strlen() > false would
    // otherwise be true for every non-empty text.
    $cacheable = $wgPreprocessorCacheThreshold !== false
        && strlen($text) > $wgPreprocessorCacheThreshold;

    if ($cacheable) {
        wfProfileIn(__METHOD__ . '-cacheable');

        $cacheKey = wfMemcKey('preprocess-xml', md5($text), $flags);
        $cacheValue = $wgMemc->get($cacheKey);
        if ($cacheValue) {
            // The first 8 characters hold the cache version.
            $version = substr($cacheValue, 0, 8);
            if (intval($version) == self::CACHE_VERSION) {
                $xml = substr($cacheValue, 8);
                // From the cache
                wfDebugLog("Preprocessor", "Loaded preprocessor XML from memcached (key {$cacheKey})");
            }
        }
    }

    if ($xml === false) {
        if ($cacheable) {
            wfProfileIn(__METHOD__ . '-cache-miss');
            $xml = $this->preprocessToXml($text, $flags);
            $cacheValue = sprintf("%08d", self::CACHE_VERSION) . $xml;
            $wgMemc->set($cacheKey, $cacheValue, 86400);
            wfProfileOut(__METHOD__ . '-cache-miss');
            wfDebugLog("Preprocessor", "Saved preprocessor XML to memcached (key {$cacheKey})");
        } else {
            $xml = $this->preprocessToXml($text, $flags);
        }
    }

    wfProfileIn(__METHOD__ . '-loadXML');
    $dom = new DOMDocument();
    wfSuppressWarnings();
    $result = $dom->loadXML($xml);
    wfRestoreWarnings();
    if (!$result) {
        // Try running the XML through UtfNormal to get rid of invalid characters
        $xml = UtfNormal::cleanUp($xml);
        $result = $dom->loadXML($xml);
    }
    if ($result) {
        $obj = new PPNode_DOM($dom->documentElement);
    }
    wfProfileOut(__METHOD__ . '-loadXML');

    if ($cacheable) {
        wfProfileOut(__METHOD__ . '-cacheable');
    }
    wfProfileOut(__METHOD__);

    // Throw only after every profiling section has been closed, so a failure
    // does not leave the profiler stack unbalanced.
    if (!$result) {
        throw new MWException(__METHOD__ . ' generated invalid XML');
    }
    return $obj;
}
/**
 * Reads messages from the file in a given language and returns an array
 * of AUTHORS, MESSAGES and possibly other properties.
 *
 * The file contents must be valid UTF-8; they are run through
 * UtfNormal::cleanUp() before being handed to readFromVariable().
 *
 * @param string $code Language code.
 * @return array|bool False if the file does not exist
 * @throws MWException if the file is not readable, has bad encoding or
 *         cannot be parsed; a parse failure carries the original exception
 *         as its previous exception
 */
public function read($code) {
    if (!$this->exists($code)) {
        return false;
    }

    $filename = $this->group->getSourceFilePath($code);
    $input = file_get_contents($filename);
    if ($input === false) {
        throw new MWException("Unable to read file {$filename}.");
    }

    if (!StringUtils::isUtf8($input)) {
        throw new MWException("Contents of {$filename} are not valid utf-8.");
    }

    $input = UtfNormal::cleanUp($input);

    try {
        return $this->readFromVariable($input);
    } catch (Exception $e) {
        // Chain the original exception so its type and stack trace stay
        // available to whoever handles the MWException.
        throw new MWException("Parsing {$filename} failed: " . $e->getMessage(), 0, $e);
    }
}
/**
 * Preprocess some wikitext and return the document tree.
 * This is the ghost of Parser::replace_variables().
 *
 * @param string $text the text to parse
 * @param $flags Integer: bitwise combination of:
 *          Parser::PTD_FOR_INCLUSION    Handle "<noinclude>" and "<includeonly>" as if the text is being
 *                                     included. Default is to assume a direct page view.
 *
 * The generated DOM tree must depend only on the input text and the flags.
 * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
 *
 * Any flag added to the $flags parameter here, or any other parameter liable to cause a
 * change in the DOM tree for a given text, must be passed through the section identifier
 * in the section edit link and thus back to extractSections().
 *
 * Texts longer than $wgPreprocessorCacheThreshold (unless it is false) have
 * their preprocessor XML stored in memcached for a day, prefixed with an
 * 8-digit cache version.
 *
 * @throws MWException if the generated node count limit is exceeded or the
 *         generated XML cannot be loaded
 * @return PPNode_DOM
 */
function preprocessToObj($text, $flags = 0) {
    wfProfileIn(__METHOD__);
    global $wgMemc, $wgPreprocessorCacheThreshold;

    $cacheable = $wgPreprocessorCacheThreshold !== false
        && strlen($text) > $wgPreprocessorCacheThreshold;

    if (!$cacheable) {
        $xml = $this->preprocessToXml($text, $flags);
    } else {
        wfProfileIn(__METHOD__ . '-cacheable');

        $xml = false;
        $cacheKey = wfMemcKey('preprocess-xml', md5($text), $flags);
        $cacheValue = $wgMemc->get($cacheKey);

        // A cached value is only usable when its 8-character version prefix
        // matches the current cache version.
        if ($cacheValue && intval(substr($cacheValue, 0, 8)) == self::CACHE_VERSION) {
            $xml = substr($cacheValue, 8);
            // From the cache
            wfDebugLog("Preprocessor", "Loaded preprocessor XML from memcached (key {$cacheKey})");
        }

        if ($xml === false) {
            wfProfileIn(__METHOD__ . '-cache-miss');
            $xml = $this->preprocessToXml($text, $flags);
            $cacheValue = sprintf("%08d", self::CACHE_VERSION) . $xml;
            $wgMemc->set($cacheKey, $cacheValue, 86400);
            wfProfileOut(__METHOD__ . '-cache-miss');
            wfDebugLog("Preprocessor", "Saved preprocessor XML to memcached (key {$cacheKey})");
        }
    }

    // Fail if the number of elements exceeds acceptable limits
    // Do not attempt to generate the DOM
    $this->parser->mGeneratedPPNodeCount += substr_count($xml, '<');
    $nodeLimit = $this->parser->mOptions->getMaxGeneratedPPNodeCount();
    if ($this->parser->mGeneratedPPNodeCount > $nodeLimit) {
        if ($cacheable) {
            wfProfileOut(__METHOD__ . '-cacheable');
        }
        wfProfileOut(__METHOD__);
        throw new MWException(__METHOD__ . ': generated node count limit exceeded');
    }

    wfProfileIn(__METHOD__ . '-loadXML');
    $dom = new DOMDocument();
    wfSuppressWarnings();
    $loaded = $dom->loadXML($xml);
    wfRestoreWarnings();
    if (!$loaded) {
        // Try running the XML through UtfNormal to get rid of invalid characters
        $xml = UtfNormal::cleanUp($xml);
        // 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2 don't barf when the XML is >256 levels deep
        $loaded = $dom->loadXML($xml, 1 << 19);
    }
    if ($loaded) {
        $obj = new PPNode_DOM($dom->documentElement);
    }
    wfProfileOut(__METHOD__ . '-loadXML');

    if ($cacheable) {
        wfProfileOut(__METHOD__ . '-cacheable');
    }
    wfProfileOut(__METHOD__);

    // Profiling sections are closed before the failure is reported.
    if (!$loaded) {
        throw new MWException(__METHOD__ . ' generated invalid XML');
    }
    return $obj;
}
/**
 * Make a string safe for inclusion in XML output.
 *
 * The page may contain old data which has not been properly normalized.
 * Invalid UTF-8 sequences or forbidden control characters will make our
 * XML output invalid, so the string is run through UtfNormal::cleanUp()
 * before it is escaped with htmlspecialchars().
 *
 * @param $string String
 * @return string
 */
function xmlsafe($string) {
    wfProfileIn('xmlsafe');

    $escaped = htmlspecialchars(UtfNormal::cleanUp($string));

    wfProfileOut('xmlsafe');
    return $escaped;
}
/**
 * This is a method to pass messages from wfDebug to the pretty debugger.
 * Do NOT use this method, use MWDebug::log or wfDebug()
 *
 * The message is stored only when the debugger is enabled or one of
 * $wgDebugComments / $wgShowDebug is set; it is cleaned up and stripped of
 * trailing whitespace first.
 *
 * @since 1.19
 * @param $str string
 */
public static function debugMsg($str) {
    global $wgDebugComments, $wgShowDebug;

    $collecting = self::$enabled || $wgDebugComments || $wgShowDebug;
    if (!$collecting) {
        return;
    }

    self::$debug[] = rtrim(UtfNormal::cleanUp($str));
}
/**
 * Do userComment tags and similar. See pg. 34 of exif standard.
 * basically first 8 bytes is charset, rest is value.
 * This has not been tested on any shift-JIS strings.
 *
 * The value is converted to UTF-8 and trimmed; properties that are too short
 * or contain only whitespace are removed from the filtered data.
 *
 * @param string $prop prop name.
 */
private function charCodeString($prop) {
    if (!isset($this->mFilteredExifData[$prop])) {
        return;
    }

    if (strlen($this->mFilteredExifData[$prop]) <= 8) {
        //invalid. Must be at least 9 bytes long.
        $this->debug($this->mFilteredExifData[$prop], __FUNCTION__, false);
        unset($this->mFilteredExifData[$prop]);
        return;
    }

    // The character code occupies a fixed 8-byte field; per the Exif
    // standard shorter codes are padded with NUL bytes. Strip the padding,
    // otherwise the 8-byte field can never equal "JIS" or "UNICODE".
    $charCode = rtrim(substr($this->mFilteredExifData[$prop], 0, 8), "\x00");
    $val = substr($this->mFilteredExifData[$prop], 8);

    switch ($charCode) {
        case "JIS":
            //JIS
            $charset = "Shift-JIS";
            break;
        case "UNICODE":
            $charset = "UTF-16" . $this->byteOrder;
            break;
        default:
            //ascii or undefined.
            $charset = "";
            break;
    }

    // This could possibly check to see if iconv is really installed
    // or if we're using the compatibility wrapper in globalFunctions.php
    if ($charset) {
        wfSuppressWarnings();
        $val = iconv($charset, 'UTF-8//IGNORE', $val);
        wfRestoreWarnings();
    } else {
        // if valid utf-8, assume that, otherwise assume windows-1252
        $valCopy = $val;
        UtfNormal::quickIsNFCVerify($valCopy); //validates $valCopy.
        if ($valCopy !== $val) {
            wfSuppressWarnings();
            $val = iconv('Windows-1252', 'UTF-8//IGNORE', $val);
            wfRestoreWarnings();
        }
    }

    //trim and check to make sure not only whitespace.
    $val = trim($val);
    if (strlen($val) === 0) {
        //only whitespace.
        $this->debug($this->mFilteredExifData[$prop], __FUNCTION__, "{$prop}: Is only whitespace");
        unset($this->mFilteredExifData[$prop]);
        return;
    }

    //all's good.
    $this->mFilteredExifData[$prop] = $val;
}
/** * Preprocess some wikitext and return the document tree. * This is the ghost of Parser::replace_variables(). * * @param string $text The text to parse * @param integer flags Bitwise combination of: * Parser::PTD_FOR_INCLUSION Handle <noinclude>/<includeonly> as if the text is being * included. Default is to assume a direct page view. * * The generated DOM tree must depend only on the input text and the flags. * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899. * * Any flag added to the $flags parameter here, or any other parameter liable to cause a * change in the DOM tree for a given text, must be passed through the section identifier * in the section edit link and thus back to extractSections(). * * The output of this function is currently only cached in process memory, but a persistent * cache may be implemented at a later date which takes further advantage of these strict * dependency requirements. * * @private */ function preprocessToObj($text, $flags = 0) { wfProfileIn(__METHOD__); wfProfileIn(__METHOD__ . 
'-makexml'); $rules = array('{' => array('end' => '}', 'names' => array(2 => 'template', 3 => 'tplarg'), 'min' => 2, 'max' => 3), '[' => array('end' => ']', 'names' => array(2 => null), 'min' => 2, 'max' => 2)); $forInclusion = $flags & Parser::PTD_FOR_INCLUSION; $xmlishElements = $this->parser->getStripList(); $enableOnlyinclude = false; if ($forInclusion) { $ignoredTags = array('includeonly', '/includeonly'); $ignoredElements = array('noinclude'); $xmlishElements[] = 'noinclude'; if (strpos($text, '<onlyinclude>') !== false && strpos($text, '</onlyinclude>') !== false) { $enableOnlyinclude = true; } } else { $ignoredTags = array('noinclude', '/noinclude', 'onlyinclude', '/onlyinclude'); $ignoredElements = array('includeonly'); $xmlishElements[] = 'includeonly'; } $xmlishRegex = implode('|', array_merge($xmlishElements, $ignoredTags)); // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset $elementsRegex = "~({$xmlishRegex})(?:\\s|\\/>|>)|(!--)~iA"; $stack = new PPDStack(); $searchBase = "[{<\n"; #} $revText = strrev($text); // For fast reverse searches $i = 0; # Input pointer, starts out pointing to a pseudo-newline before the start $accum =& $stack->getAccum(); # Current accumulator $accum = '<root>'; $findEquals = false; # True to find equals signs in arguments $findPipe = false; # True to take notice of pipe characters $headingIndex = 1; $inHeading = false; # True if $i is inside a possible heading $noMoreGT = false; # True if there are no more greater-than (>) signs right of $i $findOnlyinclude = $enableOnlyinclude; # True to ignore all input up to the next <onlyinclude> $fakeLineStart = true; # Do a line-start run without outputting an LF character while (true) { //$this->memCheck(); if ($findOnlyinclude) { // Ignore all input up to the next <onlyinclude> $startPos = strpos($text, '<onlyinclude>', $i); if ($startPos === false) { // Ignored section runs to the end $accum .= '<ignore>' . htmlspecialchars(substr($text, $i)) . 
'</ignore>'; break; } $tagEndPos = $startPos + strlen('<onlyinclude>'); // past-the-end $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i)) . '</ignore>'; $i = $tagEndPos; $findOnlyinclude = false; } if ($fakeLineStart) { $found = 'line-start'; $curChar = ''; } else { # Find next opening brace, closing brace or pipe $search = $searchBase; if ($stack->top === false) { $currentClosing = ''; } else { $currentClosing = $stack->top->close; $search .= $currentClosing; } if ($findPipe) { $search .= '|'; } if ($findEquals) { // First equals will be for the template $search .= '='; } $rule = null; # Output literal section, advance input counter $literalLength = strcspn($text, $search, $i); if ($literalLength > 0) { $accum .= htmlspecialchars(substr($text, $i, $literalLength)); $i += $literalLength; } if ($i >= strlen($text)) { if ($currentClosing == "\n") { // Do a past-the-end run to finish off the heading $curChar = ''; $found = 'line-end'; } else { # All done break; } } else { $curChar = $text[$i]; if ($curChar == '|') { $found = 'pipe'; } elseif ($curChar == '=') { $found = 'equals'; } elseif ($curChar == '<') { $found = 'angle'; } elseif ($curChar == "\n") { if ($inHeading) { $found = 'line-end'; } else { $found = 'line-start'; } } elseif ($curChar == $currentClosing) { $found = 'close'; } elseif (isset($rules[$curChar])) { $found = 'open'; $rule = $rules[$curChar]; } else { # Some versions of PHP have a strcspn which stops on null characters # Ignore and continue ++$i; continue; } } } if ($found == 'angle') { $matches = false; // Handle </onlyinclude> if ($enableOnlyinclude && substr($text, $i, strlen('</onlyinclude>')) == '</onlyinclude>') { $findOnlyinclude = true; continue; } // Determine element name if (!preg_match($elementsRegex, $text, $matches, 0, $i + 1)) { // Element name missing or not listed $accum .= '<'; ++$i; continue; } // Handle comments if (isset($matches[2]) && $matches[2] == '!--') { // To avoid leaving blank lines, when a 
comment is both preceded // and followed by a newline (ignoring spaces), trim leading and // trailing spaces and one of the newlines. // Find the end $endPos = strpos($text, '-->', $i + 4); if ($endPos === false) { // Unclosed comment in input, runs to end $inner = substr($text, $i); $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>'; $i = strlen($text); } else { // Search backwards for leading whitespace $wsStart = $i ? $i - strspn($revText, ' ', strlen($text) - $i) : 0; // Search forwards for trailing whitespace // $wsEnd will be the position of the last space $wsEnd = $endPos + 2 + strspn($text, ' ', $endPos + 3); // Eat the line if possible // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but // it's a possible beneficial b/c break. if ($wsStart > 0 && substr($text, $wsStart - 1, 1) == "\n" && substr($text, $wsEnd + 1, 1) == "\n") { $startPos = $wsStart; $endPos = $wsEnd + 1; // Remove leading whitespace from the end of the accumulator // Sanity check first though $wsLength = $i - $wsStart; if ($wsLength > 0 && substr($accum, -$wsLength) === str_repeat(' ', $wsLength)) { $accum = substr($accum, 0, -$wsLength); } // Do a line-start run next time to look for headings after the comment $fakeLineStart = true; } else { // No line to eat, just take the comment itself $startPos = $i; $endPos += 2; } if ($stack->top) { $part = $stack->top->getCurrentPart(); if (isset($part->commentEnd) && $part->commentEnd == $wsStart - 1) { // Comments abutting, no change in visual end $part->commentEnd = $wsEnd; } else { $part->visualEnd = $wsStart; $part->commentEnd = $endPos; } } $i = $endPos + 1; $inner = substr($text, $startPos, $endPos - $startPos + 1); $accum .= '<comment>' . htmlspecialchars($inner) . 
'</comment>'; } continue; } $name = $matches[1]; $lowerName = strtolower($name); $attrStart = $i + strlen($name) + 1; // Find end of tag $tagEndPos = $noMoreGT ? false : strpos($text, '>', $attrStart); if ($tagEndPos === false) { // Infinite backtrack // Disable tag search to prevent worst-case O(N^2) performance $noMoreGT = true; $accum .= '<'; ++$i; continue; } // Handle ignored tags if (in_array($lowerName, $ignoredTags)) { $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i + 1)) . '</ignore>'; $i = $tagEndPos + 1; continue; } $tagStartPos = $i; if ($text[$tagEndPos - 1] == '/') { $attrEnd = $tagEndPos - 1; $inner = null; $i = $tagEndPos + 1; $close = ''; } else { $attrEnd = $tagEndPos; // Find closing tag if (preg_match("/<\\/{$name}\\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1)) { $inner = substr($text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1); $i = $matches[0][1] + strlen($matches[0][0]); $close = '<close>' . htmlspecialchars($matches[0][0]) . '</close>'; } else { // No end tag -- let it run out to the end of the text. $inner = substr($text, $tagEndPos + 1); $i = strlen($text); $close = ''; } } // <includeonly> and <noinclude> just become <ignore> tags if (in_array($lowerName, $ignoredElements)) { $accum .= '<ignore>' . htmlspecialchars(substr($text, $tagStartPos, $i - $tagStartPos)) . '</ignore>'; continue; } $accum .= '<ext>'; if ($attrEnd <= $attrStart) { $attr = ''; } else { $attr = substr($text, $attrStart, $attrEnd - $attrStart); } $accum .= '<name>' . htmlspecialchars($name) . '</name>' . '<attr>' . htmlspecialchars($attr) . '</attr>'; if ($inner !== null) { $accum .= '<inner>' . htmlspecialchars($inner) . '</inner>'; } $accum .= $close . '</ext>'; } elseif ($found == 'line-start') { // Is this the start of a heading? 
// Line break belongs before the heading element in any case if ($fakeLineStart) { $fakeLineStart = false; } else { $accum .= $curChar; $i++; } $count = strspn($text, '=', $i, 6); if ($count == 1 && $findEquals) { // DWIM: This looks kind of like a name/value separator // Let's let the equals handler have it and break the potential heading // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex. } elseif ($count > 0) { $piece = array('open' => "\n", 'close' => "\n", 'parts' => array(new PPDPart(str_repeat('=', $count))), 'startPos' => $i, 'count' => $count); $stack->push($piece); $accum =& $stack->getAccum(); extract($stack->getFlags()); $i += $count; } } elseif ($found == 'line-end') { $piece = $stack->top; // A heading must be open, otherwise \n wouldn't have been in the search list assert($piece->open == "\n"); $part = $piece->getCurrentPart(); // Search back through the input to see if it has a proper close // Do this using the reversed string since the other solutions (end anchor, etc.) 
are inefficient $wsLength = strspn($revText, " \t", strlen($text) - $i); $searchStart = $i - $wsLength; if (isset($part->commentEnd) && $searchStart - 1 == $part->commentEnd) { // Comment found at line end // Search for equals signs before the comment $searchStart = $part->visualEnd; $searchStart -= strspn($revText, " \t", strlen($text) - $searchStart); } $count = $piece->count; $equalsLength = strspn($revText, '=', strlen($text) - $searchStart); if ($equalsLength > 0) { if ($i - $equalsLength == $piece->startPos) { // This is just a single string of equals signs on its own line // Replicate the doHeadings behaviour /={count}(.+)={count}/ // First find out how many equals signs there really are (don't stop at 6) $count = $equalsLength; if ($count < 3) { $count = 0; } else { $count = min(6, intval(($count - 1) / 2)); } } else { $count = min($equalsLength, $count); } if ($count > 0) { // Normal match, output <h> $element = "<h level=\"{$count}\" i=\"{$headingIndex}\">{$accum}</h>"; $headingIndex++; } else { // Single equals sign on its own line, count=0 $element = $accum; } } else { // No match, no <h>, just pass down the inner text $element = $accum; } // Unwind the stack $stack->pop(); $accum =& $stack->getAccum(); extract($stack->getFlags()); // Append the result to the enclosing accumulator $accum .= $element; // Note that we do NOT increment the input pointer. // This is because the closing linebreak could be the opening linebreak of // another heading. Infinite loops are avoided because the next iteration MUST // hit the heading open case above, which unconditionally increments the // input pointer. 
} elseif ($found == 'open') { # count opening brace characters $count = strspn($text, $curChar, $i); # we need to add to stack only if opening brace count is enough for one of the rules if ($count >= $rule['min']) { # Add it to the stack $piece = array('open' => $curChar, 'close' => $rule['end'], 'count' => $count, 'lineStart' => $i > 0 && $text[$i - 1] == "\n"); $stack->push($piece); $accum =& $stack->getAccum(); extract($stack->getFlags()); } else { # Add literal brace(s) $accum .= htmlspecialchars(str_repeat($curChar, $count)); } $i += $count; } elseif ($found == 'close') { $piece = $stack->top; # lets check if there are enough characters for closing brace $maxCount = $piece->count; $count = strspn($text, $curChar, $i, $maxCount); # check for maximum matching characters (if there are 5 closing # characters, we will probably need only 3 - depending on the rules) $matchingCount = 0; $rule = $rules[$piece->open]; if ($count > $rule['max']) { # The specified maximum exists in the callback array, unless the caller # has made an error $matchingCount = $rule['max']; } else { # Count is less than the maximum # Skip any gaps in the callback array to find the true largest match # Need to use array_key_exists not isset because the callback can be null $matchingCount = $count; while ($matchingCount > 0 && !array_key_exists($matchingCount, $rule['names'])) { --$matchingCount; } } if ($matchingCount <= 0) { # No matching element found in callback array # Output a literal closing brace and continue $accum .= htmlspecialchars(str_repeat($curChar, $count)); $i += $count; continue; } $name = $rule['names'][$matchingCount]; if ($name === null) { // No element, just literal text $element = $piece->breakSyntax($matchingCount) . 
str_repeat($rule['end'], $matchingCount); } else { # Create XML element # Note: $parts is already XML, does not need to be encoded further $parts = $piece->parts; $title = $parts[0]->out; unset($parts[0]); # The invocation is at the start of the line if lineStart is set in # the stack, and all opening brackets are used up. if ($maxCount == $matchingCount && !empty($piece->lineStart)) { $attr = ' lineStart="1"'; } else { $attr = ''; } $element = "<{$name}{$attr}>"; $element .= "<title>{$title}</title>"; $argIndex = 1; foreach ($parts as $partIndex => $part) { if (isset($part->eqpos)) { $argName = substr($part->out, 0, $part->eqpos); $argValue = substr($part->out, $part->eqpos + 1); $element .= "<part><name>{$argName}</name>=<value>{$argValue}</value></part>"; } else { $element .= "<part><name index=\"{$argIndex}\" /><value>{$part->out}</value></part>"; $argIndex++; } } $element .= "</{$name}>"; } # Advance input pointer $i += $matchingCount; # Unwind the stack $stack->pop(); $accum =& $stack->getAccum(); # Re-add the old stack element if it still has unmatched opening characters remaining if ($matchingCount < $piece->count) { $piece->parts = array(new PPDPart()); $piece->count -= $matchingCount; # do we still qualify for any callback with remaining count? 
$names = $rules[$piece->open]['names']; $skippedBraces = 0; $enclosingAccum =& $accum; while ($piece->count) { if (array_key_exists($piece->count, $names)) { $stack->push($piece); $accum =& $stack->getAccum(); break; } --$piece->count; $skippedBraces++; } $enclosingAccum .= str_repeat($piece->open, $skippedBraces); } extract($stack->getFlags()); # Add XML element to the enclosing accumulator $accum .= $element; } elseif ($found == 'pipe') { $findEquals = true; // shortcut for getFlags() $stack->addPart(); $accum =& $stack->getAccum(); ++$i; } elseif ($found == 'equals') { $findEquals = false; // shortcut for getFlags() $stack->getCurrentPart()->eqpos = strlen($accum); $accum .= '='; ++$i; } } # Output any remaining unclosed brackets foreach ($stack->stack as $piece) { $stack->rootAccum .= $piece->breakSyntax(); } $stack->rootAccum .= '</root>'; $xml = $stack->rootAccum; wfProfileOut(__METHOD__ . '-makexml'); wfProfileIn(__METHOD__ . '-loadXML'); $dom = new DOMDocument(); wfSuppressWarnings(); $result = $dom->loadXML($xml); wfRestoreWarnings(); if (!$result) { // Try running the XML through UtfNormal to get rid of invalid characters $xml = UtfNormal::cleanUp($xml); $result = $dom->loadXML($xml); if (!$result) { throw new MWException(__METHOD__ . ' generated invalid XML'); } } $obj = new PPNode_DOM($dom->documentElement); wfProfileOut(__METHOD__ . '-loadXML'); wfProfileOut(__METHOD__); return $obj; }
// NOTE(review): everything up to the first lone closing brace is the tail of a
// function whose header lies outside this chunk; $ota and $nta are set in that
// unseen part. Presumably this is showDiffs(), which the loop below calls -- confirm.
    // Build a Diff of the two values and render it with TableDiffFormatter.
    $diffs = new Diff($ota, $nta);
    $formatter = new TableDiffFormatter();
    $funky = $formatter->format($diffs);
    // Capture the contents of every <span class="diffchange"> in the rendered
    // output and print each capture hex-encoded on its own tab-indented line.
    preg_match_all('/<span class="diffchange">(.*?)<\\/span>/', $funky, $matches);
    foreach ($matches[1] as $bit) {
        $hex = bin2hex($bit);
        echo "\t{$hex}\n";
    }
}

// Endless comparison loop: each round prints a counter, obtains a string from
// randomString(), runs it through UtfNormal::cleanUp() and through donorm(),
// prints both result lengths and compares the results. The loop only ends via
// die on the first mismatch.
// randomString() and donorm() are defined outside this chunk.
$size = 16;
$n = 0;
while (true) {
    $n++;
    echo "{$n}\n";
    // NOTE(review): the meaning of the second argument (true) cannot be told from here.
    $str = randomString($size, true);
    $clean = UtfNormal::cleanUp($str);
    $norm = donorm($str);
    echo strlen($clean) . ", " . strlen($norm);
    // NOTE(review): == is a loose comparison; two numeric-looking strings can
    // compare equal while differing byte-wise. Consider === for a byte-exact check.
    if ($clean == $norm) {
        echo " (match)\n";
    } else {
        // Mismatch: dump the input and both results as hex (the output labels
        // cleanUp()'s result "php" and donorm()'s result "icu"), print the
        // differing spans via showDiffs() and stop the script.
        echo " (FAIL)\n";
        echo "\traw: " . bin2hex($str) . "\n" . "\tphp: " . bin2hex($clean) . "\n" . "\ticu: " . bin2hex($norm) . "\n";
        echo "\n\tdiffs:\n";
        showDiffs($clean, $norm);
        die;
    }
    // Reset the three strings before the next round.
    $str = '';
    $clean = '';
    $norm = '';
}
// Command-line benchmark driver; summary of the statements on the following line:
//  - exits via die() unless PHP_SAPI is 'cli';
//  - calls dl('php_utfnormal.so') when '--icu' appears in $_SERVER['argv'];
//  - pulls in UtfNormalDefines.php, UtfNormalUtil.php and UtfNormal.php;
//  - defines BENCH_CYCLES (1) and BIGSIZE (1024 * 1024 * 10) and sets memory_limit
//    to BIGSIZE plus 120 * 1024 * 1024;
//  - creates a UtfNormal instance, calls UtfNormal::loadData(), then calls
//    benchmarkTest() once per entry of $testfiles (file path => description).
// benchmarkTest(&$u, $filename, $desc) opens at the end of the line: it prints the
// file name and description, reads the file, doubles the data until it is at least
// BIGSIZE bytes long, prints the resulting size and lists the names
// 'quickIsNFCVerify' and 'cleanUp' in $forms.
// NOTE(review): the remainder of benchmarkTest() lies outside this chunk, so how
// $forms, $u and BENCH_CYCLES are used cannot be confirmed from here.
if (PHP_SAPI != 'cli') { die("Run me from the command line please.\n"); } if (isset($_SERVER['argv']) && in_array('--icu', $_SERVER['argv'])) { dl('php_utfnormal.so'); } require_once 'UtfNormalDefines.php'; require_once 'UtfNormalUtil.php'; require_once 'UtfNormal.php'; define('BENCH_CYCLES', 1); define('BIGSIZE', 1024 * 1024 * 10); // 10m ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024); $testfiles = array('testdata/washington.txt' => 'English text', 'testdata/berlin.txt' => 'German text', 'testdata/bulgakov.txt' => 'Russian text', 'testdata/tokyo.txt' => 'Japanese text', 'testdata/young.txt' => 'Korean text'); $normalizer = new UtfNormal(); UtfNormal::loadData(); foreach ($testfiles as $file => $desc) { benchmarkTest($normalizer, $file, $desc); } # ------- function benchmarkTest(&$u, $filename, $desc) { print "Testing {$filename} ({$desc})...\n"; $data = file_get_contents($filename); $all = $data; while (strlen($all) < BIGSIZE) { $all .= $all; } $data = $all; echo "Data is " . strlen($data) . " bytes.\n"; $forms = array('quickIsNFCVerify', 'cleanUp');
/**
 * Request the text for one ID from the spawned database subprocess.
 *
 * Exchange as implemented here: the ID is written on a line of its own to
 * $this->spawnWrite and flushed; one line is then read from $this->spawnRead
 * and interpreted as the payload length in bytes, after which that many
 * bytes are read as the text.
 *
 * @param $id Mixed: identifier, interpolated into the request line
 * @return mixed The text with carriage returns removed and passed through
 *   UtfNormal::cleanUp(), or false when the request could not be written or
 *   flushed, no length line was received, or fewer bytes arrived than the
 *   length line announced.
 */
private function getTextSpawnedOnce($id) {
    $ok = fwrite($this->spawnWrite, "{$id}\n");
    //$this->progress( ">> $id" );
    if (!$ok) {
        return false;
    }

    $ok = fflush($this->spawnWrite);
    //$this->progress( ">> [flush]" );
    if (!$ok) {
        return false;
    }

    $len = fgets($this->spawnRead);
    //$this->progress( "<< " . trim( $len ) );
    if ($len === false) {
        return false;
    }

    $nbytes = intval($len);
    $text = "";

    // Subprocess may not send everything at once, we have to loop.
    while ($nbytes > strlen($text)) {
        $buffer = fread($this->spawnRead, $nbytes - strlen($text));
        // Check the value fread() just returned, not the accumulator: $text is
        // always a string, so testing it against false could never fire and a
        // failed read left this loop spinning forever.
        if ($buffer === false) {
            break;
        }
        // An empty read at end-of-stream means the pipe was closed before the
        // announced length arrived; nothing more can come, so stop here and
        // let the length check below report the short read.
        if ($buffer === '' && feof($this->spawnRead)) {
            break;
        }
        $text .= $buffer;
    }

    $gotbytes = strlen($text);
    if ($gotbytes != $nbytes) {
        $this->progress("Expected {$nbytes} bytes from database subprocess, got {$gotbytes} ");
        return false;
    }

    // Do normalization in the dump thread...
    $stripped = str_replace("\r", "", $text);
    $normalized = UtfNormal::cleanUp($stripped);
    return $normalized;
}