Example #1
0
 /**
  * Check one image row and repair its name when it is empty, illegal or
  * not in canonical form.
  *
  * @param $row Object: row exposing the image name as img_name
  * @return mixed result of progress(1) when the row needed fixing, otherwise nothing
  */
 function processPage($row)
 {
     global $wgContLang;
     $source = $row->img_name;
     if ($source == '') {
         // Nameless legacy rows cannot be repaired; remove them.
         $this->killRow($source);
         return $this->progress(1);
     }
     // Repair pipeline, innermost call first: undo percent-encoding, let the
     // content language fix legacy encodings (e.g. old latin-1), then
     // normalize the resulting Unicode.
     $cleaned = UtfNormal::cleanUp(
         $wgContLang->checkTitleEncoding(rawurldecode($source))
     );
     $title = Title::makeTitleSafe(NS_IMAGE, $cleaned);
     if ($title === null) {
         // Even the cleaned name is not a valid title: move to a generated safe name.
         $this->log("page {$source} ({$cleaned}) is illegal.");
         $this->pokeFile($source, $this->buildSafeTitle($cleaned));
         return $this->progress(1);
     }
     $munged = $title->getDbKey();
     if ($munged !== $source) {
         // Valid title, but the stored name is not its canonical DB key.
         $this->log("page {$source} ({$munged}) doesn't match self.");
         $this->pokeFile($source, $munged);
         return $this->progress(1);
     }
     // Nothing to fix for this row.
     $this->progress(0);
 }
 /**
  * Format an XML element as with self::element(), but run text through the
  * UtfNormal::cleanUp() validator first to ensure that no invalid UTF-8
  * is passed.
  *
  * @param $element String:
  * @param $attribs Array: Name=>value pairs. Values will be escaped.
  * @param $contents String: NULL to make an open tag only; '' for a contentless closed tag (default)
  * @return string
  */
 public static function elementClean($element, $attribs = array(), $contents = '')
 {
     // Falsy values (empty attribute list, NULL or '' contents) carry nothing
     // to validate and are handed to element() exactly as received.
     $cleanAttribs = $attribs
         ? array_map(array('UtfNormal', 'cleanUp'), $attribs)
         : $attribs;
     $cleanContents = $contents
         ? UtfNormal::cleanUp($contents)
         : $contents;
     return self::element($element, $cleanAttribs, $cleanContents);
 }
Example #3
0
 /**
  * Format an XML element as with self::element(), but run text through the
  * UtfNormal::cleanUp() validator first to ensure that no invalid UTF-8
  * is passed.
  *
  * @param $element String:
  * @param $attribs Array: Name=>value pairs. Values will be escaped.
  * @param $contents String: NULL to make an open tag only; '' for a contentless closed tag (default)
  * @return string
  */
 public static function elementClean($element, $attribs = array(), $contents = '')
 {
     if (!empty($attribs)) {
         // Validate every attribute value; keys are left as they are.
         $attribs = array_map(array('UtfNormal', 'cleanUp'), $attribs);
     }
     if (!empty($contents)) {
         // Normalization of the contents is profiled under its own section.
         $section = __METHOD__ . '-norm';
         wfProfileIn($section);
         $contents = UtfNormal::cleanUp($contents);
         wfProfileOut($section);
     }
     return self::element($element, $attribs, $contents);
 }
Example #4
0
/**
 * Reduce a UTF-8 string to printable ASCII.
 *
 * The string is first translated through the table returned by
 * _gs_utf8_get_map(), then decomposed to NFD, and finally every byte
 * outside the range 0x20-0x7E is dropped.
 *
 * @param $str string input text
 * @return string text containing only characters 0x20-0x7E
 */
function gs_utf8_decompose_to_ascii($str)
{
    // The translation table is loaded on first use and kept for later calls.
    static $map = null;
    if (!is_array($map)) {
        $map = _gs_utf8_get_map();
    }
    $translated = strtr($str, $map);
    $decomposed = UtfNormal::toNFD($translated);
    // Strip control characters, newlines and anything non-ASCII.
    return preg_replace('/[^\\x20-\\x7E]/', '', $decomposed);
}
Example #5
0
 /**
  * Check one watchlist row and remove it when it is invalid.
  *
  * A row is kept only if it has a non-zero user and its title survives a
  * round trip through UTF-8 cleanup and title parsing unchanged.
  *
  * @param $row Object: row exposing wl_user, wl_namespace and wl_title
  * @return mixed result of progress(1) when the row was removed, otherwise nothing
  */
 function processPage($row)
 {
     $current = Title::makeTitle($row->wl_namespace, $row->wl_title);
     $reparsed = Title::newFromText(UtfNormal::cleanUp($current->getPrefixedText()));
     $isValid = $row->wl_user != 0
         && $reparsed !== null
         && $reparsed->equals($current);
     if ($isValid) {
         $this->progress(0);
         return;
     }
     $this->log("invalid watch by {$row->wl_user} for ({$row->wl_namespace}, \"{$row->wl_title}\")");
     $this->removeWatch($row);
     return $this->progress(1);
 }
Example #6
0
 /**
  * Build the HTML body of a feed item describing one change to a page.
  *
  * The result always starts with a paragraph holding the action text and
  * the formatted comment. For pages in a non-negative namespace that an
  * anonymous user may read, it is followed by either the diff between
  * $oldid and $newid (replaced by a link when diffs are disabled through
  * $wgFeedDiffCutoff <= 0 or the diff is longer than the cutoff), or,
  * when $oldid is falsy, the escaped text of revision $newid.
  *
  * @param $title Object: title of the changed page (only getNamespace(),
  *               getUserPermissionsErrors() and escapeFullUrl() are used)
  * @param $oldid Integer: old revision id; falsy for a new page
  * @param $newid Integer: new revision id
  * @param $timestamp String: timestamp shown in the diff header
  * @param $comment String: edit summary, run through the skin's formatComment()
  * @param $actiontext String: optional text placed before the comment
  * @return string HTML
  */
 public static function formatDiffRow($title, $oldid, $newid, $timestamp, $comment, $actiontext = '')
 {
     global $wgFeedDiffCutoff, $wgContLang, $wgUser;
     wfProfileIn(__FUNCTION__);
     $skin = $wgUser->getSkin();
     # log entries
     $completeText = '<p>' . implode(' ', array_filter(array($actiontext, $skin->formatComment($comment)))) . "</p>\n";
     // NOTE: Check permissions for anonymous users, not current user.
     //       No "privileged" version should end up in the cache.
     //       Most feed readers will not log in anyway.
     $anon = new User();
     $accErrors = $title->getUserPermissionsErrors('read', $anon, true);
     if ($title->getNamespace() >= 0 && !$accErrors) {
         if ($oldid) {
             wfProfileIn(__FUNCTION__ . "-dodiff");
             // Stays null when diffs are disabled, so it is never read undefined.
             $diffText = null;
             // Don't bother generating the diff if we won't be able to show it
             if ($wgFeedDiffCutoff > 0) {
                 $de = new DifferenceEngine($title, $oldid, $newid);
                 $diffText = $de->getDiff(wfMsg('previousrevision'), wfMsg('revisionasof', $wgContLang->timeanddate($timestamp)));
             }
             // The cutoff test comes first: with diffs disabled there is no
             // diff text to measure.
             if ($wgFeedDiffCutoff <= 0 || strlen($diffText) > $wgFeedDiffCutoff) {
                 // Omit large diffs
                 $diffLink = $title->escapeFullUrl('diff=' . $newid . '&oldid=' . $oldid);
                 $diffText = '<a href="' . $diffLink . '">' . htmlspecialchars(wfMsgForContent('showdiff')) . '</a>';
             } elseif ($diffText === false) {
                 // Error in diff engine, probably a missing revision
                 $diffText = "<p>Can't load revision {$newid}</p>";
             } else {
                 // Diff output fine, clean up any illegal UTF-8
                 $diffText = UtfNormal::cleanUp($diffText);
                 $diffText = self::applyDiffStyle($diffText);
             }
             wfProfileOut(__FUNCTION__ . "-dodiff");
         } else {
             // No old revision: show the whole text of the new revision.
             $rev = Revision::newFromId($newid);
             if (is_null($rev)) {
                 $newtext = '';
             } else {
                 $newtext = $rev->getText();
             }
             $diffText = '<p><b>' . wfMsg('newpage') . '</b></p>' . '<div>' . nl2br(htmlspecialchars($newtext)) . '</div>';
         }
         $completeText .= $diffText;
     }
     wfProfileOut(__FUNCTION__);
     return $completeText;
 }
 /**
  * Returns the normalized form of the given page title, using the normalization rules of the given site.
  * If the given title is a redirect, the redirect will be resolved and the redirect target is returned.
  *
  * @note  : This actually makes an API request to the remote site, so beware that this function is slow and depends
  *          on an external service.
  *
  * @note  : If MW_PHPUNIT_TEST is defined, the call to the external site is skipped, and the title
  *          is normalized using the local normalization rules as implemented by the Title class.
  *
  * @see Site::normalizePageName
  *
  * @since 1.21
  *
  * @param string $pageName
  *
  * @return string
  * @throws MWException
  */
 public function normalizePageName($pageName)
 {
     // Check if we have strings as arguments.
     if (!is_string($pageName)) {
         throw new MWException('$pageName must be a string');
     }
     if (defined('MW_PHPUNIT_TEST')) {
         // If the code is under test, don't call out to other sites, just normalize locally.
         // Note: this may cause results to be inconsistent with the actual normalization used by the respective remote site!
         $t = Title::newFromText($pageName);
         if ($t === null) {
             // newFromText() yields null for text that is not a valid title;
             // report it like every other failure below instead of crashing.
             wfDebugLog("MediaWikiSite", "local normalization failed, invalid page title: {$pageName}");
             return false;
         }
         return $t->getPrefixedText();
     }
     // Make sure the string is normalized into NFC (due to the bug 40017)
     // but do nothing to the whitespaces, that should work appropriately.
     // @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
     $pageName = UtfNormal::cleanUp($pageName);
     // Build the args for the specific call
     $args = array('action' => 'query', 'prop' => 'info', 'redirects' => true, 'converttitles' => true, 'format' => 'json', 'titles' => $pageName);
     $url = $this->getFileUrl('api.php') . '?' . wfArrayToCgi($args);
     // Call the external site.
     //@todo: we need a good way to specify a timeout here.
     $ret = Http::get($url);
     // Every failure from here on is logged and reported to the caller as false.
     if ($ret === false) {
         wfDebugLog("MediaWikiSite", "call to external site failed: {$url}");
         return false;
     }
     $data = FormatJson::decode($ret, true);
     if (!is_array($data)) {
         wfDebugLog("MediaWikiSite", "call to <{$url}> returned bad json: " . $ret);
         return false;
     }
     $page = static::extractPageRecord($data, $pageName);
     if (isset($page['missing'])) {
         wfDebugLog("MediaWikiSite", "call to <{$url}> returned a marker for a missing page title! " . $ret);
         return false;
     }
     if (isset($page['invalid'])) {
         wfDebugLog("MediaWikiSite", "call to <{$url}> returned a marker for an invalid page title! " . $ret);
         return false;
     }
     if (!isset($page['title'])) {
         wfDebugLog("MediaWikiSite", "call to <{$url}> did not return a page title! " . $ret);
         return false;
     }
     return $page['title'];
 }
 /**
  * Check one page row and move the page when its title is illegal or
  * does not survive a round trip through UTF-8 cleanup and title parsing.
  *
  * @param $row Object: row exposing page_id, page_namespace and page_title
  * @return mixed result of progress(1) when the page was moved, otherwise nothing
  */
 function processPage($row)
 {
     $current = Title::makeTitle($row->page_namespace, $row->page_title);
     $display = $current->getPrefixedText();
     $reparsed = Title::newFromText(UtfNormal::cleanUp($display));
     if ($reparsed === null) {
         // The cleaned text no longer parses as a title at all.
         $this->log("page {$row->page_id} ({$display}) is illegal.");
         $this->moveIllegalPage($row);
         return $this->progress(1);
     }
     if ($reparsed->equals($current)) {
         // Title is already in its canonical form.
         $this->progress(0);
         return;
     }
     $this->log("page {$row->page_id} ({$display}) doesn't match self.");
     $this->moveInconsistentPage($row, $reparsed);
     return $this->progress(1);
 }
 /**
  * Build a random input string from the entries of $this->hairs.
  *
  * @param $max Integer|false: upper bound for the number of pieces;
  *             false selects $this->maxLength
  * @return string the concatenated pieces after UtfNormal::cleanUp()
  */
 function makeInputText($max = false)
 {
     $upper = $max === false ? $this->maxLength : $max;
     $length = mt_rand($this->minLength, $upper);
     $lastIndex = count($this->hairs) - 1;
     $pieces = array();
     for ($remaining = $length; $remaining > 0; $remaining--) {
         $pieces[] = $this->hairs[mt_rand(0, $lastIndex)];
     }
     // Send through the UTF-8 normaliser
     // This resolves a few differences between the old preprocessor and the
     // XML-based one, which doesn't like illegals and converts line endings.
     // It's done by the MW UI, so it's a reasonably legitimate thing to do.
     return UtfNormal::cleanUp(implode('', $pieces));
 }
Example #10
0
 /**
  * Check one page row and move the page when its title is illegal or
  * inconsistent; file pages that exist as files are only reported.
  *
  * @param $row Object: row exposing page_id, page_namespace and page_title
  * @return mixed result of progress(0) when nothing was moved, progress(1) otherwise
  */
 function processPage($row)
 {
     $current = Title::makeTitle($row->page_namespace, $row->page_title);
     $display = $current->getPrefixedText();
     $title = Title::newFromText(UtfNormal::cleanUp($display));
     $isFine = $title !== null && $title->equals($current) && $title->canExist();
     if ($isFine) {
         return $this->progress(0);
     }
     if ($row->page_namespace == NS_FILE && $this->fileExists($row->page_title)) {
         // Left to the dedicated image cleanup script.
         $this->log("file {$row->page_title} needs cleanup, please run cleanupImages.php.");
         return $this->progress(0);
     }
     if ($title === null) {
         $this->log("page {$row->page_id} ({$display}) is illegal.");
         $this->moveIllegalPage($row);
         return $this->progress(1);
     }
     $this->log("page {$row->page_id} ({$display}) doesn't match self.");
     $this->moveInconsistentPage($row, $title);
     return $this->progress(1);
 }
 /**
  * Function executed by use of {{#infoboxbuilder:}} parser function.
  * It gets the code from InfoboxBuilder.lua and creates new module object
  * from it. The module is then invoked and the result is returned.
  * @param  Parser  $parser Parser object
  * @param  PPFrame $frame  PPFrame object
  * @param  array   $args   Array of arguments passed from $frame object
  * @return string          A string returned by InfoboxBuilder.lua
  */
 public static function parserFunctionHook(\Parser $parser, $frame, $args)
 {
     wfProfileIn(__METHOD__);
     try {
         // Add the registered SCSS with the default theme
         $parser->getOutput()->addModuleStyles('ext.wikia.InfoboxBuilder');
         $engine = \Scribunto::getParserEngine($parser);
         // The first argument is dropped before the rest is passed to the child frame.
         unset($args[0]);
         $childFrame = $frame->newChild($args, $parser->getTitle(), 1);
         $luaSource = file_get_contents(__DIR__ . '/includes/lua/InfoboxBuilder.lua');
         $module = new \Scribunto_LuaModule($engine, $luaSource, 'InfoboxBuilder');
         $rendered = \UtfNormal::cleanUp(strval($module->invoke('builder', $childFrame)));
         wfProfileOut(__METHOD__);
         return $rendered;
     } catch (\ScribuntoException $e) {
         // Record the error (message plus optional backtrace) on the parser output.
         $trace = $e->getScriptTraceHtml(array('msgOptions' => array('content')));
         $errorHtml = \Html::element('p', array(), $e->getMessage());
         if ($trace !== false) {
             $backtraceLabel = wfMessage('scribunto-common-backtrace')->inContentLanguage()->text();
             $errorHtml .= \Html::element('p', array(), $backtraceLabel) . $trace;
         }
         $out = $parser->getOutput();
         if (!isset($out->scribunto_errors)) {
             // First error on this page: register the hook and tracking category once.
             $out->addOutputHook('ScribuntoError');
             $out->scribunto_errors = array();
             $parser->addTrackingCategory('scribunto-common-error-category');
         }
         $out->scribunto_errors[] = $errorHtml;
         $id = 'mw-scribunto-error-' . (count($out->scribunto_errors) - 1);
         $hiddenMessage = $parser->insertStripItem('<!--' . htmlspecialchars($e->getMessage()) . '-->');
         $parserError = wfMessage('scribunto-parser-error')->inContentLanguage()->text() . $hiddenMessage;
         wfProfileOut(__METHOD__);
         // #iferror-compatible error element
         return "<strong class=\"error\"><span class=\"scribunto-error\" id=\"{$id}\">" . $parserError . "</span></strong>";
     }
 }
Example #12
0
 /**
  * Get Normalized metadata in PHP-serialized form
  *
  * @param stdClass $video
  * @return string
  */
 protected static function getNormalizedMetadata($video)
 {
     // image.img_metadata
     $metadata = unserialize($video->img_metadata);
     if (!is_array($metadata)) {
         // Not a serialized array (e.g. empty or corrupt value, for which
         // unserialize() yields false). Hand the stored value back untouched
         // rather than re-serializing the failure marker into 'b:0;'.
         return (string)$video->img_metadata;
     }
     // Only the fields known to carry a name are brought into NFC.
     foreach (self::$metadataFieldsContainingName as $field) {
         if (isset($metadata[$field])) {
             $metadata[$field] = \UtfNormal::toNFC($metadata[$field]);
         }
     }
     return serialize($metadata);
 }
Example #13
0
 /**
  * Convert a UTF-8 string to normal form C. In Malayalam and Arabic, this
  * also cleans up certain backwards-compatible sequences, converting them
  * to the modern Unicode equivalent.
  *
  * This is language-specific for performance reasons only.
  *
  * @param $s string
  *
  * @return string
  */
 function normalize($s)
 {
     global $wgAllUnicodeFixes;
     $normalized = UtfNormal::cleanUp($s);
     if (!$wgAllUnicodeFixes) {
         return $normalized;
     }
     // Apply the Arabic table first, then the Malayalam one.
     foreach (array('normalize-ar.ser', 'normalize-ml.ser') as $pairFile) {
         $normalized = $this->transformUsingPairFile($pairFile, $normalized);
     }
     return $normalized;
 }
Example #14
0
/**
 * Run one line of a numbered test through UtfNormal::quickisNFCVerify()
 * and update the pass/fail counters.
 *
 * A line passes when "unchanged by verification" XOR "test number >= 3"
 * holds (inverted for tests listed in $exceptions) and the length of the
 * text before the first '|' equals $columns.
 *
 * @param $test Integer: test number
 * @param $line String: input line
 * @param $total Integer: counter of lines seen, incremented by reference
 * @param $success Integer: counter of passing lines, incremented by reference
 * @param $failed Integer: counter of failing lines, incremented by reference
 * @param $columns Integer: expected length of the first column
 * @param $exceptions Array: test numbers whose expectation is inverted
 * @param $verbose Boolean: print passing lines as well
 */
function testLine($test, $line, &$total, &$success, &$failed, $columns, $exceptions, $verbose)
{
    $stripped = $line;
    // NOTE(review): presumably rewrites $stripped in place when it is not valid - confirm.
    UtfNormal::quickisNFCVerify($stripped);
    $same = $line == $stripped;
    $firstColumn = substr($stripped, 0, strpos($stripped, '|'));
    $len = mb_strlen($firstColumn);
    if ($len == 0) {
        // Fall back to the byte length when no character length is available.
        $len = strlen($firstColumn);
    }
    $ok = $same ^ ($test >= 3);
    $ok = $ok ^ in_array($test, $exceptions);
    $ok = $ok & ($columns == $len);
    $total++;
    if ($ok) {
        $success++;
    } else {
        $failed++;
    }
    if (!$ok || $verbose) {
        // Append the measured length to the end of each printed line.
        print str_replace("\n", $len . "\n", $stripped);
    }
}
Example #15
0
 /**
  * Callback function for cleanUpUTF8()
  */
 private static function cleanUp_helper(&$s)
 {
     // Only strings are cleaned; any other value is left untouched.
     if (is_string($s)) {
         $s = UtfNormal::cleanUp($s);
     }
 }
 /**
  * Converts a path consisting of object titles into a path consisting of tree
  * nodes. The comparison is non-case sensitive.
  *
  * Note: this function returns the same result as getNodePath, 
  * but takes a title path as parameter.
  *
  * @access	public
  * @param	Array	Path array with object titles.
  *                       e.g. array('ILIAS','English','Course A')
  * @param	ref_id	Startnode of the relative path. 
  *                       Specify null, if the title path is an absolute path.
  *                       Specify a ref id, if the title path is a relative 
  *                       path starting at this ref id.
  * @return	array	ordered path info (depth,parent,child,obj_id,type,title)
  *               or null, if the title path can not be converted into a node path.
  */
 function getNodePathForTitlePath($titlePath, $a_startnode_id = null)
 {
     // NOTE: $log is only referenced by the commented-out debug lines below.
     global $ilDB, $log;
     //$log->write('getNodePathForTitlePath('.implode('/',$titlePath));
     // Handle an empty title path: without a start node there is nothing to
     // resolve; otherwise the result is simply the path to the start node.
     // The comparison with 0 is loose, so a null start node also matches.
     if ($titlePath == null || count($titlePath) == 0) {
         if ($a_startnode_id == 0) {
             return null;
         } else {
             return $this->getNodePath($a_startnode_id);
         }
     }
     // Fetch the node path up to the start node
     if ($a_startnode_id != null && $a_startnode_id != 0) {
         // Relative path: begin with the node path leading to the start node
         $nodePath = $this->getNodePath($a_startnode_id);
         $parent = $a_startnode_id;
     } else {
         // Absolute path: begin at the root of the tree (parent id 0)
         $nodePath = array();
         $parent = 0;
     }
     // Convert title path into Unicode Normal Form C
     // This is needed to ensure that we can compare title path strings with
     // strings from the database.
     require_once 'include/Unicode/UtfNormal.php';
     include_once './Services/Utilities/classes/class.ilStr.php';
     // Build "d.title IN (...)" from the quoted, lower-cased, NFC-normalized
     // titles. $titlePath is overwritten in place with the normalized
     // values, which the matching loop further down relies on.
     $inClause = 'd.title IN (';
     for ($i = 0; $i < count($titlePath); $i++) {
         $titlePath[$i] = ilStr::strToLower(UtfNormal::toNFC($titlePath[$i]));
         if ($i > 0) {
             $inClause .= ',';
         }
         $inClause .= $ilDB->quote($titlePath[$i], 'text');
     }
     $inClause .= ')';
     // Fetch all rows that are potential path elements.
     // With a reference table configured, tree children are joined to the
     // object data through it; otherwise they are joined to it directly.
     if ($this->table_obj_reference) {
         $joinClause = 'JOIN ' . $this->table_obj_reference . '  r ON t.child = r.' . $this->ref_pk . ' ' . 'JOIN ' . $this->table_obj_data . ' d ON r.' . $this->obj_pk . ' = d.' . $this->obj_pk;
     } else {
         $joinClause = 'JOIN ' . $this->table_obj_data . '  d ON t.child = d.' . $this->obj_pk;
     }
     // The ORDER BY clause in the following SQL statement ensures that,
     // in case of multiple objects with the same title, the object
     // with the oldest ref_id is always chosen.
     // This ensures that, if a new object with the same title is added,
     // WebDAV clients can still work with the older object.
     // The depth limit excludes nodes deeper than the requested path can reach.
     $q = 'SELECT t.depth, t.parent, t.child, d.' . $this->obj_pk . ' obj_id, d.type, d.title ' . 'FROM ' . $this->table_tree . '  t ' . $joinClause . ' ' . 'WHERE ' . $inClause . ' ' . 'AND t.depth <= ' . (count($titlePath) + count($nodePath)) . ' ' . 'AND t.tree = 1 ' . 'ORDER BY t.depth, t.child ASC';
     $r = $ilDB->query($q);
     $rows = array();
     while ($row = $r->fetchRow(DB_FETCHMODE_ASSOC)) {
         // Normalize database titles the same way as the requested titles,
         // and expose the tree child id under the name ref_id as well.
         $row['title'] = UtfNormal::toNFC($row['title']);
         $row['ref_id'] = $row['child'];
         $rows[] = $row;
     }
     // Extract the path elements from the fetched rows: for each requested
     // title, take the first row (in query order) that hangs below the
     // current parent and has a matching lower-cased title.
     for ($i = 0; $i < count($titlePath); $i++) {
         $pathElementFound = false;
         foreach ($rows as $row) {
             if ($row['parent'] == $parent && ilStr::strToLower($row['title']) == $titlePath[$i]) {
                 // FIXME - We should test here, if the user has
                 // 'visible' permission for the object.
                 $nodePath[] = $row;
                 // Descend: the matched node is the parent for the next title.
                 $parent = $row['child'];
                 $pathElementFound = true;
                 break;
             }
         }
         // Abort if we haven't found a path element for the current depth
         if (!$pathElementFound) {
             //$log->write('ilTree.getNodePathForTitlePath('.var_export($titlePath,true).','.$a_startnode_id.'):null');
             return null;
         }
     }
     // Return the node path
     //$log->write('ilTree.getNodePathForTitlePath('.var_export($titlePath,true).','.$a_startnode_id.'):'.var_export($nodePath,true));
     return $nodePath;
 }
Example #17
0
 /**
  * Return the original filename of the uploaded file, as reported by
  * the submitting user agent. HTML-style character entities are
  * interpreted and normalized to Unicode normalization form C, in part
  * to deal with weird input from Safari with non-ASCII filenames.
  *
  * Other than this the name is not verified for being a safe filename.
  *
  * @param $key String: 
  * @return string or NULL if no such file.
  */
 function getFileName($key)
 {
     if (!isset($_FILES[$key])) {
         return null;
     }
     $reported = $_FILES[$key]['name'];
     # Safari sends filenames in HTML-encoded Unicode form D...
     # Decode the character references first, then normalize the result.
     $name = UtfNormal::cleanUp(Sanitizer::decodeCharReferences($reported));
     wfDebug("WebRequest::getFileName() '" . $reported . "' normalized to '{$name}'\n");
     return $name;
 }
Example #18
0
 /**
  * Recursively normalizes UTF-8 strings in the given array.
  *
  * @param $data string|array
  * @return array|string cleaned-up version of the given
  * @private
  */
 function normalizeUnicode($data)
 {
     if (!is_array($data)) {
         global $wgContLang;
         // Prefer the content language's normalizer when it is available.
         if (isset($wgContLang)) {
             return $wgContLang->normalize($data);
         }
         return UtfNormal::cleanUp($data);
     }
     // Arrays are normalized element by element, keeping their keys.
     foreach (array_keys($data) as $key) {
         $data[$key] = $this->normalizeUnicode($data[$key]);
     }
     return $data;
 }
 /**
  * Get the normalized composed version of the title
  *
  * @return string
  */
 public function getNormalizedDestinationTitle()
 {
     // Compose (NFC) the sanitized title text.
     $sanitized = $this->getSanitizedTitleText();
     return \UtfNormal::toNFC($sanitized);
 }
Example #20
0
 /**
  * Helper function of a helper function to convert charset for iptc values.
  * @param string|array $data The IPTC string
  * @param string $charset The charset
  *
  * @return string
  */
 private static function convIPTCHelper($data, $charset)
 {
     if (!$charset) {
         // No charset given: treat as utf-8 if it is valid utf-8, otherwise
         // pretend it is windows-1252. Most of the time, if there is no 1:90
         // tag, the data is either ascii, latin1, or utf-8.
         $validated = $data;
         UtfNormal::quickIsNFCVerify($validated);
         if ($validated === $data) {
             // Validation changed nothing, so the data is returned as it came in.
             return $data;
         }
         return self::convIPTCHelper($data, 'Windows-1252');
     }
     wfSuppressWarnings();
     $converted = iconv($charset, "UTF-8//IGNORE", $data);
     wfRestoreWarnings();
     if ($converted === false) {
         // Conversion failed: fall back to an empty value and record the problem.
         $converted = "";
         wfDebugLog('iptc', __METHOD__ . " Error converting iptc data charset {$charset} to utf-8");
     }
     return trim($converted);
 }
Example #21
0
 /**
  * Preprocess some wikitext and return the document tree.
  * This is the ghost of Parser::replace_variables().
  *
  * @param string $text The text to parse
  * @param integer $flags Bitwise combination of:
  *          Parser::PTD_FOR_INCLUSION    Handle <noinclude>/<includeonly> as if the text is being
  *                                     included. Default is to assume a direct page view.
  *
  * The generated DOM tree must depend only on the input text and the flags.
  * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
  *
  * Any flag added to the $flags parameter here, or any other parameter liable to cause a
  * change in the DOM tree for a given text, must be passed through the section identifier
  * in the section edit link and thus back to extractSections().
  *
  * The output of this function is currently only cached in process memory, but a persistent
  * cache may be implemented at a later date which takes further advantage of these strict
  * dependency requirements.
  *
  * @private
  */
 function preprocessToObj($text, $flags = 0)
 {
     wfProfileIn(__METHOD__);
     global $wgMemc, $wgPreprocessorCacheThreshold;
     $xml = false;
     // A threshold of false switches the cache off. Without the explicit
     // check, "strlen($text) > false" is true for every non-empty text and
     // would enable caching instead.
     $cacheable = $wgPreprocessorCacheThreshold !== false && strlen($text) > $wgPreprocessorCacheThreshold;
     if ($cacheable) {
         wfProfileIn(__METHOD__ . '-cacheable');
         $cacheKey = wfMemcKey('preprocess-xml', md5($text), $flags);
         $cacheValue = $wgMemc->get($cacheKey);
         if ($cacheValue) {
             // Cached values start with an 8-character version prefix.
             $version = substr($cacheValue, 0, 8);
             if (intval($version) == self::CACHE_VERSION) {
                 $xml = substr($cacheValue, 8);
                 // From the cache
                 wfDebugLog("Preprocessor", "Loaded preprocessor XML from memcached (key {$cacheKey})");
             }
         }
     }
     if ($xml === false) {
         if ($cacheable) {
             wfProfileIn(__METHOD__ . '-cache-miss');
             $xml = $this->preprocessToXml($text, $flags);
             $cacheValue = sprintf("%08d", self::CACHE_VERSION) . $xml;
             $wgMemc->set($cacheKey, $cacheValue, 86400);
             wfProfileOut(__METHOD__ . '-cache-miss');
             wfDebugLog("Preprocessor", "Saved preprocessor XML to memcached (key {$cacheKey})");
         } else {
             $xml = $this->preprocessToXml($text, $flags);
         }
     }
     wfProfileIn(__METHOD__ . '-loadXML');
     $dom = new DOMDocument();
     wfSuppressWarnings();
     $result = $dom->loadXML($xml);
     wfRestoreWarnings();
     if (!$result) {
         // Try running the XML through UtfNormal to get rid of invalid characters
         $xml = UtfNormal::cleanUp($xml);
         $result = $dom->loadXML($xml);
     }
     if ($result) {
         $obj = new PPNode_DOM($dom->documentElement);
     }
     // Close every open profiling section before a possible throw, so a
     // failure does not leave the profiler stack unbalanced.
     wfProfileOut(__METHOD__ . '-loadXML');
     if ($cacheable) {
         wfProfileOut(__METHOD__ . '-cacheable');
     }
     wfProfileOut(__METHOD__);
     if (!$result) {
         throw new MWException(__METHOD__ . ' generated invalid XML');
     }
     return $obj;
 }
 /**
  * Reads messages from the file in a given language and returns an array
  * of AUTHORS, MESSAGES and possibly other properties.
  *
  * @param string $code Language code.
  * @return array|bool False if the file does not exist
  * @throws MWException if the file is not readable or has bad encoding
  */
 public function read($code)
 {
     if (!$this->exists($code)) {
         return false;
     }
     $filename = $this->group->getSourceFilePath($code);
     $contents = file_get_contents($filename);
     if ($contents === false) {
         throw new MWException("Unable to read file {$filename}.");
     }
     // Reject anything that is not UTF-8 before normalizing it.
     if (!StringUtils::isUtf8($contents)) {
         throw new MWException("Contents of {$filename} are not valid utf-8.");
     }
     $normalized = UtfNormal::cleanUp($contents);
     try {
         $parsed = $this->readFromVariable($normalized);
     } catch (Exception $e) {
         // Re-throw with the file name so the failure can be located.
         throw new MWException("Parsing {$filename} failed: " . $e->getMessage());
     }
     return $parsed;
 }
 /**
  * Preprocess some wikitext and return the document tree.
  * This is the ghost of Parser::replace_variables().
  *
  * @param string $text the text to parse
  * @param $flags Integer: bitwise combination of:
  *          Parser::PTD_FOR_INCLUSION    Handle "<noinclude>" and "<includeonly>" as if the text is being
  *                                     included. Default is to assume a direct page view.
  *
  * The generated DOM tree must depend only on the input text and the flags.
  * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
  *
  * Any flag added to the $flags parameter here, or any other parameter liable to cause a
  * change in the DOM tree for a given text, must be passed through the section identifier
  * in the section edit link and thus back to extractSections().
  *
  * The output of this function is currently only cached in process memory, but a persistent
  * cache may be implemented at a later date which takes further advantage of these strict
  * dependency requirements.
  *
  * @throws MWException
  * @return PPNode_DOM
  */
 function preprocessToObj($text, $flags = 0)
 {
     wfProfileIn(__METHOD__);
     global $wgMemc, $wgPreprocessorCacheThreshold;
     $cacheable = $wgPreprocessorCacheThreshold !== false && strlen($text) > $wgPreprocessorCacheThreshold;
     if (!$cacheable) {
         $xml = $this->preprocessToXml($text, $flags);
     } else {
         wfProfileIn(__METHOD__ . '-cacheable');
         $xml = false;
         $cacheKey = wfMemcKey('preprocess-xml', md5($text), $flags);
         $cacheValue = $wgMemc->get($cacheKey);
         // Cached values start with an 8-character version prefix; entries
         // written under a different version are ignored.
         if ($cacheValue && intval(substr($cacheValue, 0, 8)) == self::CACHE_VERSION) {
             $xml = substr($cacheValue, 8);
             wfDebugLog("Preprocessor", "Loaded preprocessor XML from memcached (key {$cacheKey})");
         }
         if ($xml === false) {
             // Cache miss: generate the XML and store it for a day.
             wfProfileIn(__METHOD__ . '-cache-miss');
             $xml = $this->preprocessToXml($text, $flags);
             $wgMemc->set($cacheKey, sprintf("%08d", self::CACHE_VERSION) . $xml, 86400);
             wfProfileOut(__METHOD__ . '-cache-miss');
             wfDebugLog("Preprocessor", "Saved preprocessor XML to memcached (key {$cacheKey})");
         }
     }
     // Fail if the number of elements exceeds acceptable limits.
     // Do not attempt to generate the DOM in that case.
     $this->parser->mGeneratedPPNodeCount += substr_count($xml, '<');
     $nodeLimit = $this->parser->mOptions->getMaxGeneratedPPNodeCount();
     if ($this->parser->mGeneratedPPNodeCount > $nodeLimit) {
         // Close the open profiling sections before bailing out.
         if ($cacheable) {
             wfProfileOut(__METHOD__ . '-cacheable');
         }
         wfProfileOut(__METHOD__);
         throw new MWException(__METHOD__ . ': generated node count limit exceeded');
     }
     wfProfileIn(__METHOD__ . '-loadXML');
     $dom = new DOMDocument();
     wfSuppressWarnings();
     $loaded = $dom->loadXML($xml);
     wfRestoreWarnings();
     if (!$loaded) {
         // Try running the XML through UtfNormal to get rid of invalid characters
         $xml = UtfNormal::cleanUp($xml);
         // 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2 don't barf when the XML is >256 levels deep
         $loaded = $dom->loadXML($xml, 1 << 19);
     }
     if ($loaded) {
         $obj = new PPNode_DOM($dom->documentElement);
     }
     wfProfileOut(__METHOD__ . '-loadXML');
     if ($cacheable) {
         wfProfileOut(__METHOD__ . '-cacheable');
     }
     wfProfileOut(__METHOD__);
     // The failure is raised only after all profiling sections are closed.
     if (!$loaded) {
         throw new MWException(__METHOD__ . ' generated invalid XML');
     }
     return $obj;
 }
Example #24
0
function xmlsafe($string)
{
    $profileSection = 'xmlsafe';
    wfProfileIn($profileSection);
    /**
     * Old page data may never have been normalized. Invalid UTF-8 sequences
     * or forbidden control characters would break the XML output, so the
     * text is run through UtfNormal::cleanUp() before HTML/XML special
     * characters are escaped.
     */
    $escaped = htmlspecialchars(UtfNormal::cleanUp($string));
    wfProfileOut($profileSection);
    return $escaped;
}
Example #25
0
 /**
  * Receives messages from wfDebug and stores them for the pretty debugger.
  * Do NOT call this directly; use MWDebug::log or wfDebug() instead.
  *
  * The message is only recorded when the debugger is enabled or when one of
  * $wgDebugComments / $wgShowDebug is set. It is passed through
  * UtfNormal::cleanUp() and right-trimmed before being appended.
  *
  * @since 1.19
  * @param $str string
  */
 public static function debugMsg($str)
 {
     global $wgDebugComments, $wgShowDebug;
     // Nothing asked for debug output: drop the message.
     if (!self::$enabled && !$wgDebugComments && !$wgShowDebug) {
         return;
     }
     $cleaned = UtfNormal::cleanUp($str);
     self::$debug[] = rtrim($cleaned);
 }
Example #26
0
 /**
  * Do userComment tags and similar. See pg. 34 of exif standard.
  *
  * The first 8 bytes of the raw value are a character code (charset
  * identifier padded with NUL bytes to exactly 8 bytes); the rest is the
  * value itself. On success the property is replaced in
  * $this->mFilteredExifData by the trimmed value converted to UTF-8.
  * Properties that are too short, or that contain only whitespace after
  * conversion, are reported through $this->debug() and removed.
  *
  * This has not been tested on any shift-JIS strings.
  *
  * @param string $prop prop name.
  */
 private function charCodeString($prop)
 {
     if (!isset($this->mFilteredExifData[$prop])) {
         return;
     }
     if (strlen($this->mFilteredExifData[$prop]) <= 8) {
         //invalid. Must be at least 9 bytes long.
         $this->debug($this->mFilteredExifData[$prop], __FUNCTION__, false);
         unset($this->mFilteredExifData[$prop]);
         return;
     }
     // $charCode is always exactly 8 bytes, so the labels below must spell
     // out the NUL padding; a bare "JIS" or "UNICODE" could never match.
     $charCode = substr($this->mFilteredExifData[$prop], 0, 8);
     $val = substr($this->mFilteredExifData[$prop], 8);
     switch ($charCode) {
         case "JIS\x00\x00\x00\x00\x00":
             //JIS
             $charset = "Shift-JIS";
             break;
         case "UNICODE\x00":
             $charset = "UTF-16" . $this->byteOrder;
             break;
         default:
             //ascii or undefined.
             $charset = "";
             break;
     }
     // This could possibly check to see if iconv is really installed
     // or if we're using the compatibility wrapper in globalFunctions.php
     if ($charset) {
         wfSuppressWarnings();
         $val = iconv($charset, 'UTF-8//IGNORE', $val);
         wfRestoreWarnings();
     } else {
         // if valid utf-8, assume that, otherwise assume windows-1252
         $valCopy = $val;
         // quickIsNFCVerify() takes its argument by reference and may
         // rewrite it; a changed copy means the input was not clean UTF-8.
         UtfNormal::quickIsNFCVerify($valCopy);
         if ($valCopy !== $val) {
             wfSuppressWarnings();
             $val = iconv('Windows-1252', 'UTF-8//IGNORE', $val);
             wfRestoreWarnings();
         }
     }
     //trim and check to make sure not only whitespace.
     // (A failed iconv() yields false, which trim() turns into '' and is
     // therefore rejected by the emptiness check below as well.)
     $val = trim($val);
     if (strlen($val) === 0) {
         //only whitespace.
         $this->debug($this->mFilteredExifData[$prop], __FUNCTION__, "{$prop}: Is only whitespace");
         unset($this->mFilteredExifData[$prop]);
         return;
     }
     //all's good.
     $this->mFilteredExifData[$prop] = $val;
 }
Example #27
0
 /**
  * Preprocess some wikitext and return the document tree.
  * This is the ghost of Parser::replace_variables().
  *
  * The text is scanned once by a hand-written state machine that builds an
  * XML string (rooted at <root>), which is then loaded into a DOMDocument.
  *
  * @param string $text The text to parse
  * @param integer $flags Bitwise combination of:
  *          Parser::PTD_FOR_INCLUSION    Handle <noinclude>/<includeonly> as if the text is being
  *                                     included. Default is to assume a direct page view.
  *
  * The generated DOM tree must depend only on the input text and the flags.
  * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
  *
  * Any flag added to the $flags parameter here, or any other parameter liable to cause a
  * change in the DOM tree for a given text, must be passed through the section identifier
  * in the section edit link and thus back to extractSections().
  *
  * The output of this function is currently only cached in process memory, but a persistent
  * cache may be implemented at a later date which takes further advantage of these strict
  * dependency requirements.
  *
  * @return PPNode_DOM Node wrapping the document element of the generated DOM
  * @throws MWException If the generated XML cannot be loaded even after UtfNormal::cleanUp()
  *
  * @private
  */
 function preprocessToObj($text, $flags = 0)
 {
     wfProfileIn(__METHOD__);
     wfProfileIn(__METHOD__ . '-makexml');
     // Bracket rules, keyed by opening character: the closing character, the
     // element name produced for each run length (null = literal text only),
     // and the minimum/maximum run length that is meaningful.
     $rules = array('{' => array('end' => '}', 'names' => array(2 => 'template', 3 => 'tplarg'), 'min' => 2, 'max' => 3), '[' => array('end' => ']', 'names' => array(2 => null), 'min' => 2, 'max' => 2));
     $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
     $xmlishElements = $this->parser->getStripList();
     $enableOnlyinclude = false;
     if ($forInclusion) {
         $ignoredTags = array('includeonly', '/includeonly');
         $ignoredElements = array('noinclude');
         $xmlishElements[] = 'noinclude';
         // <onlyinclude> handling is only switched on when both the opening
         // and the closing tag occur somewhere in the text.
         if (strpos($text, '<onlyinclude>') !== false && strpos($text, '</onlyinclude>') !== false) {
             $enableOnlyinclude = true;
         }
     } else {
         $ignoredTags = array('noinclude', '/noinclude', 'onlyinclude', '/onlyinclude');
         $ignoredElements = array('includeonly');
         $xmlishElements[] = 'includeonly';
     }
     $xmlishRegex = implode('|', array_merge($xmlishElements, $ignoredTags));
     // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
     $elementsRegex = "~({$xmlishRegex})(?:\\s|\\/>|>)|(!--)~iA";
     $stack = new PPDStack();
     // Characters that always terminate a literal run in the main scan.
     $searchBase = "[{<\n";
     #} (presumably only here to balance the "{" in the string above for brace-matching editors)
     // For fast reverse searches
     $revText = strrev($text);
     # Input pointer, starts out pointing to a pseudo-newline before the start
     $i = 0;
     # Current accumulator
     $accum =& $stack->getAccum();
     $accum = '<root>';
     # True to find equals signs in arguments
     $findEquals = false;
     # True to take notice of pipe characters
     $findPipe = false;
     $headingIndex = 1;
     # True if $i is inside a possible heading
     $inHeading = false;
     # True if there are no more greater-than (>) signs right of $i
     $noMoreGT = false;
     # True to ignore all input up to the next <onlyinclude>
     $findOnlyinclude = $enableOnlyinclude;
     # Do a line-start run without outputting an LF character
     $fakeLineStart = true;
     while (true) {
         //$this->memCheck();
         if ($findOnlyinclude) {
             // Ignore all input up to the next <onlyinclude>
             $startPos = strpos($text, '<onlyinclude>', $i);
             if ($startPos === false) {
                 // Ignored section runs to the end
                 $accum .= '<ignore>' . htmlspecialchars(substr($text, $i)) . '</ignore>';
                 break;
             }
             // past-the-end position of the <onlyinclude> tag
             $tagEndPos = $startPos + strlen('<onlyinclude>');
             $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i)) . '</ignore>';
             $i = $tagEndPos;
             $findOnlyinclude = false;
         }
         if ($fakeLineStart) {
             $found = 'line-start';
             $curChar = '';
         } else {
             # Find next opening brace, closing brace or pipe
             $search = $searchBase;
             if ($stack->top === false) {
                 $currentClosing = '';
             } else {
                 $currentClosing = $stack->top->close;
                 $search .= $currentClosing;
             }
             if ($findPipe) {
                 $search .= '|';
             }
             if ($findEquals) {
                 // First equals will be for the template
                 $search .= '=';
             }
             $rule = null;
             # Output literal section, advance input counter
             $literalLength = strcspn($text, $search, $i);
             if ($literalLength > 0) {
                 $accum .= htmlspecialchars(substr($text, $i, $literalLength));
                 $i += $literalLength;
             }
             if ($i >= strlen($text)) {
                 if ($currentClosing == "\n") {
                     // Do a past-the-end run to finish off the heading
                     $curChar = '';
                     $found = 'line-end';
                 } else {
                     # All done
                     break;
                 }
             } else {
                 // Classify the character that stopped the literal run.
                 $curChar = $text[$i];
                 if ($curChar == '|') {
                     $found = 'pipe';
                 } elseif ($curChar == '=') {
                     $found = 'equals';
                 } elseif ($curChar == '<') {
                     $found = 'angle';
                 } elseif ($curChar == "\n") {
                     if ($inHeading) {
                         $found = 'line-end';
                     } else {
                         $found = 'line-start';
                     }
                 } elseif ($curChar == $currentClosing) {
                     $found = 'close';
                 } elseif (isset($rules[$curChar])) {
                     $found = 'open';
                     $rule = $rules[$curChar];
                 } else {
                     # Some versions of PHP have a strcspn which stops on null characters
                     # Ignore and continue
                     ++$i;
                     continue;
                 }
             }
         }
         if ($found == 'angle') {
             $matches = false;
             // Handle </onlyinclude>
             if ($enableOnlyinclude && substr($text, $i, strlen('</onlyinclude>')) == '</onlyinclude>') {
                 $findOnlyinclude = true;
                 continue;
             }
             // Determine element name
             if (!preg_match($elementsRegex, $text, $matches, 0, $i + 1)) {
                 // Element name missing or not listed
                 $accum .= '&lt;';
                 ++$i;
                 continue;
             }
             // Handle comments
             if (isset($matches[2]) && $matches[2] == '!--') {
                 // To avoid leaving blank lines, when a comment is both preceded
                 // and followed by a newline (ignoring spaces), trim leading and
                 // trailing spaces and one of the newlines.
                 // Find the end
                 $endPos = strpos($text, '-->', $i + 4);
                 if ($endPos === false) {
                     // Unclosed comment in input, runs to end
                     $inner = substr($text, $i);
                     $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                     $i = strlen($text);
                 } else {
                     // Search backwards for leading whitespace
                     $wsStart = $i ? $i - strspn($revText, ' ', strlen($text) - $i) : 0;
                     // Search forwards for trailing whitespace
                     // $wsEnd will be the position of the last space
                     $wsEnd = $endPos + 2 + strspn($text, ' ', $endPos + 3);
                     // Eat the line if possible
                     // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                     // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
                     // it's a possible beneficial b/c break.
                     if ($wsStart > 0 && substr($text, $wsStart - 1, 1) == "\n" && substr($text, $wsEnd + 1, 1) == "\n") {
                         $startPos = $wsStart;
                         $endPos = $wsEnd + 1;
                         // Remove leading whitespace from the end of the accumulator
                         // Sanity check first though
                         $wsLength = $i - $wsStart;
                         if ($wsLength > 0 && substr($accum, -$wsLength) === str_repeat(' ', $wsLength)) {
                             $accum = substr($accum, 0, -$wsLength);
                         }
                         // Do a line-start run next time to look for headings after the comment
                         $fakeLineStart = true;
                     } else {
                         // No line to eat, just take the comment itself
                         $startPos = $i;
                         $endPos += 2;
                     }
                     // Record where the comment sits on the current stack part;
                     // the line-end branch reads commentEnd/visualEnd back when
                     // it looks for closing equals signs before a trailing comment.
                     if ($stack->top) {
                         $part = $stack->top->getCurrentPart();
                         if (isset($part->commentEnd) && $part->commentEnd == $wsStart - 1) {
                             // Comments abutting, no change in visual end
                             $part->commentEnd = $wsEnd;
                         } else {
                             $part->visualEnd = $wsStart;
                             $part->commentEnd = $endPos;
                         }
                     }
                     $i = $endPos + 1;
                     $inner = substr($text, $startPos, $endPos - $startPos + 1);
                     $accum .= '<comment>' . htmlspecialchars($inner) . '</comment>';
                 }
                 continue;
             }
             $name = $matches[1];
             $lowerName = strtolower($name);
             $attrStart = $i + strlen($name) + 1;
             // Find end of tag
             $tagEndPos = $noMoreGT ? false : strpos($text, '>', $attrStart);
             if ($tagEndPos === false) {
                 // Infinite backtrack
                 // Disable tag search to prevent worst-case O(N^2) performance
                 $noMoreGT = true;
                 $accum .= '&lt;';
                 ++$i;
                 continue;
             }
             // Handle ignored tags
             if (in_array($lowerName, $ignoredTags)) {
                 $accum .= '<ignore>' . htmlspecialchars(substr($text, $i, $tagEndPos - $i + 1)) . '</ignore>';
                 $i = $tagEndPos + 1;
                 continue;
             }
             $tagStartPos = $i;
             if ($text[$tagEndPos - 1] == '/') {
                 // Self-closing tag: no inner content and no close tag
                 $attrEnd = $tagEndPos - 1;
                 $inner = null;
                 $i = $tagEndPos + 1;
                 $close = '';
             } else {
                 $attrEnd = $tagEndPos;
                 // Find closing tag
                 if (preg_match("/<\\/{$name}\\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1)) {
                     $inner = substr($text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1);
                     $i = $matches[0][1] + strlen($matches[0][0]);
                     $close = '<close>' . htmlspecialchars($matches[0][0]) . '</close>';
                 } else {
                     // No end tag -- let it run out to the end of the text.
                     $inner = substr($text, $tagEndPos + 1);
                     $i = strlen($text);
                     $close = '';
                 }
             }
             // <includeonly> and <noinclude> just become <ignore> tags
             if (in_array($lowerName, $ignoredElements)) {
                 $accum .= '<ignore>' . htmlspecialchars(substr($text, $tagStartPos, $i - $tagStartPos)) . '</ignore>';
                 continue;
             }
             // Emit an <ext> element with <name>, <attr>, optional <inner> and <close>
             $accum .= '<ext>';
             if ($attrEnd <= $attrStart) {
                 $attr = '';
             } else {
                 $attr = substr($text, $attrStart, $attrEnd - $attrStart);
             }
             $accum .= '<name>' . htmlspecialchars($name) . '</name>' . '<attr>' . htmlspecialchars($attr) . '</attr>';
             if ($inner !== null) {
                 $accum .= '<inner>' . htmlspecialchars($inner) . '</inner>';
             }
             $accum .= $close . '</ext>';
         } elseif ($found == 'line-start') {
             // Is this the start of a heading?
             // Line break belongs before the heading element in any case
             if ($fakeLineStart) {
                 $fakeLineStart = false;
             } else {
                 $accum .= $curChar;
                 $i++;
             }
             // Count at most 6 consecutive equals signs starting at $i
             $count = strspn($text, '=', $i, 6);
             if ($count == 1 && $findEquals) {
                 // DWIM: This looks kind of like a name/value separator
                 // Let's let the equals handler have it and break the potential heading
                 // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
             } elseif ($count > 0) {
                 $piece = array('open' => "\n", 'close' => "\n", 'parts' => array(new PPDPart(str_repeat('=', $count))), 'startPos' => $i, 'count' => $count);
                 $stack->push($piece);
                 $accum =& $stack->getAccum();
                 // Imports the stack's flag values as local variables
                 // (presumably $findEquals / $findPipe / $inHeading -- confirm
                 // against PPDStack::getFlags()).
                 extract($stack->getFlags());
                 $i += $count;
             }
         } elseif ($found == 'line-end') {
             $piece = $stack->top;
             // A heading must be open, otherwise \n wouldn't have been in the search list
             assert($piece->open == "\n");
             $part = $piece->getCurrentPart();
             // Search back through the input to see if it has a proper close
             // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
             $wsLength = strspn($revText, " \t", strlen($text) - $i);
             $searchStart = $i - $wsLength;
             if (isset($part->commentEnd) && $searchStart - 1 == $part->commentEnd) {
                 // Comment found at line end
                 // Search for equals signs before the comment
                 $searchStart = $part->visualEnd;
                 $searchStart -= strspn($revText, " \t", strlen($text) - $searchStart);
             }
             $count = $piece->count;
             $equalsLength = strspn($revText, '=', strlen($text) - $searchStart);
             if ($equalsLength > 0) {
                 if ($i - $equalsLength == $piece->startPos) {
                     // This is just a single string of equals signs on its own line
                     // Replicate the doHeadings behaviour /={count}(.+)={count}/
                     // First find out how many equals signs there really are (don't stop at 6)
                     $count = $equalsLength;
                     if ($count < 3) {
                         $count = 0;
                     } else {
                         $count = min(6, intval(($count - 1) / 2));
                     }
                 } else {
                     $count = min($equalsLength, $count);
                 }
                 if ($count > 0) {
                     // Normal match, output <h>
                     $element = "<h level=\"{$count}\" i=\"{$headingIndex}\">{$accum}</h>";
                     $headingIndex++;
                 } else {
                     // Single equals sign on its own line, count=0
                     $element = $accum;
                 }
             } else {
                 // No match, no <h>, just pass down the inner text
                 $element = $accum;
             }
             // Unwind the stack
             $stack->pop();
             $accum =& $stack->getAccum();
             extract($stack->getFlags());
             // Append the result to the enclosing accumulator
             $accum .= $element;
             // Note that we do NOT increment the input pointer.
             // This is because the closing linebreak could be the opening linebreak of
             // another heading. Infinite loops are avoided because the next iteration MUST
             // hit the heading open case above, which unconditionally increments the
             // input pointer.
         } elseif ($found == 'open') {
             # count opening brace characters
             $count = strspn($text, $curChar, $i);
             # we need to add to stack only if opening brace count is enough for one of the rules
             if ($count >= $rule['min']) {
                 # Add it to the stack
                 $piece = array('open' => $curChar, 'close' => $rule['end'], 'count' => $count, 'lineStart' => $i > 0 && $text[$i - 1] == "\n");
                 $stack->push($piece);
                 $accum =& $stack->getAccum();
                 extract($stack->getFlags());
             } else {
                 # Add literal brace(s)
                 $accum .= htmlspecialchars(str_repeat($curChar, $count));
             }
             $i += $count;
         } elseif ($found == 'close') {
             $piece = $stack->top;
             # lets check if there are enough characters for closing brace
             $maxCount = $piece->count;
             $count = strspn($text, $curChar, $i, $maxCount);
             # check for maximum matching characters (if there are 5 closing
             # characters, we will probably need only 3 - depending on the rules)
             $matchingCount = 0;
             $rule = $rules[$piece->open];
             if ($count > $rule['max']) {
                 # The specified maximum exists in the callback array, unless the caller
                 # has made an error
                 $matchingCount = $rule['max'];
             } else {
                 # Count is less than the maximum
                 # Skip any gaps in the callback array to find the true largest match
                 # Need to use array_key_exists not isset because the callback can be null
                 $matchingCount = $count;
                 while ($matchingCount > 0 && !array_key_exists($matchingCount, $rule['names'])) {
                     --$matchingCount;
                 }
             }
             if ($matchingCount <= 0) {
                 # No matching element found in callback array
                 # Output a literal closing brace and continue
                 $accum .= htmlspecialchars(str_repeat($curChar, $count));
                 $i += $count;
                 continue;
             }
             $name = $rule['names'][$matchingCount];
             if ($name === null) {
                 // No element, just literal text
                 $element = $piece->breakSyntax($matchingCount) . str_repeat($rule['end'], $matchingCount);
             } else {
                 # Create XML element
                 # Note: $parts is already XML, does not need to be encoded further
                 $parts = $piece->parts;
                 // The first part becomes the <title>; the rest become <part> children
                 $title = $parts[0]->out;
                 unset($parts[0]);
                 # The invocation is at the start of the line if lineStart is set in
                 # the stack, and all opening brackets are used up.
                 if ($maxCount == $matchingCount && !empty($piece->lineStart)) {
                     $attr = ' lineStart="1"';
                 } else {
                     $attr = '';
                 }
                 $element = "<{$name}{$attr}>";
                 $element .= "<title>{$title}</title>";
                 $argIndex = 1;
                 foreach ($parts as $partIndex => $part) {
                     if (isset($part->eqpos)) {
                         // Named argument: split the part's output at the recorded "=" offset
                         $argName = substr($part->out, 0, $part->eqpos);
                         $argValue = substr($part->out, $part->eqpos + 1);
                         $element .= "<part><name>{$argName}</name>=<value>{$argValue}</value></part>";
                     } else {
                         // Positional argument: numbered by $argIndex
                         $element .= "<part><name index=\"{$argIndex}\" /><value>{$part->out}</value></part>";
                         $argIndex++;
                     }
                 }
                 $element .= "</{$name}>";
             }
             # Advance input pointer
             $i += $matchingCount;
             # Unwind the stack
             $stack->pop();
             $accum =& $stack->getAccum();
             # Re-add the old stack element if it still has unmatched opening characters remaining
             if ($matchingCount < $piece->count) {
                 $piece->parts = array(new PPDPart());
                 $piece->count -= $matchingCount;
                 # do we still qualify for any callback with remaining count?
                 $names = $rules[$piece->open]['names'];
                 $skippedBraces = 0;
                 $enclosingAccum =& $accum;
                 while ($piece->count) {
                     if (array_key_exists($piece->count, $names)) {
                         $stack->push($piece);
                         $accum =& $stack->getAccum();
                         break;
                     }
                     --$piece->count;
                     $skippedBraces++;
                 }
                 // Opening characters that no rule can use are emitted literally
                 // into the accumulator that enclosed the piece.
                 $enclosingAccum .= str_repeat($piece->open, $skippedBraces);
             }
             extract($stack->getFlags());
             # Add XML element to the enclosing accumulator
             $accum .= $element;
         } elseif ($found == 'pipe') {
             // shortcut for getFlags()
             $findEquals = true;
             $stack->addPart();
             $accum =& $stack->getAccum();
             ++$i;
         } elseif ($found == 'equals') {
             // shortcut for getFlags()
             $findEquals = false;
             // Remember the offset of the "=" within the current part's output
             $stack->getCurrentPart()->eqpos = strlen($accum);
             $accum .= '=';
             ++$i;
         }
     }
     # Output any remaining unclosed brackets
     foreach ($stack->stack as $piece) {
         $stack->rootAccum .= $piece->breakSyntax();
     }
     $stack->rootAccum .= '</root>';
     $xml = $stack->rootAccum;
     wfProfileOut(__METHOD__ . '-makexml');
     wfProfileIn(__METHOD__ . '-loadXML');
     $dom = new DOMDocument();
     wfSuppressWarnings();
     $result = $dom->loadXML($xml);
     wfRestoreWarnings();
     if (!$result) {
         // Try running the XML through UtfNormal to get rid of invalid characters
         $xml = UtfNormal::cleanUp($xml);
         $result = $dom->loadXML($xml);
         if (!$result) {
             // NOTE(review): this throw leaves the '-loadXML' and method-level
             // profiling sections opened above without a matching wfProfileOut().
             throw new MWException(__METHOD__ . ' generated invalid XML');
         }
     }
     $obj = new PPNode_DOM($dom->documentElement);
     wfProfileOut(__METHOD__ . '-loadXML');
     wfProfileOut(__METHOD__);
     return $obj;
 }
Example #28
0
    $diffs = new Diff($ota, $nta);
    $formatter = new TableDiffFormatter();
    $funky = $formatter->format($diffs);
    preg_match_all('/<span class="diffchange">(.*?)<\\/span>/', $funky, $matches);
    foreach ($matches[1] as $bit) {
        $hex = bin2hex($bit);
        echo "\t{$hex}\n";
    }
}
// Endless fuzz loop: normalize random strings with both implementations and
// stop at the first disagreement, dumping both results as hex.
$size = 16;
for ($n = 1;; $n++) {
    echo "{$n}\n";
    $str = randomString($size, true);
    $clean = UtfNormal::cleanUp($str);
    $norm = donorm($str);
    echo strlen($clean), ", ", strlen($norm);
    if ($clean != $norm) {
        echo " (FAIL)\n";
        echo "\traw: " . bin2hex($str) . "\n";
        echo "\tphp: " . bin2hex($clean) . "\n";
        echo "\ticu: " . bin2hex($norm) . "\n";
        echo "\n\tdiffs:\n";
        showDiffs($clean, $norm);
        exit;
    }
    echo " (match)\n";
    // Drop the strings before the next round.
    $str = '';
    $clean = '';
    $norm = '';
}
// This benchmark is a command-line tool only.
if (PHP_SAPI != 'cli') {
    exit("Run me from the command line please.\n");
}
// "--icu" on the command line loads the php_utfnormal extension.
if (isset($_SERVER['argv'])) {
    if (in_array('--icu', $_SERVER['argv'])) {
        dl('php_utfnormal.so');
    }
}
require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
define('BENCH_CYCLES', 1);
// 10m
define('BIGSIZE', 10 * 1024 * 1024);
// Allow the inflated test data plus 120 MiB of headroom.
ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024);
// Sample file => human-readable description
$testfiles = array(
    'testdata/washington.txt' => 'English text',
    'testdata/berlin.txt' => 'German text',
    'testdata/bulgakov.txt' => 'Russian text',
    'testdata/tokyo.txt' => 'Japanese text',
    'testdata/young.txt' => 'Korean text'
);
$normalizer = new UtfNormal();
UtfNormal::loadData();
foreach ($testfiles as $file => $desc) {
    benchmarkTest($normalizer, $file, $desc);
}
# -------
function benchmarkTest(&$u, $filename, $desc)
{
    print "Testing {$filename} ({$desc})...\n";
    $data = file_get_contents($filename);
    $all = $data;
    while (strlen($all) < BIGSIZE) {
        $all .= $all;
    }
    $data = $all;
    echo "Data is " . strlen($data) . " bytes.\n";
    $forms = array('quickIsNFCVerify', 'cleanUp');
 /**
  * Request one text blob from the spawned database subprocess.
  *
  * Writes "$id\n" to the subprocess, then reads back a line holding the
  * byte count followed by exactly that many bytes of text. The text has
  * carriage returns removed and is passed through UtfNormal::cleanUp().
  *
  * @param $id mixed Identifier written to the subprocess
  * @return string|bool Normalized text, or false if writing, flushing or
  *   reading failed or fewer bytes than announced were received
  */
 private function getTextSpawnedOnce($id)
 {
     $ok = fwrite($this->spawnWrite, "{$id}\n");
     //$this->progress( ">> $id" );
     if (!$ok) {
         return false;
     }
     $ok = fflush($this->spawnWrite);
     //$this->progress( ">> [flush]" );
     if (!$ok) {
         return false;
     }
     $len = fgets($this->spawnRead);
     //$this->progress( "<< " . trim( $len ) );
     if ($len === false) {
         return false;
     }
     $nbytes = intval($len);
     $text = "";
     // Subprocess may not send everything at once, we have to loop.
     while ($nbytes > strlen($text)) {
         $buffer = fread($this->spawnRead, $nbytes - strlen($text));
         // Check the buffer just read, not the accumulated $text (which is
         // never false). Also stop at EOF, where fread() keeps returning ''
         // and the loop would otherwise spin forever. The short read is then
         // reported by the byte-count check below.
         if ($buffer === false || ($buffer === '' && feof($this->spawnRead))) {
             break;
         }
         $text .= $buffer;
     }
     $gotbytes = strlen($text);
     if ($gotbytes != $nbytes) {
         $this->progress("Expected {$nbytes} bytes from database subprocess, got {$gotbytes} ");
         return false;
     }
     // Do normalization in the dump thread...
     $stripped = str_replace("\r", "", $text);
     $normalized = UtfNormal::cleanUp($stripped);
     return $normalized;
 }