/**
 * Load the precompiled upper/lower case mapping tables.
 *
 * The tables are read from the 'Utf8Case.ser' precompiled data file on the
 * first call and then memoised in static locals, so later calls return the
 * same arrays without touching the data file again.
 *
 * @throws MWException If the precompiled data file cannot be loaded
 * @return array Two-element list: array( upper-case map, lower-case map )
 */
static function getCaseMaps() {
    static $wikiUpperChars, $wikiLowerChars;

    // Fast path: the tables were already loaded earlier in this request.
    if (isset($wikiUpperChars)) {
        return array($wikiUpperChars, $wikiLowerChars);
    }

    wfProfileIn(__METHOD__);
    $arr = wfGetPrecompiledData('Utf8Case.ser');
    if ($arr === false) {
        // Close the profiling section before bailing out so that the
        // wfProfileIn()/wfProfileOut() calls stay balanced on the error path.
        wfProfileOut(__METHOD__);
        throw new MWException("Utf8Case.ser is missing, please run \"make\" in the serialized directory\n");
    }
    $wikiUpperChars = $arr['wikiUpperChars'];
    $wikiLowerChars = $arr['wikiLowerChars'];
    wfProfileOut(__METHOD__);

    return array($wikiUpperChars, $wikiLowerChars);
}
/**
 * Return the first-letter data for this collation's locale.
 *
 * Lookup order: the per-object memo, then the shared cache, and finally a
 * rebuild from the precompiled "first-letters-<locale>.ser" file.
 *
 * @throws MWException If no precompiled data exists for the locale
 * @return array Map with 'chars' (the letters) and 'keys' (their primary
 *   sort keys), both ordered by sort key
 */
function getFirstLetterData() {
    // Already resolved for this object.
    if ($this->firstLetterData !== null) {
        return $this->firstLetterData;
    }

    $store = wfGetCache(CACHE_ANYTHING);
    $storeKey = wfMemcKey('first-letters', $this->locale);
    $cached = $store->get($storeKey);
    if ($cached) {
        $this->firstLetterData = $cached;
        return $this->firstLetterData;
    }

    // Nothing cached: regenerate from the serialized data file.
    $rawLetters = wfGetPrecompiledData("first-letters-{$this->locale}.ser");
    if ($rawLetters === false) {
        throw new MWException("MediaWiki does not support ICU locale " . "\"{$this->locale}\"");
    }

    // The precompiled file cannot be stored pre-sorted because the sort
    // order depends on the ICU version, and the binary search that consumes
    // this data needs a correctly sorted array. So index every letter by its
    // primary sort key and sort here. Letters sharing a primary key collide;
    // of those, the one the main collator orders first is kept.
    $bySortKey = array();
    foreach ($rawLetters as $candidate) {
        $sortKey = $this->getPrimarySortKey($candidate);
        if (
            !isset($bySortKey[$sortKey])
            || $this->mainCollator->compare($candidate, $bySortKey[$sortKey]) < 0
        ) {
            $bySortKey[$sortKey] = $candidate;
        }
    }
    ksort($bySortKey, SORT_STRING);

    $result = array(
        'chars' => array_values($bySortKey),
        'keys' => array_keys($bySortKey),
    );

    // Drop the working map before caching to keep memory usage down.
    unset($bySortKey);

    $this->firstLetterData = $result;
    $store->set($storeKey, $result, 86400 * 7);

    return $result;
}
/**
 * Return the first-letter (section header) data for this collation.
 *
 * The result is memoised on the object and in the shared cache. The cache
 * key includes the locale, the digit-transform language code and the ICU
 * version, and a cached entry is only accepted when its 'version' matches
 * self::FIRST_LETTER_VERSION.
 *
 * @throws MWException If the required precompiled data file cannot be loaded
 * @return array Map with 'chars' (the letters), 'keys' (their primary sort
 *   keys, in sorted order) and 'version' (self::FIRST_LETTER_VERSION)
 */
function getFirstLetterData() {
    if ($this->firstLetterData !== null) {
        return $this->firstLetterData;
    }

    $cache = wfGetCache(CACHE_ANYTHING);
    $cacheKey = wfMemcKey(
        'first-letters',
        $this->locale,
        $this->digitTransformLanguage->getCode(),
        self::getICUVersion()
    );
    $cacheEntry = $cache->get($cacheKey);

    if (
        $cacheEntry
        && isset($cacheEntry['version'])
        && $cacheEntry['version'] == self::FIRST_LETTER_VERSION
    ) {
        $this->firstLetterData = $cacheEntry;
        return $this->firstLetterData;
    }

    // Generate data from serialized data file
    if (isset(self::$tailoringFirstLetters[$this->locale])) {
        $letters = wfGetPrecompiledData("first-letters-root.ser");
        if ($letters === false) {
            // Fail loudly, like the non-tailored branch below, rather than
            // feeding `false` into array_merge() and building a broken list.
            throw new MWException("MediaWiki could not load the precompiled data file " .
                "\"first-letters-root.ser\"");
        }
        // Append additional characters
        $letters = array_merge($letters, self::$tailoringFirstLetters[$this->locale]);
        // Remove unnecessary ones, if any
        if (isset(self::$tailoringFirstLetters['-' . $this->locale])) {
            $letters = array_diff($letters, self::$tailoringFirstLetters['-' . $this->locale]);
        }
        // Apply digit transforms
        $digits = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
        $letters = array_diff($letters, $digits);
        foreach ($digits as $digit) {
            $letters[] = $this->digitTransformLanguage->formatNum($digit, true);
        }
    } else {
        $letters = wfGetPrecompiledData("first-letters-{$this->locale}.ser");
        if ($letters === false) {
            throw new MWException("MediaWiki does not support ICU locale " .
                "\"{$this->locale}\"");
        }
    }

    /* Sort the letters.
     *
     * It's impossible to have the precompiled data file properly sorted,
     * because the sort order changes depending on ICU version. If the
     * array is not properly sorted, the binary search will return random
     * results.
     *
     * We also take this opportunity to remove primary collisions.
     */
    $letterMap = array();
    foreach ($letters as $letter) {
        $key = $this->getPrimarySortKey($letter);
        if (isset($letterMap[$key])) {
            // Primary collision
            // Keep whichever one sorts first in the main collator
            if ($this->mainCollator->compare($letter, $letterMap[$key]) < 0) {
                $letterMap[$key] = $letter;
            }
        } else {
            $letterMap[$key] = $letter;
        }
    }
    ksort($letterMap, SORT_STRING);

    /* Remove duplicate prefixes. Basically if something has a sortkey
     * which is a prefix of some other sortkey, then it is an
     * expansion and probably should not be considered a section
     * header.
     *
     * For example 'þ' is sometimes sorted as if it is the letters
     * 'th'. Other times it is its own primary element. Another
     * example is '₨'. Sometimes it's a currency symbol. Sometimes it
     * is an 'R' followed by an 's'.
     *
     * Additionally an expanded element should always sort directly
     * after its first element due to the way sortkeys work.
     *
     * UCA sortkey elements are of variable length but no collation
     * element should be a prefix of some other element, so I think
     * this is safe. See:
     * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
     * - http://site.icu-project.org/design/collation/uca-weight-allocation
     *
     * Additionally, there is something called primary compression to
     * worry about. Basically, if you have two primary elements that
     * are more than one byte and both start with the same byte then
     * the first byte is dropped on the second primary. Additionally
     * either \x03 or \xFF may be added to mean that the next primary
     * does not start with the first byte of the first primary.
     *
     * This shouldn't matter much, as the first primary is not
     * changed, and that is what we are comparing against.
     *
     * tl;dr: This makes some assumptions about how icu implements
     * collations. It seems incredibly unlikely these assumptions
     * will change, but nonetheless they are assumptions.
     */
    $prev = false;
    $duplicatePrefixes = array();
    foreach ($letterMap as $key => $value) {
        // Remove terminator byte. Otherwise the prefix
        // comparison will get hung up on that.
        // The mask must be "\0": an empty mask makes rtrim() a no-op,
        // which would silently disable the prefix detection below.
        $trimmedKey = rtrim($key, "\0");
        if ($prev === false || $prev === '') {
            $prev = $trimmedKey;
            // We don't yet have a collation element
            // to compare against, so continue.
            continue;
        }
        // Due to the fact the array is sorted, we only have
        // to compare with the element directly previous
        // to the current element (skipping expansions).
        // An element "X" will always sort directly
        // before "XZ" (Unless we have "XY", but we
        // do not update $prev in that case).
        if (substr($trimmedKey, 0, strlen($prev)) === $prev) {
            $duplicatePrefixes[] = $key;
            // If this is an expansion, we don't want to
            // compare the next element to this element,
            // but to what is currently $prev
            continue;
        }
        $prev = $trimmedKey;
    }
    foreach ($duplicatePrefixes as $badKey) {
        wfDebug("Removing '{$letterMap[$badKey]}' from first letters.\n");
        unset($letterMap[$badKey]);
        // This code assumes that unsetting does not change sort order.
    }

    $data = array(
        'chars' => array_values($letterMap),
        'keys' => array_keys($letterMap),
        'version' => self::FIRST_LETTER_VERSION,
    );

    // Reduce memory usage before caching
    unset($letterMap);

    // Save to cache
    $this->firstLetterData = $data;
    $cache->set($cacheKey, $data, $cache::TTL_WEEK);

    return $data;
}
/**
 * Apply the character-pair replacements stored in a precompiled data file
 * to a string.
 *
 * The file (which must be in the serialized subdirectory of $IP) contains
 * pairs mapping source characters to destination characters. The loaded
 * replacement table is kept in process memory, keyed by file name, so each
 * file is read at most once per object. This will go faster if you have the
 * FastStringSearch extension.
 *
 * @param string $file Name of the precompiled pair file
 * @param string $string Text to transform
 *
 * @throws MWException If the pair file cannot be loaded
 * @return string The transformed text
 */
function transformUsingPairFile($file, $string) {
    // Reuse the replacement table if this file was loaded before.
    if (isset($this->transformData[$file])) {
        return $this->transformData[$file]->replace($string);
    }

    $pairs = wfGetPrecompiledData($file);
    if ($pairs === false) {
        throw new MWException(__METHOD__ . ": The transformation file {$file} is missing");
    }

    $replacer = new ReplacementArray($pairs);
    $this->transformData[$file] = $replacer;

    return $replacer->replace($string);
}