示例#1
0
 /**
  * Load the upper-case and lower-case character maps from the precompiled
  * 'Utf8Case.ser' data file.
  *
  * The two maps are kept in function-static variables, so the data file is
  * only consulted until the first successful load; later calls return the
  * stored maps directly.
  *
  * @throws MWException If wfGetPrecompiledData() reports the file as unavailable
  * @return array Two-element list: array( $wikiUpperChars, $wikiLowerChars )
  */
 static function getCaseMaps()
 {
     static $wikiUpperChars, $wikiLowerChars;
     if (isset($wikiUpperChars)) {
         // Maps were already loaded by an earlier call.
         return array($wikiUpperChars, $wikiLowerChars);
     }
     wfProfileIn(__METHOD__);
     $arr = wfGetPrecompiledData('Utf8Case.ser');
     if ($arr === false) {
         // Close the profiling section opened above before bailing out, so
         // that wfProfileIn()/wfProfileOut() calls stay balanced on the
         // error path as well.
         wfProfileOut(__METHOD__);
         throw new MWException("Utf8Case.ser is missing, please run \"make\" in the serialized directory\n");
     }
     $wikiUpperChars = $arr['wikiUpperChars'];
     $wikiLowerChars = $arr['wikiLowerChars'];
     wfProfileOut(__METHOD__);
     return array($wikiUpperChars, $wikiLowerChars);
 }
示例#2
0
 /**
  * Return the first-letter data for this collation's locale.
  *
  * Lookup order: the value memoised on this object, then the shared cache,
  * and finally a fresh build from the precompiled
  * "first-letters-<locale>.ser" file. A freshly built result is stored both
  * on the object and in the cache.
  *
  * @throws MWException If no precompiled data exists for the locale
  * @return array Map with two parallel lists: 'chars' (the letters) and
  *   'keys' (their primary sort keys), ordered by sort key
  */
 function getFirstLetterData()
 {
     // Fast path: already computed for this object.
     if ($this->firstLetterData !== null) {
         return $this->firstLetterData;
     }

     $cache = wfGetCache(CACHE_ANYTHING);
     $cacheKey = wfMemcKey('first-letters', $this->locale);
     $cached = $cache->get($cacheKey);
     if ($cached) {
         return $this->firstLetterData = $cached;
     }

     // Nothing cached: rebuild from the serialized data file.
     $letters = wfGetPrecompiledData("first-letters-{$this->locale}.ser");
     if ($letters === false) {
         throw new MWException("MediaWiki does not support ICU locale " . "\"{$this->locale}\"");
     }

     // Index every letter by its primary sort key. The precompiled file
     // cannot be shipped pre-sorted because the order depends on the ICU
     // version, and an unsorted list would make the later binary search
     // return arbitrary results. When two letters share a primary key,
     // the one the main collator orders first wins.
     $byKey = array();
     foreach ($letters as $candidate) {
         $sortKey = $this->getPrimarySortKey($candidate);
         if (!isset($byKey[$sortKey]) || $this->mainCollator->compare($candidate, $byKey[$sortKey]) < 0) {
             $byKey[$sortKey] = $candidate;
         }
     }
     ksort($byKey, SORT_STRING);

     $result = array('chars' => array_values($byKey), 'keys' => array_keys($byKey));
     // The map is no longer needed; release it before caching.
     unset($byKey);

     $this->firstLetterData = $result;
     $cache->set($cacheKey, $result, 7 * 86400);
     return $result;
 }
示例#3
0
 /**
  * Return the first-letter data for this collation.
  *
  * The result is looked up on this object first, then in the shared cache
  * (the key includes the locale, the digit transform language code and the
  * ICU version; an entry is only accepted if its 'version' matches
  * self::FIRST_LETTER_VERSION), and is otherwise rebuilt from precompiled
  * data and stored in both places.
  *
  * @throws MWException If the required precompiled data file cannot be loaded
  * @return array Map with 'chars' (letters), 'keys' (their primary sort
  *   keys, in sorted order) and 'version' (self::FIRST_LETTER_VERSION)
  */
 function getFirstLetterData()
 {
     if ($this->firstLetterData !== null) {
         return $this->firstLetterData;
     }
     $cache = wfGetCache(CACHE_ANYTHING);
     $cacheKey = wfMemcKey('first-letters', $this->locale, $this->digitTransformLanguage->getCode(), self::getICUVersion());
     $cacheEntry = $cache->get($cacheKey);
     if ($cacheEntry && isset($cacheEntry['version']) && $cacheEntry['version'] == self::FIRST_LETTER_VERSION) {
         $this->firstLetterData = $cacheEntry;
         return $this->firstLetterData;
     }
     // Generate data from serialized data file
     if (isset(self::$tailoringFirstLetters[$this->locale])) {
         $letters = wfGetPrecompiledData("first-letters-root.ser");
         if ($letters === false) {
             // Fail explicitly rather than handing false to array_merge()
             // below, mirroring the check done for per-locale files.
             throw new MWException("first-letters-root.ser is missing, cannot build first letters " . "for ICU locale \"{$this->locale}\"");
         }
         // Append additional characters
         $letters = array_merge($letters, self::$tailoringFirstLetters[$this->locale]);
         // Remove unnecessary ones, if any (listed under the "-<locale>" key)
         if (isset(self::$tailoringFirstLetters['-' . $this->locale])) {
             $letters = array_diff($letters, self::$tailoringFirstLetters['-' . $this->locale]);
         }
         // Apply digit transforms: drop the ASCII digits and add back each
         // digit as formatted by the digit transform language.
         $digits = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
         $letters = array_diff($letters, $digits);
         foreach ($digits as $digit) {
             $letters[] = $this->digitTransformLanguage->formatNum($digit, true);
         }
     } else {
         $letters = wfGetPrecompiledData("first-letters-{$this->locale}.ser");
         if ($letters === false) {
             throw new MWException("MediaWiki does not support ICU locale " . "\"{$this->locale}\"");
         }
     }
     /* Sort the letters.
      *
      * It's impossible to have the precompiled data file properly sorted,
      * because the sort order changes depending on ICU version. If the
      * array is not properly sorted, the binary search will return random
      * results.
      *
      * We also take this opportunity to remove primary collisions.
      */
     $letterMap = array();
     foreach ($letters as $letter) {
         $key = $this->getPrimarySortKey($letter);
         if (isset($letterMap[$key])) {
             // Primary collision
             // Keep whichever one sorts first in the main collator
             if ($this->mainCollator->compare($letter, $letterMap[$key]) < 0) {
                 $letterMap[$key] = $letter;
             }
         } else {
             $letterMap[$key] = $letter;
         }
     }
     ksort($letterMap, SORT_STRING);
     /* Remove duplicate prefixes. Basically if something has a sortkey
      * which is a prefix of some other sortkey, then it is an
      * expansion and probably should not be considered a section
      * header.
      *
      * For example 'þ' is sometimes sorted as if it is the letters
      * 'th'. Other times it is its own primary element. Another
      * example is '₨'. Sometimes it's a currency symbol. Sometimes it
      * is an 'R' followed by an 's'.
      *
      * Additionally an expanded element should always sort directly
      * after its first element due to the way sortkeys work.
      *
      * UCA sortkey elements are of variable length but no collation
      * element should be a prefix of some other element, so I think
      * this is safe. See:
      * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
      * - http://site.icu-project.org/design/collation/uca-weight-allocation
      *
      * Additionally, there is something called primary compression to
      * worry about. Basically, if you have two primary elements that
      * are more than one byte and both start with the same byte then
      * the first byte is dropped on the second primary. Additionally
      * either \x03 or \xFF may be added to mean that the next primary
      * does not start with the first byte of the first primary.
      *
      * This shouldn't matter much, as the first primary is not
      * changed, and that is what we are comparing against.
      *
      * tl;dr: This makes some assumptions about how ICU implements
      * collations. It seems incredibly unlikely these assumptions
      * will change, but nonetheless they are assumptions.
      */
     $prev = false;
     $duplicatePrefixes = array();
     foreach ($letterMap as $key => $value) {
         // Remove the trailing NUL terminator byte(s), if present.
         // Otherwise the prefix comparison will get hung up on that.
         // (An empty character list would make rtrim() strip nothing.)
         $trimmedKey = rtrim($key, "\0");
         if ($prev === false || $prev === '') {
             $prev = $trimmedKey;
             // We don't yet have a collation element
             // to compare against, so continue.
             continue;
         }
         // Due to the fact the array is sorted, we only have
         // to compare with the element directly previous
         // to the current element (skipping expansions).
         // An element "X" will always sort directly
         // before "XZ" (Unless we have "XY", but we
         // do not update $prev in that case).
         if (substr($trimmedKey, 0, strlen($prev)) === $prev) {
             $duplicatePrefixes[] = $key;
             // If this is an expansion, we don't want to
             // compare the next element to this element,
             // but to what is currently $prev
             continue;
         }
         $prev = $trimmedKey;
     }
     foreach ($duplicatePrefixes as $badKey) {
         wfDebug("Removing '{$letterMap[$badKey]}' from first letters.\n");
         unset($letterMap[$badKey]);
         // This code assumes that unsetting does not change sort order.
     }
     $data = array('chars' => array_values($letterMap), 'keys' => array_keys($letterMap), 'version' => self::FIRST_LETTER_VERSION);
     // Reduce memory usage before caching
     unset($letterMap);
     // Save to cache
     $this->firstLetterData = $data;
     $cache->set($cacheKey, $data, $cache::TTL_WEEK);
     return $data;
 }
示例#4
0
 /**
  * Run a string through the character replacements described by a
  * precompiled pair file.
  *
  * The file (which must be in the serialized subdirectory of $IP) contains
  * pairs mapping source characters to destination characters. Its contents
  * are wrapped in a ReplacementArray that is kept on this object, keyed by
  * file name, so each file is loaded at most once per object. This will go
  * faster if you have the FastStringSearch extension.
  *
  * @param string $file Name of the precompiled pair file
  * @param string $string Text to transform
  *
  * @throws MWException If the pair file cannot be loaded
  * @return string The transformed text
  */
 function transformUsingPairFile($file, $string)
 {
     // Reuse the replacer built by an earlier call for the same file.
     if (isset($this->transformData[$file])) {
         return $this->transformData[$file]->replace($string);
     }

     $pairs = wfGetPrecompiledData($file);
     if ($pairs === false) {
         throw new MWException(__METHOD__ . ": The transformation file {$file} is missing");
     }

     $replacer = new ReplacementArray($pairs);
     $this->transformData[$file] = $replacer;
     return $replacer->replace($string);
 }