Пример #1
0
 /**
  * Prepare the ICU collators used for the given locale.
  *
  * Stores the locale, picks the language used for digit transformation
  * ('en' for the 'root' locale, otherwise the part of the locale name before
  * the first '@'), and creates two collators for the locale: a main one and
  * one restricted to primary strength.
  *
  * @param string $locale ICU locale name, or 'root'
  * @throws MWException When the intl extension is not loaded, or when
  *   Collator::create() does not return a collator for $locale
  */
 public function __construct($locale)
 {
     $intlAvailable = extension_loaded('intl');
     if (!$intlAvailable) {
         throw new MWException('An ICU collation was requested, ' . 'but the intl extension is not available.');
     }

     $this->locale = $locale;

     // Only the portion of the locale name preceding '@' identifies the language.
     if ($locale === 'root') {
         $languageCode = 'en';
     } else {
         list($languageCode) = explode('@', $locale);
     }
     $this->digitTransformLanguage = Language::factory($languageCode);

     $main = Collator::create($locale);
     $this->mainCollator = $main;
     if (!$main) {
         throw new MWException("Invalid ICU locale specified for collation: {$locale}");
     }

     // Second collator for the same locale, compared at primary strength only.
     $primary = Collator::create($locale);
     $this->primaryCollator = $primary;
     $primary->setStrength(Collator::PRIMARY);
 }
Пример #2
0
 /**
  * Prepare the ICU collators used for the given locale, optionally with
  * numeric collation.
  *
  * Creates a main collator and a primary-strength collator for $locale. When
  * $locale ends in the special suffix '-u-kn', numeric collation is switched
  * on for both collators and the suffix is removed from the stored locale.
  *
  * @param string $locale ICU locale name, or 'root'; may end in '-u-kn'
  * @throws MWException When the intl extension is not loaded, or when
  *   Collator::create() does not return a collator for $locale
  */
 public function __construct($locale)
 {
     $intlAvailable = extension_loaded('intl');
     if (!$intlAvailable) {
         throw new MWException('An ICU collation was requested, ' . 'but the intl extension is not available.');
     }

     $this->locale = $locale;

     // Only the portion of the locale name preceding '@' identifies the language.
     if ($locale === 'root') {
         $languageCode = 'en';
     } else {
         list($languageCode) = explode('@', $locale);
     }
     $this->digitTransformLanguage = Language::factory($languageCode);

     $main = Collator::create($locale);
     $this->mainCollator = $main;
     if (!$main) {
         throw new MWException("Invalid ICU locale specified for collation: {$locale}");
     }

     // Second collator for the same locale, compared at primary strength only.
     $primary = Collator::create($locale);
     $this->primaryCollator = $primary;
     $primary->setStrength(Collator::PRIMARY);

     // Nothing more to do unless the special numeric-collation suffix is present.
     $hasNumericSuffix = substr($locale, -5, 5) === '-u-kn';
     if (!$hasNumericSuffix) {
         return;
     }

     $this->useNumericCollation = true;
     // The stored locale must not carry the suffix, otherwise it would trip
     // up fetchFirstLetterData().
     $this->locale = substr($this->locale, 0, -5);
     foreach (array($this->mainCollator, $this->primaryCollator) as $collator) {
         $collator->setAttribute(Collator::NUMERIC_COLLATION, Collator::ON);
     }
 }
 /**
  * Generate the list of group header characters for the root collation and
  * write it, serialized, to {$IP}/serialized/first-letters-root.ser.
  *
  * Steps:
  *  1. Read {$this->dataDir}/allkeys.txt and replace the entries of
  *     $this->weights with the primary weight strings found there, while
  *     collecting the code points whose tertiary weight qualifies them as
  *     header candidates.
  *  2. Group code points by primary weight into $this->groups and drop
  *     groups whose weight merely extends another group's weight.
  *  3. Pick one header character per group and count how many headers do not
  *     sort strictly after their predecessor at primary strength.
  *
  * Prints an "Out of order" summary line. If a file cannot be opened or the
  * output cannot be written, the problem is reported through $this->error()
  * and the script exits with status 1.
  */
 function generateFirstChars()
 {
     $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
     if (!$file) {
         $this->error("Unable to open allkeys.txt");
         exit(1);
     }
     global $IP;
     $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
     if (!$outFile) {
         // Release the input handle before bailing out.
         fclose($file);
         $this->error("Unable to open output file first-letters-root.ser");
         exit(1);
     }
     $goodTertiaryChars = array();
     // For each character with an entry in allkeys.txt, overwrite the implicit
     // entry in $this->weights that came from the UCD.
     // Also gather a list of tertiary weights, for use in selecting the group header
     while (false !== ($line = fgets($file))) {
         // We're only interested in single-character weights, pick them out with a regex
         $line = trim($line);
         if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
             continue;
         }
         $cp = hexdec($m[1]);
         $allWeights = trim($m[2]);
         $primary = '';
         $tertiary = '';
         if (!isset($this->weights[$cp])) {
             // Non-printable, ignore
             continue;
         }
         // Each '['-delimited piece is one collation element; concatenate the
         // non-zero first (primary) and third (tertiary) weights of all elements.
         foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
             preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $m);
             if (!empty($m[1])) {
                 if ($m[1][0] !== '0000') {
                     $primary .= '.' . $m[1][0];
                 }
                 if ($m[1][2] !== '0000') {
                     $tertiary .= '.' . $m[1][2];
                 }
             }
         }
         $this->weights[$cp] = $primary;
         // Only characters with exactly one of these tertiary weights are
         // preferred as group headers later on.
         if ($tertiary === '.0008' || $tertiary === '.000E') {
             $goodTertiaryChars[$cp] = true;
         }
     }
     fclose($file);
     // Identify groups of characters with the same primary weight
     $this->groups = array();
     asort($this->weights, SORT_STRING);
     $prevWeight = reset($this->weights);
     $group = array();
     foreach ($this->weights as $cp => $weight) {
         if ($weight !== $prevWeight) {
             // Weight changed: store the finished group and start (or resume) the next one.
             $this->groups[$prevWeight] = $group;
             $prevWeight = $weight;
             if (isset($this->groups[$weight])) {
                 $group = $this->groups[$weight];
             } else {
                 $group = array();
             }
         }
         $group[] = $cp;
     }
     if ($group) {
         $this->groups[$prevWeight] = $group;
     }
     // If one character has a given primary weight sequence, and a second
     // character has a longer primary weight sequence with an initial
     // portion equal to the first character, then remove the second
     // character. This avoids having characters like U+A732 (double A)
     // polluting the basic latin sort area.
     foreach ($this->groups as $weight => $group) {
         if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
             if (isset($this->groups[$m[1]])) {
                 unset($this->groups[$weight]);
             }
         }
     }
     ksort($this->groups, SORT_STRING);
     // Identify the header character in each group
     $headerChars = array();
     $prevChar = "";
     $tertiaryCollator = new Collator('root');
     $primaryCollator = new Collator('root');
     $primaryCollator->setStrength(Collator::PRIMARY);
     $numOutOfOrder = 0;
     foreach ($this->groups as $weight => $group) {
         $uncomposedChars = array();
         $goodChars = array();
         foreach ($group as $cp) {
             if (isset($goodTertiaryChars[$cp])) {
                 $goodChars[] = $cp;
             }
             if (!isset($this->mappedChars[$cp])) {
                 $uncomposedChars[] = $cp;
             }
         }
         // Candidate preference: characters that are both "good tertiary" and
         // not in $this->mappedChars, then any not in $this->mappedChars,
         // then the whole group.
         $x = array_intersect($goodChars, $uncomposedChars);
         if (!$x) {
             $x = $uncomposedChars;
             if (!$x) {
                 $x = $group;
             }
         }
         // Use ICU to pick the lowest sorting character in the selection
         $tertiaryCollator->sort($x);
         $cp = $x[0];
         $char = UtfNormal\Utils::codepointToUtf8($cp);
         $headerChars[] = $char;
         // A header that does not sort strictly after the previous one at
         // primary strength is counted as out of order.
         if ($primaryCollator->compare($char, $prevChar) <= 0) {
             $numOutOfOrder++;
             /*
             				printf( "Out of order: U+%05X > U+%05X\n",
             					utf8ToCodepoint( $prevChar ),
             					utf8ToCodepoint( $char ) );
             */
         }
         $prevChar = $char;
         if ($this->debugOutFile) {
             fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char, implode(' ', array_map('UtfNormal\\Utils::codepointToUtf8', $group))));
         }
     }
     print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";
     // Write the result, then always release the handle; a failed write must
     // not pass silently because it would leave a broken data file behind.
     $written = fwrite($outFile, serialize($headerChars));
     fclose($outFile);
     if ($written === false) {
         $this->error("Unable to write output file first-letters-root.ser");
         exit(1);
     }
 }
Пример #4
0
 /**
  * Creates a Collator for a locale and configures it from the given options.
  *
  * The assertions expect $caseInsensitive and $naturalOrder to be booleans,
  * $locale to be a string that is either a valid locale or "root", and
  * $collationFlags to be a bitfield of the COLLATION_* constants of this class.
  *
  * Strength is SECONDARY when case is ignored and TERTIARY otherwise; with
  * COLLATION_IGNORE_ACCENTS it becomes PRIMARY, and the case level is turned
  * on if the comparison is still case-sensitive.
  *
  * @ignore
  */
 public static function collatorObject($caseInsensitive, $naturalOrder, $locale, $collationFlags)
 {
     // Is public to be accessible by other classes, such as CArray.
     assert('is_bool($caseInsensitive) && is_bool($naturalOrder) && is_cstring($locale) && ' . 'is_bitfield($collationFlags)', vs(isset($this), get_defined_vars()));
     assert('CULocale::isValid($locale) || CString::equalsCi($locale, "root")', vs(isset($this), get_defined_vars()));

     $collator = new Collator($locale);

     // Case sensitivity: tertiary strength distinguishes case, secondary does not.
     $collator->setStrength($caseInsensitive ? Collator::SECONDARY : Collator::TERTIARY);

     // Natural order; the attribute is set explicitly in both cases to be sure.
     $collator->setAttribute(Collator::NUMERIC_COLLATION, $naturalOrder ? Collator::ON : Collator::OFF);

     // Accents: drop to primary strength, re-enabling the case level when
     // the comparison is meant to stay case-sensitive.
     if (CBitField::isBitSet($collationFlags, self::COLLATION_IGNORE_ACCENTS)) {
         $collator->setStrength(Collator::PRIMARY);
         if ($caseInsensitive) {
             // Case is ignored as well, nothing more to enable.
         } else {
             $collator->setAttribute(Collator::CASE_LEVEL, Collator::ON);
         }
     }

     // Invisible characters, some punctuation and symbols.
     if (CBitField::isBitSet($collationFlags, self::COLLATION_IGNORE_NONWORD)) {
         $collator->setAttribute(Collator::ALTERNATE_HANDLING, Collator::SHIFTED);
     }

     // Case order; the attribute is set explicitly in both cases to be sure.
     $caseFirst = CBitField::isBitSet($collationFlags, self::COLLATION_UPPERCASE_FIRST)
         ? Collator::UPPER_FIRST
         : Collator::OFF;
     $collator->setAttribute(Collator::CASE_FIRST, $caseFirst);

     // "French" collation.
     if (CBitField::isBitSet($collationFlags, self::COLLATION_FRENCH)) {
         $collator->setAttribute(Collator::FRENCH_COLLATION, Collator::ON);
     }

     return $collator;
 }
Пример #5
0
<?php

// Compare the same pair of strings, differing only by the accent on the "o",
// with a Spanish collator at two different strengths.
$collator = new Collator("es");
$accented = "una canción";
$plain = "una cancion";

// First pass (PRIMARY): the strings are reported as equal despite the accent.
// Second pass (DEFAULT_VALUE): the strings are reported as not equal.
foreach (array(Collator::PRIMARY, Collator::DEFAULT_VALUE) as $strength) {
    $collator->setStrength($strength);
    var_dump($collator->compare($accented, $plain));
}