/**
 * Set up the ICU collators used for the given locale.
 *
 * Stores the locale, resolves the language object used for digit
 * transformation, and creates two collators for the locale: the main one
 * and a second one restricted to primary strength.
 *
 * @param string $locale ICU locale name; 'root' is mapped to the 'en' language
 * @throws MWException if the intl extension is not loaded, or if
 *   Collator::create() yields no collator for $locale
 */
public function __construct($locale)
{
    if (!extension_loaded('intl')) {
        throw new MWException('An ICU collation was requested, ' . 'but the intl extension is not available.');
    }

    $this->locale = $locale;

    // Only the part of the locale name before the first '@' names the language.
    $pieces = explode('@', $locale);
    $languageCode = ($locale === 'root') ? 'en' : $pieces[0];
    $this->digitTransformLanguage = Language::factory($languageCode);

    $this->mainCollator = Collator::create($locale);
    if (!$this->mainCollator) {
        throw new MWException('Invalid ICU locale specified for collation: ' . $locale);
    }

    // Same locale as the main collator, but comparing at primary strength only.
    $this->primaryCollator = Collator::create($locale);
    $this->primaryCollator->setStrength(Collator::PRIMARY);
}
/**
 * Set up the ICU collators used for the given locale, optionally with
 * numeric collation.
 *
 * Stores the locale, resolves the language object used for digit
 * transformation, and creates a main collator plus a primary-strength
 * collator. When the locale name ends in '-u-kn', numeric collation is
 * switched on for both collators and the suffix is removed from the stored
 * locale.
 *
 * NOTE(review): the language code is derived from $locale before the
 * '-u-kn' suffix is stripped, so e.g. 'root-u-kn' is not mapped to 'en'
 * here - confirm this is intended.
 *
 * @param string $locale ICU locale name, optionally suffixed with '-u-kn'
 * @throws MWException if the intl extension is not loaded, or if
 *   Collator::create() yields no collator for $locale
 */
public function __construct($locale)
{
    if (!extension_loaded('intl')) {
        throw new MWException('An ICU collation was requested, ' . 'but the intl extension is not available.');
    }

    $this->locale = $locale;

    // Only the part of the locale name before the first '@' names the language.
    $pieces = explode('@', $locale);
    $languageCode = ($locale === 'root') ? 'en' : $pieces[0];
    $this->digitTransformLanguage = Language::factory($languageCode);

    $this->mainCollator = Collator::create($locale);
    if (!$this->mainCollator) {
        throw new MWException('Invalid ICU locale specified for collation: ' . $locale);
    }

    // Same locale as the main collator, but comparing at primary strength only.
    $this->primaryCollator = Collator::create($locale);
    $this->primaryCollator->setStrength(Collator::PRIMARY);

    // Without the special numeric-collation suffix there is nothing more to do.
    $numericSuffix = '-u-kn';
    if (substr($locale, -5, 5) !== $numericSuffix) {
        return;
    }

    $this->useNumericCollation = true;
    // Remove the suffix from the stored locale so it doesn't trip up
    // fetchFirstLetterData().
    $this->locale = substr($this->locale, 0, -5);

    foreach (array($this->mainCollator, $this->primaryCollator) as $collator) {
        $collator->setAttribute(Collator::NUMERIC_COLLATION, Collator::ON);
    }
}
/**
 * Generate the list of group header characters and write it, serialized,
 * to {$IP}/serialized/first-letters-root.ser.
 *
 * Steps:
 *  1. Read {$this->dataDir}/allkeys.txt and, for every single-code-point
 *     entry already present in $this->weights, replace its weight with the
 *     primary weight sequence found in the file.
 *  2. Group code points by primary weight into $this->groups.
 *  3. Drop groups whose weight sequence merely extends a shorter one.
 *  4. Pick one header character per group and serialize the resulting list.
 *
 * Prints an "Out of order" summary, and exits with status 1 if a file
 * cannot be opened or the output cannot be written.
 */
function generateFirstChars()
{
    global $IP;

    $file = fopen("{$this->dataDir}/allkeys.txt", 'r');
    if (!$file) {
        $this->error("Unable to open allkeys.txt");
        exit(1);
    }

    $outFile = fopen("{$IP}/serialized/first-letters-root.ser", 'w');
    if (!$outFile) {
        // Do not leave the input handle open on the failure path.
        fclose($file);
        $this->error("Unable to open output file first-letters-root.ser");
        exit(1);
    }

    $goodTertiaryChars = array();

    // For each character with an entry in allkeys.txt, overwrite the implicit
    // entry in $this->weights that came from the UCD.
    // Also gather a list of tertiary weights, for use in selecting the group header
    while (false !== ($line = fgets($file))) {
        // We're only interested in single-character weights, pick them out with a regex
        $line = trim($line);
        if (!preg_match('/^([0-9A-F]+)\\s*;\\s*([^#]*)/', $line, $m)) {
            continue;
        }

        $cp = hexdec($m[1]);
        $allWeights = trim($m[2]);
        $primary = '';
        $tertiary = '';

        if (!isset($this->weights[$cp])) {
            // Non-printable, ignore
            continue;
        }

        foreach (StringUtils::explode('[', $allWeights) as $weightStr) {
            preg_match_all('/[*.]([0-9A-F]+)/', $weightStr, $levels);
            if (empty($levels[1])) {
                continue;
            }

            if ($levels[1][0] !== '0000') {
                $primary .= '.' . $levels[1][0];
            }
            // A collation element with fewer than three levels has no tertiary
            // weight; skip it instead of reading an undefined offset.
            if (isset($levels[1][2]) && $levels[1][2] !== '0000') {
                $tertiary .= '.' . $levels[1][2];
            }
        }

        $this->weights[$cp] = $primary;
        if ($tertiary === '.0008' || $tertiary === '.000E') {
            $goodTertiaryChars[$cp] = true;
        }
    }
    fclose($file);

    // Identify groups of characters with the same primary weight
    $this->groups = array();
    asort($this->weights, SORT_STRING);
    $prevWeight = reset($this->weights);
    $group = array();
    foreach ($this->weights as $cp => $weight) {
        if ($weight !== $prevWeight) {
            $this->groups[$prevWeight] = $group;
            $prevWeight = $weight;
            if (isset($this->groups[$weight])) {
                $group = $this->groups[$weight];
            } else {
                $group = array();
            }
        }
        $group[] = $cp;
    }
    if ($group) {
        $this->groups[$prevWeight] = $group;
    }

    // If one character has a given primary weight sequence, and a second
    // character has a longer primary weight sequence with an initial
    // portion equal to the first character, then remove the second
    // character. This avoids having characters like U+A732 (double A)
    // polluting the basic latin sort area.
    foreach ($this->groups as $weight => $group) {
        if (preg_match('/(\\.[0-9A-F]*)\\./', $weight, $m)) {
            if (isset($this->groups[$m[1]])) {
                unset($this->groups[$weight]);
            }
        }
    }

    ksort($this->groups, SORT_STRING);

    // Identify the header character in each group
    $headerChars = array();
    $prevChar = "";
    $tertiaryCollator = new Collator('root');
    $primaryCollator = new Collator('root');
    $primaryCollator->setStrength(Collator::PRIMARY);
    $numOutOfOrder = 0;
    foreach ($this->groups as $weight => $group) {
        $uncomposedChars = array();
        $goodChars = array();
        foreach ($group as $cp) {
            if (isset($goodTertiaryChars[$cp])) {
                $goodChars[] = $cp;
            }
            if (!isset($this->mappedChars[$cp])) {
                $uncomposedChars[] = $cp;
            }
        }

        // Preference order: good-tertiary AND uncomposed, then any
        // uncomposed character, then the whole group.
        $x = array_intersect($goodChars, $uncomposedChars);
        if (!$x) {
            $x = $uncomposedChars;
            if (!$x) {
                $x = $group;
            }
        }

        // Use ICU to pick the lowest sorting character in the selection
        // NOTE(review): $x holds code points taken from the keys of
        // $this->weights rather than UTF-8 strings - confirm Collator::sort()
        // orders them as intended.
        $tertiaryCollator->sort($x);
        $cp = $x[0];

        $char = UtfNormal\Utils::codepointToUtf8($cp);
        $headerChars[] = $char;
        if ($primaryCollator->compare($char, $prevChar) <= 0) {
            $numOutOfOrder++;
            /*
            printf( "Out of order: U+%05X > U+%05X\n",
                utf8ToCodepoint( $prevChar ),
                utf8ToCodepoint( $char ) );
            */
        }
        $prevChar = $char;

        if ($this->debugOutFile) {
            fwrite($this->debugOutFile, sprintf("%05X %s %s (%s)\n", $cp, $weight, $char,
                implode(' ', array_map('UtfNormal\\Utils::codepointToUtf8', $group))));
        }
    }

    print "Out of order: {$numOutOfOrder} / " . count($headerChars) . "\n";

    // Write the result, always release the handle, and report a failed write
    // instead of silently leaving a truncated data file behind.
    $written = fwrite($outFile, serialize($headerChars));
    fclose($outFile);
    if ($written === false) {
        $this->error("Unable to write output file first-letters-root.ser");
        exit(1);
    }
}
/**
 * Builds a Collator for a locale, configured from the given comparison options.
 *
 * @ignore
 */
public static function collatorObject($caseInsensitive, $naturalOrder, $locale, $collationFlags)
{
    // Declared public so that other classes, such as CArray, can call it.
    assert('is_bool($caseInsensitive) && is_bool($naturalOrder) && is_cstring($locale) && ' .
        'is_bitfield($collationFlags)', vs(isset($this), get_defined_vars()));
    assert('CULocale::isValid($locale) || CString::equalsCi($locale, "root")',
        vs(isset($this), get_defined_vars()));

    $collator = new Collator($locale);

    // Case sensitivity: tertiary strength when case matters, secondary otherwise.
    $collator->setStrength($caseInsensitive ? Collator::SECONDARY : Collator::TERTIARY);

    // Natural order: the attribute is always set explicitly, even when off.
    $collator->setAttribute(Collator::NUMERIC_COLLATION, $naturalOrder ? Collator::ON : Collator::OFF);

    // Accents: drop to primary strength, turning the case level on when the
    // comparison is still case-sensitive.
    if (CBitField::isBitSet($collationFlags, self::COLLATION_IGNORE_ACCENTS)) {
        $collator->setStrength(Collator::PRIMARY);
        if (!$caseInsensitive) {
            $collator->setAttribute(Collator::CASE_LEVEL, Collator::ON);
        }
    }

    // Invisible characters, some punctuation and symbols.
    if (CBitField::isBitSet($collationFlags, self::COLLATION_IGNORE_NONWORD)) {
        $collator->setAttribute(Collator::ALTERNATE_HANDLING, Collator::SHIFTED);
    }

    // Case order: the attribute is always set explicitly, even when off.
    $upperFirst = CBitField::isBitSet($collationFlags, self::COLLATION_UPPERCASE_FIRST);
    $collator->setAttribute(Collator::CASE_FIRST, $upperFirst ? Collator::UPPER_FIRST : Collator::OFF);

    // "French" collation.
    if (CBitField::isBitSet($collationFlags, self::COLLATION_FRENCH)) {
        $collator->setAttribute(Collator::FRENCH_COLLATION, Collator::ON);
    }

    return $collator;
}
<?php
// Compare two Spanish phrases that differ only by the accent on the "o",
// using a collator for the Spanish locale.
$accented = "una canción";
$unaccented = "una cancion";

$collator = new Collator("es");

// First pass, primary strength: the strings are reported as equal in spite
// of the accent. Second pass, default strength: they are reported as not equal.
foreach (array(Collator::PRIMARY, Collator::DEFAULT_VALUE) as $strength) {
    $collator->setStrength($strength);
    var_dump($collator->compare($accented, $unaccented));
}