/**
 * Constructor
 *
 * Splits $text with an ICU word break iterator and appends every segment
 * whose rule status lies outside the WORD_NONE range to $this->words.
 * ICU documents that range as covering spaces and most punctuation.
 *
 * @param string      $text   Text to split into words
 * @param string|null $locale Locale passed to the word break iterator; when
 *                            empty, the "intl.default_locale" ini value is
 *                            used instead
 *
 * @throws \RuntimeException When the word break iterator cannot be created
 */
public function __construct($text, $locale = null)
{
    if (!$locale) {
        $locale = ini_get('intl.default_locale');
    }

    $iterator = \IntlBreakIterator::createWordInstance($locale);
    if (!$iterator instanceof \IntlBreakIterator) {
        // The factory reports failure by returning null; fail with a clear
        // message instead of a "call to a member function on null" error.
        throw new \RuntimeException('Unable to create a word break iterator.');
    }

    $iterator->setText($text);

    foreach ($iterator->getPartsIterator() as $part) {
        // The rule status describes the boundary that closed $part.
        $status = $iterator->getRuleStatus();

        // Non-word segments are tagged with any value in
        // [WORD_NONE, WORD_NONE_LIMIT), not only WORD_NONE itself.
        $isNonWord = $status >= \IntlBreakIterator::WORD_NONE
            && $status < \IntlBreakIterator::WORD_NONE_LIMIT;

        if (!$isNonWord) {
            $this->words[] = $part;
        }
    }
}
<?php
// Surface intl errors as warnings and pin the default locale for this script.
ini_set("intl.error_level", E_WARNING);
ini_set("intl.default_locale", "pt_PT");

$wordBreaker = IntlBreakIterator::createWordInstance('pt');
$wordBreaker->setText('foo bar trans zoo bee');

// Dump isBoundary() for each probe offset, in this exact order.
$probeOffsets = array(0, 7, -1, 1, 50);
foreach ($probeOffsets as $offset) {
    $isBoundary = $wordBreaker->isBoundary($offset);
    var_dump($isBoundary);
}
?> ==DONE==
<?php
// Surface intl errors as warnings and pin the default locale for this script.
ini_set("intl.error_level", E_WARNING);
ini_set("intl.default_locale", "pt_PT");

$sample = 'ตัวอย่างข้อความ';

// Obtain the parts iterator first, then set the text through the break
// iterator it wraps.
$parts = IntlBreakIterator::createCodePointInstance()->getPartsIterator();
$parts->getBreakIterator()->setText($sample);

// Print one line per part: key, part text, last code point, current offset.
foreach ($parts as $index => $piece) {
    $breaker = $parts->getBreakIterator();
    $codePoint = sprintf("U+%04X", $breaker->getLastCodePoint());
    $position = $breaker->current();

    echo "{$index}. {$piece} (" . $codePoint . ") at {$position}\r\n";
}
?> ==DONE==
/**
 * Reduce text to a normalized form for searching.
 *
 * Steps, in order: Unicode normalization (or a small translation table when
 * the intl normalizer is unavailable), transliteration of selected letters
 * to ASCII, removal of combining marks, collapsing of whitespace runs to
 * their first character, and trimming.
 *
 * A step that fails on malformed UTF-8 is skipped, so the text produced by
 * the previous step is kept instead of being replaced by an empty string.
 *
 * @param string       $text Text to normalize
 * @param string|false $lang Language for the word tokenizer; only read by
 *                           the tokenizer branch, which is currently disabled
 *
 * @return string Normalized text
 */
function searchable($text, $lang = false)
{
    global $cfg;

    if (function_exists('normalizer_normalize')) {
        // Bring the input to Unicode normalization form C. The call does not
        // return a string when it cannot normalize (e.g. invalid UTF-8), so
        // only accept a string result.
        $normalized = normalizer_normalize($text, Normalizer::FORM_C);
        if (is_string($normalized)) {
            $text = $normalized;
        }
    } else {
        // As a lightweight compatibility, use a lightweight C
        // normalizer with diacritic removal, thanks
        // http://ahinea.com/en/tech/accented-translate.html
        $tr = array("ä" => "a", "ñ" => "n", "ö" => "o", "ü" => "u", "ÿ" => "y");
        $text = strtr($text, $tr);
    }

    // Decompose compatible versions of characters (ä => ae)
    // NOTE(review): the "IJ" and "ij" entries map to themselves as written;
    // possibly ligature keys lost in an encoding round trip -- verify.
    $tr = array(
        "ß" => "ss", "Æ" => "AE", "æ" => "ae", "IJ" => "IJ", "ij" => "ij",
        "Œ" => "OE", "œ" => "oe", "Ð" => "D", "Đ" => "D", "ð" => "d",
        "đ" => "d", "Ħ" => "H", "ħ" => "h", "ı" => "i", "ĸ" => "k",
        "Ŀ" => "L", "Ł" => "L", "ŀ" => "l", "ł" => "l", "Ŋ" => "N",
        "ʼn" => "n", "ŋ" => "n", "Ø" => "O", "ø" => "o", "ſ" => "s",
        "Þ" => "T", "Ŧ" => "T", "þ" => "t", "ŧ" => "t", "ä" => "ae",
        "ö" => "oe", "ü" => "ue", "Ä" => "AE", "Ö" => "OE", "Ü" => "UE"
    );
    $text = strtr($text, $tr);

    // Regex passes, applied in order:
    //  1. drop separated diacritics (combining marks)
    //  2. drop extraneous whitespace, keeping the first character of a run
    $passes = array(
        '/\\p{M}/u' => '',
        '/(\\s)\\s+/u' => '$1',
    );
    foreach ($passes as $pattern => $replacement) {
        // With the /u modifier preg_replace() returns null on malformed
        // UTF-8; keep the current text rather than discarding it.
        $result = preg_replace($pattern, $replacement, $text);
        if ($result !== null) {
            $text = $result;
        }
    }

    // Drop leading and trailing whitespace
    $text = trim((string) $text);

    // The leading `false` short-circuits this condition, so the tokenizer
    // branch never runs and $lang / $cfg are not read.
    if (false && class_exists('IntlBreakIterator')) {
        // Split by word boundaries
        if ($tokenizer = IntlBreakIterator::createWordInstance($lang ?: ($cfg ? $cfg->getSystemLanguage() : 'en_US'))) {
            $tokenizer->setText($text);
            $tokens = array();
            foreach ($tokenizer as $token) {
                $tokens[] = $token;
            }
            $text = implode(' ', $tokens);
        }
    } else {
        // Approximate word boundaries from Unicode chart at
        // http://www.unicode.org/reports/tr29/#Word_Boundaries
        // Punt for now
    }

    return $text;
}
<?php
// Surface intl errors as warnings and pin the default locale for this script.
ini_set("intl.error_level", E_WARNING);
ini_set("intl.default_locale", "pt_PT");

$firstText = 'ตัวอย่างข้อความ';
$secondText = 'foo';

$source = IntlBreakIterator::createCodePointInstance();
$source->setText($firstText);
$copy = clone $source;

// Compare right after cloning: both iterate the first text.
$equalAfterClone = ($source == $copy);
var_dump($equalAfterClone);

// Compare after only the source iterator was given the second text.
$source->setText($secondText);
$equalAfterSourceChange = ($source == $copy);
var_dump($equalAfterSourceChange);

// Compare after the clone was given the second text as well.
$copy->setText($secondText);
$equalAfterCopyChange = ($source == $copy);
var_dump($equalAfterCopyChange);
?> ==DONE==
<?php
// Surface intl errors as warnings and pin the default locale for this script.
ini_set("intl.error_level", E_WARNING);
ini_set("intl.default_locale", "pt_PT");

$wordBreaker = IntlBreakIterator::createWordInstance(null);

// getPartsIterator() called with an array argument.
$fromArray = $wordBreaker->getPartsIterator(array());
var_dump($fromArray);

// getPartsIterator() called with two arguments.
$fromTwoArgs = $wordBreaker->getPartsIterator(1, 2);
var_dump($fromTwoArgs);

// getPartsIterator() called with -1.
$fromNegative = $wordBreaker->getPartsIterator(-1);
var_dump($fromNegative);
?> ==DONE==
<?php
// Surface intl errors as warnings and pin the default locale for this script.
ini_set("intl.error_level", E_WARNING);
ini_set("intl.default_locale", "pt_PT");

$sentenceBreaker = IntlBreakIterator::createSentenceInstance('pt');

// Dump getLocale() for locale type 0, then for locale type 1.
foreach (array(0, 1) as $localeType) {
    $locale = $sentenceBreaker->getLocale($localeType);
    var_dump($locale);
}
?> ==DONE==
<?php
// Surface intl errors as warnings.
ini_set("intl.error_level", E_WARNING);

// Word factory called with an array as its only argument.
$wordResult = IntlBreakIterator::createWordInstance(array());
var_dump($wordResult);

// Remaining factories, each called with (null, 2).
$sentenceResult = IntlBreakIterator::createSentenceInstance(null, 2);
var_dump($sentenceResult);

$characterResult = IntlBreakIterator::createCharacterInstance(null, 2);
var_dump($characterResult);

$titleResult = IntlBreakIterator::createTitleInstance(null, 2);
var_dump($titleResult);

$lineResult = IntlBreakIterator::createLineInstance(null, 2);
var_dump($lineResult);
<?php
// Surface intl errors as warnings.
ini_set("intl.error_level", E_WARNING);

// Code point factory called with an array argument.
$codePointResult = IntlBreakIterator::createCodePointInstance(array());
var_dump($codePointResult);