/**
 * Splits a text into words made only of letters from a fixed alphabet.
 *
 * The text is lower-cased with mb_strtolower(), passed through
 * AdminStringUtil::removeAccents() and broken into characters with
 * AdminStringUtil::unicodeExplode(). Each maximal run of characters found in
 * the alphabet (a-z plus ă, â, î, ș, ț) becomes one word; any other character
 * ends the current word and is dropped.
 *
 * @param string $text Text to split.
 * @return array List of words, in the order they appear in the text.
 */
function extractWords($text) {
  $letters = 'abcdefghijklmnopqrstuvwxyzăâîșț';
  $normalized = AdminStringUtil::removeAccents(mb_strtolower($text));

  $words = array();
  $buffer = '';

  foreach (AdminStringUtil::unicodeExplode($normalized) as $char) {
    if (strpos($letters, $char) !== false) {
      // Still inside a word: keep accumulating.
      $buffer .= $char;
      continue;
    }

    // Separator: flush the word collected so far, if any.
    if ($buffer) {
      $words[] = $buffer;
    }
    $buffer = '';
  }

  // The text may end in the middle of a word.
  if ($buffer) {
    $words[] = $buffer;
  }

  return $words;
}
// Abbreviation markup checks. Each call hands four arguments to
// assertAbbreviations() (defined elsewhere); the last one (32 or 1) is
// presumably a source id -- confirm against that helper.

// 'bWv' is expected to come back unchanged in all three forms.
assertAbbreviations("FILLER bWv FILLER", "FILLER bWv FILLER", "FILLER bWv FILLER", 32);

// Letter case matters: 'ed.' and 'Ed.' expect different titles, and 'ED.' is
// expected to be rendered as 'Ed.'.
assertAbbreviations("FILLER ed. FILLER", "FILLER #ed.# FILLER", "FILLER <abbr class=\"abbrev\" title=\"ediție, editat\">ed.</abbr> FILLER", 32);
assertAbbreviations("FILLER Ed. FILLER", "FILLER #Ed.# FILLER", "FILLER <abbr class=\"abbrev\" title=\"Editura\">Ed.</abbr> FILLER", 32);
assertAbbreviations("FILLER ED. FILLER", "FILLER #Ed.# FILLER", "FILLER <abbr class=\"abbrev\" title=\"Editura\">Ed.</abbr> FILLER", 32);

// Abbreviation includes special characters
assertAbbreviations("FILLER RRHA, TMC FILLER", "FILLER #RRHA, TMC# FILLER", "FILLER <abbr class=\"abbrev\" title=\"Revue Roumaine d'Histoire de l'Art, série Théâtre, Musique, Cinématographie\">RRHA, TMC</abbr> FILLER", 32);
assertAbbreviations("FILLER adj. interog.-rel. FILLER", "FILLER #adj. interog.-rel.# FILLER", "FILLER <abbr class=\"abbrev\" title=\"adjectiv interogativ-relativ\">adj. interog.-rel.</abbr> FILLER", 1);

// Abbreviation is not delimited by spaces
assertAbbreviations("AGNUS DEI", "AGNUS DEI", "AGNUS DEI", 32);

// AdminStringUtil::migrateFormatChars(): expected output for each input.
assertEquals('@MÁRE^2,@ $mări,$ s.f.', AdminStringUtil::migrateFormatChars('@MÁRE^2@, $mări$, s.f.'));
assertEquals('@$%spaced% text$@', AdminStringUtil::migrateFormatChars('@$ % spaced % text $@'));
// A backslash-escaped percent sign is expected to stay as it is.
assertEquals('40\\% dolomite', AdminStringUtil::migrateFormatChars('40\\% dolomite'));
assertEquals('40 %dolomite%', AdminStringUtil::migrateFormatChars('40% dolomite%'));

// AdminStringUtil::internalizeWordName() and removeAccents().
assertEquals('cățel', AdminStringUtil::internalizeWordName("C~A,t'EL"));
assertEquals('ă', AdminStringUtil::internalizeWordName("~~A~~!@#\$%^&*()123456790"));
assertEquals('casă', AdminStringUtil::removeAccents('cásă'));

// StringUtil::cleanupQuery(): surrounding quotes, <...> spans and &...;
// entities are expected to be removed; letter case is expected to be kept.
assertEquals('mama', StringUtil::cleanupQuery("'mama'"));
assertEquals('mama', StringUtil::cleanupQuery('"mama"'));
assertEquals('aăbcdef', StringUtil::cleanupQuery("aăbc<mamă foo bar>def"));
assertEquals('AĂBCDEF', StringUtil::cleanupQuery("AĂBC<MAMĂ FOO BAR>DEF"));
assertEquals('a~abcdef', StringUtil::cleanupQuery("a~abc<mam~a foo bar>def"));
assertEquals('a~ABcdef', StringUtil::cleanupQuery("a~ABc<mam~a foo bar>def"));
assertEquals('1234', StringUtil::cleanupQuery('12&qweasd;34'));

// StringUtil::hasDiacritics().
// NOTE(review): native assert() does nothing when assertions are disabled in
// the PHP configuration (zend.assertions) -- confirm the test runner enables
// them, otherwise these two checks can never fail.
assert(StringUtil::hasDiacritics('mamă'));
assert(!StringUtil::hasDiacritics('mama'));

// AdminStringUtil::extractLexicon() on a freshly created Definition with
// sourceId 1: for these values the result is expected to equal internalRep.
$def = Model::factory('Definition')->create();
$def->sourceId = 1;
$def->internalRep = 'abcd';
assertEquals('abcd', AdminStringUtil::extractLexicon($def));
$def->internalRep = 'wxyz';
assertEquals('wxyz', AdminStringUtil::extractLexicon($def));