コード例 #1
0
<?php

require_once dirname(__FILE__) . '/../bootstrap/unit.php';
require_once dirname(__FILE__) . '/../../plugins/rtCorePlugin/lib/toolkit/rtIndexToolkit.class.php';
// Create the lime test harness; 29 is the planned number of assertions for this file.
$t = new lime_test(29);

// ---------------------------------------------------------------------------
// rtIndexToolkit::getCleanedString()
// ---------------------------------------------------------------------------
// Fixtures:
//  - $string        contains only letters (both cases) and digits,
//  - $string_dirty  has a single separator (punctuation or a space) between
//                   consecutive letters a..z,
//  - $string_clean  is the expected cleaned form of $string_dirty: the same
//                   letters separated by single spaces.
$string = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890';
$string_dirty = 'a,b.c-d&e-f_g;h:i=j k]l[m(n)o|p\\q?r/s!t@u#v$w%x^y*z';
$string_clean = 'a b c d e f g h i j k l m n o p q r s t u v w x y z';

// Alphanumeric input is expected back unchanged apart from lower-casing.
$cleaned_alphanumeric = rtIndexToolkit::getCleanedString($string);
$t->is($cleaned_alphanumeric, strtolower($string), '::getCleanedString() - alpha numerics are preserved');

// Every non-alphanumeric separator is expected to become a single space.
$cleaned_dirty = rtIndexToolkit::getCleanedString($string_dirty);
$t->is($cleaned_dirty, $string_clean, '::getCleanedString() - non-alphanumerics are removed cleanly');

// A run of spaces is expected to collapse into one space.
$cleaned_spaces = rtIndexToolkit::getCleanedString('one     two');
$t->is($cleaned_spaces, 'one two', '::getCleanedString() - multiple spaces are converted to a single space');
// ---------------------------------------------------------------------------
// rtIndexToolkit::getWordsFromString()
// ---------------------------------------------------------------------------
// Fixture sentence and the lower-cased, punctuation-free word list expected from it.
$string = 'One, two, three - off we go!';
$string_as_array = array(
    'one',
    'two',
    'three',
    'off',
    'we',
    'go'
);

// The method is invoked once per assertion, so each check exercises a fresh call.
$returns_array = is_array(rtIndexToolkit::getWordsFromString($string));
$t->is($returns_array, true, '::getWordsFromString() - returns an array');

$word_count = count(rtIndexToolkit::getWordsFromString($string));
$t->is($word_count, 6, '::getWordsFromString() - returns an array of the correct length');

$words = rtIndexToolkit::getWordsFromString($string);
$t->is($words, $string_as_array, '::getWordsFromString() - returns an array containing each of the words');
// ---------------------------------------------------------------------------
// rtIndexToolkit::getIndexCleanerClass()
// ---------------------------------------------------------------------------
// Each row is: language code passed in, expected return value, assertion message.
// The first row covers an unknown code (expected result: false); the remaining
// rows cover the four language codes that are expected to map to a cleaner class.
$cleaner_class_cases = array(
    array('blah', false, '::getIndexCleanerClass() - returns false for unknown languages'),
    array('de', 'rtIndexCleanerDe', '::getIndexCleanerClass() - [de] returns correct class'),
    array('en', 'rtIndexCleanerEn', '::getIndexCleanerClass() - [en] returns class for known languages'),
    array('es', 'rtIndexCleanerEs', '::getIndexCleanerClass() - [es] returns class for known languages'),
    array('fr', 'rtIndexCleanerFr', '::getIndexCleanerClass() - [fr] returns class for known languages'),
);

// One assertion per row, in the order listed above.
foreach ($cleaner_class_cases as $cleaner_class_case) {
    list($language_code, $expected_class, $message) = $cleaner_class_case;
    $t->is(rtIndexToolkit::getIndexCleanerClass($language_code), $expected_class, $message);
}
// ---------------------------------------------------------------------------
// rtIndexToolkit::getCleanedWordsFromString() - German
// ---------------------------------------------------------------------------
// German sample sentence containing umlauts and several short function words.
$string = 'Stoppwörter nennt man im Information Retrieval Wörter, die bei einer Volltextindexierung nicht beachtet werden, da sie sehr häufig auftreten und gewöhnlich keine Relevanz für die Erfassung des Dokumentinhalts besitzen.';

// Expected words: lower-cased, umlauts written as plain vowels, stop words left out.
$string_as_array = array(
    'stoppworter',
    'nennt',
    'man',
    'information',
    'retrieval',
    'worter',
    'volltextindexierung',
    'beachtet',
    'sehr',
    'haufig',
    'auftreten',
    'gewohnlich',
    'keine',
    'relevanz',
    'fur',
    'erfassung',
    'dokumentinhalts',
    'besitzen'
);

// array_values() re-indexes the result so only the word sequence is compared, not the keys.
$cleaned_words_de = array_values(rtIndexToolkit::getCleanedWordsFromString($string, 'de'));
$t->is($cleaned_words_de, $string_as_array, '::getCleanedWordsFromString() - [de] returns an array, stop words removed');
// ---------------------------------------------------------------------------
// rtIndexToolkit::getCleanedWordsFromString() - English
// ---------------------------------------------------------------------------
// English sample sentence with punctuation, parentheses and common function words.
$string = 'Stop words is the name given to words which are filtered out prior to, or after, processing of natural language data (text).';

// Expected words: lower-cased, punctuation stripped, stop words left out.
$string_as_array = array(
    'stop',
    'words',
    'given',
    'words',
    'filtered',
    'prior',
    'processing',
    'natural',
    'language',
    'data',
    'text'
);

// array_values() re-indexes the result so only the word sequence is compared, not the keys.
$cleaned_words_en = array_values(rtIndexToolkit::getCleanedWordsFromString($string, 'en'));
$t->is($cleaned_words_en, $string_as_array, '::getCleanedWordsFromString() - [en] returns an array, stop words removed');
// ---------------------------------------------------------------------------
// rtIndexToolkit::getCleanedWordsFromString() - Spanish
// ---------------------------------------------------------------------------
// Spanish sample sentence containing accented characters and function words.
// NOTE(review): 'sirtificado' looks like a typo for 'significado'; it is kept
// as-is because the expected array below mirrors it.
$string = 'Palabras vacías es el nombre que reciben las palabras sin sirtificado como artículos, pronombres, preposiciones, etc. que son filtradas antes o después del procesamiento de datos en lenguaje natural (texto).';

// Expected words: lower-cased, accents written as plain vowels, stop words left out.
// NOTE(review): 'que', 'son', 'del' and 'de' remain in the expected list - confirm
// this matches the Spanish stop-word list actually used by the cleaner.
$string_as_array = array(
    'palabras',
    'vacias',
    'nombre',
    'que',
    'reciben',
    'palabras',
    'sirtificado',
    'articulos',
    'pronombres',
    'preposiciones',
    'etc',
    'que',
    'son',
    'filtradas',
    'despues',
    'del',
    'procesamiento',
    'de',
    'datos',
    'lenguaje',
    'natural',
    'texto'
);

// array_values() re-indexes the result so only the word sequence is compared, not the keys.
// The message is labelled [es] to match the language code under test, so a failure
// in the test output points at the Spanish case rather than the English one.
$cleaned_words_es = array_values(rtIndexToolkit::getCleanedWordsFromString($string, 'es'));
$t->is($cleaned_words_es, $string_as_array, '::getCleanedWordsFromString() - [es] returns an array, stop words removed');
$string = "Les mots vides (ou stop words, en anglais) sont des mots qui sont tellement communs qu'il est inutile de les indexer ou de les utiliser dans une recherche.";