/**
 * Parse the title of the question by tokenizing it.
 *
 * Overrides the parent's parse() and uses mb_split()
 * instead of preg_split() to be UTF-8 safe, because
 * the title can be a UTF-8 string.
 *
 * The title is split on runs of whitespace, commas, semicolons,
 * double quotes and question marks. A token is kept only when it
 * is longer than one character (counted in UTF-8 characters, not
 * bytes) and is not present in the list returned by getStopwords().
 *
 * @return array unique tokens, indexed sequentially from 0;
 *               empty array when the original string is empty
 *               or could not be split
 */
public function parse()
{
    if (empty($this->origString)) {
        d('string was empty, returning empty array');

        return array();
    }

    \mb_regex_encoding('UTF-8');
    $aTokens = \mb_split('([\\s,;\\"\\?]+)', $this->origString);

    /**
     * mb_split() returns false on failure;
     * do not pass that on to the array functions below
     */
    if (!\is_array($aTokens)) {
        d('mb_split failed, returning empty array');

        return array();
    }

    $aStopwords = getStopwords();
    $aResult = array();

    foreach ($aTokens as $val) {
        $val = \trim($val);

        /**
         * Length must be measured in characters: byte-wise strlen()
         * reports 2 or more for a single multi-byte UTF-8 character,
         * which would let one-character tokens through.
         * Strict in_array() avoids loose matches between
         * numeric-looking strings (e.g. "1e1" == "10").
         */
        if (\mb_strlen($val, 'UTF-8') > 1 && !\in_array($val, $aStopwords, true)) {
            $aResult[] = $val;
        }
    }

    /**
     * De-duplicate after trimming so that tokens which only
     * differ by surrounding whitespace collapse into one.
     * Call array_values to reindex from 0, otherwise
     * if array_unique removed some elements then Mongo
     * will not treat this as a normal array
     */
    return \array_values(\array_unique($aResult));
}
/**
 * Constructor.
 *
 * Fetches the list returned by getStopwords() and keeps it
 * on the instance as $this->aStopwords.
 */
public function __construct()
{
    $aWords = getStopwords();

    $this->aStopwords = $aWords;
}