/**
 * Parses a search string into search terms, supports quoted phrases and removes extra punctuation
 *
 * Terms are split on whitespace, except that text wrapped in double quotes
 * is kept together as a single phrase (with the quotes removed). Unquoted
 * terms have leading and trailing punctuation/symbol/separator characters
 * stripped. Terms that end up empty (e.g. a lone `-`) are discarded.
 *
 * @internal
 *
 * @param  string  $terms              A text string from a form input to parse into search terms
 * @param  boolean $ignore_stop_words  If stop words should be ignored, this setting will be ignored if all words are stop words
 * @return array  The search terms, in the order they appeared in `$terms`
 */
public static function parseSearchTerms($terms, $ignore_stop_words = FALSE)
{
    $stop_words = array(
        'i', 'a', 'an', 'are', 'as', 'at', 'be', 'by', 'de', 'en', 'for',
        'from', 'how', 'in', 'is', 'it', 'la', 'of', 'on', 'or', 'that',
        'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who', 'will'
    );

    // Either a complete "quoted phrase" or a run of non-whitespace characters
    preg_match_all('#(?:"[^"]+"|[^\\s]+)#', $terms, $matches);

    $good_terms    = array();
    $ignored_terms = array();

    foreach ($matches[0] as $match) {

        // Remove phrases from quotes. Both ends must actually be a quote:
        // an unterminated quote such as `"foo` is treated as a normal term
        // so that its last character is not chopped off.
        if (strlen($match) > 1 && $match[0] === '"' && substr($match, -1) === '"') {
            $match = substr($match, 1, -1);

        // Trim any punctuation off of the beginning and end of terms
        } else {

            // Probe once whether PCRE understands unicode character properties
            // (\pC etc.); support is assumed when the error capture comes back
            // empty. The result is cached in a static property.
            if (self::$pcre_supports_unicode_character_properties === NULL) {
                fCore::startErrorCapture();
                preg_match('#\\pC#u', 'test');
                self::$pcre_supports_unicode_character_properties = !(bool) fCore::stopErrorCapture();
            }

            if (self::$pcre_supports_unicode_character_properties) {
                $trimmed = preg_replace('#(^[\\pC\\pC\\pM\\pP\\pS\\pZ]+|[\\pC\\pC\\pM\\pP\\pS\\pZ]+$)#iDu', '', $match);

            } else {
                // This just removes ascii non-alphanumeric characters, plus the unicode punctuation and supplemental punctuation blocks
                $punctuation = '\\x21-\\x2F\\x3A-\\x40\\x5B-\\x60\\x7B-\\x7F\\x{2000}-\\x{206F}\\x{2E00}-\\x{2E7F}\\x{00A1}-\\x{00A9}\\x{00AB}-\\x{00B1}\\x{00B4}\\x{00B6}-\\x{00B8}\\x{00BB}\\x{00BF}\\x{00D7}\\x{00F7}';
                $trimmed     = preg_replace('#(^[' . $punctuation . ']+|[' . $punctuation . ']+$)#iDu', '', $match);
            }

            // preg_replace() returns NULL on failure (e.g. the term is not
            // valid UTF-8); in that case keep the untrimmed term rather than
            // turning it into NULL
            if ($trimmed !== NULL) {
                $match = $trimmed;
            }
        }

        // A term consisting solely of punctuation has nothing left to search for
        if ($match === '') {
            continue;
        }

        if ($ignore_stop_words && in_array(strtolower($match), $stop_words, TRUE)) {
            $ignored_terms[] = $match;
            continue;
        }

        $good_terms[] = $match;
    }

    // If no terms were parsed, that means all words were stop words
    if ($ignored_terms && !$good_terms) {
        $good_terms = $ignored_terms;
    }

    return $good_terms;
}