/**
 * Reduces a tweet/profile string to a space-separated list of content tokens.
 *
 * The input is lowercased and split on runs of whitespace. A token is dropped
 * when it:
 *   - is empty (produced by leading/trailing whitespace or an empty input),
 *   - starts with "http", or starts with "www" and is at least 4 bytes long,
 *   - starts with "@",
 *   - is empty after removePunctuation(),
 *   - equals "rt",
 *   - is reported as a stopword by isStopword().
 *
 * @param string $profileString Raw text to clean.
 *
 * @return string The surviving tokens, each followed by a single space (so a
 *                non-empty result always ends with a space); "" when nothing
 *                survives.
 *
 * @see removePunctuation()
 * @see isStopword()
 */
function stripTweet($profileString)
{
    $tokens = preg_split('/\\s+/', strtolower($profileString));
    $result = "";

    foreach ($tokens as $tokenWithPunctuation) {
        // Splitting on whitespace yields "" for leading/trailing whitespace and
        // for an empty input. Skip it before indexing: reading offset 0 of an
        // empty string raises an "Uninitialized string offset" warning.
        // NOTE(review): assumes removePunctuation("") yields an empty value, so
        // such tokens were already discarded further down - confirm.
        if ($tokenWithPunctuation === "") {
            continue;
        }

        // URL-like tokens. Grouping is explicit here; it matches how the
        // previous unparenthesised `a && b || c` condition was evaluated.
        $looksLikeWww  = strlen($tokenWithPunctuation) >= 4
            && strncmp($tokenWithPunctuation, "www", 3) === 0;
        $looksLikeHttp = strncmp($tokenWithPunctuation, "http", 4) === 0;
        if ($looksLikeWww || $looksLikeHttp) {
            continue;
        }

        // @mentions.
        if ($tokenWithPunctuation[0] === "@") {
            continue;
        }

        $token = removePunctuation($tokenWithPunctuation);

        // Loose comparisons are kept on purpose: the return type of
        // removePunctuation() is not visible from this function.
        if ($token == "" || $token == "rt") {
            continue;
        }

        if (isStopword($token)) {
            continue;
        }

        $result .= $token . " ";
    }

    return $result;
}
/**
 * Runs a configurable text-scrubbing pipeline over a string.
 *
 * For the 'default' type the stages run in this fixed order, each one only
 * when its switch is the string "on":
 *   1. HTML entities are decoded (always).
 *   2. lowercase      - strtolower() plus an explicit Æ/Ð/Þ -> æ/ð/þ mapping.
 *   3. formatspecial / common - formatSpecial() runs if either is "on".
 *   4. formatting     - tag handling, see $tags.
 *   5. punctuation    - removePunctuation().
 *   6. digits         - the characters 0-9 are removed.
 *   7. removeStopWords - remove_stopWords().
 *   8. lemmatize      - lemmatize().
 *   9. consolidate    - consolidate().
 *  10. Runs of two or more whitespace characters collapse to one space (always).
 *
 * Side effect: for the 'default' type, the first 1000 bytes of the working
 * string are printed (unescaped, wrapped in <br />) before and after each
 * stage.
 *
 * For the 'xml' and 'sgml' types only stop-word removal followed by
 * strip_tags() is applied, and nothing is printed.
 *
 * @param string $string          Text to scrub, with or without markup.
 * @param string $formatting      "on" to apply tag handling.
 * @param string $tags            "keep" strips the tags themselves via
 *                                strip_tags(); any other value removes each
 *                                <x>...</y> element including its content.
 * @param string $punctuation     "on" to call removePunctuation().
 * @param string $digits          "on" to remove the digits 0-9.
 * @param string $removeStopWords "on" to call remove_stopWords().
 * @param string $lemmatize       "on" to call lemmatize().
 * @param string $consolidate     "on" to call consolidate().
 * @param string $formatspecial   Passed to formatSpecial(); "on" triggers it.
 * @param string $lowercase       "on" to lowercase; also passed to formatSpecial().
 * @param string $common          Passed to formatSpecial(); "on" triggers it.
 * @param mixed  $stopWords       Passed through to remove_stopWords().
 * @param mixed  $lemmas          Passed through to lemmatize().
 * @param mixed  $consolidations  Passed through to consolidate().
 * @param mixed  $specials        Passed through to formatSpecial().
 * @param string $type            'default', 'xml' or 'sgml'.
 *
 * @return string|null The scrubbed text, or null when $type is not recognised.
 *
 * @see formatSpecial()
 * @see removePunctuation()
 * @see remove_stopWords()
 * @see lemmatize()
 * @see consolidate()
 */
function scrub_text($string, $formatting, $tags, $punctuation, $digits, $removeStopWords, $lemmatize, $consolidate, $formatspecial, $lowercase, $common, $stopWords = "", $lemmas = "", $consolidations = "", $specials = "", $type = 'default')
{
    switch ($type) {
        case 'default':
            // Emits the stage dump in exactly the format the function has
            // always printed.
            // NOTE(review): the text is written unescaped straight to the
            // output; kept as-is because callers may display it - confirm
            // whether this is leftover debug output.
            $trace = function ($label, $text) {
                print "<br /> " . $label . " <br />" . substr($text, 0, 1000) . "<br />";
            };

            // The former bare `utf8_encode($string);` call was dropped: its
            // result was never assigned, so it had no effect, and the function
            // is deprecated as of PHP 8.2.
            $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
            $trace("Before lowercase", $string);

            if ($lowercase == "on") {
                $string = strtolower($string);
                // Explicit mapping for capitals that the strtolower() call
                // above is not relied on to convert.
                $string = str_replace(
                    array("Æ", "Ð", "Þ"),
                    array("æ", "ð", "þ"),
                    $string
                );
            }
            $trace("After lowercase, before special characters", $string);

            if ($formatspecial == "on" || $common == "on") {
                $string = formatSpecial($string, $formatspecial, $specials, $common, $lowercase);
            }
            $trace("After special characters, before strip tags", $string);

            if ($formatting == "on") {
                if ($tags == "keep") {
                    $string = strip_tags($string);
                } else {
                    // Lazy quantifiers WITHOUT the U modifier. Combining `.*?`
                    // with U inverted the quantifiers back to greedy, which
                    // swallowed everything between the first "<" and the last
                    // ">" on a line, including plain text between elements.
                    $string = preg_replace('~<.*?>.*?</.*?>~', "", $string);
                }
            }
            $trace("After strip tags, before remove punctuation", $string);

            if ($punctuation == "on") {
                $string = removePunctuation($string);
            }
            $trace("After remove punctuation, before remove digits", $string);

            if ($digits == "on") {
                $string = str_replace(range(0, 9), '', $string);
            }
            $trace("After remove digits, before remove stopwords", $string);

            if ($removeStopWords == "on") {
                $string = remove_stopWords($string, $stopWords);
            }
            $trace("After remove stopwords, before lemmatize", $string);

            if ($lemmatize == "on") {
                $string = lemmatize($string, $lemmas);
            }
            $trace("After lemmatize, before consolidation", $string);

            if ($consolidate == "on") {
                $string = consolidate($string, $consolidations);
            }
            $trace("After consolidation", $string);

            // Clean extra spaces.
            return preg_replace("/\\s\\s+/", " ", $string);

        case 'xml':
        case 'sgml':
            // Both markup types share one path. The strip_tags() result used
            // to be discarded and nothing was returned, so callers got null.
            return strip_tags(remove_stopWords($string, $stopWords));
    }

    // Unrecognised $type: no scrubbing is defined for it.
    return null;
}