Exemplo n.º 1
/**
 * Normalises a profile/tweet string into a space-separated run of tokens.
 *
 * The input is lower-cased, split on whitespace, and each token is tested
 * against a series of filters (URL-like prefix, leading "@", empty after
 * punctuation removal, the literal "rt", stopword) before being appended to
 * the result.
 *
 * NOTE(review): in this excerpt the bodies and closing braces of the filter
 * `if` statements are not visible. They presumably skip the current token
 * (e.g. `continue;`) -- confirm against the full source.
 *
 * @param string $profileString Raw text to tokenise.
 * @return string Accepted tokens, each followed by a single space (so a
 *                non-empty result carries a trailing space).
 */
function stripTweet($profileString)
    // Lower-case first so the prefix and "rt" comparisons below can use lower-case literals.
    $lower = strtolower($profileString);
    // Split on runs of whitespace. No PREG_SPLIT_NO_EMPTY flag is passed, so
    // leading/trailing whitespace in the input yields empty-string tokens.
    $split = preg_split('/\\s+/', $lower);
    $result = "";
    foreach ($split as $tokenWithPunctuation) {
        // URL-like tokens: starts with "www" (and is at least 4 bytes long) or
        // starts with "http". `&&` binds tighter than `||`, so the length check
        // applies only to the "www" test.
        if (strlen($tokenWithPunctuation) >= 4 && substr($tokenWithPunctuation, 0, 3) == "www" || substr($tokenWithPunctuation, 0, 4) == "http") {
        // Tokens whose first character is "@".
        // NOTE(review): for an empty token the [0] offset does not exist and
        // PHP raises an "Uninitialized string offset" diagnostic -- consider
        // guarding against empty tokens.
        if ($tokenWithPunctuation[0] == "@") {
        // removePunctuation() is defined outside this excerpt; presumably it
        // returns the token with punctuation characters stripped -- TODO confirm.
        $token = removePunctuation($tokenWithPunctuation);
        // Token that is empty once punctuation has been removed.
        if ($token == "") {
        // The literal token "rt".
        if ($token == "rt") {
        // isStopword() is defined outside this excerpt.
        if (isStopword($token)) {
        // Append the token followed by a single separating space.
        $result .= $token . " ";
    return $result;
Exemplo n.º 2
 * Provides an abstraction to the strip_tags function with some additional
 * case switching for the type of file that is being parsed.
 * @param $string
 *	A string with tags in it (or not) to be parsed and have the tags stripped.
 * @param $tags
 *	The tags which will be passed into remove_elements().
 * @param $type
 *	The type of file from which the tags are being stripped.
 * @return string
 *	A string of scrubbed text. Generally returned via AJAX instead
 *	of a direct call.
 * @see remove_elements()
// Runs $string through a sequence of optional clean-up stages selected by
// "on"/"keep"-style string flags, branching on $type.
//
// For $type 'default' the visible order is: HTML entity decoding, lower-casing,
// special-character formatting, tag stripping, punctuation removal, digit
// removal, stopword removal, lemmatisation, consolidation, whitespace
// collapsing. A debug snapshot is printed between stages.
//
// For $type 'xml' and 'sgml' only remove_stopWords() is visible.
//
// NOTE(review): closing braces (and any `break`/`return` after the 'xml' and
// 'sgml' cases) are not visible in this excerpt. The indentation suggests that
// each `print` runs unconditionally, outside the preceding `if` -- TODO confirm.
// formatSpecial(), removePunctuation(), remove_stopWords(), lemmatize() and
// consolidate() are defined elsewhere.
function scrub_text($string, $formatting, $tags, $punctuation, $digits, $removeStopWords, $lemmatize, $consolidate, $formatspecial, $lowercase, $common, $stopWords = "", $lemmas = "", $consolidations = "", $specials = "", $type = 'default')
    switch ($type) {
        case 'default':
            // Decode HTML entities (both quote styles, UTF-8) before any other processing.
            $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
            // Debug output: the first 1000 bytes of the current text are written
            // straight to the response, unescaped. substr() counts bytes, so a
            // multibyte character may be cut at the boundary.
            print "<br /> Before lowercase <br />" . substr($string, 0, 1000) . "<br />";
            if ($lowercase == "on") {
                $string = strtolower($string);
                // Explicitly map these three capitals to their lower-case forms;
                // presumably because strtolower() does not handle them -- TODO confirm.
                $caparray = array("Æ", "Ð", "Þ");
                $lowarray = array("æ", "ð", "þ");
                $string = str_replace($caparray, $lowarray, $string);
            print "<br /> After lowercase, before special characters <br />" . substr($string, 0, 1000) . "<br />";
            // Either flag triggers formatSpecial(); both flags are forwarded to it.
            if ($formatspecial == "on" or $common == "on") {
                $string = formatSpecial($string, $formatspecial, $specials, $common, $lowercase);
            print "<br /> After special characters, before strip tags <br />" . substr($string, 0, 1000) . "<br />";
            if ($formatting == "on") {
                if ($tags == "keep") {
                    // strip_tags() removes the markup but leaves the text between tags.
                    $string = strip_tags($string);
                } else {
                    // Replaces each whole `<...>...</...>` match, inner text included,
                    // with an empty string. The U modifier inverts quantifier
                    // greediness, so the `.*?` groups act greedily here.
                    // NOTE(review): confirm that is intended.
                    $string = preg_replace("'<(.*?)>(.*?)</(.*?)>'U", "", $string);
            print "<br /> After strip tags, before remove punctuation <br />" . substr($string, 0, 1000) . "<br />";
            if ($punctuation == "on") {
                $string = removePunctuation($string);
            print "<br /> After remove punctuation, before remove digits <br />" . substr($string, 0, 1000) . "<br />";
            if ($digits == "on") {
                // range(0, 9) supplies the ten ASCII digits as search values; each is removed.
                $string = str_replace(range(0, 9), '', $string);
            print "<br /> After remove digits, before remove stopwords <br />" . substr($string, 0, 1000) . "<br />";
            if ($removeStopWords == "on") {
                $string = remove_stopWords($string, $stopWords);
            print "<br /> After remove stopwords, before lemmatize <br />" . substr($string, 0, 1000) . "<br />";
            if ($lemmatize == "on") {
                $string = lemmatize($string, $lemmas);
            print "<br /> After lemmatize, before consolidation <br />" . substr($string, 0, 1000) . "<br />";
            if ($consolidate == "on") {
                $string = consolidate($string, $consolidations);
            print "<br /> After consolidation <br />" . substr($string, 0, 1000) . "<br />";
            // Collapse every run of two or more whitespace characters into one
            // space (a lone tab or newline is left as is).
            $string = preg_replace("/\\s\\s+/", " ", $string);
            return $string;
        case 'xml':
            // Only stopword removal is applied here, regardless of $removeStopWords
            // and the other flags.
            $string = remove_stopWords($string, $stopWords);
        case 'sgml':
            // Same single step as the 'xml' case.
            // NOTE(review): no `break` is visible after 'xml', so if none exists
            // in the full source, 'xml' falls through and this runs a second time.
            $string = remove_stopWords($string, $stopWords);