Ejemplo n.º 1
0
/**
 * Reduces a tweet/profile string to a space-separated list of content tokens.
 *
 * The input is lower-cased and split on whitespace. A token is dropped when it
 *  - starts with "http", or starts with "www" and is at least four bytes long;
 *  - starts with "@";
 *  - is empty once removePunctuation() has been applied;
 *  - equals "rt";
 *  - is reported as a stopword by isStopword().
 *
 * @param string $profileString Raw text to clean.
 *
 * @return string The surviving tokens, each followed by a single space (a
 *                non-empty result therefore ends with a trailing space).
 */
function stripTweet($profileString)
{
    $lower = strtolower($profileString);

    // PREG_SPLIT_NO_EMPTY: leading/trailing whitespace or an empty input would
    // otherwise produce "" tokens, and reading offset 0 of "" below raises an
    // "Uninitialized string offset" notice/warning.
    $split = preg_split('/\\s+/', $lower, -1, PREG_SPLIT_NO_EMPTY);
    if ($split === false) {
        // preg_split() signals a PCRE failure with false; nothing to tokenise.
        return "";
    }

    $result = "";
    foreach ($split as $tokenWithPunctuation) {
        // Link detection, with the original precedence made explicit:
        // (long enough AND starts with "www") OR starts with "http".
        $looksLikeWww = strlen($tokenWithPunctuation) >= 4
            && substr($tokenWithPunctuation, 0, 3) === "www";
        $looksLikeHttp = substr($tokenWithPunctuation, 0, 4) === "http";
        if ($looksLikeWww || $looksLikeHttp) {
            continue;
        }

        // Tokens are guaranteed non-empty here, so offset 0 always exists.
        if ($tokenWithPunctuation[0] === "@") {
            continue;
        }

        $token = removePunctuation($tokenWithPunctuation);
        // Loose comparison kept on purpose so a null/false result from the
        // helper is treated like an empty token.
        if ($token == "") {
            continue;
        }
        if ($token === "rt") {
            continue;
        }
        if (isStopword($token)) {
            continue;
        }

        $result .= $token . " ";
    }

    return $result;
}
Ejemplo n.º 2
0
/**
 * Runs a configurable text-scrubbing pipeline over a string.
 *
 * Every switch parameter is compared loosely against the string "on". For
 * $type 'default' the steps run in this order: HTML entity decoding,
 * lower-casing, special-character formatting, markup removal, punctuation
 * removal, digit removal, stopword removal, lemmatisation, consolidation and
 * finally whitespace collapsing. A debug snapshot (first 1000 bytes of the
 * working string) is printed between the steps.
 *
 * For $type 'xml' and 'sgml' only stopword removal followed by strip_tags()
 * is applied.
 *
 * @param $string
 *	The text to scrub.
 *
 * @param $formatting
 *	"on" to remove markup; how it is removed depends on $tags.
 *
 * @param $tags
 *	"keep" strips the tags with strip_tags() and keeps the text between them;
 *	any other value removes whole <x>...</x> elements including their content.
 *
 * @param $punctuation
 *	"on" to pass the text through removePunctuation().
 *
 * @param $digits
 *	"on" to delete the ASCII digits 0-9.
 *
 * @param $removeStopWords
 *	"on" to pass the text through remove_stopWords() with $stopWords.
 *
 * @param $lemmatize
 *	"on" to pass the text through lemmatize() with $lemmas.
 *
 * @param $consolidate
 *	"on" to pass the text through consolidate() with $consolidations.
 *
 * @param $formatspecial
 *	"on" (or $common "on") triggers formatSpecial(); the value is forwarded.
 *
 * @param $lowercase
 *	"on" to lower-case the text (strtolower() plus an explicit mapping of
 *	the capitals AE-ligature, eth and thorn); also forwarded to formatSpecial().
 *
 * @param $common
 *	"on" (or $formatspecial "on") triggers formatSpecial(); the value is forwarded.
 *
 * @param $stopWords
 *	Forwarded to remove_stopWords().
 *
 * @param $lemmas
 *	Forwarded to lemmatize().
 *
 * @param $consolidations
 *	Forwarded to consolidate().
 *
 * @param $specials
 *	Forwarded to formatSpecial().
 *
 * @param $type
 *	The type of file being scrubbed: 'default', 'xml' or 'sgml'.
 *
 * @return string|null
 *	The scrubbed text, or null when $type is not one of the handled values.
 */
function scrub_text($string, $formatting, $tags, $punctuation, $digits, $removeStopWords, $lemmatize, $consolidate, $formatspecial, $lowercase, $common, $stopWords = "", $lemmas = "", $consolidations = "", $specials = "", $type = 'default')
{
    switch ($type) {
        case 'default':
            // No charset conversion happens here: entities are decoded as UTF-8.
            // (A utf8_encode() call whose result was discarded used to sit here;
            // it had no effect and the function is deprecated since PHP 8.2.)
            $string = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
            // NOTE(review): the snapshots below are written straight to the output
            // without escaping; they are kept because callers may rely on them.
            print "<br /> Before lowercase <br />" . substr($string, 0, 1000) . "<br />";
            if ($lowercase == "on") {
                $string = strtolower($string);
                // strtolower() is byte-wise, so these multi-byte capitals are
                // mapped explicitly.
                $caparray = array("Æ", "Ð", "Þ");
                $lowarray = array("æ", "ð", "þ");
                $string = str_replace($caparray, $lowarray, $string);
            }
            print "<br /> After lowercase, before special characters <br />" . substr($string, 0, 1000) . "<br />";
            if ($formatspecial == "on" or $common == "on") {
                $string = formatSpecial($string, $formatspecial, $specials, $common, $lowercase);
            }
            print "<br /> After special characters, before strip tags <br />" . substr($string, 0, 1000) . "<br />";
            if ($formatting == "on") {
                if ($tags == "keep") {
                    // Drop the tags themselves, keep the enclosed text.
                    $string = strip_tags($string);
                } else {
                    // Drop each element together with its content. The quantifiers
                    // must stay lazy: combining ".*?" with the U modifier inverts
                    // them to greedy, which swallowed the text between elements.
                    $string = preg_replace("'<(.*?)>(.*?)</(.*?)>'", "", $string);
                }
            }
            print "<br /> After strip tags, before remove punctuation <br />" . substr($string, 0, 1000) . "<br />";
            if ($punctuation == "on") {
                $string = removePunctuation($string);
            }
            print "<br /> After remove punctuation, before remove digits <br />" . substr($string, 0, 1000) . "<br />";
            if ($digits == "on") {
                $string = str_replace(range(0, 9), '', $string);
            }
            print "<br /> After remove digits, before remove stopwords <br />" . substr($string, 0, 1000) . "<br />";
            if ($removeStopWords == "on") {
                $string = remove_stopWords($string, $stopWords);
            }
            print "<br /> After remove stopwords, before lemmatize <br />" . substr($string, 0, 1000) . "<br />";
            if ($lemmatize == "on") {
                $string = lemmatize($string, $lemmas);
            }
            print "<br /> After lemmatize, before consolidation <br />" . substr($string, 0, 1000) . "<br />";
            if ($consolidate == "on") {
                $string = consolidate($string, $consolidations);
            }
            print "<br /> After consolidation <br />" . substr($string, 0, 1000) . "<br />";
            // Clean extra spaces
            $string = preg_replace("/\\s\\s+/", " ", $string);
            return $string;
        case 'xml':
        case 'sgml':
            // Both markup types get the same treatment: stopwords out, then tags
            // out. The stripped text has to be captured and returned; discarding
            // the strip_tags() result made these branches return null.
            $string = remove_stopWords($string, $stopWords);
            return strip_tags($string);
    }
}