Beispiel #1
0
     echo " error: bad url, timeout, redirect loop ";
 }
 if ($result['http_code'] != 200) {
     echo "error: no page, no permissions, no service";
 }
 $page = $result['content'];
 $content_type = $result['content_type'];
 $mime = $result['mime'];
 $charset = $result['charset'];
 echo "This site is encoded with" . " " . "<b>" . $charset . "</b>" . " " . "format" . "<br>";
 $utf8_text = $result['utf8_text'];
 $text = strip_html_tags($page);
 $utf8_text = html_entity_decode($text, ENT_QUOTES, "utf-8");
 $utf8_text = strip_punctuation($utf8_text);
 $utf8_text = strip_symbols($utf8_text);
 $utf8_text = strip_numbers($utf8_text);
 mb_regex_encoding("utf-8");
 $words = mb_split(' +', $utf8_text);
 foreach ($words as $key => $word) {
     $words[$key] = PorterStemmer::Stem($word, true);
 }
 $stopWords = mb_split('[ \\n]+', mb_strtolower($words[$key], 'utf-8'));
 foreach ($stopWords as $key => $word) {
     $stopWords[$key] = PorterStemmer::Stem($word, true);
 }
 $words = array_diff($words, $stopWords);
 $keywordCounts = array_count_values($words);
 arsort($keywordCounts, SORT_NUMERIC);
 $uniqueKeywords = array_keys($keywordCounts);
 echo "The keywords are" . "<br>";
 foreach ($uniqueKeywords as $value) {
Beispiel #2
0
/**
 * Indexes the keywords of a page into the `word` and `occurrence` tables.
 *
 * Loads `contents` from the `page` table for the given id, reduces it to
 * lower-cased plain text via the project's strip_* helpers, removes English
 * stop words, and then records one `occurrence` row per remaining distinct
 * keyword (creating the `word` row first when it does not exist yet).
 *
 * Uses the global `$db` connection, which must offer the PDO API
 * (prepare(), lastInsertId()).
 *
 * @param mixed $page_id Id of the page row to index; bound as a query
 *                       parameter, never interpolated into SQL.
 *
 * @return string Always the literal string "Index".
 */
function Index($page_id)
{
    global $db;
    // Stop-word patterns (plain words and contractions), matched case-insensitively
    // on word boundaries. NOTE(review): the patterns carry no /u modifier, so \b is
    // evaluated byte-wise on UTF-8 text — confirm this is acceptable for non-ASCII pages.
    $needles = array("/\\ba\\b/i", "/\\bable\\b/i", "/\\babout\\b/i", "/\\bacross\\b/i", "/\\bafter\\b/i", "/\\ball\\b/i", "/\\balmost\\b/i", "/\\balso\\b/i", "/\\bam\\b/i", "/\\bamong\\b/i", "/\\ban\\b/i", "/\\band\\b/i", "/\\bany\\b/i", "/\\bare\\b/i", "/\\bas\\b/i", "/\\bat\\b/i", "/\\bbe\\b/i", "/\\bbecause\\b/i", "/\\bbeen\\b/i", "/\\bbut\\b/i", "/\\bby\\b/i", "/\\bcan\\b/i", "/\\bcannot\\b/i", "/\\bcould\\b/i", "/\\bdear\\b/i", "/\\bdid\\b/i", "/\\bdo\\b/i", "/\\bdoes\\b/i", "/\\beither\\b/i", "/\\belse\\b/i", "/\\bever\\b/i", "/\\bevery\\b/i", "/\\bfor\\b/i", "/\\bfrom\\b/i", "/\\bget\\b/i", "/\\bgot\\b/i", "/\\bhad\\b/i", "/\\bhas\\b/i", "/\\bhave\\b/i", "/\\bhe\\b/i", "/\\bher\\b/i", "/\\bhers\\b/i", "/\\bhim\\b/i", "/\\bhis\\b/i", "/\\bhow\\b/i", "/\\bhowever\\b/i", "/\\bi\\b/i", "/\\bif\\b/i", "/\\bin\\b/i", "/\\binto\\b/i", "/\\bis\\b/i", "/\\bit\\b/i", "/\\bits\\b/i", "/\\bjust\\b/i", "/\\bleast\\b/i", "/\\blet\\b/i", "/\\blike\\b/i", "/\\blikely\\b/i", "/\\bmay\\b/i", "/\\bme\\b/i", "/\\bmight\\b/i", "/\\bmost\\b/i", "/\\bmust\\b/i", "/\\bmy\\b/i", "/\\bneither\\b/i", "/\\bno\\b/i", "/\\bnor\\b/i", "/\\bnot\\b/i", "/\\bof\\b/i", "/\\boff\\b/i", "/\\boften\\b/i", "/\\bon\\b/i", "/\\bonly\\b/i", "/\\bor\\b/i", "/\\bother\\b/i", "/\\bour\\b/i", "/\\bown\\b/i", "/\\brather\\b/i", "/\\bsaid\\b/i", "/\\bsay\\b/i", "/\\bsays\\b/i", "/\\bshe\\b/i", "/\\bshould\\b/i", "/\\bsince\\b/i", "/\\bso\\b/i", "/\\bsome\\b/i", "/\\bthan\\b/i", "/\\bthat\\b/i", "/\\bthe\\b/i", "/\\btheir\\b/i", "/\\bthem\\b/i", "/\\bthen\\b/i", "/\\bthere\\b/i", "/\\bthese\\b/i", "/\\bthey\\b/i", "/\\bthis\\b/i", "/\\btis\\b/i", "/\\bto\\b/i", "/\\btoo\\b/i", "/\\btwas\\b/i", "/\\bus\\b/i", "/\\bwants\\b/i", "/\\bwas\\b/i", "/\\bwe\\b/i", "/\\bwere\\b/i", "/\\bwhat\\b/i", "/\\bwhen\\b/i", "/\\bwhere\\b/i", "/\\bwhich\\b/i", "/\\bwhile\\b/i", "/\\bwho\\b/i", "/\\bwhom\\b/i", "/\\bwhy\\b/i", "/\\bwill\\b/i", "/\\bwith\\b/i", "/\\bwould\\b/i", "/\\byet\\b/i", "/\\byou\\b/i", 
"/\\byour\\b/i", "/\\bain't\\b/i", "/\\baren't\\b/i", "/\\bcan't\\b/i", "/\\bcould've\\b/i", "/\\bcouldn't\\b/i", "/\\bdidn't\\b/i", "/\\bdoesn't\\b/i", "/\\bdon't\\b/i", "/\\bhasn't\\b/i", "/\\bhe'd\\b/i", "/\\bhe'll\\b/i", "/\\bhe's\\b/i", "/\\bhow'd\\b/i", "/\\bhow'll\\b/i", "/\\bhow's\\b/i", "/\\bi'd\\b/i", "/\\bi'll\\b/i", "/\\bi'm\\b/i", "/\\bi've\\b/i", "/\\bisn't\\b/i", "/\\bit's\\b/i", "/\\bmight've\\b/i", "/\\bmightn't\\b/i", "/\\bmust've\\b/i", "/\\bmustn't\\b/i", "/\\bshan't\\b/i", "/\\bshe'd\\b/i", "/\\bshe'll\\b/i", "/\\bshe's\\b/i", "/\\bshould've\\b/i", "/\\bshouldn't\\b/i", "/\\bthat'll\\b/i", "/\\bthat's\\b/i", "/\\bthere's\\b/i", "/\\bthey'd\\b/i", "/\\bthey'll\\b/i", "/\\bthey're\\b/i", "/\\bthey've\\b/i", "/\\bwasn't\\b/i", "/\\bwe'd\\b/i", "/\\bwe'll\\b/i", "/\\bwe're\\b/i", "/\\bweren't\\b/i", "/\\bwhat'd\\b/i", "/\\bwhat's\\b/i", "/\\bwhen'd\\b/i", "/\\bwhen'll\\b/i", "/\\bwhen's\\b/i", "/\\bwhere'd\\b/i", "/\\bwhere'll\\b/i", "/\\bwhere's\\b/i", "/\\bwho'd\\b/i", "/\\bwho'll\\b/i", "/\\bwho's\\b/i", "/\\bwhy'd\\b/i", "/\\bwhy'll\\b/i", "/\\bwhy's\\b/i", "/\\bwon't\\b/i", "/\\bwould've\\b/i", "/\\bwouldn't\\b/i", "/\\byou'd\\b/i", "/\\byou'll\\b/i", "/\\byou're\\b/i", "/\\byou've\\b/i");

    // All values go through bound parameters; nothing is interpolated into SQL.
    $selectPage = $db->prepare("SELECT contents FROM page WHERE id = ?");
    $selectPage->execute(array($page_id));
    $page_contents = $selectPage->fetchAll(PDO::FETCH_ASSOC);

    // Prepared once and re-executed for every keyword.
    $selectWord = $db->prepare("SELECT word_id FROM word WHERE word_word = ?");
    $insertWord = $db->prepare("INSERT INTO word (word_word) VALUES (?)");
    $insertOccurrence = $db->prepare("INSERT INTO occurrence (word_id, page_id) VALUES (?, ?)");

    foreach ($page_contents as $page) {
        $utf8_text = strip_html_tags($page['contents']);
        $utf8_text = html_entity_decode($utf8_text, ENT_QUOTES, "UTF-8");
        $text = strip_punctuation($utf8_text);
        $text = strip_symbols($text);
        $text = strip_numbers($text);
        $text = mb_strtolower($text, "utf-8");

        // Split on any run of whitespace so words separated by newlines or tabs
        // are not glued together, and no empty tokens are produced.
        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            continue;
        }

        // Distinct keywords, most frequent first (this fixes the insertion order).
        $keywordCounts = array_count_values($words);
        arsort($keywordCounts, SORT_NUMERIC);
        $uniqueKeywords = array_keys($keywordCounts);

        // Remove stop words; a keyword that is a stop word becomes "".
        $uniqueKeywords = preg_replace($needles, "", $uniqueKeywords);
        if (!is_array($uniqueKeywords)) {
            continue;
        }

        foreach ($uniqueKeywords as $cur_word) {
            // Text is already lower-cased with mb_strtolower above; no byte-wise
            // strtolower/addslashes is needed now that values are bound.
            $cur_word = (string) $cur_word;
            if ($cur_word === "") {
                continue;
            }

            $selectWord->execute(array($cur_word));
            $word_id = $selectWord->fetchColumn();
            $selectWord->closeCursor();

            if ($word_id === false) {
                // Unknown word: create it and use the freshly generated id, so a
                // word_id from a previous iteration can never be reused by mistake.
                $insertWord->execute(array($cur_word));
                $word_id = $db->lastInsertId();
            }

            $insertOccurrence->execute(array($word_id, $page_id));
        }
    }

    return "Index";
}