echo " error: bad url, timeout, redirect loop "; } if ($result['http_code'] != 200) { echo "error: no page, no permissions, no service"; } $page = $result['content']; $content_type = $result['content_type']; $mime = $result['mime']; $charset = $result['charset']; echo "This site is encoded with" . " " . "<b>" . $charset . "</b>" . " " . "format" . "<br>"; $utf8_text = $result['utf8_text']; $text = strip_html_tags($page); $utf8_text = html_entity_decode($text, ENT_QUOTES, "utf-8"); $utf8_text = strip_punctuation($utf8_text); $utf8_text = strip_symbols($utf8_text); $utf8_text = strip_numbers($utf8_text); mb_regex_encoding("utf-8"); $words = mb_split(' +', $utf8_text); foreach ($words as $key => $word) { $words[$key] = PorterStemmer::Stem($word, true); } $stopWords = mb_split('[ \\n]+', mb_strtolower($words[$key], 'utf-8')); foreach ($stopWords as $key => $word) { $stopWords[$key] = PorterStemmer::Stem($word, true); } $words = array_diff($words, $stopWords); $keywordCounts = array_count_values($words); arsort($keywordCounts, SORT_NUMERIC); $uniqueKeywords = array_keys($keywordCounts); echo "The keywords are" . "<br>"; foreach ($uniqueKeywords as $value) {
/**
 * Indexes the words of one stored page into the `word` and `occurrence` tables.
 *
 * The page's `contents` column is stripped of markup, entities, punctuation,
 * symbols and numbers (via the strip_* helpers, which are defined outside this
 * block), lower-cased and split into words. Stop words are dropped; every
 * remaining distinct word is looked up in `word` (and inserted when missing),
 * and one `occurrence` row linking the word to the page is added.
 *
 * Words are processed in descending order of frequency within the page, so
 * newly created word ids follow that order. Each distinct word yields exactly
 * one occurrence row per call, regardless of how often it appears.
 *
 * NOTE(review): calling this twice for the same page adds the occurrence rows
 * twice; nothing here clears earlier rows. Confirm whether callers rely on that.
 *
 * @param int|string $page_id Value matched against `page.id`.
 *
 * @return string Always the literal string "Index".
 */
function Index($page_id)
{
    global $db;

    // Built once per request and flipped so membership is an O(1) isset().
    // Matching whole words (instead of running \b-regexes over every keyword)
    // keeps contractions and non-ASCII words intact: "it's" is dropped as a
    // unit rather than reduced to "'s", and "soñar" is not cut down to "ñar".
    static $stopWords = null;
    if ($stopWords === null) {
        $stopWords = array_flip(array(
            'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am', 'among',
            'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but',
            'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else',
            'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he',
            'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into',
            'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
            'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off',
            'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
            'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their',
            'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas',
            'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
            'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',
            "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't",
            "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd",
            "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've",
            "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll",
            "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd",
            "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's",
            "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll",
            "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've",
        ));
    }

    // Every value goes through a bound parameter; nothing is interpolated
    // into SQL, so quotes inside words (e.g. "o'reilly") are safe on any driver.
    $selectPage = $db->prepare('SELECT contents FROM page WHERE id = ?');
    $selectPage->execute(array($page_id));
    $pages = $selectPage->fetchAll(PDO::FETCH_ASSOC);

    // Prepared once, executed once per word.
    $selectWord = $db->prepare('SELECT word_id FROM word WHERE word_word = ?');
    $insertWord = $db->prepare('INSERT INTO word (word_word) VALUES (?)');
    $insertOccurrence = $db->prepare('INSERT INTO occurrence (word_id, page_id) VALUES (?, ?)');

    foreach ($pages as $page) {
        $text = strip_html_tags($page['contents']);
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
        $text = strip_punctuation($text);
        $text = strip_symbols($text);
        $text = strip_numbers($text);
        $text = mb_strtolower($text, 'UTF-8');

        // Split on any run of Unicode whitespace (tabs, newlines, NBSP from
        // &nbsp;), discarding empty tokens. preg_split() returns false when the
        // text is not valid UTF-8; fall back to a plain space split then.
        $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            $words = explode(' ', $text);
        }

        // Most frequent words first, one entry per distinct word.
        $keywordCounts = array_count_values($words);
        arsort($keywordCounts, SORT_NUMERIC);

        foreach (array_keys($keywordCounts) as $keyword) {
            // Array keys that look like integers come back as int.
            $keyword = (string) $keyword;

            if ($keyword === '' || isset($stopWords[$keyword])) {
                continue;
            }

            // Resolve the id afresh for every word so a value from a previous
            // iteration can never leak into this one.
            $selectWord->execute(array($keyword));
            $wordId = $selectWord->fetchColumn();
            $selectWord->closeCursor(); // release the result set before reuse

            if ($wordId === false) {
                $insertWord->execute(array($keyword));
                $wordId = $db->lastInsertId();
            }

            $insertOccurrence->execute(array($wordId, $page_id));
        }
    }

    return "Index";
}