<?php
/**
 * Rebuilds the keywords_from_userinfo table.
 *
 * Empties the table, then runs extract_keywords() over every non-empty
 * `refer` value in userinfo (skipping referrers that contain "mbaobao")
 * and inserts each non-empty keyword string as one row. Prints the number
 * of rows inserted, and the failing statement plus MySQL error for any
 * insert that fails.
 */
include 'extract_keywords.php';

// The full scan can take a long time; allow up to 2400 seconds.
ini_set("max_execution_time", 2400);

// connect to mysql
$db = mysql_connect("localhost", "recsys-nju", "recsys-nju");
if (!$db) {
    die(mysql_error());
}
if (!mysql_select_db("bagsok", $db)) {
    die(mysql_error($db));
}

// empty table keywords_from_userinfo
// The table name must be a bare or back-quoted identifier: wrapping it in
// single quotes makes it a string literal and MySQL rejects the statement,
// which left the old rows in place.
if (!mysql_query("TRUNCATE TABLE `keywords_from_userinfo`", $db)) {
    die(mysql_error($db));
}

// fetch refer from table userinfo, extract keywords
$refers = mysql_query(
    "select refer from userinfo where refer is not null and refer <> '' and refer not like '%mbaobao%'",
    $db
);
if (!$refers) {
    die(mysql_error($db));
}

$keywords_num = 0;
while ($row = mysql_fetch_array($refers)) {
    $keyword = (string) extract_keywords($row['refer']);
    if ($keyword === '') {
        // nothing extracted from this referrer
        continue;
    }

    // Escape with the connection-aware function rather than addslashes(),
    // which does not take the connection character set into account.
    $insert_sql = "insert into keywords_from_userinfo (keywords) values('"
        . mysql_real_escape_string($keyword, $db) . "')";
    $insert_result = mysql_query($insert_sql, $db);
    if ($insert_result) {
        // insert successfully
        $keywords_num++;
    } else {
        echo $insert_sql;
        echo '<br>';
        echo mysql_error();
        echo '<br>';
    }
}

echo 'keywords_num = ' . $keywords_num;
mysql_close($db);
/**
 * Builds the keyword/context matrix for a text.
 *
 * The text is indexed with create_index() (stop words excluded), the most
 * frequent keywords are taken from the index positions with
 * extract_keywords(), and then either:
 *  - each keyword is mapped to itself with its recurrence count
 *    (when $context_limit is 0), or
 *  - sliding_window_ri() is applied to the keywords to extract contexts.
 *
 * $context_limit and $width_sliding_window are not assigned in this function;
 * presumably the included config.php defines them — confirm against that file.
 *
 * @param mixed $text       Document text, passed straight to create_index().
 * @param mixed $stop_words Stop words, passed straight to create_index().
 * @return array Matrix indexed by keyword (diagonal-only when no contexts are
 *               requested), otherwise whatever sliding_window_ri() returns.
 */
function extract_matrix($text, $stop_words)
{
    global $CONFIG;

    // Lift the execution time limit so long documents do not time out.
    set_time_limit(0);
    include $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";

    // Per-word statistics (positions in the text) for every non-stop word.
    $index = create_index($text, $stop_words);
    // Most frequent keywords, up to the configured keyword limit.
    $top_keywords = extract_keywords($index["positions"]);

    if ($context_limit != 0) {
        // Apply the sliding-window algorithm to each keyword to extract its contexts.
        return sliding_window_ri($top_keywords, $width_sliding_window, $index["text"]);
    }

    // No contexts requested: each keyword only relates to itself.
    $matrix = array();
    foreach ($top_keywords as $word => $occurrences) {
        $matrix[$word][$word] = $occurrences;
    }

    return $matrix;
}
die(mysql_error());
}
mysql_select_db('bagsok');
// Fetch every distinct, non-NULL referrer URL.
$result = mysql_query("SELECT DISTINCT refer FROM userinfo WHERE refer IS NOT NULL");
$all = 0; // rows read from the result set
$kcount = 0; // rows that produced at least one keyword
$number = 0; // keywords summed over all rows
if (!$result) {
    die('no result available');
} else {
    $keyword_count = array(); // keyword => number of times it was seen
    while ($row = mysql_fetch_array($result)) {
        $all++; // increment the all-rows counter
        $search_url = $row[0];
        // The result is split on single spaces below, so extract_keywords()
        // is treated here as returning a space-separated string.
        $keyword_str = extract_keywords($search_url);
        $keywords = explode(' ', $keyword_str);
        // str_not_empty is defined outside this view; presumably it rejects
        // empty strings (explode() yields '' for an empty input) — confirm.
        $keywords = array_filter($keywords, "str_not_empty");
        if (!count($keywords)) {
            // nothing usable in this referrer
            continue;
        }
        $number += count($keywords);
        $kcount++; // increment the rows-with-keywords counter
        // Tally each keyword of this row.
        foreach ($keywords as $keyword) {
            if (isset($keyword_count[$keyword])) {
                $keyword_count[$keyword]++;
            } else {
                $keyword_count[$keyword] = 1;
            }
        }
<?php
/**
 * Diagnostic page for extract_keywords().
 *
 * Reads up to $TOTAL distinct `refer` values from userinfo, prints an HTML
 * table with each referrer next to the keywords extracted from it, and
 * finally prints the fraction of fetched rows that yielded keywords.
 */
require 'extract_keywords.php';
require 'dbconfig.php';

$con = mysql_connect($db_host, $db_user, $db_pass);
if (!$con) {
    die(mysql_error());
}
mysql_select_db('bagsok');

$TOTAL = 2000; // maximum number of referrers to sample
$result = mysql_query("SELECT DISTINCT refer FROM userinfo LIMIT {$TOTAL}");
$count = 0;   // rows for which keywords were extracted
$fetched = 0; // rows actually returned (may be fewer than $TOTAL)
if (!$result) {
    die('no result available');
} else {
    echo '<table border="1px">';
    while ($row = mysql_fetch_array($result)) {
        $fetched++;
        $search_url = $row[0];
        // Referrers are externally supplied data: escape before writing into HTML.
        // A single-byte charset is named so input in an unexpected encoding is
        // never rejected; only the HTML metacharacters are rewritten.
        echo '<tr><td>' . htmlspecialchars($search_url, ENT_QUOTES, 'ISO-8859-1') . '</td><td>';
        $keywords = extract_keywords($search_url);
        if ($keywords) {
            $count++;
            echo htmlspecialchars($keywords, ENT_QUOTES, 'ISO-8859-1');
        }
        echo '</td></tr>';
    }
    echo '</table>';
}

// Hit ratio over the rows that were really fetched. Dividing by $TOTAL
// under-reports the ratio whenever the table holds fewer than $TOTAL rows.
echo $fetched > 0 ? floatval($count) / $fetched : 0;
mysql_close($con);
$kw_occr = array(); // keyword => number of times it appeared across all rows
?>
<table border="1px">
    <thead>
        <tr>
            <th>ID</th>
            <th>Keyword string</th>
        </tr>
    </thead>
    <tbody>
<?php
// Full open tag: the short form "<?" is only parsed when short_open_tag is
// enabled, otherwise the loop below would be sent to the browser as text.
while ($row = mysql_fetch_array($result)) {
    $userid = $row['userid'];
    $refer = $row['refer'];
    // Referrers are stored URL-encoded; decode before extracting keywords.
    $real_refer = urldecode($refer);
    $keyword_string = extract_keywords($real_refer);
    if ($keyword_string) {
        $count++;
        $kw_set = keywords_array($keyword_string);
        foreach ($kw_set as $kw) {
            if (!array_key_exists($kw, $kw_occr)) {
                $kw_occr[$kw] = 0;
            }
            $kw_occr[$kw] += 1;
        }
        // Keywords originate from external referrer URLs: escape for HTML.
        // A single-byte charset is named so input in an unexpected encoding is
        // never rejected; only the HTML metacharacters are rewritten.
        echo '<tr><td>' . htmlspecialchars($userid, ENT_QUOTES, 'ISO-8859-1')
            . '</td><td>' . htmlspecialchars($keyword_string, ENT_QUOTES, 'ISO-8859-1')
            . '</td></tr>';
    }
}
?>
    </tbody>
</table>
$count = 0; // rows for which extract_keywords() returned something
if (!$result) {
    echo 'no result available';
} else {
    //STEP1: pre-process
    //1. get all keyword strings
    //2. get all split keyword arrays
    // Open a transaction on the current connection.
    mysql_query("BEGIN");
    $all_splitted_keyword_sets = array();
    $all_rows = array(); //store rows for later iteration
    // Sentinels that no real cookie id / keyword string is expected to equal,
    // so the very first row is never treated as a repeat.
    $previous_cookie = 'thequickbrownfoxjumpsoverthelazydog';
    $previous_keywords = 'theluckymankissesthearrogantlady';
    while ($row = mysql_fetch_array($result)) {
        $current_cookie = $row['cookie_id'];
        $keyword_string = extract_keywords($row['keywords']);
        if ($keyword_string) {
            $count++;
            $kw_array = keywords_array($keyword_string);
            // Replace the raw column value with the split keyword array.
            $row['keywords'] = $kw_array;
            $all_rows[] = $row;
            //see if the keyword is from the same session
            // NOTE(review): with &&, a set is only recorded when BOTH the cookie
            // and the keyword string differ from the previous ones, so a new
            // query from the same cookie is also dropped — confirm this is intended.
            if ($current_cookie != $previous_cookie && $keyword_string != $previous_keywords) {
                $all_splitted_keyword_sets[] = $kw_array;
                // Only updated when a set is recorded, not on every row.
                $previous_keywords = $keyword_string;
            }
        }
        // Updated for every row, including rows without keywords.
        $previous_cookie = $current_cookie;
    }
    // $TOTAL and $THRESHOLD are defined outside this view.
    echo "<p>entries with keywords/total entries: {$count} / {$TOTAL}</p>";
    echo "<p>Threshold = {$THRESHOLD}</p>";
// Connection settings ($db_host, $db_user, $db_pass) come from outside this view.
$con = mysql_connect($db_host, $db_user, $db_pass);
if (!$con) {
    die(mysql_error());
}
mysql_select_db('bagsok');
// "LIMIT 2000, N" skips the first 2000 matching rows and returns at most N.
$result = mysql_query("SELECT id, refer FROM userinfo WHERE refer IS NOT NULL LIMIT 2000, {$TOTAL}");
$count = 0; // rows for which extract_keywords() returned something
if (!$result) {
    echo 'no result available';
} else {
    //STEP1: pre-process
    //1. get all keyword strings
    //2. get all split keyword arrays
    $all_splitted_keyword_sets = array();
    while ($row = mysql_fetch_array($result)) {
        $keyword_string = extract_keywords($row['refer']);
        if ($keyword_string) {
            $count++;
            $all_splitted_keyword_sets[] = keywords_array($keyword_string);
        }
    }
    // The "total" printed here is the requested limit $TOTAL, not the number
    // of rows the query actually returned.
    echo "<p>entries with keywords/total entries: {$count} / {$TOTAL}</p>";
    echo "<p>Threshold = " . THRESHOLD . "</p>";
    //STEP2: aggregate
    $kwset_occur_mapping = array();
    foreach ($all_splitted_keyword_sets as $splitted_keyword_set) {
        // Heading for this keyword set.
        // NOTE(review): kwset_to_string() output is echoed into HTML as-is.
        echo '<h2>' . kwset_to_string($splitted_keyword_set) . '</h2>';
        //deal with keyword sets of size 1 first
        $current_generation = array();
        // expand_dimension() is defined outside this view; presumably it yields
        // the size-1 subsets of the keyword set — confirm.
        $size1_set = expand_dimension($splitted_keyword_set);
        foreach ($size1_set as $keyword_set) {
return $sec[1] - $first[1]; // difference of the two entries' counts (index 1), second minus first
}
// Replace every character that is not a letter (\p{L}), a digit 0-9 or a space
// with a space, then collapse whitespace runs and trim the ends.
// NOTE(review): the pattern has no "u" modifier, so multi-byte UTF-8 input is
// presumably matched byte-wise — confirm the expected input encoding.
$str = preg_replace('/[^\\p{L}0-9 ]/', ' ', $str);
$str = trim(preg_replace('/\\s+/', ' ', $str));
$words = explode(' ', $str);
$keywords = array(); // lower-cased word => array(word, occurrence count)
// array_shift() returns null once $words is empty, which ends the loop.
while (($c_word = array_shift($words)) !== null) {
    // strlen() counts bytes, so the minimum length is a byte length.
    if (strlen($c_word) < $minWordLen) {
        continue;
    }
    $c_word = strtolower($c_word);
    if (array_key_exists($c_word, $keywords)) {
        $keywords[$c_word][1]++;
    } else {
        $keywords[$c_word] = array($c_word, 1);
    }
}
// usort() discards the word keys; entries are ordered by keyword_count_sort.
usort($keywords, 'keyword_count_sort');
$final_keywords = array();
foreach ($keywords as $keyword_det) {
    // Stops at the first entry below the threshold; this relies on the sort
    // above putting higher counts first (presumably what keyword_count_sort
    // does — its full definition is outside this view).
    if ($keyword_det[1] < $minWordOccurrences) {
        break;
    }
    array_push($final_keywords, $keyword_det[0]);
}
// Either the list of words, or the words joined with ", ".
return $asArray ? $final_keywords : implode(', ', $final_keywords);
}
// The return value of this call is neither stored nor printed.
extract_keywords($str, 3, 2, false);
?>
/**
 * Builds the keyword/context matrix for a text.
 *
 * Indexes the text with create_index() (stop words excluded), extracts the
 * most frequent keywords from the index positions, and then either maps each
 * keyword to itself with its recurrence count (global $context_limit equal
 * to 0) or delegates to sliding_window_ri() with the global window width to
 * extract the context of each keyword.
 *
 * @param mixed $text       Document text, passed straight to create_index().
 * @param mixed $stop_words Stop words, passed straight to create_index().
 * @return array Diagonal keyword => keyword => recurrence matrix, or the
 *               result of sliding_window_ri().
 */
function extract_matrix($text, $stop_words)
{
    global $width_sliding_window, $IndexingClassificationPath, $context_limit, $IOdir;

    // Per-word statistics (positions in the text) for every non-stop word.
    $index = create_index($text, $stop_words);
    // Most frequent keywords, up to the configured keyword limit.
    $top_keywords = extract_keywords($index["positions"]);

    if ($context_limit != 0) {
        // Sliding-window pass over each keyword to collect its contexts.
        return sliding_window_ri($top_keywords, $width_sliding_window, $index["text"]);
    }

    // Contexts disabled: every keyword is related only to itself.
    $matrix = array();
    foreach ($top_keywords as $word => $occurrences) {
        $matrix[$word][$word] = $occurrences;
    }

    return $matrix;
}
// Run the truncate statement prepared outside this view.
mysql_query($truncate_stmt);
// Rows come back ordered by keywords, so equal query strings are adjacent;
// the loop below reuses the previous row's parsed keywords when the
// extracted string has not changed.
$raw_data_query = "SELECT url, keywords FROM pageflow_keywords WHERE keywords IS NOT NULL AND CHAR_LENGTH(keywords) > 0 ORDER BY keywords";
$result = mysql_query($raw_data_query);
// NOTE(review): mysql_num_rows() is called before the !$result check below,
// so a failed query reaches it with a non-resource argument.
$TOTAL = mysql_num_rows($result);
$count = 0;
if (!$result) {
    echo 'no result available';
    die;
} else {
    $stopwords = load_stopwords('stopwords.txt');
    $keyword_occur = array(); // keyword => counter, initialised to 0 on first sight
    // Sentinel that no real keyword string is expected to equal, so the first
    // row always takes the "changed" branch.
    $previous_keyword_string = 'thequickbrownfoxjumpsoverthelazydog';
    $previous_keywords = array();
    while ($row = mysql_fetch_array($result)) {
        $query_string = $row['keywords'];
        $keyword_string = extract_keywords($query_string);
        $product = $row['url'];
        // Only re-split and re-filter when the extracted string differs from
        // the previous row's.
        if ($keyword_string != $previous_keyword_string) {
            $keywords = keywords_array($keyword_string);
            $keywords = remove_stopwords($keywords, $stopwords);
            // Make sure every keyword has a counter entry.
            foreach ($keywords as $keyword) {
                if (!isset($keyword_occur[$keyword])) {
                    $keyword_occur[$keyword] = 0;
                }
            }
            $previous_keyword_string = $keyword_string;
            $previous_keywords = $keywords;
        }
        // Skip rows whose keyword list is empty after stop-word removal.
        if (count($previous_keywords) == 0) {
            continue;
        }