<?php

include 'extract_keywords.php';
// Rebuild table keywords_from_userinfo: extract the search keywords from every
// usable referrer in userinfo and store one row per non-empty result.
ini_set("max_execution_time", 2400);
$db = mysql_connect("localhost", "recsys-nju", "recsys-nju");
if (!$db) {
    die(mysql_error());
}
if (!mysql_select_db("bagsok", $db)) {
    die(mysql_error($db));
}
// Empty the target table first. The table name must be quoted with backticks
// (or left unquoted): in single quotes MySQL parses it as a string literal and
// rejects the statement, so the table was never actually truncated.
if (!mysql_query("TRUNCATE TABLE `keywords_from_userinfo`", $db)) {
    die(mysql_error($db));
}
// Fetch the referrers worth parsing: non-empty and not pointing at mbaobao.
$refers = mysql_query("select refer from userinfo where refer is not null and refer <> '' and refer not like '%mbaobao%'", $db);
if (!$refers) {
    die(mysql_error($db));
}
// Number of keyword rows inserted successfully.
$keywords_num = 0;
while ($row = mysql_fetch_array($refers)) {
    $keyword = (string) extract_keywords($row['refer']);
    if ($keyword === '') {
        // nothing could be extracted from this referrer
        continue;
    }
    // Escape with the connection's own rules; addslashes() does not take the
    // connection character set into account.
    $insert_sql = "insert into keywords_from_userinfo (keywords) values('" . mysql_real_escape_string($keyword, $db) . "')";
    if (mysql_query($insert_sql, $db)) {
        $keywords_num++;
    } else {
        // Report the failing statement and keep going with the next row.
        echo $insert_sql;
        echo '<br>';
        echo mysql_error($db);
        echo '<br>';
    }
}
echo 'keywords_num = ' . $keywords_num;
mysql_close($db);
/**
 * Builds the keyword matrix for a text.
 *
 * The text is indexed with create_index(), its keywords are obtained with
 * extract_keywords(), and the result is either a matrix that maps every
 * keyword to itself with its recurrence (when $context_limit is 0) or the
 * output of sliding_window_ri().
 *
 * @param mixed $text       Text to analyse, passed straight to create_index().
 * @param mixed $stop_words Stop words, passed straight to create_index().
 * @return array Keyword matrix as described above.
 */
function extract_matrix($text, $stop_words)
{
    global $CONFIG;
    // Lift the execution time limit so that long runs do not time out.
    set_time_limit(0);
    // NOTE(review): $context_limit and $width_sliding_window are not assigned
    // in this function; presumably this config file defines them - confirm.
    include $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";

    // Index the text, then pull the keywords out of the recorded positions.
    $index = create_index($text, $stop_words);
    $found = extract_keywords($index["positions"]);

    if ($context_limit != 0) {
        // Contexts requested: delegate to the sliding window algorithm.
        return sliding_window_ri($found, $width_sliding_window, $index["text"]);
    }

    // No contexts requested: each keyword is only related to itself.
    $matrix = array();
    foreach ($found as $term => $occurrences) {
        $matrix[$term][$term] = $occurrences;
    }
    return $matrix;
}
Esempio n. 3
0
    die(mysql_error());
}
// Select the database that holds the userinfo table.
mysql_select_db('bagsok');
// Fetch every distinct, non-NULL referrer URL.
$result = mysql_query("SELECT DISTINCT refer FROM userinfo WHERE refer IS NOT NULL");
// Number of referrer rows processed.
$all = 0;
// Number of referrers that yielded at least one keyword.
$kcount = 0;
// Total number of keywords found over all referrers.
$number = 0;
if (!$result) {
    die('no result available');
} else {
    // Maps keyword => number of times it has been seen so far.
    $keyword_count = array();
    while ($row = mysql_fetch_array($result)) {
        $all++;
        // every row counts here, whether or not it contains keywords
        $search_url = $row[0];
        $keyword_str = extract_keywords($search_url);
        // Split the keyword string on single spaces and filter the pieces with
        // str_not_empty (defined outside this excerpt; presumably it drops
        // empty tokens - confirm).
        $keywords = explode(' ', $keyword_str);
        $keywords = array_filter($keywords, "str_not_empty");
        if (!count($keywords)) {
            continue;
        }
        $number += count($keywords);
        $kcount++;
        // increment the counter of referrers that produced keywords
        foreach ($keywords as $keyword) {
            if (isset($keyword_count[$keyword])) {
                $keyword_count[$keyword]++;
            } else {
                $keyword_count[$keyword] = 1;
            }
        }
Esempio n. 4
0
<?php

require 'extract_keywords.php';
require 'dbconfig.php';
// Render an HTML table with up to $TOTAL distinct referrers next to the
// keywords extracted from each, then print the share of rows with keywords.
$con = mysql_connect($db_host, $db_user, $db_pass);
if (!$con) {
    die(mysql_error());
}
mysql_select_db('bagsok');
$TOTAL = 2000;
$result = mysql_query("SELECT DISTINCT refer FROM userinfo LIMIT {$TOTAL}");
// Number of referrers for which keywords could be extracted.
$count = 0;
// Referrers are supplied by visitors' browsers, so they must be escaped before
// being written into the page. ENT_SUBSTITUTE (where available) keeps
// htmlspecialchars() from returning an empty string on invalid byte sequences.
$html_escape_flags = ENT_QUOTES | (defined('ENT_SUBSTITUTE') ? ENT_SUBSTITUTE : 0);
if (!$result) {
    die('no result available');
} else {
    echo '<table border="1px">';
    while ($row = mysql_fetch_array($result)) {
        $search_url = $row[0];
        echo '<tr><td>' . htmlspecialchars((string) $search_url, $html_escape_flags) . '</td><td>';
        $keywords = extract_keywords($search_url);
        if ($keywords) {
            $count++;
            echo htmlspecialchars((string) $keywords, $html_escape_flags);
        }
        echo '</td></tr>';
    }
    echo '</table>';
}
// The denominator is the requested limit, not the number of rows returned.
echo floatval($count) / $TOTAL;
mysql_close($con);
Esempio n. 5
0
    // Maps each individual keyword to the number of rows it occurred in;
    // filled while the table below is rendered.
    $kw_occr = array();
?>
<table border="1px">
<thead>
<tr>
<th>ID</th>
<th>Keyword string</th>
</tr>
</thead>
<tbody>
<?php
    // Full open tag: the short form "<?" only works when short_open_tag is on.
    // The values below derive from visitor-supplied referrers, so they are
    // escaped before being written into the page. ENT_SUBSTITUTE (where
    // available) keeps htmlspecialchars() from returning an empty string on
    // invalid byte sequences.
    $html_escape_flags = ENT_QUOTES | (defined('ENT_SUBSTITUTE') ? ENT_SUBSTITUTE : 0);
    while ($row = mysql_fetch_array($result)) {
        $userid = $row['userid'];
        $refer = $row['refer'];
        $real_refer = urldecode($refer);
        $keyword_string = extract_keywords($real_refer);
        if ($keyword_string) {
            $count++;
            // Tally every keyword of this row.
            $kw_set = keywords_array($keyword_string);
            foreach ($kw_set as $kw) {
                if (!array_key_exists($kw, $kw_occr)) {
                    $kw_occr[$kw] = 0;
                }
                $kw_occr[$kw] += 1;
            }
            echo '<tr><td>' . htmlspecialchars((string) $userid, $html_escape_flags)
                . '</td><td>' . htmlspecialchars((string) $keyword_string, $html_escape_flags)
                . '</td></tr>';
        }
    }
?>
</tbody>
</table>
Esempio n. 6
0
// Number of rows for which a keyword string could be extracted.
$count = 0;
if (!$result) {
    echo 'no result available';
} else {
    // STEP 1: pre-process
    // 1. get all keyword strings
    // 2. get all split keyword arrays
    mysql_query("BEGIN");
    // Keyword arrays kept for the later steps (one entry per accepted row).
    $all_splitted_keyword_sets = array();
    $all_rows = array();
    // store rows for later iteration
    // Sentinel values that no real cookie id / keyword string is expected to
    // equal, so the first row never compares as a repeat.
    $previous_cookie = 'thequickbrownfoxjumpsoverthelazydog';
    $previous_keywords = 'theluckymankissesthearrogantlady';
    while ($row = mysql_fetch_array($result)) {
        $current_cookie = $row['cookie_id'];
        $keyword_string = extract_keywords($row['keywords']);
        if ($keyword_string) {
            $count++;
            $kw_array = keywords_array($keyword_string);
            // Replace the raw keyword column with its split form before storing the row.
            $row['keywords'] = $kw_array;
            $all_rows[] = $row;
            // see if the keyword is from the same session
            // As written, a keyword set is kept only when BOTH the cookie differs
            // from the previous row's cookie AND the keyword string differs from
            // the last kept keyword string.
            // NOTE(review): if the intent is to drop only repeats within the same
            // session, this should probably be "||" - confirm.
            if ($current_cookie != $previous_cookie && $keyword_string != $previous_keywords) {
                $all_splitted_keyword_sets[] = $kw_array;
                $previous_keywords = $keyword_string;
            }
        }
        // Updated for every row, including rows without keywords.
        $previous_cookie = $current_cookie;
    }
    // $TOTAL and $THRESHOLD are defined outside this excerpt.
    echo "<p>entries with keywords/total entries: {$count} / {$TOTAL}</p>";
    echo "<p>Threshold = {$THRESHOLD}</p>";
Esempio n. 7
0
// Connect with the credentials held in $db_host / $db_user / $db_pass
// (defined outside this excerpt) and abort on failure.
$con = mysql_connect($db_host, $db_user, $db_pass);
if (!$con) {
    die(mysql_error());
}
mysql_select_db('bagsok');
// Skip the first 2000 rows with a referrer and read the next $TOTAL rows.
$result = mysql_query("SELECT id, refer FROM userinfo WHERE refer IS NOT NULL LIMIT 2000, {$TOTAL}");
// Number of rows for which a keyword string could be extracted.
$count = 0;
if (!$result) {
    echo 'no result available';
} else {
    // STEP 1: pre-process
    // 1. get all keyword strings
    // 2. get all split keyword arrays
    $all_splitted_keyword_sets = array();
    while ($row = mysql_fetch_array($result)) {
        $keyword_string = extract_keywords($row['refer']);
        if ($keyword_string) {
            $count++;
            $all_splitted_keyword_sets[] = keywords_array($keyword_string);
        }
    }
    echo "<p>entries with keywords/total entries: {$count} / {$TOTAL}</p>";
    // THRESHOLD is a constant defined outside this excerpt.
    echo "<p>Threshold = " . THRESHOLD . "</p>";
    // STEP 2: aggregate
    $kwset_occur_mapping = array();
    foreach ($all_splitted_keyword_sets as $splitted_keyword_set) {
        // Print the keyword set as a heading for the output that follows.
        echo '<h2>' . kwset_to_string($splitted_keyword_set) . '</h2>';
        // deal with keyword sets of size 1 first
        $current_generation = array();
        // NOTE(review): expand_dimension() is defined elsewhere; from the name
        // of the result it presumably yields the size-1 subsets - confirm.
        $size1_set = expand_dimension($splitted_keyword_set);
        foreach ($size1_set as $keyword_set) {
Esempio n. 8
0
        return $sec[1] - $first[1];
    }
    // Replace everything that is not a letter, an ASCII digit or a space with a
    // space, then collapse whitespace runs and trim the ends.
    // NOTE(review): the pattern uses \p{L} without the "u" modifier, so the
    // subject is matched byte-wise; multi-byte UTF-8 letters are probably not
    // handled as intended - confirm.
    $str = preg_replace('/[^\\p{L}0-9 ]/', ' ', $str);
    $str = trim(preg_replace('/\\s+/', ' ', $str));
    $words = explode(' ', $str);
    // Maps lower-cased word => array(word, occurrence count).
    $keywords = array();
    // Consume the words from the front until the list is empty.
    while (($c_word = array_shift($words)) !== null) {
        // strlen() counts bytes, so $minWordLen is a byte length here.
        if (strlen($c_word) < $minWordLen) {
            continue;
        }
        $c_word = strtolower($c_word);
        if (array_key_exists($c_word, $keywords)) {
            $keywords[$c_word][1]++;
        } else {
            $keywords[$c_word] = array($c_word, 1);
        }
    }
    // usort() reindexes the array, so the word keys are dropped here.
    // NOTE(review): keyword_count_sort is defined above this excerpt; its
    // visible tail returns $sec[1] - $first[1], which would sort by descending
    // count - confirm.
    usort($keywords, 'keyword_count_sort');
    $final_keywords = array();
    foreach ($keywords as $keyword_det) {
        // Stops at the first entry below the minimum; this relies on the
        // entries being ordered by descending count.
        if ($keyword_det[1] < $minWordOccurrences) {
            break;
        }
        array_push($final_keywords, $keyword_det[0]);
    }
    // Return the list itself or a comma-separated string, depending on $asArray.
    return $asArray ? $final_keywords : implode(', ', $final_keywords);
}
extract_keywords($str, 3, 2, false);
?>
	
/**
 * Builds the keyword matrix for a text.
 *
 * The text is indexed with create_index() and its keywords are obtained with
 * extract_keywords(). When the global $context_limit is 0 the matrix maps
 * every keyword to itself with its recurrence; otherwise the matrix is
 * whatever sliding_window_ri() returns for the global window width.
 *
 * @param mixed $text       Text to analyse, passed straight to create_index().
 * @param mixed $stop_words Stop words, passed straight to create_index().
 * @return array Keyword matrix as described above.
 */
function extract_matrix($text, $stop_words)
{
    global $width_sliding_window, $IndexingClassificationPath, $context_limit, $IOdir;

    // Index the text, then pull the keywords out of the recorded positions.
    $index = create_index($text, $stop_words);
    $found = extract_keywords($index["positions"]);

    if ($context_limit != 0) {
        // Contexts requested: delegate to the sliding window algorithm.
        return sliding_window_ri($found, $width_sliding_window, $index["text"]);
    }

    // No contexts requested: each keyword is only related to itself.
    $matrix = array();
    foreach ($found as $term => $occurrences) {
        $matrix[$term][$term] = $occurrences;
    }
    return $matrix;
}
// Run the statement prepared in $truncate_stmt (built outside this excerpt;
// presumably it empties the output table - confirm).
mysql_query($truncate_stmt);
// Read every page/keyword pair that has a non-empty keyword string, ordered by
// keyword string so that equal strings arrive on consecutive rows.
$raw_data_query = "SELECT url, keywords FROM pageflow_keywords WHERE keywords IS NOT NULL AND CHAR_LENGTH(keywords) > 0 ORDER BY keywords";
$result = mysql_query($raw_data_query);
// mysql_num_rows() needs a valid result resource; when the query failed,
// $result is false and calling it would only raise a warning.
$TOTAL = $result ? mysql_num_rows($result) : 0;
// Row counter used further down the script.
$count = 0;
if (!$result) {
    echo 'no result available';
    die;
} else {
    $stopwords = load_stopwords('stopwords.txt');
    // Maps keyword => counter; entries are created with 0 in the loop below.
    $keyword_occur = array();
    // Sentinel that no real keyword string is expected to equal, so the first
    // row is always treated as a new keyword string.
    $previous_keyword_string = 'thequickbrownfoxjumpsoverthelazydog';
    // Stopword-filtered keywords of the most recent distinct keyword string.
    $previous_keywords = array();
    while ($row = mysql_fetch_array($result)) {
        $query_string = $row['keywords'];
        $keyword_string = extract_keywords($query_string);
        $product = $row['url'];
        // Only redo the split and stopword filtering when the keyword string
        // differs from the previous row's; otherwise reuse the cached result.
        if ($keyword_string != $previous_keyword_string) {
            $keywords = keywords_array($keyword_string);
            $keywords = remove_stopwords($keywords, $stopwords);
            // Make sure every keyword has a counter before it is used.
            foreach ($keywords as $keyword) {
                if (!isset($keyword_occur[$keyword])) {
                    $keyword_occur[$keyword] = 0;
                }
            }
            $previous_keyword_string = $keyword_string;
            $previous_keywords = $keywords;
        }
        // Nothing left after stopword removal: skip this row.
        if (count($previous_keywords) == 0) {
            continue;
        }