/**
 * Rebuilds the keyword/item statistics tables from the logged search queries.
 *
 * The method
 *  1. executes rec_tables.sql and rec_tables_additional.sql,
 *  2. segments every row of $tables['query'] into keywords and pairs each
 *     keyword with every item listed for that query in $tables['query_item'],
 *  3. stores keyword occurrence counts in Keyword, raw co-occurrence counts in
 *     weight_matrix and the derived ratings in rating_matrix,
 *  4. calls wordAssociationWithJaccardPreprocess() when $this->name equals
 *     KEY_LINK_JACCARD.
 *
 * @param array $tables    Table names; the keys 'query' and 'query_item' are read here.
 * @param mixed $startTime Not used by this implementation.
 * @return void
 */
public function preprocess($tables, $startTime = null) {
    $word_segmenter = new WordSegmenter();
    $this->dm->executeSqlFile(__DIR__ . "/rec_tables.sql");
    $this->dm->executeSqlFile(__DIR__ . "/rec_tables_additional.sql");

    /*
     * $weight_matrix is an array of arrays: keyword => (item => count), e.g.
     * array(
     *     'keyword1' => array('product1' => 1, 'product2' => 2),
     *     'keyword2' => array('product1' => 3, 'product3' => 5),
     * )
     */
    $weight_matrix = array();
    // Number of times each keyword was produced by the segmenter.
    $keyword_count = array();

    $query_results = $this->dm->query("select id, query from " . $tables['query']);
    while ($query_row = mysql_fetch_array($query_results)) {
        /*
         * K = keywords of this query, P = items recorded for this query.
         * Every (k, p) in K x P gets its counter in $weight_matrix incremented.
         */
        $keywords = $word_segmenter->segmentWords($query_row['query']);

        // itemId is actually the item name, not a numeric id.
        $item_results = $this->dm->query('select itemId from ' . $tables['query_item'] . ' where queryId = ' . $query_row['id'] . ';');
        $items = array();
        while ($item_row = mysql_fetch_array($item_results)) {
            $items[] = $item_row['itemId'];
        }

        foreach ($keywords as $keyword) {
            if (isset($keyword_count[$keyword])) {
                $keyword_count[$keyword] += 1;
            } else {
                $keyword_count[$keyword] = 1;
            }

            if (!array_key_exists($keyword, $weight_matrix)) {
                $weight_matrix[$keyword] = array();
            }
            foreach ($items as $item) {
                if (!array_key_exists($item, $weight_matrix[$keyword])) {
                    $weight_matrix[$keyword][$item] = 0;
                }
                $weight_matrix[$keyword][$item] += 1;
            }
        }
    }

    /*
     * Persist the keyword counts; they are read back below to build the
     * ratings and are also used for keyword expansion.
     */
    foreach ($keyword_count as $key => $key_count) {
        $escaped_keyword = addslashes($key);
        $this->dm->query("insert into Keyword (keyword, count) values('" . $escaped_keyword . "', " . $key_count . " )");
    }

    /*
     * Persist $weight_matrix. Expected table layout:
     * create table weight_matrix (
     *     id integer primary key,
     *     keyword varchar,
     *     item varchar,
     *     weight integer,
     *     unique (keyword, item)
     * );
     */
    $this->dm->query('truncate weight_matrix;');
    foreach ($weight_matrix as $keyword => $weight_array) {
        $escaped_keyword = addslashes($keyword);
        foreach ($weight_array as $item => $weight) {
            // Item names are free text, so they need the same escaping as keywords.
            // NOTE(review): addslashes() mirrors the keyword escaping already used here;
            // switch to a connection-aware escape if the data manager offers one.
            $escaped_item = addslashes($item);
            $this->dm->query("insert into weight_matrix (keyword, item, weight) values ('{$escaped_keyword}', '{$escaped_item}', {$weight});");
        }
    }

    /*
     * Build rating_matrix with Keyword Frequency - Inverted Item Frequency:
     *   kf     = weight(keyword, item) / total weight of the item
     *   iif    = 1 / number of items related to the keyword
     *   rating = kf * iif
     */
    $this->dm->query('truncate rating_matrix;');

    // sum(weight) per item; weight_matrix is not modified below, so each
    // item's total only has to be fetched once.
    $item_weight_totals = array();

    $keyword_results = $this->dm->query('select keyword from Keyword;');
    while ($keyword_row = mysql_fetch_array($keyword_results)) {
        $escaped_keyword = addslashes($keyword_row['keyword']);

        $temp_result = $this->dm->query("select count(*) from weight_matrix where keyword = '{$escaped_keyword}';");
        $temp_row = mysql_fetch_array($temp_result);
        // Only used inside the row loop below, where at least one row exists,
        // so it cannot be zero at the point of division.
        $related_items_count = $temp_row[0];

        $related_results = $this->dm->query("select item, weight from weight_matrix where keyword = '{$escaped_keyword}';");
        while ($item_weight_row = mysql_fetch_array($related_results)) {
            $item = $item_weight_row['item'];
            $weight = $item_weight_row['weight'];
            // The value read back from the database is unescaped again.
            $escaped_item = addslashes($item);

            if (!isset($item_weight_totals[$item])) {
                $temp_result = $this->dm->query("select sum(weight) from weight_matrix where item = '{$escaped_item}';");
                $temp_row = mysql_fetch_array($temp_result);
                $item_weight_totals[$item] = $temp_row[0];
            }
            $related_weight = $item_weight_totals[$item];

            $iif = 1 / $related_items_count;
            $kf = $weight / $related_weight;
            $rating = $kf * $iif;

            // rating_matrix has the same shape as weight_matrix.
            $this->dm->query("insert into rating_matrix (keyword, item, rating) values ('{$escaped_keyword}', '{$escaped_item}', {$rating});");
        }
    }

    if ($this->name == KEY_LINK_JACCARD) {
        // This step needs the Keyword table populated above.
        $this->wordAssociationWithJaccardPreprocess($tables);
    }
}
/**
 * Rebuilds the keyword and keyword_item_weight tables from the logged queries
 * and then runs the recommender-specific preprocessing selected by $this->name.
 *
 * For every query in $tables['query'] the query text is segmented into
 * keywords and paired with the items recorded for it in $tables['query_item'].
 * Each (keyword, item) pair is stored with
 *   weight = count(keyword, item) / total count of the item
 *            * log(distinct items / items related to the keyword)
 * Progress and elapsed time are echoed as HTML.
 *
 * @param array $tables    Table names; the keys 'query' and 'query_item' are read
 *                         here, and the whole array is passed on to the word
 *                         association preprocessors.
 * @param mixed $startTime Not used by this implementation.
 * @return void
 */
public function preprocess($tables, $startTime = null) {
    echo "KeywordRecommender_{$this->name} preprocess start.....<br/>";
    // Push PHP's own output buffer first, then the system buffer, so the
    // progress line can reach the client before the long-running work starts.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();

    $time_start = microtime(true);
    $word_segmenter = new WordSegmenter();
    $this->dm->executeSqlFile(__DIR__ . "/rec_tables.sql");

    /* Collect the counts needed for the keyword and keyword_item_weight tables. */
    $keyword_item_count = array(); // keyword => (item => co-occurrence count)
    $item_keyword_count = array(); // item => (keyword => co-occurrence count)
    $keyword_count = array();      // keyword => number of occurrences

    $query_results = $this->dm->query("select id, query from " . $tables['query']);
    while ($query_row = mysql_fetch_array($query_results)) {
        $items = array();
        $item_results = $this->dm->query("SELECT itemId FROM {$tables['query_item']} WHERE queryId = {$query_row['id']}");
        while ($item_row = mysql_fetch_array($item_results)) {
            $item = $item_row['itemId'];
            $items[] = $item;
            if (!array_key_exists($item, $item_keyword_count)) {
                $item_keyword_count[$item] = array();
            }
        }

        $keywords = $word_segmenter->segmentWords($query_row['query']);
        foreach ($keywords as $keyword) {
            if (isset($keyword_count[$keyword])) {
                $keyword_count[$keyword] += 1;
            } else {
                $keyword_count[$keyword] = 1;
                $keyword_item_count[$keyword] = array();
            }

            foreach ($items as $item) {
                if (!array_key_exists($item, $keyword_item_count[$keyword])) {
                    $keyword_item_count[$keyword][$item] = 0;
                }
                $keyword_item_count[$keyword][$item] += 1;

                if (!array_key_exists($keyword, $item_keyword_count[$item])) {
                    $item_keyword_count[$item][$keyword] = 0;
                }
                $item_keyword_count[$item][$keyword] += 1;
            }
        }
    }

    // Total co-occurrence count per item, computed once instead of once per
    // (keyword, item) pair.
    $item_totals = array();
    foreach ($item_keyword_count as $item => $per_keyword_counts) {
        $item_totals[$item] = array_sum($per_keyword_counts);
    }

    $this->dm->query("BEGIN");

    $temp_result = $this->dm->query("select count(distinct itemId) from {$tables['query_item']};");
    $temp_row = mysql_fetch_array($temp_result);
    $items_count = $temp_row[0];

    foreach ($keyword_count as $key => $key_count) {
        // Keywords and item names are free text and must be escaped before
        // being embedded in SQL.
        // NOTE(review): switch to a connection-aware escape if the data manager offers one.
        $escaped_key = addslashes($key);

        $related_items_count = count($keyword_item_count[$key]);
        // A keyword whose queries had no items has nothing to weight; skipping
        // it also avoids dividing by zero in the IIF term.
        if ($related_items_count > 0) {
            $iif = log($items_count / $related_items_count);
            foreach ($keyword_item_count[$key] as $item => $count) {
                $all_count = $item_totals[$item];
                $weight = $count / $all_count * $iif;
                $escaped_item = addslashes($item);
                $this->dm->query(
                    "INSERT INTO keyword_item_weight(keyword, item, weight) VALUE('{$escaped_key}',\r\n\t\t\t\t\t\t\t\t\t"
                    . "'{$escaped_item}', '{$weight}')"
                );
            }
        }

        $this->dm->query("INSERT INTO keyword(keyword, count) VALUE ('{$escaped_key}', {$key_count})");
    }

    $this->dm->query("COMMIT");

    if ($this->name == KEY_COL_SLOPEONE) {
        $this->collaborativeFilteringWithSlopeOnePreprocess();
    }
    if ($this->name == KEY_LINK_JACCARD) {
        $this->wordAssociationWithJaccardPreprocess($tables);
    }
    if ($this->name == KEY_LINK_COSINE) {
        $this->wordAssociationWithCosinePreprocess($tables);
    }

    $time_end = microtime(true);
    $cost_time = $time_end - $time_start;
    echo "KeywordRecommender_{$this->name} preprocess end.....<br/>";
    echo "cost time: {$cost_time} <br/>";
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}