*
 * Please contact me via email honglianglv@gmail.com
 * my blog: http://lifecrunch.biz
 * my twitter: http://twitter.com/honglianglv
 *
 * It is free software; you can redistribute it and/or modify it under GPLV3.
 *
 * This script builds a histogram over the second tab-separated column of the
 * input file: for every distinct value found in that column it counts how many
 * rows carry the value, then writes the counts (sorted by value) to
 * "<input>.stat".
 */
include "./common.php";

$filename = "./tmpData/userNum_netflix";

// Read the whole input up front. A failed read must stop the script here;
// otherwise an empty ".stat" file would be written as if the input had no rows.
$contents = file_get_contents($filename);
if ($contents === false) {
    echo "cannot read input file: " . $filename . "\n";
    exit(1);
}

// value of column 2 => number of rows having that value
$nums = array();

foreach (explode("\n", $contents) as $row) {
    // Blank or one-character lines cannot hold "<col1>\t<col2>".
    if (strlen($row) < 2) {
        continue;
    }

    $detail = explode("\t", trim($row));
    if (count($detail) < 2) {
        // No second column on this row.
        continue;
    }

    $value = $detail[1];
    if (isset($nums[$value])) {
        ++$nums[$value];
    } else {
        $nums[$value] = 1;
    }
}

// Order the histogram by the column value (the array key) before saving.
ksort($nums);
saveArrayToFile($nums, $filename . ".stat");
}
// First time this item id is seen: give it the next sequential index and
// start its counter at 1; otherwise bump the counter stored under its index.
if (!isset($itemIds[$itemId])) {
    $itemIds[$itemId] = $itemNum; // assign the next sequential index to this item id
    $itemStat[$itemNum] = 1;
    ++$itemNum;
} else {
    ++$itemStat[$itemIds[$itemId]];
}
}

// Dump the item map as "originalItemId<TAB>assignedIndex" lines.
$itemIdStr = '';
foreach ($itemIds as $key => $itemId) {
    // store the item ids in a form that is easy to read from C
    $itemIdStr .= $key . "\t" . $itemId . "\n";
}

// Same text layout for the user map (presumably built like $itemIds above;
// its construction is not shown here).
$userIdStr = '';
foreach ($userIds as $key => $userId) {
    // store the user ids in a form that is easy to read from C
    $userIdStr .= $key . "\t" . $userId . "\n";
}

asort($userStat); // sort users by their number of ratings (ascending, keys kept)
asort($itemStat); // sort items by their number of ratings (ascending, keys kept)

// Plain-text outputs.
file_put_contents('tmpData/itemIds', $itemIdStr);
file_put_contents('tmpData/userIds', $userIdStr);
saveArrayToFile($userStat, 'tmpData/userStat');
saveArrayToFile($itemStat, 'tmpData/itemStat');

// PHP-serialized copies of both maps, written alongside the text versions.
file_put_contents('tmpData/itemIdArrays', serialize($itemIds));
file_put_contents('tmpData/userIdArrays', serialize($userIds));
echo "get userId Map and itemId map successfully!\n";
// obtain the hotelId
if ($hotelCountArray[$hotelId] < 50) {
    continue;
} // skip hotels that were reviewed by fewer than 50 users

// Use a regular expression to get the user names (and from them the user ids);
// the same expression also captures the rating details.
// Group 1 is the text after <Author>; group 2 is the text after the <Rating>
// tag found three lines below it. Only group 1 is used here.
$rateContent = file_get_contents($filename);
preg_match_all('/<Author>(.+)\\n.+\\n.+\\n<Rating>(.+)\\n/i', $rateContent, $usersArray);

// Count how many times each author name occurs.
foreach ($usersArray[1] as $key => $user) {
    if (isset($userCountArray[$user])) {
        ++$userCountArray[$user];
    } else {
        $userCountArray[$user] = 1;
    }
}
}

// Sort authors by their count (ascending, keys kept) and persist the result
// both PHP-serialized and as tab-separated text.
asort($userCountArray);
file_put_contents('select/userCountArray', serialize($userCountArray));
saveArrayToFile($userCountArray, 'select/userCount');

// Observation: only 1967 users have more than 4 reviews; take these users as
// the subjects of the experiment.
// Rebuild the experiment data set according to the filter criteria. The rebuilt
// data set is placed in select/data, reusing the functions above, so that the
// resulting data
// fits in memory; 8M of data is fairly easy to process.

/**
 * Writes an array to a file as text, one "key<TAB>value" line per element.
 *
 * Each value is concatenated into the line as a string, so the array is
 * expected to hold scalar values.
 *
 * @param array  $array    key => value pairs to write
 * @param string $filename path of the file to write; passed straight to
 *                         file_put_contents()
 */
function saveArrayToFile($array, $filename)
{
    $str = '';
    foreach ($array as $key => $item) {
        $str .= $key . "\t" . $item . "\n";
    }
    file_put_contents($filename, $str);
}
$userNum = 0;

// For each file, extract the hotel id: take the last path component and cut out
// the text between its first and second underscore.
$nameArray = explode('/', $filename);
$fileSub = $nameArray[count($nameArray) - 1];
$posStart = strpos($fileSub, '_');
$posEnd = strpos($fileSub, '_', $posStart + 1);
$hotelId = substr($fileSub, $posStart + 1, $posEnd - $posStart - 1);

// Next, number the users: read the file content, then match it with a regular
// expression. The number of <Author> matches is stored as this hotel's count.
$rateContent = file_get_contents($filename);
preg_match_all('/<Author>(.+)/i', $rateContent, $usersArray);
$userNum = count($usersArray[1]);
$hotelCountArray[$hotelId] = $userNum;
}

// Sort hotels by their count (ascending, keys kept) and persist the result both
// PHP-serialized and as tab-separated text.
asort($hotelCountArray);
file_put_contents('tmpData/hotelCountArray', serialize($hotelCountArray));
saveArrayToFile($hotelCountArray, 'tmpData/hotelCount');

/*
$hotelCountArray = unserialize(file_get_contents('tmpData/hotelCountArray'));
// iterate over every file, find the users with more than 10 ratings, then delete their reviews
$userCountArray = array();
foreach( glob($dirBase.'\*.txt') as $filename) {
    $userNum = 0;
    // get the hotelId from the file name
    $posStart = strpos($filename,'_');
    $posEnd = strpos($filename,'_',$posStart+1);
    $hotelId = substr($filename,$posStart+1,$posEnd - $posStart-1);// obtain the hotelId
    if($hotelCountArray[$hotelId] < 50)continue; // skip hotels that were reviewed by fewer than 50 users