Exemple #1
0
 * 
 * Please contact me via email honglianglv@gmail.com
 * my blog: http://lifecrunch.biz
 * my twitter: http://twitter.com/honglianglv
 *
 * It is free software; you can redistribute it and/or modify it under GPLV3.
 * 
 * This file computes statistics on how many items there are for each rating count.
 */
include "./common.php";
// Tally how many rows of the input file share each value of the second
// tab-separated column, then write the tallies to "<input>.stat".
$filename = "./tmpData/userNum_netflix";
$nums = array();
$contents = file_get_contents($filename);
if ($contents === false) {
    // An unreadable input used to be handled like an empty file, which silently
    // produced an empty .stat file. Report the failure and write nothing instead.
    error_log("Unable to read input file: " . $filename);
} else {
    $rows = explode("\n", $contents);
    foreach ($rows as $row) {
        // A row shorter than two bytes cannot hold two tab-separated fields.
        if (strlen($row) < 2) {
            continue;
        }
        $detail = explode("\t", trim($row));
        if (count($detail) < 2) {
            continue;
        }
        // $detail[1] is the value being tallied; start at 1 on first sight.
        if (!isset($nums[$detail[1]])) {
            $nums[$detail[1]] = 1;
        } else {
            ++$nums[$detail[1]];
        }
    }
    // Order the tallies by the counted value before saving them.
    ksort($nums);
    saveArrayToFile($nums, $filename . ".stat");
}
    }
    if (!isset($itemIds[$itemId])) {
        $itemIds[$itemId] = $itemNum;
        //assign the next sequential number to this item
        $itemStat[$itemNum] = 1;
        ++$itemNum;
    } else {
        ++$itemStat[$itemIds[$itemId]];
    }
}
// Sort both rating-count tables in ascending order of count; asort() keeps
// every count attached to its original key.
asort($userStat);
asort($itemStat);

// Render each id map as "key<TAB>value" lines, a layout that is easy to read
// back from a C program.
$itemIdStr = '';
foreach ($itemIds as $key => $itemId) {
    $itemIdStr = $itemIdStr . "{$key}\t{$itemId}\n";
}
$userIdStr = '';
foreach ($userIds as $key => $userId) {
    $userIdStr = $userIdStr . "{$key}\t{$userId}\n";
}

// Plain-text dumps of the two id maps.
file_put_contents('tmpData/itemIds', $itemIdStr);
file_put_contents('tmpData/userIds', $userIdStr);

// Sorted rating counts, written as tab-separated text by saveArrayToFile().
saveArrayToFile($userStat, 'tmpData/userStat');
saveArrayToFile($itemStat, 'tmpData/itemStat');

// Serialized copies of the same id maps.
file_put_contents('tmpData/itemIdArrays', serialize($itemIds));
file_put_contents('tmpData/userIdArrays', serialize($userIds));

echo "get userId Map and itemId map successfully!\n";
    //obtain the hotelId
    if ($hotelCountArray[$hotelId] < 50) {
        continue;
    }
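    //skip hotels that were rated by fewer than 50 users
    //use a regular expression to capture each user's name (used as the user id) together with the rating details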
    $rateContent = file_get_contents($filename);
    preg_match_all('/<Author>(.+)\\n.+\\n.+\\n<Rating>(.+)\\n/i', $rateContent, $usersArray);
    foreach ($usersArray[1] as $key => $user) {
        if (isset($userCountArray[$user])) {
            ++$userCountArray[$user];
        } else {
            $userCountArray[$user] = 1;
        }
    }
}
// Sort the per-user rating counts in ascending order; asort() keeps each count
// attached to its user key.
asort($userCountArray);
// Persist the counts twice: as a serialized PHP array and as tab-separated text.
file_put_contents('select/userCountArray', serialize($userCountArray));
saveArrayToFile($userCountArray, 'select/userCount');
//Observation: only 1967 users have more than 4 ratings; take these users as the experiment subjects.
//Rebuild the experimental data set according to the filter criteria. The rebuilt data set is placed in select/data,
//reusing the functions above, so the resulting data can be held in memory; 8M of data is fairly easy to process.
/**
 * Write an array to a file as tab-separated text, one "key<TAB>value" line
 * per element.
 *
 * Keys and values go through PHP's normal string conversion, so the array is
 * expected to be a flat map of scalar values.
 *
 * @param array  $array    Map whose keys and values are written out.
 * @param string $filename Path of the file to create or overwrite.
 *
 * @return int|false Number of bytes written, or false if the write failed.
 */
function saveArrayToFile($array, $filename)
{
    $str = '';
    foreach ($array as $key => $item) {
        $str .= $key . "\t" . $item . "\n";
    }
    // Return the write result so a failed write is no longer silently lost;
    // callers that ignore the return value behave exactly as before.
    return file_put_contents($filename, $str);
}
    $userNum = 0;
    //for each file, extract the hotel id from the file name
    $nameArray = explode('/', $filename);
    $fileSub = $nameArray[count($nameArray) - 1];
    $posStart = strpos($fileSub, '_');
    $posEnd = strpos($fileSub, '_', $posStart + 1);
    $hotelId = substr($fileSub, $posStart + 1, $posEnd - $posStart - 1);
    //next, read the file contents and use a regular expression to match each reviewing user
    $rateContent = file_get_contents($filename);
    preg_match_all('/<Author>(.+)/i', $rateContent, $usersArray);
    $userNum = count($usersArray[1]);
    $hotelCountArray[$hotelId] = $userNum;
}
// Sort the per-hotel <Author> match counts in ascending order; asort() keeps
// each count attached to its hotel-id key.
asort($hotelCountArray);
// Persist the counts twice: as a serialized PHP array and as tab-separated text.
file_put_contents('tmpData/hotelCountArray', serialize($hotelCountArray));
saveArrayToFile($hotelCountArray, 'tmpData/hotelCount');
/*
$hotelCountArray = unserialize(file_get_contents('tmpData/hotelCountArray'));
//遍历其中的每一个文件,找出其中评分数目大于10个的用户,然后删除它们的评价
$userCountArray =  array();
foreach( glob($dirBase.'\*.txt') as $filename)
{
	$userNum = 0;
	
	//通过文件名获得hotelId,
	$posStart = strpos($filename,'_');
	$posEnd = strpos($filename,'_',$posStart+1);
	$hotelId = substr($filename,$posStart+1,$posEnd - $posStart-1);//得到hotelId
	
	if($hotelCountArray[$hotelId] < 50)continue; //忽略评价用户少于50个的酒店