forked from wenjun1055/Bayes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BayesFilter.php
84 lines (77 loc) · 2.26 KB
/
BayesFilter.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
<?php
namespace Lib;
/**
 * Naive-Bayes spam scorer for comment text.
 *
 * A sentence is reduced to its CJK ideograph runs, tokenized through the
 * Sphinx connection, and every token's spam/healthy frequency is looked up
 * in Redis. The per-token probabilities are then combined into a single
 * score in the range 0..100 (higher means more spam-like).
 */
class BayesFilter
{
    /** Weight applied to a token's spam probability. */
    const SPAMSCALE = 0.5;

    /** Weight applied to a token's healthy probability. */
    const HEALTHYSCALE = 0.5;

    /** Value read from the COMMENT_WORDS_COUNTER Redis key; used as the frequency denominator. */
    private $totalWordNum;

    /** Connection returned by \Utility_SphinxClient::Connection('filter_comment'). */
    private $sphinx;

    /** Created in the constructor; not referenced by any method of this class. */
    private $db;

    /** Connection returned by \JMRedis::getConnectionByName('filter_comment'). */
    private $redis;

    /** Lazily created shared instance. */
    private static $instance;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * The instance is stored on this class (self), so the first caller
     * decides the concrete class for everyone that follows.
     *
     * @return BayesFilter
     */
    public static function getInstance()
    {
        if (!self::$instance) {
            $class = get_called_class();
            self::$instance = new $class();
        }

        return self::$instance;
    }

    /**
     * Opens the Sphinx, database and Redis connections and loads the global
     * word counter from Redis.
     */
    private function __construct()
    {
        $this->sphinx = \Utility_SphinxClient::Connection('filter_comment');
        $this->db = new \JMDbMysqlReadWriteSplit();
        $this->redis = \JMRedis::getConnectionByName('filter_comment');
        $this->totalWordNum = $this->redis->get(COMMENT_WORDS_COUNTER);
    }

    /**
     * Scores a sentence for spam likelihood.
     *
     * For each token t with weighted ratio r_t = spam / (spam + healthy), the
     * score is  prod(r_t) / (prod(r_t) + prod(1 - r_t)) * 100.
     * The products are accumulated as sums of logarithms: multiplying many
     * values below 1 underflows to 0.0 and would turn the final division
     * into 0 / 0 (a DivisionByZeroError on PHP 8).
     *
     * @param string $sentence Text to classify.
     *
     * @return float Score between 0 and 100. 50 is returned when there is
     *               nothing to score (no tokens, or the tokenizer did not
     *               return an array).
     */
    public function bayes($sentence)
    {
        $tokens = $this->splitWords($sentence);
        if (!is_array($tokens)) {
            // Tokenizer gave no usable result: stay neutral instead of failing.
            return 50.0;
        }

        $logSpam = 0.0;
        $logHealthy = 0.0;

        foreach ($tokens as $token) {
            if (!is_array($token) || !isset($token['tokenized'])) {
                continue;
            }

            $probability = $this->getProbability($token['tokenized']);
            $spam = $probability['spam'] * self::SPAMSCALE;
            $healthy = $probability['healthy'] * self::HEALTHYSCALE;
            $sum = $spam + $healthy;
            if ($sum <= 0) {
                // Only reachable if the scale constants are set to 0.
                continue;
            }

            $ratio = $spam / $sum;
            $logSpam += log($ratio);
            $logHealthy += log(1 - $ratio);
        }

        // pn1 / (pn1 + pn2) == 1 / (1 + pn2 / pn1) == 1 / (1 + exp(log pn2 - log pn1))
        $difference = $logHealthy - $logSpam;
        if (is_nan($difference)) {
            // -INF minus -INF: both sides are impossible, so no verdict.
            return 50.0;
        }

        return 100.0 / (1.0 + exp($difference));
    }

    /**
     * Looks up the spam/healthy probability of a single token.
     *
     * The Redis value is parsed as "<healthy>_<spam>" and each count is
     * divided by the global word counter. The default prior is used when the
     * token is unknown, the stored value is malformed, the word counter is
     * missing or not positive, or the token is too rare (combined frequency
     * below 0.005). Each probability is floored at 0.001.
     *
     * @param string $word Tokenized word.
     *
     * @return array Array with float entries 'spam' and 'healthy'.
     */
    private function getProbability($word)
    {
        $stored = $this->redis->get(COMMENT_WORDS_PREFIX . $word);
        if (empty($stored)) {
            return $this->defaultProbability();
        }

        $total = is_numeric($this->totalWordNum) ? (float) $this->totalWordNum : 0.0;
        if ($total <= 0) {
            // Without a denominator no frequency can be computed.
            return $this->defaultProbability();
        }

        $counts = explode('_', (string) $stored);
        if (count($counts) < 2 || !is_numeric($counts[0]) || !is_numeric($counts[1])) {
            return $this->defaultProbability();
        }

        $healthy = (float) $counts[0] / $total;
        $spam = (float) $counts[1] / $total;

        if (($spam + $healthy) < 0.005) {
            // Too few observations to trust the measured frequencies.
            return $this->defaultProbability();
        }

        return array(
            'spam' => max($spam, 0.001),
            'healthy' => max($healthy, 0.001),
        );
    }

    /**
     * Prior used for tokens without usable statistics.
     *
     * @return array Array with float entries 'spam' (0.4) and 'healthy' (0.6).
     */
    private function defaultProbability()
    {
        return array(
            'spam' => 0.4,
            'healthy' => 0.6,
        );
    }

    /**
     * Tokenizes the ideograph runs of a sentence.
     *
     * Every run of characters in U+4E00..U+9FA5 is extracted, the runs are
     * joined with a space after each one, and the resulting text is handed
     * to the Sphinx connection's buildKeywords() for the INDEX index.
     *
     * @param string $sentence Raw text.
     *
     * @return mixed Whatever buildKeywords() returns, unmodified. bayes()
     *               reads a 'tokenized' entry from each element, so an array
     *               of such entries is presumably the normal result.
     */
    public function splitWords($sentence)
    {
        $matches = array();
        $found = preg_match_all('/[\x{4e00}-\x{9fa5}]+/u', (string) $sentence, $matches);
        $runs = ($found && isset($matches[0])) ? $matches[0] : array();

        $text = '';
        foreach ($runs as $run) {
            $text .= $run . ' ';
        }

        return $this->sphinx->buildKeywords($text, INDEX, false);
    }
}