/**
 * Checks the author URI and every link of a post against the domain blacklist.
 *
 * The blacklist matcher is created on first use and filled from
 * $this->blacklistfile via importChongqed().
 *
 * @param ISblamPost $p post under test
 * @return array|null array(0.8, certainty, reason) when the summed check() results
 *                    are non-zero, NULL otherwise
 * @throws Exception when the blacklist file cannot be imported
 */
function testPost(ISblamPost $p)
{
    // Lazy initialisation: the blacklist is only loaded once per instance.
    if (NULL === $this->blacklist) {
        $this->blacklist = new DomainMatch();
        if (!$this->importChongqed($this->blacklistfile)) {
            throw new Exception("Unable to import chongqed.org blacklist from {$this->blacklistfile}");
        }
    }

    $total = 0;
    $hosts = array(); // hostname => true, used as a set so each host is listed once

    // The author URI arrives as a plain string and has to be wrapped first.
    $authorUri = $p->getAuthorURI();
    if ($authorUri) {
        $parsed = new SblamURI($authorUri);
        $weight = $this->check($parsed);
        if ($weight) {
            $hosts[$parsed->getHostname()] = true;
            $total += $weight;
        }
    }

    // Links are passed to check() as-is.
    foreach ($p->getLinks() as $link) {
        $weight = $this->check($link);
        if ($weight) {
            $hosts[$link->getHostname()] = true;
            $total += $weight;
        }
    }

    if (!$total) {
        return NULL;
    }

    $listed = implode(', ', array_keys($hosts));
    return array(0.8, self::CERTAINITY_NORMAL, "Blacklisted domains (" . $listed . ")");
}
/**
 * Builds a normalised fingerprint of the post and records it in the `dupes` table.
 *
 * On success $this->length holds the byte length of the normalised text and
 * $this->checksum its md5. If fewer than 10 bytes are available even after
 * padding with author fields, $this->checksum is set to NULL, $this->length is
 * left untouched and nothing is written to the database.
 *
 * @param ISblamPost $p post about to be tested
 * @return void
 */
function preTestPost(ISblamPost $p)
{
    // Drop "[:xx:]" markers (two arbitrary characters between "[:" and ":]") plus trailing whitespace.
    $body = $p->getRawContent();
    $text = preg_replace('/\\[:..:\\]\\s+/', ' ', $body);

    // Short posts are padded with author details so the fingerprint has enough material.
    if (strlen($text) < 40) {
        $text .= $p->getAuthorURI();
    }
    if (strlen($text) < 40) {
        $text .= $p->getAuthorEmail();
    }
    if (strlen($text) < 40) {
        $text .= $p->getAuthorName();
    }
    if (strlen($text) < 20) {
        $text .= $p->getAuthorIP();
    }

    // Still too little to fingerprint: mark as "no checksum" and bail out.
    if (strlen($text) < 10) {
        $this->checksum = NULL;
        return;
    }

    // Normalisation, applied in this order to the lower-cased text:
    //  1. runs of punctuation/whitespace become a single space,
    //  2. hex-looking tokens (hex chars, then a-f letters, then digits) become "H",
    //  3. runs of 2 to 9 digits become "D".
    $search = array('/[.,\\s!:;()-]+/', '/([a-f0-9]{1,3}[a-f]{1,6}[0-9]{1,6})+/', '/\\d\\d{1,8}/');
    $replace = array(' ', 'H', 'D');
    $text = preg_replace($search, $replace, strtolower($text));
    d($text, 'normalized text');

    $this->length = strlen($text);
    $this->checksum = md5($text);

    // New fingerprints start at count 1 and expire in 18 hours. For an existing row the
    // count is incremented (after being cut to CEIL(count/10) if the row had already
    // expired) and the expiry is pushed to the later of "+6 hours" and "now + 18 hours".
    $template = "/*maxtime5*/INSERT INTO dupes (checksum,count,expires,ip) VALUES(UNHEX('%s'),1,%d,%u)\n\t\t\tON DUPLICATE KEY UPDATE count = 1 + IF(expires < %d,CEIL(count/10),count), expires = GREATEST(expires + 3600*6, %d)";
    $query = sprintf(
        $template,
        $this->checksum,
        time() + 3600 * 18,
        ip2long($p->getAuthorIP()),
        time(),
        time() + 3600 * 18
    );

    $done = $this->db->exec($query);
    if (!$done) {
        warn($this->db->errorInfo());
    }
}
/**
 * Looks for mail-exploit signatures in the post body and in the author fields.
 *
 * The score is the sum of checkFragment() over the raw content and over each
 * non-empty author field (name, e-mail, URI); a field that contains a line
 * break adds another 5 points.
 *
 * @param ISblamPost $p post under test
 * @return array|null array(score, certainty, reason) once the total exceeds 11, NULL otherwise
 */
function testPost(ISblamPost $p)
{
    $score = $this->checkFragment($p->getRawContent());

    $fields = array($p->getAuthorName(), $p->getAuthorEmail(), $p->getAuthorURI());
    foreach ($fields as $value) {
        if (!$value) {
            continue;
        }
        $score += $this->checkFragment($value);
        // A line break inside a single-line field is penalised on its own.
        if (strpos($value, "\n") !== false) {
            $score += 5;
        }
    }

    // Tiers are ordered from strictest to weakest; the first one exceeded wins.
    $tiers = array(
        array(28, 4, self::CERTAINITY_SURE, "Cerainly a mail exploit"),
        array(23, 2, self::CERTAINITY_SURE, "Mail exploit"),
        array(17, 1, self::CERTAINITY_HIGH, "Mail exploit"),
        array(11, 0.6, self::CERTAINITY_NORMAL, "Possible mail exploit"),
    );
    foreach ($tiers as $tier) {
        list($floor, $weight, $certainty, $reason) = $tier;
        if ($score > $floor) {
            return array($weight, $certainty, $reason);
        }
    }

    return NULL;
}
/**
 * Runs every configured regular expression against the post and sums the weights of those that match.
 *
 * Each entry of $this->patterns is read as array(regex, weight). The haystack is the
 * raw content, author name, author e-mail and author URI joined by newlines.
 *
 * @param ISblamPost $p post under test
 * @return array|null array(total weight, certainty, reason listing the matched regexes),
 *                    or NULL when the total is zero
 */
function testPost(ISblamPost $p)
{
    $haystack = implode("\n", array(
        $p->getRawContent(),
        $p->getAuthorName(),
        $p->getAuthorEmail(),
        $p->getAuthorURI(),
    ));

    $total = 0;
    $hits = array();
    foreach ($this->patterns as $rule) {
        if (!preg_match($rule[0], $haystack)) {
            continue;
        }
        $hits[] = $rule[0];
        $total += $rule[1];
    }

    if (!$total) {
        return NULL;
    }

    return array($total, self::CERTAINITY_NORMAL, "Exact spam matches (" . implode(', ', $hits) . ")");
}
/**
 * Counts banned keywords in the post text, in link labels and in URLs.
 *
 * Three counters are produced:
 *  - $res1: result of testText() on the post text plus the author name,
 *  - $res2: number of label keywords (per link) that are in the banned list,
 *  - $res3: number of banned-keyword occurrences in the lower-cased author URI,
 *           author e-mail and link URIs (counted through str_replace()).
 *
 * @param ISblamPost $p post under test
 * @return array|null list of array(score, certainty, reason) entries, one per non-zero
 *                    counter; NULL when there are no keywords to test or nothing matched
 */
function testPost(ISblamPost $p)
{
    // Load the keyword list on first use.
    if ($this->keywords === NULL) {
        $this->importBlocklist2($this->blocklist);
    }

    // empty() rather than !count(): if the import left the list NULL, count(NULL)
    // raises a warning on PHP 7.2+ and a TypeError on PHP 8, whereas this check
    // keeps the intended "nothing to test" result.
    if (empty($this->keywords)) {
        return NULL;
    }

    $res1 = $this->testText($p->getText() . ' ' . $p->getAuthorName());
    $res2 = 0;
    $res3 = 0;

    // Collect every URI-like string into one lower-cased haystack.
    $alluris = '';
    if ($uri = $p->getAuthorURI()) {
        $alluris .= strtolower($uri);
    }
    if ($uri = $p->getAuthorEmail()) {
        $alluris .= ' ' . strtolower($uri);
    }

    foreach ($p->getLinks() as $link) {
        if ($label = $link->getLabel()) {
            $res2 += count(array_intersect($this->getKeywordsFromText($label), $this->keywords));
        }
        if ($uri = $link->getURI()) {
            $alluris .= ' ' . strtolower($uri);
        }
    }

    // Replacing each keyword with itself leaves the text unchanged; the call is only
    // used for its fourth argument, which receives the number of replacements made.
    str_replace($this->keywords, $this->keywords, $alluris, $res3);

    $sum = $res1 + $res2 + $res3;
    if (!$sum) {
        return NULL;
    }

    // Text and URL hits are rated higher-certainty once more than two hits were found overall.
    $certainty = $sum > 2 ? self::CERTAINITY_HIGH : self::CERTAINITY_NORMAL;

    $out = array();
    if ($res1) {
        $out[] = array(1.2 - 1 / $res1, $certainty, "Banned keywords in text ({$res1})");
    }
    if ($res2) {
        // Note the +1 in the divisor: label hits use a different curve than the other two.
        $out[] = array(1.2 - 1 / ($res2 + 1), self::CERTAINITY_HIGH, "Banned keywords in link labels ({$res2})");
    }
    if ($res3) {
        $out[] = array(1.2 - 1 / $res3, $certainty, "Banned keywords in URLs ({$res3})");
    }

    return count($out) ? $out : NULL;
}
/**
 * Sanity checks on the author fields (name, e-mail, URI) and on empty content.
 *
 * Every finding is reported with low certainty. URIs found in the e-mail or name
 * field are also registered on the post through addLink().
 *
 * @param ISblamPost $p post under test; may receive extra links via addLink()
 * @return array list of array(score, certainty, reason) entries, possibly empty
 */
function testPost(ISblamPost $p)
{
    $out = array();

    if ($this->hasURI($p->getAuthorEmail())) {
        $score = 0.2;
        // Scored higher when the dedicated URI field carries a URI as well.
        if ($this->hasURI($p->getAuthorURI())) {
            $score = 0.4;
        }
        $p->addLink($p->getAuthorEmail()); // expose it!
        $out[] = array($score, self::CERTAINITY_LOW, "Link stuffed in e-mail field");
    }

    if ($this->hasURI($p->getAuthorName())) {
        $score = 0.1;
        if ($this->hasURI($p->getAuthorURI())) {
            $score = 0.3;
        }
        $p->addLink($p->getAuthorName()); // expose it!
        $out[] = array($score, self::CERTAINITY_LOW, "Link stuffed in name field");
    } else {
        if ("" === $p->getAuthorName()) {
            $out[] = array(0.1, self::CERTAINITY_LOW, "Anonymous");
        }
    }

    // The count is taken first and compared afterwards. Writing
    // `$cnt = substr_count(...) > 1` stores the boolean result of the comparison
    // (">" binds tighter than "="), which made the score ignore the actual count.
    $cnt = substr_count($p->getAuthorURI(), "http://");
    if ($cnt > 1) {
        $out[] = array($cnt / 10 + 0.2, self::CERTAINITY_LOW, "Multiple links in author URI field");
    }

    $cnt = substr_count($p->getAuthorURI(), "<a ");
    if ($cnt > 1) {
        $out[] = array($cnt / 5 + 0.2, self::CERTAINITY_LOW, "HTML in author URI field");
    }

    // Count how many of the three fields are unusually long (byte lengths).
    $longs = 0;
    if (strlen($p->getAuthorName()) > 50) {
        $longs++;
    }
    if (strlen($p->getAuthorEmail()) > 50) {
        $longs++;
    }
    if (strlen($p->getAuthorURI()) > 150) {
        $longs++;
    }
    if ($longs) {
        $out[] = array($longs / 10 + 0.1, self::CERTAINITY_LOW, "Looong text in name/e-mail/URI fields");
    }

    if ("" === trim($p->getRawContent())) {
        $out[] = array(0.6, self::CERTAINITY_LOW, "Empty content");
    }

    // Well-known sites given as the author's own website.
    if (preg_match('!\\b(google\\.com|msn\\.com)\\b!', $p->getAuthorURI())) {
        $out[] = array(0.2, self::CERTAINITY_LOW, "Not your website");
    }

    return $out;
}
/**
 * Rates a post by how many links it carries relative to its word count.
 *
 * Posts with zero, one or two links (author URI included) get fixed results.
 * Beyond that the score depends on the ratio of in-post links to words.
 *
 * @param ISblamPost $p post under test
 * @return array|null array(score, certainty, reason); NULL when getLinks() returns NULL
 *                    or when a link-free post has 20 bytes of text or fewer
 */
function testPost(ISblamPost $p)
{
    $links = $p->getLinks();
    if (NULL === $links) {
        return NULL;
    }

    $linkcount = count($links);
    // The author link is counted separately: it may be unrelated to the post's
    // contents, so it must not skew the link/words ratio below.
    $authorlink = $p->getAuthorURI() ? 1 : 0;

    switch ($linkcount + $authorlink) {
        case 0:
            // No "no links" bonus for posts without content: no content is abnormal
            // and may be another way to spam.
            if (strlen($p->getText()) > 20) {
                return array(-0.5, self::CERTAINITY_NORMAL, "No links");
            }
            return NULL;
        case 1:
            return array(0.1, self::CERTAINITY_LOW, "Single link");
        case 2:
            return array(0.2, self::CERTAINITY_LOW, "Two links");
    }

    // Long posts may legitimately have more links. No hard limit is possible,
    // because wiki pages may contain lots of links. URLs act as separators here,
    // so they are not counted as words; at most 500 pieces are produced.
    $words = preg_split('![^a-z0-9\\x7F-\\xFF-]+|https?://[^\\]\\[\\s\'"<>]+!i', $p->getText(), 500, PREG_SPLIT_NO_EMPTY);
    $numwords = count($words);
    $ratio = round($linkcount * 100 / (10 + $numwords));

    $certainty = self::CERTAINITY_NORMAL;
    if ($ratio > 22) {
        $score = 0.45;
        $reason = "Flooded with links (A{$ratio}: {$linkcount} per {$numwords} words)";
    } elseif ($ratio > 17) {
        $score = 0.35;
        $reason = "Flooded with links (B{$ratio}: {$linkcount} per {$numwords} words)";
    } elseif ($ratio > 12) {
        $score = 0.25;
        $reason = "Flooded with links (C{$ratio}: {$linkcount} per {$numwords} words)";
    } elseif ($ratio > 6) {
        $score = 0.25;
        $reason = "Lots of links (D{$ratio}: {$linkcount} per {$numwords} words)";
    } else {
        $score = 0.25;
        $certainty = self::CERTAINITY_LOW;
        $reason = "Some links (E{$ratio}: {$linkcount} per {$numwords} words)";
    }

    return array($score, $certainty, $reason);
}
/**
 * Gathers the URIs associated with a post: the author URI, every link and the author e-mail.
 *
 * Collection is delegated to addURI() and addEmail(), which receive the working
 * array; the keys of that array are what gets returned.
 * NOTE(review): this presumes both helpers take the array by reference and key it
 * by URI/host string - confirm against their definitions.
 *
 * @param ISblamPost $p post to inspect
 * @return array keys of the collected set
 */
protected function extractURIsFromPost(ISblamPost $p)
{
    $collected = array();

    // The author URI is a plain string, so it is wrapped before being added.
    $authorUri = $p->getAuthorURI();
    if ($authorUri) {
        $this->addURI($collected, new SblamURI($authorUri));
    }

    foreach ($p->getLinks() as $postLink) {
        $this->addURI($collected, $postLink);
    }

    $this->addEmail($collected, $p->getAuthorEmail());

    return array_keys($collected);
}
/**
 * Extracts words from everything textual in a post.
 *
 * Both the raw content and the URL-decoded stripped text are included on purpose:
 * the overlap yields more phrases, and the word count does not matter here.
 *
 * @param ISblamPost $p post to inspect
 * @return mixed whatever self::extractWords() returns for the combined text
 */
protected function extractWordsFromPost(ISblamPost $p)
{
    $parts = array(
        $p->getRawContent(),
        rawurldecode($p->getText()),
        $p->getAuthorName(),
        $p->getAuthorEmail(),
        $p->getAuthorURI(),
    );

    return self::extractWords(implode(' ', $parts), $this->db->ignore);
}