/**
 * Resets per-post state, then queues checks for the sender's IPs and for the
 * hosts of every link in the post.
 *
 * When a link's hostname and domain are both present and differ, each is
 * checked with 0.75 as the third argument (presumably a reduced weight —
 * confirm against checkHost()). Otherwise whichever of the two is present is
 * checked with checkHost()'s default.
 *
 * @param ISblamPost $p post being examined
 */
function preTestPost(ISblamPost $p)
{
    $this->addedhosts = array();
    $this->ips = array();

    // Sender's own addresses
    foreach ($p->getAuthorIPs() as $senderIp) {
        $this->checkIP($senderIp, "sender");
    }

    // Hosts of all linked websites
    $postLinks = $p->getLinks();
    if ($postLinks) {
        foreach ($postLinks as $postLink) {
            $hostname = $postLink->getHostname();
            $registered = $postLink->getDomain();

            $bothDistinct = $hostname && $registered && $hostname !== $registered;
            if ($bothDistinct) {
                $this->checkHost($hostname, "link", 0.75);
                $this->checkHost($registered, "link", 0.75);
                continue;
            }

            if ($hostname) {
                $this->checkHost($hostname, "link");
            }
            if ($registered) {
                $this->checkHost($registered, "link");
            }
        }
    }

    $this->checkHostList();
}
/**
 * Builds an XML-RPC "slv" request listing the post's links and starts it
 * asynchronously, storing the handle in $this->fp.
 *
 * Links are skipped when their domain equals their hostname, when the domain
 * contains two or more dots, or when the domain is a key of $this->ignore.
 *
 * @param ISblamPost $p post being examined
 * @return bool|null NULL when no link qualified; otherwise whether
 *                   requestAsync() returned something loosely non-null
 */
function startTest(ISblamPost $p)
{
    $this->fp = null;

    $accepted = 0;
    $alllinks = "# only links extracted from post;\n";

    foreach ($p->getLinks() as $link) {
        // linksleeve doesn't support 2-level domains
        $domain = $link->getDomain();
        $unsupported = $domain === $link->getHostname() || substr_count($domain, '.') >= 2;
        if ($unsupported) {
            d($link->getURI(), "Skipping because of linksleeve bug");
            continue;
        }

        if (isset($this->ignore[$domain])) {
            d($domain, 'skipped linksleeve');
            continue;
        }

        // One line per link: "<uri> ; <label reduced to [a-z0-9.-], max 50 chars>"
        $uri = $link->getURI();
        $label = preg_replace('/[^a-z0-9.-]+/i', '', $link->getLabel());
        $alllinks .= $uri . " ; " . substr($label, 0, 50) . "\n";
        $accepted++;
    }

    if ($accepted === 0) {
        return null;
    }

    $query = '<?xml version="1.0"?><methodCall> <methodName>slv</methodName> <params> <param> <value><string>' . htmlspecialchars($alllinks) . '</string></value> </param> </params> </methodCall>';

    $this->fp = $this->services->getHTTP()
        ->setHost(self::API_HOST)
        ->setPath(self::API_PATH)
        ->setPost($query, 'text/xml')
        ->setTimeout($this->timeout)
        ->requestAsync();

    return $this->fp != null;
}
/**
 * Scores the post when its author URI or any of its links is matched by
 * $this->check().
 *
 * The blacklist is created and imported on first use.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason) when anything matched, NULL otherwise
 * @throws Exception when importChongqed() reports failure
 */
function testPost(ISblamPost $p)
{
    if ($this->blacklist === null) {
        $this->blacklist = new DomainMatch();
        if (!$this->importChongqed($this->blacklistfile)) {
            throw new Exception("Unable to import chongqed.org blacklist from {$this->blacklistfile}");
        }
    }

    $total = 0;
    $hitHosts = array();

    $authorUri = $p->getAuthorURI();
    if ($authorUri) {
        $authorUri = new SblamURI($authorUri);
        $weight = $this->check($authorUri);
        if ($weight) {
            $hitHosts[$authorUri->getHostname()] = true;
            $total += $weight;
        }
    }

    foreach ($p->getLinks() as $link) {
        $weight = $this->check($link);
        if (!$weight) {
            continue;
        }
        $hitHosts[$link->getHostname()] = true;
        $total += $weight;
    }

    if (!$total) {
        return null;
    }

    // The score is fixed; the accumulated weights only decide whether to report at all.
    return array(0.8, self::CERTAINITY_NORMAL, "Blacklisted domains (" . implode(', ', array_keys($hitHosts)) . ")");
}
/**
 * Sums checkFragment() results over the raw content and the author
 * name/e-mail/URI, adds 5 for each author field containing a newline, and
 * maps the total onto a verdict.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason), or NULL when the total is 11 or less
 */
function testPost(ISblamPost $p)
{
    $score = $this->checkFragment($p->getRawContent());

    // The three author fields are treated identically, in this order.
    foreach (array('getAuthorName', 'getAuthorEmail', 'getAuthorURI') as $getter) {
        $field = $p->$getter();
        if (!$field) {
            continue;
        }
        $score += $this->checkFragment($field);
        if (strpos($field, "\n") !== false) {
            $score += 5;
        }
    }

    if ($score > 28) {
        return array(4, self::CERTAINITY_SURE, "Cerainly a mail exploit");
    }
    if ($score > 23) {
        return array(2, self::CERTAINITY_SURE, "Mail exploit");
    }
    if ($score > 17) {
        return array(1, self::CERTAINITY_HIGH, "Mail exploit");
    }
    if ($score > 11) {
        return array(0.6, self::CERTAINITY_NORMAL, "Possible mail exploit");
    }

    return null;
}
/**
 * Adds a small penalty for posts whose timestamp falls in the early hours.
 *
 * The hour comes from date("G"), i.e. it is evaluated in PHP's configured
 * default timezone. Hours 2-5 score 0.15, the remaining hours of 1-7 score 0.09.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason), or NULL outside 1-7h or without a post time
 */
function testPost(ISblamPost $p)
{
    $postedAt = $p->getPostTime();
    if (!$postedAt) {
        return null;
    }

    $hour = date("G", $postedAt);
    if ($hour < 1 || $hour > 7) {
        return null;
    }

    $penalty = ($hour >= 2 && $hour <= 5) ? 0.15 : 0.09;

    return array($penalty, self::CERTAINITY_LOW, "Late-night posting ({$hour}h)");
}
/**
 * Detects posts that start with a 32-64 character hex string ("hash mark")
 * and optionally end with a second one.
 *
 * A candidate only counts if it contains a lowercase hex letter directly
 * followed by a digit.
 *
 * Fix: the "(2)" verdict used to inspect $res[1], the mandatory leading hash,
 * which made its !empty() guard meaningless and never looked at the trailing
 * hash. The optional trailing hash is capture group 2, which preg_match()
 * leaves unset when it did not participate — hence the !empty() guard.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason), or NULL when no hash mark is found
 */
function testPost(ISblamPost $p)
{
    $found = preg_match(
        '!^\\s*([a-f0-9]{32,64}).+?\\s.+?([a-f0-9]{32,64})?\\s*$!si',
        $p->getRawContent(),
        $res
    );
    if (!$found) {
        return null;
    }

    // The matched text must contain a letter+digit pair somewhere.
    if (!preg_match('![a-f][0-9]!', $res[0])) {
        return null;
    }

    // A second, trailing hash is stronger evidence than a leading one alone.
    if (!empty($res[2]) && preg_match('![a-f][0-9]!', $res[2])) {
        return array(0.3, self::CERTAINITY_NORMAL, "Hash marks (2)");
    }

    return array(0.2, self::CERTAINITY_LOW, "Hash marks (1)");
}
/**
 * Feeds the final verdict back into the plonker, when $this->add is set.
 *
 * Score above 0.66 with certainty above 0.75 adds the author's IPs together
 * with the score. Otherwise, score below -0.6 with certainty above 0.7
 * removes them.
 *
 * @param ISblamPost $post  post that was judged
 * @param float      $score final score
 * @param float      $cert  final certainty
 */
function reportResult(ISblamPost $post, $score, $cert)
{
    if (!$this->add) {
        return;
    }

    $confidentSpam = $score > 0.66 && $cert > 0.75;
    if ($confidentSpam) {
        $this->plonker->addIPs($post->getAuthorIPs(), $score);
        return;
    }

    $confidentHam = $score < -0.6 && $cert > 0.7;
    if ($confidentHam) {
        $this->plonker->removeIPs($post->getAuthorIPs());
    }
}
/**
 * Flags posts that mix several link markup syntaxes.
 *
 * Looks in the raw content, author name and author e-mail for bare http(s)
 * URLs, BBcode [url] tags, HTML anchors with href, and Textile-style
 * ":http" links.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason) when BBcode and HTML both occur, NULL otherwise
 */
function testPost(ISblamPost $p)
{
    $txt = implode(' ', array(
        $p->getRawContent(),
        $p->getAuthorName(),
        $p->getAuthorEmail(),
    ));

    $hasRaw = preg_match("!(?:^|\\s)https?://!mi", $txt);
    $hasBBcode = preg_match("!\\[url\\s*[\\]=]\\s*http!i", $txt);
    $hasHtml = preg_match("!<a\\s[^><]*href[^>]!i", $txt);
    $hasTextile = preg_match("!\":https?://!i", $txt);

    // Both BBcode and HTML are required for any verdict.
    if (!$hasBBcode || !$hasHtml) {
        return null;
    }

    if ($hasTextile || $hasRaw) {
        return array(1, self::CERTAINITY_NORMAL, "Mixed BBcode, HTML and other links");
    }

    return array(0.7, self::CERTAINITY_NORMAL, "Mixed BBcode and HTML");
}
/**
 * Runs every entry of $this->patterns against the post and sums the weights
 * of those that match.
 *
 * Each entry is read as array(regex, weight). The text searched is the raw
 * content, author name, e-mail and URI joined by newlines.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason) when the summed weight is non-zero, NULL otherwise
 */
function testPost(ISblamPost $p)
{
    $haystack = implode("\n", array(
        $p->getRawContent(),
        $p->getAuthorName(),
        $p->getAuthorEmail(),
        $p->getAuthorURI(),
    ));

    $hits = array();
    $total = 0;
    foreach ($this->patterns as $entry) {
        $regex = $entry[0];
        if (!preg_match($regex, $haystack)) {
            continue;
        }
        $hits[] = $regex;
        $total += $entry[1];
    }

    if (!$total) {
        return null;
    }

    return array($total, self::CERTAINITY_NORMAL, "Exact spam matches (" . implode(', ', $hits) . ")");
}
/**
 * Clears $this->addedhosts and passes the hostname and the domain of every
 * link in the post to checkHost().
 *
 * @param ISblamPost $p post being examined
 */
function preTestPost(ISblamPost $p)
{
    $this->addedhosts = array();

    $postLinks = $p->getLinks();
    if (!$postLinks) {
        return;
    }

    foreach ($postLinks as $postLink) {
        $hostname = $postLink->getHostname();
        if ($hostname) {
            $this->checkHost($hostname);
        }

        $registered = $postLink->getDomain();
        if ($registered) {
            $this->checkHost($registered);
        }
    }
}
/**
 * Scores the post by how often its checksum has been seen in the dupes table.
 *
 * The base score is (count - allowed) / 15, where two repeats are allowed
 * (three when the path contains "editpost"). Tier bonuses, a different
 * stored IP, and a $this->length over 250 raise it further.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason) when the score exceeds 0.1, NULL otherwise
 */
function testPost(ISblamPost $p)
{
    if (!$this->checksum) {
        return null;
    }

    // NOTE(review): the checksum is interpolated into the SQL; presumably it is an
    // internally computed hex digest — confirm it can never carry user input.
    $sql = sprintf("/*maxtime5*/SELECT count,ip FROM dupes WHERE checksum = UNHEX('%s') LIMIT 1", $this->checksum);
    $stmt = $this->db->query($sql);
    if (!$stmt) {
        return null;
    }

    $rows = $stmt->fetchAll();
    if (!count($rows)) {
        return null;
    }
    $row = $rows[0];
    $seen = $row['count'];

    // double-posting?
    $allowed = 2;
    if (strpos($p->getPath(), 'editpost') !== false) {
        $allowed++;
    }

    $score = ($seen - $allowed) / 15;
    $cert = self::CERTAINITY_LOW;

    // First matching tier wins: array(min count exclusive, bonus, certainty)
    $tiers = array(
        array(35, 2, self::CERTAINITY_HIGH),
        array(20, 0.8, self::CERTAINITY_HIGH),
        array(10, 0.4, self::CERTAINITY_HIGH),
        array(5, 0.2, self::CERTAINITY_NORMAL),
    );
    foreach ($tiers as $tier) {
        if ($seen > $tier[0]) {
            $score += $tier[1];
            $cert = $tier[2];
            break;
        }
    }

    // different IP? botnet!
    $storedIp = long2ip($row['ip']);
    if ($storedIp != $p->getAuthorIP()) {
        $score = ($score + 0.3) * 1.2;
    }

    // less likely to accidentally dupe
    if ($this->length > 250) {
        $score = ($score + 0.1) * 1.5;
    }

    if ($score > 0.1) {
        $score = min($score, 2) + min($score / 5, 4);
        return array($score, $cert, "Duplicate (x" . round($seen) . " = " . round($score, 1) . ")");
    }

    return null;
}
/**
 * Grants a bonus when the post was sent from whitelisted addresses.
 *
 * Fix: the previous loop overwrote its flag on every iteration, so only the
 * LAST address returned by getAuthorIPs() decided the outcome and any
 * non-whitelisted address before it was silently ignored. The bonus is now
 * granted only when every reported address is whitelisted, and the lookup
 * uses a strict comparison.
 *
 * NOTE(review): assumes getAuthorIPs() returns strings and $this->whitelist
 * is a list of IP strings — confirm.
 *
 * @param ISblamPost $p post being examined
 * @return array list of array(score, certainty, reason); empty when not whitelisted
 */
function testPost(ISblamPost $p)
{
    $out = array();

    $ips = $p->getAuthorIPs();

    // An empty address list must never count as whitelisted.
    $isWhiteIP = count($ips) > 0;
    foreach ($ips as $ip) {
        if (!in_array($ip, $this->whitelist, true)) {
            $isWhiteIP = false;
            break;
        }
    }

    if ($isWhiteIP) {
        $out[] = array(-1.0, self::CERTAINITY_HIGH, "Sent from whitelisted IP");
    }

    return $out;
}
/**
 * Extracts the challenge field from the submitted form data.
 *
 * The field name is "sc" followed by abs(crc32(install id)). On a
 * well-formed value the result is
 * array(install id, field name, 32-hex prefix, remainder of the value up to
 * the optional ";digits" tail, hexdec of the second hex part, comma-split
 * list, semicolon-split tail).
 *
 * If the expected field is missing or malformed but some other "sc<digits>"
 * field exists, a placeholder array(0, that key, 0, 0, 0, array(), array())
 * is returned.
 *
 * @param ISblamPost $p post being examined
 * @return array|null see above; NULL when there is no install id or no challenge field at all
 */
private function getChallengeData(ISblamPost $p)
{
    $installid = $p->getInstallId();
    if (!$installid) {
        d("challenge: didn't get install id");
        return null;
    }

    // !!! must be in sync with challenge.js.php
    $fieldname = 'sc' . abs(crc32($installid));
    $post = $p->getPost();

    $format = '!^([a-f0-9]{32})([a-f0-9]+;([a-f0-9]+);([\\d.,]+))((?:;\\d+)*)$!';
    if (!empty($post[$fieldname]) && preg_match($format, $post[$fieldname], $r)) {
        return array(
            $installid,
            $fieldname,
            $r[1],
            $r[2],
            hexdec($r[3]),
            explode(',', $r[4]),
            explode(';', $r[5]),
        );
    }

    d("can't find expected field {$fieldname} for challenge");

    // Fall back to any challenge-looking field generated for another install id.
    foreach ($post as $key => $unused) {
        if (preg_match('/^sc\\d+$/', $key)) {
            d("Found different install id for challenge");
            return array(0, $key, 0, 0, 0, array(), array());
        }
    }

    return null;
}
/**
 * Scores the post by its number of links and, for three or more, by the
 * ratio of links to words.
 *
 * The author URI is counted separately: it adds to the 0/1/2-link decision
 * but not to the link/word ratio, because it may be unrelated to the post's
 * contents.
 *
 * @param ISblamPost $p post being examined
 * @return array|null array(score, certainty, reason); NULL when getLinks() is NULL
 *                    or when a post without links has 20 bytes of text or less
 */
function testPost(ISblamPost $p)
{
    $links = $p->getLinks();
    if ($links === null) {
        return null;
    }

    $linkcount = count($links);
    $authorlink = $p->getAuthorURI() ? 1 : 0;

    switch ($linkcount + $authorlink) {
        case 0:
            // don't give nolinks bonus to posts with no content
            // (no content is abnormal and it may be another way to spam)
            if (strlen($p->getText()) > 20) {
                return array(-0.5, self::CERTAINITY_NORMAL, "No links");
            }
            return null;
        case 1:
            return array(0.1, self::CERTAINITY_LOW, "Single link");
        case 2:
            return array(0.2, self::CERTAINITY_LOW, "Two links");
    }

    // Word count is capped by the split limit of 500; URLs act as separators, not words.
    $words = preg_split('![^a-z0-9\\x7F-\\xFF-]+|https?://[^\\]\\[\\s\'"<>]+!i', $p->getText(), 500, PREG_SPLIT_NO_EMPTY);
    $numwords = count($words);

    // long posts may legitimately have more links. can't set any limits,
    // because wiki pages may contain lots of links.
    $ratio = round($linkcount * 100 / (10 + $numwords));

    if ($ratio > 22) {
        return array(0.45, self::CERTAINITY_NORMAL, "Flooded with links (A{$ratio}: {$linkcount} per {$numwords} words)");
    }
    if ($ratio > 17) {
        return array(0.35, self::CERTAINITY_NORMAL, "Flooded with links (B{$ratio}: {$linkcount} per {$numwords} words)");
    }
    if ($ratio > 12) {
        return array(0.25, self::CERTAINITY_NORMAL, "Flooded with links (C{$ratio}: {$linkcount} per {$numwords} words)");
    }
    if ($ratio > 6) {
        return array(0.25, self::CERTAINITY_NORMAL, "Lots of links (D{$ratio}: {$linkcount} per {$numwords} words)");
    }

    return array(0.25, self::CERTAINITY_LOW, "Some links (E{$ratio}: {$linkcount} per {$numwords} words)");
}
/**
 * Scores the post by the reverse-DNS names of the sender's IPs.
 *
 * Whitelisting applies only to the direct connection (the first IP), because
 * the others can be forged, and only when no objectionable host was found.
 *
 * Fixes:
 *  - The "first IP" flag used to stay set when the first address had no
 *    reverse DNS, so a later (forgeable) address could be whitelisted. The
 *    flag is now cleared for every address after the first.
 *  - The whitelist message used to interpolate whatever $rev the last loop
 *    iteration left behind; it now reports the host that was whitelisted.
 *
 * @param ISblamPost $p post being examined
 * @return array|null a list of array(score, certainty, reason) for blacklist hits,
 *                    a single array(score, certainty, reason) for a whitelist hit,
 *                    NULL when nothing matched
 */
function testPost(ISblamPost $p)
{
    $out = array();
    $firstIP = true;
    $whitelistedHost = null;

    foreach ($p->getAuthorIPs() as $ip) {
        // Remember the position before any early continue can skip the reset.
        $isDirect = $firstIP;
        $firstIP = false;

        $rev = SblamURI::gethostbyaddr($ip);
        if (!$rev) {
            continue;
        }
        if (is_array($rev)) {
            // Not expected; take the first entry and log it.
            warn($rev, 'gethostbyaddr returned array');
            $rev = reset($rev);
        }

        if (preg_match('!(?:\\.|^)(?:' . $this->isps . ')$!', $rev)) {
            $out[] = array(0.5, self::CERTAINITY_LOW, "Sent from blacklisted ISP ({$rev})");
        } elseif ($isDirect && preg_match('!\\.(?:' . $this->whitelist . ')$!', $rev)) {
            $whitelistedHost = $rev;
        } elseif (preg_match('!\\.(?:' . $this->blacklist . ')$!', $rev)) {
            $out[] = array(0.35, self::CERTAINITY_LOW, "Sent from blacklisted TLD ({$rev})");
        }
    }

    // Any blacklist hit overrides the whitelist bonus.
    if (count($out)) {
        return $out;
    }

    if ($whitelistedHost !== null) {
        return array(-0.25, self::CERTAINITY_LOW, "Sent from whitelisted TLD ({$whitelistedHost})");
    }

    return null;
}
/**
 * Counts banned keywords in the post text, in link labels and in URLs, and
 * returns one verdict per place where any were found.
 *
 * Keywords are loaded on first use through importBlocklist2(). No bonus is
 * given when no keyword is found.
 *
 * Changes: the keyword-list guard uses empty() so a list that is still NULL
 * after the import no longer reaches count() (a TypeError on PHP 8), and the
 * unused local $cnt is removed.
 *
 * @param ISblamPost $p post being examined
 * @return array|null list of array(score, certainty, reason), or NULL when nothing matched
 */
function testPost(ISblamPost $p)
{
    if ($this->keywords === null) {
        $this->importBlocklist2($this->blocklist);
    }
    if (empty($this->keywords)) {
        return null;
    }

    // Hits in the visible text and author name
    $res1 = $this->testText($p->getText() . ' ' . $p->getAuthorName());
    $res2 = 0;
    $res3 = 0;

    // All URI-like fields are lower-cased and searched as one string.
    $alluris = '';
    if ($uri = $p->getAuthorURI()) {
        $alluris .= strtolower($uri);
    }
    if ($uri = $p->getAuthorEmail()) {
        $alluris .= ' ' . strtolower($uri);
    }

    foreach ($p->getLinks() as $link) {
        if ($label = $link->getLabel()) {
            $res2 += count(array_intersect($this->getKeywordsFromText($label), $this->keywords));
        }
        if ($uri = $link->getURI()) {
            $alluris .= ' ' . strtolower($uri);
        }
    }

    // Replacing every keyword with itself leaves the text unchanged; only the
    // replacement count (written to $res3) is of interest.
    str_replace($this->keywords, $this->keywords, $alluris, $res3);

    $sum = $res1 + $res2 + $res3;
    if (!$sum) {
        return null;
    }

    $out = array();
    if ($res1) {
        $out[] = array(1.2 - 1 / $res1, $sum > 2 ? self::CERTAINITY_HIGH : self::CERTAINITY_NORMAL, "Banned keywords in text ({$res1})");
    }
    if ($res2) {
        $out[] = array(1.2 - 1 / ($res2 + 1), self::CERTAINITY_HIGH, "Banned keywords in link labels ({$res2})");
    }
    if ($res3) {
        $out[] = array(1.2 - 1 / $res3, $sum > 2 ? self::CERTAINITY_HIGH : self::CERTAINITY_NORMAL, "Banned keywords in URLs ({$res3})");
    }

    if (count($out)) {
        return $out;
    }

    return null;
}
/**
 * Collects the author's URI, every link of the post and the author's e-mail
 * through addURI()/addEmail() and returns the keys of the resulting map.
 *
 * NOTE(review): addURI()/addEmail() presumably take the map by reference —
 * confirm against their signatures.
 *
 * @param ISblamPost $p post being examined
 * @return array keys of the collected map
 */
protected function extractURIsFromPost(ISblamPost $p)
{
    $collected = array();

    $authorUri = $p->getAuthorURI();
    if ($authorUri) {
        $this->addURI($collected, new SblamURI($authorUri));
    }

    foreach ($p->getLinks() as $postLink) {
        $this->addURI($collected, $postLink);
    }

    $this->addEmail($collected, $p->getAuthorEmail());

    return array_keys($collected);
}
/**
 * Joins every textual field of the post and passes the result to
 * extractWords() together with $this->db->ignore.
 *
 * Both the raw and the stripped (URL-decoded) text are included to find more
 * phrases; word count doesn't matter here.
 *
 * @param ISblamPost $p post being examined
 * @return mixed whatever self::extractWords() returns
 */
protected function extractWordsFromPost(ISblamPost $p)
{
    $pieces = array(
        $p->getRawContent(),
        rawurldecode($p->getText()),
        $p->getAuthorName(),
        $p->getAuthorEmail(),
        $p->getAuthorURI(),
    );

    return self::extractWords(implode(' ', $pieces), $this->db->ignore);
}
/**
 * Scores the HTTP request headers that accompanied the post.
 *
 * Each heuristic appends array(score, certainty, reason) to the result;
 * negative scores are bonuses. Header names are the CGI-style keys returned
 * by getHeaders().
 *
 * @param ISblamPost $p post being examined
 * @return array|null list of verdicts; NULL when fewer than two headers are
 *                    available or no heuristic fired
 */
function testPost(ISblamPost $p)
{
    $h = $p->getHeaders();
    // HTTP_HOST is hardcoded, hence the minimum of two entries
    if (!$h || count($h) < 2) {
        return null;
    }

    $out = array();

    // Headers consulted by several checks below
    $hasUA = !empty($h['HTTP_USER_AGENT']);
    $ua = $hasUA ? $h['HTTP_USER_AGENT'] : '';
    $noVia = empty($h['HTTP_VIA']);
    $hasReferer = !empty($h['HTTP_REFERER']);

    if (!empty($h['HTTP_MOD_SECURITY_MESSAGE'])) {
        $out[] = array(1, self::CERTAINITY_HIGH, "mod_security warning");
    }

    // Buggy .Net always adds header which is only needed for large forms
    // (and browsers tend not to use it)
    if (!empty($h['HTTP_EXPECT'])
        && strpos($h['HTTP_EXPECT'], '100-') !== false
        && strlen($p->getRawContent()) < 5000
    ) {
        $out[] = array(0.3, self::CERTAINITY_NORMAL, "100-expect .Net header");
    }

    // Bots tend to send these
    if (!empty($h['HTTP_PRAGMA'])) {
        $out[] = array($noVia ? 0.3 : 0.1, self::CERTAINITY_LOW, "Pragma header");
    }
    if (!empty($h['HTTP_RANGE'])) {
        $out[] = array(0.5, self::CERTAINITY_HIGH, "Range header");
    }
    if (!empty($h['HTTP_PROXY_CONNECTION'])) {
        $out[] = array(0.2, self::CERTAINITY_LOW, "Proxy-Connection header");
    }

    if ($hasReferer) {
        $cnt = substr_count($h['HTTP_REFERER'], "http://");
        if ($cnt > 1) {
            $out[] = array(min(1.5, 0.5 + $cnt / 6), self::CERTAINITY_HIGH, "Multiple links in referrer");
        }
    }

    $cnt = count($p->getAuthorIPs());
    if ($cnt > 4) {
        $relayCert = $cnt > 7 ? self::CERTAINITY_HIGH : self::CERTAINITY_NORMAL;
        $out[] = array(($cnt - 2) / 10, $relayCert, "Insane number of relays ({$cnt})");
    }

    // Unpatched IE!?
    if ($hasUA && preg_match('/MSIE [456]\\.[0-9]; Windows (?:9|NT 5)/', $ua)) {
        $out[] = array(0.3, self::CERTAINITY_NORMAL, "Unpatched IE");
    }

    // Browsers almost always send these
    if (empty($h['HTTP_ACCEPT'])) {
        $out[] = array(0.7, self::CERTAINITY_NORMAL, "Missing Accept header");
    }
    if (!$hasUA) {
        $out[] = array(1, self::CERTAINITY_NORMAL, "Missing UA header");
    }
    if (empty($h['HTTP_ACCEPT_LANGUAGE'])) {
        $out[] = array(0.5, self::CERTAINITY_NORMAL, "Missing Accept-Language header");
    }

    // User agents starting with the old MSIE compatibility prefix are exempt here.
    $notOldMsie = !$hasUA || strpos($ua, 'Mozilla/4.0 (compatible; MSIE ') === false;
    if (empty($h['HTTP_ACCEPT_ENCODING']) && $noVia && $notOldMsie) {
        $out[] = array(0.4, self::CERTAINITY_LOW, "Missing Accept-Encoding header");
    }

    if (!empty($h['HTTP_ACCEPT_CHARSET'])) {
        $out[] = array(-0.2, self::CERTAINITY_LOW, "Has Accept-Charset header");
    }

    // Non-transparent proxy must add Via header
    $looksProxied = !empty($h['HTTP_X_FORWARDED_FOR']) || !empty($h['HTTP_MAX_FORWARDS']);
    if ($noVia && $looksProxied) {
        $out[] = array(0.2, self::CERTAINITY_LOW, "Lame proxy");
    }

    // TE: requires Connection:TE
    if (!empty($h['HTTP_TE'])) {
        $connectionHasTE = !empty($h['HTTP_CONNECTION']) && preg_match('!\\bTE\\b!', $h['HTTP_CONNECTION']);
        if (!$connectionHasTE) {
            $out[] = array(0.2, self::CERTAINITY_NORMAL, "Invalid TE header");
        }
    }

    // Googlebot doesn't post comments!
    if ($hasUA && preg_match('!Googlebot[/ -]|Slurp|Wget/|W3C_Validator|Advertise\\.com|nicebot|MMCrawler/|MSIECrawler|ia_archiver|WebaltBot/|nutbot\\.com|\\+http://search\\.!', $ua)) {
        $out[] = array(1, self::CERTAINITY_NORMAL, "Bots don't post comments");
    }

    // Headless browsers no thanks
    if ($hasUA && preg_match('!PhantomJS|CasperJS!', $ua)) {
        $out[] = array(1, self::CERTAINITY_HIGH, "Nice try, PhantomJS");
    }

    // Misspelled header name, or the header name repeated inside its own value
    if (!empty($h['HTTP_USERAGENT']) || ($hasUA && preg_match('!^User-Agent!i', $ua))) {
        $out[] = array(1, self::CERTAINITY_NORMAL, "Really badly written bot");
    }

    // I assume multipart forms are too tricky for most bots
    if (!empty($h['HTTP_CONTENT_LENGTH'])
        && !empty($h['HTTP_CONTENT_TYPE'])
        && preg_match('!^\\s*multipart/form-data\\s*;\\s*boundary\\s*=!i', $h['HTTP_CONTENT_TYPE'])
    ) {
        $out[] = array(-0.2, self::CERTAINITY_LOW, "Multipart form");
    }

    // browsers nicely decode and normalize paths, remove fragment part
    $path = $p->getPath();
    if ($path && preg_match('!&|^https?://|^//|/%7e|#|\\.\\./!i', $path)) {
        $out[] = array(0.3, self::CERTAINITY_NORMAL, "Improperly encoded path");
    }
    if ($hasReferer && preg_match('!&|/%7e|\\.\\./!i', $h['HTTP_REFERER'])) {
        $out[] = array(0.25, self::CERTAINITY_LOW, "Improperly encoded referer");
    }

    if (!count($out)) {
        return null;
    }

    return $out;
}
/**
 * Scores abuse of the author name / e-mail / URI fields and empty content.
 *
 * Links found in the name or e-mail fields are also registered on the post
 * through addLink().
 *
 * Fix: the "multiple links" and "HTML" checks were written as
 * `$cnt = substr_count(...) > 1`. Because `>` binds tighter than `=`, $cnt
 * received the boolean result of the comparison, so the score was a constant
 * instead of growing with the number of occurrences. The count is now
 * assigned first and compared afterwards. Field values are also cast to
 * string before being handed to string functions.
 *
 * @param ISblamPost $p post being examined
 * @return array list of array(score, certainty, reason); may be empty
 */
function testPost(ISblamPost $p)
{
    $out = array();

    $name = $p->getAuthorName();
    $email = $p->getAuthorEmail();
    $uri = (string) $p->getAuthorURI();

    $uriHasLink = $this->hasURI($uri);

    if ($this->hasURI($email)) {
        // Worse when the regular URI field was filled in as well.
        $score = $uriHasLink ? 0.4 : 0.2;
        $p->addLink($email); // expose it!
        $out[] = array($score, self::CERTAINITY_LOW, "Link stuffed in e-mail field");
    }

    if ($this->hasURI($name)) {
        $score = $uriHasLink ? 0.3 : 0.1;
        $p->addLink($name); // expose it!
        $out[] = array($score, self::CERTAINITY_LOW, "Link stuffed in name field");
    } elseif ("" === $name) {
        $out[] = array(0.1, self::CERTAINITY_LOW, "Anonymous");
    }

    $cnt = substr_count($uri, "http://");
    if ($cnt > 1) {
        $out[] = array($cnt / 10 + 0.2, self::CERTAINITY_LOW, "Multiple links in author URI field");
    }

    $cnt = substr_count($uri, "<a ");
    if ($cnt > 1) {
        $out[] = array($cnt / 5 + 0.2, self::CERTAINITY_LOW, "HTML in author URI field");
    }

    // Lengths are measured in bytes.
    $longs = 0;
    if (strlen((string) $name) > 50) {
        $longs++;
    }
    if (strlen((string) $email) > 50) {
        $longs++;
    }
    if (strlen($uri) > 150) {
        $longs++;
    }
    if ($longs) {
        $out[] = array($longs / 10 + 0.1, self::CERTAINITY_LOW, "Looong text in name/e-mail/URI fields");
    }

    if ("" === trim((string) $p->getRawContent())) {
        $out[] = array(0.6, self::CERTAINITY_LOW, "Empty content");
    }

    if (preg_match('!\\b(google\\.com|msn\\.com)\\b!', $uri)) {
        $out[] = array(0.2, self::CERTAINITY_LOW, "Not your website");
    }

    return $out;
}
/**
 * Stores a short-lived "ip-ban:<ip>" entry in APC for every author IP of a
 * post judged with score above 1.2 and certainty above 0.95.
 *
 * The stored value is the expiry timestamp (now + 5 seconds), and the entry
 * itself has a 5-second TTL.
 *
 * @param ISblamPost $p     post that was judged
 * @param float      $score final score
 * @param float      $cert  final certainty
 * @throws Exception when the APC extension is not available (checked on every call)
 */
function reportResult(ISblamPost $p, $score, $cert)
{
    if (!function_exists('apc_store')) {
        throw new Exception("NO APC");
    }

    $confidentSpam = $score > 1.2 && $cert > 0.95;
    if (!$confidentSpam) {
        return;
    }

    $banSeconds = 5; // block for 5 sec
    foreach ($p->getAuthorIPs() as $ip) {
        apc_store('ip-ban:' . $ip, time() + $banSeconds, $banSeconds);
    }
}