Example #1
 /**
  * Pre-test hook: builds a normalized fingerprint of the post and records it in the `dupes` table.
  *
  * NOTE(review): this excerpt has lost its braces (the function's own and every closing
  * brace) and the body of the final `if` is not visible, so nesting and early-exit
  * behaviour are inferred from indentation only - confirm against the full source.
  *
  * @param ISblamPost $p post whose raw content and author fields feed the checksum
  */
 function preTestPost(ISblamPost $p)
     // Replace each "[:xx:]" token (any two characters between the colons) together with
     // the whitespace that follows it by a single space.
     $text = preg_replace('/\\[:..:\\]\\s+/', ' ', $p->getRawContent());
     // Short texts are padded with author fields, one at a time, while the text
     // stays below the given byte length.
     if (strlen($text) < 40) {
         $text .= $p->getAuthorURI();
     if (strlen($text) < 40) {
         $text .= $p->getAuthorEmail();
     if (strlen($text) < 40) {
         $text .= $p->getAuthorName();
     if (strlen($text) < 20) {
         $text .= $p->getAuthorIP();
     // Still under 10 bytes: the checksum is cleared.
     // NOTE(review): presumably followed by an early return that this excerpt dropped - confirm.
     if (strlen($text) < 10) {
         $this->checksum = NULL;
     // Normalize the lower-cased text with three replacements applied in order:
     //  1. runs of punctuation/whitespace        -> ' '
     //  2. runs of letter+digit hex-like tokens  -> 'H'
     //  3. runs of 2 to 9 digits                 -> 'D'
     $text = preg_replace(array('/[.,\\s!:;()-]+/', '/([a-f0-9]{1,3}[a-f]{1,6}[0-9]{1,6})+/', '/\\d\\d{1,8}/'), array(' ', 'H', 'D'), strtolower($text));
     // d() is defined elsewhere; presumably a debug-output helper.
     d($text, 'normalized text');
     // Remember the normalized length and its MD5 digest on the instance.
     $this->length = strlen($text);
     $this->checksum = md5($text);
     // Upsert the checksum: a new row starts at count 1 and expires 18 hours from now.
     // On a duplicate key the count becomes 1 + count, or 1 + CEIL(count/10) when the
     // stored row has already expired, and expires becomes the later of
     // (stored expires + 6 hours) and (now + 18 hours).
     // The branch taken when exec() returns a falsy value is not visible in this excerpt.
     if (!$this->db->exec(sprintf("/*maxtime5*/INSERT INTO dupes (checksum,count,expires,ip) VALUES(UNHEX('%s'),1,%d,%u)\n\t\t\tON DUPLICATE KEY UPDATE count = 1 + IF(expires < %d,CEIL(count/10),count), expires = GREATEST(expires + 3600*6, %d)", $this->checksum, time() + 3600 * 18, ip2long($p->getAuthorIP()), time(), time() + 3600 * 18))) {
Example #2
 /**
  * Scores a post for signs of a mail exploit.
  *
  * The raw content and every non-empty author field (name, e-mail, URI) are passed
  * through checkFragment(); an author field that contains a line feed adds a further
  * 5 points. The accumulated total is then mapped onto a verdict.
  *
  * @param ISblamPost $p post being examined
  * @return array|null array(score, certainty, reason), or NULL when the total does not exceed 11
  */
 function testPost(ISblamPost $p)
 {
     $total = $this->checkFragment($p->getRawContent());

     // Same treatment for each author field, in the original order.
     foreach (array('getAuthorName', 'getAuthorEmail', 'getAuthorURI') as $getter) {
         $field = $p->{$getter}();
         if (!$field) {
             continue;
         }

         $total += $this->checkFragment($field);
         if (strpos($field, "\n") !== false) {
             $total += 5;
         }
     }

     // Highest threshold first; the first one exceeded decides the verdict.
     if ($total > 28) {
         return array(4, self::CERTAINITY_SURE, "Cerainly a mail exploit");
     } elseif ($total > 23) {
         return array(2, self::CERTAINITY_SURE, "Mail exploit");
     } elseif ($total > 17) {
         return array(1, self::CERTAINITY_HIGH, "Mail exploit");
     } elseif ($total > 11) {
         return array(0.6, self::CERTAINITY_NORMAL, "Possible mail exploit");
     }

     return NULL;
 }
Example #3
 /**
  * Flags suspicious use of the author fields (name, e-mail, URI) and empty content.
  *
  * Every rule that fires appends one array(score, certainty, reason) entry.
  *
  * @param ISblamPost $p post being examined
  * @return array list of array(score, certainty, reason) entries; empty when no rule fired
  */
 function testPost(ISblamPost $p)
 {
     $out = array();

     if ($this->hasURI($p->getAuthorEmail())) {
         $score = 0.2;
         if ($this->hasURI($p->getAuthorURI())) {
             $score = 0.4;
         }
         // expose it!
         $out[] = array($score, self::CERTAINITY_LOW, "Link stuffed in e-mail field");
     }

     if ($this->hasURI($p->getAuthorName())) {
         $score = 0.1;
         if ($this->hasURI($p->getAuthorURI())) {
             $score = 0.3;
         }
         // expose it!
         $out[] = array($score, self::CERTAINITY_LOW, "Link stuffed in name field");
     } else {
         if ("" === $p->getAuthorName()) {
             $out[] = array(0.1, self::CERTAINITY_LOW, "Anonymous");
         }
     }

     // The assignment must be parenthesized: `>` binds tighter than `=`, so without the
     // parentheses $cnt would receive the boolean result of the comparison and the
     // score would no longer depend on the number of occurrences.
     if (($cnt = substr_count($p->getAuthorURI(), "http://")) > 1) {
         $out[] = array($cnt / 10 + 0.2, self::CERTAINITY_LOW, "Multiple links in author URI field");
     }
     if (($cnt = substr_count($p->getAuthorURI(), "<a ")) > 1) {
         $out[] = array($cnt / 5 + 0.2, self::CERTAINITY_LOW, "HTML in author URI field");
     }

     // Count how many of the author fields are over-long (byte lengths).
     $longs = 0;
     if (strlen($p->getAuthorName()) > 50) {
         $longs++;
     }
     if (strlen($p->getAuthorEmail()) > 50) {
         $longs++;
     }
     if (strlen($p->getAuthorURI()) > 150) {
         $longs++;
     }
     if ($longs) {
         $out[] = array($longs / 10 + 0.1, self::CERTAINITY_LOW, "Looong text in name/e-mail/URI fields");
     }

     if ("" === trim($p->getRawContent())) {
         $out[] = array(0.6, self::CERTAINITY_LOW, "Empty content");
     }

     // Author URI points at a well-known site rather than the author's own.
     if (preg_match('!\\b(google\\.com|msn\\.com)\\b!', $p->getAuthorURI())) {
         $out[] = array(0.2, self::CERTAINITY_LOW, "Not your website");
     }

     return $out;
 }
Example #4
 /**
  * Detects posts that are wrapped in hexadecimal "hash marks".
  *
  * The content must start with a 32-64 character hex string, followed by at least
  * two stretches of text separated by whitespace, and may end with a second hex
  * string of the same kind.
  *
  * @param ISblamPost $p post being examined
  * @return array|null array(score, certainty, reason), or NULL when the content does not match
  */
 function testPost(ISblamPost $p)
 {
     if (!preg_match('!^\\s*([a-f0-9]{32,64}).+?\\s.+?([a-f0-9]{32,64})?\\s*$!si', $p->getRawContent(), $res)) {
         return NULL;
     }

     // The leading hash (group 1) must contain a letter directly followed by a digit.
     // This check runs on the captured hash itself rather than on the whole match
     // ($res[0]), where ordinary text after the hash could satisfy it.
     if (!preg_match('![a-f][0-9]!', $res[1])) {
         return NULL;
     }

     // Group 2 is the optional trailing hash. preg_match() leaves it unset when it
     // did not take part in the match, hence the empty() guard. Group 1 is mandatory,
     // so testing it here would always succeed.
     if (!empty($res[2]) && preg_match('![a-f][0-9]!', $res[2])) {
         return array(0.3, self::CERTAINITY_NORMAL, "Hash marks (2)");
     }

     return array(0.2, self::CERTAINITY_LOW, "Hash marks (1)");
 }
Example #5
 /**
  * Flags posts that mix several link markup styles at once.
  *
  * Content, author name and author e-mail are searched together for BBcode links,
  * HTML anchors, Textile links and bare URLs.
  *
  * @param ISblamPost $p post being examined
  * @return array|null array(score, certainty, reason), or NULL unless both BBcode and HTML links are present
  */
 function testPost(ISblamPost $p)
 {
     $haystack = $p->getRawContent() . ' ' . $p->getAuthorName() . ' ' . $p->getAuthorEmail();

     $hasBare    = preg_match("!(?:^|\\s)https?://!mi", $haystack);
     $hasBBcode  = preg_match("!\\[url\\s*[\\]=]\\s*http!i", $haystack);
     $hasHtml    = preg_match("!<a\\s[^><]*href[^>]!i", $haystack);
     $hasTextile = preg_match("!\":https?://!i", $haystack);

     // Both verdicts require BBcode and HTML links together.
     if (!$hasBBcode || !$hasHtml) {
         return NULL;
     }

     // A third link style on top of those raises the score.
     if ($hasTextile || $hasBare) {
         return array(1, self::CERTAINITY_NORMAL, "Mixed BBcode, HTML and other links");
     }

     return array(0.7, self::CERTAINITY_NORMAL, "Mixed BBcode and HTML");
 }
Example #6
 /**
  * Matches the post against the configured list of exact spam patterns.
  *
  * Each entry of $this->patterns is read as array(regex, score). The regexes are run
  * against the content and the author name, e-mail and URI joined by newlines, and
  * the scores of all matching entries are summed.
  *
  * @param ISblamPost $p post being examined
  * @return array|null array(score, certainty, reason listing the matched regexes), or NULL when the summed score is falsy
  */
 function testPost(ISblamPost $p)
 {
     $haystack = implode("\n", array(
         $p->getRawContent(),
         $p->getAuthorName(),
         $p->getAuthorEmail(),
         $p->getAuthorURI(),
     ));

     $hits = array();
     $total = 0;
     foreach ($this->patterns as $entry) {
         if (!preg_match($entry[0], $haystack)) {
             continue;
         }
         $hits[] = $entry[0];
         $total += $entry[1];
     }

     if (!$total) {
         return NULL;
     }

     return array($total, self::CERTAINITY_NORMAL, "Exact spam matches (" . implode(', ', $hits) . ")");
 }
Example #7
 /**
  * Collects the words of a post from all of its text fields.
  *
  * Both the raw content and the URL-decoded stripped text are included, so that more
  * phrases can be found; word counts do not matter here.
  *
  * @param ISblamPost $p post to read the fields from
  * @return mixed whatever self::extractWords() returns for the combined text
  */
 protected function extractWordsFromPost(ISblamPost $p)
 {
     $parts = array(
         $p->getRawContent(),
         rawurldecode($p->getText()),
         $p->getAuthorName(),
         $p->getAuthorEmail(),
         $p->getAuthorURI(),
     );

     return self::extractWords(implode(' ', $parts), $this->db->ignore);
 }
Example #8
 /**
  * Scores the HTTP request headers that accompanied a post.
  *
  * Each rule below appends one array(score, certainty, reason) entry when it fires;
  * negative scores count in the post's favour.
  *
  * @param ISblamPost $p post being examined
  * @return array|null list of array(score, certainty, reason) entries, or NULL when
  *                    fewer than two headers are available or no rule fired
  */
 function testPost(ISblamPost $p)
 {
     $headers = $p->getHeaders();
     if (!$headers || count($headers) < 2) {
         return NULL;
     }

     // Header values consulted by several rules. NULL stands for "empty()".
     $ua      = empty($headers['HTTP_USER_AGENT']) ? NULL : $headers['HTTP_USER_AGENT'];
     $referer = empty($headers['HTTP_REFERER']) ? NULL : $headers['HTTP_REFERER'];
     $noVia   = empty($headers['HTTP_VIA']);

     // HTTP_HOST is hardcoded! :///
     $out = array();

     if (!empty($headers['HTTP_MOD_SECURITY_MESSAGE'])) {
         $out[] = array(1, self::CERTAINITY_HIGH, "mod_security warning");
     }

     // Buggy .Net always adds header which is only needed for large forms (and browsers tend not to use it)
     if (!empty($headers['HTTP_EXPECT'])
         && strpos($headers['HTTP_EXPECT'], '100-') !== false
         && strlen($p->getRawContent()) < 5000
     ) {
         $out[] = array(0.3, self::CERTAINITY_NORMAL, "100-expect .Net header");
     }

     // Bots tend to send these
     if (!empty($headers['HTTP_PRAGMA'])) {
         $out[] = array($noVia ? 0.3 : 0.1, self::CERTAINITY_LOW, "Pragma header");
     }
     if (!empty($headers['HTTP_RANGE'])) {
         $out[] = array(0.5, self::CERTAINITY_HIGH, "Range header");
     }
     if (!empty($headers['HTTP_PROXY_CONNECTION'])) {
         $out[] = array(0.2, self::CERTAINITY_LOW, "Proxy-Connection header");
     }

     if ($referer !== NULL && ($linkCount = substr_count($referer, "http://")) > 1) {
         $out[] = array(min(1.5, 0.5 + $linkCount / 6), self::CERTAINITY_HIGH, "Multiple links in referrer");
     }

     $relayCount = count($p->getAuthorIPs());
     if ($relayCount > 4) {
         $relayCertainty = $relayCount > 7 ? self::CERTAINITY_HIGH : self::CERTAINITY_NORMAL;
         $out[] = array(($relayCount - 2) / 10, $relayCertainty, "Insane number of relays ({$relayCount})");
     }

     // Unpatched IE!?
     if ($ua !== NULL && preg_match('/MSIE [456]\\.[0-9]; Windows (?:9|NT 5)/', $ua)) {
         $out[] = array(0.3, self::CERTAINITY_NORMAL, "Unpatched IE");
     }

     // Browsers almost always send these
     if (empty($headers['HTTP_ACCEPT'])) {
         $out[] = array(0.7, self::CERTAINITY_NORMAL, "Missing Accept header");
     }
     if ($ua === NULL) {
         $out[] = array(1, self::CERTAINITY_NORMAL, "Missing UA header");
     }
     if (empty($headers['HTTP_ACCEPT_LANGUAGE'])) {
         $out[] = array(0.5, self::CERTAINITY_NORMAL, "Missing Accept-Language header");
     }
     if (empty($headers['HTTP_ACCEPT_ENCODING'])
         && $noVia
         && ($ua === NULL || strpos($ua, 'Mozilla/4.0 (compatible; MSIE ') === false)
     ) {
         $out[] = array(0.4, self::CERTAINITY_LOW, "Missing Accept-Encoding header");
     }
     if (!empty($headers['HTTP_ACCEPT_CHARSET'])) {
         $out[] = array(-0.2, self::CERTAINITY_LOW, "Has Accept-Charset header");
     }

     // Non-transparent proxy must add Via header
     if ($noVia && (!empty($headers['HTTP_X_FORWARDED_FOR']) || !empty($headers['HTTP_MAX_FORWARDS']))) {
         $out[] = array(0.2, self::CERTAINITY_LOW, "Lame proxy");
     }

     // TE: requires Connection:TE
     if (!empty($headers['HTTP_TE'])
         && (empty($headers['HTTP_CONNECTION']) || !preg_match('!\\bTE\\b!', $headers['HTTP_CONNECTION']))
     ) {
         $out[] = array(0.2, self::CERTAINITY_NORMAL, "Invalid TE header");
     }

     // Googlebot doesn't post comments!
     if ($ua !== NULL && preg_match('!Googlebot[/ -]|Slurp|Wget/|W3C_Validator|Advertise\\.com|nicebot|MMCrawler/|MSIECrawler|ia_archiver|WebaltBot/|nutbot\\.com|\\+http://search\\.!', $ua)) {
         $out[] = array(1, self::CERTAINITY_NORMAL, "Bots don't post comments");
     }

     // Headless browsers no thanks
     if ($ua !== NULL && preg_match('!PhantomJS|CasperJS!', $ua)) {
         $out[] = array(1, self::CERTAINITY_HIGH, "Nice try, PhantomJS");
     }

     // Misspelled header name, or the header name pasted into its own value.
     if (!empty($headers['HTTP_USERAGENT']) || ($ua !== NULL && preg_match('!^User-Agent!i', $ua))) {
         $out[] = array(1, self::CERTAINITY_NORMAL, "Really badly written bot");
     }

     // I assume multipart forms are too tricky for most bots
     if (!empty($headers['HTTP_CONTENT_LENGTH'])
         && !empty($headers['HTTP_CONTENT_TYPE'])
         && preg_match('!^\\s*multipart/form-data\\s*;\\s*boundary\\s*=!i', $headers['HTTP_CONTENT_TYPE'])
     ) {
         $out[] = array(-0.2, self::CERTAINITY_LOW, "Multipart form");
     }

     // browsers nicely decode and normalize paths, remove fragment part
     $path = $p->getPath();
     if ($path && preg_match('!&amp;|^https?://|^//|/%7e|#|\\.\\./!i', $path)) {
         $out[] = array(0.3, self::CERTAINITY_NORMAL, "Improperly encoded path");
     }
     if ($referer !== NULL && preg_match('!&amp;|/%7e|\\.\\./!i', $referer)) {
         $out[] = array(0.25, self::CERTAINITY_LOW, "Improperly encoded referer");
     }

     return count($out) ? $out : NULL;
 }