Example #1
0
 /**
  * Scores an edit and records the attempt.
  *
  * Every Athena filter is run over the edit first. Unless $wgAthenaTraining is set, AthenaHelper is
  * then asked for the spam / not-spam probabilities of each filter result, and the natural log of
  * each spam:not-spam ratio is added to the log of the overall spam:not-spam ratio.
  * In training mode no probabilities are computed and the score is fixed at 0.
  * In both modes the attempt is handed to AthenaHelper::logAttempt() and
  * AthenaHelper::updateStats() before the score is returned.
  *
  * @param $editPage EditPage edit being checked; its title feeds the title-based filters
  * @param $text string text passed to the content filters and stored with the page details
  * @param $summary string summary stored with the page details
  * @return double sum of the log ratios, or 0 when training
  */
 static function calculateAthenaValue($editPage, $text, $summary)
 {
     global $wgUser, $wgAthenaTraining;

     $titleObj = $editPage->getTitle();
     $title = $titleObj->getTitleValue()->getText();

     // Raw filter results - these are logged whether or not a score is worked out
     $diffLang = AthenaFilters::differentLanguage($text);
     $deleted = AthenaFilters::wasDeleted($titleObj);
     $wanted = AthenaFilters::isWanted($titleObj);
     $userAge = AthenaFilters::userAge();
     $titleLength = AthenaFilters::titleLength($titleObj);
     $namespace = AthenaFilters::getNamespace($titleObj);
     $syntaxType = AthenaFilters::syntaxType($text);
     $linksPercentage = AthenaFilters::linkPercentage($text);

     if ($wgAthenaTraining) {
         // al_value is double unsigned not null, so store 0 and let later code ignore it
         $prob = 0;
         $probabilityArray = null;
     } else {
         // Filled in step by step by the AthenaHelper::calculateProbability_* calls below
         $probabilityArray = array();
         $stats = AthenaHelper::getStatistics();

         // ln(spam entry / not-spam entry) for two keys of the probability array.
         // NOTE(review): assumes neither entry is ever 0 - confirm in AthenaHelper.
         $logRatio = function ($spamKey, $notSpamKey) use (&$probabilityArray) {
             return log($probabilityArray[$spamKey] / $probabilityArray[$notSpamKey]);
         };

         // Overall spam vs not spam
         AthenaHelper::calculateProbability_Spam($stats, $probabilityArray);
         $lnProbSpamNotSpam = $logRatio('ac_p_spam', 'ac_p_not_spam');

         // Different language
         AthenaHelper::calculateProbability_Language($diffLang, $stats, $probabilityArray);
         $sigma = $logRatio('ac_p_langgivenspam', 'ac_p_langgivennotspam');

         // Previously deleted
         AthenaHelper::calculateProbability_Deleted($deleted, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_deletedgivenspam', 'ac_p_deletedgivennotspam');

         // Wanted page
         AthenaHelper::calculateProbability_Wanted($wanted, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_wantedgivenspam', 'ac_p_wantedgivennotspam');

         // User type
         AthenaHelper::calculateProbability_User($userAge, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_usergivenspam', 'ac_p_usergivennotspam');

         // Title length
         AthenaHelper::calculateProbability_Length($titleLength, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_titlelengthgivenspam', 'ac_p_titlelengthgivennotspam');

         // Namespace
         AthenaHelper::calculateProbability_Namespace($namespace, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_namespacegivenspam', 'ac_p_namespacegivennotspam');

         // Syntax
         AthenaHelper::calculateProbability_Syntax($syntaxType, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_syntaxgivenspam', 'ac_p_syntaxgivennotspam');

         // Links
         AthenaHelper::calculateProbability_Links($linksPercentage, $stats, $probabilityArray);
         $sigma += $logRatio('ac_p_linksgivenspam', 'ac_p_linksgivennotspam');

         $prob = $lnProbSpamNotSpam + $sigma;
     }

     // Record the attempt and fold it into the running statistics
     $links = AthenaFilters::numberOfLinks($text);
     $logArray = AthenaHelper::prepareLogArray(
         $prob,
         $userAge,
         $links,
         $linksPercentage,
         $syntaxType,
         $diffLang,
         $deleted,
         $wanted
     );
     $detailsArray = AthenaHelper::preparePageDetailsArray(
         $namespace,
         $title,
         $text,
         $summary,
         $wgUser->getId()
     );
     AthenaHelper::logAttempt($logArray, $detailsArray, $probabilityArray);
     AthenaHelper::updateStats($logArray, $titleObj);

     return $prob;
 }
Example #2
0
 /**
  * Classifies how much wiki syntax a piece of text uses.
  *
  * Returns 3 as soon as AthenaFilters::brokenSpamBot() flags the text.
  * Otherwise wiki-specific constructs (headings, nowiki tags, internal links, tables, templates)
  * are counted and 2 is returned when more than one is found.
  * Failing that, basic markup is counted (links as reported by AthenaFilters::numberOfLinks(),
  * line breaks, bold, italics and the <strong>, <a> and [url] alternatives) and 1 is returned
  * when more than one is found.
  *
  * 2 is advanced, 1 is basic, 0 is none
  * 3 is broken spam bot
  *
  * @param $text string
  * @return integer 0|1|2|3
  */
 public static function syntaxType($text)
 {
     if (AthenaFilters::brokenSpamBot($text)) {
         return 3;
     }

     $advancedPatterns = array(
         // Headings, one pattern per level (2 to 6)
         "/==([^=]+)==(\\s)*(\n|\$)/",
         "/===([^=]+)===(\\s)*(\n|\$)/",
         "/====([^=]+)====(\\s)*(\n|\$)/",
         "/=====([^=]+)=====(\\s)*(\n|\$)/",
         "/======([^=]+)======(\\s)*(\n|\$)/",
         // nowiki tags are very wiki specific
         "/<nowiki>(.*)<\\/nowiki>/",
         "/<nowiki\\/>/",
         // Internal links
         "/\\[\\[([^\\[\\]])+\\]\\]/",
         // Tables
         "/\\{\\|([^\\{\\|\\}])+\\|\\}/",
         // Templates
         "/\\{\\{([^\\{\\}])+\\}\\}/",
     );
     $advancedCount = 0;
     foreach ($advancedPatterns as $pattern) {
         $advancedCount += preg_match_all($pattern, $text);
     }
     // A single hit is not enough to call the text advanced
     if ($advancedCount > 1) {
         return 2;
     }

     $basicPatterns = array(
         // Line breaks
         "/<br\\/>|<br>/",
         // Bold. The class only has to exclude the quote delimiter; the previous
         // [^(''')] also excluded "(" and ")", so bold text with parentheses was missed.
         "/'''([^']+)'''/",
         // Italics (same character-class fix as bold). Note this also matches inside
         // a ''' bold ''' span, so one bold run contributes to both counts.
         "/''([^']+)''/",
         // Alternative (non-wiki) syntax
         "/<strong>(.*)<\\/strong>/",
         "/<a(.*)>(.*)<\\/a>/",
         "/\\[url\\]/",
     );
     // Basic wiki syntax starts with the number of links
     $basicCount = 0;
     $basicCount += AthenaFilters::numberOfLinks($text);
     foreach ($basicPatterns as $pattern) {
         $basicCount += preg_match_all($pattern, $text);
     }
     if ($basicCount > 1) {
         return 1;
     }

     // Else no syntax
     return 0;
 }
Example #3
0
 /**
  * Re-runs AthenaFilters::syntaxType() over every logged page and rebuilds the syntax statistics.
  *
  * Each athena_log row is joined to its athena_page_details content, the syntax type is
  * recomputed and written back to athena_log.al_syntax. Rows are tallied per syntax type,
  * once for all rows and once for rows whose al_success is 3, and the eight matching
  * athena_stats rows are overwritten with the new totals. Progress and totals are echoed.
  */
 public function execute()
 {
     // This script writes, so it needs the master connection; a DB_SLAVE handle is a
     // replica connection and only happens to accept writes on single-server setups.
     $dbw = wfGetDB(DB_MASTER);
     $res = $dbw->select(
         array('athena_log', 'athena_page_details'),
         array('athena_log.al_id', 'apd_content', 'al_success'),
         array(),
         __METHOD__,
         array(),
         array('athena_page_details' => array('INNER JOIN', array('athena_log.al_id=athena_page_details.al_id')))
     );

     // Tallies keyed by athena_stats name; the spam rows use the same names prefixed with "spamand"
     $pages = array('syntaxnone' => 0, 'syntaxbasic' => 0, 'syntaxcomplex' => 0, 'brokenspambot' => 0);
     $spam = $pages;

     foreach ($res as $row) {
         echo "\n----------------------------------------------\n";
         echo "al_id is {$row->al_id} \n";
         $result = AthenaFilters::syntaxType($row->apd_content);
         // Anything that is not none (0), basic (1) or broken spam bot (3) counts as complex
         switch ($result) {
             case 0:
                 $bucket = 'syntaxnone';
                 break;
             case 1:
                 $bucket = 'syntaxbasic';
                 break;
             case 3:
                 $bucket = 'brokenspambot';
                 break;
             default:
                 $bucket = 'syntaxcomplex';
         }
         $pages[$bucket]++;
         // NOTE(review): al_success 3 is tallied as spam here - presumably "confirmed spam"; verify against the schema
         if ($row->al_success == 3) {
             $spam[$bucket]++;
         }
         $dbw->update('athena_log', array('al_syntax' => $result), array('al_id' => $row->al_id), __METHOD__);
         echo "\n----------------------------------------------\n";
     }

     echo "\n\n\n----------------------------------------------\n\n\n";
     echo "None: {$pages['syntaxnone']} \n";
     echo "Basic {$pages['syntaxbasic']}\n";
     echo "Complex: {$pages['syntaxcomplex']} \n";
     echo "BSB {$pages['brokenspambot']}\n";
     $total = $pages['syntaxbasic'] + $pages['syntaxcomplex'] + $pages['syntaxnone'] + $pages['brokenspambot'];
     echo "Total page: {$total} \n";
     echo "\n\n\n----------------------------------------------\n\n\n";
     foreach ($pages as $name => $value) {
         // The integer-keyed entry is emitted as raw SQL, so CURRENT_TIMESTAMP is evaluated
         // by the database instead of being stored as a quoted string.
         $dbw->update(
             'athena_stats',
             array('as_value' => $value, 'as_updated = CURRENT_TIMESTAMP'),
             array('as_name' => $name),
             __METHOD__
         );
     }

     echo "\n\n\n----------------------------------------------\n\n\n";
     echo "None: {$spam['syntaxnone']} \n";
     echo "Basic {$spam['syntaxbasic']}\n";
     echo "Complex: {$spam['syntaxcomplex']} \n";
     echo "BSB {$spam['brokenspambot']}\n";
     $total = $spam['syntaxbasic'] + $spam['syntaxcomplex'] + $spam['syntaxnone'] + $spam['brokenspambot'];
     echo "Total spam: {$total} \n";
     echo "\n\n\n----------------------------------------------\n\n\n";
     foreach ($spam as $name => $value) {
         $dbw->update(
             'athena_stats',
             array('as_value' => $value, 'as_updated = CURRENT_TIMESTAMP'),
             array('as_name' => 'spamand' . $name),
             __METHOD__
         );
     }
 }