/**
 * Calculates the Athena value
 *
 * Runs every filter against the edit and, unless $wgAthenaTraining is set,
 * combines the resulting probabilities into a single log-odds score:
 *
 *   ln( P(spam) / P(not spam) ) + sum over filters of ln( P(result|spam) / P(result|not spam) )
 *
 * Whether or not a score was computed, the attempt is logged through
 * AthenaHelper::logAttempt() and the statistics are updated through
 * AthenaHelper::updateStats().
 *
 * @param $editPage EditPage
 * @param $text string
 * @param $summary string
 * @return double the log-odds score, or 0 when $wgAthenaTraining is set
 */
static function calculateAthenaValue($editPage, $text, $summary) {
    global $wgUser, $wgAthenaTraining;

    // Get title
    $titleObj = $editPage->getTitle();
    $title = $titleObj->getTitleValue()->getText();

    // Get filter results
    $diffLang = AthenaFilters::differentLanguage($text);
    $deleted = AthenaFilters::wasDeleted($titleObj);
    $wanted = AthenaFilters::isWanted($titleObj);
    $userAge = AthenaFilters::userAge();
    $titleLength = AthenaFilters::titleLength($titleObj);
    $namespace = AthenaFilters::getNamespace($titleObj);
    $syntaxType = AthenaFilters::syntaxType($text);
    $linksPercentage = AthenaFilters::linkPercentage($text);

    // If not training, work out probabilities
    if (!$wgAthenaTraining) {
        // Each calculateProbability_* call below is expected to add the ac_p_*
        // entries that are read immediately after it; the filled array is
        // handed to logAttempt() at the end.
        $probabilityArray = array();

        // Get the statistics table's contents
        $stats = AthenaHelper::getStatistics();

        // Prior: ln( P(spam) / P(not spam) )
        AthenaHelper::calculateProbability_Spam($stats, $probabilityArray);
        $lnProbSpamNotSpam = log($probabilityArray['ac_p_spam'] / $probabilityArray['ac_p_not_spam']);

        // Different language
        AthenaHelper::calculateProbability_Language($diffLang, $stats, $probabilityArray);
        $sigma = log($probabilityArray['ac_p_langgivenspam'] / $probabilityArray['ac_p_langgivennotspam']);

        // Deleted
        AthenaHelper::calculateProbability_Deleted($deleted, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_deletedgivenspam'] / $probabilityArray['ac_p_deletedgivennotspam']);

        // Wanted
        AthenaHelper::calculateProbability_Wanted($wanted, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_wantedgivenspam'] / $probabilityArray['ac_p_wantedgivennotspam']);

        // User type
        AthenaHelper::calculateProbability_User($userAge, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_usergivenspam'] / $probabilityArray['ac_p_usergivennotspam']);

        // Title length
        AthenaHelper::calculateProbability_Length($titleLength, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_titlelengthgivenspam'] / $probabilityArray['ac_p_titlelengthgivennotspam']);

        // Namespace
        AthenaHelper::calculateProbability_Namespace($namespace, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_namespacegivenspam'] / $probabilityArray['ac_p_namespacegivennotspam']);

        // Syntax
        AthenaHelper::calculateProbability_Syntax($syntaxType, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_syntaxgivenspam'] / $probabilityArray['ac_p_syntaxgivennotspam']);

        // Links
        AthenaHelper::calculateProbability_Links($linksPercentage, $stats, $probabilityArray);
        $sigma += log($probabilityArray['ac_p_linksgivenspam'] / $probabilityArray['ac_p_linksgivennotspam']);

        $prob = $lnProbSpamNotSpam + $sigma;
    } else {
        // al_value is double unsigned not null, so let's just set to 0 and let the code ignore it later on
        $prob = 0;
        $probabilityArray = null;
    }

    $links = AthenaFilters::numberOfLinks($text);

    $logArray = AthenaHelper::prepareLogArray($prob, $userAge, $links, $linksPercentage, $syntaxType, $diffLang, $deleted, $wanted);
    $detailsArray = AthenaHelper::preparePageDetailsArray($namespace, $title, $text, $summary, $wgUser->getId());

    AthenaHelper::logAttempt($logArray, $detailsArray, $probabilityArray);
    AthenaHelper::updateStats($logArray, $titleObj);

    return $prob;
}
/**
 * Classifies how much wiki markup a piece of text uses
 * 2 is advanced, 1 is basic, 0 is none
 * 3 is broken spam bot
 *
 * - 3: AthenaFilters::brokenSpamBot() flagged the text
 * - 2: more than one use of wiki-specific syntax (headings, nowiki tags,
 *      internal links, tables, templates)
 * - 1: more than one use of basic syntax (links, line breaks, bold, italics,
 *      or the HTML/BBCode alternatives <strong>, <a>, [url])
 * - 0: anything else
 *
 * NOTE: a '''bold''' span is also matched by the italics pattern, so it
 * contributes two to the basic count. That has always been the case and is
 * kept, so the "> 1" threshold classifies the same way as before.
 *
 * @param $text string
 * @return integer 0|1|2|3
 */
public static function syntaxType($text) {
    if (AthenaFilters::brokenSpamBot($text)) {
        return 3;
    }

    // ---- Advanced, wiki-specific syntax ----
    $count = 0;

    // Headings: level 2 (== x ==) through level 6 (====== x ======)
    for ($level = 2; $level <= 6; $level++) {
        $marker = str_repeat('=', $level);
        $count += preg_match_all('/' . $marker . '([^=]+)' . $marker . '(\s)*(\n|$)/', $text);
    }

    // nowiki tags are very wiki specific.
    // Lazy match so that several tags on one line are counted separately.
    $count += preg_match_all('/<nowiki>(.*?)<\/nowiki>/', $text);
    $count += preg_match_all('/<nowiki\/>/', $text);

    // Internal links
    $count += preg_match_all('/\[\[([^\[\]])+\]\]/', $text);

    // Tables. The body must be allowed to contain "|" (every cell starts with
    // one) and newlines, hence the lazy dot-all match up to the first "|}".
    $count += preg_match_all('/\{\|(.+?)\|\}/s', $text);

    // Templates
    $count += preg_match_all('/\{\{([^\{\}])+\}\}/', $text);

    if ($count > 1) {
        return 2;
    }

    // ---- Basic wiki syntax (bold, brs, links) ----

    // Links
    $count = AthenaFilters::numberOfLinks($text);

    // Line breaks: <br>, <br/> and <br />
    $count += preg_match_all('/<br\s*\/?>/', $text);

    // Bold. [^']+ means "anything but an apostrophe"; the previous class
    // also listed "(" and ")", which rejected bold text with parentheses.
    $count += preg_match_all("/'''([^']+)'''/", $text);

    // Italics (same character-class correction as bold)
    $count += preg_match_all("/''([^']+)''/", $text);

    // Check for alternative syntax (lazy, so adjacent tags count individually)
    $count += preg_match_all('/<strong>(.*?)<\/strong>/', $text);
    $count += preg_match_all('/<a(.*?)>(.*?)<\/a>/', $text);
    $count += preg_match_all('/\[url\]/', $text);

    if ($count > 1) {
        return 1;
    }

    // Else no syntax
    return 0;
}
/**
 * Re-runs AthenaFilters::syntaxType() over every logged page and rebuilds
 * the syntax statistics.
 *
 * For each athena_log row joined to its athena_page_details row, the stored
 * content is reclassified and the result is written back to
 * athena_log.al_syntax. Totals per syntax class are kept twice: once for all
 * pages, and once for rows whose al_success is 3 (the "spamand*" counters).
 * Both sets are then stored in athena_stats. Progress and totals are written
 * with echo.
 */
public function execute() {
    // This handle is used for UPDATEs below, so it must be the master
    // connection rather than a replica (DB_SLAVE).
    $dbw = wfGetDB(DB_MASTER);

    $res = $dbw->select(
        array('athena_log', 'athena_page_details'),
        array('athena_log.al_id', 'apd_content', 'al_success'),
        array(),
        __METHOD__,
        array(),
        array('athena_page_details' => array('INNER JOIN', array('athena_log.al_id=athena_page_details.al_id')))
    );

    $syntaxNone = 0;
    $syntaxBasic = 0;
    $syntaxComplex = 0;
    $bsb = 0;

    $spamandsyntaxNone = 0;
    $spamandsyntaxBasic = 0;
    $spamandsyntaxComplex = 0;
    $spamandbsb = 0;

    foreach ($res as $row) {
        echo "\n----------------------------------------------\n";
        echo "al_id is {$row->al_id} \n";

        $result = AthenaFilters::syntaxType($row->apd_content);

        // Loose comparison on purpose: the database layer may hand the
        // column back as a string.
        $isSpam = ($row->al_success == 3);

        switch ($result) {
            case 0:
                $syntaxNone++;
                if ($isSpam) {
                    $spamandsyntaxNone++;
                }
                break;
            case 1:
                $syntaxBasic++;
                if ($isSpam) {
                    $spamandsyntaxBasic++;
                }
                break;
            case 3:
                $bsb++;
                if ($isSpam) {
                    $spamandbsb++;
                }
                break;
            default:
                // Everything else is treated as complex syntax
                $syntaxComplex++;
                if ($isSpam) {
                    $spamandsyntaxComplex++;
                }
                break;
        }

        $dbw->update(
            'athena_log',
            array('al_syntax' => $result),
            array('al_id' => $row->al_id),
            __METHOD__
        );

        echo "\n----------------------------------------------\n";
    }

    echo "\n\n\n----------------------------------------------\n\n\n";
    echo "None: {$syntaxNone} \n";
    echo "Basic {$syntaxBasic}\n";
    echo "Complex: {$syntaxComplex} \n";
    echo "BSB {$bsb}\n";
    $total = $syntaxBasic + $syntaxComplex + $syntaxNone + $bsb;
    echo "Total page: {$total} \n";
    echo "\n\n\n----------------------------------------------\n\n\n";

    $pageStats = array(
        'syntaxnone' => $syntaxNone,
        'syntaxbasic' => $syntaxBasic,
        'syntaxcomplex' => $syntaxComplex,
        'brokenspambot' => $bsb,
    );
    foreach ($pageStats as $statName => $statValue) {
        $dbw->update(
            'athena_stats',
            // The integer-keyed entry is emitted unquoted, so CURRENT_TIMESTAMP
            // is evaluated by the database instead of being stored as a string.
            array('as_value' => $statValue, 'as_updated = CURRENT_TIMESTAMP'),
            array('as_name' => $statName),
            __METHOD__
        );
    }

    echo "\n\n\n----------------------------------------------\n\n\n";
    echo "None: {$spamandsyntaxNone} \n";
    echo "Basic {$spamandsyntaxBasic}\n";
    echo "Complex: {$spamandsyntaxComplex} \n";
    echo "BSB {$spamandbsb}\n";
    $total = $spamandsyntaxBasic + $spamandsyntaxComplex + $spamandsyntaxNone + $spamandbsb;
    echo "Total spam: {$total} \n";
    echo "\n\n\n----------------------------------------------\n\n\n";

    $spamStats = array(
        'spamandsyntaxnone' => $spamandsyntaxNone,
        'spamandsyntaxbasic' => $spamandsyntaxBasic,
        'spamandsyntaxcomplex' => $spamandsyntaxComplex,
        'spamandbrokenspambot' => $spamandbsb,
    );
    foreach ($spamStats as $statName => $statValue) {
        $dbw->update(
            'athena_stats',
            // Same unquoted SET fragment as for the page statistics above
            array('as_value' => $statValue, 'as_updated = CURRENT_TIMESTAMP'),
            array('as_name' => $statName),
            __METHOD__
        );
    }
}