/**
 * Returns the token whose probability lies farthest from the neutral value 0.5,
 * i.e. the one with the largest abs(getProb() - 0.5).
 *
 * The raw tokens are first converted with
 * BayesianTokens::getBayesianTokensFromArray(), passing false as third argument.
 *
 * @param blogId The blog id
 * @param tokens Array with the raw tokens to look up
 * @return The token object farthest from 0.5, or null if there are no tokens.
 * When several tokens are equally far, the first one found is returned.
 */
function getFarthestToken($blogId, $tokens)
{
    $bayesianTokens = new BayesianTokens();
    $tokens = $bayesianTokens->getBayesianTokensFromArray($blogId, $tokens, false);

    $farthestToken = null;
    // abs() is never negative, so -1 guarantees that the first token is picked up
    $farthestDistance = -1;

    // a single pass is enough to find the maximum; this also avoids indexing an
    // empty key list when no tokens are available and does not depend on the
    // keys used by the tokens array
    foreach ($tokens as $token) {
        $distance = abs($token->getProb() - 0.5);
        if ($distance > $farthestDistance) {
            $farthestDistance = $distance;
            $farthestToken = $token;
        }
    }

    return $farthestToken;
}
/**
 * Returns the most significant tokens out of the given ones, ordered from the
 * farthest to the closest to the neutral probability 0.5.
 *
 * Only tokens for which both isSignificant() and isValid() return true are
 * taken into account. The amount of tokens returned is limited by the
 * "bayesian_filter_number_significant_tokens" configuration setting.
 *
 * @param blogId The blog id
 * @param tokens Array with the raw tokens to evaluate
 * @return An array with the selected token objects, which may be empty
 * @private
 */
function _getMostSignificantTokens($blogId, $tokens)
{
    $config =& Config::getConfig();

    $bayesianFilterInfos = new BayesianFilterInfos();
    $bayesianFilterInfo = $bayesianFilterInfos->getBlogBayesianFilterInfo($blogId);

    $totalSpam = $bayesianFilterInfo->getTotalSpam();
    $totalNonSpam = $bayesianFilterInfo->getTotalNonSpam();

    $bayesianTokens = new BayesianTokens();

    // NOTE(review): with 0 spam and 0 non-spam occurrences this presumably only
    // refreshes the stored token data against the current totals -- confirm
    // against BayesianTokens::updateOccurrences()
    foreach ($tokens as $token) {
        $bayesianTokens->updateOccurrences($blogId, $token, 0, 0, $totalSpam, $totalNonSpam, false);
    }

    $tokens = $bayesianTokens->getBayesianTokensFromArray($blogId, $tokens);

    // the distances are stored under the key of the token they belong to;
    // appending them with array_push() would renumber them from 0 as soon as
    // one token is skipped, and the lookup below would return the wrong tokens
    $distances = array();
    foreach ($tokens as $key => $token) {
        if ($token->isSignificant() && $token->isValid()) {
            $distances[$key] = abs($token->getProb() - 0.5);
        }
    }

    // arsort() keeps the keys, so they can still be used to find the tokens
    arsort($distances);

    // the limit does not change while looping, so it is read only once
    $maxTokens = $config->getValue("bayesian_filter_number_significant_tokens");

    $significantTokens = array();
    $count = 0;

    foreach ($distances as $key => $distance) {
        array_push($significantTokens, $tokens[$key]);
        $count++;

        if ($count == $maxTokens) {
            break;
        }
    }

    return $significantTokens;
}
/**
 * Untrains the filter, removing the contribution of a comment/article.
 *
 * All the given fields are tokenized; every field but the text gets its own
 * context mark. The occurrences of the resulting tokens are then decremented,
 * as well as the spam or non-spam total of the blog.
 *
 * @param blogId The blog id
 * @param topic The topic of the comment/article that we're using to untrain the filter
 * @param text The text of the comment/article that we're using to untrain the filter
 * @param userName Name of the user posting this comment/article
 * @param userEmail Email address of the user posting this comment/article
 * @param userUrl URL of the user posting this comment/article
 * @param spam Whether the contents should be unmarked as spam (true) or as
 * non-spam (false). The contents will be unmarked as non-spam by default
 * @return Always true
 * @static
 * @see train
 */
function untrain($blogId, $topic, $text, $userName, $userEmail, $userUrl, $spam = false)
{
    $tokenizer = new BayesianTokenizer();

    // the text is the only field that does not get a context mark
    $topicTokens = $tokenizer->addContextMark($tokenizer->tokenize($topic), TOKEN_TOPIC_MARK);
    $textTokens = $tokenizer->tokenize($text);
    $nameTokens = $tokenizer->addContextMark($tokenizer->tokenize($userName), TOKEN_USER_NAME_MARK);
    $emailTokens = $tokenizer->addContextMark($tokenizer->tokenize($userEmail), TOKEN_USER_EMAIL_MARK);
    $urlTokens = $tokenizer->addContextMark($tokenizer->tokenize($userUrl), TOKEN_USER_URL_MARK);

    $allTokens = array_merge($topicTokens, $textTokens, $nameTokens, $emailTokens, $urlTokens);

    // totals of the blog, as they were before untraining
    $filterInfos = new BayesianFilterInfos();
    $filterInfo = $filterInfos->getBlogBayesianFilterInfo($blogId);
    $spamTotal = $filterInfo->getTotalSpam();
    $nonSpamTotal = $filterInfo->getTotalNonSpam();

    $tokenStore = new BayesianTokens();

    if (!$spam) {
        // default case: the contents had been used as non-spam
        $tokenStore->decNonSpamOccurrencesFromTokensArray($blogId, $allTokens, $spamTotal, $nonSpamTotal);
        $filterInfos->decTotalNonSpam($filterInfo->getId());

        return true;
    }

    $tokenStore->decSpamOccurrencesFromTokensArray($blogId, $allTokens, $spamTotal, $nonSpamTotal);
    $filterInfos->decTotalSpam($filterInfo->getId());

    return true;
}