/**
 * Split a run-together string into known words, working from its END
 * towards its beginning.
 *
 * Each pass fixes an end position and tests every substring that ends there,
 * growing it one character to the left per step (lengths 1..79). Whenever
 * IsAKnownWord() accepts a candidate it overwrites the current result slot,
 * so the longest accepted candidate of the pass is the one that is kept.
 * After a pass that produced a new word, the end position is moved to just
 * before all words found so far and the next slot is used.
 *
 * The search stops when the cursor reaches index 0 or below, when 5 words
 * have been collected, or when the miss counter forces an exit.
 *
 * @param string $strtochk    String to take apart.
 * @param int    $strtochkLEN Length used as the right-hand edge of the search
 *                            (presumably strlen($strtochk) -- verify against caller).
 * @param mixed  $dbh         Handle passed through unchanged to IsAKnownWord().
 *
 * @return array Lower-cased words in the order found (right-most word first),
 *               indexed from 0; empty when nothing was recognised.
 */
function DisentangleStrOfWords_Backward($strtochk, $strtochkLEN, $dbh)
{
    // Executed for its side effects / settings; none of its variables are read below.
    include "ObsceneClean.settings.php";

    DebugToLog('DEBUG: Begin DisentangleStrOfWords_Backward strtochk=' . $strtochk);

    $maxWords   = 5;                  // stop once this many words are collected
    $foundWords = array();
    $slot       = 0;                  // index in $foundWords being filled by the current pass
    $bestCount  = 0;                  // word count after the last successful pass
    $misses     = 0;                  // passes that produced no new word
    $wordCount  = 0;
    $cursor     = $strtochkLEN - 1;   // start index of the candidate substring

    while ($cursor > 0 && $wordCount < $maxWords) {
        // Grow the candidate leftwards: start index falls while length rises,
        // so every candidate ends at the same character.
        for ($span = 1; $cursor >= 0 && $span < 80; $span++, $cursor--) {
            $candidate = substr($strtochk, $cursor, $span);
            if (IsAKnownWord($candidate, $dbh)) {
                $foundWords[$slot] = strtolower($candidate);
            }
        }
        DebugToLog('DUMP: inner while iteration done-------------------------------------------------');

        $wordCount     = count($foundWords);
        $consumedChars = array_sum(array_map('strlen', $foundWords));

        if ($wordCount > $bestCount) {
            // A new word was captured: continue just left of everything found so far.
            $bestCount = $wordCount;
            $cursor    = $strtochkLEN - $consumedChars - 1;
            $slot++;
        } elseif ($misses > 10) {
            // Too many fruitless passes: push the count past the limit to end the outer loop.
            $wordCount = $maxWords + 1;
        } else {
            // NOTE(review): the cursor is not rewound after the scan above, so this
            // decrement starts from wherever the scan stopped -- confirm this is intended.
            $misses++;
            $cursor--;
        }

        DebugToLog('DUMP: $flag=' . $misses . ' $newcount=' . $wordCount . ' $pos=' . $cursor);
        DebugToLog('DEBUG: Result so far in DisentangleStrOfWords backward $word_array=' . print_r($foundWords, true));
    }

    return $foundWords;
}
function ObsceneCleanDetectOWs2($checkstr) { include_once "ObsceneCleanLib.php"; include "ObsceneClean.settings.php"; /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Housekeeping /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ global $LogFile; $LogFile = GetLogFileName(); CreateLogFile(); $currentDateTime = date("Y-m-d-H-i-s-") . substr((string) microtime(), 2, 8); DebugToLog(' '); DebugToLog(' '); DebugToLog('=================================== BEGIN ObsceneClean Run using INPUT STRING ON ' . $currentDateTime . ' ============================'); DelLogs(); if ($LogToScreen) { DebugToLog('DEBUG: $LogToScreen is TRUE therefore all debug output will go to screen instead of log file.'); } error_reporting(E_ALL); ini_set('display_errors', true); // set for debug or higher only $real_usage = TRUE; DebugToLog('DEBUG: BEGIN MEMORY=' . memory_get_usage()); DebugToLog('DEBUG: MEMORY PEAK=' . memory_get_peak_usage()); global $original_state_checkstr; // we need this to look for foreign character sets $original_state_checkstr = $checkstr; $checkstr = html_entity_decode($checkstr, ENT_QUOTES, 'UTF-8'); // Yes, they will try to trick the filter with HTML entities. (UTF-8 meeds to be there despite what doc says). $AllDefinedVars = get_defined_vars(); DebugToLog('DUMP: Unformatted All Defined Vars=' . print_r($AllDefinedVars, true)); DebugToLog('DEBUG: ---------------------------------------------------------All Startup Defined Variables----------------------------------------------------------------'); reset($AllDefinedVars); foreach ($AllDefinedVars as $key => $value) { $arrtmp1 = $key; if (is_bool(${$arrtmp1})) { if (${$arrtmp1}) { DebugToLog('DEBUG: $' . $key . '=TRUE'); } else { DebugToLog('DEBUG: $' . $key . 
'=FALSE'); } } elseif (is_array($value)) { DebugToLog('DEBUG: $' . $key . '=' . print_r(${$arrtmp1}, true)); } else { DebugToLog('DEBUG: $' . $key . '=' . $value); } } if ($DictionaryToUse < 2) { $pspell_link = pspell_new("en"); } //0. Get the dictionary $encode = mb_detect_encoding($checkstr, "auto"); DebugToLog('DEBUG: Encoding from mb_detect_encoding=' . $encode); $OverallProbability = $InitialOverallProbability; // DebugToLog('DEBUG: checkstr=' . print_r($checkstr, true)); $checkstr = stripslashes($checkstr); DebugToLog('DEBUG: with CR LF preserved checkstr='); DebugToLog('DEBUG: INPUT STRING=' . $checkstr); DebugToLog('DEBUG: $ObsceneCleanReturnValue=' . $ObsceneCleanReturnValue); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Error checking is checkstr == 2 or less chars? Y=return error, lowest sev must be 10 or less! /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ if ($BypassCode != NULL) { $BypassUsed = strpos($checkstr, $BypassCode); if ($BypassUsed === false) { DebugToLog('DEBUG: Bypass code was not specified'); } else { DebugToLog('DEBUG: Correct bypass code was specified. Skipping all offensive language checking.'); if ($ObsceneCleanReturnValue == 0) { return TRUE; } elseif ($ObsceneCleanReturnValue == 1) { return 0; } elseif ($ObsceneCleanReturnValue == 2) { return $checkstr; } elseif ($ObsceneCleanReturnValue >= 3) { $OWArray = array(); return $OWArray; } } } $checkstrSize = strlen($checkstr); if ($checkstrSize < 14) { $Trimmedcheckstr = trim($checkstr); $Trimmedcheckstr = trim($Trimmedcheckstr, '!&*.,?@-_'); DebugToLog('DEBUG: $Trimmedcheckstr =' . 
$Trimmedcheckstr); $TrimmedcheckstrSize = strlen($Trimmedcheckstr); unset($Trimmedcheckstr); } else { $TrimmedcheckstrSize = $checkstrSize; } DebugToLog('DEBUG: Input string size=' . $checkstrSize); DebugToLog('DEBUG: $TrimmedcheckstrSize=' . $TrimmedcheckstrSize); DebugToLog('DEBUG: AFTER STRIPSLASHES Input string size=' . $checkstrSize); if ($UseMaxInputStringSize and $checkstrSize > $MaxInputStringSize) { DebugToLog('ERROR: Input string is larger than allowed Maximum. Max=' . $MaxInputStringSize); exit; } if ($checkstrSize > $MaxInputToChk) { $checkstr = substr($checkstr, 0, $MaxInputToChk); } if ($ExitOnHighOverallProb and $ObsceneCleanReturnValue > 1) { DebugToLog('ERROR: Setting $ObsceneCleanReturnValue to a value greater than 1 and $ExitOnHighOverallProb to TRUE are incompatible and may not replace all OWs.'); } if (!$usethindisguise and $usepoordisguise) { $usethindisguise = TRUE; } if (strlen($checkstr) <= $MinCheckstrSize) { DebugToLog('ERROR: String to small to be checked'); exit; } if ($TooManyUniqueOWs < 2) { DebugToLog('ERROR: The setting $TooManyUniqueOWs must be greater than 1.'); } /* --------sqlite CONNECT------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ DebugToLog('DEBUG: sqlite connect begins $dbname=' . $dbname); $DBfile = $dbname; if (isset($DataDir) and !empty($DataDir)) { $db = $DataDir . '/' . $DBfile; echo '<br> hello $DataDir=' . $DataDir; } else { $app_root = dirname(__FILE__); $db = $app_root . '/' . 'dat' . '/' . $DBfile; } DebugToLog('DEBUG: FULL DB SPEC $db=' . $db); // open database file global $dbh; $dbh = new PDO('sqlite:' . $db); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // do diacritical folding e.g. accent removal -- use safeapplied here? 
/* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ if ($folddiacritics) { $checkstr = FoldDiacritics($checkstr); DebugToLog('DEBUG: after folddiacritics $checkstr=' . $checkstr); } /* ----------------------------------------------------------------------------------------------------- */ // $checkstr always copied to $SafeAppliedStr $SafeAppliedStr = $checkstr; /* ---------- Replace BBcode ----------------------------------------------------------- */ if ($ReplaceBBcode) { if (ReplaceBBCode($SafeAppliedStr)) { DebugToLog('SUBRULE MATCH: ReplaceBBCode BBcode was found.'); } else { DebugToLog('DEBUG: No BBcode found'); } } /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Blank out completely safe strings in the string to be tested /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $MasterSafeList = GetMasterSafeList($dbh); $TheFileToOpen = 'LocalSafeList.dat'; if ($DataDir == "") { $app_root = dirname(__FILE__); $FTfile = $app_root . '/' . 'dat' . '/' . $TheFileToOpen; } else { $FTfile = $DataDir . '/' . $TheFileToOpen; } if (filesize($FTfile) > 0) { $LocalSafeList = GetLocalSafeList(); $MasterSafeList = array_merge($MasterSafeList, $LocalSafeList); } $SafeCnt = count($MasterSafeList); // for ($x=0; $x < $SafeCnt; $x++){DebugToLog('DEBUG1: Safe list entry=' . $MasterSafeList[$x]); } $MasterSafeList = array_unique($MasterSafeList); if ($MasterSafeList == NULL || !is_array($MasterSafeList)) { DebugToLog('ERROR $MasterSafeList is NULL or IS not an array. 
Is table stil in DB?'); } else { // DebugToLog('DEBUG: List safe list entries to be applied.'); $SafeCnt = count($MasterSafeList); // for ($x=0; $x < $SafeCnt; $x++){DebugToLog('DEBUG: Safe list entry=' . $MasterSafeList[$x]); } $ReplacementsCount = 0; $SafeAppliedStr = preg_replace_callback($MasterSafeList, 'ReplaceWithStrOfEqualSize', $SafeAppliedStr, -1, $ReplacementsCount); if ($ReplacementsCount > 0) { DebugToLog('SUBRULE MATCH: Some text in input matched safelist and was replaced. $ReplacementsCount=' . $ReplacementsCount); } DebugToLog('DEBUG: Safe list applied $SafeAppliedStr=' . $SafeAppliedStr); } DebugToLog('DEBUG: Length of $checkstr=' . strlen($checkstr)); DebugToLog('DEBUG: Length of $SafeAppliedStr=' . strlen($SafeAppliedStr)); DebugToLog('DEBUG: Length of $original_state_checkstr=' . strlen($original_state_checkstr)); DebugToLog('DEBUG: Beginning memory=' . memory_get_usage()); DebugToLog('DEBUG: Memory peak=' . memory_get_peak_usage()); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Put list of decrypted OWs & parms into array, 1 per element /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $OWArray = GetOWsAndParms($LowestSevConsidered, $dbh); if ($ValidateOWs) { ValidateOWS($OWArray); } if ($OWArray == NULL || !is_array($OWArray)) { DebugToLog('ERROR $OWArray is NULL or not an array '); die; } DebugToLog('DEBUG: Total OWs loaded=' . count($OWArray)); DebugToLog('DUMP: OWs retrieved. OWs to be used in array=' . 
print_r($OWArray, true)); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Get 'other' word lists /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $QtyWordsArray = GetQtyWords($dbh); global $InsultingWordsArray; $InsultingWordsArray = GetInsultingWords($dbh); $AntagonisticWordsArray = GetAntagonisticWords($dbh); $OWLookupArray = GetOWCategoryLookup($dbh); if ($UseProximityRules) { $ProximityRules = GetProximityRules($dbh); DebugToLog('DUMP: only follows ProximityRules=' . print_r($ProximityRules, true)); DebugToLog('DUMP: Proximity rules loaded follow:'); $ProximityRulesCount = count($ProximityRules); for ($x = 0; $x < $ProximityRulesCount; $x++) { DebugToLog('DUMP: Proximity Rule loaded: OW=' . $ProximityRules[$x]['OW'] . ' rule=' . $ProximityRules[$x]['rule'] . ' Nearword=' . $ProximityRules[$x]['nearword'] . ' Proximity=' . $ProximityRules[$x]['proximity'] . ' Direction=' . $ProximityRules[$x]['direction'] . ' Weight=' . $ProximityRules[$x]['weight']); } } global $ChineseSurnames; $ChineseSurnames = GetChineseSurnames($dbh); global $Turkish1stNames; $Turkish1stNames = GetTurkish1stNames($dbh); // Put all insulting & Quantitative terms in array with their position for later analysis /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $InsultsAndPos = GetPos4InsultsInArray($checkstr, $InsultingWordsArray); if (is_array($InsultsAndPos)) { DebugToLog('DUMP: $InsultsAndPos=' . 
print_r($InsultsAndPos, true)); } else { DebugToLog('DEBUG: $InsultsAndPos is NOT an array because there were NO insults found in input.'); } $QtyAndPos = GetPos4WordsInArray($checkstr, $QtyWordsArray, false); global $TotalInsultsWeight; global $InitialProb; $TotalInsultsWeight = 0; if (is_array($InsultsAndPos)) { $TotalInsultingCount += CountAllInsults($InsultsAndPos, $InsultingWordsArray); DebugToLog('DUMP: $TotalInsultingCount=' . print_r($TotalInsultingCount, true)); $TotalInsultsWeight = WeighAllInsults($InsultsAndPos, $InsultingWordsArray); DebugToLog('DUMP: $TotalInsultsWeight=' . print_r($TotalInsultsWeight, true)); $UniqueInsults = ReturnUniqueInsults($InsultsAndPos); DebugToLog('DUMP: Total Insults loaded=' . count($InsultsAndPos)); } else { DebugToLog('DEBUG: $TotalInsultsWeight is zero.'); } /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Evaluate Antagonism We got rid of the antagonistic categories. /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $AntagonismProbability = 0; if ($EvaluateAntagonism) { $TotalAntagonisticWords = AssessAntagonism($checkstr, $AntagonisticWordsArray); if ($TotalAntagonisticWords > 1 and $TotalAntagonisticWords < $AntagonismLow) { DebugToLog('RULE MATCH: Antagonistic language is present but low.'); $Antagonism = FALSE; } if ($TotalAntagonisticWords >= $AntagonismLow and $TotalAntagonisticWords < $AntagonismHigh) { $AntagonismProbability += 20; $Antagonism = TRUE; DebugToLog('RULE MATCH: Antagonistic language is moderate Overall Probability=' . 
$AntagonismProbability); } if ($TotalAntagonisticWords >= $AntagonismHigh and $TotalAntagonisticWords < $AntagonismVeryHigh) { $AntagonismProbability += 40; $Antagonism = TRUE; DebugToLog('RULE MATCH: Antagonistic language is high Overall Probability=' . $AntagonismProbability); } if ($TotalAntagonisticWords >= $AntagonismVeryHigh) { $AntagonismProbability += 60; DebugToLog('RULE MATCH: Antagonistic language is very high Overall Probability=' . $AntagonismProbability); if (is_array($InsultsAndPos)) { if ($TotalInsultsWeight > $InsultingWeightThreshold or $UniqueInsults >= $InsultingUniqueThreshold) { DebugToLog('RULE MATCH: Antagonistic and insulting language are very high or too many unique insults. Unique Insults=' . $UniqueInsults . ' Insults weight=' . $TotalInsultsWeight); if ($ExitOnVeryHighAntagonismAndInsults and $ObsceneCleanReturnValue < 2) { if ($ReportAntagonism) { DebugToLog('Total Antagonistic words=' . $TotalAntagonisticWords); } if ($ReportInsults) { ReportInsults($InsultsAndPos); } DebugToLog('RULE MATCH: Exiting on very high antagonism and insulting language if $ObsceneCleanReturnValue is 1 or 0. $AntagonismProbability=' . $AntagonismProbability . 
'% '); if ($ObsceneCleanReturnValue == 1) { return $AntagonismProbability; } if ($ObsceneCleanReturnValue == 0) { return TRUE; } } } } } } else { DebugToLog('Antagonism not evaluated'); } /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Use disguise chars., lookalikes to form a template REGEX for finding OWs /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $regexPattern = FormREGEX_Template($usethindisguise, $usepoordisguise, $PoorDisguisingCharsOnly, $GoodDisguisingCharsOnly, $MaxGoodDisguisingChars, $MaxPoorDisguisingChars, $DisguiseCharAlternationFactor, $MaxOWLetterRepeat); $regexPattern_for_1st_letter = FormREGEX_Template_for_1st_letter($usethindisguise, $usepoordisguise, $PoorDisguisingCharsOnly, $GoodDisguisingCharsOnly, $MaxGoodDisguisingChars, $MaxPoorDisguisingChars, $DisguiseCharAlternationFactor, $MaxOWLetterRepeat); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Now lets programatically sew letters of OW with regex template to form a new array of regexs that can find all OWs including their pluralization /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $REGEXs_array = MakeArrayOfREGEXsToFindOWs($OWArray, $regexPattern, $uselookalikes, $VowelSubstitutionChars, $SubstitutableChars, $SubstitutionChars, $PoorDisguisingCharsOnly, $GoodDisguisingCharsOnly, $VowelSubstitutionRule, $VowelSubstitutionLen, $UseGreedyREGEXs, $MaxGoodDisguisingChars, $regexPattern_for_1st_letter); /* 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Following arrays are used later /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ DebugToLog('DUMP: before dash replace $checkstr=' . $checkstr); global $AllWordsInAnArray; $AllWordsInAnArray = str_word_count(str_replace('-', ' ', $checkstr), 2, $WordChars); // Hyphens and dashes are often over-used in Web talk and moreso with offensive language so let's just eliminate dashes. 'gobble-de-gook' or 'dumb-gook' was seen as 1 word by str_word_count and proximity rule and owrecog did not work. DebugToLog('DUMP: after dash replace $checkstr=' . $checkstr); $AllWordsInAnArray = RemovePossesiveS($AllWordsInAnArray); DebugToLog('DEBUG: After possessive strip $AllWordsInAnArray=' . print_r($AllWordsInAnArray, true)); $AllWordsInAnArray = RemoveSingleQuotes($AllWordsInAnArray); DebugToLog('DEBUG: After single quote strip $AllWordsInAnArray=' . print_r($AllWordsInAnArray, true)); // stupid str_word_count will ignore single quotes (but not double quotes) and put them into the array. $AllWordsInAnArray_cnt = count($AllWordsInAnArray); DebugToLog('DEBUG: $AllWordsInAnArray_cnt=' . 
$AllWordsInAnArray_cnt); // this is really the total number of words input to OB if ($AllWordsInAnArray_cnt <= 0) { if ($ObsceneCleanReturnValue == 0) { return TRUE; } elseif ($ObsceneCleanReturnValue == 1) { return 0; } elseif ($ObsceneCleanReturnValue == 2) { return $checkstr; } elseif ($ObsceneCleanReturnValue == 3) { $OWArray = array(); return $OWArray; } else { DebugToLog('ERROR: Check $ObsceneCleanReturnValue in settings file!'); } } global $WordsAndTheirBounds; $WordsAndTheirBounds = array(); $WordsAndTheirBounds = FindAllWordBoundaries($checkstr, $AllWordsInAnArray); DebugToLog('DUMP: $WordsAndTheirBounds=' . print_r($WordsAndTheirBounds, true)); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $MultiWordAllowableRuns = $AllWordsInAnArray_cnt + 1; // The func MultiWordCheck will tend to run more times than there are words and thus may time out so we must limit it. Try "BitchBitchBitchBitchBitchBitchBitchBitchBitchBitch" The "+ 1" add a slightly more robust search for a jumble of cuss words DebugToLog('DEBUG: $MultiWordAllowableRuns=' . 
$MultiWordAllowableRuns); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Now use array of REGEXs to find OWs - After each REGEX check if any nonHomonym OWs /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ $MatchesEvaluatedSoFar = 0; $TotalMatchesSoFar = 0; $TempArray = array(); // global $InputStrMostlyWords = FALSE; $WhatRegexFoundTakesUpMostOfString = FALSE; global $matches; $matches = array(); $iOffset = 0; $HighestSev = 0; $UniqueOWsCnt = 0; global $i2; // Index of the table of matches: $matches $i2 = 0; // increment for each OW matched by REGEX $array_elements = count($REGEXs_array); for ($i = 0; $i < $array_elements; $i++) { DebugToLog('DEBUG: Begin test of regex for ' . $OWArray[$i]['OW'] . ' ----------------------------------------------------------------------------------------------------------'); DebugToLog('DUMP: MEMORY IS=' . memory_get_usage()); DebugToLog('DUMP: PEAK MEMORY IS=' . memory_get_peak_usage()); DebugToLog('DEBUG: i$=' . $i . ' Regex to be used=' . $REGEXs_array[$i]); $TempArray = NULL; $regrtncode = preg_match_all($REGEXs_array[$i], $SafeAppliedStr, $TempArray, PREG_OFFSET_CAPTURE, $iOffset); // preg_match might return more than 1 instance of an OW // pcre_error_decode(); DebugToLog('DUMP: preg_match_all $regrtncode=' . $regrtncode); if ($regrtncode > 0) { DebugToLog('DEBUG: Regex match occurred OW=' . $OWArray[$i]['OW']); DebugToLog('DEBUG: preg_match_all returned: $TempArray[0]=' . print_r($TempArray[0], true)); $NumOfInstancesAdded = 0; // A count of how many instances of this specific OW were added to matches array. 
$NumOfMatches = count($TempArray[0]); // $TempArray simply holds result of preg_match_all while $matches holds result of all OWs found so far for ($i3 = 0; $i3 < $NumOfMatches; $i3++) { $TheMatch = $TempArray[0][$i3][0]; $PositionOfMatch = $TempArray[0][$i3][1]; $CRLFpos = strpos($TheMatch, $CRLFvalue); if ($CRLFpos === FALSE) { DebugToLog('DEBUG: CR & LF not found in match. Match=' . $TheMatch); } else { $WithoutCRLF = substr($TheMatch, $CRLFpos + 2); DebugToLog('DEBUG: Match after CR LF=' . $WithoutCRLF); $regrtncode = preg_match_all($REGEXs_array[$i], $WithoutCRLF, $TempArray2, PREG_OFFSET_CAPTURE, $iOffset); if ($regrtncode > 0) { $TheMatch = $TempArray2[0][$i3][0]; $PositionOfMatch = $PositionOfMatch + $CRLFpos + 2; DebugToLog('RULE MATCH: CR LF removed from original match and remainder still matches=' . $WithoutCRLF); } } if (!AlreadyDetected($matches, $TheMatch, $PositionOfMatch)) { DebugToLog('RULE MATCH: Regex Match=' . $TheMatch . ', Position=' . $PositionOfMatch . ', OW searched for=' . $OWArray[$i]['OW'] . ' Pluralization=' . $OWArray[$i]['pluralsuffix'] . ', Matched Instance=' . $i3 . ' (First instance is zero.)'); if ($NumOfInstancesAdded > $TooManyMatchesOf1OW and $OWArray[$i]['homonym'] == 0 and $OWArray[$i]['sev'] >= $SeverityOfMoreRecognizableOWs) { $matches[$i2]['OW'] = $TheMatch; // Why store at this point? Because at this point we have assumed probability Is greater than zero. $matches[$i2]['pos'] = $PositionOfMatch; // Store start postion of match. $matches[$i2]['OWidx'] = $i; // Store the index (in $OWArray) of the OW that was matched, to refer back to it later $matches[$i2]['sev'] = $OWArray[$i]['sev']; $matches[$i2]['recog'] = $matches[$i2 - 1]['recog']; // same as previous one, if inaccurate we don't care $matches[$i2]['prob'] = 100; $NumOfInstancesAdded++; $i2++; // increment $i2 only when REGEX matches & puts match in $matches DebugToLog('DEBUG: No dissection needed.'); } else { DebugToLog('DEBUG: Begin Dissection of match. 
---------------------------------------------------------------------------------'); $TheMatchUnfiltered = substr($checkstr, $PositionOfMatch, strlen($TheMatch)); DebugToLog('DEBUG: Match without any safe lists, diacritical folding, etc. applied. $TheMatchUnfiltered=' . $TheMatchUnfiltered); $WordBoundBeforeMatch = GetWordBoundBeforeMatch($WordsAndTheirBounds, $PositionOfMatch); $WordBoundAfterMatch = GetWordBoundAfterMatch($WordsAndTheirBounds, $PositionOfMatch, $TheMatch); $PartBefrStr = substr($checkstr, $WordBoundBeforeMatch + 1, $PositionOfMatch - ($WordBoundBeforeMatch + 1)); DebugToLog('DEBUG: $PartBefrStr=' . $PartBefrStr); $PartAftrStr = substr($checkstr, $PositionOfMatch + strlen($TheMatch), $WordBoundAfterMatch - ($PositionOfMatch + strlen($TheMatch))); DebugToLog('DEBUG: $PartAftrStr=' . $PartAftrStr); // Bounding: 0 Fully UNbounded 1 Fully bounded 2 Left bounded only 3 Right bounded only $bounding = IsWordBounded($checkstr, $TheMatch, $PositionOfMatch, $AllWordsInAnArray, $PartBefrStr, $PartAftrStr); DebugToLog('DEBUG: Final $bounding=' . $bounding . ' Translation: ' . $BoundingCodes[$bounding]); $MatchWithNearestBounds = substr($checkstr, $WordBoundBeforeMatch + 1, $WordBoundAfterMatch - $WordBoundBeforeMatch - 1); // never assume $MatchWithNearestBounds is same as match! If match bounded you'd think this must be same as match BUT NO. Fred with quotes is considered bounded "fred". "Fred" and fred are not the same. // $BoundedAndQuoted = BoundedAndQuoted($checkstr, $TheMatch, $PositionOfMatch, $bounding, $MatchWithNearestBounds); // if ($BoundedAndQuoted) { $bounding = 1; } //This line here only because str_word_count ingnores single quotes when it creates $AllWordsInAnArray. str_word_count will actually put 'f**k' into the array as a word with the quotes! Then IsWordBounded func gets the $AllWordsInAnArray with quotes and can't tell whether and says it not bounded when in fact a word with single quotes is essentially bounded. 
We correct that with this line of code. DebugToLog('DEBUG: Match with its nearest bounds. $MatchWithNearestBounds=' . $MatchWithNearestBounds); DebugToLog('DEBUG: Postion of $WordBoundBeforeMatch =' . $WordBoundBeforeMatch); DebugToLog('DEBUG: Postion of $WordBoundAfterMatch =' . $WordBoundAfterMatch); if (IsAKnownOW($MatchWithNearestBounds, $OWArray)) { $PartAftrIsWord = FALSE; $PartBefrIsWord = FALSE; } else { if (strlen($PartAftrStr) > 0 and IsAKnownWord($PartAftrStr, $dbh)) { $PartAftrIsWord = TRUE; } else { $PartAftrIsWord = FALSE; } if (strlen($PartBefrStr) > 0 and IsAKnownWord($PartBefrStr, $dbh)) { $PartBefrIsWord = TRUE; } else { $PartBefrIsWord = FALSE; } } if ($PartBefrIsWord) { DebugToLog('DEBUG: $PartBefrIsWord - Part before match is a word.'); } else { DebugToLog('DEBUG: $PartBefrIsWord - Part before match is NOT a word.'); } if ($PartAftrIsWord) { DebugToLog('DEBUG: $PartAftrIsWord - Part after match is a word.'); } else { DebugToLog('DEBUG: $PartAftrIsWord - Part after match is NOT a word.'); } $PartAftrStrLen = strlen($PartAftrStr); $PartBefrStrLen = strlen($PartBefrStr); $ExactMatch = ExactMatchOfOW($TheMatch, $OWArray, $i); $AccidentalOWResult = AccidentalOW($checkstr, $TheMatch, $PositionOfMatch, $OWArray[$i]['sev'], $OWArray[$i]['OW'], $WordBoundBeforeMatch, $WordBoundAfterMatch, $bounding, $PartAftrIsWord, $PartBefrIsWord, $OWArray, $dbh); if ($AccidentalOWResult[0] > 0 and $AccidentalOWResult[0] == $AccidentalOWResult[1]) { $AccidentalOW = TRUE; } else { $AccidentalOW = FALSE; } // $AccidentalOW = FALSE; // for testing robustness of OWrecognizable if ($AccidentalOW) { DebugToLog('DEBUG: $AccidentalOW=TRUE '); DebugToLog('DEBUG: Accidental OW. A series of valid words (or a single word) accidentally formed an OW or the string containing the OW and all characters within the nearest bounds is an OW that was not found in the master OW file. $WordBoundAfterMatch =' . 
$WordBoundAfterMatch); } else { DebugToLog('DEBUG: $AccidentalOW=FALSE '); } if (!$AccidentalOW) { $BBcodeTrick = BBcodeTrick($TheMatchUnfiltered, $TheMatch, $OWArray[$i]['OW']); // can't recall the difference between MatchIsUBBorHTMLtagged and BBcodeTrick funcs. They could be redundant. if ($BBcodeTrick) { DebugToLog('SUBRULE: BBcodeTrick TRUE'); } else { DebugToLog('DEBUG: BBcodeTrick FALSE'); } if (!$PartBefrIsWord and $MultiWordAllowableRuns > 0) { $PartBefrIsMultiWord = MultiWordCheck($PartBefrStr, $dbh, $InsultingWordsArray, $OWArray); } else { $PartBefrIsMultiWord = 0; } if (!$PartAftrIsWord and $MultiWordAllowableRuns > 0) { $PartAftrIsMultiWord = MultiWordCheck($PartAftrStr, $dbh, $InsultingWordsArray, $OWArray); } else { $PartAftrIsMultiWord = 0; } if ($PartAftrIsMultiWord == 0 and $PartBefrIsMultiWord == 0) { $LogTypeTmp = 'DEBUG:'; } else { $LogTypeTmp = 'SUBRULE MATCH:'; } // if zeros there was NO subrule match DebugToLog($LogTypeTmp . ' Multiword result $PartBefrIsMultiWord=' . $PartBefrIsMultiWord . ' $PartAftrIsMultiWord=' . $PartAftrIsMultiWord . ' Key: 0=False 1=nice words 2=some bad words 3=lotsa bad words'); $MultiWordAllowableRuns--; DebugToLog('DEBUG: After decrement $MultiWordAllowableRuns=' . $MultiWordAllowableRuns); if ($ExactMatch and $bounding == 0 and $OWArray[$i]['sev'] >= $SeverityOfMoreRecognizableOWs and strlen($MatchWithNearestBounds) > 7 and ($PartAftrStrLen > $MinCharsAfterMatchRecognizable or $PartBefrStrLen > $MinCharsAfterMatchRecognizable) and $PartAftrIsWord == FALSE and $PartBefrIsWord == FALSE) { $PartsCombined = $PartBefrStr . $PartAftrStr; if (IsAKnownWord($PartsCombined, $dbh)) { DebugToLog('DEBUG: OW encapsulated in other word. ' . $MatchWithNearestBounds . ' Other word: ' . 
$PartsCombined); $EncapsulatedOW = TRUE; } else { $EncapsulatedOW = FALSE; } } else { $EncapsulatedOW = FALSE; } $Recognizability = OWrecognizable($checkstr, $TheMatch, $PositionOfMatch, $i, $OWArray, $PartBefrStr, $PartAftrStr, $PartBefrIsWord, $PartAftrIsWord, $ExactMatch, $BBcodeTrick, $AccidentalOWResult, $CRLFpos, $PartAftrIsMultiWord, $PartBefrIsMultiWord, $EncapsulatedOW); if ($Recognizability < 1) { $Recognizability = 1; } if ($Recognizability > 10) { $Recognizability = 10; } DebugToLog('SUBRULE MATCH: Final $Recognizability=' . $Recognizability . ' for match=' . $TheMatch . ' - (If 10, match is easily recognized. If 1, highly obscured.)'); if ($OWArray[$i]['homonym'] == 1) { $homonym = TRUE; } else { $homonym = FALSE; } // 1= Homonym OW if ($homonym) { DebugToLog('SUBRULE MATCH: Match is a homonym OW.'); } else { DebugToLog('SUBRULE MATCH: Match is a non-homonym OW.'); } $DisguisingChars = $GoodDisguisingCharsOnly . $PoorDisguisingCharsOnly; DebugToLog('DEBUG: $DisguisingChars=' . $DisguisingChars); $ContainsDisguiseChars = 0; $ContainsDisguiseChars = ContainsDisguiseChars($TheMatch, $DisguisingChars, 3); // 3 ok? DebugToLog('DEBUG: End dissection. ------------------------------------------------------------------------------------------------ '); // if ($Recognizability > $IgnoreLowRecognizability) { $matches[$i2]['OW'] = $TheMatch; // Why store in "matches" array at this point? Because at this point we have assumed probability of greater than zero. $matches[$i2]['pos'] = $PositionOfMatch; // Store start postion of match. $matches[$i2]['OWidx'] = $i; // Store the index (in $OWArray) of the OW that was matched, to refer back to it later $matches[$i2]['sev'] = $OWArray[$i]['sev']; $matches[$i2]['recog'] = $Recognizability; $NumOfInstancesAdded++; if ($OWArray[$i]['sev'] > $HighestSev) { $HighestSev = $OWArray[$i]['sev']; } DebugToLog('DEBUG: Begin testing of match=' . $matches[$i2]['OW'] . ' in position=' . $matches[$i2]['pos'] . ' referenced OW=' . 
$OWArray[$i]['OW']); if ($homonym) { DebugToLog('DEBUG: Preset weight of OW=' . $OWArray[$i]['weight']); $InitialProb = $DefaultHomonymOWProbability + $OWArray[$i]['weight']; $matches[$i2]['prob'] = $InitialProb; } else { $matches[$i2]['prob'] = $DefaultNonHomonymOWProbability + $OWArray[$i]['weight']; } DebugToLog('RULE MATCH: Initial probability for match assigned plus OW weight, ' . $TheMatch . ', is ' . $matches[$i2]['prob'] . '% '); //-------------------------- BEGIN OW PROBABILITY DETERMINATION RULES -------------------------------- if ($Recognizability == 10 and !$homonym) { $matches[$i2]['prob'] = 100; // $matches[$i2][prob] is the probability factor. DebugToLog('RULE MATCH: (A) Exact or very close match of non-Homonym OW. Tested match=' . $matches[$i2]['OW'] . ' Probability=' . $matches[$i2]['prob']); } elseif ($ExactMatch and $bounding == 1 and !$homonym) { $matches[$i2]['prob'] = 100; $OverallProbability += 50; DebugToLog('RULE MATCH: Exact, bounded and nonHomonym match. TESTED=' . $matches[$i2]['OW'] . ' Probability=' . $matches[$i2]['prob'] . ' $OverallProbability=' . $OverallProbability); } elseif ($PartBefrIsWord and $PartAftrIsWord and ($PartAftrStrLen > $MinCharsAfterMatchRecognizable and $PartBefrStrLen > $MinCharsAfterMatchRecognizable)) { $OverallProbability += 5; $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: OW in unbounded combination of words. Match with nearest bounds=' . $MatchWithNearestBounds . ' Match=' . $TheMatch . ' Position=' . $PositionOfMatch . ' Probability=' . $matches[$i2]['prob'] . ' $OverallProbability=' . $OverallProbability); if (!$homonym) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: Not a homonym OW. Probability=' . $matches[$i2]['prob']); } if ($ExactMatch) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: Exact Match. Probability=' . $matches[$i2]['prob']); } if ($ContainsDisguiseChars) { $matches[$i2]['prob'] += 5; DebugToLog('RULE MATCH: Contains disguise chars. Probability=' . 
$matches[$i2]['prob']); } if (IsInsult($PartBefrStr, $InsultingWordsArray)) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: Preceding word an insult . Probability=' . $matches[$i2]['prob']); } //dumbfuck if (IsQuantifier($PartBefrStr, $QtyWordsArray)) { $matches[$i2]['prob'] += 5; DebugToLog('RULE MATCH: Preceding quantifying word. Probability=' . $matches[$i2]['prob']); } // bigfuck } elseif ($EncapsulatedOW) { DebugToLog('SUBRULE MATCH: OW encapsulated in other word. ' . $MatchWithNearestBounds . ' Other word: ' . $PartsCombined); $matches[$i2]['prob'] += 10; } elseif ($PartAftrStrLen == 0 and $PartBefrIsWord or $PartBefrStrLen == 0 and $PartAftrIsWord) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: COMBO word. OW in partially bounded combination of words. Match with nearest bounds=' . $MatchWithNearestBounds . ' Match=' . $TheMatch . ' Position=' . $PositionOfMatch . ' Probability increased to ' . $matches[$i2]['prob'] . '%'); if (!$homonym) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: COMBO word and OW not Homonym. Probability increased to ' . $matches[$i2]['prob'] . '%'); } if ($ExactMatch) { $matches[$i2]['prob'] += 5; DebugToLog('RULE MATCH: COMBO word and OW is exact match without considering bounding. Probability increased to ' . $matches[$i2]['prob'] . '%'); } if ($ContainsDisguiseChars) { $matches[$i2]['prob'] += 5; DebugToLog('RULE MATCH: COMBO word. OW contains disguise words. Probability increased to ' . $matches[$i2]['prob'] . '%'); } if ($bounding == 1) { $matches[$i2]['prob'] += 5; DebugToLog('RULE MATCH: COMBO word. OW is fully bounded. Probability increased to ' . $matches[$i2]['prob'] . '%'); } if ($PartBefrIsWord and IsInsult($PartBefrStr, $InsultingWordsArray)) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: COMBO word. Part before OW is insult. Probability increased to ' . $matches[$i2]['prob'] . 
'%'); } if ($PartBefrIsWord and IsQuantifier($PartBefrStr, $QtyWordsArray)) { $matches[$i2]['prob'] += 5; DebugToLog('RULE MATCH: COMBO word. Part before OW is quantifier. Probability increased to ' . $matches[$i2]['prob'] . '%'); } } elseif (!$homonym and $ContainsDisguiseChars and NonLetterBounds($checkstr, $TheMatch, $PositionOfMatch) and !RidiculouslyLong($TheMatch, $OWArray, $i, $BoundedOWLenLimit)) { $matches[$i2]['prob'] = 90; DebugToLog('RULE MATCH:</span> Regex match has word boundaries, is not ridicously long, is NonHomophone and is disguised. TESTED=' . $matches[$i2]['OW'] . ' Probability increased to ' . $matches[$i2]['prob']); } elseif ($BBcodeTrick and $Recognizability >= $HighRecognizability) { $matches[$i2]['prob'] += 10; DebugToLog('RULE MATCH: BBcode within OW. TESTED=' . $matches[$i2]['OW'] . ' Unfiltered Match=' . $TheMatchUnfiltered . ' Probability increased to ' . $matches[$i2]['prob'] . '%'); if ($bounding == 1) { $matches[$i2]['prob'] += 10; DebugToLog('SUBRULE MATCH: UBBTrick and OW is fully bounded. Probability increased to ' . $matches[$i2]['prob'] . '%'); } } elseif ($PartAftrIsMultiWord == 1 and $PartBefrStrLen == 0 or $PartBefrIsMultiWord == 1 and $PartAftrStrLen == 0) { // Key: 0=False 1=nice words 2=some bad words 3=lotsa bad words if (strlen($MatchWithNearestBounds) < 15) { $matches[$i2]['prob'] += 40; } else { $matches[$i2]['prob'] += 30; } DebugToLog('RULE MATCH: OW preceded or followed by Multiwords that do not contain insults or OWs. TESTED=' . $matches[$i2]['OW'] . ' Unfiltered Match=' . $TheMatchUnfiltered . ' Probability increased to ' . $matches[$i2]['prob'] . '%'); } elseif ($PartAftrIsMultiWord == 2 or $PartBefrIsMultiWord == 2) { $matches[$i2]['prob'] += 50; DebugToLog('RULE MATCH: OW within Multiwords that contains insults or other OWs. TESTED=' . $matches[$i2]['OW'] . ' Unfiltered Match=' . $TheMatchUnfiltered . ' Probability increased to ' . $matches[$i2]['prob'] . 
'%'); } elseif (($PartAftrIsMultiWord == 3 or $PartBefrIsMultiWord == 3) and ($PartAftrIsMultiWord >= 2 or $PartBefrIsMultiWord >= 2)) { $matches[$i2]['prob'] += 50; DebugToLog('RULE MATCH: OW within Multiwords that contains other OWs and possibly insults. TESTED=' . $matches[$i2]['OW'] . ' Unfiltered Match=' . $TheMatchUnfiltered . ' Probability increased to ' . $matches[$i2]['prob'] . '%'); } elseif ($TrimmedcheckstrSize == strlen(trim($TheMatch))) { $matches[$i2]['prob'] = 100; DebugToLog('RULE MATCH: Input string (minus some extraneous characters) IS the match. That is rather unambiguous. TESTED=' . $matches[$i2]['OW'] . ' Probability increased to ' . $matches[$i2]['prob'] . '%'); } else { DebugToLog('DEBUG: No easy and evident match. Begin extended testing using multiple factors. Probability=' . $matches[$i2]['prob']); $QuantifiersCount = 0; $InsultsAndPosCount = 0; if (is_array($QtyAndPos)) { $QuantifiersCount = CountWordsInArrayNearMatch($PositionOfMatch, strlen($TheMatch), $QtyAndPos, $QtyRuleWordsLenChk, $QtyRuleDirection); DebugToLog('DEBUG: $QuantifiersCount=' . $QuantifiersCount); } if (is_array($InsultsAndPos)) { $InsultsAndPosCount = CountWordsInArrayNearMatch($PositionOfMatch, strlen($TheMatch), $InsultsAndPos, $InsultingRuleWordsLenChk, $InsultingRuleDirection); DebugToLog('DEBUG: $InsultsAndPosCount=' . $InsultsAndPosCount); } // Have information about OW and its context now evaluate it for OW probability if ($QuantifiersCount >= $QtyRuleThreshold) { $matches[$i2]['prob'] += $QuantifiersAddFactor; DebugToLog('RULE MATCH: Quantifying words near OW. Probability increased to ' . $matches[$i2]['prob'] . '% ' . 'TESTED MATCH=' . $matches[$i2]['OW'] . ' $QuantifiersCount=' . $QuantifiersCount); } if ($InsultsAndPosCount >= $InsultsThreshold and $homonym) { $matches[$i2]['prob'] += $InsultsAddFactor; DebugToLog('RULE MATCH: Multiple insulting words near match. Probability increased to ' . $matches[$i2]['prob'] . '%' . ' TESTED MATCH=' . 
$matches[$i2]['OW'] . ' $InsultsAndPosCount=' . $InsultsAndPosCount); } if ($InsultsAndPosCount >= $InsultsLowThreshold and $InsultsAndPosCount < $InsultsThreshold and $homonym) { $matches[$i2]['prob'] += $LowInsultsAddFactor; DebugToLog('RULE MATCH:</span> Low number of insulting words near match. Probability increased to ' . $matches[$i2]['prob'] . '%' . ' TESTED MATCH=' . $matches[$i2]['OW'] . ' $InsultsAndPosCount=' . $InsultsAndPosCount); } if (PreviousWordsAnInsult($checkstr, $TheMatch, $PositionOfMatch)) { $matches[$i2]['prob'] += $PreviousWordAnInsultAddFactor; DebugToLog('RULE MATCH:</span> Word preceding match was an insult. Probability increased to ' . $matches[$i2]['prob'] . '%' . ' TESTED MATCH=' . $matches[$i2]['OW']); } if (ContainsDisguiseChars($TheMatch, $PoorDisguisingCharsOnly, 2)) { $matches[$i2]['prob'] += $ContainsDisguiseCharsAddFactor; DebugToLog('RULE MATCH:</span> Match contains disguising characters. Probability increased to ' . $matches[$i2]['prob'] . '%' . ' TESTED MATCH=' . $matches[$i2]['OW'] . ' $PoorDisguisingCharsOnly=' . $PoorDisguisingCharsOnly); } if ($HighRecognizabilityIncreasesProbability and $Recognizability >= $HighRecognizability) { $matches[$i2]['prob'] += $HighRecognizabilityAddFactor; DebugToLog('RULE MATCH:</span> Probability increased to ' . $matches[$i2]['prob'] . '%' . ' because of high recognizability. TESTED MATCH=' . $matches[$i2]['OW']); } if (NonLetterBounds($checkstr, $TheMatch, $PositionOfMatch) and !$homonym) { $matches[$i2]['prob'] += 5; $OverallProbability += 5; DebugToLog('RULE MATCH: NonHomophone OW bounded by non-letters. Probability increased to ' . $matches[$i2]['prob'] . '%' . ' TESTED MATCH=' . $matches[$i2]['OW']); } if ($bounding == 1 and PronounReference($checkstr, $TheMatch, $PositionOfMatch)) { $matches[$i2]['prob'] += 25; // tested "hey you f****t" and only 15 or higher worked DebugToLog('RULE MATCH: OW used with pronoun reference. Probability increased to ' . $matches[$i2]['prob'] . 
'%'); } else { DebugToLog('DEBUG: Match was not quoted'); } if ($UseProximityRules and $homonym) { DebugToLog('DEBUG: Begin use of Proximity rules for match=' . $matches[$i2]['OW']); $NumProxRulesApplied = 0; for ($x = 0; $x < $ProximityRulesCount; $x++) { DebugToLog('DUMP: Proximity Rule word=' . $ProximityRules[$x]['OW'] . ' OW Matched=' . $OWArray[$i]['OW']); if ($ProximityRules[$x]['OW'] == $OWArray[$i]['OW']) { $Func = 'ProximityRule' . $ProximityRules[$x]['rule']; DebugToLog('DEBUG: This Proximity rule will now be tested=' . $Func . ' Proximity word=' . $ProximityRules[$x]['nearword']); if ($Func($checkstr, $ProximityRules[$x]['nearword'], $matches[$i2]['OW'], $matches[$i2]['pos'], $ProximityRules[$x]['proximity'], $ProximityRules[$x]['direction'])) { DebugToLog('RULE MATCH: Proximity Rule ' . $ProximityRules[$x]['rule'] . ' was applied. Match=' . $TheMatch . ' (OW=' . $OWArray[$i]['OW'] . '). Mitigating or aggravating word=' . $ProximityRules[$x]['nearword'] . ' Proximity=' . $ProximityRules[$x]['proximity'] . ' Direction=' . $ProximityRules[$x]['direction'] . ' Weight applied=' . $ProximityRules[$x]['weight']); DebugToLog('DEBUG: Probability before Proximity rule=' . $matches[$i2]['prob']); DebugToLog('DEBUG: Weight applied=' . $ProximityRules[$x]['weight']); $matches[$i2]['prob'] += $ProximityRules[$x]['weight']; $NumProxRulesApplied += 1; DebugToLog('DEBUG: Probability after Proximity rule applied=' . $matches[$i2]['prob']); } else { DebugToLog('DEBUG: Proximity rule NOT applied=' . $Func); } } } if ($NumProxRulesApplied == 0) { DebugToLog('SUBRULE MATCH: OW recognized but no Proximity rules applied for match: ' . $OWArray[$i]['OW']); } } if ($homonym and $NumProxRulesApplied == 0 and $AllWordsInAnArray_cnt <= $FewWordsInput and $AllWordsInAnArray_cnt > 1) { $matches[$i2]['prob'] += 15; DebugToLog('RULE MATCH: Probability increased to ' . $matches[$i2]['prob'] . '%' . ' because little text was input and no disambiguation terms found. TESTED MATCH=' . 
$matches[$i2]['OW']); } if ($LowRecognizabilityReducesProbability and $Recognizability < $HighRecognizability and $Recognizability * 10 < $matches[$i2]['prob']) { DebugToLog('DEBUG: Recognizability is low. Recognizability times 10 is lower than match severity. Match probability=' . $matches[$i2]['prob']); $RoundedValue = $matches[$i2]['prob'] - ($matches[$i2]['prob'] - $Recognizability * 10) / $LowRecognizabilityReducingFactor; $matches[$i2]['prob'] = round($RoundedValue); DebugToLog('RULE MATCH: Probability reduced to ' . $matches[$i2]['prob'] . '%' . ' because of low recognizability. TESTED MATCH=' . $matches[$i2]['OW'] . ' $Recognizability=' . $Recognizability); } } if ($matches[$i2]['prob'] > 100) { $matches[$i2]['prob'] = 100; } DebugToLog('DEBUG: Final Probability for ' . $matches[$i2]['OW'] . '=' . $matches[$i2]['prob']); $i2++; // increment $i2 only after REGEX matches & puts match in $matches } else { DebugToLog('DEBUG: Very low recognizability, match ignored . Match with nearest bounds=' . $MatchWithNearestBounds . ' Match=' . $TheMatch . ' Position=' . $PositionOfMatch); } } else { DebugToLog('RULE MATCH: Accidental OW. This OW will be ignored. Match with nearest bounds=' . $MatchWithNearestBounds . ' Match=' . $TheMatch); } } // end dissection } else { DebugToLog('DEBUG: This match already matched within a previous OW. Current match=' . $TheMatch); } } // end loop per instance of regex match $TotalLenMatches = 0; for ($x = 0; $x < $NumOfMatches; $x++) { $TotalLenMatches += strlen($TempArray[0][$x][0]); } DebugToLog('DEBUG: $WhatRegexFoundTakesUpMostOfString as a percent=' . $TotalLenMatches / $checkstrSize * 100); if ($TotalLenMatches / $checkstrSize * 100 > $AllowablePercentOfOWcharsInStr) { $WhatRegexFoundTakesUpMostOfString = TRUE; } else { DebugToLog('DEBUG: $WhatRegexFoundTakesUpMostOfString was NOT set to TRUE'); } } else { DebugToLog('DEBUG: Regex for OW ' . $OWArray[$i]['OW'] . ' did not match anything. 
'); } DebugToLog('DEBUG: Processing for OW ' . $OWArray[$i]['OW'] . ' is complete. ----------------------------------------------------------------------------------------'); DebugToLog('DEBUG: Analyse all matches so far ====================================================================='); $TotalMatchesSoFar = count($matches); if ($TotalMatchesSoFar > $MatchesEvaluatedSoFar) { $MatchesEvaluatedSoFar = $TotalMatchesSoFar; if ($UniqueOWsCnt < $TooManyUniqueOWs) { $UniqueOWsCnt = TooManyUniqueOws($TotalMatchesSoFar, $matches, $TooManyUniqueOWs, $OWArray); // if more than $TooManyUniqueOWs so far (regardless whether homophone) then exit & report. if ($UniqueOWsCnt >= $TooManyUniqueOWs) { $OverallProbability += 80; // x number of different homophone OWs and they all just happened to be innocent homophones? Please. DebugToLog('RULE MATCH: (A) Too many unique OWs. There are ' . $TotalMatchesSoFar . ' different unique OWs. Overall probability raised by ' . $OverallProbability . '%'); } } if (PercentOfOWsInStrTooHigh($matches, $checkstr, $AllowablePercentOfOWcharsInStr, $TotalMatchesSoFar)) { DebugToLog('RULE MATCH: Too much of text contains OWs. $AllowablePercentOfOWcharsInStr=' . 
$AllowablePercentOfOWcharsInStr); // rule applies to "f**k and f**k shit c**t shit c**t" for ($x = 0; $x <= $TotalMatchesSoFar; $x++) { $OverallProbability += 50; } } if ($WhatRegexFoundTakesUpMostOfString and $HighestSev > 7) { DebugToLog('RULE MATCH: One or more instances of a specific OW takes up most of input string'); $OverallProbability += 70; } // this rule applies to "f**k and f**k f**k f**k f**k" } if ($ExitOnHighOverallProb) { $HighestProb = 0; for ($x = 0; $x < $TotalMatchesSoFar; $x++) { if ($matches[$x]['prob'] > $HighestProb and $matches[$x]['recog'] >= $IgnoreOWRecognizabilityForOverallProbability) { $HighestProb = $matches[$x]['prob']; } } if ($HighestProb >= $HighestProbIsOverallProb) { $OverallProbability = $HighestProb; } } if ($OverallProbability > 100) { $OverallProbability = 100; } DebugToLog('DEBUG: $ExitOnHighOverallProb=' . $ExitOnHighOverallProb . ' $OverallProbability=' . $OverallProbability); if ($ExitOnHighOverallProb and $OverallProbability >= $ExitOnHighOverallProbThreshold) { DebugToLog('RULE MATCH: $ExitOnHighOverallProb TRUE and exiting on high overall probability=' . $OverallProbability); if ($ObsceneCleanReturnValue == 1) { return $OverallProbability; } if ($ObsceneCleanReturnValue == 0) { return TRUE; } if ($ObsceneCleanReturnValue > 1) { DebugToLog('ERROR: $ObsceneCleanReturnValue set to value higher than 1 is incompatible with $ExitOnHighOverallProb set to TRUE'); } } } // ============================== end big loop -one loop per OW regex /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // FINAL CHECK RULES /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ DebugToLog('DEBUG: Big Regex loop finished. So far $OverallProbability=' . $OverallProbability . 
' ================================================================='); /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // Search for offensive expressions (OEs)OEs do not have so much emphaSis on disguising. /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ if ($SearchForOEs) { $NumOfOEMatches = 0; $MasterOEList = GetMasterOEList($dbh); $oecount = count($MasterOEList); $TempOEArray = array(); $OEMatch = array(); $y = 0; DebugToLog('DEBUG: Search for Offensive Expressions (OEs) $oecount=' . $oecount); DebugToLog('DEBUG: List of offensive expressions to be applied:'); for ($i = 0; $i < $oecount; $i++) { DebugToLog('DEBUG: Begin test of OE REGEX No.=' . $i . ' Severity=' . $MasterOEList[$i]['sev'] . ' Regex=' . $MasterOEList[$i]['oe']); if (preg_match_all($MasterOEList[$i]['oe'], $SafeAppliedStr, $TempOEArray, PREG_OFFSET_CAPTURE) > 0) { DebugToLog('SUBRULE MATCH: Regex match occurred OE=' . $MasterOEList[$i]['oe'] . ' Matched=' . print_r($TempOEArray[0], true)); DebugToLog('DEBUG: preg_match_all returned: $TempOEArray[0]=' . print_r($TempOEArray[0], true)); $NumOfOEMatches = count($TempOEArray[0]); for ($x = 0; $x < $NumOfOEMatches; $x++) { $OEMatch[$y]['oematch'] = $TempOEArray[0][$x][0]; $OEMatch[$y]['pos'] = $TempOEArray[0][$x][1]; $OEMatch[$y]['sev'] = $MasterOEList[$i]['sev']; $y++; } } } if (is_array($OEMatch)) { $NumOfOEMatches = count($OEMatch); for ($x = 0; $x < $NumOfOEMatches; $x++) { DebugToLog('RULE MATCH: OE Match=' . $OEMatch[$x]['oematch'] . ' Position=' . $OEMatch[$x]['pos'] . ' Severity=' . 
$OEMatch[$x]['sev']); } if ($ReplaceOEs and $NumOfOEMatches > 0) { ReplaceOEsWithSpecialChars($SafeAppliedStr, $OEMatch); } } else { 'DEBUG: No Offensive Expressions found.'; } } /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ // FINAL REPORT $dbh = null; // close db /* --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ DebugToLog(' . '); DebugToLog(' ============================================= REPORT ============================================='); DebugToLog(' . '); if ($ReportAntagonism and $EvaluateAntagonism) { DebugToLog('Total Antagonistic words=' . $TotalAntagonisticWords); } if ($ReportInsults) { ReportInsults($InsultsAndPos); } // ------------------------------------------------------------------------------------------------------------------------------------------------- if (!is_array($matches) and $NumOfOEs = 0) { DebugToLog('DEBUG: No OWs or OEs were found. $matches is NOT an array. '); exit; } // ------------------------------------------------------------------------------------------------------------------------------------------------- $numOWMatches = 0; $numOWMatches = count($matches); DebugToLog('DEBUG: $numOWMatches =' . $numOWMatches); // ------------------------------------------------------------------------------------------------------------------------------------------------- DebugToLog('DUMP: All Matches RAW print=' . 
print_r($matches, true)); /* ------------------------------------------------------------------------------------------------------ "NiggerBitchCuntFaggot" repeated many times w/o any whitespace may produce low recognizability because MultiWordCheck is imperfect and too resource intensive to do an exhausitive analysis so here we look to see how much of the total input string is made up of OWs & raise prob and recog if it is. BTW, no algorithm can perfectly disentangle many words all sandwiched together without whitespace because there are multiple solutions and probably only human intelligence can determine which has rational meaning. The following code would also process "f**k and f**k f**k f**k f**k". The repetitiveness here makes the OW less ambiguous */ if ($checkstrSize > $InputStringIsModestlyBig) { DebugToLog('DEBUG: Input String is modestly big.'); $TotalLenMatches = 0; for ($x = 0; $x < $numOWMatches; $x++) { $TotalLenMatches += strlen($matches[$x]['OW']); } DebugToLog('DEBUG: Total length of all OW matches $TotalLenMatches=' . $TotalLenMatches); if ($TotalLenMatches / $checkstrSize * 100 > $AllowablePercentOfOWcharsInStr) { for ($x = 0; $x < $numOWMatches; $x++) { if ($matches[$x]['sev'] > $SeverityOfMoreRecognizableOWs) { $matches[$x]['recog'] = 10; // the repetitiveness of an OW makes it stand out more $matches[$x]['prob'] = 100; // High recognizability increases probability DebugToLog('RULE MATCH: Many repetitive high severity OWs make up most of input string. Recognizability and Probability increased to ' . $matches[$x]['prob'] . '%' . ' because of high recognizability for match=' . $matches[$x]['OW'] . ' Position=' . $matches[$x]['pos'] . ' Recognizability=' . $matches[$x]['recog']); } } } else { DebugToLog('DEBUG: All matches combined do not take up most of input string'); } } else { DebugToLog('DEBUG: Input String too small for repetitive OW test.'); } // ---------------------------------------SORT OUTPUT DebugToLog('DEBUG: $ReportSortOrder=' . 
$ReportSortOrder); if ($ReportSortOrder == 1) { usort($matches, 'compareByPosition'); } if ($ReportSortOrder == 2) { usort($matches, 'compareBySeverity'); } if ($ReportSortOrder == 3) { usort($matches, 'compareByOW'); } // -------------------------------------------------- One lonely Homonym? -------------------------------------------------------------------- $OneLonelyHomonymOW = FALSE; if ($numOWMatches > 0 and $OneLonelyOWrule) { for ($x = 0; $x < $numOWMatches; $x++) { $OWsOnly[$x] = $matches[$x]['OWidx']; } $OWsOnlyUnique = array_unique($OWsOnly); // condense the index of OWs into new array to unique OWs only. Plural and singular of same homonym word would break this logic but it is infrequent enough that we do not care. bunghole and bungholes would not reduce to one homonym. DebugToLog('DEBUG: Total unique OWs, reduced=' . count($OWsOnlyUnique)); if (count($OWsOnlyUnique) == 1) { $idx4 = $OWsOnlyUnique[0]; if ($OWArray[$idx4]['homonym'] == 1) { $OneLonelyHomonymOW = TRUE; } } } $Lonelyhomonym_val = $OneLonelyHomonymOW ? 'true' : 'false'; DebugToLog('DEBUG: $OneLonelyHomonymOW=' . $Lonelyhomonym_val); DebugToLog('DEBUG: (A)========> So far $OverallProbability=' . $OverallProbability); /* ------------------------------------------------------------------------------------------------------------------- If only one Homonym OW, there is no insulting language and antagonism is very low and then lower Overall prob This rule may seem arbitrary and applied on a very generalized basis BUT, after much testing, it seems to work amazingly well in eliminating homonym false positives. Only rarely does it let an OW get through. 
-------------------------------------------------------------------------------------------------------------------------- */ if ($numOWMatches > 0 and $TotalInsultingCount < 1 and $TotalAntagonisticWords <= $AntagonismLow and $OneLonelyHomonymOW) { $NumCapPronouns = 0; $OverallProbability -= $OneLonelyOWSubtractFactor; DebugToLog('RULE MATCH: Only 1 Homonym OW found when matches are sorted and reduced, no insults and little or no antagonism. Probability of offensive homophone reduced. Overall probability was lowered by ' . $OneLonelyOWSubtractFactor . '%'); DebugToLog('DEBUG: (B)========> So far $OverallProbability=' . $OverallProbability); if ($PayAttentionToProperNouns) { for ($x = 0; $x < $numOWMatches; $x++) { $TestFirstUpper = strtolower($matches[$x]['OW']); $TestFirstUpper = ucfirst($TestFirstUpper); // make first char uppercase if ($matches[$x]['OW'] == $TestFirstUpper) { DebugToLog('DEBUG: Proper Noun found for one Lonely Homonym. Probability reduced for #' . $x . ' Match=' . $matches[$x]['OW'] . ' Position=' . $matches[$x]['pos']); $matches[$x]['prob'] -= $ProperNounSubtractFactor; $NumCapPronouns++; } } if ($numOWMatches > 2 and $numOWMatches == $NumCapPronouns) { DebugToLog('RULE MATCH: All OWs are capitalized proper nouns. Overall probability lowered.'); $OverallProbability += $OneLonelyOWandAllCapPronouns; } } else { DebugToLog('DEBUG: Proper Nouns ignored'); } } DebugToLog('DEBUG: (C)========> So far $OverallProbability=' . $OverallProbability); // ------------------------------------------------------------------------------------------------------ // --------------------------------------- Overall Prob? --------------------------------------------------------------- if ($EvaluateAntagonism and $AntagonismAddsToOverallProb and $AntagonismProbability > 0) { DebugToLog('RULE MATCH: Antagonism is high and adds $AntagonismProbability to overall probability. $AntagonismProbability=' . 
$AntagonismProbability); $OverallProbability += $AntagonismProbability; } DebugToLog('DEBUG: (D)========> So far $OverallProbability=' . $OverallProbability); $HighestProb = 0; $TotalProbs = 0; $OverallProbDivisor = 0; DebugToLog('DEBUG: OW $numOWMatches=' . $numOWMatches); if ($numOWMatches > 0) { $TotalProbs = 0; $HighestProb = $matches[0]['prob']; if ($HighestProb > 100) { $HighestProb = 100; } for ($i = 0; $i < $numOWMatches; $i++) { if ($matches[$i]['prob'] > 100) { $matches[$i]['prob'] = 100; } if ($matches[$i]['prob'] < 1) { $matches[$i]['prob'] = 1; } if ($matches[$i]['prob'] > $HighestProb) { $HighestProb = $matches[$i]['prob']; } // What is the highest proability we have found so far? if ($matches[$i]['recog'] > $IgnoreOWRecognizabilityForOverallProbability and $matches[$i]['sev'] > $IgnoreOWSeverityForOverallProbability) { $TotalProbs += $matches[$i]['prob']; $OverallProbDivisor++; } } } else { DebugToLog('DEBUG: No OWs Matched. $numOWMatches =' . $numOWMatches); } DebugToLog('DEBUG: Total OWs found is $numOWMatches=' . $numOWMatches . ' -- Total minus ignored due to low recognizability and probability is $OverallProbDivisor=' . $OverallProbDivisor); DebugToLog('DEBUG: $HighestProb=' . $HighestProb . ' $TotalProbs=' . $TotalProbs . ' $NumOfOEMatches=' . $NumOfOEMatches); $AllMatches = $numOWMatches; /* ----------------------------------------------------------------------------------------------- By def all OEs are considered and given a probability of 100% because multi-word expression are far less ambiguous. To change this see settings. */ if ($SearchForOEs and $NumOfOEMatches > 0) { $AllMatches += $NumOfOEMatches; DebugToLog('DEBUG: $AllMatches=' . $AllMatches . ' $NumOfOEMatches=' . $NumOfOEMatches); for ($x = 0; $x < $NumOfOEMatches; $x++) { if ($OEMatch[$x]['sev'] >= $OESeveritiesToApply) { $TotalProbs += $OEProbabiltyToApply; $OverallProbDivisor++; } } $HighestProb = $OEProbabiltyToApply; } DebugToLog('DEBUG: Final $TotalProbs=' . 
$TotalProbs); DebugToLog('DEBUG: Final $OverallProbDivisor=' . $OverallProbDivisor); if ($TotalProbs > 0) { if ($HighestProb >= $HighestProbIsOverallProb and !$OneLonelyHomonymOW) { $OverallProbability += $HighestProb; // we changed it to add here from assign. testing? DebugToLog('RULE MATCH: Highest probability ($HighestProb) added to overall probability. $HighestProbIsOverallProb=' . $HighestProbIsOverallProb . ' $HighestProb=' . $HighestProb); DebugToLog('DEBUG: (E)========> So far $OverallProbability=' . $OverallProbability); } else { DebugToLog('DEBUG: Overall probability will be based on average. Total addition of individual probabilities: $TotalProbs=' . $TotalProbs); DebugToLog('DEBUG: Total number of OE and OW matches summed. $AllMatches=' . $AllMatches); $AverageProb = $TotalProbs / $OverallProbDivisor; // do not use std dev here cause one outlier can be telling DebugToLog('DEBUG: $AverageProb=' . $AverageProb); $OverallProbability += $AverageProb; // "+=" because you can set $OverallProbability to have a value when the program starts if ($OverallProbability > 100) { $OverallProbability = 100; } $OverallProbability = round($OverallProbability); DebugToLog('RULE MATCH: Average probability, ' . $AverageProb . ', was added to overall probability. $OverallProbability=' . $OverallProbability); } } else { DebugToLog('DEBUG: No OWs or OEs had high enough recognizability or severity to be used in calculating overall probability. '); } // and with good reason /* ----------------------------------------------------------------------------------------------------------------------------------- */ if ($SearchForOEs and $NumOfOEMatches > 0) { DebugToLog('Total Offensive Expressions found: ' . 
$NumOfOEMatches); } // -------------------------------------------- REPORT --------------------------------------------------------- $AllOWsFound = array(); $j = 0; if ($numOWMatches > 0) { DebugToLog(' ============================ OFFENSIVE WORDS FOUND =========================='); for ($i = 0; $i < $numOWMatches; $i++) { if ($matches[$i]['sev'] >= $lowestsev and ($matches[$i]['recog'] >= $LowestRecognizabilityReported and $matches[$i]['sev'] >= $LowestSeverityReported)) { $idx = $matches[$i]['OWidx']; // if ($OWArray[$idx]['soundalike']) { $soundalike_val = 'TRUE'; } else { $soundalike_val = 'FALSE'; } // if ($OWArray[$idx]['homonym']) { $homonym_val = 'TRUE'; } else { $homonym_val = 'FALSE'; } $CatSubcatDesc = GetCatSubcatDescription($OWArray[$idx]['cat'], $OWArray[$idx]['subcat'], $OWLookupArray); $MainCat = $CatSubcatDesc['cat']; $SubCat = $CatSubcatDesc['subcat']; $AllOWsFound[$j]['OW'] = $matches[$i]['OW']; $AllOWsFound[$j]['OWReferenced'] = $OWArray[$idx]['OW']; $AllOWsFound[$j]['position'] = $matches[$i]['pos']; $AllOWsFound[$j]['sev'] = $matches[$i]['sev']; $AllOWsFound[$j]['recognizability'] = $matches[$i]['recog']; $AllOWsFound[$j]['cat'] = $MainCat; $AllOWsFound[$j]['subcat'] = $SubCat; $AllOWsFound[$j]['homonym'] = $OWArray[$idx]['homonym']; // $homonym_val; $AllOWsFound[$j]['soundalike'] = $OWArray[$idx]['soundalike']; // $soundalike_val; $AllOWsFound[$j]['weight'] = $OWArray[$idx]['weight']; $AllOWsFound[$j]['probability'] = $matches[$i]['prob']; $AllOWsFound[$j]['pluralsuffix'] = $OWArray[$idx]['pluralsuffix']; DebugToLog('#' . $j . ' Found=' . $AllOWsFound[$j]['OW'] . ' Searched for=' . $AllOWsFound[$j]['OWReferenced'] . ' with plural suffix=' . $AllOWsFound[$j]['pluralsuffix'] . ' Position=' . $AllOWsFound[$j]['position'] . ' Severity=' . $AllOWsFound[$j]['sev'] . ' Recognizability=' . $AllOWsFound[$j]['recognizability'] . ' Category=' . $AllOWsFound[$j]['cat'] . ' Subcategory=' . $AllOWsFound[$j]['subcat'] . ' Soundalike=' . 
(is_bool($AllOWsFound[$j]['soundalike']) ? $AllOWsFound[$j]['soundalike'] ? "true" : "false" : $AllOWsFound[$j]['soundalike']) . ' Homonym=' . (is_bool($AllOWsFound[$j]['homonym']) ? $AllOWsFound[$j]['homonym'] ? "true" : "false" : $AllOWsFound[$j]['homonym']) . ' Inital weight=' . $AllOWsFound[$j]['weight'] . ' Probability of offensiveness=' . $AllOWsFound[$j]['probability']); $j++; } else { DebugToLog('DEBUG: Match will not be reported because recognizability and severity are too low. Match=' . $matches[$i]['OW'] . ' Pos=' . $matches[$i]['pos'] . ' severity=' . $matches[$i]['sev'] . ' recognizability=' . $matches[$i]['recog']); } } DebugToLog('========================================================================='); } else { DebugToLog(' -------------------- NO OFFENSIVE WORDS FOUND -----------------------------------------'); } if ($OverallProbability < 1) { $OverallProbability = 1; } if ($OverallProbability > 100) { $OverallProbability = 100; } // -------------------------------------------- REPLACEMENT --------------------------------------------------------- if ($ReplaceOWs and $OverallProbability >= $ReplaceOWsWithSpecialCharsOverallProb) { ReplaceOWsWithSpecialChars($SafeAppliedStr, $matches); DebugToLog('DEBUG: Input string after replacements=' . $SafeAppliedStr); } // -------------------------------------------- FINAL --------------------------------------------------------- DebugToLog('DEBUG: ENDING MEMORY=' . memory_get_usage()); DebugToLog('DEBUG: MEMORY PEAK=' . memory_get_peak_usage()); DebugToLog(' <h2>The Overall probability of offensive language is ' . $OverallProbability . '%' . ' Current threshold is >=' . $OverallProbabilityThreshold . '% </h2>'); DebugToLog('DEBUG: All done here now chill. Yeah you.'); DebugToLog('DEBUG: ======================= END ObsceneClean Run started on ' . $currentDateTime . 
' =============================='); if ($ObsceneCleanReturnValue == 0) { if ($OverallProbability >= $OverallProbabilityThreshold) { return TRUE; } else { return FALSE; } } if ($ObsceneCleanReturnValue == 1) { return $OverallProbability; } if ($ObsceneCleanReturnValue == 2) { return $SafeAppliedStr; } // accent marks and Unicode chars may be stripped from returned output. if ($ObsceneCleanReturnValue == 3) { return $AllOWsFound; } // accent marks and Unicode chars may be stripped from returned output. if ($ObsceneCleanReturnValue == 4) { if (is_array($OEMatch)) { $AllOWsOEs = array(); $AllOWsOEs[0] = $AllOWsFound; $AllOWsOEs[1] = $OEMatch; return $AllOWsOEs; } } }