/**
 * Builds the fingerprint of a text.
 *
 * The text is stripped with StripText(), every character n-gram of the
 * stripped text is hashed with md5, and one hash per window of consecutive
 * hashes is selected (the smallest one; on ties the right-most). After a
 * selection the next window starts right behind the selected hash.
 *
 * The gram size and the window size are read from the 'plagiarism' config
 * (keys 'crot_grammarsize' and 'crot_windowsize').
 *
 * @param string $atext the original (unstripped) text
 * @return array list of Fingerprint objects; ->value is the md5 hex digest of
 *               the selected gram, ->position is the index of the gram's first
 *               character within the character sequence of $atext. An empty
 *               array is returned when stripping fails, when the window size
 *               is not positive, or when the text is too short for one window.
 */
function GetFingerprint($atext) {
    $plagiarismsettings = (array) get_config('plagiarism');
    $gram_size = (int) $plagiarismsettings['crot_grammarsize'];
    $window_size = (int) $plagiarismsettings['crot_windowsize'];

    // A window smaller than one hash would keep the selection loop below
    // from ever advancing.
    if ($window_size < 1) {
        return array();
    }

    try {
        $stripped_text = StripText($atext, "");
    } catch (Exception $e) {
        echo "exception with stripping\n";
        flush();
        // Nothing can be fingerprinted without the stripped text.
        return array();
    }

    // NOTE: this bound leaves the very last gram out (length - gram_size
    // instead of length - gram_size + 1). It is kept as is so that new
    // fingerprints stay comparable with previously computed ones.
    $text_len = mb_strlen($stripped_text, "utf-8") - $gram_size;

    // Split both texts into arrays of single (multibyte) characters.
    preg_match_all('/.|\\n/u', $stripped_text, $matches);
    $curstripped = $matches[0];
    preg_match_all('/.|\\n/u', $atext, $matches2);
    $btext = $matches2[0];
    $orig_len = count($btext);

    // Hash every gram and remember where its first character sits in the
    // original text.
    $orig_positions = array();
    $values = array();
    $offset = 0;
    for ($i = 0; $i < $text_len; $i++) {
        // Skip the characters of the original text that were stripped out.
        while ($offset < $orig_len && $btext[$offset] !== $curstripped[$i]) {
            $offset++;
        }
        if ($offset >= $orig_len) {
            // The stripped character does not occur in the rest of the
            // original text: fingerprint only what could be mapped instead
            // of scanning past the end of the text forever.
            $text_len = $i;
            break;
        }
        $orig_positions[$i] = $offset;
        $offset++;
        $values[$i] = hash('md5', mb_substr($stripped_text, $i, $gram_size, "utf-8"));
    }

    // Compile the fingerprint.
    $fingers = array();
    $up = $text_len - $window_size + 1;
    $i = 0;
    while ($i < $up) {
        // Start with the right-most hash of the window and walk to the left;
        // only a strictly smaller hash replaces the current minimum.
        $minHashPos = $i + $window_size - 1;
        for ($j = $minHashPos - 1; $j >= $i; $j--) {
            // strcmp() instead of "<": two hex digests that happen to look
            // like numbers (only digits, or "0e...") would otherwise be
            // compared numerically.
            if (strcmp($values[$j], $values[$minHashPos]) < 0) {
                $minHashPos = $j;
            }
        }

        $finger = new Fingerprint();
        $finger->value = $values[$minHashPos];
        $finger->position = $orig_positions[$minHashPos];
        $fingers[] = $finger;

        // The next window starts right after the selected hash.
        $i = $minHashPos + 1;
    }

    return $fingers;
}
} else { // TODO update } } // end of comparing with local documents } } // end for local search if ($plagiarismvalues['crot_global'] == 1) { // global search echo "\nfile {$afile->id} is selected for global search. Starting global search\n"; // strip text $atext = StripText($atext, " "); // create search queries $words = array(); $words = preg_split("/[\\s]+/", trim(StripText($atext, " "))); $max = sizeof($words) - $query_size + 1; $queries = array(); for ($i = 0; $i < $max; $i++) { $query = ""; for ($j = $i; $j - $i < $query_size; $j++) { $query = $query . " " . $words[$j]; } $queries[] = $query; } // queries are ready! // create list of URLs srand((double) microtime() * 10000000); // randomly select x% of queries $rand_keys = array_rand($queries, sizeof($queries) / 100 * $globs); $narr = array();