/**
 * Extracts the fingerprints of a document with the Winnowing algorithm described
 * in the article `Winnowing: Local Algorithms for Document Fingerprinting` by
 * Saul Schleimer, Daniel S. Wilkerson, and Alex Aiken.
 *
 * Every k-gram (substring of `$this->k` bytes) of the text is hashed with MD5.
 * A window of `$this->threshold - $this->k + 1` consecutive k-gram hashes is then
 * slid over the document and the smallest hash of each window is selected; on
 * ties the rightmost occurrence wins. A position selected by several windows is
 * reported only once. Similar documents share long runs of identical k-grams and
 * therefore end up selecting the same hashes, even when individual hashes differ.
 *
 * Offsets are byte offsets: the text is processed with `strlen`/`substr`, not
 * with the multibyte string functions.
 *
 * When the document holds fewer k-grams than one full window, the whole document
 * is treated as a single window, so any text of at least `$this->k` bytes yields
 * at least one fingerprint. An empty array is returned when the text is shorter
 * than `$this->k`, or when `$this->k` or the window size is smaller than 1.
 *
 * @param string $documentContent the document text
 * @return array list of the values returned by Fingerprint::fill(), one per
 *               selected position, in ascending order of position
 */
public function extractFingerprint($documentContent)
{
    $documentContent = (string) $documentContent;
    $k = $this->k;
    $windowSize = $this->threshold - $k + 1;

    // A text of N bytes contains N - k + 1 complete k-grams.
    $kgramCount = strlen($documentContent) - $k + 1;

    if ($k < 1 || $windowSize < 1 || $kgramCount < 1) {
        return array();
    }

    // Hash every complete k-gram exactly once; windows only index into this list,
    // so no window can read past the end of the document.
    $hashes = array();
    for ($pos = 0; $pos < $kgramCount; $pos++) {
        $hashes[$pos] = hash('md5', substr($documentContent, $pos, $k), false);
    }

    // Short documents: shrink the window so that at least one window exists.
    $windowSize = min($windowSize, $kgramCount);
    $windowCount = $kgramCount - $windowSize + 1;

    // Keyed by k-gram position so a position picked by several windows is kept once.
    $fingerprints = array();
    for ($start = 0; $start < $windowCount; $start++) {
        $min = $start;
        $end = $start + $windowSize;
        for ($pos = $start + 1; $pos < $end; $pos++) {
            // strcmp() keeps the comparison bytewise: the <= operator would compare
            // numeric-looking digests (all digits, "0e...") as numbers.
            // "<= 0" makes the rightmost minimal hash win on ties.
            if (strcmp($hashes[$pos], $hashes[$min]) <= 0) {
                $min = $pos;
            }
        }
        $fingerprints[$min] = $hashes[$min];
    }

    $ret = array();
    $normalizedLocation = 0;
    foreach ($fingerprints as $loc => $hash) {
        // Arguments: the hash, its byte offset, and its index among the selected hashes.
        $ret[] = Fingerprint::fill($hash, $loc, $normalizedLocation++);
    }

    return $ret;
}