예제 #1
0
 /**
  * Extracts the fingerprints of a document using the Winnowing algorithm described in
  * the article `Winnowing: Local Algorithms for Document Fingerprinting` by Saul Schleimer,
  * Daniel S. Wilkerson, and Alex Aiken.
  *
  * Every k-gram (run of `$this->k` consecutive bytes) of the text is hashed with MD5. A
  * window of `$this->threshold - $this->k + 1` consecutive hashes is then slid over that
  * sequence and the smallest hash of each window is selected (the rightmost one on ties).
  * A position selected by several windows is recorded only once, so the result is ordered
  * by position in the document.
  *
  * Notes:
  *  - The text is processed byte-wise (`strlen`/`substr`), not per multi-byte character.
  *  - A document shorter than `k` bytes, or a `k` smaller than 1, yields an empty array.
  *  - A document with fewer k-grams than the window size is treated as a single window.
  *  - A window size below 1 (threshold smaller than k) is clamped to 1, i.e. every
  *    k-gram hash is selected.
  *
  * @param string $documentContent the document text
  * @return array a list of the values returned by `Fingerprint::fill($hash, $location, $index)`,
  *               where `$location` is the byte offset of the selected k-gram and `$index`
  *               is its 0-based rank among the selected fingerprints
  */
 public function extractFingerprint($documentContent)
 {
     $contentLength = strlen($documentContent);
     // A text of n bytes contains n - k + 1 k-grams (not n - k).
     $gramCount = $contentLength - $this->k + 1;
     if ($this->k < 1 || $gramCount < 1) {
         return array();
     }

     // Hash every k-gram exactly once; $hashes[$i] belongs to the k bytes starting at offset $i.
     $hashes = array();
     for ($i = 0; $i < $gramCount; $i++) {
         $hashes[$i] = hash('md5', substr($documentContent, $i, $this->k), false);
     }

     // Keep the window inside [1, $gramCount] so it is never empty and never runs
     // past the last complete k-gram.
     $windowSize = min($gramCount, max(1, $this->threshold - $this->k + 1));
     $windowCount = $gramCount - $windowSize + 1;

     $fingerprints = array();
     for ($start = 0; $start < $windowCount; $start++) {
         $min = $start;
         $end = $start + $windowSize;
         for ($j = $start + 1; $j < $end; $j++) {
             // strcmp() forces a byte-wise comparison: `<=` would compare digests that
             // happen to look numeric (all digits, "0e123...") as numbers.
             // `<= 0` keeps the rightmost minimum of the window.
             if (strcmp($hashes[$j], $hashes[$min]) <= 0) {
                 $min = $j;
             }
         }
         // Keyed by position so a k-gram selected by several windows is stored once.
         $fingerprints[$min] = $hashes[$min];
     }

     $ret = array();
     $normalizedLocation = 0;
     foreach ($fingerprints as $loc => $hash) {
         $ret[] = Fingerprint::fill($hash, $loc, $normalizedLocation++);
     }
     return $ret;
 }