Beispiel #1
0
 /**
  * Probes hasSubstring() with runs of "b" of decreasing length on a fixed text.
  *
  * The longest run of "b" in the text is five characters, so the seven- and
  * six-character needles are expected to yield -1 and the five-character
  * needle is expected to yield 1.
  */
 public function testHasSubstring3()
 {
     $suffixTree = new SuffixTree("ceccbbbbbecbecbebbebbb");

     // needle => expected return value, checked in this order
     $expectations = array(
         "bbbbbbb" => -1,
         "bbbbbb"  => -1,
         "bbbbb"   => 1,
     );

     foreach ($expectations as $needle => $expected) {
         $this->assertEquals($expected, $suffixTree->hasSubstring($needle));
     }
 }
Beispiel #2
0
 /**
  * Recursively annotates the analysis tree with surprise values relative to
  * the reference tree.
  *
  * For every non-root node the string spelled out on the path from the root is
  * extended by the node's edge label, an expected occurrence count is estimated
  * from the reference tree, and the node's surpriseValue is set to the observed
  * occurrence count in the analysis tree minus that estimate. The estimate is
  * taken from the first of the following that applies:
  *   1. the occurrence count of the whole string in the reference tree, scaled;
  *   2. a quotient of occurrence counts of its substrings, using the largest
  *      substring length (> 1) for which every substring of that length occurs
  *      in the reference tree, scaled;
  *   3. a per-character approximation.
  *
  * @param  SuffixTree $pReferenceTree    Tree whose substring occurrences serve as reference
  * @param  SuffixTree $pAnalysisTree     Tree on which to calculate surprise values
  * @param  Node       $pNode             Current active node (at the beginning: the root node)
  * @param  string     $representedString String spelled out from the root of the analysis
  *                                       tree down to the parent of the active node; the
  *                                       active node's own edge label is appended here.
  * @return void
  */
 private function annotateNode(SuffixTree &$pReferenceTree, SuffixTree &$pAnalysisTree, Node &$pNode, $representedString)
 {
     if ($pNode->start != -1 && $pNode->end != -1) {
         // not the root node: append this node's edge label to the path string
         $word = implode('', $pAnalysisTree->text);
         $representedString .= substr($word, $pNode->start, $pNode->end - $pNode->start);

         $length = strlen($representedString);
         $analysisLength = count($pAnalysisTree->text);
         $referenceLength = count($pReferenceTree->text);

         // Estimate taken from the reference tree before scaling; stays null
         // when the reference tree offers nothing to base it on.
         $unscaledEstimate = null;

         $occurenceOfWhole = $pReferenceTree->getOccurence($representedString);
         if ($occurenceOfWhole > 0) {
             // trivial case: the whole string occurs in the reference tree
             $unscaledEstimate = $occurenceOfWhole;
         } else {
             // Find the largest substring length l (1 < l < length) such that
             // every substring of that length occurs in the reference tree.
             $largestInterval = 1;
             for ($l = 2; $l < $length; $l++) {
                 for ($j = 0; $j < $length - $l + 1; $j++) {
                     if ($pReferenceTree->getOccurence(substr($representedString, $j, $l)) === 0) {
                         // a substring of length $l is missing, so the
                         // previous length was the largest one
                         break 2;
                     }
                 }
                 // all substrings of length $l were found
                 $largestInterval = $l;
             }

             if ($largestInterval > 1) {
                 // product over all substrings of length $largestInterval ...
                 $counter = 1;
                 for ($j = 0; $j < $length - $largestInterval + 1; $j++) {
                     $counter *= $pReferenceTree->getOccurence(substr($representedString, $j, $largestInterval));
                 }
                 // ... divided by the product over the substrings of length
                 // $largestInterval - 1 starting at offsets 1 and up
                 $denominator = 1;
                 for ($j = 1; $j < $length - $largestInterval + 1; $j++) {
                     $denominator *= $pReferenceTree->getOccurence(substr($representedString, $j, $largestInterval - 1));
                 }
                 $unscaledEstimate = $counter / $denominator;
             }
         }

         if ($unscaledEstimate !== null) {
             // Ratio of the number of windows of this length in the analysis
             // text to the number in the reference text. Computed only here so
             // that the fallback branch, which does not use it, cannot fail on
             // its divisor.
             // NOTE(review): the divisor is zero when
             // $length == $referenceLength + 1 - confirm whether that can
             // happen on this path.
             $scaleFactor = ($analysisLength - $length + 1) / ($referenceLength - $length + 1);
             $occurenceInRef = $scaleFactor * $unscaledEstimate;
         } else {
             // Approximate the reference occurrence from the individual
             // characters of the represented string.
             $probSum = 0;
             for ($i = 0; $i < $length; $i++) {
                 $symbol = $representedString[$i];
                 // Count this character over the whole reference text. The
                 // count starts at 1, so a character absent from the reference
                 // still contributes a non-zero term.
                 $counter = 1;
                 foreach ($pReferenceTree->text as $referenceSymbol) {
                     if ($referenceSymbol == $symbol) {
                         $counter++;
                     }
                 }
                 $denominator = $pAnalysisTree->getOccurence($symbol);
                 $probSum += $counter / $denominator;
             }
             // NOTE(review): this adds $length where the scale factor above
             // subtracts it (number of windows = text length - length + 1);
             // kept as in the original - confirm the intended formula.
             $occurenceInRef = ($analysisLength + $length + 1) * $probSum;
         }

         $pNode->surpriseValue = $pAnalysisTree->getOccurence($representedString) - $occurenceInRef;
     }

     // annotate children; each child receives its own copy of the path string
     foreach ($pNode->next as $childValue) {
         $this->annotateNode($pReferenceTree, $pAnalysisTree, $pAnalysisTree->nodes[$childValue], $representedString);
     }
 }