/**
 * Checks hasSubstring() against a text whose longest run of "b" is five
 * characters: runs of seven and six "b" are expected to yield -1, the run
 * of five is expected to yield 1.
 */
public function testHasSubstring3()
{
    $tree = new SuffixTree("ceccbbbbbecbecbebbebbb");

    // needle => expected return value of hasSubstring(), checked in this order
    $expectations = [
        "bbbbbbb" => -1,
        "bbbbbb"  => -1,
        "bbbbb"   => 1,
    ];

    foreach ($expectations as $needle => $expected) {
        $this->assertEquals($expected, $tree->hasSubstring($needle));
    }
}
/**
 * Recursively writes a surprise value onto every non-root node of the analysis tree.
 *
 * For each visited node the edge label of that node is appended to the string
 * accumulated on the way down from the root. An expected occurrence count for
 * that string is then derived from the reference tree and the node's
 * surpriseValue is set to
 *
 *     (occurrences of the string in the analysis tree) - (expected count).
 *
 * The expected count is obtained by the first strategy that applies:
 *  1. the string occurs in the reference tree: its occurrence count there,
 *     multiplied by a scale factor built from both text sizes;
 *  2. otherwise, if for some window length > 1 every window of the string
 *     occurs in the reference tree: a quotient of products of window
 *     occurrence counts, multiplied by the same scale factor;
 *  3. otherwise: an estimate built from per-character counts.
 *
 * @param SuffixTree $pReferenceTree     Tree whose substring occurrences serve as the reference
 * @param SuffixTree $pAnalysisTree      Tree whose nodes receive the surprise values
 * @param Node       $pNode              Node to annotate; a node whose start or end is -1 is
 *                                       treated as the root and only its children are processed
 * @param string     $representedString  String accumulated from the root down to the parent of
 *                                       $pNode (empty for the initial call on the root)
 */
private function annotateNode(SuffixTree &$pReferenceTree, SuffixTree &$pAnalysisTree, Node &$pNode, $representedString)
{
    $isRoot = $pNode->start == -1 || $pNode->end == -1;

    if (!$isRoot) {
        // Extend the accumulated string by the edge label of this node.
        $analysisText = implode('', $pAnalysisTree->text);
        $representedString .= substr($analysisText, $pNode->start, $pNode->end - $pNode->start);

        $length = strlen($representedString);
        $analysisSize = count($pAnalysisTree->text);

        // Ratio of the number of windows of this length in the analysis text to
        // the number in the reference text.
        // NOTE(review): the divisor is zero when the reference text is exactly one
        // element shorter than the represented string — confirm callers rule this out.
        $scaleFactor = ($analysisSize - $length + 1) / (count($pReferenceTree->text) - $length + 1);

        if ($pReferenceTree->getOccurence($representedString) > 0) {
            // Strategy 1: the string itself is present in the reference tree.
            $expected = $scaleFactor * $pReferenceTree->getOccurence($representedString);
        } else {
            // Find the largest window size (starting at 2, strictly below the string
            // length) for which every window of the represented string is reported
            // by the reference tree. The search stops at the first window whose
            // occurrence count is exactly 0; the last fully matched size is kept.
            $windowSize = 1;
            for ($candidate = 2; $candidate < $length; $candidate++) {
                for ($offset = 0; $offset <= $length - $candidate; $offset++) {
                    $piece = substr($representedString, $offset, $candidate);
                    if ($pReferenceTree->getOccurence($piece) === 0) {
                        // This size fails, so larger sizes are not tried either.
                        break 2;
                    }
                }
                $windowSize = $candidate;
            }

            if ($windowSize > 1) {
                // Strategy 2: product of the counts of all windows of the found size,
                // divided by the product of the counts of the windows that are one
                // character shorter and start at offsets 1 .. (windowCount - 1).
                $windowCount = $length - $windowSize + 1;

                $numerator = 1;
                for ($offset = 0; $offset < $windowCount; $offset++) {
                    $numerator *= $pReferenceTree->getOccurence(substr($representedString, $offset, $windowSize));
                }

                $divisor = 1;
                for ($offset = 1; $offset < $windowCount; $offset++) {
                    $divisor *= $pReferenceTree->getOccurence(substr($representedString, $offset, $windowSize - 1));
                }

                $expected = $scaleFactor * ($numerator / $divisor);
            } else {
                // Strategy 3: per-character estimate.
                // NOTE(review): only the first $length entries of the reference text are
                // compared and the match counter starts at 1 — confirm both are intended.
                $ratioSum = 0;
                for ($i = 0; $i < $length; $i++) {
                    $symbol = $representedString[$i];

                    $matches = 1;
                    for ($k = 0; $k < $length; $k++) {
                        if ($pReferenceTree->text[$k] == $symbol) {
                            $matches++;
                        }
                    }

                    $ratioSum += $matches / $pAnalysisTree->getOccurence($symbol);
                }

                // NOTE(review): the ratios are summed and scaled by (size + length + 1);
                // verify against the formula this is meant to implement.
                $expected = ($analysisSize + $length + 1) * $ratioSum;
            }
        }

        $pNode->surpriseValue = $pAnalysisTree->getOccurence($representedString) - $expected;
    }

    // Descend into the children, handing down the string accumulated so far.
    foreach ($pNode->next as $childIndex) {
        $this->annotateNode($pReferenceTree, $pAnalysisTree, $pAnalysisTree->nodes[$childIndex], $representedString);
    }
}