/**
 * Bounds checking for utf8_correctIdx().
 *
 * Each case is (input string, byte offset, boolean third argument, expected
 * index). An offset before the start of the string must yield 0 and an
 * offset far past its end must yield 29, for both values of the boolean.
 */
function test_bounds() {
    $subject = 'aaживπά우리をあöä';

    // out-of-range offset => index the helper is expected to return
    $expectedByOffset = array(
        -2  => 0,
        128 => 29,
    );

    // same order as a flat table: both offsets with false, then both with true
    foreach (array(false, true) as $flag) {
        foreach ($expectedByOffset as $offset => $expected) {
            $actual = utf8_correctIdx($subject, $offset, $flag);
            $this->assertEqual($actual, $expected);
        }
    }
}
/**
 * Creates a snippet extract
 *
 * Reads the raw text of a page, looks for up to four occurrences of the
 * highlight phrases and cuts a window of surrounding context around each
 * one. The windows are joined with '... ' and every hit is wrapped in
 * <strong class="search_hit">.
 *
 * The work is wrapped in a FULLTEXT_SNIPPET_CREATE event; the event data
 * carries the page id, the text and the highlight list (both by reference)
 * and the resulting snippet. When advise_before() returns a falsy value the
 * built-in snippet generation is skipped and whatever is in
 * $evdata['snippet'] is returned.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string       $id        id of the page, passed to rawWiki()
 * @param string|array $highlight phrase or list of phrases to highlight
 * @return string the snippet stored in the event data
 */
function ft_snippet($id, $highlight) {
    $text = rawWiki($id);
    // written as an escape sequence so the (invisible) character cannot get lost
    $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens

    $evdata = array(
        'id'        => $id,
        'text'      => &$text,
        'highlight' => &$highlight,
        'snippet'   => '',
    );

    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($evt->advise_before()) {
        $match = array();
        $snippets = array();
        $utf8_offset = $offset = $end = 0;
        $len = utf8_strlen($text);

        // build a regexp from the phrases to highlight
        $terms = array_filter((array) $highlight);
        $terms = array_map('preg_quote_cb', $terms);
        $terms = array_map('ft_snippet_re_preprocess', $terms);
        $re1 = '(' . join('|', $terms) . ')';
        // two resp. three hits close together; the negative look-aheads
        // reject a hit that merely repeats an earlier captured one
        $re2 = "{$re1}.{0,75}(?!\\1){$re1}";
        $re3 = "{$re1}.{0,45}(?!\\1){$re1}.{0,45}(?!\\1)(?!\\2){$re1}";

        for ($cnt = 4; $cnt--;) {
            // try the richest pattern first and fall back to simpler ones;
            // short-circuiting keeps $match from the first pattern that hit
            if (
                !preg_match('/' . $re3 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                && !preg_match('/' . $re2 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                && !preg_match('/' . $re1 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
            ) {
                break;
            }

            list($str, $idx) = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text, 0, $idx));
            $utf8_len = utf8_strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif (count($snippets) === 0) {
                // first match and both sides are at most 50: the context is
                // the whole string, so use it and skip the complex snippet
                // calculations.
                // This shortcut must not be taken for later matches: there
                // $pre is measured from the previous context, so a small
                // value says nothing about the size of the whole text.
                $snippets = array($text);
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = $start < $end ? $end : false; // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post;   // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= utf8_substr($text, $append, $end - $append);
            } else {
                $snippets[] = utf8_substr($text, $start, $end - $start);
            }

            // set $offset for next match attempt
            // the match length is deliberately left out to avoid splitting a
            // potential search success; this is an approximation as the search
            // pattern may match strings of varying length and it will fail if
            // the context snippet boundary breaks a matching string longer
            // than the current match
            $utf8_offset = $utf8_idx + $post;
            $offset = $idx + strlen(utf8_substr($text, $utf8_idx, $post));
            $offset = utf8_correctIdx($text, $offset);
        }

        // Hits are first wrapped in a marker character, then the joined text
        // goes through hsc(), and only afterwards the markers are turned into
        // the <strong> tags. The marker must be a real (non-empty) character:
        // with an empty marker the pattern below would contain the invalid
        // character class "[^]".
        $m = "\x01";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        $snippet = preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(join('... ', $snippets))
        );

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}
/**
 * Creates a snippet extract
 *
 * Reads the raw text of a page, looks for up to three occurrences of the
 * highlight phrases and cuts a window of surrounding context around each
 * one. The windows are joined with '... ' and every hit is wrapped in
 * <strong class="search_hit">.
 *
 * @author Andreas Gohr <*****@*****.**>
 *
 * @param string       $id        id of the page, passed to rawWiki()
 * @param string|array $highlight phrase or list of phrases to highlight
 * @return string the snippet with highlighted hits
 */
function ft_snippet($id, $highlight) {
    $text = rawWiki($id);
    $match = array();
    $snippets = array();
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);

    // build a regexp from the phrases to highlight
    // NOTE(review): the patterns below use '#' as delimiter - confirm that
    // preg_quote_cb() escapes '#' inside the search terms
    $re = join('|', array_map('preg_quote_cb', array_filter((array) $highlight)));

    for ($cnt = 3; $cnt--;) {
        if (!preg_match('#(' . $re . ')#iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
            break;
        }

        list($str, $idx) = $match[0];

        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text, 0, $idx));
        $utf8_len = utf8_strlen($str);

        // establish context, 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre = min($utf8_idx - $utf8_offset, 100);
        $post = min($len - $utf8_idx - $utf8_len, 100);

        if ($pre > 50 && $post > 50) {
            $pre = $post = 50;
        } elseif ($pre > 50) {
            $pre = min($pre, 100 - $post);
        } elseif ($post > 50) {
            $post = min($post, 100 - $pre);
        } elseif (count($snippets) === 0) {
            // first match and both sides are at most 50: the context is the
            // whole string, so use it and skip the complex snippet
            // calculations.
            // This shortcut must not be taken for later matches: there $pre
            // is measured from the previous context, so a small value says
            // nothing about the size of the whole text.
            $snippets = array($text);
            break;
        }

        // establish context start and end points, try to append to previous
        // context if possible
        $start = $utf8_idx - $pre;
        $append = $start < $end ? $end : false; // still the end of the previous context snippet
        $end = $utf8_idx + $utf8_len + $post;   // now set it to the end of this context

        if ($append) {
            $snippets[count($snippets) - 1] .= utf8_substr($text, $append, $end - $append);
        } else {
            $snippets[] = utf8_substr($text, $start, $end - $start);
        }

        // set $offset for next match attempt
        // the match length is deliberately left out to avoid splitting a
        // potential search success; this is an approximation as the search
        // pattern may match strings of varying length and it will fail if
        // the context snippet boundary breaks a matching string longer than
        // the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text, $utf8_idx, $post));
        $offset = utf8_correctIdx($text, $offset);
    }

    // Hits are first wrapped in a marker character, then the joined text goes
    // through hsc(), and only afterwards the markers are turned into the
    // <strong> tags. The marker must be a real (non-empty) character: with an
    // empty marker the pattern below would contain the invalid character
    // class "[^]".
    $m = "\x01";
    $snippets = preg_replace('#(' . $re . ')#iu', $m . '$1' . $m, $snippets);
    $snippet = preg_replace(
        '#' . $m . '([^' . $m . ']*?)' . $m . '#iu',
        '<strong class="search_hit">$1</strong>',
        hsc(join('... ', $snippets))
    );

    return $snippet;
}