Exemplo n.º 1
0
 /**
  * Checks utf8_correctIdx() with byte offsets that lie outside the string.
  *
  * Each case is a tuple of: input string, byte offset, third argument
  * handed to utf8_correctIdx(), expected return value. A negative offset is
  * expected to yield 0 and an offset of 128 is expected to yield 29, for
  * both values of the third argument.
  */
 function test_bounds()
 {
     $subject = 'aaживπά우리をあöä';
     // bounds checking: one offset before the start, one far past the end
     $cases = array(
         array($subject, -2, false, 0),
         array($subject, 128, false, 29),
         array($subject, -2, true, 0),
         array($subject, 128, true, 29),
     );
     foreach ($cases as $case) {
         $actual = utf8_correctIdx($case[0], $case[1], $case[2]);
         $this->assertEqual($actual, $case[3]);
     }
 }
Exemplo n.º 2
0
/**
 * Creates a snippet extract
 *
 * Loads the raw wiki text of a page, searches it for the highlight phrases
 * (at most four search rounds), cuts a window of surrounding context around
 * each hit and joins the windows with '... '. The joined text is escaped
 * with hsc() and every hit is wrapped in <strong class="search_hit">.
 *
 * The work is wrapped in a FULLTEXT_SNIPPET_CREATE event. The event data
 * carries the page id, references to the text and the highlight phrases and
 * a 'snippet' entry (initially ''). The default processing below only runs
 * when advise_before() returns a true value.
 *
 * @author Andreas Gohr <*****@*****.**>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string       $id        page id handed to rawWiki()
 * @param string|array $highlight phrase(s) to highlight; cast to an array,
 *                                empty entries are dropped
 * @return string the 'snippet' entry of the event data after the event ran
 */
function ft_snippet($id, $highlight)
{
    $text = rawWiki($id);
    // remove soft-hyphens (U+00AD); written as escaped UTF-8 bytes so the
    // invisible character cannot get lost when the file is edited or copied
    $text = str_replace("\xC2\xAD", '', $text);
    $evdata = array('id' => $id, 'text' => &$text, 'highlight' => &$highlight, 'snippet' => '');
    $evt = new Doku_Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($evt->advise_before()) {
        $match = array();
        $snippets = array();
        $utf8_offset = $offset = $end = 0;
        $len = utf8_strlen($text);
        // build a regexp from the phrases to highlight
        $re1 = '(' . join('|', array_map('ft_snippet_re_preprocess', array_map('preg_quote_cb', array_filter((array) $highlight)))) . ')';
        // two resp. three hits close to each other; the negative lookaheads
        // keep a later group from matching the same text as an earlier one
        $re2 = "{$re1}.{0,75}(?!\\1){$re1}";
        $re3 = "{$re1}.{0,45}(?!\\1){$re1}.{0,45}(?!\\1)(?!\\2){$re1}";
        for ($cnt = 4; $cnt--;) {
            // prefer the densest pattern: three hits, then two, then one;
            // stop searching as soon as none of them matches any more
            if (
                !preg_match('/' . $re3 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset) &&
                !preg_match('/' . $re2 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset) &&
                !preg_match('/' . $re1 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
            ) {
                break;
            }
            list($str, $idx) = $match[0];
            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = utf8_strlen(substr($text, 0, $idx));
            $utf8_len = utf8_strlen($str);
            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);
            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } else {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = array($text);
                break;
            }
            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            // $end is still the end of the previous context snippet here
            $append = $start < $end ? $end : false;
            // now set it to the end of this context
            $end = $utf8_idx + $utf8_len + $post;
            if ($append) {
                $snippets[count($snippets) - 1] .= utf8_substr($text, $append, $end - $append);
            } else {
                $snippets[] = utf8_substr($text, $start, $end - $start);
            }
            // set $offset for next match attempt
            //   subtract strlen to avoid splitting a potential search success,
            //   this is an approximation as the search pattern may match strings
            //   of varying length and it will fail if the context snippet
            //   boundary breaks a matching string longer than the current match
            $utf8_offset = $utf8_idx + $post;
            $offset = $idx + strlen(utf8_substr($text, $utf8_idx, $post));
            $offset = utf8_correctIdx($text, $offset);
        }
        // temporary marker put around every hit so the text can be escaped
        // by hsc() before the <strong> tags are inserted; it has to be a
        // non-empty single character because it is used inside a negated
        // character class below (an empty marker yields an invalid pattern)
        $m = "\x01";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        $snippet = preg_replace('/' . $m . '([^' . $m . ']*?)' . $m . '/iu', '<strong class="search_hit">$1</strong>', hsc(join('... ', $snippets)));
        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);
    return $evdata['snippet'];
}
Exemplo n.º 3
0
/**
 * Creates a snippet extract
 *
 * Loads the raw wiki text of a page, searches it for the highlight phrases
 * (at most three search rounds), cuts a window of surrounding context
 * around each hit and joins the windows with '... '. The joined text is
 * escaped with hsc() and every hit is wrapped in
 * <strong class="search_hit">.
 *
 * @author Andreas Gohr <*****@*****.**>
 *
 * @param string       $id        page id handed to rawWiki()
 * @param string|array $highlight phrase(s) to highlight; cast to an array,
 *                                empty entries are dropped
 * @return string the escaped snippet with the hits marked up
 */
function ft_snippet($id, $highlight)
{
    $text = rawWiki($id);
    $match = array();
    $snippets = array();
    $utf8_offset = $offset = $end = 0;
    $len = utf8_strlen($text);
    // build a regexp from the phrases to highlight
    // NOTE(review): the patterns below are delimited with '#', so this
    // relies on preg_quote_cb() escaping '#' inside the phrases - confirm
    $re = join('|', array_map('preg_quote_cb', array_filter((array) $highlight)));
    for ($cnt = 3; $cnt--;) {
        if (!preg_match('#(' . $re . ')#iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
            break;
        }
        list($str, $idx) = $match[0];
        // convert $idx (a byte offset) into a utf8 character offset
        $utf8_idx = utf8_strlen(substr($text, 0, $idx));
        $utf8_len = utf8_strlen($str);
        // establish context, 100 characters surrounding the match string
        // first look to see if we can go 100 either side,
        // then drop to 50 adding any excess if the other side can't go to 50,
        $pre = min($utf8_idx - $utf8_offset, 100);
        $post = min($len - $utf8_idx - $utf8_len, 100);
        if ($pre > 50 && $post > 50) {
            $pre = $post = 50;
        } elseif ($pre > 50) {
            $pre = min($pre, 100 - $post);
        } elseif ($post > 50) {
            $post = min($post, 100 - $pre);
        } else {
            // both are less than 50, means the context is the whole string
            // make it so and break out of this loop - there is no need for the
            // complex snippet calculations
            $snippets = array($text);
            break;
        }
        // establish context start and end points, try to append to previous
        // context if possible
        $start = $utf8_idx - $pre;
        // $end is still the end of the previous context snippet here
        $append = $start < $end ? $end : false;
        // now set it to the end of this context
        $end = $utf8_idx + $utf8_len + $post;
        if ($append) {
            $snippets[count($snippets) - 1] .= utf8_substr($text, $append, $end - $append);
        } else {
            $snippets[] = utf8_substr($text, $start, $end - $start);
        }
        // set $offset for next match attempt
        //   subtract strlen to avoid splitting a potential search success,
        //   this is an approximation as the search pattern may match strings
        //   of varying length and it will fail if the context snippet
        //   boundary breaks a matching string longer than the current match
        $utf8_offset = $utf8_idx + $post;
        $offset = $idx + strlen(utf8_substr($text, $utf8_idx, $post));
        $offset = utf8_correctIdx($text, $offset);
    }
    // temporary marker put around every hit so the text can be escaped by
    // hsc() before the <strong> tags are inserted; it has to be a non-empty
    // single character because it is used inside a negated character class
    // below (an empty marker yields an invalid pattern)
    $m = "\x01";
    $snippets = preg_replace('#(' . $re . ')#iu', $m . '$1' . $m, $snippets);
    $snippet = preg_replace('#' . $m . '([^' . $m . ']*?)' . $m . '#iu', '<strong class="search_hit">$1</strong>', hsc(join('... ', $snippets)));
    return $snippet;
}