Esempio n. 1
0
// Collect per-tag statistics for $link->tags into $results.
// NOTE(review): this excerpt is truncated; the foreach below and its final
// else branch are closed outside the visible lines.
$words = array();
// Raw tag string exactly as stored on the link
$results['str'] = $link->tags;
// Highest link_id in the links table; used below as the document count
// (presumably an approximation of the number of indexed documents - confirm)
$results['docs'] = $db->get_var("select max(link_id) from links");
$results['phrases'] = 0;
$results['in_title'] = 0;
// Start min at 100 and max at 0 so the first tag's frequency replaces both
$results['min_freq'] = 100;
$results['max_freq'] = 0;
$results['highs'] = 0;
// Split the tag string on commas, dropping empty entries
$a = preg_split('/,+/', $link->tags, -1, PREG_SPLIT_NO_EMPTY);
$results['tags'] = count($a);
foreach ($a as $w) {
    $w = trim($w);
    $r = array();
    $r['w'] = $w;
    $r['len'] = mb_strlen($w);
    // Hit count reported by sphinx_doc_hits(), capped at the document count
    $r['hits'] = $h = min($results['docs'], intval(sphinx_doc_hits($w)));
    // Frequency as a percentage of the document count, one decimal place
    // NOTE(review): divides by $results['docs']; this would be a division by
    // zero if the links table is empty - confirm that cannot happen here
    $r['freq'] = round(100 * $h / $results['docs'], 1);
    // Track the lowest and highest frequency seen among the tags
    if ($r['freq'] < $results['min_freq']) {
        $results['min_freq'] = $r['freq'];
    }
    if ($r['freq'] > $results['max_freq']) {
        $results['max_freq'] = $r['freq'];
    }
    // Count tags whose frequency is above 1%
    if ($r['freq'] > 1) {
        $results['highs'] += 1;
    }
    // A tag containing a space is counted (and flagged) as a phrase
    if (preg_match('/ /', $w)) {
        $results['phrases'] += 1;
        $r['phrase'] = true;
    } else {
        $r['phrase'] = false;
Esempio n. 2
0
 /**
  * Find links related to this one.
  *
  * Candidate search terms are taken from the title, the tags and the
  * capitalised words of the content. Each term gets a score derived from its
  * Sphinx hit count (lower sorts first); the best-scoring terms are joined
  * into a query that is handed to do_search().
  *
  * Side effects visible in this method: it writes $_REQUEST['q'] and
  * $_REQUEST['root_time'] (and $_REQUEST['s'] for published links), sets
  * $this->old, and echoes an HTML comment listing the chosen terms.
  *
  * @param int $max Number of related links wanted; $max + 1 results are
  *                 requested from do_search(), presumably because this link
  *                 itself may be among them and is skipped.
  * @return array Link objects loaded through Link::from_db(); empty when
  *               $globals['sphinx_server'] is not set or nothing is found.
  */
 function get_related($max = 10)
 {
     global $globals, $db;
     $related = array();
     $phrases = 0;
     // Only work with sphinx
     if (!$globals['sphinx_server']) {
         return $related;
     }
     // require_once instead of require: a plain require loads the file again
     // on every call, which is fatal if it declares functions (do_search and
     // sphinx_doc_hits are presumably declared there).
     require_once mnminclude . 'search.php';
     // Highest link id, used throughout as the document count
     $maxid = $db->get_var("select max(link_id) from links");
     if ($this->status == 'published') {
         $_REQUEST['s'] = '! abuse discard autodiscard';
     }
     $words = array();
     $freqs = array();
     $freq_min = 1;
     // Word separators shared by the title and content tokenizers
     $separators = '/[\\s,\\.;:“”–\\"\'\\-\\(\\)\\[\\]«»<>\\/\\?¿¡!]+/u';

     // Filter title
     $a = preg_split($separators, preg_replace('/[\\[\\(] *\\w{1,6} *[\\)\\]] */', ' ', htmlspecialchars_decode($this->title, ENT_QUOTES)), -1, PREG_SPLIT_NO_EMPTY);
     $n = count($a);
     // preg_split() returns a 0-based list, so the key is the real position of
     // the word in the title. A manually incremented counter would be skipped
     // by the "continue" below and drift out of sync.
     foreach ($a as $i => $w) {
         $w = unaccent($w);
         $wlower = mb_strtolower($w);
         $len = mb_strlen($w);
         // Keep new words that are longer than 3 chars or all-caps acronyms,
         // and that are not short numbers such as "12" or "100%"
         if (isset($words[$wlower]) || !($len > 3 || preg_match('/^[A-Z]{2,}$/', $w)) || preg_match('/^\\d{1,3}\\D{0,1}$/', $w)) {
             continue;
         }
         $h = sphinx_doc_hits($wlower);
         // Terms without hits do not help the search, and neither do terms
         // found in more than 10% of the documents
         if ($h < 1 || $h > $maxid / 10) {
             continue;
         }
         // Store the frequency
         $freq = $h / $maxid;
         if (!isset($freqs[$wlower]) || $freqs[$wlower] > $freq) {
             $freqs[$wlower] = $freq;
         }
         if ($freq < $freq_min) {
             $freq_min = max(0.0001, $freq);
         }
         // Capitalised words get a coefficient that grows as they get rarer
         if (preg_match('/^[A-Z]/', $w) && $len > 2) {
             $coef = 2 * log10($maxid / $h);
         } else {
             $coef = 2;
         }
         // Increase coefficient if a name appears also in tags
         // s{0,1} is a trick for plurals, until we use stemmed words
         if (preg_match('/(^|[ ,])' . preg_quote($w, '/') . 's{0,1}([ ,]|$)/ui', $this->tags)) {
             $coef *= 2;
             // Extra weight when it is the first or last word of the title
             if ($i == 0 || $i == $n - 1) {
                 $coef *= 2;
             }
         }
         // A higher coefficient gives a lower score, which sorts earlier
         $words[$wlower] = intval($h / $coef);
     }

     // Filter tags
     $a = preg_split('/,+/', $this->tags, -1, PREG_SPLIT_NO_EMPTY);
     foreach ($a as $w) {
         $w = trim($w);
         $wlower = mb_strtolower(unaccent($w));
         if (isset($words[$wlower])) {
             continue;
         }
         // Multi-word tags are searched as quoted phrases
         if (preg_match('/\\s/', $w)) {
             $wlower = "\"{$wlower}\"";
             $phrases++;
         }
         $h = sphinx_doc_hits($wlower);
         // Same hit-count limits as for title words
         if ($h < 1 || $h > $maxid / 10) {
             continue;
         }
         // Store the frequency
         $freq = $h / $maxid;
         if (!isset($freqs[$wlower]) || $freqs[$wlower] > $freq) {
             $freqs[$wlower] = $freq;
         }
         if ($freq < $freq_min) {
             $freq_min = max(0.0001, $freq);
         }
         $words[$wlower] = intval($h / 2);
     }

     // Filter content: URLs and short bracketed fragments are removed first
     $a = preg_split($separators, preg_replace('/https{0,1}:\\/\\/\\S+|[\\[\\(] *\\w{1,6} *[\\)\\]]/i', '', $this->sanitize($this->content)), -1, PREG_SPLIT_NO_EMPTY);
     foreach ($a as $w) {
         // Only words starting with a capital followed by at least two ASCII
         // letters are considered. A word that passes this test is already
         // 3+ chars long, capitalised and non-numeric, so no further length,
         // number or capital checks are needed below. Testing it first also
         // avoids normalising words that will be rejected anyway.
         if (!preg_match('/^[A-Z][a-zA-Z]{2,}/', $w)) {
             continue;
         }
         $wlower = mb_strtolower(unaccent($w));
         if (isset($words[$wlower])) {
             continue;
         }
         $h = sphinx_doc_hits($wlower);
         // Content words must be rarer than title words: at most 2% of documents
         if ($h < 1 || $h > $maxid / 50) {
             continue;
         }
         // Favour words found in fewer than 0.1% of the documents
         if ($h < $maxid / 1000) {
             $coef = max(log10($maxid / $h) - 1, 1);
         } else {
             $coef = 1;
         }
         $words[$wlower] = intval($h / $coef);
     }

     // Increase "hits" proportionally to the word's length,
     // because longer words tend to appear less often
     foreach ($words as $w => $v) {
         $len = mb_strlen($w);
         if ($len > 6 && !preg_match('/ /', $w)) {
             $words[$w] = $v * $len / 6;
         }
     }
     // Lowest score first
     asort($words);
     $i = 0;
     $text = '';
     foreach ($words as $w => $v) {
         // Filter words if we got good candidates: once 5 terms are selected
         // and a rare term exists, drop longer words that have no stored
         // frequency or are comparatively frequent.
         // mb_strlen keeps the length test in characters, like the rest of
         // this method, instead of bytes.
         if ($i > 4 && $freq_min < 0.005 && mb_strlen($w) > 3 && (empty($freqs[$w]) || $freqs[$w] > 0.01 || $freqs[$w] > $freq_min * 100)) {
             continue;
         }
         $i++;
         // Never more than 14 terms; stop at 8 when the scores get high
         if ($i > 14 || ($i > 8 && $v > $maxid / 2000)) {
             break;
         }
         $text .= "{$w} ";
     }
     // Debug aid. The terms come from user-editable text (tags are only split
     // on commas), so escape them to make sure "-->" cannot close the comment.
     echo "\n<!-- Search terms: " . htmlspecialchars($text, ENT_NOQUOTES) . " Phrases: {$phrases} -->\n";
     $_REQUEST['q'] = $text;
     // Center the date about the link's date
     $_REQUEST['root_time'] = $this->date;
     // Links older than five days are flagged as old
     $this->old = ($globals['now'] - $this->date) > 86400 * 5;
     $response = do_search(false, 0, $max + 1, false);
     if ($response && isset($response['ids'])) {
         foreach ($response['ids'] as $id) {
             // Do not list the link as related to itself
             if ($id == $this->id) {
                 continue;
             }
             $l = Link::from_db($id);
             if (!$l) {
                 continue;
             }
             if (empty($l->permalink)) {
                 $l->permalink = $l->get_permalink();
             }
             $related[] = $l;
         }
     }
     return $related;
 }