Exemplo n.º 1
0
/**
 * Reduce a string to a name made only of ASCII letters, digits, dots and hyphens.
 *
 * The text is passed through unaccent() (defined elsewhere), transliterated
 * from UTF-8 to ASCII with iconv(), every "&" is spelled out as "and", and
 * each remaining character outside A-Z a-z 0-9 . - becomes an underscore.
 *
 * @param string $str Text to convert; iconv() treats it as UTF-8.
 * @return string
 */
function azname($str)
{
    $str = unaccent($str);
    // iconv() returns false when it cannot convert the input (for example on
    // an invalid UTF-8 sequence). Keep the un-transliterated text in that case
    // so the offending bytes are turned into underscores below, instead of the
    // whole name collapsing to an empty string.
    $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $str);
    if ($ascii !== false) {
        $str = $ascii;
    }
    $str = str_replace('&', 'and', $str);
    return preg_replace('#[^A-Za-z0-9\\.\\-]#', '_', $str);
}
Exemplo n.º 2
0
 /**
  * Find links related to this one by building a search query from its
  * title, tags and content.
  *
  * Each candidate word gets a weight derived from its hit count
  * (sphinx_doc_hits()) relative to the highest link_id in the links table.
  * The lowest-weighted words are joined into a query string stored in
  * $_REQUEST['q'] before do_search() is called.
  *
  * Side effects visible in this method: overwrites $_REQUEST['q'] and
  * $_REQUEST['root_time'] (and $_REQUEST['s'] when the link is published),
  * sets $this->old, and echoes an HTML comment listing the chosen terms.
  *
  * @param int $max Forwarded to do_search() as $max + 1 -- presumably the
  *                 result limit plus one slot for this link itself; confirm
  *                 against search.php.
  * @return array Objects returned by Link::from_db(), excluding this link;
  *               empty when $globals['sphinx_server'] is not set or the
  *               search yields no ids.
  */
 function get_related($max = 10)
 {
     global $globals, $db;
     $related = array();
     $phrases = 0;
     // Only work with sphinx
     if (!$globals['sphinx_server']) {
         return $related;
     }
     // NOTE(review): plain require re-includes the file on every call; verify
     // that search.php tolerates being loaded more than once per request.
     require mnminclude . 'search.php';
     // The highest link id is used below as the denominator for word frequencies.
     $maxid = $db->get_var("select max(link_id) from links");
     if ($this->status == 'published') {
         // Presumably a status filter read by do_search() -- confirm in search.php.
         $_REQUEST['s'] = '! abuse discard autodiscard';
     }
     // $words: candidate term => weight (lower sorts first)
     // $freqs: term => hits / $maxid, only filled for title and tag terms
     // $hits:  term => raw hit count
     $words = array();
     $freqs = array();
     $hits = array();
     $freq_min = 1;
     // Filter title: decode HTML entities, drop short bracketed/parenthesised
     // tokens (1 to 6 word characters), then split on whitespace and punctuation.
     $a = preg_split('/[\\s,\\.;:“”–\\"\'\\-\\(\\)\\[\\]«»<>\\/\\?¿¡!]+/u', preg_replace('/[\\[\\(] *\\w{1,6} *[\\)\\]] */', ' ', htmlspecialchars_decode($this->title, ENT_QUOTES)), -1, PREG_SPLIT_NO_EMPTY);
     $i = 0;
     $n = count($a);
     foreach ($a as $w) {
         // Capture this word's position and advance the counter up front.
         // Incrementing at the end of the body was skipped by the `continue`
         // below, which made the first/last-word test use a stale index.
         $pos = $i++;
         $w = unaccent($w);
         $wlower = mb_strtolower($w);
         $len = mb_strlen($w);
         // Consider each word once; require more than 3 characters or an
         // all-uppercase ASCII word, and reject 1-3 digits with an optional suffix.
         if (!isset($words[$wlower]) && ($len > 3 || preg_match('/^[A-Z]{2,}$/', $w)) && !preg_match('/^\\d{1,3}\\D{0,1}$/', $w)) {
             $h = sphinx_doc_hits($wlower);
             $hits[$wlower] = $h;
             // Words with no hits do not help the search; neither do words
             // whose hit count exceeds a tenth of $maxid.
             if ($h < 1 || $h > $maxid / 10) {
                 continue;
             }
             // Store the frequency
             $freq = $h / $maxid;
             if (!isset($freqs[$wlower]) || $freqs[$wlower] > $freq) {
                 $freqs[$wlower] = $freq;
             }
             // Track the rarest frequency seen, floored at 0.0001.
             if ($freq < $freq_min) {
                 $freq_min = max(0.0001, $freq);
             }
             // Words starting with an ASCII capital get a coefficient that
             // grows as the word gets rarer; everything else uses 2.
             if (preg_match('/^[A-Z]/', $w) && $len > 2) {
                 $coef = 2 * log10($maxid / $h);
             } else {
                 $coef = 2;
             }
             // Increase coefficient if a name appears also in tags
             // s{0,1} is a trick for plurals, until we use stemmed words
             // The delimiter is passed to preg_quote() so a "/" in the word
             // could never terminate the pattern early.
             if (preg_match('/(^|[ ,])' . preg_quote($w, '/') . 's{0,1}([ ,]|$)/ui', $this->tags)) {
                 $coef *= 2;
                 // Boost again when it is the first or last word of the title.
                 if ($pos == 0 || $pos == $n - 1) {
                     $coef *= 2;
                 }
             }
             // A larger coefficient yields a smaller weight, which sorts earlier.
             $words[$wlower] = intval($h / $coef);
         }
     }
     // Filter tags
     $a = preg_split('/,+/', $this->tags, -1, PREG_SPLIT_NO_EMPTY);
     foreach ($a as $w) {
         $w = trim($w);
         $wlower = mb_strtolower(unaccent($w));
         $len = mb_strlen($w);
         // Already collected from the title.
         if (isset($words[$wlower])) {
             continue;
         }
         // Tags containing whitespace are wrapped in double quotes and counted as phrases.
         if (preg_match('/\\s/', $w)) {
             $wlower = "\"{$wlower}\"";
             $phrases++;
         }
         $h = sphinx_doc_hits($wlower);
         $hits[$wlower] = $h;
         // Same usefulness window as for title words.
         if ($h < 1 || $h > $maxid / 10) {
             continue;
         }
         // Store the frequency
         $freq = $h / $maxid;
         if (!isset($freqs[$wlower]) || $freqs[$wlower] > $freq) {
             $freqs[$wlower] = $freq;
         }
         if ($freq < $freq_min) {
             $freq_min = max(0.0001, $freq);
         }
         $words[$wlower] = intval($h / 2);
     }
     // Filter content: remove URLs and short bracketed tokens, split like the
     // title, and keep only words that begin with a capital letter.
     $a = preg_split('/[\\s,\\.;:“”–\\"\'\\-\\(\\)\\[\\]«»<>\\/\\?¿¡!]+/u', preg_replace('/https{0,1}:\\/\\/\\S+|[\\[\\(] *\\w{1,6} *[\\)\\]]/i', '', $this->sanitize($this->content)), -1, PREG_SPLIT_NO_EMPTY);
     foreach ($a as $w) {
         $wlower = mb_strtolower(unaccent($w));
         // Must start with an ASCII capital followed by at least two ASCII letters.
         if (!preg_match('/^[A-Z][a-zA-Z]{2,}/', $w)) {
             continue;
         }
         $len = mb_strlen($w);
         if (!isset($words[$wlower]) && ($len > 2 || preg_match('/^[A-Z]{2,}$/', $w)) && !preg_match('/^\\d{1,3}\\D{0,1}$/', $w)) {
             $h = sphinx_doc_hits($wlower);
             $hits[$wlower] = $h;
             // Content words use a stricter upper bound ($maxid / 50) than title words and tags.
             if ($h < 1 || $h > $maxid / 50) {
                 continue;
             }
             // Rare capitalised words (fewer hits than $maxid / 1000) get a
             // log-scaled coefficient of at least 1.
             if (preg_match('/^[A-Z]/', $w) && $h < $maxid / 1000) {
                 $coef = max(log10($maxid / $h) - 1, 1);
             } else {
                 $coef = 1;
             }
             // Note: no entry is written to $freqs for content words.
             $words[$wlower] = intval($h / $coef);
         }
     }
     // Increase "hits" in proportion to the word's length,
     // because longer words tend to appear less often
     foreach ($words as $w => $v) {
         $len = mb_strlen($w);
         // Terms containing a space (the quoted tag phrases) are left untouched.
         if ($len > 6 && !preg_match('/ /', $w)) {
             $words[$w] = $v * $len / 6;
         }
     }
     // Ascending sort: the lowest weights come first.
     asort($words);
     $i = 0;
     $text = '';
     foreach ($words as $w => $v) {
         // Filter words if we got good candidates
         // echo "<!-- $w: ".$freqs[$w]." coef: ".$words[$w]."-->\n";
         // Once more than 4 terms are chosen and a rare term exists, skip terms
         // longer than 3 bytes that have no stored frequency, a frequency above
         // 0.01, or one more than 100 times the rarest frequency.
         if ($i > 4 && $freq_min < 0.005 && strlen($w) > 3 && (empty($freqs[$w]) || $freqs[$w] > 0.01 || $freqs[$w] > $freq_min * 100)) {
             continue;
         }
         $i++;
         // Stop after 14 terms, or after 8 once weights exceed $maxid / 2000
         // ("or" binds looser than "&&", so the grouping is A or (B && C)).
         if ($i > 14 or $i > 8 && $v > $maxid / 2000) {
             break;
         }
         $text .= "{$w} ";
     }
     echo "\n<!-- Search terms: {$text} Phrases: {$phrases} -->\n";
     $_REQUEST['q'] = $text;
     // Center the date around the link's date
     $_REQUEST['root_time'] = $this->date;
     // Flag the link as old when its date is more than 86400 * 5 behind
     // $globals['now'] (five days, assuming both are Unix timestamps -- TODO confirm).
     if ($globals['now'] - $this->date > 86400 * 5) {
         $this->old = true;
     } else {
         $this->old = false;
     }
     $response = do_search(false, 0, $max + 1, false);
     if ($response && isset($response['ids'])) {
         foreach ($response['ids'] as $id) {
             // Never list the link as related to itself.
             if ($id == $this->id) {
                 continue;
             }
             $l = Link::from_db($id);
             if (!$l) {
                 continue;
             }
             // Fill in the permalink when from_db() left it empty.
             if (empty($l->permalink)) {
                 $l->permalink = $l->get_permalink();
             }
             $related[] = $l;
         }
     }
     return $related;
 }
Exemplo n.º 3
0
                    }
                }
            }
        }
    }
    if (!$utf8) {
        $str = utf8_encode($str);
    }
    return str_replace(array_keys($transliteration), array_values($transliteration), $str);
}
//Step 1 - Make Dico
// Read the raw word list, normalise line endings to "\n", drop tabs and
// spaces, run every word through unaccent() and write the de-duplicated
// result to dico.txt.
$dico = file_get_contents(__DIR__ . '/liste.de.mots.francais.frgut.txt');
$dico = str_replace([PHP_EOL, "\r"], "\n", $dico);
$dico = str_replace(["\t", " "], '', $dico);
$dico = explode("\n", $dico);
// Visit every entry. A counter bounded by count() - 1 left the final word
// unprocessed whenever the file did not end with a newline.
foreach ($dico as $i => $word) {
    $dico[$i] = unaccent($word);
}
$dico = array_unique($dico);
file_put_contents(__DIR__ . '/dico.txt', implode("\n", $dico));
//Step 2 - Make Dico-Light
// Keep only the words whose length is between 3 and 7 bytes, inclusive.
$dico = explode("\n", file_get_contents(__DIR__ . '/dico.txt'));
foreach ($dico as $i => $word) {
    $l = strlen($word);
    if ($l < 3 || $l > 7) {
        // Removing by key is safe here: foreach iterates over a copy of the array.
        unset($dico[$i]);
    }
}
file_put_contents(__DIR__ . '/dico-light.txt', implode("\n", $dico));
Exemplo n.º 4
0
/**
 * Build a lowercase slug from a string.
 *
 * The input is passed through unaccent() (defined elsewhere); every run of
 * characters other than ASCII letters, digits and the characters listed in
 * $extra is replaced by $slug; leading and trailing separator characters are
 * trimmed; and the result is lowercased.
 *
 * @param string      $string Text to convert.
 * @param string      $slug   Replacement for each run of disallowed characters;
 *                            also used as the character list for trim().
 * @param string|null $extra  Additional characters to keep as-is, or null for none.
 * @return string
 */
function slug($string, $slug = '-', $extra = null)
{
    // preg_quote() expects a string: passing the default null is deprecated
    // since PHP 8.1, so normalise it to an empty string first.
    $allowed = preg_quote((string) $extra, '~');
    $collapsed = preg_replace('~[^0-9a-z' . $allowed . ']+~i', $slug, unaccent($string));
    return strtolower(trim($collapsed, $slug));
}