/**
 * Reduce a string to the characters A-Z, a-z, 0-9, '.' and '-'.
 *
 * The text is passed through unaccent(), transliterated from UTF-8 to ASCII
 * with iconv(), every '&' is spelled out as "and", and each remaining
 * character outside the allowed set is replaced by '_'.
 *
 * @param string $str Text to convert; iconv() is told to read it as UTF-8.
 * @return string The converted text.
 */
function azname($str)
{
    $str = unaccent($str);

    // iconv() returns false when the conversion fails (e.g. malformed UTF-8).
    // Keep the unaccented text in that case instead of letting the false
    // value collapse the whole result into an empty string.
    $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $str);
    if ($ascii !== false) {
        $str = $ascii;
    }

    $str = str_replace('&', 'and', $str);

    // Anything that is not a letter, a digit, a dot or a hyphen becomes '_'.
    return preg_replace('#[^A-Za-z0-9\\.\\-]#', '_', $str);
}
/**
 * Find links related to this one by building a search query from the most
 * selective words of its title, tags and content.
 *
 * Each candidate word gets a score derived from the value returned by
 * sphinx_doc_hits() (presumably the number of indexed documents containing
 * the term - confirm against search.php). Words are sorted by ascending
 * score and the first ones are joined into the query passed to do_search().
 *
 * Side effects visible in this method: it overwrites $_REQUEST['s'] (for
 * published links), $_REQUEST['q'] and $_REQUEST['root_time'], sets
 * $this->old, and echoes an HTML comment listing the search terms.
 *
 * @param int $max Number of related links wanted; $max + 1 results are
 *                 requested from do_search() so this link itself can be
 *                 dropped. NOTE(review): the result is not truncated to $max
 *                 afterwards - confirm whether callers rely on that.
 * @return array Objects returned by Link::from_db(), or an empty array when
 *               $globals['sphinx_server'] is not set.
 */
function get_related($max = 10)
{
    global $globals, $db;
    $related = array();
    $phrases = 0;
    // Only works when a sphinx server is configured
    if (!$globals['sphinx_server']) {
        return $related;
    }
    // NOTE(review): plain `require`, so calling this method twice re-includes
    // search.php - confirm the file tolerates that.
    require mnminclude . 'search.php';
    // The highest link id is used below as the reference total for frequencies
    $maxid = $db->get_var("select max(link_id) from links");
    if ($this->status == 'published') {
        // Presumably a status filter read by do_search() - verify in search.php
        $_REQUEST['s'] = '! abuse discard autodiscard';
    }
    $words = array(); // term => score (lower sorts first)
    $freqs = array(); // term => hits / $maxid
    $hits = array(); // term => raw value from sphinx_doc_hits()
    $freq_min = 1; // lowest frequency seen so far, never stored below 0.0001
    // Filter title: decode HTML entities, replace short bracketed tokens
    // (1 to 6 word characters) with a space, then split on whitespace and punctuation
    $a = preg_split('/[\\s,\\.;:“”–\\"\'\\-\\(\\)\\[\\]«»<>\\/\\?¿¡!]+/u', preg_replace('/[\\[\\(] *\\w{1,6} *[\\)\\]] */', ' ', htmlspecialchars_decode($this->title, ENT_QUOTES)), -1, PREG_SPLIT_NO_EMPTY);
    $i = 0;
    $n = count($a);
    foreach ($a as $w) {
        $w = unaccent($w);
        $wlower = mb_strtolower($w);
        $len = mb_strlen($w);
        // Accept unseen words longer than 3 characters or written fully in
        // capitals, but not short numbers such as "12" or "3D"
        if (!isset($words[$wlower]) && ($len > 3 || preg_match('/^[A-Z]{2,}$/', $w)) && !preg_match('/^\\d{1,3}\\D{0,1}$/', $w)) {
            $h = sphinx_doc_hits($wlower);
            $hits[$wlower] = $h;
            // Terms with no hits, or with more than $maxid / 10 hits, do not
            // help the search. Note that this `continue` also skips the
            // $i++ at the end of the loop body.
            if ($h < 1 || $h > $maxid / 10) {
                continue;
            }
            // Store the frequency
            $freq = $h / $maxid;
            if (!isset($freqs[$wlower]) || $freqs[$wlower] > $freq) {
                $freqs[$wlower] = $freq;
            }
            if ($freq < $freq_min) {
                $freq_min = max(0.0001, $freq);
            }
            // Capitalised words get a coefficient that grows as the term gets rarer
            if (preg_match('/^[A-Z]/', $w) && $len > 2) {
                $coef = 2 * log10($maxid / $h);
            } else {
                $coef = 2;
            }
            // Increase the coefficient if the word also appears in the tags.
            // s{0,1} is a trick for plurals, until stemmed words are used.
            if (preg_match('/(^|[ ,])' . preg_quote($w) . 's{0,1}([ ,]|$)/ui', $this->tags)) {
                $coef *= 2;
                // Doubled again when $i marks the first or last position
                if ($i == 0 || $i == $n - 1) {
                    $coef *= 2;
                }
            }
            // A larger coefficient gives a smaller score, which sorts earlier
            $words[$wlower] = intval($h / $coef);
        }
        $i++;
    }
    // Filter tags: comma-separated; a tag containing whitespace is searched as a quoted phrase
    $a = preg_split('/,+/', $this->tags, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($a as $w) {
        $w = trim($w);
        $wlower = mb_strtolower(unaccent($w));
        $len = mb_strlen($w); // not used in this loop
        if (isset($words[$wlower])) {
            continue;
        }
        if (preg_match('/\\s/', $w)) {
            $wlower = "\"{$wlower}\"";
            $phrases++;
        }
        $h = sphinx_doc_hits($wlower);
        $hits[$wlower] = $h;
        // Same usefulness bounds as for title words
        if ($h < 1 || $h > $maxid / 10) {
            continue;
        }
        // Store the frequency
        $freq = $h / $maxid;
        if (!isset($freqs[$wlower]) || $freqs[$wlower] > $freq) {
            $freqs[$wlower] = $freq;
        }
        if ($freq < $freq_min) {
            $freq_min = max(0.0001, $freq);
        }
        $words[$wlower] = intval($h / 2);
    }
    // Filter content: URLs and short bracketed tokens are removed first; only
    // words starting with a capital followed by at least two ASCII letters are kept
    $a = preg_split('/[\\s,\\.;:“”–\\"\'\\-\\(\\)\\[\\]«»<>\\/\\?¿¡!]+/u', preg_replace('/https{0,1}:\\/\\/\\S+|[\\[\\(] *\\w{1,6} *[\\)\\]]/i', '', $this->sanitize($this->content)), -1, PREG_SPLIT_NO_EMPTY);
    foreach ($a as $w) {
        $wlower = mb_strtolower(unaccent($w));
        if (!preg_match('/^[A-Z][a-zA-Z]{2,}/', $w)) {
            continue;
        }
        $len = mb_strlen($w);
        if (!isset($words[$wlower]) && ($len > 2 || preg_match('/^[A-Z]{2,}$/', $w)) && !preg_match('/^\\d{1,3}\\D{0,1}$/', $w)) {
            $h = sphinx_doc_hits($wlower);
            $hits[$wlower] = $h;
            // Stricter upper bound than for title and tags: $maxid / 50.
            // Frequencies are not recorded for content words.
            if ($h < 1 || $h > $maxid / 50) {
                continue;
            }
            // Only terms with fewer than $maxid / 1000 hits get a coefficient above 1
            if (preg_match('/^[A-Z]/', $w) && $h < $maxid / 1000) {
                $coef = max(log10($maxid / $h) - 1, 1);
            } else {
                $coef = 1;
            }
            $words[$wlower] = intval($h / $coef);
        }
    }
    // Increase the score in proportion to the word's length, because longer
    // words tend to appear less often. Terms containing a space are left as they are.
    foreach ($words as $w => $v) {
        $len = mb_strlen($w);
        if ($len > 6 && !preg_match('/ /', $w)) {
            $words[$w] = $v * $len / 6;
        }
    }
    // Ascending sort: the lowest scores are consumed first
    asort($words);
    $i = 0;
    $text = '';
    foreach ($words as $w => $v) {
        // Once more than four terms are accepted and a rare term exists
        // ($freq_min < 0.005), skip words whose frequency is unknown, above
        // 0.01, or more than 100 times the minimum frequency.
        // Debug aid: echo "<!-- $w: ".$freqs[$w]." coef: ".$words[$w]."-->\n";
        if ($i > 4 && $freq_min < 0.005 && strlen($w) > 3 && (empty($freqs[$w]) || $freqs[$w] > 0.01 || $freqs[$w] > $freq_min * 100)) {
            continue;
        }
        $i++;
        // Stop at 14 terms, or after 8 when the next score exceeds $maxid / 2000
        if ($i > 14 or $i > 8 && $v > $maxid / 2000) {
            break;
        }
        $text .= "{$w} ";
    }
    echo "\n<!-- Search terms: {$text} Phrases: {$phrases} -->\n";
    $_REQUEST['q'] = $text;
    // Center the search dates around this link's date
    $_REQUEST['root_time'] = $this->date;
    // Flag the link as old when it is more than 5 days (86400 s each) before
    // $globals['now']; assumes both values are Unix timestamps - TODO confirm
    if ($globals['now'] - $this->date > 86400 * 5) {
        $this->old = true;
    } else {
        $this->old = false;
    }
    $response = do_search(false, 0, $max + 1, false);
    if ($response && isset($response['ids'])) {
        foreach ($response['ids'] as $id) {
            // Never list the link as related to itself
            if ($id == $this->id) {
                continue;
            }
            $l = Link::from_db($id);
            if (!$l) {
                continue;
            }
            if (empty($l->permalink)) {
                $l->permalink = $l->get_permalink();
            }
            $related[] = $l;
        }
    }
    return $related;
}
} } } } }
    // Tail of a definition that begins before this section; reproduced unchanged.
    if (!$utf8) {
        $str = utf8_encode($str);
    }
    return str_replace(array_keys($transliteration), array_values($transliteration), $str);
}

// Step 1 - Make Dico: read the source word list, normalise it, pass each
// word through unaccent(), drop duplicates and write the result to dico.txt.
$dico = file_get_contents(__DIR__ . '/liste.de.mots.francais.frgut.txt');
// Turn platform line endings and stray carriage returns into "\n"
$dico = str_replace([PHP_EOL, "\r"], "\n", $dico);
// Remove tabs and spaces
$dico = str_replace(["\t", " "], '', $dico);
$dico = explode("\n", $dico);
// NOTE(review): the loop stops one short of the last element, presumably
// because the last entry is the empty string after a trailing newline - confirm.
$c = count($dico) - 1;
for ($i = 0; $i < $c; $i++) {
    $dico[$i] = unaccent($dico[$i]);
}
$dico = array_unique($dico);
file_put_contents(__DIR__ . '/dico.txt', implode("\n", $dico));

// Step 2 - Make Dico-Light: keep only the entries of dico.txt whose
// strlen() is between 3 and 7 and write them to dico-light.txt.
$dico = explode("\n", file_get_contents(__DIR__ . '/dico.txt'));
// NOTE(review): as in step 1 the last element is never examined, so it is
// written to dico-light.txt whatever its length - confirm this is intended.
$c = count($dico) - 1;
for ($i = 0; $i < $c; $i++) {
    $l = strlen($dico[$i]);
    if ($l < 3 || $l > 7) {
        unset($dico[$i]);
    }
}
file_put_contents(__DIR__ . '/dico-light.txt', implode("\n", $dico));
/**
 * Build a lowercase slug from a string.
 *
 * The string is passed through unaccent(); every run of characters other
 * than 0-9, a-z (case-insensitive) and the characters listed in $extra is
 * replaced by $slug; leading and trailing $slug characters are trimmed and
 * the result is lowercased.
 *
 * @param string      $string Text to convert.
 * @param string      $slug   Separator inserted in place of disallowed runs.
 * @param string|null $extra  Additional characters to keep as they are.
 * @return string The slug.
 */
function slug($string, $slug = '-', $extra = null)
{
    // preg_quote() expects a string: passing the default null is deprecated
    // since PHP 8.1, so coerce it to '' explicitly.
    $allowed = preg_quote((string) $extra, '~');

    // preg_replace() returns null on a PCRE error; treat that as an empty
    // string rather than handing null to trim().
    $collapsed = preg_replace('~[^0-9a-z' . $allowed . ']+~i', $slug, unaccent($string)) ?? '';

    return strtolower(trim($collapsed, $slug));
}