/**
 * Recursively walks a directory tree and hands every HTML file to indexer().
 *
 * Sub-directories are explored depth-first. A file is indexed when its own
 * name ends in ".htm" or ".html" (case-insensitive).
 *
 * @param string $path         Directory to explore (no trailing slash expected).
 * @param mixed  $tab_mot_vide Passed through unchanged to indexer(); presumably
 *                             the stop-word table - confirm against indexer().
 * @return void
 */
function explorerDir($path, $tab_mot_vide)
{
    $folder = opendir($path);
    if ($folder === false) {
        // opendir() has already raised a warning; there is nothing to walk.
        return;
    }

    // Compare against false explicitly: an entry literally named "0" is falsy
    // and would otherwise terminate the loop before the directory is exhausted.
    while (($entree = readdir($folder)) !== false) {
        // Skip the current/parent directory pseudo-entries.
        if ($entree === "." || $entree === "..") {
            continue;
        }

        $path_source = $path . "/" . $entree;

        if (is_dir($path_source)) {
            // Descend into the sub-directory.
            explorerDir($path_source, $tab_mot_vide);
            continue;
        }

        // Test the file name only, and only its suffix, so that ".htm"
        // appearing in a parent directory name does not make every file
        // below it look like an HTML document.
        if (preg_match('/\.html?$/i', $entree) === 1) {
            indexer($path_source, $tab_mot_vide);
        }
    }

    closedir($folder);
}
/**
 * Builds an extractive summary of a corpus document.
 *
 * The text is split into sentences on periods, each sentence is scored by the
 * sum of tf * idf over its non-stop-word tokens, and the highest scoring
 * sentences are returned in their original order.
 *
 * @param string    $filename File name, read from "./corpus/".
 * @param int|float $compress Percentage of sentences to keep (e.g. 30 for 30 %).
 * @return array Array with keys 'original' (the raw text, or false when the
 *               file could not be read) and 'summary' (the summary string).
 */
function summarize($filename, $compress)
{
    // Indexing step; entries are read below as $inv_index[token]['idf'].
    $inv_index = indexer();

    // Load the document.
    $load_file = file_get_contents("./corpus/" . $filename);
    if ($load_file === false) {
        // Unreadable document: keep the historical result shape.
        return array('original' => false, 'summary' => "");
    }

    // Split into sentences. Only drop the trailing fragment when it is blank
    // (text ending in a period); otherwise it is a real sentence.
    $sentence = preg_split("/[.]+/", $load_file);
    if (count($sentence) > 0 && trim(end($sentence)) === '') {
        array_pop($sentence);
    }

    // Load the stop-word list; a missing file simply means no stop words.
    $stopwords = file_get_contents("./stopwords.txt");
    $stopwords = preg_split("/[\\s]+/", (string) $stopwords, -1, PREG_SPLIT_NO_EMPTY);

    // Number of sentences to keep. Multiply before dividing so that integer
    // percentages are not distorted by binary floating point
    // (100 * (29 / 100) floors to 28, whereas 100 * 29 / 100 is exactly 29).
    $max_sentence = max(0, (int) floor(count($sentence) * $compress / 100));

    // Compute the tf.idf weight of every sentence.
    $sentence_weight = array();
    foreach ($sentence as $value) {
        // Tokenise, discarding empty tokens and stop words.
        $word = preg_split("/[\\d\\W\\s]+/", strtolower($value), -1, PREG_SPLIT_NO_EMPTY);
        $word = array_diff($word, $stopwords);

        $tf_idf = 0;
        foreach (array_count_values($word) as $token => $tf) {
            // Tokens absent from the index contribute nothing.
            if (isset($inv_index[$token]['idf'])) {
                $tf_idf += $tf * $inv_index[$token]['idf'];
            }
        }

        $sentence_weight[] = $tf_idf;
    }

    // Highest weights first -> keep the top N -> restore document order.
    arsort($sentence_weight);
    $sorted = array_slice($sentence_weight, 0, $max_sentence, true);
    ksort($sorted);

    // Assemble the summary text.
    $summary = "";
    foreach ($sorted as $key => $value) {
        $summary .= $sentence[$key] . ". ";
    }

    // Return both the original text and its summary.
    return array(
        'original' => $load_file,
        'summary' => $summary,
    );
}
// The index array structure should be (not confirmed yet) as below,
// with one entry per post, keyed by its slug:
// $index = array(
//     'slug-after-hashbang' => array('timestamp' => 23234234, 'title' => 'Title from that mada', 'excerpt' => 'first para without img'),
//     'slug-after-hashbang' => array('timestamp' => 23234234, 'title' => 'Title from that mada', 'excerpt' => 'first para without img'),
//     ...
// );
// It is written out as JSON.

## Configurations:
const MARKDOWN_DIR = 'posts';
error_reporting(E_ALL);

## MAIN
// Start from an empty index so that a run without any posts still dumps and
// writes a well-defined value instead of an undefined variable / JSON null.
$index = array();

$mds = get_mds();
foreach ($mds as $filename) {
    // Skip the macOS Finder metadata file.
    if ($filename !== '.DS_Store') {
        $ind = indexer($filename);
        $index[$ind['slug']] = $ind;
    }
}

dumper($index);

$data = json_encode($index);
if ($data === false) {
    // Do not overwrite the existing index with an empty file.
    exit('could not encode index: ' . json_last_error_msg());
}

if (file_put_contents('assets/index.json', $data) === false) {
    exit('could not write assets/index.json');
}

echo 'done!';

#################################
########### FUNCTIONS ###########

/**
 * Prints a var_dump() of the given value wrapped in <pre> tags.
 *
 * @param mixed $multi Value to dump.
 * @return void
 */
function dumper($multi)
{
    echo '<pre>';
    var_dump($multi);
    echo '</pre>';
}