/**
 * Builds the TAGS document-term matrix and writes it to disk.
 *
 * Reads the serialized "guids" and "lr" files from $IOdir. For every guid whose
 * resource has non-empty tags, each tag sentence is stripped of punctuation and
 * stop words, split on spaces, lower-cased, stemmed with PorterStemmer and
 * de-duplicated; every surviving tag is stored as $tags_dt[guid][tag] = 1.
 *
 * The matrix is saved twice:
 *  - "tags_dt_raw": the matrix before synonym merging;
 *  - "tags_dt":     the matrix after synonym merging (identical to the raw one
 *                   when synonyms are disabled).
 * Progress messages are appended to output.log in the members view directory.
 *
 * @param array $req Request options. Only $req["dt_useold"] is read: when it
 *                   equals "true" and the "old_lr" / "old_tags_dt_raw" files
 *                   exist, rows of resources whose tags did not change are
 *                   copied from the old raw matrix instead of being recomputed.
 *
 * @return string Always "OK".
 */
function doctermTags($req)
{
    global $CONFIG;

    set_time_limit(0); //this avoids timeouts

    // NOTE(review): $IOdir and $enable_synonyms are neither parameters nor declared
    // global here; presumably config.php defines them in this scope when it is
    // included. If config.php was already loaded elsewhere in the same request,
    // require_once will not run it again and both would be undefined - confirm.
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/classes.php";
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/stemming.php";

    $outputfile = $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/output.log";
    file_put_contents($outputfile, "Starting creating TAGS Doc-term matrix...\n", FILE_APPEND);

    // NOTE(review): unserialize() is only safe as long as the files under $IOdir
    // are written exclusively by this pipeline and never contain untrusted data.
    $guids = unserialize(file_get_contents($IOdir . "guids"));
    $lr_array = unserialize(file_get_contents($IOdir . "lr"));
    $tags_dt = array();

    //create an array containing "stop words", in order to eliminate them from the text
    $stop_words = set_stop_words_tags($CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/stop_words_eng.txt");

    // Evaluate the "reuse previous results" flag once; isset() avoids a notice
    // when the caller did not send the option at all.
    $use_old = isset($req["dt_useold"]) && $req["dt_useold"] == "true";
    $old_lr = array();
    $old_tags_dt_raw = array();
    if ($use_old && file_exists($IOdir . "old_lr") && file_exists($IOdir . "old_tags_dt_raw")) {
        $old_lr = unserialize(file_get_contents($IOdir . "old_lr"));
        //we need the raw version of the doc-term matrix, before applying IDF or synonyms
        $old_tags_dt_raw = unserialize(file_get_contents($IOdir . "old_tags_dt_raw"));
    }

    //create an array containing all the tags of each document
    foreach ($guids as $guid) {
        if (empty($lr_array[$guid]->tags)) {
            continue;
        }

        // Tags unchanged since the previous run: reuse the stored row.
        if ($use_old
            && isset($old_lr[$guid]->tags)
            && $lr_array[$guid]->tags == $old_lr[$guid]->tags
            && isset($old_tags_dt_raw[$guid])
        ) {
            $tags_dt[$guid] = $old_tags_dt_raw[$guid];
            continue;
        }

        //find all the tags for the current resource
        $tags = array();
        foreach ($lr_array[$guid]->tags as $sentence) {
            if ($sentence == "") {
                continue;
            }
            $sentence = strip_punctuation($sentence); //strip punctuation
            $sentence_clean = str_replace($stop_words, " ", $sentence); //eliminate stop words
            $tags_sentence = explode(" ", $sentence_clean);
            $tags = array_merge($tags, $tags_sentence);
        }

        //stem each tag
        foreach ($tags as $num => $element) {
            $tags[$num] = PorterStemmer::Stem(strtolower(trim($element)));
        }
        $tags = array_filter(array_unique($tags)); //delete duplicates and empty elements

        //create the entry for the current document in the doc-term tags matrix
        foreach ($tags as $tag) {
            $tags_dt[$guid][$tag] = 1;
        }
    }

    //it saves the raw version of the doc-term matrix
    file_put_contents($IOdir . "tags_dt_raw", serialize($tags_dt));
    // Same ownership guard as for "tags_dt" below: chmod() on a file owned by
    // another user only produces a warning.
    if (PHP_OS == "Linux" && posix_getuid() == fileowner($IOdir . "tags_dt_raw")) {
        chmod($IOdir . "tags_dt_raw", 0666); //set rw permissions for everybody for this file
    }

    if (!empty($enable_synonyms)) {
        // Collect every distinct tag of the matrix. The original code looped over
        // $keys without ever assigning it, so synonym merging never ran.
        $keys = array();
        foreach ($tags_dt as $doc_tags) {
            if (is_array($doc_tags)) {
                $keys += $doc_tags; // array union keeps one entry per distinct tag
            }
        }
        // Numeric-looking tags come back from array_keys() as integers; normalise
        // them to strings before comparing them and passing them on.
        $keys = array_map('strval', array_keys($keys));

        foreach ($keys as $num => $key) {
            // Entries are unset from $keys while iterating, so skip the ones
            // that have already been merged into another tag.
            if (!isset($keys[$num])) {
                continue;
            }
            foreach ($keys as $num2 => $key2) {
                if (!isset($keys[$num2])) {
                    continue;
                }
                if ($key !== $key2 && check_synonyms($key, $key2)) {
                    file_put_contents($outputfile, "\n{$key} and {$key2} are synonyms\n", FILE_APPEND);
                    // Replace the synonym with the surviving tag in every document.
                    foreach ($tags_dt as $guid => $element) {
                        if (isset($tags_dt[$guid][$key2])) {
                            unset($tags_dt[$guid][$key2]);
                            $tags_dt[$guid][$key] = 1;
                        }
                    }
                    unset($keys[$num2]);
                }
            }
        }
    }

    file_put_contents($IOdir . "tags_dt", serialize($tags_dt));
    if (PHP_OS == "Linux" && posix_getuid() == fileowner($IOdir . 'tags_dt')) {
        chmod($IOdir . 'tags_dt', 0666); //set rw permissions for everybody for this file
    }

    file_put_contents($outputfile, "TAGS Doc-term matrix created\n\n", FILE_APPEND);

    return "OK";
}
/**
 * Builds and returns the TAGS document-term matrix.
 *
 * For every guid in the global $guids list whose entry in $lr_array has
 * non-empty tags, each tag sentence is stripped of punctuation and stop words,
 * split on spaces, lower-cased, stemmed with PorterStemmer and de-duplicated;
 * every surviving tag is stored as $tags_dt[guid][tag] = 1.
 *
 * The matrix as it stands before synonym merging is serialized to
 * $IOdir . "tags_dt_raw". When the global $enable_synonyms is truthy, tags that
 * check_synonyms() reports as synonyms are then collapsed onto a single tag.
 * Progress messages are echoed.
 *
 * Globals read: $IOdir, $enable_synonyms, $IndexingClassificationPath,
 * $dt_new_indexing_required, $guids.
 *
 * @param array $lr_array Resources indexed by guid; only the "tags" property of
 *                        each entry is read.
 *
 * @return array The document-term matrix, indexed by guid and then by tag.
 */
function doctermTags($lr_array)
{
    require_once 'stemming.php';

    global $IOdir, $enable_synonyms, $IndexingClassificationPath, $dt_new_indexing_required, $guids;

    echo 'Starting creating TAGS Doc-term matrix...' . "\n";

    $tags_dt = array();

    //create an array containing "stop words", in order to eliminate them from the text
    $stop_words = set_stop_words_tags($IndexingClassificationPath . "stop_words_eng.txt");

    // Rows from the previous run are reused only when no full re-indexing is
    // required and both snapshot files are present.
    $old_lr = array();
    $old_tags_dt_raw = array();
    if ($dt_new_indexing_required == 0 && file_exists($IOdir . "old_lr") && file_exists($IOdir . "old_tags_dt_raw")) {
        // NOTE(review): unserialize() is only safe as long as these files are
        // written exclusively by this pipeline and never contain untrusted data.
        $old_lr = unserialize(file_get_contents($IOdir . "old_lr"));
        $old_tags_dt_raw = unserialize(file_get_contents($IOdir . "old_tags_dt_raw"));
    }

    //create an array containing all the tags of each document
    foreach ($guids as $guid) {
        if (empty($lr_array[$guid]->tags)) {
            continue;
        }

        // Tags unchanged since the previous run: reuse the stored row.
        if ($dt_new_indexing_required == 0
            && isset($old_lr[$guid]->tags)
            && $lr_array[$guid]->tags == $old_lr[$guid]->tags
            && isset($old_tags_dt_raw[$guid])
        ) {
            $tags_dt[$guid] = $old_tags_dt_raw[$guid];
            continue;
        }

        //find all the tags for the current resource
        $tags = array();
        foreach ($lr_array[$guid]->tags as $sentence) {
            if ($sentence == "") {
                continue;
            }
            $sentence = strip_punctuation($sentence); //strip punctuation
            $sentence_clean = str_replace($stop_words, " ", $sentence); //eliminate stop words
            $tags_sentence = explode(" ", $sentence_clean);
            $tags = array_merge($tags, $tags_sentence);
        }

        //stem each tag
        foreach ($tags as $num => $element) {
            $tags[$num] = PorterStemmer::Stem(strtolower(trim($element)));
        }
        $tags = array_filter(array_unique($tags)); //delete duplicates and empty elements

        //create the entry for the current document in the doc-term tags matrix
        foreach ($tags as $tag) {
            $tags_dt[$guid][$tag] = 1;
        }
    }

    file_put_contents($IOdir . "tags_dt_raw", serialize($tags_dt));
    // chmod() on a file owned by another user only produces a warning, so skip it
    // when the ownership can be checked and does not match the current process.
    if (PHP_OS == "Linux"
        && (!function_exists('posix_getuid') || posix_getuid() == fileowner($IOdir . "tags_dt_raw"))
    ) {
        chmod($IOdir . "tags_dt_raw", 0666); //set rw permissions for everybody for this file
    }

    if ($enable_synonyms) {
        // Collect every distinct tag of the matrix. The original code looped over
        // $keys without ever assigning it, so synonym merging never ran.
        $keys = array();
        foreach ($tags_dt as $doc_tags) {
            if (is_array($doc_tags)) {
                $keys += $doc_tags; // array union keeps one entry per distinct tag
            }
        }
        // Numeric-looking tags come back from array_keys() as integers; normalise
        // them to strings before comparing them and passing them on.
        $keys = array_map('strval', array_keys($keys));

        foreach ($keys as $num => $key) {
            // Entries are unset from $keys while iterating, so skip the ones
            // that have already been merged into another tag.
            if (!isset($keys[$num])) {
                continue;
            }
            foreach ($keys as $num2 => $key2) {
                if (!isset($keys[$num2])) {
                    continue;
                }
                if ($key !== $key2 && check_synonyms($key, $key2)) {
                    echo "\n{$key} and {$key2} are synonyms\n";
                    // Replace the synonym with the surviving tag in every document.
                    foreach ($tags_dt as $guid => $element) {
                        if (isset($tags_dt[$guid][$key2])) {
                            unset($tags_dt[$guid][$key2]);
                            $tags_dt[$guid][$key] = 1;
                        }
                    }
                    unset($keys[$num2]);
                }
            }
        }
    }

    echo 'TAGS Doc-term matrix created' . "\n";

    return $tags_dt;
}