/**
 * Builds the "uses" document-term matrix and stores it on disk.
 *
 * Steps:
 *  1. For every guid in the serialized "guids" file whose entry in the serialized "lr"
 *     file has a non-empty `uses` property, build a doc-term matrix with
 *     extract_matrix()/join_pieces(), or reuse the matrix of the previous run when
 *     $req["dt_useold"] is "true" and the `uses` texts did not change.
 *  2. Save that raw matrix to "uses_dt_raw".
 *  3. Optionally weight each keyword's own frequency ($matrix[$guid][$kw][$kw]) by its IDF.
 *  4. Optionally fold synonym keywords into one another.
 *  5. Save the final matrix to "uses_dt".
 *
 * Progress messages are appended to output.log.
 *
 * @param array $req Request parameters; only the "dt_useold" entry is read here.
 * @return string Always "OK".
 */
function doctermUses($req) {
    global $CONFIG;
    set_time_limit(0); // this avoids timeouts

    // NOTE(review): $IOdir, $enable_idf and $enable_synonyms are never declared in this
    // function. Presumably config.php defines them and they enter the local scope through
    // the include below -- confirm. With require_once that only happens the first time the
    // file is included in the request.
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/classes.php";
    $outputfile = $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/output.log";
    file_put_contents($outputfile, "Starting creating USES Doc-term matrix...\n", FILE_APPEND);

    // NOTE(review): unserialize() is only safe as long as these files are written by this
    // application itself.
    $guids = unserialize(file_get_contents($IOdir . "guids"));
    $lr_array = unserialize(file_get_contents($IOdir . "lr"));
    $uses_dt = array();
    $stop_words = set_stop_words($CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/stop_words_eng.txt");

    // Evaluate the flag once; isset() avoids an undefined-index notice when it is missing.
    $use_old = isset($req["dt_useold"]) && $req["dt_useold"] == "true";
    $old_lr = array();
    $old_uses_dt_raw = array();
    if ($use_old && file_exists($IOdir . "old_lr") && file_exists($IOdir . "old_uses_dt_raw")) {
        $old_lr = unserialize(file_get_contents($IOdir . "old_lr"));
        // we need the raw version of the doc-term matrix, before applying IDF or synonyms
        $old_uses_dt_raw = unserialize(file_get_contents($IOdir . "old_uses_dt_raw"));
    }

    foreach ($guids as $guid) {
        if (empty($lr_array[$guid]->uses)) {
            continue;
        }
        // Reuse the previous raw matrix when the uses texts are unchanged.
        if ($use_old
            && isset($old_lr[$guid]->uses)
            && $lr_array[$guid]->uses == $old_lr[$guid]->uses
            && isset($old_uses_dt_raw[$guid])
        ) {
            $uses_dt[$guid] = $old_uses_dt_raw[$guid];
            continue;
        }
        $pieces = array();
        foreach ($lr_array[$guid]->uses as $text) {
            $pieces[] = extract_matrix($text, $stop_words); // one doc-term matrix per uses text
        }
        $uses_dt[$guid] = join_pieces($pieces); // put all the doc-term matrices together
    }

    // Save the raw version of the doc-term matrix.
    file_put_contents($IOdir . "uses_dt_raw", serialize($uses_dt));
    // Set rw permissions for everybody. Only the owner may chmod a file, so use the same
    // ownership check that guards the chmod of "uses_dt" below.
    if (PHP_OS == "Linux" && posix_getuid() == fileowner($IOdir . "uses_dt_raw")) {
        chmod($IOdir . "uses_dt_raw", 0666);
    }

    // Collect every keyword together with the number of documents containing it.
    // Using the keyword as array key gives exact, O(1) de-duplication; the former loose
    // in_array() scan could treat different keywords as equal (e.g. 1000 and "1e3").
    $doc_freq = array();
    foreach ($uses_dt as $doc) {
        foreach ($doc as $keyword => $element) {
            if (isset($doc_freq[$keyword])) {
                $doc_freq[$keyword]++;
            } else {
                $doc_freq[$keyword] = 1;
            }
        }
    }
    $keys = array_keys($doc_freq); // first-seen order, as before

    // IDF of a key: log(num_documents / num_documents_containing_the_key).
    // The own frequency of each keyword in each document is multiplied by the keyword's IDF.
    if ($enable_idf) {
        $num_docs = count($uses_dt);
        $idf = array();
        foreach ($doc_freq as $keyword => $containing_docs) {
            $idf[$keyword] = log($num_docs / $containing_docs);
        }
        foreach ($uses_dt as $guid => $doc) {
            foreach ($doc as $keyword => $element) {
                $uses_dt[$guid][$keyword][$keyword] *= $idf[$keyword];
            }
        }
    }

    // If synonyms support is enabled and keyword A and keyword B are synonyms,
    // B's frequency and contexts are added to A and B is deleted.
    if ($enable_synonyms) {
        foreach ($keys as $num => $key) {
            // entries are unset from $keys inside the loops, so check the key is still there
            if (!isset($keys[$num])) {
                continue;
            }
            foreach ($keys as $num2 => $key2) {
                if (!isset($keys[$num2])) {
                    continue;
                }
                // strict comparison: loosely-equal keywords are still different keywords
                if ($key === $key2 || !check_synonyms($key, $key2)) {
                    continue;
                }
                file_put_contents($outputfile, "\n{$key} and {$key2} are synonyms\n", FILE_APPEND);
                foreach (array_keys($uses_dt) as $guid) {
                    if (!isset($uses_dt[$guid][$key2])) {
                        continue;
                    }
                    $merged = isset($uses_dt[$guid][$key]) ? $uses_dt[$guid][$key] : array();
                    foreach ($uses_dt[$guid][$key2] as $context2 => $frequency2) {
                        // key2's own frequency lives at [key2][key2]; it must end up in
                        // key's own frequency at [key][key], also when key was not present
                        // in this document before (previously it stayed at [key][key2]).
                        $target = ($context2 === $key2) ? $key : $context2;
                        if (isset($merged[$target])) {
                            $merged[$target] += $frequency2;
                        } else {
                            $merged[$target] = $frequency2;
                        }
                    }
                    $uses_dt[$guid][$key] = $merged;
                    unset($uses_dt[$guid][$key2]); // delete key2 from the current document
                }
                unset($keys[$num2]); // delete key2
            }
        }
    }

    file_put_contents($IOdir . "uses_dt", serialize($uses_dt));
    // set rw permissions for everybody for this file
    if (PHP_OS == "Linux" && posix_getuid() == fileowner($IOdir . 'uses_dt')) {
        chmod($IOdir . 'uses_dt', 0666);
    }
    file_put_contents($outputfile, "USES Doc-term matrix created\n\n", FILE_APPEND);
    return "OK";
}
/**
 * Builds the "uses" document-term matrix for the given entries and returns it.
 *
 * Steps:
 *  1. For every guid in the global $guids whose entry in $lr_array has a non-empty `uses`
 *     property, build a doc-term matrix with extract_matrix()/join_pieces(), or reuse the
 *     matrix of the previous run when $dt_new_indexing_required is 0 and the `uses` texts
 *     did not change.
 *  2. Save that raw matrix to "uses_dt_raw" in $IOdir.
 *  3. If $enable_idf is set, weight each keyword's own frequency
 *     ($matrix[$guid][$kw][$kw]) by its IDF.
 *  4. If $enable_synonyms is set, fold synonym keywords into one another.
 *
 * Progress messages are echoed.
 *
 * @param array $lr_array Entries indexed by guid; only their `uses` property is read here.
 * @return array The doc-term matrix, indexed as [guid][keyword][context] => frequency.
 */
function doctermUses($lr_array) {
    global $IOdir, $enable_synonyms, $enable_idf, $dt_new_indexing_required, $guids, $IndexingClassificationPath;

    echo 'Starting creating USES Doc-term matrix...' . "\n";
    $uses_dt = array();
    $stop_words = set_stop_words($IndexingClassificationPath . 'stop_words_eng.txt');

    // The previous run can only be reused when no new indexing is required.
    $use_old = ($dt_new_indexing_required == 0);
    $old_lr = array();
    $old_uses_dt_raw = array();
    if ($use_old && file_exists($IOdir . "old_lr") && file_exists($IOdir . "old_uses_dt_raw")) {
        // NOTE(review): unserialize() is only safe as long as these files are written by
        // this application itself.
        $old_lr = unserialize(file_get_contents($IOdir . "old_lr"));
        $old_uses_dt_raw = unserialize(file_get_contents($IOdir . "old_uses_dt_raw"));
    }

    foreach ($guids as $guid) {
        if (empty($lr_array[$guid]->uses)) {
            continue;
        }
        // Reuse the previous raw matrix when the uses texts are unchanged.
        if ($use_old
            && isset($old_lr[$guid]->uses)
            && $lr_array[$guid]->uses == $old_lr[$guid]->uses
            && isset($old_uses_dt_raw[$guid])
        ) {
            $uses_dt[$guid] = $old_uses_dt_raw[$guid];
            continue;
        }
        $pieces = array();
        foreach ($lr_array[$guid]->uses as $text) {
            $pieces[] = extract_matrix($text, $stop_words); // one doc-term matrix per uses text
        }
        $uses_dt[$guid] = join_pieces($pieces); // put all the doc-term matrices together
    }

    // Save the raw version of the doc-term matrix (before IDF and synonyms).
    file_put_contents($IOdir . "uses_dt_raw", serialize($uses_dt));
    // Set rw permissions for everybody. Only the owner may chmod a file, so skip the call
    // (and its warning) when the process is known not to own it. Without the posix
    // extension the ownership cannot be checked and chmod is attempted as before.
    if (PHP_OS == "Linux"
        && (!function_exists('posix_getuid') || posix_getuid() == fileowner($IOdir . "uses_dt_raw"))
    ) {
        chmod($IOdir . "uses_dt_raw", 0666);
    }

    // Collect every keyword together with the number of documents containing it.
    // Using the keyword as array key gives exact, O(1) de-duplication; the former loose
    // in_array() scan could treat different keywords as equal (e.g. 1000 and "1e3").
    $doc_freq = array();
    foreach ($uses_dt as $doc) {
        foreach ($doc as $keyword => $element) {
            if (isset($doc_freq[$keyword])) {
                $doc_freq[$keyword]++;
            } else {
                $doc_freq[$keyword] = 1;
            }
        }
    }
    $keys = array_keys($doc_freq); // first-seen order, as before

    // IDF of a key: log(num_documents / num_documents_containing_the_key).
    // The own frequency of each keyword in each document is multiplied by the keyword's IDF.
    if ($enable_idf) {
        $num_docs = count($uses_dt);
        $idf = array();
        foreach ($doc_freq as $keyword => $containing_docs) {
            $idf[$keyword] = log($num_docs / $containing_docs);
        }
        foreach ($uses_dt as $guid => $doc) {
            foreach ($doc as $keyword => $element) {
                $uses_dt[$guid][$keyword][$keyword] *= $idf[$keyword];
            }
        }
    }

    // If synonyms support is enabled and keyword A and keyword B are synonyms,
    // B's frequency and contexts are added to A and B is deleted.
    if ($enable_synonyms) {
        foreach ($keys as $num => $key) {
            // entries are unset from $keys inside the loops, so check the key is still there
            if (!isset($keys[$num])) {
                continue;
            }
            foreach ($keys as $num2 => $key2) {
                if (!isset($keys[$num2])) {
                    continue;
                }
                // strict comparison: loosely-equal keywords are still different keywords
                if ($key === $key2 || !check_synonyms($key, $key2)) {
                    continue;
                }
                echo "\n{$key} and {$key2} are synonyms\n";
                foreach (array_keys($uses_dt) as $guid) {
                    if (!isset($uses_dt[$guid][$key2])) {
                        continue;
                    }
                    $merged = isset($uses_dt[$guid][$key]) ? $uses_dt[$guid][$key] : array();
                    foreach ($uses_dt[$guid][$key2] as $context2 => $frequency2) {
                        // key2's own frequency lives at [key2][key2]; it must end up in
                        // key's own frequency at [key][key], also when key was not present
                        // in this document before (previously it stayed at [key][key2]).
                        $target = ($context2 === $key2) ? $key : $context2;
                        if (isset($merged[$target])) {
                            $merged[$target] += $frequency2;
                        } else {
                            $merged[$target] = $frequency2;
                        }
                    }
                    $uses_dt[$guid][$key] = $merged;
                    unset($uses_dt[$guid][$key2]); // delete key2 from the current document
                }
                unset($keys[$num2]); // delete key2
            }
        }
    }

    echo 'USES Doc-term matrix created' . "\n";
    return $uses_dt;
}