/**
 * Runs the configured clustering algorithm for every classification type
 * ("metadata", "uses", "tags", "replinks"), writes each resulting cluster set
 * to "<IOdir>clusters_<type>" and appends progress messages to output.log.
 *
 * NOTE(review): $classification_method_* and $IOdir are not declared global
 * here; presumably they are defined by config.php, which is included into this
 * function's local scope below - confirm. Because included files share this
 * function's variable scope, the original local variable names are kept.
 *
 * @param array $req Request parameters; only $req["cl_useold"] is read and it
 *                   is forwarded as a second argument to the clustering function.
 * @return string Always "OK".
 */
function classify($req)
{
    global $CONFIG;

    set_time_limit(0); // this avoids timeouts

    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";
    require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/classes.php";

    $outputfile = $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/output.log";
    file_put_contents($outputfile, "Starting classification...\n", FILE_APPEND);

    // Algorithm selector per classification type: 1 = Kohonen, 2 = aggregative,
    // anything else = YACA.
    $classification_methods = array(
        "metadata" => $classification_method_metadata,
        "uses" => $classification_method_uses,
        "tags" => $classification_method_tags,
        "replinks" => $classification_method_replinks,
    );

    // Initialised explicitly so the per-type assignments below never write
    // into an undefined variable.
    $array_clusters = array();

    foreach ($classification_methods as $type => $value) {
        file_put_contents($outputfile, "Creating clusters for {$type}...\n", FILE_APPEND);

        if ($value == 1) {
            // Kohonen
            require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/kohonen.php";
            $array_clusters[$type] = run_kohonen($type, $req["cl_useold"]);
        } elseif ($value == 2) {
            // aggregative
            require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/aggregative.php";
            $array_clusters[$type] = clusterize_aggregative($type, $req["cl_useold"]);
        } else {
            // 3 (YACA)
            require_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/yaca.php";
            $array_clusters[$type] = clusterize_yaca($type, $req["cl_useold"]);
        }

        // Add positive features to the clusters; there are no positive
        // features for replinks.
        // NOTE(review): the clustering functions may already have called
        // get_positive_features() themselves - confirm whether this second
        // pass is still required before removing it.
        if ($type != "replinks") {
            file_put_contents($outputfile, "Calculating positive features for {$type}...\n", FILE_APPEND);
            $dt_matrix = unserialize(file_get_contents($IOdir . $type . "_dt"));
            $array_clusters[$type] = get_positive_features($array_clusters[$type], $dt_matrix, $type);
        }

        $clusters_file = $IOdir . "clusters_" . $type;
        file_put_contents($clusters_file, serialize($array_clusters[$type]));

        // Set rw permissions for everybody on the file, but only when this
        // process owns it. posix_getuid() exists only when the posix extension
        // is loaded, so check for it instead of relying on the OS name alone.
        if (PHP_OS == "Linux"
            && function_exists('posix_getuid')
            && posix_getuid() == fileowner($clusters_file)
        ) {
            chmod($clusters_file, 0666);
        }

        file_put_contents($outputfile, "Clusters for {$type} created\n\n", FILE_APPEND);
    }

    return "OK";
}
/**
 * Builds clusters for one classification type with an aggregative
 * (bottom-up merging) strategy.
 *
 * Every document of the doc-doc matrix starts in its own cluster. On each pass
 * a cluster is merged with its most correlated partner unless that correlation
 * is 0 or lower than the quality index of either cluster. Passes stop when
 * nothing was merged or after more than 50 iterations. Single-document
 * clusters are then dropped, positive features are added (not for "replinks")
 * and each cluster's document list is sorted.
 *
 * Reads the serialized matrices "<IOdir><type>_dd" and, except for "replinks",
 * "<IOdir><type>_dt". Echoes a progress message before positive features are
 * calculated.
 *
 * @param string $type One of 'metadata', 'tags', 'uses', 'replinks'.
 * @return array Clusters; keys are whatever get_positive_features() returns,
 *               or the (possibly sparse) keys of the merged list for 'replinks'.
 */
function clusterize_aggregative($type)
{
    global $IOdir;
    global $usern;

    $docterm = null;
    $docdoc = null;
    if (in_array($type, array('metadata', 'tags', 'uses'), true)) {
        $docterm = unserialize(file_get_contents($IOdir . $type . '_dt'));
        $docdoc = unserialize(file_get_contents($IOdir . $type . '_dd'));
    } elseif ($type === 'replinks') {
        // replinks has no doc-term matrix
        $docdoc = unserialize(file_get_contents($IOdir . 'replinks_dd'));
    }
    $docdoc = sort_matrix($docdoc);

    // Start with one cluster per document.
    $clusters = array();
    foreach ($docdoc as $g => $row) {
        $c = new Cluster($type, $usern, "aggregative", 1);
        $c->array_docs[] = $g;
        $clusters[] = $c;
    }

    $def_cluster = array();
    $it = 0;
    while (true) {
        $it++;
        $current = $clusters;
        $next = array();
        $excluded = array(); // ids of clusters already merged during this pass
        $changes = false;
        $total = count($clusters);

        for ($i = 0; $i < $total; $i++) {
            if (in_array($current[$i]->id, $excluded)) {
                continue;
            }

            // Correlation of cluster $i with every other still-available cluster.
            $dist_v = array();
            for ($j = 0; $j < $total; $j++) {
                if ($i == $j || in_array($current[$j]->id, $excluded)) {
                    continue;
                }
                $dist_v[$current[$j]->id] = correlation($current[$i], $current[$j], $docdoc);
            }
            arsort($dist_v); // best candidate first

            foreach ($dist_v as $id => $correlation) {
                $fuse = find_cluster_by_id($id, $current);
                // NOTE(review): if find_cluster_by_id() returns an array index,
                // index 0 is falsy and is skipped here as well - confirm its
                // "not found" return value.
                if (!$fuse) {
                    continue;
                }

                $qi1 = quality_index($current[$i], $docdoc);
                $qi2 = quality_index($current[$fuse], $docdoc);
                if ($correlation == 0 || $correlation < $qi1 || $correlation < $qi2) {
                    break; // candidates are sorted, so the rest cannot be better
                }

                $c = new Cluster($type, $usern, "aggregative", 1);
                $c->array_docs = array_merge($current[$i]->array_docs, $current[$fuse]->array_docs);
                $next[] = $c;
                $excluded[] = $current[$fuse]->id;
                $excluded[] = $current[$i]->id;
                $changes = true;
                break;
            }
        }

        // Clusters that were not merged survive unchanged into the next pass.
        foreach ($current as $remaining) {
            if (!in_array($remaining->id, $excluded)) {
                $next[] = $remaining;
            }
        }

        $clusters = $next;
        if ($it > 50 || !$changes) {
            $def_cluster = $next;
            break;
        }
    }

    // Delete clusters that contain only one element.
    foreach ($def_cluster as $key => $cluster) {
        if (count($cluster->array_docs) == 1) {
            unset($def_cluster[$key]);
        }
    }

    if ($type != 'replinks') {
        echo "\nCalculating positive features for {$type}...\n";
        $array_cluster = get_positive_features($def_cluster, $docterm, $type);
    } else {
        // There is no doc-term matrix for replinks, so the clusters are
        // returned without positive features. Previously $array_cluster was
        // left undefined here and the function returned null.
        $array_cluster = $def_cluster;
    }

    // Sort each cluster's documents by guid.
    foreach ($array_cluster as $key => $cluster) {
        sort($array_cluster[$key]->array_docs);
    }

    return $array_cluster;
}
/**
 * Clusters documents by running the external "kohonen" executable.
 *
 * The doc-term and doc-doc matrices for the type are flattened into strings
 * (cells separated by ",", each row terminated by "-") and passed on the
 * command line. The executable's result is then read from
 * $output_file_kohonen, one label per line, in the row order of the doc-doc
 * matrix. Documents sharing a label form a cluster. Positive features are
 * added and clusters with fewer than two documents are discarded.
 *
 * Echoes the executed command and a progress message.
 *
 * @param string $type_cluster 'tags', 'uses' or 'metadata'; any other value is
 *                             handled like 'metadata'.
 * @return array Clusters as returned by get_positive_features(), minus those
 *               holding fewer than two documents.
 */
function run_kohonen($type_cluster)
{
    global $output_file_kohonen, $IndexingClassificationPath, $IOdir;
    global $usern;

    switch ($type_cluster) {
        case 'tags':
            $prefix = 'tags';
            break;
        case 'uses':
            $prefix = 'uses';
            break;
        case 'metadata':
        default:
            $prefix = 'metadata';
            break;
    }
    $m_dt = unserialize(file_get_contents($IOdir . $prefix . '_dt'));
    $m_dd = unserialize(file_get_contents($IOdir . $prefix . '_dd'));

    $kohonen_c = $IndexingClassificationPath . 'kohonen';

    // Flatten the doc-term matrix. $num_t ends up as the column count of the
    // last row.
    $num_t = 0;
    $doc_term = '';
    foreach ($m_dt as $row) {
        $num_t = count($row);
        ksort($row);
        $cells = array();
        foreach ($row as $k => $v) {
            // NOTE(review): for non-tag matrices each cell is read as $v[$k];
            // the cell layout is not visible here - confirm.
            $cells[] = ($type_cluster == 'tags') ? "{$v}" : "{$v[$k]}";
        }
        if ($cells) {
            $doc_term .= implode(',', $cells) . '-';
        }
    }

    // Flatten the doc-doc matrix and remember the guid of every row, in order.
    $doc_doc = '';
    $array_guid = array();
    foreach ($m_dd as $guid => $row) {
        $array_guid[] = $guid;
        ksort($row);
        $cells = array();
        foreach ($row as $v) {
            $cells[] = "{$v}";
        }
        if ($cells) {
            $doc_doc .= implode(',', $cells) . '-';
        }
    }

    $num_d = count($m_dt);

    // The matrix payloads are escaped so unexpected cell content cannot be
    // interpreted by the shell.
    $command_string = "{$kohonen_c} -r {$num_d} -c {$num_t} -t " . escapeshellarg($doc_term)
        . " -d " . escapeshellarg($doc_doc);
    echo $command_string;
    shell_exec($command_string);

    // One cluster label per line. explode() replaces split(), which was
    // removed in PHP 7. The last element is dropped, as the original code did.
    $clusters_strings = file_get_contents($output_file_kohonen);
    $info_cluster = explode("\n", $clusters_strings === false ? '' : $clusters_strings);
    array_pop($info_cluster);

    $array_cluster = array();
    foreach ($array_guid as $i => $guid) {
        if (!isset($info_cluster[$i])) {
            // The output holds no label for this document.
            continue;
        }
        $label = $info_cluster[$i];
        if (!isset($array_cluster[$label])) {
            $array_cluster[$label] = new Cluster($type_cluster, $usern, "kohonen", 1);
        }
        $array_cluster[$label]->array_docs[] = $guid;
    }

    echo "\nCalculating positive features for {$type_cluster}...\n";
    $array_cluster = get_positive_features($array_cluster, $m_dt, $type_cluster); // adds positive features to the clusters

    // Drop clusters with fewer than two documents.
    foreach ($array_cluster as $k => $obj) {
        if (count($obj->array_docs) < 2) {
            unset($array_cluster[$k]);
        }
    }

    return $array_cluster;
}
/**
 * Tells whether a document scores above the threshold against at least one
 * member of a cluster.
 *
 * @param object $cluster   Cluster whose array_docs entries carry a "guid" key.
 * @param mixed  $guid      Document to test.
 * @param array  $docdoc    Doc-doc matrix indexed as [guid][guid].
 * @param mixed  $threshold Minimum score (exclusive).
 * @return bool
 */
function _yaca_is_related($cluster, $guid, $docdoc, $threshold)
{
    foreach ($cluster->array_docs as $resource) {
        if ($docdoc[$guid][$resource["guid"]] > $threshold) {
            return true;
        }
    }
    return false;
}

/**
 * Appends a document to a cluster. Every existing member's inherence (a
 * running mean) is updated with its score against the new document, and the
 * new document gets the mean of its scores against the existing members.
 *
 * @param object $cluster Cluster to modify in place.
 * @param mixed  $guid    Document being added.
 * @param array  $docdoc  Doc-doc matrix indexed as [guid][guid].
 * @return void
 */
function _yaca_add_document($cluster, $guid, $docdoc)
{
    $size = count($cluster->array_docs);
    $total = 0;
    foreach ($cluster->array_docs as $index => $resource) {
        $score = $docdoc[$guid][$resource["guid"]];
        $cluster->array_docs[$index]["inherence"] = ($resource["inherence"] * ($size - 1) + $score) / $size;
        $total += $score;
    }
    // -1 is the value this file uses for a document that is alone in its cluster.
    $cluster->array_docs[] = array("guid" => $guid, "inherence" => $size > 0 ? $total / $size : -1);
}

/**
 * Removes the document stored at a 1-based position from a cluster and takes
 * its old score out of every remaining member's inherence. The caller is
 * responsible for re-indexing array_docs afterwards.
 *
 * @param object $cluster  Cluster to modify in place.
 * @param int    $position 1-based position as returned by Cluster::belongs().
 * @param mixed  $guid     Document being removed.
 * @param array  $old_dd   Doc-doc matrix the old clusters were built from.
 * @return void
 */
function _yaca_remove_document($cluster, $position, $guid, $old_dd)
{
    unset($cluster->array_docs[$position - 1]);
    $remaining = count($cluster->array_docs);
    foreach ($cluster->array_docs as $index => $resource) {
        if ($remaining > 1) {
            $cluster->array_docs[$index]["inherence"] =
                ($resource["inherence"] * $remaining - $old_dd[$guid][$resource["guid"]]) / ($remaining - 1);
        } else {
            // A single document is left: use the "alone in its cluster" value
            // instead of dividing by zero.
            $cluster->array_docs[$index]["inherence"] = -1;
        }
    }
}

/**
 * Builds clusters for one classification type with the YACA algorithm.
 *
 * When $new_classification_required is 0, the type is not "replinks", the
 * files "old_clusters_<type>" and "old_<type>_dd" exist and the old clusters
 * were produced by YACA, the old clusters are re-used. Edited documents are
 * removed and re-evaluated against every cluster, and each new document is
 * offered to every cluster and also seeds a cluster of its own. Otherwise one
 * cluster is seeded from every row of the doc-doc matrix.
 *
 * The result is passed through delete_unnecessary_clusters(), each cluster's
 * documents are sorted with compare_inherence_guid, and positive features are
 * added for every type except "replinks".
 *
 * @param string $type Classification type; used as file-name prefix.
 * @return array Clusters (with positive features unless $type is "replinks").
 */
function clusterize_yaca($type)
{
    global $YACA_threshold;
    global $IOdir;
    global $usern;
    global $new_classification_required;

    $docdoc = unserialize(file_get_contents($IOdir . $type . "_dd"));
    $docterm = null;
    if ($type != "replinks") {
        $docterm = unserialize(file_get_contents($IOdir . $type . "_dt"));
    }

    $clusters = array();
    $old_data_retrieved = false;

    // If possible, use the old classification results. This relies on the file
    // "changes", the log of edited and new documents, which holds no entries
    // for replinks. Both the old clusters and the old doc-doc matrix are needed.
    if ($new_classification_required == 0
        && $type != "replinks"
        && file_exists($IOdir . "old_clusters_{$type}")
        && file_exists($IOdir . "old_{$type}" . "_dd")
    ) {
        $changes = unserialize(file_get_contents($IOdir . "changes"));
        $old_clusters = unserialize(file_get_contents($IOdir . "old_clusters_{$type}"));
        $old_clusters_values = is_array($old_clusters) ? array_values($old_clusters) : array();
        $old_dd = unserialize(file_get_contents($IOdir . "old_{$type}" . "_dd"));

        // Continue only if the old clusters have been created with YACA.
        if (isset($old_clusters_values[0]) && $old_clusters_values[0]->clusteringAlgorithm == "YACA") {
            // Re-create the clusters with the same documents so they get fresh
            // Cluster objects; positive features are added later.
            foreach ($old_clusters as $old_cluster) {
                $c = new Cluster($type, $usern, "YACA", 1);
                $c->array_docs = $old_cluster->array_docs;
                $clusters[] = $c;
            }

            // Edited documents: take each one out of the clusters containing
            // it, then re-add it wherever it is related under the new matrix.
            $edited = isset($changes["edited"][$type]) ? $changes["edited"][$type] : array();
            foreach ($edited as $guid) {
                foreach ($clusters as $cluster) {
                    $pos = $cluster->belongs($guid);
                    if ($pos != false) {
                        _yaca_remove_document($cluster, $pos, $guid, $old_dd);
                    }
                    // Preserve the normal 0..n-1 sequence of keys.
                    $cluster->array_docs = array_values($cluster->array_docs);

                    if (_yaca_is_related($cluster, $guid, $docdoc, $YACA_threshold)) {
                        _yaca_add_document($cluster, $guid, $docdoc);
                    }
                }
            }

            // New documents: offer each one to every cluster, then seed a new
            // cluster from it.
            $new_guids = isset($changes["new"]) ? $changes["new"] : array();
            $guids = null; // loaded once, and only if there is a new document
            foreach ($new_guids as $guid) {
                foreach ($clusters as $cluster) {
                    if (_yaca_is_related($cluster, $guid, $docdoc, $YACA_threshold)) {
                        _yaca_add_document($cluster, $guid, $docdoc);
                    }
                }

                $c = new Cluster($type, $usern, "YACA", 1);
                $c->array_docs[] = array("guid" => $guid, "inherence" => 0);
                if ($guids === null) {
                    $guids = unserialize(file_get_contents($IOdir . "guids"));
                }
                foreach ($guids as $guid2) {
                    if ($docdoc[$guid][$guid2] >= $YACA_threshold && $guid != $guid2) {
                        // Uses the same running-mean update as everywhere else;
                        // the original accumulated with "+=" in this branch only.
                        _yaca_add_document($c, $guid2, $docdoc);
                    }
                }
                if (count($c->array_docs) == 1) {
                    // -1 stands for "the maximum": a lone document is totally
                    // inherent to its cluster.
                    $c->array_docs[0]["inherence"] = -1;
                }
                $clusters[] = $c;
            }

            $old_data_retrieved = true;
        }
    }

    // If the old classification results could not be used, do the whole process.
    if ($old_data_retrieved == false) {
        $clusters = array();
        foreach ($docdoc as $key => $row) {
            $c = new Cluster($type, $usern, "YACA", 1);
            $c->array_docs[] = array("guid" => $key, "inherence" => 0);
            foreach ($row as $key2 => $value) {
                if ($value >= $YACA_threshold && $key != $key2) {
                    _yaca_add_document($c, $key2, $docdoc);
                }
            }
            if (count($c->array_docs) == 1) {
                // -1 stands for "the maximum" (see above).
                $c->array_docs[0]["inherence"] = -1;
            }
            $clusters[] = $c;
        }
    }

    $clusters_ok = delete_unnecessary_clusters($clusters);

    // Sort each cluster's documents with compare_inherence_guid (by inherence
    // in descending order and then by guid, per the original comment).
    foreach ($clusters_ok as $cluster) {
        usort($cluster->array_docs, "compare_inherence_guid");
    }

    // Add positive features to the clusters; there are none for replinks.
    if ($type != 'replinks') {
        echo "\nCalculating positive features for {$type}...\n";
        $clusters_pos = get_positive_features($clusters_ok, $docterm, $type); // it's very slow for metadata
    } else {
        $clusters_pos = $clusters_ok;
    }

    return $clusters_pos;
}