Exemplo n.º 1
0
/**
 * Short-name wrapper around correlation().
 *
 * Both value lists are passed to correlation() unchanged and its result is
 * handed back as-is.
 *
 * @param array $x_values First list of values.
 * @param array $y_values Second list of values.
 *
 * @return mixed Whatever correlation($x_values, $y_values) returns.
 */
function correl(array $x_values, array $y_values)
{
    $result = correlation($x_values, $y_values);

    return $result;
}
/**
 * Groups documents into clusters by repeatedly merging pairs of clusters.
 *
 * Every key of the stored document/document matrix for $type starts in its
 * own cluster. During a pass each cluster that has not been merged yet looks
 * at the remaining clusters in order of decreasing correlation() and merges
 * with the best one, provided that correlation is non-zero and not lower than
 * the quality_index() of either cluster. Passes repeat until one produces no
 * merge or the pass limit is exceeded. Clusters that still hold exactly one
 * document are dropped from the result.
 *
 * @param string $type      Matrix to cluster: 'metadata', 'tags', 'uses' or 'replinks'.
 * @param mixed  $cl_useold Not read by this function; kept so existing calls keep working.
 *
 * @return array Remaining Cluster objects, each with array_docs sorted
 *               ascending. Keys are not renumbered after single-document
 *               clusters are removed. Empty array for an unrecognised $type.
 */
function clusterize_aggregative($type, $cl_useold)
{
    global $CONFIG;
    // Clustering may run for a long time; lift the execution time limit.
    set_time_limit(0);
    // NOTE(review): $IOdir is not defined in this function; presumably the
    // included config.php sets it in this scope - confirm.
    include $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/config.php";
    include_once $CONFIG->path . "mod/profile_manager/views/default/profile_manager/members/classes.php";

    // Only the document/document matrix is used below, so the document/term
    // files are not loaded here.
    switch ($type) {
        case 'metadata':
            $matrix_file = 'metadata_dd';
            break;
        case 'tags':
            $matrix_file = 'tags_dd';
            break;
        case 'uses':
            $matrix_file = 'uses_dd';
            break;
        case 'replinks':
            $matrix_file = 'replinks_dd';
            break;
        default:
            // No matrix exists for an unknown type, so there is nothing to cluster.
            return array();
    }
    $docdoc = sort_matrix(unserialize(file_get_contents($IOdir . $matrix_file)));

    // Start with one single-document cluster per matrix row.
    $clusters = array();
    foreach ($docdoc as $g => $row) {
        $c = new Cluster($type, get_loggedin_user()->username, "aggregative", 1);
        $c->array_docs[] = $g;
        $clusters[] = $c;
    }

    $def_cluster = array();
    $it = 0;
    while (true) {
        $it++;
        $current = $clusters;
        $total = count($current);
        $next = array();
        // Ids of clusters already merged during this pass (id => true).
        $excluded = array();
        $changes = false;
        for ($i = 0; $i < $total; $i++) {
            if (isset($excluded[$current[$i]->id])) {
                continue;
            }
            // Correlation of cluster $i with every other cluster still available.
            $dist_v = array();
            for ($j = 0; $j < $total; $j++) {
                if ($i == $j || isset($excluded[$current[$j]->id])) {
                    continue;
                }
                $dist_v[$current[$j]->id] = correlation($current[$i], $current[$j], $docdoc);
            }
            // Best candidate first.
            arsort($dist_v);
            foreach ($dist_v as $id => $correlation) {
                $fuse = find_cluster_by_id($id, $current);
                // Position 0 is a valid match, so only a real "not found" is skipped.
                // NOTE(review): assumes find_cluster_by_id() returns the position in
                // $current, or false/null when nothing matches - confirm.
                if ($fuse === false || $fuse === null) {
                    continue;
                }
                $qi1 = quality_index($current[$i], $docdoc);
                $qi2 = quality_index($current[$fuse], $docdoc);
                if ($correlation == 0 || $correlation < $qi1 || $correlation < $qi2) {
                    // The best remaining candidate is not good enough; leave this cluster as it is.
                    break;
                }
                $c = new Cluster($type, get_loggedin_user()->username, "aggregative", 1);
                $c->array_docs = array_merge($current[$i]->array_docs, $current[$fuse]->array_docs);
                $next[] = $c;
                $excluded[$current[$fuse]->id] = true;
                $excluded[$current[$i]->id] = true;
                $changes = true;
                break;
            }
        }
        // Clusters that took part in no merge are carried over unchanged.
        foreach ($current as $remaining) {
            if (!isset($excluded[$remaining->id])) {
                $next[] = $remaining;
            }
        }
        $clusters = $next;
        // Stop when a pass merged nothing or once the pass counter has exceeded 50.
        if ($it > 50 || !$changes) {
            $def_cluster = $next;
            break;
        }
    }

    foreach ($def_cluster as $key => $cluster) {
        // Delete clusters that contain only one element.
        if (count($cluster->array_docs) == 1) {
            unset($def_cluster[$key]);
            continue;
        }
        // Sort the documents of the surviving clusters by guid.
        sort($def_cluster[$key]->array_docs);
    }
    return $def_cluster;
}
/**
 * Groups documents into clusters by repeatedly merging pairs of clusters and,
 * except for 'replinks', passes the result through get_positive_features().
 *
 * Every key of the stored document/document matrix for $type starts in its
 * own cluster. During a pass each cluster that has not been merged yet looks
 * at the remaining clusters in order of decreasing correlation() and merges
 * with the best one, provided that correlation is non-zero and not lower than
 * the quality_index() of either cluster. Passes repeat until one produces no
 * merge or the pass limit is exceeded. Clusters that still hold exactly one
 * document are dropped.
 *
 * Reads the globals $IOdir (prefix of the matrix files) and $usern (passed to
 * every Cluster that is created) and echoes a progress line.
 *
 * @param string $type Matrix to cluster: 'metadata', 'tags', 'uses' or 'replinks'.
 *
 * @return array For 'replinks' the merged clusters themselves; for the other
 *               types whatever get_positive_features() returned for them. In
 *               both cases each entry's array_docs is sorted ascending.
 *               Empty array for an unrecognised $type.
 */
function clusterize_aggregative($type)
{
    global $IOdir;
    global $usern;

    // 'replinks' has no document/term matrix.
    $docterm = null;
    switch ($type) {
        case 'metadata':
            $docterm = unserialize(file_get_contents($IOdir . 'metadata_dt'));
            $docdoc = unserialize(file_get_contents($IOdir . 'metadata_dd'));
            break;
        case 'tags':
            $docterm = unserialize(file_get_contents($IOdir . 'tags_dt'));
            $docdoc = unserialize(file_get_contents($IOdir . 'tags_dd'));
            break;
        case 'uses':
            $docterm = unserialize(file_get_contents($IOdir . 'uses_dt'));
            $docdoc = unserialize(file_get_contents($IOdir . 'uses_dd'));
            break;
        case 'replinks':
            $docdoc = unserialize(file_get_contents($IOdir . 'replinks_dd'));
            break;
        default:
            // No matrix exists for an unknown type, so there is nothing to cluster.
            return array();
    }
    $docdoc = sort_matrix($docdoc);

    // Start with one single-document cluster per matrix row.
    $clusters = array();
    foreach ($docdoc as $g => $row) {
        $c = new Cluster($type, $usern, "aggregative", 1);
        $c->array_docs[] = $g;
        $clusters[] = $c;
    }

    $def_cluster = array();
    $it = 0;
    while (true) {
        $it++;
        $current = $clusters;
        $total = count($current);
        $next = array();
        // Ids of clusters already merged during this pass (id => true).
        $excluded = array();
        $changes = false;
        for ($i = 0; $i < $total; $i++) {
            if (isset($excluded[$current[$i]->id])) {
                continue;
            }
            // Correlation of cluster $i with every other cluster still available.
            $dist_v = array();
            for ($j = 0; $j < $total; $j++) {
                if ($i == $j || isset($excluded[$current[$j]->id])) {
                    continue;
                }
                $dist_v[$current[$j]->id] = correlation($current[$i], $current[$j], $docdoc);
            }
            // Best candidate first.
            arsort($dist_v);
            foreach ($dist_v as $id => $correlation) {
                $fuse = find_cluster_by_id($id, $current);
                // Position 0 is a valid match, so only a real "not found" is skipped.
                // NOTE(review): assumes find_cluster_by_id() returns the position in
                // $current, or false/null when nothing matches - confirm.
                if ($fuse === false || $fuse === null) {
                    continue;
                }
                $qi1 = quality_index($current[$i], $docdoc);
                $qi2 = quality_index($current[$fuse], $docdoc);
                if ($correlation == 0 || $correlation < $qi1 || $correlation < $qi2) {
                    // The best remaining candidate is not good enough; leave this cluster as it is.
                    break;
                }
                $c = new Cluster($type, $usern, "aggregative", 1);
                $c->array_docs = array_merge($current[$i]->array_docs, $current[$fuse]->array_docs);
                $next[] = $c;
                $excluded[$current[$fuse]->id] = true;
                $excluded[$current[$i]->id] = true;
                $changes = true;
                break;
            }
        }
        // Clusters that took part in no merge are carried over unchanged.
        foreach ($current as $remaining) {
            if (!isset($excluded[$remaining->id])) {
                $next[] = $remaining;
            }
        }
        $clusters = $next;
        // Stop when a pass merged nothing or once the pass counter has exceeded 50.
        if ($it > 50 || !$changes) {
            $def_cluster = $next;
            break;
        }
    }

    // Delete clusters that contain only one element.
    foreach ($def_cluster as $key => $cluster) {
        if (count($cluster->array_docs) == 1) {
            unset($def_cluster[$key]);
        }
    }

    echo "\nCalculating positive features for {$type}...\n";
    if ($type != 'replinks') {
        $array_cluster = get_positive_features($def_cluster, $docterm, $type);
    } else {
        // Without a document/term matrix the merged clusters are returned as they are.
        $array_cluster = $def_cluster;
    }

    // Sort clusters documents by guid.
    foreach ($array_cluster as $key => $cluster) {
        sort($array_cluster[$key]->array_docs);
    }
    return $array_cluster;
}