<?php

/**
 * K-means clustering with centroid initialisation and optional value normalisation
 *
 * @see   http://phpir.com/clustering
 * @see   
 */
// Flag handed to kMeans() below. The constant name must be a quoted string:
// a bare WITH_NORMALIZE is itself a lookup of a constant that does not exist
// yet, which raises an Error on PHP 8 (and a notice/warning on older versions).
define('WITH_NORMALIZE', false);
// Ten sample 2-D points; the two coordinates of every point sum to 1.
$data = array(array(0.05, 0.95), array(0.1, 0.9), array(0.2, 0.8), array(0.25, 0.75), array(0.45, 0.55), array(0.5, 0.5), array(0.55, 0.45), array(0.85, 0.15), array(0.9, 0.1), array(0.95, 0.05));
// kMeans() is defined outside this excerpt; 3 is presumably the cluster count.
var_dump(kMeans($data, 3, WITH_NORMALIZE));
/**
 * Builds $k starting centroids for the given data set.
 *
 * Scans every document to record the per-dimension minimum and maximum, then
 * delegates the creation of each centroid to initialiseCentroid().
 *
 * NOTE(review): the dimension count is read from $data[0], so this presumably
 * expects a non-empty, zero-indexed list of documents - confirm against callers.
 *
 * @param array $data      Documents; each one is iterated as dimension => value.
 * @param int   $k         Number of centroids to create.
 * @param bool  $normalize Passed straight through to initialiseCentroid().
 *
 * @return array Centroids indexed 0..$k-1, as returned by initialiseCentroid().
 */
function initialiseCentroids(array $data, $k, $normalize = false)
{
    // Dimension count is taken from the first document only.
    $dimensions = count($data[0]);
    $centroids = array();
    // Largest / smallest value seen so far, keyed by dimension.
    $dimmax = array();
    $dimmin = array();
    foreach ($data as $document) {
        foreach ($document as $dim => $val) {
            // The isset() guard seeds each bound with the first value seen for that dimension.
            if (!isset($dimmax[$dim]) || $val > $dimmax[$dim]) {
                $dimmax[$dim] = $val;
            }
            if (!isset($dimmin[$dim]) || $val < $dimmin[$dim]) {
                $dimmin[$dim] = $val;
            }
        }
    }
    // Every centroid is created from the same dimension count and bounds.
    for ($i = 0; $i < $k; $i++) {
        $centroids[$i] = initialiseCentroid($dimensions, $dimmax, $dimmin, $normalize);
    }
    return $centroids;
<?php

// Sample input: ten 2-D points.
$data = array(
    array(0.05, 0.95),
    array(0.1, 0.9),
    array(0.2, 0.8),
    array(0.25, 0.75),
    array(0.45, 0.55),
    array(0.5, 0.5),
    array(0.55, 0.45),
    array(0.85, 0.15),
    array(0.9, 0.1),
    array(0.95, 0.05),
);
// Let's normalise the input data: each point is handed to normaliseValue()
// together with its Euclidean length.
foreach ($data as $index => $point) {
    $length = sqrt($point[0] * $point[0] + $point[1] * $point[1]);
    $data[$index] = normaliseValue($point, $length);
}
// kMeans() is defined outside this excerpt; dump whatever it returns.
var_dump(kMeans($data, 3));
/**
 * Builds $k starting centroids for the given data set.
 *
 * Scans every document to record the per-dimension minimum and maximum, then
 * delegates the creation of each centroid to initialiseCentroid().
 *
 * The dimension count is taken from the first document in iteration order, so
 * the data set may be keyed by anything (for example document IDs) and need not
 * contain an element at key 0. An empty data set yields a dimension count of 0
 * instead of an undefined-offset failure.
 *
 * @param array $data Documents; each one is iterated as dimension => value.
 * @param int   $k    Number of centroids to create.
 *
 * @return array Centroids indexed 0..$k-1, as returned by initialiseCentroid().
 */
function initialiseCentroids(array $data, $k)
{
    // $data is a by-value copy, so moving its internal pointer with reset()
    // cannot affect the caller. reset() returns false for an empty array.
    $first = reset($data);
    $dimensions = ($first === false) ? 0 : count($first);

    // Largest / smallest value seen so far, keyed by dimension.
    $dimmax = array();
    $dimmin = array();
    foreach ($data as $document) {
        foreach ($document as $dim => $val) {
            // The isset() guard seeds each bound with the first value seen for that dimension.
            if (!isset($dimmax[$dim]) || $val > $dimmax[$dim]) {
                $dimmax[$dim] = $val;
            }
            if (!isset($dimmin[$dim]) || $val < $dimmin[$dim]) {
                $dimmin[$dim] = $val;
            }
        }
    }

    // Every centroid is created from the same dimension count and bounds.
    $centroids = array();
    for ($i = 0; $i < $k; $i++) {
        $centroids[$i] = initialiseCentroid($dimensions, $dimmax, $dimmin);
    }
    return $centroids;
}
function initialiseCentroid($dimensions, $dimmax, $dimmin)
{
Example #3
0
 // getTfidf() and its inputs are defined outside this excerpt; presumably this
 // yields tf-idf weights for the indexed results - TODO confirm.
 $resultsTfidf = getTfidf($resultsIndex, $nonAggregatedTotalSet);
 // Normalise input data for centroids
 $normalisedTfidf = normaliseTfidf($resultsTfidf);
 /*
  * Disabled debug dump: walks $normalisedTfidf as docID => (term => tf-idf value)
  * and appends one line per entry to $normal.
 foreach ( $normalisedTfidf as $docID => $value )
 {
 	foreach ( $value as $term => $tfidfValue )
 	{
 		$normal .= "docId: " . $docID . " , term: " . $term . " , tf-idf: " . $tfidfValue . "<br />";
 	}
 }
 */
 // Calculate centroids, map docs to them (the cluster count is fixed at 5 here)
 $numClusters = 5;
 $mapDocToClusterID = kMeans($normalisedTfidf, $numClusters);
 // Get the clustered results list from the kMeans() mapping
 $clusteredList = getClusteredResults($nonAggregated, $mapDocToClusterID);
 /*				
 // test	
 $tfidfmax = array();
 $tfidfmin = array();
 $centroidsCoord = initialiseCentroids( $normalisedTfidf , $numClusters , $tfidfmax, $tfidfmin );				
 foreach ( $tfidfmax as $term => $tfidfMax )
 {
 	$maxmin .= $term .  " => Max:" . $tfidfMax . " Min:" . $tfidfmin[$term] . "<br />";
 	if( $tfidfMax != $tfidfmin[$term] )
 		$maxmin .= "HOLY SHIT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! <br/>";
 }
 foreach ( $centroidsCoord as $clusterID => $value )
 {