<?php
/**
 * K-means clustering with centroid and normalize value
 *
 * @see http://phpir.com/clustering
 */

// The constant name has to be a quoted string. A bare WITH_NORMALIZE is evaluated
// as a (not yet defined) constant before define() is even called, which throws an
// Error on PHP 8 and only worked through a warning-level fallback on older versions.
define('WITH_NORMALIZE', false);

$data = array(
    array(0.05, 0.95),
    array(0.1, 0.9),
    array(0.2, 0.8),
    array(0.25, 0.75),
    array(0.45, 0.55),
    array(0.5, 0.5),
    array(0.55, 0.45),
    array(0.85, 0.15),
    array(0.9, 0.1),
    array(0.95, 0.05),
);

// kMeans() is not defined in this part of the file.
var_dump(kMeans($data, 3, WITH_NORMALIZE));

/**
 * Create $k starting centroids bounded by the per-dimension range of $data.
 *
 * Every document is scanned once to record the largest and smallest value
 * seen under each dimension key; those bounds are then handed to
 * initialiseCentroid() once per requested centroid.
 *
 * @param array $data      Documents, each an array of dimension => value.
 *                         $data[0] must exist: its element count is used as
 *                         the number of dimensions.
 * @param int   $k         Number of centroids to create.
 * @param bool  $normalize Passed through unchanged to initialiseCentroid().
 *
 * @return array Centroids indexed 0 .. $k - 1, each being whatever
 *               initialiseCentroid() returns.
 */
function initialiseCentroids(array $data, $k, $normalize = false)
{
    $dimensions = count($data[0]);
    $centroids = array();
    $dimmax = array();
    $dimmin = array();

    foreach ($data as $document) {
        foreach ($document as $dim => $val) {
            // The first value seen for a dimension seeds both bounds.
            if (!isset($dimmax[$dim]) || $val > $dimmax[$dim]) {
                $dimmax[$dim] = $val;
            }
            if (!isset($dimmin[$dim]) || $val < $dimmin[$dim]) {
                $dimmin[$dim] = $val;
            }
        }
    }

    for ($i = 0; $i < $k; $i++) {
        $centroids[$i] = initialiseCentroid($dimensions, $dimmax, $dimmin, $normalize);
    }

    return $centroids;
}
<?php
$data = array(
    array(0.05, 0.95),
    array(0.1, 0.9),
    array(0.2, 0.8),
    array(0.25, 0.75),
    array(0.45, 0.55),
    array(0.5, 0.5),
    array(0.55, 0.45),
    array(0.85, 0.15),
    array(0.9, 0.1),
    array(0.95, 0.05),
);

// Let's normalise the input data: each 2-D point is handed to normaliseValue()
// (defined elsewhere) together with its Euclidean length.
foreach ($data as $key => $d) {
    $data[$key] = normaliseValue($d, sqrt($d[0] * $d[0] + $d[1] * $d[1]));
}

// kMeans() is not defined in this part of the file.
var_dump(kMeans($data, 3));

/**
 * Create $k starting centroids bounded by the per-dimension range of $data.
 *
 * Every document is scanned once to record the largest and smallest value
 * seen under each dimension key; those bounds are then handed to
 * initialiseCentroid() once per requested centroid.
 *
 * @param array $data Documents, each an array of dimension => value.
 *                    $data[0] must exist: its element count is used as the
 *                    number of dimensions.
 * @param int   $k    Number of centroids to create.
 *
 * @return array Centroids indexed 0 .. $k - 1, each being whatever
 *               initialiseCentroid() returns.
 */
function initialiseCentroids(array $data, $k)
{
    $dimensions = count($data[0]);
    $centroids = array();
    $dimmax = array();
    $dimmin = array();

    foreach ($data as $document) {
        foreach ($document as $dim => $val) {
            // The first value seen for a dimension seeds both bounds.
            if (!isset($dimmax[$dim]) || $val > $dimmax[$dim]) {
                $dimmax[$dim] = $val;
            }
            if (!isset($dimmin[$dim]) || $val < $dimmin[$dim]) {
                $dimmin[$dim] = $val;
            }
        }
    }

    for ($i = 0; $i < $k; $i++) {
        $centroids[$i] = initialiseCentroid($dimensions, $dimmax, $dimmin);
    }

    return $centroids;
}

// NOTE(review): only the signature of the next function is present here; its
// body is cut off, so it is kept as a comment instead of being guessed at.
// function initialiseCentroid($dimensions, $dimmax, $dimmin) {
// NOTE(review): $resultsIndex, $nonAggregatedTotalSet and $nonAggregated are used
// below but are not assigned anywhere in the visible code; they must be set
// before this point.
$resultsTfidf = getTfidf($resultsIndex, $nonAggregatedTotalSet);

// Normalise input data for centroids
$normalisedTfidf = normaliseTfidf($resultsTfidf);

/*
// test
foreach ( $normalisedTfidf as $docID => $value ) {
    foreach ( $value as $term => $tfidfValue ) {
        $normal .= "docId: " . $docID . " , term: " . $term . " , tf-idf: " . $tfidfValue . "<br />";
    }
}
*/

// Calculate centroids, map docs to them
$numClusters = 5;
$mapDocToClusterID = kMeans($normalisedTfidf, $numClusters);

// Get the clustered results list
$clusteredList = getClusteredResults($nonAggregated, $mapDocToClusterID);

// Disabled debugging code. It originally sits inside a block comment that is
// still open where the visible code ends, so it is kept here as line comments
// to leave this section syntactically closed.
//
// test
// $tfidfmax = array();
// $tfidfmin = array();
// $centroidsCoord = initialiseCentroids( $normalisedTfidf , $numClusters , $tfidfmax, $tfidfmin );
// foreach ( $tfidfmax as $term => $tfidfMax ) {
//     $maxmin .= $term . " => Max:" . $tfidfMax . " Min:" . $tfidfmin[$term] . "<br />";
//     if( $tfidfMax != $tfidfmin[$term] )
//         $maxmin .= "HOLY SHIT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! <br/>";
// }
// foreach ( $centroidsCoord as $clusterID => $value ) {