/**
 * Run k-means: alternate between assigning documents to centroids and
 * recomputing the centroids until no document changes its centroid.
 *
 * All clustering work is delegated to helpers defined elsewhere
 * (initialiseCentroids, assignCentroids, updateCentroids, formatResults);
 * this function only drives the loop and detects convergence.
 *
 * @param mixed $data      Document collection, passed through unchanged to the helpers.
 * @param mixed $k         Number of clusters, passed to initialiseCentroids/updateCentroids.
 * @param bool  $normalize Forwarded to initialiseCentroids as its third argument.
 *
 * @return mixed Whatever formatResults($mapping, $data, $centroids) returns for the
 *               converged documentID => centroidID mapping.
 */
function kMeans($data, $k, $normalize = false)
{
    // Forward the caller's flag. Writing `$normalize = false` in the argument
    // list is an assignment expression and would always pass false.
    $centroids = initialiseCentroids($data, $k, $normalize);
    $mapping = array();

    while (true) {
        $new_mapping = assignCentroids($data, $centroids);

        // Converged only if EVERY document kept its centroid, so scan the whole
        // mapping instead of deciding on the first document alone.
        $changed = false;
        foreach ($new_mapping as $documentID => $centroidID) {
            if (!isset($mapping[$documentID]) || $centroidID != $mapping[$documentID]) {
                $changed = true;
                break;
            }
        }

        // Also reached when $new_mapping is empty, which terminates the loop
        // instead of iterating forever on empty input.
        if (!$changed) {
            return formatResults($mapping, $data, $centroids);
        }

        $mapping = $new_mapping;
        $centroids = updateCentroids($mapping, $data, $k);
    }
}
/**
 * Run k-means over the given documents and return the document-to-centroid mapping.
 *
 * The loop stops as soon as an assignment pass leaves every document on the
 * centroid it already had, or when the iteration cap is exhausted, whichever
 * comes first.
 *
 * @param mixed $normalisedTfidf Document vectors, passed through unchanged to the helpers
 *                               (presumably normalised tf-idf weights per document — confirm
 *                               against the caller).
 * @param mixed $numClusters     Number of clusters, passed to initialiseCentroids/updateCentroids.
 * @param int   $maxIterations   Upper bound on assignment passes. Defaults to 10, the value that
 *                               was previously hard-coded. At least one pass is always made, so
 *                               a value <= 1 yields exactly one pass.
 *
 * @return array Mapping with docID as key and centroidID as value.
 */
function kMeans($normalisedTfidf, $numClusters, $maxIterations = 10)
{
    // Initialise centroids (per the original note: key = term, value = tf-idf)
    $centroids = initialiseCentroids($normalisedTfidf, $numClusters);
    $mapDocToCentroid = array();

    while (true) {
        // Assign documents to centroids; result has docID for key and centroidID for value
        $newMapDocToCentroid = assignCentroids($normalisedTfidf, $centroids);

        // Stopping condition: no document moved to a different centroid
        $changed = false;
        foreach ($newMapDocToCentroid as $docID => $centroidID) {
            // A document that is new or that switched centroid means we adopt the new mapping
            if (!isset($mapDocToCentroid[$docID]) || $centroidID != $mapDocToCentroid[$docID]) {
                $mapDocToCentroid = $newMapDocToCentroid;
                $changed = true;
                break;
            }
        }

        // One pass consumed from the iteration budget (the alternative stopping condition)
        $maxIterations--;

        // `<= 0` rather than `== 0` so a non-positive cap cannot skip past zero
        // and leave the loop unbounded.
        if (!$changed || $maxIterations <= 0) {
            return $mapDocToCentroid;
        }

        // Update the centroids from the current assignment
        $centroids = updateCentroids($mapDocToCentroid, $normalisedTfidf, $numClusters);
    }
}