#!/usr/bin/php <?php // read the README require "phpcluster/k-means.php"; $c = new Kmeans(); $c->setCentroids(1000); $c->setThreshold(50); echo "Loading entries\n"; $foo = 0; $title = array(); foreach (glob("data-abc/*") as $file) { $rss = unserialize(file_get_contents($file)); if (!isset($rss['link']) && isset($rss['guid'])) { $rss['link'] = $rss['guid']; } if (!isset($rss['title']) || !isset($rss['link']) || !isset($rss['description'])) { continue; } if (trim($rss['title']) == "") { continue; } if (isset($title[$rss['title']])) { continue; } $title[$rss['title']] = true; /* transform id to link */ $id = substr(trim($rss['link']), 5); $id = substr($id, 0, strlen($id) - 5); $link = "http://www.abc.com.py/imprimir.php?pid={$id}"; /* split the calc in chunks of 20,000 news is a */ /* a good idea, since large files will take a lot */
/* NOTE(review): this chunk is truncated — the foreach over data-abc/* is never
   closed in the visible source, so the script cannot be reformatted or rewritten
   without guessing the missing tail. Kept byte-identical. */
/* NOTE(review): as collapsed onto this single line, the `// read the README`
   comment after `<?php` swallows the entire rest of the line; in the original
   multi-line layout each statement presumably stood on its own line — confirm
   against the original file before running this form. */
/* SECURITY(review): unserialize(file_get_contents($file)) executes PHP object
   deserialization on file contents — unsafe if any data-abc/* file is untrusted;
   prefer json_decode for plain data. Verify the provenance of data-abc/*. */
/* NOTE(review): $foo is assigned 0 here and never read again in the visible
   span — likely dead; cannot confirm without the truncated remainder. */
<?php

/*
 * K-Means clustering smoke test.
 *
 * Builds a training set of 1000 random 2-D points — 500 drawn around
 * (100, 100) and 500 around (200, 100), each with spread 45 — clusters
 * them into $NC clusters with Euclidean distance and Euclidean centroids,
 * then renders the result to clusters.png. When draw_clusters() cannot
 * produce an image (e.g. GD unavailable — it returns a falsy value),
 * the raw cluster assignment is var_dump'ed instead.
 *
 * EuclideanPoint and draw_clusters() come from cluster_testing.php;
 * the NlpTools classes come from the project autoloader.
 */

include '../../../autoloader.php';
include '../../testing.php';
include '../cluster_testing.php';

use NlpTools\Clustering\KMeans;
use NlpTools\Similarity\Euclidean;
use NlpTools\Similarity\CosineSimilarity;
use NlpTools\Clustering\CentroidFactories\MeanAngle;
use NlpTools\Clustering\CentroidFactories\Euclidean as EuclidCF;
use NlpTools\Documents\TrainingSet;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Documents\Document;

$NC = 2; // number of clusters

// FIX: was `new Kmeans(...)`. Resolved class names are case-insensitive once
// the class is loaded, but a PSR-style autoloader receives the literal string
// "Kmeans" and may fail to locate KMeans.php on a case-sensitive filesystem.
// Use the exact casing of the imported alias. 0.001 is the convergence cutoff.
$clust = new KMeans($NC, new Euclidean(), new EuclidCF(), 0.001);

$tset = new TrainingSet();
// Two well-separated Gaussian-ish blobs so K-Means with K=2 has an
// unambiguous target; documents carry an empty class label ''.
for ($i = 0; $i < 500; $i++) {
    $tset->addDocument('', EuclideanPoint::getRandomPointAround(100, 100, 45));
}
for ($i = 0; $i < 500; $i++) {
    $tset->addDocument('', EuclideanPoint::getRandomPointAround(200, 100, 45));
}

list($clusters, $centroids, $distances) = $clust->cluster($tset, new DataAsFeatures());

// false => do not display interactively; draw_clusters returns a GD image
// resource/object on success, falsy on failure.
$im = draw_clusters($tset, $clusters, $centroids, false);
if ($im) {
    imagepng($im, 'clusters.png');
} else {
    var_dump($clusters);
}