<?php

/*
 * Simple coword calculation, all in memory.
 *
 * WARNING: Because of PHP's very poor UTF-8 support, splitting tweets into
 * words only works reliably for tweets consisting solely of Latin characters
 * (i.e. English).
 */

// Disabled usage example: the condition is a constant 0, so this never runs.
// Change it to a truthy value to try the class out from the command line.
if (0) {
    $coword = new Coword();
    $coword->setDocuments(array("bla bla bla test test clash", "test test test bla bla", "clash bla test"));
    $coword->addDocument("bla, !test 345 cla5sh");
    $coword->iterate();
    var_export($coword->getCowordsAsCsv());
    print "\n\n";
    var_export($coword->getWordsAsCsv());
    print "\n";
}

/*
 * In-memory coword (word co-occurrence) calculator.
 *
 * Typical call order, as shown by the usage example above: supply documents
 * via setDocuments()/addDocument(), call iterate(), then read the results
 * with getCowordsAsCsv() / getWordsAsCsv().
 */
class Coword {

    // Input documents; the usage example passes plain strings.
    public $documents = array();

    // NOTE(review): presumably the punctuation characters stripped or split on
    // while tokenising -- confirm against the code that fills this array.
    public $punctuation = array();

    // Processing options. They have no default here (they start out as null);
    // their exact semantics are defined by the methods that read them.
    // NOTE(review): the names suggest tokenising/filtering switches -- confirm.
    public $hashtags_are_separate_words;
    public $extract_only_hashtags;
    public $remove_stop_words;
    public $min_word_length;
    public $min_word_frequency;

    // Each declaration below sits on its own line on purpose: a `//` comment
    // runs to the end of the physical line, so sharing a line would silently
    // comment out the declarations that follow it.
    public $words = array(); // holds word frequencies
    public $cowords = array(); // holds coword frequencies

    // NOTE(review): presumably word frequencies kept per document -- confirm.
    public $document_word_frequencies = array();