require_once 'utils/BarPageBuilder.php';
/**
 *  Get some text from the internet.
 *  We will use Tom Sawyer from Project Gutenberg:
 *  http://www.gutenberg.org/cache/epub/74/pg74.txt
 * 
 */
/**
 * Load the text of the book from the local data directory.
 *
 * @var string $book
 */
$book = file_get_contents('data/books/pg74.txt');
if ($book === false) {
    // file_get_contents() reports failure by returning false; stop here
    // instead of tokenizing a non-string and charting an empty result.
    throw new \RuntimeException('Unable to read data/books/pg74.txt');
}
/**
 *  Create a tokenizer object to parse the book into a set of tokens
 */
$tokenizer = new \TextAnalysis\Tokenizers\GeneralTokenizer();
/**
 * Get the set of tokens generated by the tokenizer and build a
 * frequency distribution from them
 */
$tokens = $tokenizer->tokenize($book);
$freqDist = new \TextAnalysis\Analysis\FreqDist($tokens);
/**
 * Get the top 10 most used words in Tom Sawyer.
 *
 * array_slice() is used rather than array_splice(): array_splice() takes its
 * array argument by reference, so passing a method's return value directly
 * raises "Only variables should be passed by reference". Nothing needs to be
 * removed from the source array here, only the first 10 entries read.
 *
 * NOTE(review): assumes getKeyValuesByFrequency() returns entries ordered
 * from most to least frequent - confirm against FreqDist.
 */
$top10 = array_slice($freqDist->getKeyValuesByFrequency(), 0, 10);
/** 
 * Use High Charts to visualize the data
 */
$pageBuilder = new BarPageBuilder($top10);
$html = $pageBuilder->getHtmlPage();
/**
 * An example of creating a document collection
 * Document Collections allow you to work with a group of documents easily
 */
require_once 'vendor/autoload.php';
//used to generate a chart from the output of PHP Text Analysis
require_once 'utils/BarPageBuilder.php';
/**
 * Raw text of the two books that will make up the document collection.
 *
 * @var string $tomSawyerBook
 * @var string $huckFinnBook
 */
$tomSawyerBook = file_get_contents('data/books/pg74.txt');
$huckFinnBook = file_get_contents('data/books/pg76.txt');
if ($tomSawyerBook === false || $huckFinnBook === false) {
    // file_get_contents() reports failure by returning false; stop here
    // instead of building documents from a non-string.
    throw new \RuntimeException('Unable to read the book files under data/books/');
}
/**
 *  Create a tokenizer object to parse each book into a set of tokens
 */
$tokenizer = new \TextAnalysis\Tokenizers\GeneralTokenizer();
/**
 * Get the set of tokens generated by the tokenizer for each book and
 * create a token document from those tokens
 */
$tomSawyerDocument = new \TextAnalysis\Documents\TokensDocument($tokenizer->tokenize($tomSawyerBook));
$huckFinnDocument = new \TextAnalysis\Documents\TokensDocument($tokenizer->tokenize($huckFinnBook));
/**
 * Create a document collection that can have filters or further analysis done
 */
$docCollection = new \TextAnalysis\Collections\DocumentArrayCollection(array($tomSawyerDocument, $huckFinnDocument));
/**
 *  Apply filters to the document collection:
 *  lowercase the documents, remove quotes, and remove stop words
 */