require_once 'utils/BarPageBuilder.php';

/**
 * Load the text to analyse.
 *
 * This example uses Tom Sawyer from Project Gutenberg; the local file is
 * expected to be a copy of
 * http://www.gutenberg.org/cache/epub/74/pg74.txt
 *
 * @var string|false $book
 */
$book = file_get_contents('data/books/pg74.txt');
if ($book === false) {
    // Fail loudly rather than tokenizing `false` and charting meaningless data.
    throw new \RuntimeException('Unable to read data/books/pg74.txt');
}

/**
 * Create a tokenizer object to parse the book into a set of tokens
 */
$tokenizer = new \TextAnalysis\Tokenizers\GeneralTokenizer();

/**
 * Tokenize the book and build a frequency distribution from the tokens
 */
$tokens = $tokenizer->tokenize($book);
$freqDist = new \TextAnalysis\Analysis\FreqDist($tokens);

/**
 * Get the top 10 most used words in Tom Sawyer.
 *
 * array_slice() is used instead of array_splice(): array_splice() takes its
 * first argument by reference, so passing a method's return value raises
 * "Only variables should be passed by reference". Passing true for
 * preserve_keys also keeps purely numeric tokens as keys instead of
 * renumbering them from 0.
 *
 * NOTE(review): assumes getKeyValuesByFrequency() returns an array ordered
 * most-frequent first - confirm against FreqDist.
 */
$top10 = array_slice($freqDist->getKeyValuesByFrequency(), 0, 10, true);

/**
 * Use High Charts to visualize the data
 */
$pageBuilder = new BarPageBuilder($top10);
$html = $pageBuilder->getHtmlPage();
* An example of creating a document collection.
* Document collections allow you to work with a group of documents easily.
*/
require_once 'vendor/autoload.php';

// used to generate a chart from the output of PHP Text Analysis
require_once 'utils/BarPageBuilder.php';

/**
 * Load the raw text of both books from the local data directory.
 * A failed read throws instead of passing `false` on to the tokenizer.
 *
 * @var string|false $tomSawyerBook
 * @var string|false $huckFinnBook
 */
$tomSawyerBook = file_get_contents('data/books/pg74.txt');
if ($tomSawyerBook === false) {
    throw new \RuntimeException('Unable to read data/books/pg74.txt');
}

$huckFinnBook = file_get_contents('data/books/pg76.txt');
if ($huckFinnBook === false) {
    throw new \RuntimeException('Unable to read data/books/pg76.txt');
}

/**
 * Create a tokenizer object to parse the books into sets of tokens
 */
$tokenizer = new \TextAnalysis\Tokenizers\GeneralTokenizer();

/**
 * Tokenize each book and wrap the resulting tokens in a tokens document
 */
$tomSawyerDocument = new \TextAnalysis\Documents\TokensDocument($tokenizer->tokenize($tomSawyerBook));
$huckFinnDocument = new \TextAnalysis\Documents\TokensDocument($tokenizer->tokenize($huckFinnBook));

/**
 * Create a document collection that can have filters or further analysis done
 */
$docCollection = new \TextAnalysis\Collections\DocumentArrayCollection(array($tomSawyerDocument, $huckFinnDocument));

/**
 * Apply filters to the document collection:
 * lower case the documents, remove quotes and remove stop words
 */