/**
 * Checks that TextRank (with the stopword listener attached) reports every
 * expected keyword for a given text.
 *
 * An expected word prefixed with '*' is optional: the prefix is stripped
 * before the lookup, and a failed assertion for that word is ignored
 * instead of failing the test.
 *
 * @dataProvider provider
 *
 * @param mixed $text     Input handed to getKeywords().
 * @param array $expected Words that must appear as keys of the keyword result;
 *                        a leading '*' marks a word as optional.
 */
public function testGetkeywords($text, $expected)
{
    $config = new \crodas\TextRank\Config();
    $config->addListener(new \crodas\TextRank\Stopword());

    $ranker   = new \crodas\TextRank\TextRank($config);
    $keywords = $ranker->getKeywords($text);

    foreach ($expected as $term) {
        // A leading '*' flags the term as optional.
        $optional = ($term[0] == '*');
        if ($optional) {
            $term = substr($term, 1);
        }

        try {
            $this->assertTrue(!empty($keywords[$term]), "cannot find \"{$term}\"");
        } catch (\Exception $failure) {
            if ($optional) {
                // Missing optional keywords are tolerated.
                continue;
            }

            throw $failure;
        }
    }
}
/**
 * Downloads the given pages, concatenates their text and extracts keywords
 * from it with TextRank.
 *
 * Pages are fetched one after another through a hard-coded HTTP proxy.
 * Fetching stops at the first page that cannot be retrieved (cURL failure or
 * an HTTP status other than 200/301/302); text collected from earlier pages
 * is still used. If fewer than 50 bytes of text were collected, no keyword
 * extraction is attempted.
 *
 * @param array $pages URLs to download.
 *
 * @return array Empty array when too little text was collected, otherwise
 *               whatever TextRank::getAllKeywordsSorted() returns.
 */
private function findKeywords($pages)
{
    // NOTE(review): the proxy is hard-coded; consider moving it to configuration.
    $proxy = 'kuzh.polytechnique.fr:8080';
    $text = '';

    foreach ($pages as $page) {
        $ch = curl_init();
        if ($ch === false) {
            // No handle could be created; treat it like any other fetch failure.
            break;
        }

        curl_setopt($ch, CURLOPT_URL, $page);
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10); // timeout in seconds

        $body = curl_exec($ch);
        $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);

        // Release the handle before deciding whether to continue, so that
        // bailing out of the loop never leaves it open.
        curl_close($ch);

        if ($body === false || !in_array($status, [200, 301, 302], true)) {
            break;
        }

        // html2txt() is defined elsewhere in this class; presumably it
        // reduces the HTML document to plain text.
        $text .= '. ' . $this->html2txt($body);
    }

    if (strlen($text) < 50) {
        return [];
    }

    // require_once: this method may run several times per request.
    require_once "TextRank/vendor/autoload.php";

    $config = new \crodas\TextRank\Config();
    $config->addListener(new \crodas\TextRank\Stopword());
    $textrank = new \crodas\TextRank\TextRank($config);

    // Re-encode UTF-8 to UTF-8, presumably to neutralise invalid byte
    // sequences before the /u regex below runs.
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');

    // Remove non-printable control characters (below ASCII 32), keeping
    // tab, line feed and carriage return.
    $cleaned = preg_replace('/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]/u', '', $text);
    if ($cleaned !== null) {
        // preg_replace() returns null on a PCRE error; keep the unfiltered
        // text in that case rather than passing null on.
        $text = $cleaned;
    }

    $text = html_entity_decode($text);

    return $textrank->getAllKeywordsSorted($text);
}