Ejemplo n.º 1
0
 /**
  * Verifies that TextRank (with the stop-word listener enabled) reports
  * every expected keyword for the given text.
  *
  * An expected word prefixed with "*" is optional: the assertion is still
  * executed for it (without the prefix), but a failure is ignored.
  *
  * @dataProvider provider
  */
 public function testGetkeywords($text, $expected)
 {
     $settings = new \crodas\TextRank\Config();
     $settings->addListener(new \crodas\TextRank\Stopword());
     $ranker = new \crodas\TextRank\TextRank($settings);
     $found = $ranker->getKeywords($text);

     foreach ($expected as $term) {
         // A leading "*" flags the word as optional; strip it before the lookup.
         $optional = $term[0] == '*';
         if ($optional) {
             $term = substr($term, 1);
         }

         try {
             $this->assertTrue(!empty($found[$term]), "cannot find \"{$term}\"");
         } catch (\Exception $failure) {
             // Only mandatory words are allowed to fail the test.
             if ($optional) {
                 continue;
             }
             throw $failure;
         }
     }
 }
Ejemplo n.º 2
0
 /**
  * Downloads the given pages through the proxy and extracts keywords from
  * their combined text with TextRank.
  *
  * Every page is fetched with cURL, converted with html2txt() and appended
  * to a single text buffer. Fetching stops at the first page whose final
  * HTTP status is not 200, 301 or 302; text collected so far is still used.
  *
  * @param array $pages URLs of the pages to download.
  *
  * @return array Empty array when fewer than 50 bytes of text were
  *               collected, otherwise the result of
  *               TextRank::getAllKeywordsSorted() (presumably an array of
  *               keywords ordered by score; confirm against the library).
  */
 private function findKeywords($pages)
 {
     $proxy = 'kuzh.polytechnique.fr:8080';
     $text = '';
     foreach ($pages as $page) {
         $ch = curl_init();
         curl_setopt($ch, CURLOPT_URL, $page);
         curl_setopt($ch, CURLOPT_PROXY, $proxy);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($ch, CURLOPT_HEADER, 0);
         // Both timeouts are expressed in seconds.
         curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
         curl_setopt($ch, CURLOPT_TIMEOUT, 10);
         $t = curl_exec($ch);
         $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
         // Release the handle before inspecting the result so that it is
         // not leaked when the loop is left early.
         curl_close($ch);
         if (!in_array($status, [200, 301, 302], true)) {
             // NOTE(review): this abandons all remaining pages, not just the
             // failing one; confirm that `continue` was not intended.
             break;
         }
         $text .= '. ' . $this->html2txt($t);
     }
     // Too little material to rank.
     if (strlen($text) < 50) {
         return [];
     }
     // require_once: this method may run more than once per request and the
     // autoloader only needs to be loaded a single time.
     require_once "TextRank/vendor/autoload.php";
     $config = new \crodas\TextRank\Config();
     $config->addListener(new \crodas\TextRank\Stopword());
     $textrank = new \crodas\TextRank\TextRank($config);
     // UTF-8 to UTF-8 round trip; presumably meant to get rid of invalid
     // byte sequences before the /u regex below.
     $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
     // Remove control characters below ASCII 32, keeping tab, line feed and
     // carriage return.
     $text = preg_replace('/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]/u', '', $text);
     $text = html_entity_decode($text);

     return $textrank->getAllKeywordsSorted($text);
 }