예제 #1
0
    /**
     * Runs ContentExtractor over the `dom_document.html` fixture and passes the
     * results to the d() helper for inspection.
     *
     * This method contains no assertions; everything it produces goes through d()
     * (presumably a debug-dump function defined elsewhere; confirm).
     */
    public function testDomDocument()
    {
        $markup = $this->loadDat('dom_document.html');
        // Disabled experiment: converting the markup with
        // mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8') before extraction.
        $extractor = new ContentExtractor();
        $extractor->exec($markup);
        $xpath = $extractor->calculateXpath();
        $text = $extractor->scan($extractor->getExtractedNode());

        // Extracted node name, the text scanned from that node, and the computed xpath.
        d($extractor->getExtractedNode()->nodeName);
        d($text);
        d($xpath);

        // Public state exposed by the extractor after the run.
        d($extractor->params);
        d($extractor->text);
        d($extractor->title);

        // The four "...All" properties, each dumped as "<property name>:<value>".
        // 'pancutuationCountAll' is the property's actual spelling; do not correct it here.
        foreach (['pancutuationCountAll', 'domCountAll', 'textLengthAll', 'textAll'] as $property) {
            d($property . ':' . $extractor->$property);
        }

        // mb_strlen probes: the first names the encoding explicitly; the second relies on
        // mbstring's default encoding and its literal spans a newline followed by tabs.
        d(mb_strlen('あ', 'utf-8'));
        d(mb_strlen('ほげ
				'));
    }
 /**
  * Checks, for every entry returned by getData(), that the xpath computed by
  * ContentExtractor is one of that entry's accepted candidates.
  *
  * Each getData() key is used as a file name under PATH_TEST/dat/ContentExtractor;
  * each value supplies `xpathCandidates` (the accepted xpaths) and `url` (used in
  * the failure message). Entries whose path is a directory are skipped.
  *
  * On a mismatch the scanned text is written to ./test2.txt and diagnostics are
  * passed to d() before the assertion fails.
  */
 public function testXpath()
 {
     $datPath = implode('/', [PATH_TEST, 'dat', 'ContentExtractor']);
     foreach ($this->getData() as $name => $testData) {
         // The data key doubles as the fixture file name holding the HTML
         // whose correct xpath candidates are listed in $testData.
         $path = implode('/', [$datPath, $name]);
         if (is_dir($path)) {
             continue;
         }
         $html = file_get_contents($path);
         if ($html === false) {
             // Report the unreadable fixture directly instead of feeding `false`
             // to the extractor and failing later with a misleading xpath mismatch.
             $this->fail('Could not read fixture file: ' . $path);
         }
         $extractor = new ContentExtractor();
         $extractor->exec($html);
         $xpath = $extractor->calculateXpath();
         // Scanned before the hit check, as in the original call order.
         // NOTE(review): only used for diagnostics below; confirm whether scan()
         // has side effects before moving it into the failure branch.
         $text = $extractor->scan($extractor->getExtractedNode());
         $hit = false;
         foreach ($testData->xpathCandidates as $xpathExpected) {
             if ($xpathExpected === $xpath) {
                 $hit = true;
                 break; // one strict match is enough
             }
         }
         if (!$hit) {
             // Diagnostics for the failing fixture; ./test2.txt is overwritten each time.
             d($text);
             file_put_contents('./test2.txt', $text);
             d($extractor->getExtractedNode()->nodeName);
             d($xpath);
             d($testData->url);
             d($extractor->params);
             d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
             d('domCountAll:' . $extractor->domCountAll);
             d('textLengthAll:' . $extractor->textLengthAll);
             d('textAll:' . $extractor->textAll);
             // Disabled: d('preProcessedInput:' . $extractor->preProcessedInput);
         }
         // The message carries the computed xpath and the source URL; appending
         // var_export($extractor->params, true) was previously tried and disabled.
         $this->assertTrue($hit, $xpath . ' ' . $testData->url);
     }
 }