/**
 * Ad-hoc inspection run of ContentExtractor against the dom_document.html fixture.
 *
 * Executes the extractor on the fixture, computes the XPath and scanned text of the
 * extracted node, and dumps those values plus the extractor's public diagnostics
 * through d(). No assertions are made.
 */
public function testDomDocument() {
    // Pre-converting the markup with mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')
    // is currently disabled; the fixture is passed to the extractor as loaded.
    $html = $this->loadDat('dom_document.html');

    $extractor = new ContentExtractor();
    $extractor->exec($html);

    $xpath = $extractor->calculateXpath();
    $text = $extractor->scan($extractor->getExtractedNode());

    // Result of the extraction.
    // NOTE(review): the expressions passed to d() are left untouched in case the dump
    // helper labels its output by the source expression -- confirm before refactoring.
    d($extractor->getExtractedNode()->nodeName);
    d($text);
    d($xpath);

    // Public state exposed by the extractor.
    d($extractor->params);
    d($extractor->text);
    d($extractor->title);

    // Document-wide totals, each prefixed with its property name.
    d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
    d('domCountAll:' . $extractor->domCountAll);
    d('textLengthAll:' . $extractor->textLengthAll);
    d('textAll:' . $extractor->textAll);

    // Multibyte length checks: one with an explicit encoding, one relying on the default.
    d(mb_strlen('あ', 'utf-8'));
    d(mb_strlen('ほげ '));
}
/**
 * Verifies that the XPath computed for each fixture page is one of its accepted candidates.
 *
 * Fixtures live under PATH_TEST/dat/ContentExtractor. Every key yielded by getData() is
 * used as the fixture file name; the matching entry supplies `xpathCandidates` (the
 * accepted XPath strings) and `url` (used in failure messages). Keys that resolve to a
 * directory are skipped.
 *
 * When no candidate matches, the extractor's diagnostics are dumped through d() and the
 * scanned text is written to ./test2.txt before the assertion fails.
 */
public function testXpath() {
    $datPath = implode('/', [PATH_TEST, 'dat', 'ContentExtractor']);

    foreach ($this->getData() as $name => $testData) {
        // The data key doubles as the fixture file name holding the page to analyse.
        $path = implode('/', [$datPath, $name]);
        if (is_dir($path)) {
            continue;
        }

        // Fail with an explicit message instead of feeding `false` to the extractor
        // when the fixture is missing or unreadable.
        $html = is_readable($path) ? file_get_contents($path) : false;
        $this->assertTrue($html !== false, 'Could not read fixture: ' . $path);

        $extractor = new ContentExtractor();
        $extractor->exec($html);
        $xpath = $extractor->calculateXpath();
        $text = $extractor->scan($extractor->getExtractedNode());

        // Strict comparison against each accepted candidate; stop at the first match.
        $hit = false;
        foreach ($testData->xpathCandidates as $xpathExpected) {
            if ($xpathExpected === $xpath) {
                $hit = true;
                break;
            }
        }

        if (!$hit) {
            // Dump everything useful for diagnosing the mismatch before failing.
            d($text);
            file_put_contents('./test2.txt', $text);
            d($extractor->getExtractedNode()->nodeName);
            d($xpath);
            d($testData->url);
            d($extractor->params);
            d('pancutuationCountAll:' . $extractor->pancutuationCountAll);
            d('domCountAll:' . $extractor->domCountAll);
            d('textLengthAll:' . $extractor->textLengthAll);
            d('textAll:' . $extractor->textAll);
        }

        $this->assertTrue($hit, $xpath . ' ' . $testData->url);
    }
}