Uses patterns specified in site config files and auto detection (hNews/PHP Readability) to extract content from HTML files.
Example #1
0
 /**
  * An iframe sitting next to the main text must still be present in the
  * extracted content, with "[embedded content]" as its inner text.
  *
  * The site config deliberately carries a non-matching first body pattern
  * and an unknown parser name.
  */
 public function testIframeEmbeddedContent()
 {
     // Opening tag shared by the input markup and the expected output.
     $iframeOpeningTag = '<iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320">';

     $html = '<div>' . str_repeat('this is the best part of the show', 10) . '</div>'
         . '<div class="video_player">' . $iframeOpeningTag . '</iframe></div>';

     $extractor = new ContentExtractor(self::$contentExtractorConfig);

     $siteConfig = new SiteConfig();
     // '//header' is a bad pattern here, so extraction will jump to the next one ('//div')
     $siteConfig->body = array('//header', '//div');
     // 'toto' is obviously a bad parser; it will be converted to use the default one
     $siteConfig->parser = 'toto';

     $this->assertTrue(
         $extractor->process($html, 'https://lemonde.io/35941909', $siteConfig),
         'Extraction went well'
     );

     // Serialize the extracted node through its owning document to inspect the markup.
     $extractedNode = $extractor->getContent();
     $this->assertContains(
         $iframeOpeningTag . '[embedded content]</iframe>',
         $extractedNode->ownerDocument->saveXML($extractedNode)
     );
 }