/**
 * Checks the markup returned by TikaWrapper::getHTML() for sample1.txt.
 *
 * The returned string is split on newline characters and each of the
 * first ten lines, once trimmed, is compared with the expected markup
 * in order.
 */
public function testGetHTML()
{
    $actualLines = preg_split('/\\n/', TikaWrapper::getHTML(test_files_path() . 'sample1.txt'));

    // Expected output, one entry per line, in the order it must appear.
    $expectedLines = array(
        '<html xmlns="http://www.w3.org/1999/xhtml">',
        '<head>',
        '<meta name="Content-Length" content="119"/>',
        '<meta name="Content-Encoding" content="ISO-8859-1"/>',
        '<meta name="Content-Type" content="text/plain; charset=ISO-8859-1"/>',
        '<meta name="resourceName" content="sample1.txt"/>',
        '<title></title>',
        '</head>',
        '<body><p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam blandit blandit augue, eu tristique arcu tincidunt et.</p>',
        '</body></html>',
    );

    // Surrounding whitespace is stripped from each actual line before comparing.
    foreach ($expectedLines as $lineIndex => $expectedLine) {
        $this->assertEquals($expectedLine, trim($actualLines[$lineIndex]));
    }
}