/**
 * Replaces the item body with content scraped from the item's own URL.
 *
 * For every regex => XPath pair in $this->regexXPathPair whose regex matches
 * the item URL, the page is fetched, converted to HTML entities using the
 * charset announced in the content-type header (when there is one), and
 * queried with the XPath expression. A non-empty result becomes the new body.
 *
 * @param \OCA\News\Db\Item $item
 * @return \OCA\News\Db\Item enhanced item
 */
public function enhance(Item $item) {
    foreach ($this->regexXPathPair as $regex => $search) {
        if (!preg_match($regex, $item->getUrl())) {
            continue;
        }

        $file = $this->getFile($item->getUrl());
        $body = $file->body;

        // The header may be missing altogether; treat that as "no charset".
        $contentType = isset($file->headers['content-type'])
            ? $file->headers['content-type']
            : '';

        // convert encoding by detecting charset from header
        if (is_string($contentType)
            && preg_match('/(?<=charset=)[^;]*/', $contentType, $matches)
        ) {
            // Servers may quote the value (charset="utf-8"); mbstring wants
            // the bare encoding name.
            $charset = trim($matches[0], " \t\"'");

            if ($charset !== '') {
                try {
                    $converted = mb_convert_encoding(
                        $body, 'HTML-ENTITIES', $charset
                    );
                } catch (\ValueError $e) {
                    // PHP 8 throws on encoding names mbstring does not know
                    $converted = false;
                }

                // On failure keep the raw body, exactly as if no charset had
                // been announced.
                if (is_string($converted)) {
                    $body = $converted;
                }
            }
        }

        // DOMDocument::loadHTML() throws a ValueError on an empty source in
        // PHP 8 (the @ operator does not silence exceptions), and there is
        // nothing to extract from an empty page anyway.
        if (!is_string($body) || trim($body) === '') {
            return $item;
        }

        $dom = new \DOMDocument();
        // @ hides the libxml warnings that real-world markup triggers
        @$dom->loadHTML($body);

        $xpath = new \DOMXpath($dom);
        $xpathResult = $xpath->evaluate($search);

        // in case it wasn't a text query, hand the result to domToString()
        // to get a string
        if (!is_string($xpathResult)) {
            $xpathResult = $this->domToString($xpathResult);
        }

        // convert all relative to absolute URLs
        $xpathResult = $this->substituteRelativeLinks(
            $xpathResult, $item->getUrl()
        );

        if ($xpathResult) {
            $item->setBody($xpathResult);
        }
    }

    return $item;
}
/**
 * Replaces the item body with content scraped from the item's own URL.
 *
 * For every regex => XPath pair in $this->regexXPathPair whose regex matches
 * the item URL, the page is fetched, its encoding is determined (declared in
 * a <meta> tag, otherwise guessed by mb_detect_encoding(), otherwise UTF-8),
 * the markup is converted to HTML entities and queried with the XPath
 * expression. A non-empty result becomes the new body.
 *
 * @param \OCA\News\Db\Item $item
 * @return \OCA\News\Db\Item enhanced item
 */
public function enhance(Item $item){
    foreach($this->regexXPathPair as $regex => $search) {
        if(!preg_match($regex, $item->getUrl())) {
            continue;
        }

        $body = $this->getFile($item->getUrl());

        // First check if either <meta charset="..."> or
        // <meta http-equiv="Content-Type" ...> is specified and use it
        // If this fails use mb_detect_encoding()
        //
        // Both capture groups stop at the first quote. A greedy [^>]* would
        // run on to the last quote in the tag and swallow any attributes
        // that follow content="...", producing a bogus encoding name.
        $charsetRegex = '/<meta\s+[^>]*(?:charset\s*=\s*[\'"]([^>\'"]*)[\'"]' .
            '|http-equiv\s*=\s*[\'"]content-type[\'"]\s+[^>]*' .
            'content\s*=\s*[\'"][^>]*charset=([^>\'"]*)[\'"])[^>]*>' .
            '/i';

        $enc = '';
        if(preg_match($charsetRegex, $body, $matches)) {
            // the charset is always in the last group that participated
            $enc = strtoupper(trim($matches[count($matches) - 1]));
        }
        if($enc === '') {
            $enc = mb_detect_encoding($body);
        }
        $enc = $enc ? $enc : 'UTF-8';

        try {
            $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
        } catch(\ValueError $e) {
            // PHP 8 throws on encoding names mbstring does not know; older
            // versions return false. Treat both the same way.
            $body = false;
        }

        // nothing usable was fetched or converted: leave the item untouched
        if(!is_string($body) || trim($body) === '') {
            return $item;
        }

        $dom = new \DOMDocument();
        // @ hides the libxml warnings that real-world markup triggers
        @$dom->loadHTML($body);

        $xpath = new \DOMXpath($dom);
        $xpathResult = $xpath->evaluate($search);

        // in case it wasn't a text query assume it's a dom element and
        // convert it to text
        if(!is_string($xpathResult)) {
            $xpathResult = $this->domToString($xpathResult);
        }

        $xpathResult = trim($xpathResult);

        // convert all relative to absolute URLs
        $xpathResult = $this->substituteRelativeLinks(
            $xpathResult, $item->getUrl()
        );

        if($xpathResult) {
            $item->setBody($xpathResult);
        }
    }

    return $item;
}
/**
 * Applies every search => replacement pattern in $this->regexPair to the
 * item body, provided the item URL matches $this->matchArticleUrl.
 *
 * @param \OCA\News\Db\Item $item
 * @return \OCA\News\Db\Item enhanced item
 */
public function enhance(Item $item) {
    if (!preg_match($this->matchArticleUrl, $item->getUrl())) {
        return $item;
    }

    $body = $item->getBody();

    foreach ($this->regexPair as $search => $replaceWith) {
        $replaced = preg_replace($search, $replaceWith, $body);

        // preg_replace() returns null when the pattern fails (for example
        // when the backtrack limit is hit). Keep the body from before this
        // pattern instead of wiping the article.
        if ($replaced !== null) {
            $body = $replaced;
        }
    }

    $item->setBody($body);

    return $item;
}
/**
 * An absolute image URL and a mailto: link in the fetched page must come
 * back with their targets unchanged; the expected output differs from the
 * input only by target="_blank" on the anchor and the dropped wrapper markup.
 */
public function testDontTransformAbsoluteUrlsAndMails() {
    // the article whose body is going to be replaced
    $item = new Item();
    $item->setUrl('https://www.explosm.net/all/312');
    $item->setBody('Hello thar');
    $url = $item->getUrl();

    // canned HTTP response handed out by the mocked file factory
    $response = new \stdClass();
    $response->headers = array("content-type" => "text/html; charset=utf-8");
    $response->body = '<html> <body> <img src="http://www.url.com/absolute/url.png"> <a href="mailto:test@testsite.com">mail</a> </body> </html>';

    // the page must be requested exactly once, with the configured options
    $this->fileFactory
        ->expects($this->once())
        ->method('getFile')
        ->with(
            $this->equalTo($url),
            $this->equalTo($this->timeout),
            $this->equalTo($this->redirects),
            $this->equalTo($this->headers),
            $this->equalTo($this->userAgent)
        )
        ->will($this->returnValue($response));

    $enhanced = $this->testEnhancer->enhance($item);

    $expectedBody = '<img src="http://www.url.com/absolute/url.png"><a target="_blank" href="mailto:test@testsite.com">mail</a>';
    $this->assertEquals($expectedBody, $enhanced->getBody());
}
/**
 * An absolute image URL and a mailto: link in the fetched page must come
 * back with their targets unchanged; the expected output wraps them in a
 * <div> and adds target="_blank" rel="noreferrer" to the anchor.
 */
public function testDontTransformAbsoluteUrlsAndMails() {
    // the article whose body is going to be replaced
    $item = new Item();
    $item->setUrl('https://www.explosm.net/all/312');
    $item->setBody('Hello thar');

    // page content served for the item URL
    // (presumably setUpFile() wires up the mocked download - see that helper)
    $charset = 'utf-8';
    $html = '<html> <body> <img src="http://www.url.com/absolute/url.png"> <a href="mailto:test@testsite.com">mail</a> </body> </html>';
    $this->setUpFile($html, $charset, $item->getUrl());

    $enhanced = $this->testEnhancer->enhance($item);

    $expectedBody = '<div>' .
        '<img src="http://www.url.com/absolute/url.png">' .
        '<a target="_blank" rel="noreferrer" href="mailto:test@testsite.com">mail</a>' .
        '</div>';
    $this->assertEquals($expectedBody, $enhanced->getBody());
}