/**
 * Returns the single page response, or false if not found.
 *
 * The site config for $url may list one or more "single_page_link" XPath
 * expressions. Each is evaluated against the DOM built from $html, in order,
 * and the first one producing a non-empty link is used. That link is resolved
 * against $url and fetched, unless it resolves to $url itself.
 *
 * @param string $html HTML of the page currently being processed
 * @param string $url  URL of that page; used to look up the site config and
 *                     to resolve a relative single page link
 *
 * @return false|array From httpClient fetch; false when no config, no link,
 *                     the link equals $url, or the fetch status is 300 or above
 */
private function getSinglePage($html, $url)
{
    $this->logger->log('debug', 'Looking for site config files to see if single page link exists');
    $siteConfig = $this->configBuilder->buildFromUrl($url);

    // no single page found?
    if (empty($siteConfig->single_page_link)) {
        $this->logger->log('debug', 'No "single_page_link" config found');

        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new \DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions
    $singlePageUrl = null;

    foreach ($siteConfig->single_page_link as $pattern) {
        $elems = $xpath->evaluate($pattern, $readability->dom);

        if (is_string($elems)) {
            $candidate = trim($elems);

            // A string-typed XPath result is empty when nothing matched; keep
            // going so that the remaining patterns still get a chance.
            if ('' === $candidate) {
                continue;
            }

            $singlePageUrl = $candidate;
            break;
        }

        if (!$elems instanceof \DOMNodeList) {
            continue;
        }

        foreach ($elems as $item) {
            if ($item instanceof \DOMElement && $item->hasAttribute('href')) {
                $href = $item->getAttribute('href');

                // An empty href is not a usable link; look at the next node.
                if ('' !== $href) {
                    $singlePageUrl = $href;
                    break 2;
                }
            } elseif ($item instanceof \DOMAttr && $item->value) {
                $singlePageUrl = $item->value;
                break 2;
            }
        }
    }

    if (!$singlePageUrl) {
        $this->logger->log('debug', 'No url found');

        return false;
    }

    // try to resolve against $url
    $singlePageUrl = $this->makeAbsoluteStr($url, $singlePageUrl);

    // check it's not what we have already!
    if (false !== $singlePageUrl && $singlePageUrl !== $url) {
        // it's not, so let's try to fetch it...
        $response = $this->httpClient->fetch($singlePageUrl, false, $siteConfig->http_header);

        if ($response['status'] < 300) {
            $this->logger->log('debug', 'Single page content found with url', ['url' => $singlePageUrl]);

            return $response;
        }
    }

    $this->logger->log('debug', 'No content found with url', ['url' => $singlePageUrl]);

    return false;
}
/**
 * Building the same host twice must give an equal config the second time,
 * after the first result has been stored with addToCache().
 */
public function testBuildWithCachedVersion()
{
    $configBuilder = new ConfigBuilder(array('site_config' => array(dirname(__FILE__) . '/../fixtures/site_config')));

    // First build, then store the result under its own cache key.
    $res = $configBuilder->build('fr.wikipedia.org');
    $this->assertInstanceOf('Graby\\SiteConfig\\SiteConfig', $res);

    $configBuilder->addToCache($res->cache_key, $res);

    // Second build for the same host: check the type of *this* result
    // (the check previously re-tested $res and never looked at $res2).
    $res2 = $configBuilder->build('fr.wikipedia.org');
    $this->assertInstanceOf('Graby\\SiteConfig\\SiteConfig', $res2);

    $this->assertEquals($res, $res2, 'Config retrieve from cache');
}