/**
 * Table-driven check of SweteTools::absoluteUrl().
 *
 * Each row supplies a base URL ('base'), a link as it would appear in a
 * document ('in'), and the absolute URL that the conversion is expected to
 * produce ('out').
 */
function testAbsoluteUrl()
{
    $cases = array(
        // Relative path against the site root.
        array('base' => 'http://solutions.weblite.ca/', 'in' => 'index.html', 'out' => 'http://solutions.weblite.ca/index.html'),
        // Root-relative path against the site root.
        array('base' => 'http://solutions.weblite.ca/', 'in' => '/index.html', 'out' => 'http://solutions.weblite.ca/index.html'),
        // Root-relative path ignores the directory part of the base.
        array('base' => 'http://solutions.weblite.ca/foo/', 'in' => '/index.html', 'out' => 'http://solutions.weblite.ca/index.html'),
        // Base with no trailing slash and no path.
        array('base' => 'http://solutions.weblite.ca', 'in' => 'index.html', 'out' => 'http://solutions.weblite.ca/index.html'),
        // Base with a path but no trailing slash is expected to act as a directory.
        array('base' => 'http://solutions.weblite.ca/foo', 'in' => 'index.html', 'out' => 'http://solutions.weblite.ca/foo/index.html'),
        // Base with a path and a trailing slash.
        array('base' => 'http://solutions.weblite.ca/foo/', 'in' => 'index.html', 'out' => 'http://solutions.weblite.ca/foo/index.html'),
        // An already-absolute link is expected back unchanged, scheme included.
        array('base' => 'https://foo.bar.com/foo/', 'in' => 'http://foo.bar.com/foo/index.html', 'out' => 'http://foo.bar.com/foo/index.html'),
        // A protocol-relative link is expected to take the scheme of the base.
        array('base' => 'https://foo.bar.com/foo/', 'in' => '//foo.bar.com/foo/index.html', 'out' => 'https://foo.bar.com/foo/index.html'),
        // The query string is expected to be carried over.
        array('base' => 'http://foo.bar.com/foo/', 'in' => 'index.html?go=1&bar=2', 'out' => 'http://foo.bar.com/foo/index.html?go=1&bar=2'),
    );

    foreach ($cases as $case) {
        $link = $case['in'];
        $baseUrl = $case['base'];

        $actual = SweteTools::absoluteUrl($link, $baseUrl);
        $failureMessage = 'Converting ' . $link . ', ' . $baseUrl;

        $this->assertEquals($case['out'], $actual, $failureMessage);
    }
}
/**
 * @private
 *
 * @brief Regex replacement callback applied to each url(...) reference while
 * converting CSS.
 *
 * The captured URL is resolved against $this->_currBase and handed to
 * processResource(); whatever that call returns is written in place of the
 * URL.  If an Exception is thrown while doing so, the matched text is left
 * exactly as it was found.
 *
 * @param array $match The match groups: [0] is the full matched text, [2] is
 *     the URL, and [1] / [3] are emitted immediately before / after the
 *     replacement value (presumably the optional quote characters -- confirm
 *     against the pattern that drives this callback).
 * @returns string The rewritten url(...) token, or $match[0] on failure.
 */
public function _cssCallback($match)
{
    // Resolution happens outside the try block, so a failure here propagates.
    $resolvedUrl = SweteTools::absoluteUrl($match[2], $this->_currBase);

    try {
        $replacement = $this->processResource($resolvedUrl, true);
        $rewritten = 'url(' . $match[1] . $replacement . $match[3] . ')';
    } catch (Exception $ex) {
        // Best effort: keep the original reference when it cannot be processed.
        $rewritten = $match[0];
    }

    return $rewritten;
}
/**
 * @brief Processes the HTML found at a specified URL.  It goes through all of the
 * links and adds them to the queue to be processed in later rounds.
 *
 * The page itself is registered as a node of type 'page'.  When
 * $this->loadContent is set, the raw HTML is attached to that node as well,
 * and when a delegate exposing loadContent() is present it is given the node
 * and the HTML.  Every a[href] link is then resolved against the document
 * base and, if it starts with the site URL, registered as a node too.
 *
 * The document base is the "directory" of $url (query string and fragment
 * ignored) unless the markup carries base[href] tags, in which case the last
 * non-empty one wins and is given a trailing slash.
 *
 * Markup that simple_html_dom cannot parse (str_get_html() returns false for
 * empty or oversized input) still gets its page node, but no links are
 * harvested from it.
 *
 * @param string $html The HTML to be processed.
 * @param string $url The URL of the webpage that stored the html in $html
 * @returns void
 * @throws Exception If $url does not start with the site URL.
 */
public function processHtml($html, $url)
{
    require_once 'lib/simple_html_dom.php';

    $siteUrl = $this->site->getSiteUrl();
    if (strpos($url, $siteUrl) !== 0) {
        throw new Exception("HTML could not be processed because the url {$url} lies outside the target site: " . $siteUrl);
    }

    // Derive the default base from the page URL.  The query string and the
    // fragment are dropped first so that a '/' inside them is never mistaken
    // for a path separator.
    $base = substr($url, 0, strcspn($url, '?#'));
    if ($base !== '' && substr($base, -1) !== '/') {
        // Only strip the last segment when the final '/' belongs to the path;
        // for a path-less URL such as "http://host" the final '/' is part of
        // "://" and stripping there would leave "http:/".
        $schemeSeparator = strpos($base, '://');
        $pathFloor = ($schemeSeparator === false) ? 0 : $schemeSeparator + 3;
        $lastSlash = strrpos($base, '/');
        if ($lastSlash !== false && $lastSlash >= $pathFloor) {
            $base = substr($base, 0, $lastSlash);
        }
    }

    // str_get_html() yields false instead of a DOM object when the markup is
    // empty or too large; guard every DOM access on that.
    $dom = str_get_html($html);
    $hasDom = is_object($dom);

    if ($hasDom) {
        foreach ($dom->find('base[href]') as $baseTag) {
            $declaredBase = $baseTag->href;
            if (!is_string($declaredBase) || $declaredBase === '') {
                // An empty href carries no usable base; keep the current one.
                continue;
            }
            // NOTE(review): a relative base href is used as-is, exactly as
            // before; resolving it against the page URL may be desirable.
            $base = $declaredBase;
            if (substr($base, -1) !== '/') {
                $base .= '/';
            }
        }
    }

    // Now that we have our base tag, we can begin to fire away.
    $node = $this->addNodeAtAbsoluteUrl($url, array('type' => 'page', 'url' => $url));
    if ($this->loadContent) {
        $this->addNodeAtAbsoluteUrl($url, array('content' => $html));
    }
    if (isset($this->delegate) and method_exists($this->delegate, 'loadContent')) {
        $this->delegate->loadContent($node, $html);
    }

    if (!$hasDom) {
        // Nothing parseable, hence no links to harvest.
        return;
    }

    // Now let's harvest the links
    foreach ($dom->find('a[href]') as $link) {
        $href = SweteTools::absoluteUrl($link->href, $base);
        if (strpos($href, $siteUrl) === 0) {
            $this->addNodeAtAbsoluteUrl($href);
        }
        // Links outside the site do not belong in our tree and are skipped.
    }

    // Release the parser's node graph explicitly; a crawl processes many
    // pages and the library recommends clear() to keep memory in check.
    $dom->clear();
    unset($dom);
}