/**
 * Set website url for crawling.
 *
 * When the given url does not begin with an "http://" or "https://" scheme
 * (compared case-insensitively), "http://" is prepended. The result is then
 * passed through Uri\normalize() and stored in $this->baseUrl.
 *
 * @param string $baseUrl Website url, with or without a leading http(s) scheme
 *
 * @return void
 */
public function setBaseUrl($baseUrl)
{
    // The scheme check is anchored to the start of the string. Searching for
    // the substring 'http' anywhere would also match hosts or paths that
    // merely contain it (e.g. "httpbin.org", "example.com/http-intro") and
    // leave those urls without any scheme.
    if (preg_match('#^https?://#i', $baseUrl) !== 1) {
        $baseUrl = 'http://' . $baseUrl;
    }

    $this->baseUrl = Uri\normalize($baseUrl);
}
/**
 * Calculates the uri for a request, making sure that the base uri is stripped out.
 *
 * Absolute urls (containing "://" and not starting with a slash) are first
 * reduced to their path component. Runs of consecutive slashes are collapsed
 * into one, and both the uri and the base uri are passed through
 * Uri\normalize() before they are compared.
 *
 * The returned value is the part of the uri after the base uri, decoded with
 * URLUtil::decodePath() and with leading/trailing slashes trimmed. An empty
 * string is returned when the uri equals the base uri, with or without the
 * trailing slash.
 *
 * @param string $uri
 * @throws Exception\Forbidden A permission denied exception is thrown whenever there was an attempt to supply a uri outside of the base uri
 * @return string
 */
function calculateUri($uri)
{
    // Guard against the empty string before looking at offset 0; reading
    // $uri[0] on '' raises an "Uninitialized string offset" notice/warning.
    if ($uri !== '' && $uri[0] !== '/' && strpos($uri, '://')) {
        // parse_url() yields null when the url has no path and false when
        // it is seriously malformed; treat both as an empty path.
        $path = parse_url($uri, PHP_URL_PATH);
        $uri = is_string($path) ? $path : '';
    }

    // Collapse every run of slashes. A plain str_replace('//', '/') makes a
    // single non-overlapping pass, so '///' would only shrink to '//'.
    $uri = Uri\normalize(preg_replace('|/+|', '/', $uri));
    $baseUri = Uri\normalize($this->getBaseUri());

    if (strpos($uri, $baseUri) === 0) {
        return trim(URLUtil::decodePath(substr($uri, strlen($baseUri))), '/');
    }

    // A special case, if the baseUri was accessed without a trailing
    // slash, we'll accept it as well.
    if ($uri . '/' === $baseUri) {
        return '';
    }

    throw new Exception\Forbidden('Requested uri (' . $uri . ') is out of base uri (' . $this->getBaseUri() . ')');
}
/**
 * Returns the relative path.
 *
 * This is being calculated using the base url. This path will not start
 * with a slash, so it will always return something like
 * 'example/path.html'.
 *
 * The query part of the url (everything from the first '?') is ignored.
 *
 * If the full path is equal to the base url (with or without its trailing
 * slash), this method will return an empty string.
 *
 * The path is decoded with URLUtil::decodePath(); according to the original
 * documentation of this method that covers url-decoding and converting
 * ISO-8859-1 encoded urls to UTF-8.
 *
 * If the path is outside of the base url, a LogicException will be thrown.
 *
 * @throws \LogicException When the request url is not inside the base url
 * @return string
 */
function getPath()
{
    // We're not interested in the query part (everything after the ?).
    // It is removed before any comparison so that a url such as
    // '/base?x=1' is still recognised as the base url '/base/' accessed
    // without its trailing slash.
    list($uri) = explode('?', $this->getUrl(), 2);

    // Removing duplicated slashes. A regex is used because a single
    // str_replace('//', '/') pass turns '///' into '//' rather than '/'.
    $uri = Uri\normalize(preg_replace('|/+|', '/', $uri));
    $baseUri = Uri\normalize($this->getBaseUrl());

    if (strpos($uri, $baseUri) === 0) {
        return trim(URLUtil::decodePath(substr($uri, strlen($baseUri))), '/');
    }

    // The base url accessed without its trailing slash is accepted as well.
    if ($uri . '/' === $baseUri) {
        return '';
    }

    throw new \LogicException('Requested uri (' . $this->getUrl() . ') is out of base uri (' . $this->getBaseUrl() . ')');
}