/**
 * The basic URL parser: a state machine that walks the input one code point at a time.
 *
 * Tabs and newlines are removed from the input first. Without a state override a fresh URL
 * is created, leading/trailing C0 controls and spaces are stripped, and parsing begins in
 * the "scheme start state". With a state override the supplied URL is modified in place
 * and parsing begins in the given state.
 *
 * @link https://url.spec.whatwg.org/#concept-basic-url-parser URL Standard
 * @param string $input A UTF-8 string.
 * @param URL|null $base A base URL.
 * @param string|null $encodingOverride A valid name of an encoding.
 * @param (URL|string)[]|null $urlAndStateOverride An URL ("url" key) and a state override ("state override" key).
 * @throws \DomainException If $urlAndStateOverride['state override'] is invalid.
 * @return URL|false|null The parsed URL, false on failure, or null when a state override
 *      ends the run early (the URL passed in may already have been modified).
 */
public static function parseBasicURL($input, self $base = null, $encodingOverride = null, array $urlAndStateOverride = null)
{
    $input = str_replace(["\t", "\n", "\r"], '', $input);
    if ($urlAndStateOverride) {
        $url = $urlAndStateOverride['url'];
        $stateOverride = (string) $urlAndStateOverride['state override'];
        $string = (string) $input;
        $state = $stateOverride;
    } else {
        $url = new self();
        $stateOverride = null;
        // Strip leading and trailing C0 controls and spaces (U+0000 through U+0020).
        // Escapes are used so that the range endpoints stay visible in the source.
        $string = trim($input, "\x00..\x20");
        $state = 'scheme start state';
    }

    $encoding = $encodingOverride ? URLencoding::getOutputEncoding((string) $encodingOverride) : 'UTF-8';

    $buffer = '';
    $atFlag = false;
    $bracketFlag = false;
    $codePoints = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);

    for ($pointer = 0; true; $pointer++) {
        // The empty string stands for the EOF code point.
        $c = isset($codePoints[$pointer]) ? $codePoints[$pointer] : '';

        switch ($state) {
            case 'scheme start state':
                // EOF must be excluded explicitly: stripos() with an empty needle
                // returns 0 on PHP 8+, which would be mistaken for a match.
                if ($c !== '' && stripos('abcdefghijklmnopqrstuvwxyz', $c) !== false) {
                    $buffer .= strtolower($c);
                    $state = 'scheme state';
                } elseif (!$stateOverride) {
                    $state = 'no scheme state';
                    $pointer--;
                } else {
                    return;
                }
                break;

            case 'scheme state':
                // Same empty-needle guard as in the scheme start state.
                if ($c !== '' && stripos('0123456789abcdefghijklmnopqrstuvwxyz+-.', $c) !== false) {
                    $buffer .= strtolower($c);
                } elseif ($c === ':') {
                    // A state override may not switch between special and non-special schemes.
                    if ($stateOverride
                        && array_key_exists($url->scheme, self::$specialSchemes) !== array_key_exists($buffer, self::$specialSchemes)) {
                        return;
                    }
                    $url->scheme = $buffer;
                    $buffer = '';
                    if ($stateOverride) {
                        return;
                    }
                    if ($url->scheme === 'file') {
                        $state = 'file state';
                    } elseif ($url->isSpecial() && $base && $base->scheme === $url->scheme) {
                        $state = 'special relative or authority state';
                    } elseif ($url->isSpecial()) {
                        $state = 'special authority slashes state';
                    } elseif (isset($codePoints[$pointer + 1]) && $codePoints[$pointer + 1] === '/') {
                        $state = 'path or authority state';
                        $pointer++;
                    } else {
                        $url->nonRelativeFlag = $url->cannotBeABaseURLFlag = true;
                        $url->path[] = '';
                        $state = 'non-relative path state';
                    }
                } elseif (!$stateOverride) {
                    // Not a scheme after all: restart from the first code point.
                    $buffer = '';
                    $state = 'no scheme state';
                    $pointer = -1;
                } else {
                    return;
                }
                break;

            case 'no scheme state':
                if (!$base || $base->cannotBeABaseURLFlag && $c !== '#') {
                    return false;
                } elseif ($base->cannotBeABaseURLFlag && $c === '#') {
                    $url->scheme = $base->scheme;
                    $url->path = $base->path;
                    $url->query = $base->query;
                    $url->fragment = '';
                    $url->nonRelativeFlag = $url->cannotBeABaseURLFlag = true;
                    $state = 'fragment state';
                } elseif ($base->scheme !== 'file') {
                    $state = 'relative state';
                    $pointer--;
                } else {
                    $state = 'file state';
                    $pointer--;
                }
                break;

            case 'special relative or authority state':
                if ($c === '/' && isset($codePoints[$pointer + 1]) && $codePoints[$pointer + 1] === '/') {
                    $state = 'special authority ignore slashes state';
                    $pointer++;
                } else {
                    $state = 'relative state';
                    $pointer--;
                }
                break;

            case 'path or authority state':
                if ($c === '/') {
                    $state = 'authority state';
                } else {
                    $state = 'path state';
                    $pointer--;
                }
                break;

            case 'relative state':
                $url->scheme = $base->scheme;
                switch ($c) {
                    case '':
                        $url->username = $base->username;
                        $url->password = $base->password;
                        $url->host = $base->host;
                        $url->port = $base->port;
                        $url->path = $base->path;
                        $url->query = $base->query;
                        break;
                    case '/':
                        $state = 'relative slash state';
                        break;
                    case '?':
                        $url->username = $base->username;
                        $url->password = $base->password;
                        $url->host = $base->host;
                        $url->port = $base->port;
                        $url->path = $base->path;
                        $url->query = '';
                        $state = 'query state';
                        break;
                    case '#':
                        $url->username = $base->username;
                        $url->password = $base->password;
                        $url->host = $base->host;
                        $url->port = $base->port;
                        $url->path = $base->path;
                        $url->query = $base->query;
                        $url->fragment = '';
                        $state = 'fragment state';
                        break;
                    default:
                        if ($c === '\\' && $url->isSpecial()) {
                            $state = 'relative slash state';
                        } else {
                            $url->username = $base->username;
                            $url->password = $base->password;
                            $url->host = $base->host;
                            $url->port = $base->port;
                            $url->path = $base->path;
                            // Drop the base's last path segment before appending the relative path.
                            array_pop($url->path);
                            $state = 'path state';
                            $pointer--;
                        }
                }
                break;

            case 'relative slash state':
                if ($c === '/' || $c === '\\' && $url->isSpecial()) {
                    $state = 'special authority ignore slashes state';
                } else {
                    $url->username = $base->username;
                    $url->password = $base->password;
                    $url->host = $base->host;
                    $url->port = $base->port;
                    $state = 'path state';
                    $pointer--;
                }
                break;

            case 'special authority slashes state':
                // Both branches move to the same state; only the pointer adjustment differs.
                if ($c === '/' && isset($codePoints[$pointer + 1]) && $codePoints[$pointer + 1] === '/') {
                    $state = 'special authority ignore slashes state';
                    $pointer++;
                } else {
                    $state = 'special authority ignore slashes state';
                    $pointer--;
                }
                break;

            case 'special authority ignore slashes state':
                if (!in_array($c, ['/', '\\'], true)) {
                    $state = 'authority state';
                    $pointer--;
                }
                break;

            case 'authority state':
                if ($c === '@') {
                    if ($atFlag) {
                        // An earlier "@" belonged to the credentials; keep it percent-encoded.
                        $buffer = '%40' . $buffer;
                    }
                    $atFlag = true;
                    $usernameAndPassword = explode(':', $buffer, 2);
                    $url->username .= self::percentEncodeCodePoints(
                        Infrastructure::USERINFO_ENCODE_SET,
                        $usernameAndPassword[0]
                    );
                    if (isset($usernameAndPassword[1])) {
                        $url->password .= self::percentEncodeCodePoints(
                            Infrastructure::USERINFO_ENCODE_SET,
                            $usernameAndPassword[1]
                        );
                    }
                    $buffer = '';
                } elseif (in_array($c, ['', '/', '?', '#'], true) || $c === '\\' && $url->isSpecial()) {
                    // Rewind to the start of the buffered text so the host state re-reads it.
                    $pointer -= mb_strlen($buffer, 'UTF-8') + 1;
                    $buffer = '';
                    $state = 'host state';
                } else {
                    $buffer .= $c;
                }
                break;

            case 'host state':
            case 'hostname state':
                if ($c === ':' && !$bracketFlag) {
                    if ($buffer === '' && $url->isSpecial()) {
                        return false;
                    }
                    $host = HostProcessing::parseHost($buffer);
                    if ($host === false) {
                        return false;
                    }
                    $url->host = $host;
                    $buffer = '';
                    $state = 'port state';
                    if ($stateOverride === 'hostname state') {
                        return;
                    }
                } elseif (in_array($c, ['', '/', '?', '#'], true) || $c === '\\' && $url->isSpecial()) {
                    $pointer--;
                    if ($buffer === '' && $url->isSpecial()) {
                        return false;
                    }
                    $host = HostProcessing::parseHost($buffer);
                    if ($host === false) {
                        return false;
                    }
                    $url->host = $host;
                    $buffer = '';
                    $state = 'path start state';
                    if ($stateOverride) {
                        return;
                    }
                } else {
                    // Track brackets so a ":" inside "[...]" is not taken as the port separator.
                    if ($c === '[') {
                        $bracketFlag = true;
                    }
                    if ($c === ']') {
                        $bracketFlag = false;
                    }
                    $buffer .= $c;
                }
                break;

            case 'port state':
                if (ctype_digit($c)) {
                    $buffer .= $c;
                } elseif (in_array($c, ['', '/', '?', '#'], true)
                    || $c === '\\' && $url->isSpecial()
                    || $stateOverride) {
                    if ($buffer !== '') {
                        $port = (int) $buffer;
                        if ($port > pow(2, 16) - 1) {
                            return false;
                        }
                        // A port equal to the scheme's entry in $specialSchemes is stored as null.
                        $url->port = isset(self::$specialSchemes[$url->scheme])
                            && self::$specialSchemes[$url->scheme] === $port
                            ? null
                            : $port;
                        $buffer = '';
                    }
                    if ($stateOverride) {
                        return;
                    }
                    $state = 'path start state';
                    $pointer--;
                } else {
                    return false;
                }
                break;

            case 'file state':
                $url->scheme = 'file';
                switch ($c) {
                    case '':
                        if ($base && $base->scheme === 'file') {
                            $url->host = $base->host;
                            $url->path = $base->path;
                            $url->query = $base->query;
                        }
                        break;
                    case '\\':
                    case '/':
                        $state = 'file slash state';
                        break;
                    case '?':
                        if ($base && $base->scheme === 'file') {
                            $url->host = $base->host;
                            $url->path = $base->path;
                            $url->query = '';
                            $state = 'query state';
                        }
                        break;
                    case '#':
                        if ($base && $base->scheme === 'file') {
                            $url->host = $base->host;
                            $url->path = $base->path;
                            $url->query = $base->query;
                            $url->fragment = '';
                            $state = 'fragment state';
                        }
                        break;
                    default:
                        $remaining = array_slice($codePoints, $pointer + 1);
                        if ($base
                            && $base->scheme === 'file'
                            && isset($remaining[0])
                            && preg_match(Infrastructure::WINDOWS_DRIVE_LETTER, $c . $remaining[0]) === 0
                            && (count($remaining) === 1
                                || isset($remaining[1]) && strpos('/\\?#', $remaining[1]) === false)) {
                            $url->host = $base->host;
                            $url->path = $base->path;
                            $url->popPath();
                        }
                        $state = 'path state';
                        $pointer--;
                }
                break;

            case 'file slash state':
                if ($c === '/' || $c === '\\') {
                    $state = 'file host state';
                } else {
                    if ($base
                        && $base->scheme === 'file'
                        && isset($base->path[0])
                        && preg_match(Infrastructure::NORMALIZED_WINDOWS_DRIVE_LETTER, $base->path[0]) === 1) {
                        $url->path[] = $base->path[0];
                    }
                    $state = 'path state';
                    $pointer--;
                }
                break;

            case 'file host state':
                if (in_array($c, ['', '/', '\\', '?', '#'], true)) {
                    $pointer--;
                    if (preg_match(Infrastructure::WINDOWS_DRIVE_LETTER, $buffer) === 1) {
                        // The buffer is kept on purpose: the path state consumes it as a segment.
                        $state = 'path state';
                    } elseif ($buffer === '') {
                        $state = 'path start state';
                    } else {
                        $host = HostProcessing::parseHost($buffer);
                        if ($host === false) {
                            return false;
                        }
                        if ($host !== 'localhost') {
                            $url->host = $host;
                        }
                        $buffer = '';
                        $state = 'path start state';
                    }
                } else {
                    $buffer .= $c;
                }
                break;

            case 'path start state':
                $state = 'path state';
                if (!($c === '/' || $c === '\\' && $url->isSpecial())) {
                    $pointer--;
                }
                break;

            case 'path state':
                if (in_array($c, ['', '/'], true)
                    || $c === '\\' && $url->isSpecial()
                    || !$stateOverride && in_array($c, ['?', '#'], true)) {
                    $endsWithSlash = $c === '/' || $c === '\\' && $url->isSpecial();
                    $isSingleDot = preg_match(self::SINGLE_DOT_PATH_SEGMENT, $buffer) === 1;

                    if (preg_match(self::DOUBLE_DOT_PATH_SEGMENT, $buffer) === 1) {
                        $url->popPath();
                        if (!$endsWithSlash) {
                            $url->path[] = '';
                        }
                    } elseif ($isSingleDot && !$endsWithSlash) {
                        $url->path[] = '';
                    } elseif (!$isSingleDot) {
                        if ($url->scheme === 'file'
                            && !$url->path
                            && preg_match(Infrastructure::WINDOWS_DRIVE_LETTER, $buffer) === 1) {
                            // Normalise the drive letter: clear the host and force ":" as second character.
                            $url->host = null;
                            $buffer[1] = ':';
                        }
                        $url->path[] = $buffer;
                    }
                    $buffer = '';

                    if ($c === '?') {
                        $url->query = '';
                        $state = 'query state';
                    } elseif ($c === '#') {
                        $url->fragment = '';
                        $state = 'fragment state';
                    }
                } elseif ($c === '%'
                    && isset($codePoints[$pointer + 2])
                    && $codePoints[$pointer + 1] === '2'
                    && ($codePoints[$pointer + 2] === 'e' || $codePoints[$pointer + 2] === 'E')) {
                    // "%2e" / "%2E" is decoded to "." so dot segments are recognised.
                    // A three-code-point lookahead replaces rebuilding the remaining input each time.
                    $buffer .= '.';
                    $pointer += 2;
                } else {
                    $buffer .= Infrastructure::utf8PercentEncode(Infrastructure::DEFAULT_ENCODE_SET, $c);
                }
                break;

            case 'non-relative path state':
                if ($c === '?') {
                    $url->query = '';
                    $state = 'query state';
                } elseif ($c === '#') {
                    $url->fragment = '';
                    $state = 'fragment state';
                } elseif ($c !== '') {
                    $url->path[0] .= Infrastructure::utf8PercentEncode(Infrastructure::SIMPLE_ENCODE_SET, $c);
                }
                break;

            case 'query state':
                if ($c === '' || !$stateOverride && $c === '#') {
                    // Non-special URLs and ws/wss always encode the query as UTF-8.
                    if (!$url->isSpecial() || $url->scheme === 'ws' || $url->scheme === 'wss') {
                        $encoding = 'UTF-8';
                    }
                    $buffer = URLencoding::encode($buffer, $encoding);
                    $url->query = self::percentEncodeCodePoints('/[^!$-;=?-~]/', $buffer);
                    $buffer = '';
                    if ($c === '#') {
                        $url->fragment = '';
                        $state = 'fragment state';
                    }
                } else {
                    $buffer .= $c;
                }
                break;

            case 'fragment state':
                if ($c !== '') {
                    // Everything that is left belongs to the fragment; U+0000 is dropped.
                    $url->fragment .= str_replace("\0", '', implode('', array_slice($codePoints, $pointer)));
                }
                break 2;

            default:
                throw new \DomainException(sprintf('"%s" is an unknown state', $state));
        }

        // Stop once the EOF code point has been processed. A negative pointer means a
        // state asked to restart from the first code point, so the loop must continue.
        if ($pointer >= 0 && !isset($codePoints[$pointer])) {
            break;
        }
    }

    return $url;
}
/**
 * Asserts that HostProcessing::parseHost() returns the expected value for one data set.
 *
 * When $unicodeFlag is null the parser is called with the input only; otherwise the
 * flag is passed as the second argument.
 *
 * @param string $input
 * @param boolean|null $unicodeFlag
 * @param string|integer[]|false $domain
 * @param string|null $message
 * @dataProvider hostProvider
 */
public function testParseHost($input, $unicodeFlag, $domain, $message = null)
{
    if ($unicodeFlag === null) {
        $actual = HostProcessing::parseHost($input);
    } else {
        $actual = HostProcessing::parseHost($input, $unicodeFlag);
    }

    $this->assertSame($domain, $actual, $message);
}