// Retrieves $url while emulating a web browser and follows redirects when enabled.
//
// $url         - The URL to retrieve.  Resolved against the current referer when one is set.
// $profile     - Browser profile used to pick the default request headers ("auto", "ie...", "firefox",
//                "opera", "safari", "chrome").  A "name-ext" value (e.g. "firefox-css") also overrides
//                the file extension that selects the 'Accept' header.
// $tempoptions - Options for this call only.  Merged over $this->data["httpopts"] and passed to
//                HTTP::RetrieveWebpage().
//
// Returns the array from HTTP::RetrieveWebpage() for the last request made, with "url", "options",
// "firstreqts", "numredirects", "redirectts" and "totalrawsendsize" added, plus "forms" when
// $this->data["extractforms"] is enabled.  On failure, returns an array with "success" => false,
// "error" and "errorcode" ("info" holds the last result when one is available).
public function Process($url, $profile = "auto", $tempoptions = array())
{
	$startts = microtime(true);
	$redirectts = $startts;

	// A per-call timeout wins over the browser-wide timeout.  false means no timeout is applied here.
	if (isset($tempoptions["timeout"]))
	{
		$timeout = $tempoptions["timeout"];
	}
	else if (isset($this->data["httpopts"]["timeout"]))
	{
		$timeout = $this->data["httpopts"]["timeout"];
	}
	else
	{
		$timeout = false;
	}

	// Normalize the persistent options and remove the request-specific entries from them.
	if (!isset($this->data["httpopts"]["headers"]))
	{
		$this->data["httpopts"]["headers"] = array();
	}
	$this->data["httpopts"]["headers"] = HTTP::NormalizeHeaders($this->data["httpopts"]["headers"]);
	unset($this->data["httpopts"]["method"]);
	unset($this->data["httpopts"]["write_body_callback"]);
	unset($this->data["httpopts"]["body"]);
	unset($this->data["httpopts"]["postvars"]);
	unset($this->data["httpopts"]["files"]);

	$httpopts = $this->data["httpopts"];
	$numfollow = $this->data["maxfollow"];
	$numredirects = 0;
	$totalrawsendsize = 0;

	if (!isset($tempoptions["headers"]))
	{
		$tempoptions["headers"] = array();
	}
	$tempoptions["headers"] = HTTP::NormalizeHeaders($tempoptions["headers"]);
	if (isset($tempoptions["headers"]["Referer"]))
	{
		$this->data["referer"] = $tempoptions["headers"]["Referer"];
	}

	// If a referrer is specified, use it to generate an absolute URL.
	if ($this->data["referer"] != "")
	{
		$url = HTTP::ConvertRelativeToAbsoluteURL($this->data["referer"], $url);
	}

	$urlinfo = HTTP::ExtractURL($url);

	do
	{
		if (!isset($this->data["allowedprotocols"][$urlinfo["scheme"]]) || !$this->data["allowedprotocols"][$urlinfo["scheme"]])
		{
			return array("success" => false, "error" => HTTP::HTTPTranslate("Protocol '%s' is not allowed in '%s'.", $urlinfo["scheme"], $url), "errorcode" => "allowed_protocols");
		}

		$filename = HTTP::ExtractFilename($urlinfo["path"]);
		$pos = strrpos($filename, ".");
		$fileext = ($pos !== false ? strtolower(substr($filename, $pos + 1)) : "");

		// Set up some standard headers.
		$headers = array();
		$profile = strtolower($profile);
		$tempprofile = explode("-", $profile);
		if (count($tempprofile) == 2)
		{
			// A "profile-ext" value overrides the extension derived from the URL.
			$profile = $tempprofile[0];
			$fileext = $tempprofile[1];
		}

		$isimage = in_array($fileext, array("png", "jpg", "jpeg", "gif", "svg"), true);

		if (substr($profile, 0, 2) == "ie" || ($profile == "auto" && substr($this->data["useragent"], 0, 2) == "ie"))
		{
			if ($fileext == "css")
			{
				$headers["Accept"] = "text/css";
			}
			else if ($isimage)
			{
				$headers["Accept"] = "image/png, image/svg+xml, image/*;q=0.8, */*;q=0.5";
			}
			else if ($fileext == "js")
			{
				$headers["Accept"] = "application/javascript, */*;q=0.8";
			}
			else if ($this->data["referer"] != "" || $fileext == "" || $fileext == "html" || $fileext == "xhtml" || $fileext == "xml")
			{
				$headers["Accept"] = "text/html, application/xhtml+xml, */*";
			}
			else
			{
				$headers["Accept"] = "*/*";
			}

			$headers["Accept-Language"] = "en-US";
			$headers["User-Agent"] = HTTP::GetUserAgent(substr($profile, 0, 2) == "ie" ? $profile : $this->data["useragent"]);
		}
		else if ($profile == "firefox" || ($profile == "auto" && $this->data["useragent"] == "firefox"))
		{
			if ($fileext == "css")
			{
				$headers["Accept"] = "text/css,*/*;q=0.1";
			}
			else if ($isimage)
			{
				$headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5";
			}
			else if ($fileext == "js")
			{
				$headers["Accept"] = "*/*";
			}
			else
			{
				$headers["Accept"] = "text/html, application/xhtml+xml, */*";
			}

			$headers["Accept-Language"] = "en-us,en;q=0.5";
			$headers["Cache-Control"] = "max-age=0";
			$headers["User-Agent"] = HTTP::GetUserAgent("firefox");
		}
		else if ($profile == "opera" || ($profile == "auto" && $this->data["useragent"] == "opera"))
		{
			// Opera has the right idea:  Just send the same thing regardless of the request type.
			$headers["Accept"] = "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1";
			$headers["Accept-Language"] = "en-US,en;q=0.9";
			$headers["Cache-Control"] = "no-cache";
			$headers["User-Agent"] = HTTP::GetUserAgent("opera");
		}
		else if ($profile == "safari" || $profile == "chrome" || ($profile == "auto" && ($this->data["useragent"] == "safari" || $this->data["useragent"] == "chrome")))
		{
			if ($fileext == "css")
			{
				$headers["Accept"] = "text/css,*/*;q=0.1";
			}
			else if ($isimage || $fileext == "js")
			{
				$headers["Accept"] = "*/*";
			}
			else
			{
				$headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
			}

			$headers["Accept-Charset"] = "ISO-8859-1,utf-8;q=0.7,*;q=0.3";
			$headers["Accept-Language"] = "en-US,en;q=0.8";
			$headers["User-Agent"] = HTTP::GetUserAgent($profile == "safari" || $profile == "chrome" ? $profile : $this->data["useragent"]);
		}

		if ($this->data["referer"] != "")
		{
			$headers["Referer"] = $this->data["referer"];
		}

		// Generate the final headers array.  Later arrays win:  profile defaults < browser options < per-call options.
		$headers = array_merge($headers, $httpopts["headers"], $tempoptions["headers"]);

		// Calculate the host and reverse host and remove port information.
		$host = (isset($headers["Host"]) ? $headers["Host"] : $urlinfo["host"]);
		$pos = strpos($host, "]");
		if (substr($host, 0, 1) == "[" && $pos !== false)
		{
			// Bracketed IPv6 literal:  keep everything through the closing bracket.
			$host = substr($host, 0, $pos + 1);
		}
		else
		{
			$pos = strpos($host, ":");
			if ($pos !== false)
			{
				$host = substr($host, 0, $pos);
			}
		}

		// Cookie domains are stored with a leading dot so suffix comparisons fall on label boundaries.
		$dothost = $host;
		if (substr($dothost, 0, 1) != ".")
		{
			$dothost = "." . $dothost;
		}

		// Append cookies and delete old, invalid cookies.
		$secure = ($urlinfo["scheme"] == "https");
		$cookiepath = $urlinfo["path"];
		if ($cookiepath == "")
		{
			$cookiepath = "/";
		}
		$pos = strrpos($cookiepath, "/");
		if ($pos !== false)
		{
			$cookiepath = substr($cookiepath, 0, $pos + 1);
		}

		$cookies = array();
		foreach ($this->data["cookies"] as $domain => $paths)
		{
			// Domain-match (RFC 6265, section 5.1.3):  the request host must end with the cookie's domain.
			// Testing the other way around would send a subdomain's cookies to its parent domain and
			// withhold a parent domain's cookies from its subdomains.
			$cookiedomain = (string)$domain;
			if (strlen($cookiedomain) <= strlen($dothost) && substr($dothost, -strlen($cookiedomain)) === $cookiedomain)
			{
				foreach ($paths as $path => $cookies2)
				{
					if (substr($cookiepath, 0, strlen($path)) == $path)
					{
						foreach ($cookies2 as $num => $info)
						{
							if (isset($info["expires_ts"]) && $this->GetExpiresTimestamp($info["expires_ts"]) < time())
							{
								unset($this->data["cookies"][$domain][$path][$num]);
							}
							else if ($secure || !isset($info["secure"]))
							{
								// Cookies flagged 'secure' are only sent over https.
								$cookies[$info["name"]] = $info["value"];
							}
						}

						if (!count($this->data["cookies"][$domain][$path]))
						{
							unset($this->data["cookies"][$domain][$path]);
						}
					}
				}

				if (!count($this->data["cookies"][$domain]))
				{
					unset($this->data["cookies"][$domain]);
				}
			}
		}

		$cookies2 = array();
		foreach ($cookies as $name => $value)
		{
			$cookies2[] = rawurlencode($name) . "=" . rawurlencode($value);
		}
		$headers["Cookie"] = implode("; ", $cookies2);
		if ($headers["Cookie"] == "")
		{
			unset($headers["Cookie"]);
		}

		// Generate the final options array.
		$options = array_merge($httpopts, $tempoptions);
		$options["headers"] = $headers;
		if ($timeout !== false)
		{
			// The timeout covers the whole redirect chain, so only pass along what is left of it.
			$options["timeout"] = HTTP::GetTimeLeft($startts, $timeout);
		}

		// Process the request.
		$result = HTTP::RetrieveWebpage($url, $options);
		$result["url"] = $url;
		$result["options"] = $options;
		$result["firstreqts"] = $startts;
		$result["numredirects"] = $numredirects;
		$result["redirectts"] = $redirectts;
		if (isset($result["rawsendsize"]))
		{
			$totalrawsendsize += $result["rawsendsize"];
		}
		$result["totalrawsendsize"] = $totalrawsendsize;

		// Keep uploaded files and request bodies out of the returned information.
		unset($result["options"]["files"]);
		unset($result["options"]["body"]);

		if (!$result["success"])
		{
			return array("success" => false, "error" => HTTP::HTTPTranslate("Unable to retrieve content. %s", $result["error"]), "info" => $result, "errorcode" => "retrievewebpage");
		}

		// Set up structures for another round.
		if ($this->data["autoreferer"])
		{
			$this->data["referer"] = $url;
		}

		if (isset($result["headers"]["Location"]) && $this->data["followlocation"])
		{
			$redirectts = microtime(true);

			// The redirected request is issued without the original method, body, or uploads.
			unset($tempoptions["method"]);
			unset($tempoptions["write_body_callback"]);
			unset($tempoptions["body"]);
			unset($tempoptions["postvars"]);
			unset($tempoptions["files"]);

			$tempoptions["headers"]["Referer"] = $url;
			$url = $result["headers"]["Location"][0];

			// Generate an absolute URL.
			if ($this->data["referer"] != "")
			{
				$url = HTTP::ConvertRelativeToAbsoluteURL($this->data["referer"], $url);
			}

			$urlinfo2 = HTTP::ExtractURL($url);

			if (!isset($this->data["allowedredirprotocols"][$urlinfo2["scheme"]]) || !$this->data["allowedredirprotocols"][$urlinfo2["scheme"]])
			{
				return array("success" => false, "error" => HTTP::HTTPTranslate("Protocol '%s' is not allowed. Server attempted to redirect to '%s'.", $urlinfo2["scheme"], $url), "info" => $result, "errorcode" => "allowed_redir_protocols");
			}

			// A caller-supplied 'Host' header only applies to the original host.
			if ($urlinfo2["host"] != $urlinfo["host"])
			{
				unset($tempoptions["headers"]["Host"]);
				unset($httpopts["headers"]["Host"]);
			}

			$urlinfo = $urlinfo2;
			$numredirects++;
		}

		// Handle any 'Set-Cookie' headers.
		if (isset($result["headers"]["Set-Cookie"]))
		{
			foreach ($result["headers"]["Set-Cookie"] as $cookiehdr)
			{
				$items = explode("; ", $cookiehdr);

				// The first item is the name=value pair.  The remaining items are attributes.
				$item = trim(array_shift($items));
				if ($item != "")
				{
					$cookie2 = array();
					$pos = strpos($item, "=");
					if ($pos === false)
					{
						$cookie2["name"] = urldecode($item);
						$cookie2["value"] = "";
					}
					else
					{
						$cookie2["name"] = urldecode(substr($item, 0, $pos));
						$cookie2["value"] = urldecode(substr($item, $pos + 1));
					}

					$cookie = array();
					foreach ($items as $item)
					{
						$item = trim($item);
						if ($item != "")
						{
							$pos = strpos($item, "=");
							if ($pos === false)
							{
								$cookie[strtolower(trim(urldecode($item)))] = "";
							}
							else
							{
								$cookie[strtolower(trim(urldecode(substr($item, 0, $pos))))] = urldecode(substr($item, $pos + 1));
							}
						}
					}

					// Merged last so attributes called 'name' or 'value' cannot replace the real pair.
					$cookie = array_merge($cookie, $cookie2);

					if (isset($cookie["expires"]))
					{
						// An unparseable date is stored as a timestamp one day in the past.
						$ts = HTTP::GetDateTimestamp($cookie["expires"]);
						$cookie["expires_ts"] = gmdate("Y-m-d H:i:s", ($ts === false ? time() - 24 * 60 * 60 : $ts));
					}
					else if (isset($cookie["max-age"]))
					{
						$cookie["expires_ts"] = gmdate("Y-m-d H:i:s", time() + (int)$cookie["max-age"]);
					}
					else
					{
						// Drop any server-supplied attribute that collides with the internal key.
						unset($cookie["expires_ts"]);
					}

					if (!isset($cookie["domain"]))
					{
						$cookie["domain"] = $dothost;
					}
					if (substr($cookie["domain"], 0, 1) != ".")
					{
						$cookie["domain"] = "." . $cookie["domain"];
					}

					if (!isset($cookie["path"]))
					{
						$cookie["path"] = $cookiepath;
					}
					$cookie["path"] = str_replace("\\", "/", $cookie["path"]);
					if (substr($cookie["path"], -1) != "/")
					{
						$cookie["path"] = "/";
					}

					if (!isset($this->data["cookies"][$cookie["domain"]]))
					{
						$this->data["cookies"][$cookie["domain"]] = array();
					}
					if (!isset($this->data["cookies"][$cookie["domain"]][$cookie["path"]]))
					{
						$this->data["cookies"][$cookie["domain"]][$cookie["path"]] = array();
					}
					$this->data["cookies"][$cookie["domain"]][$cookie["path"]][] = $cookie;
				}
			}
		}

		// Only positive counts are decremented, so a negative 'maxfollow' never reaches zero.
		if ($numfollow > 0)
		{
			$numfollow--;
		}
	} while (isset($result["headers"]["Location"]) && $this->data["followlocation"] && $numfollow);

	$result["numredirects"] = $numredirects;
	$result["redirectts"] = $redirectts;

	// Extract the forms from the page in a parsed format.
	// Call WebBrowser::GenerateFormRequest() to prepare an actual request for Process().
	if ($this->data["extractforms"])
	{
		$result["forms"] = $this->ExtractForms($result["url"], $result["body"]);
	}

	return $result;
}
// Prepares a request for $url and hands it to the ProcessState() state machine.
//
// $url         - The URL to retrieve.  Resolved against the current referer when one is set.
// $profile     - Browser profile name stored in the state array (defaults to "auto").
// $tempoptions - Options for this call only.  "timeout", "streamtimeout" and "headers" are read here.
//
// Returns whatever ProcessState() returns.  When the state is flagged async after that first cycle,
// returns array("success" => true, "state" => $state) instead and the caller must keep calling
// ProcessState() with that state.
public function Process($url, $profile = "auto", $tempoptions = array())
{
	$startts = microtime(true);

	// Overall timeout:  the per-call option wins over the browser-wide option.  false when neither is set.
	$timeout = false;
	if (isset($tempoptions["timeout"]))
	{
		$timeout = $tempoptions["timeout"];
	}
	elseif (isset($this->data["httpopts"]["timeout"]))
	{
		$timeout = $this->data["httpopts"]["timeout"];
	}

	// Deal with possible application hanging issues by always passing a stream timeout (300 by default).
	if (!isset($tempoptions["streamtimeout"]))
	{
		$tempoptions["streamtimeout"] = (isset($this->data["httpopts"]["streamtimeout"]) ? $this->data["httpopts"]["streamtimeout"] : 300);
	}

	// Normalize the persistent headers and remove the request-specific entries from the persistent options.
	$persistentheaders = (isset($this->data["httpopts"]["headers"]) ? $this->data["httpopts"]["headers"] : array());
	$this->data["httpopts"]["headers"] = HTTP::NormalizeHeaders($persistentheaders);
	unset(
		$this->data["httpopts"]["method"],
		$this->data["httpopts"]["write_body_callback"],
		$this->data["httpopts"]["body"],
		$this->data["httpopts"]["postvars"],
		$this->data["httpopts"]["files"]
	);

	// Normalize the per-call headers.  An explicit 'Referer' becomes the current referer.
	$callheaders = (isset($tempoptions["headers"]) ? $tempoptions["headers"] : array());
	$tempoptions["headers"] = HTTP::NormalizeHeaders($callheaders);
	if (isset($tempoptions["headers"]["Referer"]))
	{
		$this->data["referer"] = $tempoptions["headers"]["Referer"];
	}

	// If a referrer is specified, use it to generate an absolute URL.
	if ($this->data["referer"] != "")
	{
		$url = HTTP::ConvertRelativeToAbsoluteURL($this->data["referer"], $url);
	}

	// Initialize the process state array.
	$state = array(
		"async" => false,
		"startts" => $startts,
		"redirectts" => $startts,
		"timeout" => $timeout,
		"tempoptions" => $tempoptions,
		"httpopts" => $this->data["httpopts"],
		"numfollow" => $this->data["maxfollow"],
		"numredirects" => 0,
		"totalrawsendsize" => 0,
		"profile" => $profile,
		"url" => $url,
		"urlinfo" => HTTP::ExtractURL($url),
		"state" => "initialize",
		"httpstate" => false,
		"result" => false
	);

	// Run at least one state cycle to properly initialize the state array.
	$result = $this->ProcessState($state);

	// Return the state for async calls.  Caller must call ProcessState().
	// NOTE(review):  "async" is set to false above, so this presumably relies on ProcessState()
	// taking $state by reference - confirm.
	return ($state["async"] ? array("success" => true, "state" => $state) : $result);
}