Example #1
0
/**
 * Download a document and parse it into a simple_html_dom object.
 *
 * One HTTP request is recorded for the given test (via incrementHttpRequests())
 * when connectToDb() yields a database handle; the contents are then read with
 * file_get_contents() and loaded into a new simple_html_dom instance.
 *
 * @param string   $url              URL or path handed to file_get_contents().
 * @param mixed    $testId           Test identifier handed to incrementHttpRequests().
 * @param bool     $use_include_path Forwarded to file_get_contents().
 * @param resource $context          Stream context forwarded to file_get_contents(), or null.
 * @param int      $offset           Byte offset to start reading at; any negative value means "from the start".
 * @param int      $maxLen           Maximum number of bytes to read; any negative value means "no limit".
 * @param bool     $lowercase        Forwarded to the simple_html_dom constructor and to load().
 * @param bool     $forceTagsClosed  Forwarded to the simple_html_dom constructor.
 * @param string   $target_charset   Forwarded to the simple_html_dom constructor.
 * @param bool     $stripRN          Forwarded to load().
 * @param string   $defaultBRText    Forwarded to the simple_html_dom constructor.
 *
 * @return simple_html_dom|false The loaded DOM, or false when no contents could be read.
 */
function file_get_html($url, $testId, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT)
{
    // connectToDb() fills $db by reference; initialise it so the check below is well defined.
    $db = null;
    connectToDb($db);
    if ($db) {
        // Count the request that file_get_contents() is about to make.
        incrementHttpRequests($db, $testId);
    }
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $defaultBRText);
    // The -1 default means "no offset". Since PHP 7.1 a negative offset makes
    // file_get_contents() seek from the END of the stream, which fails on
    // non-seekable (HTTP) streams, so normalise negatives to 0.
    $start = $offset > 0 ? $offset : 0;
    // Only pass a length when the caller asked for one: a negative length is
    // rejected by file_get_contents(), and omitting the argument means "read all".
    if ($maxLen >= 0) {
        $contents = file_get_contents($url, $use_include_path, $context, $start, $maxLen);
    } else {
        $contents = file_get_contents($url, $use_include_path, $context, $start);
    }
    // Covers both a failed read (false) and an empty document.
    if (empty($contents)) {
        return false;
    }
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
Example #2
0
 /**
  * Build the HTTP request described by $arguments and send it over the
  * connection this object already holds.
  *
  * Keys read from $arguments: "ProxyUser", "ProxyPassword", "ProxyRealm",
  * "ProxyWorkstation", "RequestMethod", "User-Agent", "KeepAlive", "Accept",
  * "Headers", "RequestURI", "StreamRequest", "PostFiles", "PostValues",
  * "CharSet", "Body", "BodyStream", "AuthUser", "AuthPassword", "AuthRealm",
  * "AuthWorkstation".
  *
  * On success $this->state becomes "RequestSent", or "SendingRequestBody" when
  * "StreamRequest" is set on a non-curl connection.
  *
  * @param array $arguments Request description (see key list above).
  *
  * @return string Empty string on success, otherwise an error message.
  */
 function SendRequest($arguments)
 {
     // Record the HTTP request in the database when a connection is available.
     // connectToDb() fills $db by reference; call-time pass-by-reference (&$db)
     // is a fatal error since PHP 5.4, so the variable is passed plainly.
     $db = null;
     connectToDb($db);
     if ($db) {
         incrementHttpRequests($db, $this->testId);
     }
     // A previously recorded error aborts the request.
     if (strlen($this->error)) {
         return $this->error;
     }
     // Per-request proxy credentials override the object-level defaults.
     if (isset($arguments["ProxyUser"])) {
         $this->proxy_request_user = $arguments["ProxyUser"];
     } elseif (isset($this->proxy_user)) {
         $this->proxy_request_user = $this->proxy_user;
     }
     if (isset($arguments["ProxyPassword"])) {
         $this->proxy_request_password = $arguments["ProxyPassword"];
     } elseif (isset($this->proxy_password)) {
         $this->proxy_request_password = $this->proxy_password;
     }
     if (isset($arguments["ProxyRealm"])) {
         $this->proxy_request_realm = $arguments["ProxyRealm"];
     } elseif (isset($this->proxy_realm)) {
         $this->proxy_request_realm = $this->proxy_realm;
     }
     if (isset($arguments["ProxyWorkstation"])) {
         $this->proxy_request_workstation = $arguments["ProxyWorkstation"];
     } elseif (isset($this->proxy_workstation)) {
         $this->proxy_request_workstation = $this->proxy_workstation;
     }
     // $connect is 1 only when the connection was tunnelled through a proxy.
     switch ($this->state) {
         case "Disconnected":
             return $this->SetError("1 connection was not yet established");
         case "Connected":
             $connect = 0;
             break;
         case "ConnectedToProxy":
             if (strlen($error = $this->ConnectFromProxy($arguments, $headers))) {
                 return $error;
             }
             $connect = 1;
             break;
         default:
             return $this->SetError("2 can not send request in the current connection state");
     }
     if (isset($arguments["RequestMethod"])) {
         $this->request_method = $arguments["RequestMethod"];
     }
     if (isset($arguments["User-Agent"])) {
         $this->user_agent = $arguments["User-Agent"];
     }
     if (!isset($arguments["Headers"]["User-Agent"]) && strlen($this->user_agent)) {
         $arguments["Headers"]["User-Agent"] = $this->user_agent;
     }
     if (isset($arguments["KeepAlive"])) {
         $this->keep_alive = intval($arguments["KeepAlive"]);
     }
     if (!isset($arguments["Headers"]["Connection"]) && $this->keep_alive) {
         $arguments["Headers"]["Connection"] = 'Keep-Alive';
     }
     // The "Accept" argument belongs in $this->accept; storing it in
     // $this->user_agent clobbered the User-Agent for later requests and left
     // the Accept header below unaffected.
     if (isset($arguments["Accept"])) {
         $this->accept = $arguments["Accept"];
     }
     if (!isset($arguments["Headers"]["Accept"]) && strlen($this->accept)) {
         $arguments["Headers"]["Accept"] = $this->accept;
     }
     if (strlen($this->request_method) == 0) {
         return $this->SetError("3 it was not specified a valid request method");
     }
     if (isset($arguments["RequestURI"])) {
         $this->request_uri = $arguments["RequestURI"];
     }
     if (strlen($this->request_uri) == 0 || substr($this->request_uri, 0, 1) != "/") {
         return $this->SetError("4 it was not specified a valid request URI");
     }
     $this->request_arguments = $arguments;
     $this->request_headers = isset($arguments["Headers"]) ? $arguments["Headers"] : array();
     $body_length = 0;
     $this->request_body = "";
     // $get_body stays 1 only while no POST/PUT-specific body has been chosen.
     $get_body = 1;
     if ($this->request_method == "POST" || $this->request_method == "PUT") {
         if (isset($arguments['StreamRequest'])) {
             $get_body = 0;
             $this->request_headers["Transfer-Encoding"] = "chunked";
         } elseif (isset($arguments["PostFiles"]) || $this->force_multipart_form_post && isset($arguments["PostValues"])) {
             // multipart/form-data: collect the parts and pre-compute the total length.
             $boundary = "--" . md5(uniqid(time()));
             $this->request_headers["Content-Type"] = "multipart/form-data; boundary=" . $boundary . (isset($arguments["CharSet"]) ? "; charset=" . $arguments["CharSet"] : "");
             $post_parts = array();
             if (isset($arguments["PostValues"])) {
                 $values = $arguments["PostValues"];
                 if (GetType($values) != "array") {
                     return $this->SetError("5 it was not specified a valid POST method values array");
                 }
                 for (Reset($values), $value = 0; $value < count($values); Next($values), $value++) {
                     $input = Key($values);
                     $headers = "--" . $boundary . "\r\nContent-Disposition: form-data; name=\"" . $input . "\"\r\n\r\n";
                     $data = $values[$input];
                     $post_parts[] = array("HEADERS" => $headers, "DATA" => $data);
                     $body_length += strlen($headers) + strlen($data) + strlen("\r\n");
                 }
             }
             $body_length += strlen("--" . $boundary . "--\r\n");
             $files = isset($arguments["PostFiles"]) ? $arguments["PostFiles"] : array();
             Reset($files);
             // Iteration ends at the first key that is not a string (including end of array).
             $end = GetType($input = Key($files)) != "string";
             for (; !$end;) {
                 if (strlen($error = $this->GetFileDefinition($files[$input], $definition))) {
                     return "3 " . $error;
                 }
                 $headers = "--" . $boundary . "\r\nContent-Disposition: form-data; name=\"" . $input . "\"; filename=\"" . $definition["NAME"] . "\"\r\nContent-Type: " . $definition["Content-Type"] . "\r\n\r\n";
                 $part = count($post_parts);
                 $post_parts[$part] = array("HEADERS" => $headers);
                 if (isset($definition["FILENAME"])) {
                     // File contents are read from disk at send time.
                     $post_parts[$part]["FILENAME"] = $definition["FILENAME"];
                     $data = "";
                 } else {
                     $data = $definition["DATA"];
                 }
                 $post_parts[$part]["DATA"] = $data;
                 $body_length += strlen($headers) + $definition["Content-Length"] + strlen("\r\n");
                 Next($files);
                 $end = GetType($input = Key($files)) != "string";
             }
             $get_body = 0;
         } elseif (isset($arguments["PostValues"])) {
             // application/x-www-form-urlencoded body.
             $values = $arguments["PostValues"];
             if (GetType($values) != "array") {
                 return $this->SetError("5 it was not specified a valid POST method values array");
             }
             for (Reset($values), $value = 0; $value < count($values); Next($values), $value++) {
                 $k = Key($values);
                 if (GetType($values[$k]) == "array") {
                     for ($v = 0; $v < count($values[$k]); $v++) {
                         if ($value + $v > 0) {
                             $this->request_body .= "&";
                         }
                         $this->request_body .= UrlEncode($k) . "=" . UrlEncode($values[$k][$v]);
                     }
                 } else {
                     if ($value > 0) {
                         $this->request_body .= "&";
                     }
                     $this->request_body .= UrlEncode($k) . "=" . UrlEncode($values[$k]);
                 }
             }
             $this->request_headers["Content-Type"] = "application/x-www-form-urlencoded" . (isset($arguments["CharSet"]) ? "; charset=" . $arguments["CharSet"] : "");
             $get_body = 0;
         }
     }
     // Raw body: either a literal "Body" string or a "BodyStream" list of Data/File parts.
     if ($get_body && (isset($arguments["Body"]) || isset($arguments["BodyStream"]))) {
         if (isset($arguments["Body"])) {
             $this->request_body = $arguments["Body"];
         } else {
             $stream = $arguments["BodyStream"];
             $this->request_body = "";
             for ($part = 0; $part < count($stream); $part++) {
                 if (isset($stream[$part]["Data"])) {
                     $this->request_body .= $stream[$part]["Data"];
                 } elseif (isset($stream[$part]["File"])) {
                     if (!($file = @fopen($stream[$part]["File"], "rb"))) {
                         return $this->SetPHPError("could not open upload file " . $stream[$part]["File"], $php_errormsg);
                     }
                     while (!feof($file)) {
                         if (GetType($block = @fread($file, $this->file_buffer_length)) != "string") {
                             $error = $this->SetPHPError("could not read body stream file " . $stream[$part]["File"], $php_errormsg);
                             fclose($file);
                             return $error;
                         }
                         $this->request_body .= $block;
                     }
                     fclose($file);
                 } else {
                     return "5 it was not specified a valid file or data body stream element at position " . $part;
                 }
             }
         }
         if (!isset($this->request_headers["Content-Type"])) {
             $this->request_headers["Content-Type"] = "application/octet-stream" . (isset($arguments["CharSet"]) ? "; charset=" . $arguments["CharSet"] : "");
         }
     }
     // Per-request authentication credentials override the object-level defaults.
     if (isset($arguments["AuthUser"])) {
         $this->request_user = $arguments["AuthUser"];
     } elseif (isset($this->user)) {
         $this->request_user = $this->user;
     }
     if (isset($arguments["AuthPassword"])) {
         $this->request_password = $arguments["AuthPassword"];
     } elseif (isset($this->password)) {
         $this->request_password = $this->password;
     }
     if (isset($arguments["AuthRealm"])) {
         $this->request_realm = $arguments["AuthRealm"];
     } elseif (isset($this->realm)) {
         $this->request_realm = $this->realm;
     }
     if (isset($arguments["AuthWorkstation"])) {
         $this->request_workstation = $arguments["AuthWorkstation"];
     } elseif (isset($this->workstation)) {
         $this->request_workstation = $this->workstation;
     }
     // Without a proxy (or once tunnelled through one) only the path is sent;
     // otherwise the request line carries the absolute URL.
     if (strlen($this->proxy_host_name) == 0 || $connect) {
         $request_uri = $this->request_uri;
     } else {
         switch (strtolower($this->protocol)) {
             case "http":
                 $default_port = 80;
                 break;
             case "https":
                 $default_port = 443;
                 break;
             default:
                 // Unknown protocol: no default port, so any non-zero port is written out.
                 $default_port = 0;
                 break;
         }
         $request_uri = strtolower($this->protocol) . "://" . $this->host_name . ($this->host_port == 0 || $this->host_port == $default_port ? "" : ":" . $this->host_port) . $this->request_uri;
     }
     if ($this->use_curl) {
         // curl_version() may return an array or a "libcurl/x.y.z ..." string.
         $version = GetType($v = curl_version()) == "array" ? isset($v["version"]) ? $v["version"] : "0.0.0" : (preg_match("/^libcurl\\/([0-9]+\\.[0-9]+\\.[0-9]+)/", $v, $m) ? $m[1] : "0.0.0");
         $curl_version = 100000 * intval($this->Tokenize($version, ".")) + 1000 * intval($this->Tokenize(".")) + intval($this->Tokenize(""));
         $protocol_version = $curl_version < 713002 ? "1.0" : $this->protocol_version;
     } else {
         $protocol_version = $this->protocol_version;
     }
     $this->request = $this->request_method . " " . $request_uri . " HTTP/" . $protocol_version;
     // Multipart bodies set $body_length above; otherwise use the in-memory body size.
     if ($body_length || ($body_length = strlen($this->request_body))) {
         $this->request_headers["Content-Length"] = $body_length;
     }
     // Flatten the header map into "Name: value" lines and note whether Host was supplied.
     for ($headers = array(), $host_set = 0, Reset($this->request_headers), $header = 0; $header < count($this->request_headers); Next($this->request_headers), $header++) {
         $header_name = Key($this->request_headers);
         $header_value = $this->request_headers[$header_name];
         if (GetType($header_value) == "array") {
             for (Reset($header_value), $value = 0; $value < count($header_value); Next($header_value), $value++) {
                 $headers[] = $header_name . ": " . $header_value[Key($header_value)];
             }
         } else {
             $headers[] = $header_name . ": " . $header_value;
         }
         if (strtolower(Key($this->request_headers)) == "host") {
             $this->request_host = strtolower($header_value);
             $host_set = 1;
         }
     }
     if (!$host_set) {
         $headers[] = "Host: " . $this->host_name;
         $this->request_host = strtolower($this->host_name);
     }
     if (count($this->cookies)) {
         $cookies = array();
         $this->PickCookies($cookies, 0);
         if (strtolower($this->protocol) == "https") {
             $this->PickCookies($cookies, 1);
         }
         if (count($cookies)) {
             $h = count($headers);
             $headers[$h] = "Cookie:";
             for (Reset($cookies), $cookie = 0; $cookie < count($cookies); Next($cookies), $cookie++) {
                 $cookie_name = Key($cookies);
                 $headers[$h] .= " " . $cookie_name . "=" . $cookies[$cookie_name]["value"] . ";";
             }
         }
     }
     $next_state = "RequestSent";
     if ($this->use_curl) {
         if (isset($arguments['StreamRequest'])) {
             return $this->SetError("Streaming request data is not supported when using Curl");
         }
         if ($body_length && strlen($this->request_body) == 0) {
             // Multipart body: assemble all parts (reading upload files) into one string.
             // NOTE(review): $success from this loop is overwritten by the curl_exec()
             // result below, so an unreadable upload file does not abort the request here.
             for ($request_body = "", $success = 1, $part = 0; $part < count($post_parts); $part++) {
                 $request_body .= $post_parts[$part]["HEADERS"] . $post_parts[$part]["DATA"];
                 if (isset($post_parts[$part]["FILENAME"])) {
                     if (!($file = @fopen($post_parts[$part]["FILENAME"], "rb"))) {
                         $this->SetPHPError("could not open upload file " . $post_parts[$part]["FILENAME"], $php_errormsg);
                         $success = 0;
                         break;
                     }
                     while (!feof($file)) {
                         if (GetType($block = @fread($file, $this->file_buffer_length)) != "string") {
                             $this->SetPHPError("could not read upload file", $php_errormsg);
                             $success = 0;
                             break;
                         }
                         $request_body .= $block;
                     }
                     fclose($file);
                     if (!$success) {
                         break;
                     }
                 }
                 $request_body .= "\r\n";
             }
             $request_body .= "--" . $boundary . "--\r\n";
         } else {
             $request_body = $this->request_body;
         }
         curl_setopt($this->connection, CURLOPT_HEADER, 1);
         curl_setopt($this->connection, CURLOPT_RETURNTRANSFER, 1);
         if ($this->timeout) {
             curl_setopt($this->connection, CURLOPT_TIMEOUT, $this->timeout);
         }
         // NOTE(review): peer and host certificate verification are switched off here.
         curl_setopt($this->connection, CURLOPT_SSL_VERIFYPEER, 0);
         curl_setopt($this->connection, CURLOPT_SSL_VERIFYHOST, 0);
         $request = $this->request . "\r\n" . implode("\r\n", $headers) . "\r\n\r\n" . $request_body;
         curl_setopt($this->connection, CURLOPT_CUSTOMREQUEST, $request);
         if ($this->debug) {
             $this->OutputDebug("C " . $request);
         }
         if (!($success = strlen($this->response = curl_exec($this->connection)) != 0)) {
             $error = curl_error($this->connection);
             $this->SetError("Could not execute the request" . (strlen($error) ? ": " . $error : ""));
         }
     } else {
         // Socket path: write the request line, the headers, a blank line, then the body.
         if ($success = $this->PutLine($this->request)) {
             for ($header = 0; $header < count($headers); $header++) {
                 if (!($success = $this->PutLine($headers[$header]))) {
                     break;
                 }
             }
             if ($success && ($success = $this->PutLine(""))) {
                 if (isset($arguments['StreamRequest'])) {
                     $next_state = "SendingRequestBody";
                 } elseif ($body_length) {
                     if (strlen($this->request_body)) {
                         $success = $this->PutData($this->request_body);
                     } else {
                         for ($part = 0; $part < count($post_parts); $part++) {
                             if (!($success = $this->PutData($post_parts[$part]["HEADERS"])) || !($success = $this->PutData($post_parts[$part]["DATA"]))) {
                                 break;
                             }
                             if (isset($post_parts[$part]["FILENAME"])) {
                                 if (!($file = @fopen($post_parts[$part]["FILENAME"], "rb"))) {
                                     $this->SetPHPError("could not open upload file " . $post_parts[$part]["FILENAME"], $php_errormsg);
                                     $success = 0;
                                     break;
                                 }
                                 while (!feof($file)) {
                                     if (GetType($block = @fread($file, $this->file_buffer_length)) != "string") {
                                         $this->SetPHPError("could not read upload file", $php_errormsg);
                                         $success = 0;
                                         break;
                                     }
                                     if (!($success = $this->PutData($block))) {
                                         break;
                                     }
                                 }
                                 fclose($file);
                                 if (!$success) {
                                     break;
                                 }
                             }
                             if (!($success = $this->PutLine(""))) {
                                 break;
                             }
                         }
                         if ($success) {
                             $success = $this->PutLine("--" . $boundary . "--");
                         }
                     }
                     if ($success) {
                         // Keep the flush result so a failed flush is reported below.
                         $success = $this->FlushData();
                     }
                 }
             }
         }
     }
     if (!$success) {
         return $this->SetError("5 could not send the HTTP request: " . $this->error);
     }
     $this->state = $next_state;
     return "";
 }
Example #3
0
 /**
  * Run the crawl starting at $this->url_to_crawl.
  *
  * URLs are kept in per-priority arrays ($this->urls_to_crawl[level]); the
  * highest non-empty level is always processed first. Each URL is requested
  * through $this->pageRequest->receivePage(), passed to handlePageData(), and
  * the links it yields are filtered and queued. Every request is counted via
  * incrementHttpRequests() when a database handle is available.
  *
  * When finished, $this->status_return holds the crawl report, and if
  * $this->firstCrawl is set the "tests" table row for $this->testId is updated.
  *
  * NOTE(review): this loop relies on each(), which was removed in PHP 8.
  *
  * @return void
  */
 function go()
 {
     // connectToDb() fills $db by reference; initialise it so later checks are well defined.
     $db = null;
     connectToDb($db);
     $starting_time = $this->getmicrotime();
     // Init, split given URL into host, port, path and file a.s.o.
     $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl);
     // Set base-host and base-path "global" for this class,
     // we need it very often (i guess at this point...)
     $this->base_path = $url_parts["path"];
     $this->base_host = $url_parts["host"];
     $this->base_domain = $url_parts["domain"];
     // If the base port wasn't set by the user ->
     // take the one from the given start-URL.
     if ($this->base_port == "") {
         $this->base_port = $url_parts["port"];
     }
     // If the base-port WAS set by the user, it overrides the one in the URL.
     $url_parts["port"] = $this->base_port;
     // Reset the base_url
     $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts);
     $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl);
     // Init counters
     $links_followed = 0;
     $files_received = 0;
     // Put the first url into our main-array
     $tmp = array();
     $tmp[0]["url_rebuild"] = $this->url_to_crawl;
     PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches);
     if (isset($tmp[0]["url_rebuild"]) && $tmp[0]["url_rebuild"] != "") {
         PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
     }
     // MAIN-LOOP -------------------------------------------------------------------
     // It works like this:
     // The first loop looks through all the "Priority"-arrays and checks if any
     // of these arrays is filled with URLS.
     for ($pri_level = $this->max_priority_level + 1; $pri_level > -1; $pri_level--) {
         // Yep. Found a priority-array with at least one URL
         if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling)) {
             // Reset the flag for every level: a value left over from a previously
             // abandoned level must not keep an already exhausted array alive.
             $stop_crawling_this_level = false;
             // Now "process" all URLS in this priority-array
             @reset($this->urls_to_crawl[$pri_level]);
             while (list($key) = @each($this->urls_to_crawl[$pri_level])) {
                 $all_start = $this->getmicrotime();
                 $stop_crawling_this_level = false;
                 // init
                 // Request URL (crawl())
                 unset($page_data);
                 if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"])) {
                     $this->urls_to_crawl[$pri_level][$key]["referer_url"] = "";
                 }
                 // Increment number of HTTP requests sent as fsockopen is called next
                 if ($db) {
                     incrementHttpRequests($db, $this->testId);
                 }
                 $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"], $this->urls_to_crawl[$pri_level][$key]["referer_url"]);
                 // If the request-object just ignored the URL ->
                 // -> Stop and remove URL from Array
                 if ($page_data == false) {
                     unset($this->urls_to_crawl[$pri_level][$key]);
                     continue;
                 }
                 $links_followed++;
                 // Now $page_data["links_found"] contains all found links at this point
                 // Check if a "<base href.."-tag is given in the source and extract
                 // the base URL
                 // !! Doesn't have to be rebuilt because it can only be a fully
                 // qualified URL !!
                 $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]);
                 if ($base_url == "") {
                     $actual_url =& $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                 } else {
                     $actual_url = $base_url;
                 }
                 // Set flag "content_found" if..content was found
                 if (isset($page_data["http_status_code"]) && $page_data["http_status_code"] == 200) {
                     $content_found = true;
                 }
                 // Check for a REDIRECT-header and if wanted, put it into the array of found links
                 $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]);
                 if ($redirect && $this->follow_redirects == true) {
                     $tmp_array["link_raw"] = $redirect;
                     $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                     $page_data["links_found"][] = $tmp_array;
                 }
                 // Count files that have been received completely
                 if ($page_data["received"] == true) {
                     $files_received++;
                 }
                 // If traffic-limit is reached -> stop crawling
                 if ($page_data["traffic_limit_reached"] == true) {
                     $stop_crawling = true;
                 }
                 // Check if pagelimit is reached if set
                 // (and check WHICH page-limit was set)
                 if ($this->page_limit_all > 0) {
                     if ($this->page_limit_count_ct_only == true && $files_received >= $this->page_limit_all) {
                         $stop_crawling = true;
                     } elseif ($this->page_limit_count_ct_only == false && $links_followed >= $this->page_limit_all) {
                         $stop_crawling = true;
                     }
                 }
                 // Add the actual referer to the page_data array for the handlePageData-method
                 $page_data["refering_linktext"] =& $this->urls_to_crawl[$pri_level][$key]["linktext"];
                 $page_data["refering_link_raw"] =& $this->urls_to_crawl[$pri_level][$key]["link_raw"];
                 $page_data["refering_linkcode"] =& $this->urls_to_crawl[$pri_level][$key]["linkcode"];
                 // build new absolute URLs from found links
                 $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url);
                 // Call the overridable user-function here, but first
                 // "save" the found links from user-manipulation
                 $links_found = $page_data["links_found"];
                 $user_return = $this->handlePageData($page_data);
                 // Stop crawling if user returned a negative value
                 if ($user_return < 0) {
                     $stop_crawling = true;
                     $page_data["user_abort"] = true;
                 }
                 // Compare the found links with link-priorities set by the user
                 // and add the priority-level to our array $links_found
                 if ($this->benchmark == true) {
                     $bm_start = $this->getmicrotime();
                 }
                 PHPCrawlerUtils::addURLPriorities($links_found, $this->link_priorities);
                 if ($this->benchmark == true) {
                     echo "addUrlPriorities(): " . ($this->getmicrotime() - $bm_start) . "<br>";
                 }
                 // Here we can delete the tmp-file maybe created by the pageRequest-object
                 if (file_exists($this->pageRequest->tmp_file)) {
                     @unlink($this->pageRequest->tmp_file);
                 }
                 // Stop everything if a limit was reached. Leaving this loop is enough:
                 // the outer loop skips every remaining level once $stop_crawling is set.
                 if (isset($stop_crawling)) {
                     break;
                 }
                 // Remove links to other hosts if follow_mode is 2 or 3
                 if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3) {
                     PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                 }
                 // Remove links to other domains if follow_mode=1
                 if ($this->general_follow_mode == 1) {
                     PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                 }
                 // Remove "pathUp"-links if follow_mode=3
                 // (fe: base-site: www.foo.com/bar/index.htm -> dont follow: www.foo.com/anotherbar/xyz)
                 if ($this->general_follow_mode == 3) {
                     PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl);
                 }
                 // If given, dont follow "not matching"-links
                 // (dont follow given preg_matches)
                 if (count($this->not_follow_matches) > 0) {
                     PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches);
                 }
                 // If given, just follow "matching"-links
                 // (only follow given preg_matches)
                 if (count($this->follow_matches) > 0) {
                     $links_found =& PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches);
                 }
                 // Add found and filtered links to the main_array urls_to_crawl
                 if ($this->benchmark == true) {
                     $bm_start = $this->getmicrotime();
                 }
                 PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                 if ($this->benchmark == true) {
                     echo "addToArray(): " . ($this->getmicrotime() - $bm_start) . "<br>";
                 }
                 // If there wasn't any content found so far (code 200) and there's
                 // a redirect location
                 // -> follow it, doesn't matter what follow-mode was chosen !
                 // (put it into the main-array !)
                 if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true) {
                     $rd[0]["url_rebuild"] = phpcrawlerutils::buildURL($redirect, $actual_url);
                     $rd[0]["priority_level"] = 0;
                     PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                 }
                 // Now we remove the actual URL from the priority-array
                 // (it has been crawled and is not needed anymore)
                 unset($this->urls_to_crawl[$pri_level][$key]);
                 // Now we check if a priority-array with a higher priority
                 // contains URLS and if so, stop processing this pri-array and "switch" to the higher
                 // one
                 for ($pri_level_check = $this->max_priority_level + 1; $pri_level_check > $pri_level; $pri_level_check--) {
                     if (isset($this->urls_to_crawl[$pri_level_check])) {
                         $stop_crawling_this_level = true;
                         break;
                     }
                 }
                 // Stop crawling this level and restart the outer loop from the top level
                 if ($stop_crawling_this_level == true) {
                     $pri_level = $this->max_priority_level + 1;
                     break;
                 }
                 // echo "All:".($this->getmicrotime()-$all_start);
             }
             // end of loop over priority-array
             // If a priority_level was crawled completely -> unset the whole array
             if ($stop_crawling_this_level == false) {
                 unset($this->urls_to_crawl[$pri_level]);
             }
         }
         // end if priority-level exists
     }
     // end of main loop
     // Loop stopped here, build report-array (status_return)
     $this->status_return["links_followed"] = $links_followed;
     $this->status_return["files_received"] = $files_received;
     $this->status_return["bytes_received"] = $this->pageRequest->traffic_all;
     // $page_data is unset or false when no page was received at all, so guard
     // this lookup the same way as the other report fields below.
     if (isset($page_data["traffic_limit_reached"])) {
         $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"];
     } else {
         $this->status_return["traffic_limit_reached"] = false;
     }
     if (isset($page_data["file_limit_reached"])) {
         $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"];
     } else {
         $this->status_return["file_limit_reached"] = false;
     }
     if (isset($page_data["user_abort"])) {
         $this->status_return["user_abort"] = $page_data["user_abort"];
     } else {
         $this->status_return["user_abort"] = false;
     }
     if (isset($stop_crawling)) {
         $this->status_return["limit_reached"] = true;
     } else {
         $this->status_return["limit_reached"] = false;
     }
     // Process-time
     $this->status_return["process_runtime"] = $this->getMicroTime() - $starting_time;
     // Average bandwidth / throughput; a zero runtime would divide by zero.
     if ($this->status_return["process_runtime"] > 0) {
         $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]);
     } else {
         $this->status_return["data_throughput"] = 0;
     }
     if ($this->firstCrawl) {
         // NOTE(review): testId and duration are interpolated straight into the SQL;
         // confirm both are always numeric or switch to a parameterised query.
         $query = "UPDATE tests SET status = 'Finished Crawling!' WHERE id = {$this->testId};";
         if (connectToDb($db)) {
             $db->query($query);
             $duration = $this->status_return["process_runtime"];
             $query = "UPDATE tests SET duration = {$duration} WHERE id = {$this->testId};";
             $db->query($query);
         }
     }
 }
Example #4
0
/**
 * Checks whether the first HTTPS URL in $urlsToTest presents an SSL
 * certificate that verifies against the local cacert.pem CA bundle, and
 * records an 'sslcert' finding in test_results when the request fails.
 *
 * Steps:
 *  1. Pick the first URL whose scheme is https.
 *  2. Try to refresh the local CA bundle (tests/cacert.pem) from curl's site.
 *  3. Request the URL with cURL with peer and host verification enabled.
 *  4. If cURL reports a failure, store the finding (once) for this test.
 *
 * @param array|string $urlsToTest URLs to inspect (a single URL string is also accepted).
 * @param mixed        $testId     Id of the test run; used in status updates and result rows.
 *
 * @return void
 */
function testSslCertificate($urlsToTest, $testId)
{
    connectToDb($db);
    // $urlsToTest is iterated as an array below, so flatten it for messages
    // instead of interpolating it directly (which would print "Array").
    $urlList = is_array($urlsToTest) ? implode(', ', $urlsToTest) : (string) $urlsToTest;
    if ($db) {
        updateStatus($db, "Testing {$urlList} for untrustworthy SSL certificates...", $testId);
    }
    $log = new Logger();
    $log->lfile('logs/eventlogs');
    $log->lwrite("Starting SSL certificate verification function on {$urlList}");
    //Identify which URLs, if any, begin with https
    $log->lwrite("Identifying which URLs, if any, begin with HTTPS");
    if ($db) {
        updateStatus($db, "Identifying which URLs, if any, begin with HTTPS...", $testId);
    }
    $httpsUrl = '';
    foreach ((array) $urlsToTest as $currentUrl) {
        // Match the full scheme, case-insensitively, so "HTTPS://..." is found
        // and strings that merely start with the letters "https" are not.
        if (strncasecmp($currentUrl, 'https://', 8) === 0) {
            $httpsUrl = $currentUrl;
            echo 'https url = ' . htmlspecialchars($currentUrl, ENT_QUOTES) . ' <br>';
            $log->lwrite("Found HTTPS URL: {$currentUrl}");
            break;
        }
    }
    if ($httpsUrl === '') {
        // Nothing to verify.
        return;
    }

    // Local copy of the CA bundle that this function keeps up to date.
    $caBundlePath = 'tests/cacert.pem';

    //Check if Mozilla's cacert.pem file is online and update our version of it if needed
    // NOTE(review): the bundle is fetched over plain HTTP, so a man-in-the-middle
    // could replace the trust roots. Consider switching to an HTTPS source.
    $log->lwrite("Checking if cacert.pem is up to date");
    $http = new http_class();
    $http->timeout = 0;
    $http->data_timeout = 0;
    //$http->debug=1;
    $http->user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
    $http->follow_redirect = 1;
    $http->redirection_limit = 5;
    $cacertsUrl = "http://curl.haxx.se/ca/cacert.pem";
    $error = $http->GetRequestArguments($cacertsUrl, $arguments);
    if ($error == "") {
        // Only open the connection when the arguments could be built.
        $error = $http->Open($arguments);
    }
    $log->lwrite("URL to be requested is: {$cacertsUrl}");
    if ($error == "") {
        $log->lwrite("Sending HTTP request to {$cacertsUrl}");
        $error = $http->SendRequest($arguments);
    }
    if ($error == "") {
        $headers = array();
        $error = $http->ReadReplyHeaders($headers);
    }
    if ($error == "") {
        $responseCode = $http->response_status;
        //This is a string
        $log->lwrite("Received response code: {$responseCode}");
        if (intval($responseCode) == 200) {
            //Update cacerts.pem file
            $cacerts = file_get_contents($cacertsUrl);
            $oldCacerts = is_file($caBundlePath) ? file_get_contents($caBundlePath) : false;
            if ($cacerts === false || $cacerts === '') {
                // Never overwrite a working bundle with a failed/empty download.
                $log->lwrite("Problem accessing Mozilla's URL containing cacert.pem file");
            } elseif ($cacerts !== $oldCacerts) {
                if (file_put_contents($caBundlePath, $cacerts) === false) {
                    $log->lwrite("Could not write updated cacert.pem file");
                } else {
                    $log->lwrite("cacert.pem file updated");
                }
            } else {
                $log->lwrite("cacert.pem is already up to date so was not updated");
            }
        } else {
            $log->lwrite("Problem accessing Mozilla's URL containing cacert.pem file");
        }
    } else {
        $log->lwrite("Could not check cacert.pem for updates: {$error}");
    }

    // Initialize session and set URL.
    $ch = curl_init();
    if ($ch === false) {
        $log->lwrite("Could not initialise cURL to verify {$httpsUrl}");
        return;
    }
    curl_setopt($ch, CURLOPT_URL, $httpsUrl);
    // Set so curl_exec returns the result instead of outputting it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    $user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    //Check server's certificate against certificates specified in .pem file below
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    //If last parameter is 1, checks the SSL certificate for a common name (the domain of the site sometimes specified in the certificate), e.g. the site that acquired the certificate
    //If last parameter is 2, checks for the common name and, if it exists, checks that it matches the hostname provided
    //Default is 2
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    //Using Mozillas certificate file with trusted certificates.
    // Prefer the bundle refreshed above; fall back to the legacy location
    // (cacert.pem in the working directory) if that copy does not exist.
    $caInfo = getcwd() . '/' . $caBundlePath;
    if (!is_file($caInfo)) {
        $caInfo = getcwd() . "/cacert.pem";
    }
    curl_setopt($ch, CURLOPT_CAINFO, $caInfo);
    // Get the response and close the channel.
    $response = curl_exec($ch);
    if ($db) {
        incrementHttpRequests($db, $testId);
    }
    // curl_exec() returns false on failure; an empty body from a site with a
    // valid certificate must not be reported as an untrusted certificate.
    if ($response === false) {
        //The echo's here are for testing/debugging the function on its own
        echo '<br>SSL Certificate is not trusted!<br>Url: ' . htmlspecialchars($httpsUrl, ENT_QUOTES) . '<br>';
        echo 'Method: GET <br>';
        echo 'Error: ' . htmlspecialchars(curl_error($ch), ENT_QUOTES) . '<br>';
        if ($db) {
            //Check if this vulnerability has already been found and added to DB. If it hasn't, add it to DB.
            // The URL comes from crawled content, so escape it before building SQL.
            $safeTestId = (int) $testId;
            $safeUrl = $db->real_escape_string($httpsUrl);
            $query = "SELECT * FROM test_results WHERE test_id = {$safeTestId} AND type = 'sslcert' AND method = 'get' AND url = '{$safeUrl}' AND attack_str = '{$safeUrl}'";
            $result = $db->query($query);
            if (!$result) {
                $log->lwrite("Could not execute query {$query}");
            } else {
                $log->lwrite("Successfully executed query {$query}");
                $numRows = $result->num_rows;
                if ($numRows == 0) {
                    $log->lwrite("Number of rows is {$numRows} for query: {$query}");
                    insertTestResult($db, $testId, 'sslcert', 'get', $httpsUrl, $httpsUrl);
                }
            }
        } else {
            $log->lwrite("No database connection; could not record untrusted SSL certificate for {$httpsUrl}");
        }
    }
    curl_close($ch);
}