$urlDerived = "http://www." . $domain . "/"; // This MUST be the URL we tried the very first time. // Get the most recent result from HTTP Archive for this URL. $row = doRowQuery("select pageid, urlHtml from {$gPagesTable} where url='{$urlDerived}' order by pageid desc limit 1;"); if (!$row) { // This is the first time for this URL. // We'll have to clean it up next time. continue; } $pageid = $row['pageid']; $urlHtml = $row['urlHtml']; if ($urlDerived != $urlHtml) { $xdiff++; // CVSNO // Get more info about the initial URL. $row = doRowQuery("select requestid, status, resp_location, resp_cache_control, resp_expires from {$gRequestsTable} where pageid={$pageid} and url='{$urlDerived}' order by requestid asc limit 1;"); $requestid = $row['requestid']; $status = $row['status']; $resp_cache_control = $row['resp_cache_control']; $resp_expires = $row['resp_expires']; $resp_location = $row['resp_location']; if (0 === strpos($resp_location, "/")) { // relative location $resp_location = $urlDerived + strstr($resp_location, 1); } if (301 == $status || 302 == $status) { $x300++; // CVSNO if (false != strpos($urlHtml, "?")) { // Don't store a derived URL that contains a querystring. $snotfixed .= "NOTFIXED (querystring): {$urlDerived} != {$urlHtml}\n";
if ($minid === $maxid) {
    tprint("The rows have already been copied.");
} else {
    $cmd = "replace into {$pagesTable} select * from pagestmp where pageid >= {$minid} and pageid <= {$maxid} and maxDomainReqs != 0;";
    tprint("Replacing " . ($maxid - $minid) . " rows from pagestmp to {$pagesTable}:\n    {$cmd}");
    doSimpleCommand($cmd);
    tprint("...done copying rows.");
}
}

// 5. Recalculate stats
tprint("\n5. Recalculate stats...");
// TODO - This script doesn't detect & skip this step if it's already been done, but it's very fast (20 seconds) so we won't worry.
// TODO - could test for bytesFont or perFonts
tprint("Update numPages and numRequests in crawls:");
// It's possible that while removing orphans and pages with wptid="" some meta-crawl information has changed:
$row = doRowQuery("select count(*) as numPages, min(pageid) as minPageid, max(pageid) as maxPageid from {$pagesTable} where {$pageidCond};");
$numRequests = doSimpleQuery("select count(*) from {$requestsTable} where {$pageidCond};");
doSimpleCommand("update {$gCrawlsTable} set numPages = " . $row['numPages'] . ", minPageid = " . $row['minPageid'] . ", maxPageid = " . $row['maxPageid'] .
    ", numRequests = {$numRequests} where label = '{$label}' and location='{$device}';");

tprint("Compute stats:");
removeStats($label, NULL, $device);
computeMissingStats($device, true);
tprint("...done recalculating stats.");

// 6. Mysqldump
tprint("\n6. Mysqldump & wrap up...");
$col = doSimpleQuery("show columns from {$requestsTable} like '%redirectUrlShort%';");
if ($col) {
    tprint("You have to remove the redirectUrlShort column before we can do the dumps:\n" .
        "    alter table {$requestsTable} drop column redirectUrlShort;");
    tprint("almost done!");
} else {
    $labelUnderscore = str_replace(" ", "_", $label);
    $tmpdir = "/tmp/{$labelUnderscore}." . time();
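    // A minimal sketch (not the original wrap-up code) of the kind of dump command
    // step 6 goes on to build from $tmpdir. The database name "httparchive" and the
    // exact mysqldump flags are assumptions; only $tmpdir, $labelUnderscore, and
    // $requestsTable come from the script above.
    $exampleDumpCmd = "mysqldump --opt --skip-add-drop-table httparchive {$requestsTable}"
                    . " | gzip > {$tmpdir}/httparchive_{$labelUnderscore}_requests.gz";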
See the License for the specific language governing permissions and
limitations under the License.
*/

require_once "utils.inc";
require_once "ui.inc";
require_once "pages.inc";

$gArchive = getParam('a');
$gLabel = getParam('l');
$gPageid = getParam('p');
$gFormat = getParam('format');

if ($gPageid && "csv" == $gFormat) {
    // Request to download a CSV of an individual website's requests data.
    header('Content-Type: application/octet-stream; name="httparchive.csv"');
    header('Content-Disposition: inline; filename="httparchive_page' . $gPageid . '.csv"');

    $aColumns = array("url", "mimeType", "method", "status", "time", "respSize", "reqCookieLen", "respCookieLen",
                      "reqHttpVersion", "respHttpVersion", "req_accept", "req_accept_charset", "req_accept_encoding",
                      "req_accept_language", "req_connection", "req_host", "req_referer", "resp_accept_ranges",
                      "resp_age", "resp_cache_control", "resp_connection", "resp_content_encoding", "resp_content_language",
                      "resp_content_length", "resp_content_location", "resp_content_type", "resp_date", "resp_etag",
                      "resp_expires", "resp_keep_alive", "resp_last_modified", "resp_location", "resp_pragma",
                      "resp_server", "resp_transfer_encoding", "resp_vary", "resp_via", "resp_x_powered_by");
    $sRows = implode(",", $aColumns);
    $sRows .= "\n";

    $row = doRowQuery("select wptid, wptrun from {$gPagesTable} where pageid = {$gPageid};");
    $page = pageFromWPT($row['wptid'], $row['wptrun']);
    $aResources = $page['resources'];
    foreach ($aResources as $resource) {
        foreach ($aColumns as $column) {
            $sRows .= (array_key_exists($column, $resource) ? '"' . $resource[$column] . '"' : "") . ","; // wrap in double quotes in case of commas
        }
        $sRows = rtrim($sRows, ","); // remove trailing comma (rtrim returns a new string, so assign it back)
        $sRows .= "\n";
    }

    echo $sRows;
}
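// A minimal, hypothetical alternative to the manual quoting above: fputcsv()
// also escapes embedded double quotes, which plain string concatenation does not.
// buildCsv() is a sketch, not a function defined elsewhere in this codebase.
function buildCsv($aColumns, $aResources) {
    $handle = fopen("php://temp", "r+");
    fputcsv($handle, $aColumns); // header row
    foreach ($aResources as $resource) {
        $aFields = array();
        foreach ($aColumns as $column) {
            $aFields[] = array_key_exists($column, $resource) ? $resource[$column] : "";
        }
        fputcsv($handle, $aFields); // handles quoting and escaping per field
    }
    rewind($handle);
    $csv = stream_get_contents($handle);
    fclose($handle);
    return $csv;
}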
    exit;
}

if (0 === totalNotDone()) {
    $curPasses++;
    updateCrawl($labelFromRun, $gArchive, $locations[0], array("passes" => $curPasses));
    if ($curPasses < $maxPasses) {
        resubmitFailures();
        cprint("Resubmitted failures - going around once again...");
    } else {
        // We just finished the last pass. Wrap it up...
        cprint(date("G:i") . ": DONE with tests. Copying...");
        $gParamLabel = $labelFromRun; // hack!

        // IMPORTANT: Update crawl info FIRST because many pieces of code reference this:
        $query = "select min(pageid) as minid, max(pageid) as maxid from {$gPagesTable} where label='{$labelFromRun}';";
        $row = doRowQuery($query);
        $minid = $row['minid'];
        $maxid = $row['maxid'];
        $numPages = doSimpleQuery("select count(*) from {$gPagesTable} where pageid >= {$minid} and pageid <= {$maxid};");
        $numRequests = doSimpleQuery("select count(*) from {$gRequestsTable} where pageid >= {$minid} and pageid <= {$maxid};");
        updateCrawl($labelFromRun, $gArchive, $locations[0], array(
            "minPageid" => $minid,
            "maxPageid" => $maxid,
            "numErrors" => statusErrors(),
            "numPages" => $numPages,
            "numRequests" => $numRequests
        ));

        // Copy rows, calc stats, create dump files, etc.
        require_once "copy.php";

        updateCrawl($labelFromRun, $gArchive, $locations[0], array("finishedDateTime" => time()));
        cprint(date("G:i") . ": DONE with crawl!");
        exit(0);
    }
}

// TODO - Combine "obtain" and "parse"?
// The "crawl" process has multiple distinct tasks. This is because the URLs are
// sent to and queued at WebPagetest which is asynchronous.
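// Worked example of the pass handling above (with a hypothetical $maxPasses of 2):
// the first time totalNotDone() reaches 0, $curPasses becomes 1, so resubmitFailures()
// queues the failed URLs and the crawl goes around again; the second time it reaches 0,
// $curPasses becomes 2 and the wrap-up branch runs (crawl info update, copy.php,
// finishedDateTime), after which the script exits.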