Example #1
 $urlDerived = "http://www." . $domain . "/";
 // This MUST be the URL we tried the very first time.
 // Get the most recent result from HTTP Archive for this URL.
 $row = doRowQuery("select pageid, urlHtml from {$gPagesTable} where url='{$urlDerived}' order by pageid desc limit 1;");
 if (!$row) {
     // This is the first time for this URL.
     // We'll have to clean it up next time.
     continue;
 }
 $pageid = $row['pageid'];
 $urlHtml = $row['urlHtml'];
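 // The derived URL differs from the final HTML document URL, presumably because the site redirected (checked below via the 301/302 status).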
 if ($urlDerived != $urlHtml) {
     $xdiff++;
     // CVSNO
     // Get more info about the initial URL.
     $row = doRowQuery("select requestid, status, resp_location, resp_cache_control, resp_expires from {$gRequestsTable} where pageid={$pageid} and url='{$urlDerived}' order by requestid asc limit 1;");
     $requestid = $row['requestid'];
     $status = $row['status'];
     $resp_cache_control = $row['resp_cache_control'];
     $resp_expires = $row['resp_expires'];
     $resp_location = $row['resp_location'];
     if (0 === strpos($resp_location, "/")) {
          // Relative Location header: resolve it against the derived base URL, stripping the leading "/" to avoid a double slash.
          $resp_location = $urlDerived . substr($resp_location, 1);
     }
     if (301 == $status || 302 == $status) {
         $x300++;
         // CVSNO
          if (false !== strpos($urlHtml, "?")) {
             // Don't store a derived URL that contains a querystring.
             $snotfixed .= "NOTFIXED (querystring): {$urlDerived} != {$urlHtml}\n";
Example #2
    if ($minid === $maxid) {
        tprint("The rows have already been copied.");
    } else {
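        // REPLACE INTO inserts the copied rows and overwrites any existing rows that share the same primary key.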
        $cmd = "replace into {$pagesTable} select * from pagestmp where pageid >= {$minid} and pageid <= {$maxid} and maxDomainReqs != 0;";
        tprint("Replacing " . ($maxid - $minid) . " rows from pagestmp to {$pagesTable}:\n  {$cmd}");
        doSimpleCommand($cmd);
        tprint("...done copying rows.");
    }
}
// 5. Recalculate stats
tprint("\n5. Recalculate stats...");
// TODO - This script doesn't detect & skip this step if it's already been done, but it's very fast (20 seconds) so we won't worry.
// TODO - could test for bytesFont or perFonts
tprint("Update numPages and numRequests in crawls:");
// It's possible that while removing orphans and pages with wptid="" some meta-crawl information has changed:
$row = doRowQuery("select count(*) as numPages, min(pageid) as minPageid, max(pageid) as maxPageid from {$pagesTable} where {$pageidCond};");
$numRequests = doSimpleQuery("select count(*) from {$requestsTable} where {$pageidCond};");
doSimpleCommand("update {$gCrawlsTable} set numPages = " . $row['numPages'] . ", minPageid = " . $row['minPageid'] . ", maxPageid = " . $row['maxPageid'] . ", numRequests = {$numRequests} where label = '{$label}' and location='{$device}';");
tprint("Compute stats:");
removeStats($label, NULL, $device);
computeMissingStats($device, true);
tprint("...done recalculating stats.");
// 6. Mysqldump
tprint("\n6. Mysqldump & wrap up...");
$col = doSimpleQuery("show columns from {$requestsTable} like '%redirectUrlShort%';");
if ($col) {
    tprint("You have to remove the redirectUrlShort column before we can do the dumps:\n" . "    alter table {$requestsTable} drop column redirectUrlShort;");
    tprint("almost done!");
} else {
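    // Build a temporary directory name for the mysqldump output, based on the crawl label.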
    $labelUnderscore = str_replace(" ", "_", $label);
    $tmpdir = "/tmp/{$labelUnderscore}." . time();
Example #3
See the License for the specific language governing permissions and
limitations under the License.
*/
require_once "utils.inc";
require_once "ui.inc";
require_once "pages.inc";
$gArchive = getParam('a');
$gLabel = getParam('l');
$gPageid = getParam('p');
$gFormat = getParam('format');
if ($gPageid && "csv" == $gFormat) {
    // request to download a CSV of an individual website's requests data
    header('Content-Type: application/octet-stream; name="httparchive.csv"');
    header('Content-Disposition: inline; filename="httparchive_page' . $gPageid . '.csv"');
    $aColumns = array("url", "mimeType", "method", "status", "time", "respSize", "reqCookieLen", "respCookieLen", "reqHttpVersion", "respHttpVersion", "req_accept", "req_accept_charset", "req_accept_encoding", "req_accept_language", "req_connection", "req_host", "req_referer", "resp_accept_ranges", "resp_age", "resp_cache_control", "resp_connection", "resp_content_encoding", "resp_content_language", "resp_content_length", "resp_content_location", "resp_content_type", "resp_date", "resp_etag", "resp_expires", "resp_keep_alive", "resp_last_modified", "resp_location", "resp_pragma", "resp_server", "resp_transfer_encoding", "resp_vary", "resp_via", "resp_x_powered_by");
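    // Start with the CSV header row: the column names joined by commas.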
    $sRows = implode(",", $aColumns);
    $sRows .= "\n";
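    // Look up the WebPagetest test ID and run number for this page, then fetch its resources via pageFromWPT().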
    $row = doRowQuery("select wptid, wptrun from {$gPagesTable} where pageid = {$gPageid};");
    $page = pageFromWPT($row['wptid'], $row['wptrun']);
    $aResources = $page['resources'];
    foreach ($aResources as $resource) {
        foreach ($aColumns as $column) {
            $sRows .= (array_key_exists($column, $resource) ? '"' . $resource[$column] . '"' : "") . ",";
            // wrap in double quotes in case of commas
        }
        $sRows = rtrim($sRows, ",");
        // remove trailing comma
        $sRows .= "\n";
    }
    echo $sRows;
}
Example #4
    exit;
}
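// totalNotDone() presumably counts the URLs in this pass that WebPagetest has not yet finished; zero means the pass is complete.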
if (0 === totalNotDone()) {
    $curPasses++;
    updateCrawl($labelFromRun, $gArchive, $locations[0], array("passes" => $curPasses));
    if ($curPasses < $maxPasses) {
        resubmitFailures();
        cprint("Resubmitted failures - going around once again...");
    } else {
        // We just finished the last pass. Wrap it up...
        cprint(date("G:i") . ": DONE with tests. Copying...");
        $gParamLabel = $labelFromRun;
        // hack!
        // IMPORTANT: Update crawl info FIRST because many pieces of code reference this:
        $query = "select min(pageid) as minid, max(pageid) as maxid from {$gPagesTable} where label='{$labelFromRun}';";
        $row = doRowQuery($query);
        $minid = $row['minid'];
        $maxid = $row['maxid'];
        $numPages = doSimpleQuery("select count(*) from {$gPagesTable} where pageid >= {$minid} and pageid <= {$maxid};");
        $numRequests = doSimpleQuery("select count(*) from {$gRequestsTable} where pageid >= {$minid} and pageid <= {$maxid};");
        updateCrawl($labelFromRun, $gArchive, $locations[0], array("minPageid" => $minid, "maxPageid" => $maxid, "numErrors" => statusErrors(), "numPages" => $numPages, "numRequests" => $numRequests));
        // Copy rows, calc stats, create dump files, etc.
        require_once "copy.php";
        updateCrawl($labelFromRun, $gArchive, $locations[0], array("finishedDateTime" => time()));
        cprint(date("G:i") . ": DONE with crawl!");
        exit(0);
    }
}
// TODO - Combine "obtain" and "parse"?
// The "crawl" process has multiple distinct tasks. This is because the URLs are
// sent to and queued at WebPagetest which is asynchronous.