function cleanupRequests($location, $table)
{
    global $gSkipRuns, $gbActuallyDoit;
    $query = "select * from crawls where location = '{$location}' and finishedDateTime is not null order by crawlid desc limit " . ($gSkipRuns + 1) . ";";
    $results = doQuery($query);
    mysql_data_seek($results, $gSkipRuns);
    $row = mysql_fetch_assoc($results);
    if ($gbActuallyDoit) {
        $nUnfinished = doSimpleQuery("select count(*) from crawls where location = '{$location}' and finishedDateTime is null;");
        if (0 < $nUnfinished) {
            echo "SORRY! There is an unfinished crawl for location '{$location}'. Skipping the cleanup while the crawl is running.\n";
            return;
        }
        // Actually delete rows and optimize the table.
        echo "Delete requests from \"{$table}\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']}...\n";
        $cmd = "delete from {$table} where crawlid <= {$row['crawlid']};";
        echo "{$cmd}\n";
        doSimpleCommand($cmd);
        echo "DONE\nOptimize table \"{$table}\"...\n";
        doSimpleCommand("optimize table {$table};");
        echo "DONE\n";
    } else {
        echo "WOULD delete requests from \"{$table}\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']}...\n";
    }
}
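
A dry-run sketch (the location name, table name, and globals' values are illustrative, not from the source): with $gbActuallyDoit false the function only reports what it WOULD delete, so a cleanup can be previewed safely.

    $gbActuallyDoit = false;  // dry run: report, don't delete
    $gSkipRuns = 2;           // keep the requests of the 2 most recent finished crawls
    cleanupRequests("California", "requestsdev");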
Example #2
require_once "../utils.inc";
require_once "batch_lib.inc";
require_once "bootstrap.inc";
$date = getdate();
$label = substr($date['month'], 0, 3) . " " . $date['mday'] . " " . $date['year'];
$aCrawlnames = array("dev", "mobile", "android", "chrome");
foreach ($aCrawlnames as $crawlname) {
    $sProblems = "";
    // we fill the status table before creating the crawl
    $numStatus = doSimpleQuery("select count(*) from status{$crawlname} where label = '{$label}';");
    if (0 == $numStatus) {  // loose comparison: doSimpleQuery returns the count as a string
        $sProblems .= "    No URLs have been queued up in the status{$crawlname} table.\n";
    } else {
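        // Sanity thresholds: ~98% of the expected crawl sizes (500K URLs for
        // dev/chrome, 5K for mobile/android; see the loadUrlsFromDB calls in
        // Example #7).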
        if (("dev" === $crawlname || "chrome" === $crawlname) && 490000 > $numStatus || ("mobile" === $crawlname || "android" === $crawlname) && 4900 > $numStatus) {
            $sProblems .= "    Only {$numStatus} URLs have been queued up in the status{$crawlname} table for crawl \"{$label}\".\n";
        }
    }
    // check that the crawl exists and has the right number of URLs
    $device = curDevice($crawlname);
    $crawl = getCrawl($label, null, $device);
    if (!$crawl) {
        $sProblems .= "    Could not find the crawl for \"{$label}\".\n";
    } else {
        $numUrls = $crawl['numUrls'];
        if (("dev" === $crawlname || "chrome" === $crawlname) && $numStatus !== $numUrls || ("mobile" === $crawlname || "android" === $crawlname) && $numStatus !== $numUrls) {
Example #3
// 5. Recalculate stats
tprint("\n5. Recalculate stats...");
// TODO - This script doesn't detect & skip this step if it's already been done, but it's very fast (20 seconds) so we won't worry.
// TODO - could test for bytesFont or perFonts
tprint("Update numPages and numRequests in crawls:");
// It's possible that while removing orphans and pages with wptid="" some meta-crawl information has changed:
$row = doRowQuery("select count(*) as numPages, min(pageid) as minPageid, max(pageid) as maxPageid from {$pagesTable} where {$pageidCond};");
$numRequests = doSimpleQuery("select count(*) from {$requestsTable} where {$pageidCond};");
doSimpleCommand("update {$gCrawlsTable} set numPages = " . $row['numPages'] . ", minPageid = " . $row['minPageid'] . ", maxPageid = " . $row['maxPageid'] . ", numRequests = {$numRequests} where label = '{$label}' and location='{$device}';");
tprint("Compute stats:");
removeStats($label, NULL, $device);
computeMissingStats($device, true);
tprint("...done recalculating stats.");
// 6. Mysqldump
tprint("\n6. Mysqldump & wrap up...");
$col = doSimpleQuery("show columns from {$requestsTable} like '%redirectUrlShort%';");
if ($col) {
    tprint("You have to remove the redirectUrlShort column before we can do the dumps:\n" . "    alter table {$requestsTable} drop column redirectUrlShort;");
    tprint("almost done!");
} else {
    $labelUnderscore = str_replace(" ", "_", $label);
    $tmpdir = "/tmp/{$labelUnderscore}." . time();
    // Unique dir for this dump because mysqldump writes files that aren't writable by this process, and mysqldump -T can NOT overwrite existing files.
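    // (Note, not from the original comments: with -T the MySQL *server* writes the
    // per-table .txt files via SELECT ... INTO OUTFILE, so the directory must live
    // on the DB host, be writable by mysqld, and the user needs the FILE privilege -
    // hence the chmod 777 below.)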
    // pages
    $cmd = "mysqldump --where='pageid >= {$minPageid} and pageid <= {$maxPageid}' --no-create-db --no-create-info --skip-add-drop-table --complete-insert -u {$gMysqlUsername} -p{$gMysqlPassword} -h {$gMysqlServer} {$gMysqlDb} {$pagesTable} | gzip > ../downloads/httparchive_" . ($gbMobile ? "mobile_" : "") . $labelUnderscore . "_pages.gz";
    tprint("Dump pages table:");
    exec($cmd);
    // pages csv
    $cmd = "mkdir {$tmpdir}; chmod 777 {$tmpdir}; " . "mysqldump --where='pageid >= {$minPageid} and pageid <= {$maxPageid}' -u {$gMysqlUsername} -p{$gMysqlPassword} -h {$gMysqlServer} -T {$tmpdir} --fields-enclosed-by=\\\" --fields-terminated-by=, {$gMysqlDb} {$pagesTable}; " . "gzip -f -c {$tmpdir}/{$pagesTable}.txt > ../downloads/httparchive_" . ($gbMobile ? "mobile_" : "") . $labelUnderscore . "_pages.csv.gz";
    tprint("Dump pages table CSV:");
    exec($cmd);
Example #4
header('Content-Type: text/javascript');
$maxResults = 50;
$maxLimit = 200;
// we LIMIT more than the max results so that, after whittling, we have enough results
$term = getParam("term");
if (!$term) {
    return;
    // nothing to search for, so no need to run anything - maybe return a 'warn' response in dev mode
}
$term = strtolower($term);
// always search in lower case (auto-capitalized input is a particular problem on iOS)
// First, get all the urlhashes from the "urls" table that match the search term.
// We CAN'T do ordering here because hashes are shared by multiple URLs,
// and all we're transferring from here are the hashes.
$query = "select group_concat(urlhash) from {$gUrlsTable} where (urlOrig like '%{$term}%' or urlFixed like '%{$term}%');";
$sUrlhashes = doSimpleQuery($query);
// It's possible the list ends in "," which is bad (eg, if urlhash is null).
if ("," === substr($sUrlhashes, -1)) {
    $sUrlhashes = substr($sUrlhashes, 0, -1);
}
// It's possible that we don't have any results for some of these URLs:
// they could have a low rank (> 300K) or always return errors.
// So we have to look for actual results.
// The tricky part of this is doing "group by url" but figuring out what to order on.
// If we order by "rank asc" then NULL gets listed first.
// We fix that by ordering by "brank, rank asc" but that might not match the LATEST results - only the aggregate.
// So it's possible we'll place a URL too high in the ordering if it USED TO BE ranked but now has a NULL rank.
// TODO - better ordering?
// TODO - could also save the newest pageid to urls table (altho is that mobile or desktop?) or just a boolean bAreThereAnyResultsForThisURL
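// Worked example of the ordering: min(rank is null) yields brank=0 for a URL that
// has EVER had a rank and brank=1 for one that never has, so ever-ranked URLs sort
// first; within each group the lowest (best) rank wins.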
$query = "select url, urlhash, max(pageid) as pageid, min(rank is null) as brank, min(rank) as rank from {$gPagesTable} " . "where " . dateRange(true) . " and archive='{$gArchive}' and urlhash in ({$sUrlhashes}) and urlShort like '%{$term}%' group by url order by brank, rank asc limit {$maxLimit};";
$result = doQuery($query);
Example #5
function getSavingsDiffTestnames($bCss, $testname1, $testname2)
{
    $query = "select testnameid from spritemetestnames where testname = '{$testname1}';";
    $testnameid1 = doSimpleQuery($query);
    $query = "select testnameid from spritemetestnames where testname = '{$testname2}';";
    $testnameid2 = doSimpleQuery($query);
    if (!$testnameid1 || !$testnameid2) {
        return;
    }
    $query1 = "select createdate, url, id, sd from spritemesavings where testnameid={$testnameid1} group by url;";
    $result1 = doQuery($query1);
    $sHtml = "";
    if ($bCss) {
        $sHtml .= <<<OUTPUT
<style>
.savingstable TH { padding: 0; color: #333; background: #CCC; font-weight: bold; }
.savingstable TD { padding: 2px 8px 2px 8px; color: #333; background: #FFF; }
TD.surl { padding-right: 0; }
TD.sreqs { text-align: right; padding-right: 16px; padding-left: 0; }
TD.ssize { text-align: right; padding-right: 8px; }
TH.ssize { padding-left: 8px; padding-right: 8px; }
TD.avg { background: #FFF; font-weight: bold; text-decoration: none; border-bottom: 1px solid; }
</style>
OUTPUT;
    }
    $sHtml .= "<table class=savingstable border=0 cellspacing=0 cellpadding=0>\n" . "<tr> <th></th> <th></th> <th class=sreqs>requests<br>eliminated</th> <th class=ssize>bytes<br>saved</th> <th class=sreqs>requests<br>eliminated</th> <th class=ssize>bytes<br>saved</th> </tr>\n";
    $cntr = 0;
    $totalImages1 = 0;
    $totalDelta1 = 0;
    $totalImages2 = 0;
    $totalDelta2 = 0;
    $sRows = "";
    while ($row1 = mysql_fetch_assoc($result1)) {
        $url = $row1['url'];
        $id1 = $row1['id'];
        $sd1 = $row1['sd'];
        $query2 = "select createdate, url, id, sd from spritemesavings where testnameid={$testnameid2} and url='{$url}' limit 1;";
        $result2 = doQuery($query2);
        $url2 = "";
        $id2 = $sd2 = 0;
        while ($row2 = mysql_fetch_assoc($result2)) {
            $url2 = $row2['url'];
            // use this as a flag
            $id2 = $row2['id'];
            $sd2 = $row2['sd'];
            break;
        }
        mysql_free_result($result2);
        if ($url2) {
            $sRows .= "<tr>" . "<td class=sdate>" . date("H:i", $row1['createdate']) . "</td>" . "<td class=surl><a class=ahover href='" . $row1['url'] . "' target='_blank'>" . shortenUrl($url) . "</a></td>" . "<td class=sreqs>{$id1}</td>" . "<td class=ssize>" . myround($sd1 / 1000) . " K</td>" . "<td class=sreqs style='" . ($id2 > $id1 ? "color: #0A0" : ($id2 < $id1 ? "color: #C00" : "")) . "'>{$id2}</td>" . "<td class=ssize style='" . ($sd2 > $sd1 ? "color: #0A0" : ($sd2 < $sd1 ? "color: #C00" : "")) . "'>" . myround($sd2 / 1000) . " K</td>" . "</tr>\n";
            $cntr++;
            $totalImages1 += $id1;
            $totalDelta1 += $sd1;
            $totalImages2 += $id2;
            $totalDelta2 += $sd2;
        }
    }
    mysql_free_result($result1);
    if ($cntr) {
        $sHtml .= "<tr>" . "<td class=avg></td>" . "<td class=avg style='text-align: right;'>AVERAGE SAVINGS</td>" . "<td class='sreqs avg'>" . intval($totalImages1 / $cntr + 0.5) . "</td>" . "<td class='ssize avg'>" . intval($totalDelta1 / ($cntr * 1000) + 0.5) . " K</td>" . "<td class='sreqs avg'>" . intval($totalImages2 / $cntr + 0.5) . "</td>" . "<td class='ssize avg'>" . intval($totalDelta2 / ($cntr * 1000) + 0.5) . " K</td>" . "</tr>\n" . $sRows . "<tr> <th></th> <th></th> <th class=sreqs>requests<br>eliminated</th> <th class=ssize>bytes<br>saved</th> <th class=sreqs>requests<br>eliminated</th> <th class=ssize>bytes<br>saved</th> </tr>\n";
    }
    $sHtml .= "</table>\n";
    return $sHtml;
}
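
A hypothetical call (the test names are illustrative, not from the source): compare two SpriteMe test runs and emit the savings table along with its <style> block.

    echo getSavingsDiffTestnames(true, "baseline", "sprited");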
Example #6
<?php

/*
Copyright 2010 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
require_once "../utils.inc";
require_once "../dbapi.inc";
$query = "select count(*) from {$gUrlsChangeTableDesktop};";
$num = doSimpleQuery($query);
if (0 < $num) {
    echo "There " . (1 == $num ? "is" : "are") . " {$num} URL " . (1 == $num ? "change" : "changes") . " in the queue.\n";
}
Example #7
            if ($gbChrome) {
                loadUrlsFromDB($crawlid, $label, 500000, true);
            } elseif ($gbAndroid) {
                loadUrlsFromDB($crawlid, $label, 5000, false);
            } elseif ($gbDev) {
                loadUrlsFromDB($crawlid, $label, 500000, true);
                // THIS IS THE ONLY CRAWL THAT UPDATES THE URLS!
            }
        }
    }
}
$numUrls = doSimpleQuery("select count(*) from {$gStatusTable} where crawlid={$crawlid};");
updateCrawlFromId($crawlid, array("numUrls" => $numUrls));
lprint("done.");
cprint("done.");
lprint("DONE submitting batch run");
cprint("DONE submitting batch run");
// Load the URLs in urls.txt file into status table.
function loadUrlsFromFile($crawlid, $label, $file = NULL)
{
    global $gbMobile;  // needed: $gbMobile is defined at file scope
    $file = $file ? $file : ($gbMobile ? './urls.1000' : './urls.txt');
    $urls = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    foreach ($urls as $url) {
        $url = trim($url);
        if (strlen($url)) {
            loadUrl($crawlid, $label, $url);
        }
Example #8
$pageidCond = "pageid >= {$minid} and pageid <= {$maxid}";
if ($gbDev && $gPagesTableDesktop != $gPagesTableDev) {
    $count = doSimpleQuery("select count(*) from {$gPagesTableDesktop} where {$pageidCond};");
    if ($count) {
        lprint("Rows already copied.");
    } else {
        lprint("Copy 'requests' rows to production...");
        doSimpleCommand("insert into {$gRequestsTableDesktop} select * from {$gRequestsTableDev} where {$pageidCond};");
        lprint("Copy 'pages' rows to production...");
        doSimpleCommand("insert into {$gPagesTableDesktop} select * from {$gPagesTableDev} where {$pageidCond};");
        lprint("...DONE.");
    }
}
// orphaned records
lprint("Checking for orphaned records...");
$numOrphans = doSimpleQuery("select count(*) from {$gRequestsTable} where {$pageidCond} and pageid not in (select pageid from {$gPagesTable} where {$pageidCond});");
if ($numOrphans) {
    lprint("There are {$numOrphans} orphaned records in the \"{$gRequestsTable}\" table.");
    $cmd = "delete from {$gRequestsTable} where {$pageidCond} and pageid not in (select pageid from {$gPagesTable} where {$pageidCond});";
    if ($numOrphans < 5000) {
        lprint("Deleting orphans now...");
        doSimpleCommand($cmd);
    } else {
        lprint("You should delete them, recalculate the stats, and regenerate the mysql dump files.\n    {$cmd}");
    }
} else {
    lprint("No orphaned records.");
}
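// An alternative sketch, not from the source: the same orphan delete written as
// an anti-join, which often runs faster than "not in (subquery)" on large tables
// (assuming the usual pageid key on both tables):
//     $cmd = "delete r from {$gRequestsTable} r left join {$gPagesTable} p using (pageid) where {$pageidCond} and p.pageid is null;";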
// Compute stats
require_once "../stats.inc";
require_once "../dbapi.inc";
Example #9
}
$sTH .= "<th class=sortnum>&#35; data</th>" . "</tr>\n";
// track min & max (initialize values for each URL)
$ghMin = array();
$ghMax = array();
foreach ($aUrls as $url) {
    $ghMin[$url] = 9999999;
    $ghMax[$url] = 0;
}
$ghMedians = array();
foreach ($gaBrowsers as $browser) {
    $ghMedians[$browser] = array();
    foreach ($aUrls as $url) {
        $num = doSimpleQuery("select count(loadtime) as num from {$gBeaconsTable} where browser='{$browser}' and url='{$url}' and {$gRange};");
        if ($num > 0) {
            $median = doSimpleQuery("select loadtime as median from {$gBeaconsTable} where browser='{$browser}' and url='{$url}' and {$gRange} order by loadtime asc limit " . floor(($num - 1) / 2) . ",1;");
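            // LIMIT floor((num-1)/2),1 selects the lower-middle row: the true median
            // when num is odd, the lower of the two middle values when num is even.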
            $ghMedians[$browser][$url] = $median;
            // don't let a browser/URL exception take the min
            $bException = array_key_exists($browser, $ghExceptions) && array_key_exists($url, $ghExceptions[$browser]);
            if (!$bException && $median < $ghMin[$url]) {
                $ghMin[$url] = $median;
            }
            if ($median > $ghMax[$url]) {
                $ghMax[$url] = $median;
            }
        }
    }
    $ghMedians[$browser]['num'] = $num;
    // NOTE: $num is left over from the last URL in the loop, so this count is bogus if URLs have different #s of beacons
Example #10
<option value=200 <?php 
echo 200 == $gW ? "selected" : "";
?>
> medium
<option value=400 <?php 
echo 400 == $gW ? "selected" : "";
?>
> large
</select>
</form>
<a href="javascript:gotoLink()" style="margin-left: 1em; font-size: 0.8em;" class=txt>link</a>
</div>

<?php 
// Figure out which crawl is earliest:
$earliestLabel = doSimpleQuery("select label from crawls where (label = '{$gLabel1}' or label = '{$gLabel2}') and location = '" . curDevice() . "' order by minPageid asc limit 1;");
$bChrono = $earliestLabel == $gLabel1;
// Find the topmost URLs in both crawls:
$limitgoogle = "(url = 'http://www.google.com/' OR url not like '%://www.google.%')";
// There are 10+ sites that all look the same from Google intl sites
$maxRank = 5 * $gNumUrls;
// we get back MORE results than needed so we can filter out adult content
$query = "select url, min(pageid) as minid, max(pageid) as maxid, count(*) as num, _adult_site from {$gPagesTable}, {$gUrlsTable} as u where (label = '{$gLabel1}' or label = '{$gLabel2}') and url=urlOrig and u.rank > 0 and u.rank < {$maxRank} and {$limitgoogle} group by url having num=2 order by u.rank asc;";
$result = doQuery($query);
$i = 0;
$imgs1 = "";
$imgs2 = "";
while ($row = mysql_fetch_assoc($result)) {
    $url = $row['url'];
    $minid = $row['minid'];
    $maxid = $row['maxid'];
Example #11
     echo "<p class=warning>The URL entered is invalid: {$gRurl}</p>\n";
 } else {
     $urlObj = getUrl($gRurl, true);
     $bAdd = false;
     if ($urlObj) {
         if ($urlObj['optout']) {
             $bAdd = false;
             echo "<p class=warning>The owner of {$gRurl} has opted out of the HTTP Archive.</p>\n";
         } else {
             if (!$urlObj['other']) {
                 // If it exists but it's not marked "other" - then add it and set other=true;
                 $bAdd = true;
             } else {
                 $bAdd = false;
                 $query = "select max(pageid) as pageid from {$gPagesTable} where url='{$existingUrl}';";
                 $pageid = doSimpleQuery($query);
                 if ($pageid) {
                     echo "<p class=warning>{$gRurl} is already in the list of URLs. See the <a href='viewsite.php?pageid={$pageid}'>latest results</a>.</p>\n";
                 } else {
                     echo "<p class=warning>{$gRurl} is already in the list of URLs but doesn't have any data yet. It will be included in the next crawl.</p>\n";
                 }
             }
         }
     } else {
         // We get A LOT of requests to add deep pages (eg, "http://www.youtube.com/blahblah").
         // But we only allow one page per hostname if the URL is not in the list of top sites.
         // Here we check if this is a deep URL.
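         // (The strpos offset of 10 starts the search past "http://" or "https://",
         // so the first "/" found is the path separator and everything before it is
         // the root URL.)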
         $rooturl = substr($gRurl, 0, strpos($gRurl, "/", 10));
         $rooturlObj = getUrl($rooturl, true);
         if ($rooturlObj) {
             echo "<p class=warning>{$gRurl} will not be added because <a href='{$rooturlObj['url']}'>{$rooturlObj['url']}</a> is already in the crawl. If a URL is not in the list of <a href='about.php#listofurls'>top URLs</a> it can only be added if there are no other URLs with the same hostname already in the crawl.</p>\n";
Example #12
    updateCrawl($labelFromRun, $gArchive, $locations[0], array("passes" => $curPasses));
    if ($curPasses < $maxPasses) {
        resubmitFailures();
        cprint("Resubmitted failures - going around once again...");
    } else {
        // We just finished the last pass. Wrap it up...
        cprint(date("G:i") . ": DONE with tests. Copying...");
        $gParamLabel = $labelFromRun;
        // hack!
        // IMPORTANT: Update crawl info FIRST because many pieces of code reference this:
        $query = "select min(pageid) as minid, max(pageid) as maxid from {$gPagesTable} where label='{$labelFromRun}';";
        $row = doRowQuery($query);
        $minid = $row['minid'];
        $maxid = $row['maxid'];
        $numPages = doSimpleQuery("select count(*) from {$gPagesTable} where pageid >= {$minid} and pageid <= {$maxid};");
        $numRequests = doSimpleQuery("select count(*) from {$gRequestsTable} where pageid >= {$minid} and pageid <= {$maxid};");
        updateCrawl($labelFromRun, $gArchive, $locations[0], array("minPageid" => $minid, "maxPageid" => $maxid, "numErrors" => statusErrors(), "numPages" => $numPages, "numRequests" => $numRequests));
        // Copy rows, calc stats, create dump files, etc.
        require_once "copy.php";
        updateCrawl($labelFromRun, $gArchive, $locations[0], array("finishedDateTime" => time()));
        cprint(date("G:i") . ": DONE with crawl!");
        exit(0);
    }
}
// TODO - Combine "obtain" and "parse"?
// The "crawl" process has multiple distinct tasks. This is because the URLs are
// sent to and queued at WebPagetest which is asynchronous.
// We create a child process for each task.
// Each task has a unique file lock, so that a long-running task does NOT block
// a shorter task from being restarted during the next cronjob.
$aChildPids = array();
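// A minimal sketch of the per-task lock described above (the file path and
// function name are hypothetical, not from the source): a non-blocking flock
// lets the next cronjob skip any task whose previous instance is still running.
function acquireTaskLock($task)
{
    $fp = fopen("/tmp/crawl-{$task}.lock", "c");  // create the lock file if missing
    if (!$fp || !flock($fp, LOCK_EX | LOCK_NB)) {
        return false;  // previous instance still holds the lock - skip this task
    }
    return $fp;  // keep the handle open (and the lock held) for the task's lifetime
}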