function cleanupRequests($location, $table)
{
    global $gSkipRuns, $gbActuallyDoit;
    $query = "select * from crawls where location = '{$location}' and finishedDateTime is not null order by crawlid desc limit " . ($gSkipRuns + 1) . ";";
    $results = doQuery($query);
    mysql_data_seek($results, $gSkipRuns);
    $row = mysql_fetch_assoc($results);
    if ($gbActuallyDoit) {
        $nUnfinished = doSimpleQuery("select count(*) from crawls where location = '{$location}' and finishedDateTime is null;");
        if (0 < $nUnfinished) {
            echo "SORRY! There is an unfinished crawl for location '{$location}'. Skipping the cleanup while the crawl is running.\n";
            return;
        }
        // Actually delete rows and optimize the table.
        echo "Delete requests from \"{$table}\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']}...\n";
        $cmd = "delete from {$table} where crawlid <= {$row['crawlid']};";
        echo "{$cmd}\n";
        doSimpleCommand($cmd);
        echo "DONE\nOptimize table \"{$table}\"...\n";
        doSimpleCommand("optimize table {$table};");
        echo "DONE\n";
    } else {
        echo "WOULD delete requests from \"{$table}\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']}...\n";
    }
}
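
// Usage sketch (not in the original source): one plausible way cleanupRequests()
// might be driven. The location and table names here are assumptions.
$aLocations = array("California", "California2");
foreach ($aLocations as $location) {
    cleanupRequests($location, "requests");
}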
                } else {
                    if (302 == $status && $resp_location == $urlHtml && false !== strpos($resp_cache_control, "max-age") && false === strpos($resp_cache_control, "max-age=0")) {
                        // A cacheable 302 (max-age present and non-zero) that points at the HTML URL counts as fixed.
                        $xfixed++;
                        $sfixed .= "FIXED: {$urlDerived} => {$urlHtml}\n";
                        $urlDerived = $urlHtml;
                    } else {
                        $snotfixed .= "NOTFIXED: {$urlDerived} != {$urlHtml}\n";
                        $snotfixed .= "  resp_location = {$resp_location}\n  status = {$status}\n  resp_cache_control = {$resp_cache_control}\n";
                        $xnotfixed++;
                    }
                }
            }
        }
    }
    doSimpleCommand("update {$gUrlsTable} set url='{$urlDerived}' where urlid={$urlid};");
}
mysql_free_result($result);
echo <<<OUTPUT
xtotal = {$xtotal}
xdiff = {$xdiff}
x300 = {$x300}
xfixed = {$xfixed}
xnotfixed = {$xnotfixed}

OUTPUT;
echo "DONE\n";
/*
if ( str_replace("www.", "", $urlDerived) == $urlHtml ) {
	echo "remove www.\n";
	$bUpdate = true;
*/
Example #3
function importPageMod($hPage)
{
    global $pagesTable, $requestsTable;
    $t_CVSNO = time();
    $pageid = $hPage['pageid'];
    $wptid = $hPage['wptid'];
    $wptrun = $hPage['wptrun'];
    if (!$wptid || !$wptrun) {
        tprint("ERROR: importPageMod({$pageid}): failed to find wptid and wptrun: {$wptid}, {$wptrun}");
        return;
    }
    // lifted from importWptResults
    $wptServer = wptServer();
    $request = $wptServer . "export.php?test={$wptid}&run={$wptrun}&cached=0&php=1";
    $response = fetchUrl($request);
    //tprint("after fetchUrl", $t_CVSNO);
    if (!strlen($response)) {
        tprint("ERROR: importPageMod({$pageid}): URL failed: {$request}");
        return;
    }
    // lifted from importHarJson
    $json_text = $response;
    $HAR = json_decode($json_text);
    if (NULL == $HAR) {
        tprint("ERROR: importPageMod({$pageid}): JSON decode failed");
        return;
    }
    $log = $HAR->{'log'};
    $pages = $log->{'pages'};
    $pagecount = count($pages);
    if (0 == $pagecount) {
        tprint("ERROR: importPageMod({$pageid}): No pages found");
        return;
    }
    // lifted from importPage
    $page = $pages[0];
    // $page is a stdClass from json_decode, so test properties rather than array keys.
    if (property_exists($page, '_TTFB')) {
        $hPage['TTFB'] = $page->{'_TTFB'};
    }
    if (property_exists($page, '_fullyLoaded')) {
        $hPage['fullyLoaded'] = $page->{'_fullyLoaded'};
    }
    if (property_exists($page, '_visualComplete')) {
        $hPage['visualComplete'] = $page->{'_visualComplete'};
    }
    if (property_exists($page, '_gzip_total')) {
        $hPage['gzipTotal'] = $page->{'_gzip_total'};
        $hPage['gzipSavings'] = $page->{'_gzip_savings'};
    }
    if (property_exists($page, '_domElements')) {
        $hPage['numDomElements'] = $page->{'_domElements'};
    }
    if (property_exists($page, '_domContentLoadedEventStart')) {
        $hPage['onContentLoaded'] = $page->{'_domContentLoadedEventStart'};
    }
    if (property_exists($page, '_base_page_cdn')) {
        $hPage['cdn'] = $page->{'_base_page_cdn'};
    }
    if (property_exists($page, '_SpeedIndex')) {
        $hPage['SpeedIndex'] = $page->{'_SpeedIndex'};
    }
    // lifted from aggregateStats
    // initialize variables for counting the page's stats
    $hPage['bytesTotal'] = 0;
    $hPage['reqTotal'] = 0;
    $typeMap = array("flash" => "Flash", "css" => "CSS", "image" => "Img", "script" => "JS", "html" => "Html", "font" => "Font", "other" => "Other", "gif" => "Gif", "jpg" => "Jpg", "png" => "Png");
    foreach (array_keys($typeMap) as $type) {
        // initialize the hashes
        $hPage['req' . $typeMap[$type]] = 0;
        $hPage['bytes' . $typeMap[$type]] = 0;
    }
    $hDomains = array();
    $hPage['maxageNull'] = $hPage['maxage0'] = $hPage['maxage1'] = $hPage['maxage30'] = $hPage['maxage365'] = $hPage['maxageMore'] = 0;
    $hPage['bytesHtmlDoc'] = $hPage['numRedirects'] = $hPage['numErrors'] = $hPage['numGlibs'] = $hPage['numHttps'] = $hPage['numCompressed'] = $hPage['maxDomainReqs'] = 0;
    $result = doQuery("select mimeType, urlShort, resp_content_type, respSize, expAge, firstHtml, status, resp_content_encoding, req_host from {$requestsTable} where pageid = {$pageid};");
    //tprint("after query", $t_CVSNO);
    while ($row = mysql_fetch_assoc($result)) {
        $reqUrl = $row['urlShort'];
        $mimeType = prettyType($row['mimeType'], $reqUrl);
        $respSize = intval($row['respSize']);
        $hPage['reqTotal']++;
        $hPage['bytesTotal'] += $respSize;
        $hPage['req' . $typeMap[$mimeType]]++;
        $hPage['bytes' . $typeMap[$mimeType]] += $respSize;
        if ("image" === $mimeType) {
            $content_type = $row['resp_content_type'];
            $imgformat = false !== stripos($content_type, "image/gif") ? "gif" : (false !== stripos($content_type, "image/jpg") || false !== stripos($content_type, "image/jpeg") ? "jpg" : (false !== stripos($content_type, "image/png") ? "png" : ""));
            if ($imgformat) {
                $hPage['req' . $typeMap[$imgformat]]++;
                $hPage['bytes' . $typeMap[$imgformat]] += $respSize;
            }
        }
        // count unique domains (really hostnames)
        $aMatches = array();
        if ($reqUrl && preg_match('/http[s]*:\\/\\/([^\\/]*)/', $reqUrl, $aMatches)) {
            $hostname = $aMatches[1];
            if (!array_key_exists($hostname, $hDomains)) {
                $hDomains[$hostname] = 0;
            }
            $hDomains[$hostname]++;
            // count hostnames
        } else {
            tprint("ERROR: importPageMod({$pageid}): No hostname found in URL: {$reqUrl}");
        }
        // count expiration windows
        $expAge = $row['expAge'];
        $daySecs = 24 * 60 * 60;
        if (NULL === $expAge) {
            $hPage['maxageNull']++;
        } else if (0 === intval($expAge)) {
            $hPage['maxage0']++;
        } else if ($expAge <= 1 * $daySecs) {
            $hPage['maxage1']++;
        } else if ($expAge <= 30 * $daySecs) {
            $hPage['maxage30']++;
        } else if ($expAge <= 365 * $daySecs) {
            $hPage['maxage365']++;
        } else {
            $hPage['maxageMore']++;
        }
        if ($row['firstHtml']) {
            $hPage['bytesHtmlDoc'] = $respSize;
        }
        // CVSNO - can we get this UNgzipped?!
        $status = $row['status'];
        if (300 <= $status && $status < 400 && 304 != $status) {
            $hPage['numRedirects']++;
        } else if (400 <= $status && $status < 600) {
            $hPage['numErrors']++;
        }
        if (0 === stripos($reqUrl, "https://")) {
            $hPage['numHttps']++;
        }
        if (FALSE !== stripos($row['req_host'], "googleapis.com")) {
            $hPage['numGlibs']++;
        }
        if ("gzip" == $row['resp_content_encoding'] || "deflate" == $row['resp_content_encoding']) {
            $hPage['numCompressed']++;
        }
    }
    mysql_free_result($result);
    $hPage['numDomains'] = count($hDomains);
    if (count($hDomains)) {
        // the most requests made to any single hostname
        $hPage['maxDomainReqs'] = max($hDomains);
    }
    //$cmd = "UPDATE $pagesTable SET reqTotal = $reqTotal, bytesTotal = $bytesTotal" .
    $cmd = "insert into pagestmp SET " . hashImplode(", ", "=", $hPage) . ";";
    //tprint("before insert", $t_CVSNO);
    doSimpleCommand($cmd);
    //tprint("after insert\n", $t_CVSNO);
}
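
// For context (not in the original source): importPageMod() builds its INSERT
// via hashImplode(). A plausible sketch of such a helper is below; the real
// implementation and its quoting rules are assumptions here.
function hashImplodeSketch($delim, $op, $hTuples)
{
    $aParts = array();
    foreach ($hTuples as $key => $value) {
        // quote all values; MySQL tolerates quoted values for numeric columns
        $aParts[] = $key . $op . "'" . mysql_real_escape_string($value) . "'";
    }
    return implode($delim, $aParts);
}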
Example #4
function createTables()
{
    $command = "create table spritemesavings (savingsid int unsigned not null auto_increment, createdate int(10) unsigned not null, url varchar (255) not null, ib int(4), ia int(4), id int(4), sb int, sa int, sd int, tb int(4), tc int(4), testnameid int unsigned, primary key (savingsid));";
    doSimpleCommand($command);
    $command = "create table spritemetestnames (testnameid int unsigned not null auto_increment, createdate int(10) unsigned not null, testname varchar (255) not null, savingsidstart int, primary key (testnameid), unique key (testname));";
    doSimpleCommand($command);
}
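
// Sketch (not in the original source): as written, createTables() errors if the
// tables already exist. A guard like the one below, checking information_schema,
// would make it idempotent. The helper name is hypothetical.
function tableExistsSketch($tablename)
{
    $num = doSimpleQuery("select count(*) from information_schema.tables where table_schema = database() and table_name = '" . mysql_real_escape_string($tablename) . "';");
    return 0 < intval($num);
}
// e.g.: if (!tableExistsSketch("spritemesavings")) { createTables(); }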
if ($gbImportUrls) {
    // TODO - Should we do this for $gbMobile too?????
    require_once "importurls.php";
    if (!$gbMobile && $gPagesTableDesktop != $gPagesTableDev) {
        lprint("Copy 'urls' rows to production");
        cprint("Copy 'urls' rows to production");
        // We have to do this immediately BEFORE the mobile crawl kicks off.
        // This is scary but the issue is we need to clear out all the previous ranks, optouts, others, etc. and use what's in urlsdev.
        // urlhash values fit below 70000 (see the conv() expression used to set
        // them), so stepping the threshold up by 1000 covers every row while
        // keeping each individual DELETE small.
        for ($i = 0; $i <= 70000; $i += 1000) {
            $cmd = "delete from {$gUrlsTableDesktop} where urlhash <= {$i};";
            cprint("About to delete urls: {$cmd}");
            doSimpleCommand($cmd);
        }
        $cmd = "insert into {$gUrlsTableDesktop} select * from {$gUrlsTableDev};";
        cprint("About to copy urls: {$cmd}");
        doSimpleCommand($cmd);
        lprint("done.");
        cprint("done.");
    }
}
// Empty the status table
lprint("Clear status table...");
cprint("Clear status table...");
removeAllStatusData();
// START THE CRAWL
// create a partial crawl record - we'll fill out the missing fields as we get them
// WARNING: Two runs submitted on the same day will have the same label.
$date = getdate();
$label = substr($date['month'], 0, 3) . " " . $date['mday'] . " " . $date['year'] . $gSublabel;
createCrawl(array("label" => $label, "archive" => $gArchive, "location" => $locations[0], "video" => $video, "docComplete" => $docComplete, "fvonly" => $fvonly, "runs" => $runs, "startedDateTime" => $startedDateTime, "passes" => 0));
$crawl = getCrawl($label, $gArchive, $locations[0]);
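
// For illustration (not in the original source): the label built above has the
// form "Oct 1 2012" (plus any $gSublabel suffix), which is why two crawls
// submitted on the same day would collide on label.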
    // catch any final inserts
    if ($sInsert) {
        doSimpleCommand("insert into {$gUrlsTable} (urlOrig, timeAdded" . ($rank ? ", ranktmp" : "") . ($other ? ", other" : "") . ") VALUES " . substr($sInsert, 1) . " ON DUPLICATE KEY UPDATE " . ($rank ? "ranktmp=VALUES(ranktmp)" : "") . ($other ? ($rank ? ", " : "") . "other=VALUES(other)" : ""));
    }
    fclose($handle);
} else {
    echo "ERROR: Unable to open file \"{$gUrlsFile}\".\n";
}
if ("alexa" === $gFileType) {
    doSimpleCommand("update {$gUrlsTable} set rank=ranktmp;");
    echo "The ranks have been updated.\n";
}
// update the urlhash column if null
// WARNING: URLs added through the admin.php page do NOT have urlhash set immediately - but they should be caught here.
// "urlhash" is a substring of the URL's MD5 hash converted to base-10.
doSimpleCommand("update {$gUrlsTable} set urlhash=conv(substring(md5(urlOrig), 1, 4), 16, 10) where urlhash is null;");
echo "The urlhash have been updated.\n";
echo "DONE\n";
// return the name of the downloaded file
function downloadAlexaList()
{
    $listfile = "top-1m.csv";
    // move current list out of the way
    if (file_exists($listfile)) {
        rename($listfile, "{$listfile}.prev");
    }
    // get the new file
    if (!file_put_contents("{$listfile}.zip", file_get_contents("http://s3.amazonaws.com/alexa-static/{$listfile}.zip"))) {
        echo "ERROR: Unable to download list file.\n";
        return false;
    }
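
// Usage sketch (not in the original source): downstream code would read the
// unzipped CSV, where each line should be "rank,hostname". Names here are
// illustrative.
$listfile = downloadAlexaList();
if ($listfile && ($handle = fopen($listfile, "r"))) {
    while ($line = fgets($handle)) {
        list($rank, $hostname) = explode(",", trim($line), 2);
        // e.g. queue "http://www.{$hostname}/" with its rank for import
    }
    fclose($handle);
}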
Example #7
/*
See the source code here:
     http://code.google.com/p/loadtimer/
*/
// This is a beacon - don't return any content.
header("HTTP/1.1 204 No Content");
require_once "db.inc";
require_once "uaparser.inc";
createTables();
if (array_key_exists("action", $_GET) && "done" === $_GET["action"]) {
    // could do some agg stuff here
    exit(0);
}
// only for my testing
// $gLFID = ( array_key_exists("REMOTE_ADDR", $_SERVER) ? crypt($_SERVER["REMOTE_ADDR"], "DC") : "" );
$gLFID = "public";
// Escape/normalize user-supplied values here because they are interpolated
// into the INSERT statement below.
$gUrl = mysql_real_escape_string(trim(array_key_exists("url", $_GET) ? $_GET["url"] : ""));
$gTime = intval(array_key_exists("loadtime", $_GET) ? $_GET["loadtime"] : 0);
$gId = mysql_real_escape_string(array_key_exists("id", $_GET) ? $_GET["id"] : "");
$gUA = mysql_real_escape_string($_SERVER['HTTP_USER_AGENT']);
$gBrowserName = $gV1 = "";
parseUserAgent($gUA, $gBrowserName, $gV1);
$gBrowser = "{$gBrowserName} {$gV1}";
if (!$gUrl || !$gTime || !$gId || !$gBrowser) {
    exit(1);
}
// save the results
$now = time();
$command = "insert into {$gBeaconsTable} set createdate={$now}, sessid='{$gLFID}', url='{$gUrl}', loadtime={$gTime}, id='{$gId}', browser='{$gBrowser}', useragent='{$gUA}';";
doSimpleCommand($command);
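
// For illustration (not in the original source): a request that would satisfy
// the parameter checks above. Host, path, and values are hypothetical.
$exampleBeacon = "http://example.com/beacon.php?" . http_build_query(array(
    "url"      => "http://www.example.org/",
    "loadtime" => 1234,          // milliseconds
    "id"       => "session-abc", // client-generated id
));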