function cleanupRequests($location, $table) {
    global $gSkipRuns, $gbActuallyDoit;

    $query = "select * from crawls where location = '{$location}' and finishedDateTime is not null order by crawlid desc limit " . ($gSkipRuns + 1) . ";";
    $results = doQuery($query);
    mysql_data_seek($results, $gSkipRuns);
    $row = mysql_fetch_assoc($results);

    if ($gbActuallyDoit) {
        $nUnfinished = doSimpleQuery("select count(*) from crawls where location = '{$location}' and finishedDateTime is null;");
        if (0 < $nUnfinished) {
            echo "SORRY! There is an unfinished crawl for location '{$location}'. Skipping the cleanup while the crawl is running.\n";
            return;
        }

        // Actually delete rows and optimize the table.
        echo "Delete requests from \"{$table}\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']}...\n";
        $cmd = "delete from {$table} where crawlid <= {$row['crawlid']};";
        echo "{$cmd}\n";
        doSimpleCommand($cmd);
        echo "DONE\nOptimize table \"{$table}\"...\n";
        doSimpleCommand("optimize table {$table};");
        echo "DONE\n";
    }
    else {
        echo "WOULD delete requests from \"{$table}\" table starting with crawl \"{$row['label']}\" crawlid={$row['crawlid']} minPageid={$row['minPageid']} maxPageid={$row['maxPageid']}...\n";
    }
}
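// The DB helpers used above (doQuery, doSimpleQuery, doSimpleCommand) are
// defined elsewhere in the codebase (db.inc). A minimal sketch of the behavior
// this function assumes, built on the same mysql_* API used above - a
// hypothetical reimplementation for illustration, not the project's actual code:
function sketchDoQuery($query) {
    // run a query and return the result resource, dying loudly on failure
    $result = mysql_query($query);
    if (false === $result) {
        die("Query failed: " . mysql_error() . "\n");
    }
    return $result;
}

function sketchDoSimpleQuery($query) {
    // run a query and return the first column of the first row (e.g. a count)
    $result = sketchDoQuery($query);
    $row = mysql_fetch_row($result);
    return $row ? $row[0] : null;
}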
                }
                else {
                    if (302 == $status && $resp_location == $urlHtml &&
                        false !== strpos($resp_cache_control, "max-age") &&
                        false === strpos($resp_cache_control, "max-age=0")) {
                        // 302s that are cacheable are saved.
                        $xfixed++;
                        $sfixed .= "FIXED: {$urlDerived} => {$urlHtml}\n";
                        $urlDerived = $urlHtml;
                    }
                    else {
                        $snotfixed .= "NOTFIXED: {$urlDerived} != {$urlHtml}\n";
                        $snotfixed .= "    resp_location = {$resp_location}\n    status = {$status}\n    resp_cache_control = {$resp_cache_control}\n";
                        $xnotfixed++;
                    }
                }
            }
        }
    }
    doSimpleCommand("update {$gUrlsTable} set url='{$urlDerived}' where urlid={$urlid};");
}
mysql_free_result($result);

echo <<<OUTPUT
xtotal = {$xtotal}
xdiff = {$xdiff}
x300 = {$x300}
xfixed = {$xfixed}
xnotfixed = {$xnotfixed}

OUTPUT;
echo "DONE\n";

/*
if ( str_replace("www.", "", $urlDerived) == $urlHtml ) {
    echo "remove www.\n";
    $bUpdate = true;
function importPageMod($hPage) {
    global $pagesTable, $requestsTable;
    $t_CVSNO = time();
    $pageid = $hPage['pageid'];
    $wptid = $hPage['wptid'];
    $wptrun = $hPage['wptrun'];
    if (!$wptid || !$wptrun) {
        tprint("ERROR: importPageMod({$pageid}): failed to find wptid and wptrun: {$wptid}, {$wptrun}");
        return;
    }

    // lifted from importWptResults
    $wptServer = wptServer();
    $request = $wptServer . "export.php?test={$wptid}&run={$wptrun}&cached=0&php=1";
    $response = fetchUrl($request);
    //tprint("after fetchUrl", $t_CVSNO);
    if (!strlen($response)) {
        tprint("ERROR: importPageMod({$pageid}): URL failed: {$request}");
        return;
    }

    // lifted from importHarJson
    $json_text = $response;
    $HAR = json_decode($json_text);
    if (NULL == $HAR) {
        tprint("ERROR: importPageMod({$pageid}): JSON decode failed");
        return;
    }
    $log = $HAR->{'log'};
    $pages = $log->{'pages'};
    $pagecount = count($pages);
    if (0 == $pagecount) {
        tprint("ERROR: importPageMod({$pageid}): No pages found");
        return;
    }

    // lifted from importPage
    $page = $pages[0];
    if (array_key_exists('_TTFB', $page)) {
        $hPage['TTFB'] = $page->{'_TTFB'};
    }
    if (array_key_exists('_fullyLoaded', $page)) {
        $hPage['fullyLoaded'] = $page->{'_fullyLoaded'};
    }
    if (array_key_exists('_visualComplete', $page)) {
        $hPage['visualComplete'] = $page->{'_visualComplete'};
    }
    if (array_key_exists('_gzip_total', $page)) {
        $hPage['gzipTotal'] = $page->{'_gzip_total'};
        $hPage['gzipSavings'] = $page->{'_gzip_savings'};
    }
    if (array_key_exists('_domElements', $page)) {
        $hPage['numDomElements'] = $page->{'_domElements'};
    }
    if (array_key_exists('_domContentLoadedEventStart', $page)) {
        $hPage['onContentLoaded'] = $page->{'_domContentLoadedEventStart'};
    }
    if (array_key_exists('_base_page_cdn', $page)) {
        $hPage['cdn'] = $page->{'_base_page_cdn'};
    }
    if (array_key_exists('_SpeedIndex', $page)) {
        $hPage['SpeedIndex'] = $page->{'_SpeedIndex'};
    }

    // lifted from aggregateStats
    // initialize variables for counting the page's stats
    $hPage['bytesTotal'] = 0;
    $hPage['reqTotal'] = 0;
    $typeMap = array("flash" => "Flash", "css" => "CSS", "image" => "Img", "script" => "JS", "html" => "Html",
                     "font" => "Font", "other" => "Other", "gif" => "Gif", "jpg" => "Jpg", "png" => "Png");
    foreach (array_keys($typeMap) as $type) {
        // initialize the hashes
        $hPage['req' . $typeMap[$type]] = 0;
        $hPage['bytes' . $typeMap[$type]] = 0;
    }
    $hDomains = array();
    $hPage['maxageNull'] = $hPage['maxage0'] = $hPage['maxage1'] = $hPage['maxage30'] = $hPage['maxage365'] = $hPage['maxageMore'] = 0;
    $hPage['bytesHtmlDoc'] = $hPage['numRedirects'] = $hPage['numErrors'] = $hPage['numGlibs'] = $hPage['numHttps'] = $hPage['numCompressed'] = $hPage['maxDomainReqs'] = 0;

    $result = doQuery("select mimeType, urlShort, resp_content_type, respSize, expAge, firstHtml, status, resp_content_encoding, req_host from {$requestsTable} where pageid = {$pageid};");
    //tprint("after query", $t_CVSNO);
    while ($row = mysql_fetch_assoc($result)) {
        $reqUrl = $row['urlShort'];
        $mimeType = prettyType($row['mimeType'], $reqUrl);
        $respSize = intval($row['respSize']);
        $hPage['reqTotal']++;
        $hPage['bytesTotal'] += $respSize;
        $hPage['req' . $typeMap[$mimeType]]++;
        $hPage['bytes' . $typeMap[$mimeType]] += $respSize;

        if ("image" === $mimeType) {
            $content_type = $row['resp_content_type'];
            $imgformat = false !== stripos($content_type, "image/gif") ? "gif" :
                         (false !== stripos($content_type, "image/jpg") || false !== stripos($content_type, "image/jpeg") ? "jpg" :
                         (false !== stripos($content_type, "image/png") ? "png" : ""));
            if ($imgformat) {
                $hPage['req' . $typeMap[$imgformat]]++;
                $hPage['bytes' . $typeMap[$imgformat]] += $respSize;
            }
        }

        // count unique domains (really hostnames)
        $aMatches = array();
        if ($reqUrl && preg_match('/http[s]*:\\/\\/([^\\/]*)/', $reqUrl, $aMatches)) {
            $hostname = $aMatches[1];
            if (!array_key_exists($hostname, $hDomains)) {
                $hDomains[$hostname] = 0;
            }
            $hDomains[$hostname]++; // count hostnames
        }
        else {
            tprint("ERROR: importPageMod({$pageid}): No hostname found in URL: {$reqUrl}");
        }

        // count expiration windows
        $expAge = $row['expAge'];
        $daySecs = 24 * 60 * 60;
        if (NULL === $expAge) {
            $hPage['maxageNull']++;
        }
        else if (0 === intval($expAge)) {
            $hPage['maxage0']++;
        }
        else if ($expAge <= 1 * $daySecs) {
            $hPage['maxage1']++;
        }
        else if ($expAge <= 30 * $daySecs) {
            $hPage['maxage30']++;
        }
        else if ($expAge <= 365 * $daySecs) {
            $hPage['maxage365']++;
        }
        else {
            $hPage['maxageMore']++;
        }

        if ($row['firstHtml']) {
            $hPage['bytesHtmlDoc'] = $respSize;
        } // CVSNO - can we get this UNgzipped?!

        $status = $row['status'];
        if (300 <= $status && $status < 400 && 304 != $status) {
            $hPage['numRedirects']++;
        }
        else if (400 <= $status && $status < 600) {
            $hPage['numErrors']++;
        }

        if (0 === stripos($reqUrl, "https://")) {
            $hPage['numHttps']++;
        }
        if (FALSE !== stripos($row['req_host'], "googleapis.com")) {
            $hPage['numGlibs']++;
        }
        if ("gzip" == $row['resp_content_encoding'] || "deflate" == $row['resp_content_encoding']) {
            $hPage['numCompressed']++;
        }
    }
    mysql_free_result($result);

    $hPage['numDomains'] = count(array_keys($hDomains));
    foreach (array_keys($hDomains) as $domain) {
        $hPage['maxDomainReqs'] = max($hPage['maxDomainReqs'], $hDomains[$domain]);
    }

    //$cmd = "UPDATE $pagesTable SET reqTotal = $reqTotal, bytesTotal = $bytesTotal" .
    $cmd = "insert into pagestmp SET " . hashImplode(", ", "=", $hPage) . ";";
    //tprint("before insert", $t_CVSNO);
    doSimpleCommand($cmd);
    //tprint("after insert\n", $t_CVSNO);
}
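// hashImplode() is defined elsewhere in this codebase. A minimal sketch of the
// "key=value, key=value" SQL fragment the insert above relies on - a
// hypothetical reimplementation for illustration, not the project's actual code:
function sketchHashImplode($glue, $op, $hash) {
    $parts = array();
    foreach ($hash as $key => $value) {
        // quote every value; MySQL accepts quoted literals for numeric columns too
        $parts[] = $key . $op . "'" . mysql_real_escape_string($value) . "'";
    }
    return implode($glue, $parts);
}
// e.g. sketchHashImplode(", ", "=", array("pageid" => 1, "reqTotal" => 42))
//      returns "pageid='1', reqTotal='42'"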
function createTables() {
    if (1 == 1) {
        $command = "create table spritemesavings (" .
            "savingsid int unsigned not null auto_increment" .
            ", createdate int(10) unsigned not null" .
            ", url varchar (255) not null" .
            ", ib int(4)" .
            ", ia int(4)" .
            ", id int(4)" .
            ", sb int" .
            ", sa int" .
            ", sd int" .
            ", tb int(4)" .
            ", tc int(4)" .
            ", testnameid int unsigned" .
            ", primary key (savingsid)" .
            ");";
        doSimpleCommand($command);

        $command = "create table spritemetestnames (" .
            "testnameid int unsigned not null auto_increment" .
            ", createdate int(10) unsigned not null" .
            ", testname varchar (255) not null" .
            ", savingsidstart int" .
            ", primary key (testnameid)" .
            ", unique key (testname)" .
            ");";
        doSimpleCommand($command);
    }
}
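// A hypothetical usage sketch (not from this file): register a test name, then
// record a savings row pointing back at it via testnameid. The short column
// names (ib/ia/id, sb/sa/sd, tb/tc) are not documented here, so only the
// documented columns are filled in; the testnameid value is illustrative.
$now = time();
doSimpleCommand("insert into spritemetestnames set createdate={$now}, testname='example run';");
doSimpleCommand("insert into spritemesavings set createdate={$now}, url='http://example.com/', testnameid=1;");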
if ($gbImportUrls) {
    // TODO - Should we do this for $gbMobile too?????
    require_once "importurls.php";

    if (!$gbMobile && $gPagesTableDesktop != $gPagesTableDev) {
        lprint("Copy 'urls' rows to production");
        cprint("Copy 'urls' rows to production");
        // We have to do this immediately BEFORE the mobile crawl kicks off.
        // This is scary, but the issue is we need to clear out all the previous
        // ranks, optouts, etc. and use what's in urlsdev.
        for ($i = 0; $i <= 70000; $i += 1000) {
            $cmd = "delete from {$gUrlsTableDesktop} where urlhash <= {$i};";
            cprint("About to delete urls: {$cmd}");
            doSimpleCommand($cmd);
        }
        $cmd = "insert into {$gUrlsTableDesktop} select * from {$gUrlsTableDev};";
        cprint("About to copy urls: {$cmd}");
        doSimpleCommand($cmd);
        lprint("done.");
        cprint("done.");
    }
}

// Empty the status table
lprint("Clear status table...");
cprint("Clear status table...");
removeAllStatusData();

// START THE CRAWL
// Create a partial crawl record - we'll fill out the missing fields as we get them.
// WARNING: Two runs submitted on the same day will have the same label.
$date = getdate();
$label = substr($date['month'], 0, 3) . " " . $date['mday'] . " " . $date['year'] . $gSublabel;
createCrawl(array("label" => $label, "archive" => $gArchive, "location" => $locations[0], "video" => $video,
                  "docComplete" => $docComplete, "fvonly" => $fvonly, "runs" => $runs,
                  "startedDateTime" => $startedDateTime, "passes" => 0));
$crawl = getCrawl($label, $gArchive, $locations[0]);
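// For reference: getdate() returns the full month name in 'month', and
// substr(..., 0, 3) trims it, so a crawl submitted April 1, 2013 with an empty
// $gSublabel gets the label "Apr 1 2013".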
    // catch any final inserts
    if ($sInsert) {
        doSimpleCommand("insert into {$gUrlsTable} (urlOrig, timeAdded" .
            ($rank ? ", ranktmp" : "") .
            ($other ? ", other" : "") .
            ") VALUES " . substr($sInsert, 1) .
            " ON DUPLICATE KEY UPDATE " .
            ($rank ? "ranktmp=VALUES(ranktmp)" : "") .
            ($other ? ($rank ? ", " : "") . "other=VALUES(other)" : ""));
    }
    fclose($handle);
}
else {
    echo "ERROR: Unable to open file \"{$gUrlsFile}\".\n";
}

if ("alexa" === $gFileType) {
    doSimpleCommand("update {$gUrlsTable} set rank=ranktmp;");
    echo "The ranks have been updated.\n";
}

// Update the urlhash column if it is null.
// WARNING: URLs added through the admin.php page do NOT have urlhash set immediately - but they should be caught here.
// "urlhash" is a substring of the URL's MD5 hash converted to base-10.
doSimpleCommand("update {$gUrlsTable} set urlhash=conv(substring(md5(urlOrig), 1, 4), 16, 10) where urlhash is null;");
echo "The urlhash values have been updated.\n";
echo "DONE\n";

// return the name of the downloaded file
function downloadAlexaList() {
    $listfile = "top-1m.csv";
    // move current list out of the way
    if (file_exists($listfile)) {
        exec("mv {$listfile} {$listfile}.prev");
    }
    // get the new file
    if (!file_put_contents("{$listfile}.zip", file_get_contents("http://s3.amazonaws.com/alexa-static/{$listfile}.zip"))) {
        echo "ERROR: Unable to download list file.\n";
        return false;
    }
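// For reference, the SQL expression above is equivalent to this PHP (a sketch,
// not code from this file): the first 4 hex digits of the MD5 as a decimal,
// i.e. a value in 0..65535 - which is why the chunked delete loop in the crawl
// script only needs to run its urlhash bound up to 70000.
function sketchUrlHash($urlOrig) {
    return hexdec(substr(md5($urlOrig), 0, 4)); // 0..65535
}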
See the source code here: http://code.google.com/p/loadtimer/
*/

// This is a beacon - don't return any content.
header("HTTP/1.1 204 No Content");

require_once "db.inc";
require_once "uaparser.inc";

createTables();

if (array_key_exists("action", $_GET) && "done" === $_GET["action"]) {
    // could do some agg stuff here
    exit(0);
}

// only for my testing
// $gLFID = ( array_key_exists("REMOTE_ADDR", $_SERVER) ? crypt($_SERVER["REMOTE_ADDR"], "DC") : "" );
$gLFID = "public";

$gUrl = trim(array_key_exists("url", $_GET) ? $_GET["url"] : "");
$gTime = intval(array_key_exists("loadtime", $_GET) ? $_GET["loadtime"] : 0);
$gId = array_key_exists("id", $_GET) ? $_GET["id"] : 0;
$gUA = $_SERVER['HTTP_USER_AGENT'];
$gBrowserName = $gV1 = "";
parseUserAgent($gUA, $gBrowserName, $gV1);
$gBrowser = "{$gBrowserName} {$gV1}";

if (!$gUrl || !$gTime || !$gId || !$gBrowser) {
    exit(1);
}

// save the results (escape the user-supplied strings before interpolating them into SQL)
$now = time();
$command = "insert into {$gBeaconsTable} set createdate={$now}, sessid='{$gLFID}'" .
    ", url='" . mysql_real_escape_string($gUrl) . "'" .
    ", loadtime={$gTime}" .
    ", id='" . mysql_real_escape_string($gId) . "'" .
    ", browser='" . mysql_real_escape_string($gBrowser) . "'" .
    ", useragent='" . mysql_real_escape_string($gUA) . "';";
doSimpleCommand($command);
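// A hypothetical beacon request (the script and host names are illustrative,
// not from this file):
//   GET /beacon.php?url=http%3A%2F%2Fexample.com%2F&loadtime=1234&id=abc123
// and the harness signals the end of a session with:
//   GET /beacon.php?action=done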