Skip to content

Instantly share code, notes, and snippets.

@igrigorik
Last active September 13, 2024 07:27
Show Gist options
  • Save igrigorik/5801492 to your computer and use it in GitHub Desktop.
Save igrigorik/5801492 to your computer and use it in GitHub Desktop.
Sample BigQuery queries for the HTTP Archive dataset.
function onOpen() {
var ss = SpreadsheetApp.getActiveSpreadsheet();
var menuEntries = [ {name: "Run Query", functionName: "runQuery"} ];
ss.addMenu("HTTP Archive + BigQuery", menuEntries);
}
function runQuery() {
var projectNumber = 'httparchive';
var sheet = SpreadsheetApp.getActiveSheet();
var c = SpreadsheetApp.getActiveSheet().getRange('F2').getValue();
var dates = ['2013_06','2013_05','2013_04','2013_03','2013_02','2013_01',
'2012_12','2012_11','2012_10','2012_09','2012_08','2012_07',
'2012_06','2012_05','2012_04','2012_03','2012_02','2012_01',
];
var sql = 'SELECT date, median, seventy_fifth, ninetieth FROM '
for (var i=0; i<dates.length; i++) {
sql = sql + '(SELECT "'+dates[i]+'" date,\
NTH(50, quantiles('+c+',101)) median,\
NTH(75, quantiles('+c+',101)) seventy_fifth,\
NTH(90, quantiles('+c+',101)) ninetieth \
FROM [httparchive:runs.'+dates[i]+'_01_pages]),'
}
var queryResults;
// Inserts a Query Job
try {
queryResults = BigQuery.Jobs.query(projectNumber, sql);
}
catch (err) {
Logger.log(err);
Browser.msgBox(err);
return;
}
// Check on status of the Query Job
while (queryResults.getJobComplete() == false) {
try {
queryResults = BigQuery.Jobs.getQueryResults(projectNumber, queryJob.getJobReference().getJobId());
}
catch (err) {
Logger.log(err);
Browser.msgBox(err);
return;
}
}
// Update the amount of results
var resultCount = queryResults.getTotalRows();
var resultSchema = queryResults.getSchema();
var resultValues = new Array(resultCount);
var tableRows = queryResults.getRows();
// Iterate through query results
for (var i = 0; i < tableRows.length; i++) {
var cols = tableRows[i].getF();
resultValues[i] = new Array(cols.length);
// For each column, add values to the result array
for (var j = 0; j < cols.length; j++) {
resultValues[i][j] = cols[j].getV();
}
}
// Update the Spreadsheet with data from the resultValues array, starting from cell A1
sheet.getRange(2, 1, resultCount, tableRows[0].getF().length).setValues(resultValues);
}
/* One JS framework is clearly not enough... Which sites are using multiple popular JS frameworks, and how many? */
SELECT pages.pageid, url, cnt, libs, pages.rank rank FROM [httparchive:runs.2013_06_01_pages] as pages JOIN (
SELECT pageid, count(distinct(type)) cnt, GROUP_CONCAT(type) libs FROM (
SELECT REGEXP_EXTRACT(url, r'(jquery|dojo|angular|prototype|backbone|emberjs|sencha|scriptaculous).*\.js') type, pageid
FROM [httparchive:runs.2013_06_01_requests]
WHERE REGEXP_MATCH(url, r'jquery|dojo|angular|prototype|backbone|emberjs|sencha|scriptaculous.*\.js')
GROUP BY pageid, type
)
GROUP BY pageid
HAVING cnt >= 2
) as lib ON lib.pageid = pages.pageid
WHERE rank IS NOT NULL
ORDER BY rank asc
/* Which sites use multiple versions of JQuery, and how many? (facepalm) */
SELECT pages.pageid, url, cnt, versions, pages.rank rank FROM [httparchive:runs.2013_06_01_pages] as pages JOIN (
SELECT pageid, count(distinct(version)) cnt, GROUP_CONCAT(version) versions FROM (
SELECT url, REGEXP_EXTRACT(url, r'googleapis.*jquery\/(\d+\.\d+\.\d+)\/.*\.js') version, pageid
FROM [httparchive:runs.2013_06_01_requests]
WHERE REGEXP_MATCH(url, r'jquery.*\.js')
GROUP BY url, pageid, version
)
WHERE version IS NOT NULL
GROUP by pageid
HAVING cnt >= 2
) as lib ON lib.pageid = pages.pageid
WHERE rank is NOT NULL
ORDER BY rank asc
/* Which frameworks are the most popular? */
SELECT REGEXP_EXTRACT(url, r'(jquery|dojo|angular|prototype)') type, count(distinct(pageid))
FROM [httparchive:runs.2013_05_15_requests]
WHERE REGEXP_MATCH(url, r'jquery|dojo|angular|prototype')
GROUP BY type;
/* Medians for different content-type's... */
SELECT
NTH(50, quantiles(bytesTotal,101)) TOTAL_med,
NTH(50, quantiles(bytesHtmlDoc,101)) HTMLDOC_med,
NTH(50, quantiles(bytesHtml,101)) HTML_med,
NTH(50, quantiles(bytesJS,101)) JS_med,
NTH(50, quantiles(bytesCSS,101)) CSS_med,
NTH(50, quantiles(bytesImg,101)) IMG_med,
NTH(50, quantiles(bytesGif,101)) GIF_med,
NTH(50, quantiles(bytesJpg,101)) JPG_med,
NTH(50, quantiles(bytesPng,101)) PNG_med,
NTH(50, quantiles(bytesFont,101)) FONT_med,
NTH(50, quantiles(bytesFlash,101)) FLASH_med,
NTH(50, quantiles(bytesJson,101)) JSON_med,
NTH(50, quantiles(bytesOther,101)) OTHER_med
FROM [httparchive:runs.2013_06_01_pages]
/* First render quantiles? */
SELECT
NTH(50, quantiles(renderStart,101)) median,
NTH(75, quantiles(renderStart,101)) seventy_fifth,
NTH(90, quantiles(renderStart,101)) ninetieth
FROM [httparchive:runs.2013_06_01_pages]
/* Trend quantiles over time ... */
SELECT * FROM
(SELECT '2013_06' date,
NTH(50, quantiles(renderStart,101)) median,
NTH(75, quantiles(renderStart,101)) seventy_fifth,
NTH(90, quantiles(renderStart,101)) ninetieth
FROM [httparchive:runs.2013_06_01_pages]),
(SELECT '2013_05' date,
NTH(50, quantiles(renderStart,101)) median,
NTH(75, quantiles(renderStart,101)) seventy_fifth,
NTH(90, quantiles(renderStart,101)) ninetieth
FROM [httparchive:runs.2013_05_01_pages]),
(SELECT '2013_04' date,
NTH(50, quantiles(renderStart,101)) median,
NTH(75, quantiles(renderStart,101)) seventy_fifth,
NTH(90, quantiles(renderStart,101)) ninetieth
FROM [httparchive:runs.2013_04_01_pages]),
(SELECT '2013_03' date,
NTH(50, quantiles(renderStart,101)) median,
NTH(75, quantiles(renderStart,101)) seventy_fifth,
NTH(90, quantiles(renderStart,101)) ninetieth
FROM [httparchive:runs.2013_03_01_pages])
/* Which sites enabled Resource Timing? Courtesy of Steve Souders. */
SELECT domain(url) as domainname, count(*) as num
FROM [httparchive:runs.2013_06_01_requests]
WHERE lower(respOtherHeaders) contains "timing-allow-origin"
GROUP BY domainname
ORDER BY num desc
/* Which domains are the top third party dependencies? */
SELECT DOMAIN(req.url) third_party, COUNT(*) num_requests
FROM [httparchive:runs.2013_06_01_requests] as req JOIN (
SELECT DOMAIN(url) self, pageid
FROM [httparchive:runs.2013_06_01_pages]
) as pages ON pages.pageid = req.pageid
WHERE DOMAIN(req.url) != pages.self
GROUP BY third_party
ORDER BY num_requests desc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment