Skip to content

Instantly share code, notes, and snippets.

@bcks
Last active November 27, 2020 07:36
Show Gist options
  • Save bcks/2ca28d47ffd3858d9393e172126d9d9c to your computer and use it in GitHub Desktop.
Save bcks/2ca28d47ffd3858d9393e172126d9d9c to your computer and use it in GitHub Desktop.
tableau-covid-scraping
// Works with Node v12.0 and puppeteer 4.0
// Address of the embedded Tableau Public view that the script navigates to.
// While this page loads, the request whose URL contains "bootstrapSession"
// is the one whose response body gets parsed further down.
// NOTE(review): this module-level name shadows Node's global `URL` class for
// the rest of the file.
const URL = 'https://public.tableau.com/views/PPV_15924847800480/ppv_db?%3Aembed=y&%3AshowVizHome=no&%3Adisplay_count=y&%3Adisplay_static_image=n&%3AbootstrapWhenNotified=true&%3Alanguage=en&:embed=y&:showVizHome=n&:apiID=host0';
const puppeteer = require('puppeteer');
function parseDataDictionary(jsonParsed) {
let dataColumns = jsonParsed[1].secondaryInfo.presModelMap.dataDictionary.presModelHolder
.genDataDictionaryPresModel.dataSegments["0"].dataColumns;
let cstring = dataColumns[1].dataValues;
// The full data dictionary:
// console.log( JSON.stringify(cstring) );
let paneColumnsList = jsonParsed[1].secondaryInfo.presModelMap.vizData.presModelHolder
.genPresModelMapPresModel.presModelMap.PPV.presModelHolder.genVizDataPresModel
.paneColumnsData.paneColumnsList;
// Tests Per 10K Residents
let output = '';
let length = paneColumnsList[1].vizPaneColumns[2].aliasIndices.length;
// Label
output += ( cstring[ paneColumnsList[1].vizPaneColumns[3].aliasIndices[0] ] + ',');
output += ( "Date\n");
for (let i=0;i<length;i++) {
output += ( cstring[ -1 * paneColumnsList[1].vizPaneColumns[1].aliasIndices[i] - 1 ] + ',');
output += ( '"' + cstring[ -1 * paneColumnsList[1].vizPaneColumns[2].aliasIndices[i] - 1 ] + '"' + "\n");
}
console.log( output );
// Percent Positive
output = '';
length = paneColumnsList[0].vizPaneColumns[3].aliasIndices.length;
// Label
output += ( cstring[ paneColumnsList[0].vizPaneColumns[3].aliasIndices[0] ] + ',');
output += ( "Date\n");
for (let i=0;i<length;i++) {
output += ( cstring[ -1 * paneColumnsList[0].vizPaneColumns[2].aliasIndices[i] - 1 ] + ',');
output += ( '"' + cstring[ -1 * paneColumnsList[0].vizPaneColumns[1].aliasIndices[i] - 1 ] + '"' + "\n");
}
console.log( output );
}
// Below, largely cribbed from Thomas Dondorf at https://stackoverflow.com/questions/52969381/how-can-i-capture-all-network-requests-and-full-response-data-when-loading-a-pag
// Entry point: launches a browser through puppeteer, lets page requests through
// one at a time, and hands the "bootstrapSession" response to
// parseDataDictionary. The browser is always closed, even when loading fails.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const [page] = await browser.pages();

    // Simple serial queue: while one request is in flight (`paused`), later
    // requests are parked as thunks and released one by one.
    let paused = false;
    const pausedRequests = [];

    const nextRequest = () => { // continue the next request or "unpause"
      if (pausedRequests.length === 0) {
        paused = false;
      } else {
        // continue first request in "queue"
        (pausedRequests.shift())(); // calls the request.continue function
      }
    };

    await page.setRequestInterception(true);

    page.on('request', (request) => {
      if (paused) {
        pausedRequests.push(() => request.continue());
      } else {
        paused = true; // pause, as we are processing a request now
        request.continue();
      }
    });

    page.on('requestfinished', async (request) => {
      try {
        if (request.url().includes('bootstrapSession')) {
          const response = request.response();
          let responseBody = (await response.buffer()).toString();
          // The body is a run of `<digits>;{...}` chunks. Strip the first
          // prefix, turn each later prefix into a comma, and wrap the result
          // in brackets so it parses as one JSON array.
          // NOTE(review): the second pattern would also rewrite a literal
          // `<digits>;{` occurring inside a JSON string value — confirm
          // whether that can happen.
          responseBody = responseBody.replace(/^\d+;{/g,'{');
          responseBody = responseBody.replace(/\d+;{/g,',{');
          responseBody = '[' + responseBody + ']';
          const jsonParsed = JSON.parse(responseBody);
          parseDataDictionary(jsonParsed);
        }
      } catch (err) {
        // Report the failure, but keep draining the queue; otherwise the page
        // load would stall with requests still parked.
        console.error('Failed to process response for ' + request.url() + ':', err);
      } finally {
        nextRequest(); // continue with next request
      }
    });

    page.on('requestfailed', () => {
      // A failed request still frees its slot in the queue.
      nextRequest();
    });

    await page.goto(URL, { waitUntil: 'networkidle0' });
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Surface launch/navigation errors instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment