Last active
November 27, 2020 07:36
-
-
Save bcks/2ca28d47ffd3858d9393e172126d9d9c to your computer and use it in GitHub Desktop.
tableau-covid-scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Works with Node v12.0 and puppeteer 4.0
// Address of the embedded Tableau Public view that the scraper navigates to.
const URL = 'https://public.tableau.com/views/PPV_15924847800480/ppv_db?%3Aembed=y&%3AshowVizHome=no&%3Adisplay_count=y&%3Adisplay_static_image=n&%3AbootstrapWhenNotified=true&%3Alanguage=en&:embed=y&:showVizHome=n&:apiID=host0';
const puppeteer = require('puppeteer');
/**
 * Builds one two-column CSV table from a single pane of the viz data.
 *
 * The header is "<label>,Date", where the label is read from the first alias
 * index of column 3 and used as a direct index into the dictionary. Each data
 * row resolves one alias index from each of the two requested columns; those
 * indices are stored as -(position + 1), so they are converted back before
 * the lookup. The second cell of every row is wrapped in double quotes.
 *
 * @param {Array} dictionary - Flat list of values the alias indices point into.
 * @param {Object} pane - One entry of paneColumnsList (has `vizPaneColumns`).
 * @param {number} firstColumn - vizPaneColumns index used for the first cell.
 * @param {number} secondColumn - vizPaneColumns index used for the quoted cell.
 * @returns {string} CSV text, every line (header included) ending in "\n".
 */
function buildPaneCsv(dictionary, pane, firstColumn, secondColumn) {
  const columns = pane.vizPaneColumns;
  const label = dictionary[columns[3].aliasIndices[0]];
  const firstIndices = columns[firstColumn].aliasIndices;
  const secondIndices = columns[secondColumn].aliasIndices;

  // Count rows from the columns that are actually read, so a length mismatch
  // with another column can never produce rows of "undefined".
  const rowCount = Math.min(firstIndices.length, secondIndices.length);
  const lookup = (aliasIndex) => dictionary[-1 * aliasIndex - 1];

  const lines = [`${label},Date`];
  for (let i = 0; i < rowCount; i++) {
    lines.push(`${lookup(firstIndices[i])},"${lookup(secondIndices[i])}"`);
  }
  return `${lines.join('\n')}\n`;
}

/**
 * Extracts two CSV tables from a parsed bootstrapSession response and prints
 * each of them with console.log.
 *
 * The value dictionary is taken from dataColumns[1] of data segment "0"
 * (presumably the string-typed column, going by the original variable name
 * `cstring` — confirm against a live response). The pane list is read from
 * the "PPV" entry of the viz data.
 *
 * @param {Array} jsonParsed - Parsed response; element 1 must carry
 *   `secondaryInfo.presModelMap` with the dataDictionary and vizData trees.
 * @returns {string[]} The two CSV strings, in the order they are printed:
 *   first the table from pane 1, then the table from pane 0.
 * @throws {TypeError} If the expected nested structure is missing.
 */
function parseDataDictionary(jsonParsed) {
  const presModelMap = jsonParsed[1].secondaryInfo.presModelMap;
  const dataColumns = presModelMap.dataDictionary.presModelHolder
    .genDataDictionaryPresModel.dataSegments["0"].dataColumns;
  // To inspect the full data dictionary while debugging, JSON.stringify this.
  const cstring = dataColumns[1].dataValues;
  const paneColumnsList = presModelMap.vizData.presModelHolder
    .genPresModelMapPresModel.presModelMap.PPV.presModelHolder.genVizDataPresModel
    .paneColumnsData.paneColumnsList;

  // Tests Per 10K Residents: pane 1, value from column 1, date from column 2.
  const testsCsv = buildPaneCsv(cstring, paneColumnsList[1], 1, 2);
  console.log(testsCsv);

  // Percent Positive: pane 0, value from column 2, date from column 1.
  const percentCsv = buildPaneCsv(cstring, paneColumnsList[0], 2, 1);
  console.log(percentCsv);

  return [testsCsv, percentCsv];
}
// Below, largely cribbed from Thomas Dondorf at https://stackoverflow.com/questions/52969381/how-can-i-capture-all-network-requests-and-full-response-data-when-loading-a-pag
//
// Loads the Tableau view in a headless browser, lets requests through one at
// a time, and hands the body of the "bootstrapSession" response to
// parseDataDictionary.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const [page] = await browser.pages();

    // Requests are serialized: while one is in flight (`paused === true`),
    // later ones wait in `pausedRequests` as callbacks that resume them.
    let paused = false;
    const pausedRequests = [];

    // Continue the next queued request, or "unpause" when the queue is empty.
    const nextRequest = () => {
      if (pausedRequests.length === 0) {
        paused = false;
      } else {
        // Resume the first request in the queue (calls request.continue()).
        (pausedRequests.shift())();
      }
    };

    await page.setRequestInterception(true);

    page.on('request', (request) => {
      if (paused) {
        pausedRequests.push(() => request.continue());
      } else {
        paused = true; // pause, as we are processing a request now
        request.continue();
      }
    });

    page.on('requestfinished', async (request) => {
      try {
        if (request.url().includes('bootstrapSession')) {
          const response = request.response();
          let responseBody = (await response.buffer()).toString();
          // The body is a sequence of "<digits>;{...}" chunks; strip the
          // numeric prefixes and wrap the objects into a single JSON array.
          responseBody = responseBody.replace(/^\d+;{/g, '{');
          responseBody = responseBody.replace(/\d+;{/g, ',{');
          responseBody = '[' + responseBody + ']';
          parseDataDictionary(JSON.parse(responseBody));
        }
      } catch (err) {
        // Report the failure, but keep draining the queue; otherwise every
        // still-paused request would hang until page.goto times out.
        console.error('Failed to process bootstrapSession response:', err);
      } finally {
        nextRequest(); // continue with next request
      }
    });

    page.on('requestfailed', () => {
      // A failed request still frees the slot for the next one.
      nextRequest();
    });

    await page.goto(URL, { waitUntil: 'networkidle0' });
  } finally {
    // Always shut the browser down, even when navigation throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment