Created
September 3, 2019 09:30
-
-
Save beatobongco/bec6d7c82086b780be8b113ca179d3fb to your computer and use it in GitHub Desktop.
Scraping a webpage via console
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Jump to the bottom of the page every 250 ms so the page keeps being scrolled
// downwards. The timer handle is kept in `interval` so the scrolling can be
// stopped later with `clearInterval(interval)`.
const interval = setInterval(() => {
  window.scrollTo(0, document.body.scrollHeight)
}, 250)
// Once no more items are being loaded, call `clearInterval(interval)` to stop scrolling.
// Then paste the code below to scrape the data and download it as a file.
;(() => { | |
/**
 * Scrape the title and abstract of every entry currently in the document.
 *
 * One row is produced per title link (`.media-body h4 a`). The abstract is
 * looked up inside that link's own `.media-body` container rather than by
 * position in a separate page-wide list, so an entry without an abstract
 * cannot shift the abstracts of the entries after it.
 *
 * @returns {Array<[string, string]>} `[title, abstract]` pairs in document
 *   order; the abstract is '' when the entry has no `.li16` element.
 */
function scrapeData () {
  // querySelectorAll returns a NodeList, so Array.from is used to get a mapped Array
  return Array.from(document.querySelectorAll('.media-body h4 a'), link => {
    const entry = link.closest('.media-body')
    const abstractNode = entry ? entry.querySelector('.li16') : null
    return [link.innerText, abstractNode ? abstractNode.innerText : '']
  })
}
/**
 * Offer `text` to the user as a file download named `filename`.
 *
 * The content is handed to the browser through a Blob object URL instead of
 * being inlined into a `data:` URI, because browsers cap the length of data
 * URIs and a large scrape could exceed that cap.
 *
 * @param {string} filename - Name suggested for the saved file.
 * @param {string} text - File content; stored as UTF-8.
 * @param {string} [mimeType] - Media type of the content. When omitted it is
 *   'text/tab-separated-values' for names ending in `.tsv`, else 'text/csv'.
 */
function download (filename, text, mimeType) {
  const type = mimeType ||
    (/\.tsv$/i.test(filename) ? 'text/tab-separated-values' : 'text/csv')
  const blob = new Blob([text], { type: type + ';charset=utf-8' })
  const url = URL.createObjectURL(blob)

  // A hidden, temporary link is the only way to trigger a named download from script
  const element = document.createElement('a')
  element.setAttribute('href', url)
  element.setAttribute('download', filename)
  element.style.display = 'none'
  document.body.appendChild(element)
  try {
    element.click()
  } finally {
    document.body.removeChild(element)
    // Revoke on a later tick so the browser has already picked up the URL
    setTimeout(() => URL.revokeObjectURL(url), 100)
  }
}
// Serialise the scraped rows as tab-separated values (one CRLF-terminated line
// per row) and hand the result to the browser as `scraped.tsv`.
// A tab or line break inside a field would be read back as an extra column or
// row, so every run of them is collapsed into a single space first.
const dataset = scrapeData()
const csvContent = dataset
  .map(row => row
    .map(field => String(field == null ? '' : field).replace(/[\t\r\n]+/g, ' '))
    .join('\t') + '\r\n')
  .join('')
download('scraped.tsv', csvContent)
})() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment