Created
November 23, 2021 17:31
-
-
Save kami4ka/6cb40dfbce19aed0ba538d44325de9a0 to your computer and use it in GitHub Desktop.
Reddit scraping with ScrapingAnt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Get posts data from Reddit.
 *
 * ScrapingAnt allows you to scrape for free using proxy servers.
 *
 * Install the dependencies first:
 *   npm install @scrapingant/scrapingant-client
 *   npm install cheerio
 */
const cheerio = require('cheerio'); | |
const ScrapingAnt = require('@scrapingant/scrapingant-client'); | |
// Read the API key from the environment when available so the secret does not
// have to be committed to source; the placeholder below is the fallback.
const API_KEY = process.env.SCRAPINGANT_API_KEY || '<SCRAPINGANT_API_KEY>';
const URL_TO_SCRAPE = 'https://www.reddit.com/r/webscraping/';
// Origin used to turn the relative post links found on the page into absolute URLs.
const BASE_URL = 'https://www.reddit.com';
const POSTS_NUMBER = 100; // Due to 30 seconds of maximum execution time, the max value is about 100
const client = new ScrapingAnt({ apiKey: API_KEY });

// Entry point: print the scraped posts, or report the failure and make the
// process exit with a non-zero status so shell callers can detect it.
main()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
/**
 * Scrape the subreddit page and extract basic data for every post found.
 *
 * Requests URL_TO_SCRAPE through the ScrapingAnt client, passing the scroll
 * snippet built by getCustomJS(POSTS_NUMBER) as `js_snippet`, then parses the
 * returned HTML with cheerio.
 *
 * @returns {Promise<Array<{url: (string|null), title: string, timestamp: string}>>}
 *   One entry per `div[data-testid="post-container"]` element. `url` is the
 *   absolute post URL, or null when the post has no usable body link.
 */
async function main() {
  // Get all posts data
  const customJS = getCustomJS(POSTS_NUMBER);
  const responseResult = await client.scrape(URL_TO_SCRAPE, { js_snippet: customJS });
  const $ = cheerio.load(responseResult.content);

  // Resolve an href against BASE_URL. Plain concatenation would yield
  // "...comundefined" for a missing anchor and corrupt already-absolute links.
  const toAbsoluteUrl = (href) => {
    if (!href) {
      return null;
    }
    try {
      return new URL(href, BASE_URL).href;
    } catch (error) {
      // Unparseable href: report "no link" rather than aborting the whole scrape.
      return null;
    }
  };

  return $('div[data-testid="post-container"]')
    .map((i, el) => {
      const post = $(el);
      return {
        url: toAbsoluteUrl(post.find('a[data-click-id="body"]').attr('href')),
        title: post.find('h3').text(),
        timestamp: post.find('a[data-click-id="timestamp"]').text(),
      };
    })
    .get();
}
/**
 * Build the JavaScript snippet that scrolls the page to its bottom repeatedly,
 * pausing 2000 ms after each scroll.
 *
 * The scroll step is emitted floor(postNumber / 25) + 1 times, so there is
 * always at least one scroll for any non-negative count.
 *
 * @param {number} postNumber - Desired number of posts to load.
 * @returns {string} The snippet source; an empty string when the computed
 *   batch count is negative or not a finite number.
 */
function getCustomJS(postNumber) {
  const POSTS_PER_SCROLL = 25; // Preliminary value, can be tweaked
  const SCROLL_STEP = 'window.scrollTo(0,document.body.scrollHeight);\n' +
    'await new Promise(r => setTimeout(r, 2000));\n';

  // Math.trunc instead of parseInt: parseInt stringifies its argument first,
  // so a quotient such as 1e-7 would be misread as 1.
  const fullBatches = Math.trunc(postNumber / POSTS_PER_SCROLL);
  if (!Number.isFinite(fullBatches) || fullBatches < 0) {
    return '';
  }

  return SCROLL_STEP.repeat(fullBatches + 1);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment