Created
September 13, 2021 08:28
-
-
Save kami4ka/00eff02b3f124daa9598c8d5ac76b0f4 to your computer and use it in GitHub Desktop.
Most popular Medium writers scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Get the most popular Medium writers by tags.
 *
 * ScrapingAnt allows you to scrape for free using proxy servers.
 *
 * Install the dependencies first:
 *   npm install @scrapingant/scrapingant-client
 *   npm install cheerio
 **/
const ScrapingAntClient = require('@scrapingant/scrapingant-client');
const cheerio = require('cheerio');

// ScrapingAnt API client. The key is taken from the SCRAPINGANT_API_KEY
// environment variable when it is set, so the secret does not have to live in
// the source file; the literal placeholder remains as a fallback for users who
// edit the key in place.
const client = new ScrapingAntClient({
  apiKey: process.env.SCRAPINGANT_API_KEY || "<SCRAPINGANT_API_KEY>",
});

// Medium tags whose tag pages are scraped for writers.
const tags = ['leadership', 'productivity', 'startup', 'technology', 'creativity', 'entrepreneurship'];

// Captures an element's text of the form "<count> Followers" (e.g.
// "1.2K Followers") that is immediately followed by a closing tag.
// `+` requires at least one count character so a bare " Followers" is not
// captured, and the optional "s" also accepts the singular "1 Follower".
const followersRegex = />([0-9KM.]+ Followers?)<\//;

// Snippet passed to the scraper as `js_snippet`: clicks the first <p> whose
// text is exactly "See More" (presumably this opens the top-writers dialog
// that the tag-page parser reads — confirm against the live page).
const openTopWritersJS = `document.evaluate("//p[text()='See More']", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.click()`;
/**
 * Entry point.
 *
 * 1. Scrapes each tag page (running the "See More" click snippet) and collects
 *    the writers found on it.
 * 2. Scrapes every collected writer URL and attaches the followers text.
 * 3. Prints the resulting array.
 *
 * Requests run sequentially. A failure on a single tag or writer is logged and
 * skipped so that the results gathered so far are not lost; a writer whose
 * followers text could not be obtained gets `followers: null`.
 */
(async () => {
  const writers = [];

  for (const tag of tags) {
    try {
      const response = await client.scrape(getTagURL(tag), { proxy_country: 'US', js_snippet: openTopWritersJS });
      writers.push(...getDataFromTagPage(response.content));
    } catch (err) {
      console.error(`Failed to scrape tag "${tag}":`, err);
    }
  }

  for (const writer of writers) {
    try {
      const writerResponse = await client.scrape(writer.url, { proxy_country: 'US' });
      writer.followers = extractFollowersCount(writerResponse.content);
    } catch (err) {
      writer.followers = null;
      console.error(`Failed to get followers for ${writer.url}:`, err);
    }
  }

  // scraped results
  console.log(writers);
})().catch((err) => {
  // Last-resort handler so an unexpected error never becomes an unhandled
  // promise rejection; signal failure through the process exit code.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Build the Medium tag page URL for a tag.
 *
 * The tag is percent-encoded so that spaces, slashes or other reserved
 * characters cannot break or alter the URL path. Plain tags such as
 * "leadership" are returned unchanged.
 *
 * @param {string} tag - Tag name.
 * @returns {string} Absolute URL of the tag page.
 */
function getTagURL(tag) {
  return `https://medium.com/tag/${encodeURIComponent(tag)}`;
}
/**
 * Extract writers from the HTML of a tag page.
 *
 * Looks at every anchor inside `div[role=dialog]` and keeps the odd-indexed
 * ones (presumably each writer is rendered as two consecutive anchors and the
 * second one carries the name — confirm against the live markup).
 *
 * @param {string} html - HTML of the tag page.
 * @returns {Array<{url: string, name: string}>} One entry per kept anchor:
 *   its absolute URL and the text of the <h2> inside it. Empty when the page
 *   has no dialog or no anchors.
 */
function getDataFromTagPage(html) {
  const pageResults = [];
  const $ = cheerio.load(html);
  const links = $('div[role=dialog]').find('a');

  links.each((i, link) => {
    if (i % 2 !== 1) {
      return;
    }

    const anchor = $(link);
    const href = anchor.attr('href');
    if (!href) {
      // An anchor without href has no URL to scrape; skip it instead of
      // letting the URL helper throw and abort the whole page.
      return;
    }

    pageResults.push({
      url: getFullURL(href),
      name: anchor.find('h2').text(),
    });
  });

  return pageResults;
}
/**
 * Turn an href taken from a Medium page into an absolute URL.
 *
 * - Protocol-relative hrefs ("//host/path") get the "https:" scheme.
 * - Root-relative hrefs ("/path") are prefixed with "https://medium.com".
 * - Anything else is returned unchanged.
 *
 * @param {string} href - The raw href attribute value.
 * @returns {string} The absolute URL.
 * @throws {TypeError} If `href` is not a string (e.g. the attribute is missing).
 */
function getFullURL(href) {
  if (typeof href !== 'string') {
    throw new TypeError(`Expected href to be a string, got ${typeof href}`);
  }
  // Must be checked before the single-slash case, which it would also match.
  if (href.startsWith('//')) {
    return `https:${href}`;
  }
  if (href.startsWith('/')) {
    return `https://medium.com${href}`;
  }
  return href;
}
/**
 * Extract the followers text (e.g. "1.2K Followers") from a writer page.
 *
 * @param {string} writerContent - HTML of the writer's page.
 * @returns {string|null} The first capture group of `followersRegex`, or
 *   `null` when the page contains no matching followers text.
 */
function extractFollowersCount(writerContent) {
  const match = followersRegex.exec(writerContent);
  // exec() yields null on no match; indexing it directly would throw.
  return match === null ? null : match[1];
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment