// Created November 8, 2021 18:14
// Gist kami4ka/2276b16f2e6428961696162092712f38: Scrape doctors info from Doximity using ScrapingAnt
// Note from the GitHub page: this file contains bidirectional Unicode text that may be
// interpreted or compiled differently than what appears below. To review, open the file
// in an editor that reveals hidden Unicode characters.
/**
 * Get data from Doximity
 *
 * ScrapingAnt allows you to scrape for free using proxy servers.
 * The current setup uses browser-less scraping to save time and API credits.
 *
 * npm install @scrapingant/scrapingant-client
 * npm install cheerio
 */
// Third-party dependencies: cheerio parses the returned HTML, ScrapingAnt fetches it.
const cheerio = require('cheerio');
const ScrapingAnt = require('@scrapingant/scrapingant-client');

// Placeholder value: replace with your own ScrapingAnt API key before running.
const API_KEY = '<SCRAPINANT_API_KEY>';

// Base used to turn the site-relative hrefs found on the pages into absolute URLs.
const ROOT = 'https://www.doximity.com/';

// Initial URL. Can be improved to substitute a particular speciality
const INITIAL_URL = 'https://www.doximity.com/directory/md/specialty/oncology';

// Single API client shared by every scrape call in this file.
const client = new ScrapingAnt({ apiKey: API_KEY });

// Entry point: print the scraped records, or report the failure and make the
// process exit with a non-zero status so shells and CI can detect it.
main()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
/**
 * Scrape every profile reachable from INITIAL_URL.
 *
 * The profile URLs are collected first; the profiles are then scraped one at
 * a time, each request being awaited before the next one starts.
 *
 * @returns {Promise<Object[]>} One record per profile URL, in the order the
 *   URLs were returned by getProfileUrls.
 */
async function main() {
  // Get all profile URLs
  const profileUrls = await getProfileUrls(INITIAL_URL);
  console.log(`Profiles count: ${profileUrls.length}`);

  // Scrape each profile URL
  const data = [];
  for (const profileUrl of profileUrls) {
    data.push(await getProfileData(profileUrl));
  }

  // Retrieved data can be easily saved to CSV or XLS using third-party library
  return data;
}
/**
 * Collect absolute profile URLs from a listing page and from every page that
 * follows it through the "next page" link.
 *
 * Hrefs are resolved against ROOT with the URL constructor, so a leading
 * slash in the href does not produce a doubled slash in the result. Anchors
 * without an href are skipped. Pagination stops when there is no next link
 * or when the next link points at a page that was already scraped.
 *
 * @param {string} initialUrl - Absolute URL of the first listing page.
 * @returns {Promise<string[]>} Absolute profile URLs in page order.
 */
async function getProfileUrls(initialUrl) {
  const urls = [];
  const visitedPages = new Set();
  let pageUrl = initialUrl;

  while (pageUrl && !visitedPages.has(pageUrl)) {
    visitedPages.add(pageUrl);
    console.log(`Scraping list URL: ${pageUrl}`);

    const html = (await client.scrape(pageUrl, { browser: false })).content;
    const $ = cheerio.load(html);

    for (const profileCard of $('.list-4-col > li > a')) {
      const href = $(profileCard).attr('href');
      if (href) {
        urls.push(new URL(href, ROOT).href);
      }
    }

    const nextPageHref = $('a.next_page').attr('href');
    pageUrl = nextPageHref ? new URL(nextPageHref, ROOT).href : null;
  }

  return urls;
}
/**
 * Build one string out of a set of matched elements.
 *
 * Each element is wrapped with `$`, passed to `formatEntry`, and the returned
 * text is appended followed by ' | ' (so a non-empty result ends with ' | ').
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @param {Iterable} elements - Matched elements to walk.
 * @param {Function} formatEntry - Maps a wrapped element to its text.
 * @returns {string} Concatenated entries, or '' when nothing matched.
 */
function collectEntries($, elements, formatEntry) {
  let joined = '';
  for (const element of elements) {
    joined += `${formatEntry($(element))} | `;
  }
  return joined;
}

/**
 * Scrape a single profile page and extract its fields.
 *
 * @param {string} profileUrl - Absolute URL of the profile page.
 * @returns {Promise<Object>} Record with the keys url, firstName, lastName,
 *   credentials, photoUrl, speciality, subSpeciality, address, phone, fax,
 *   educationAndTrainings, certifications and publications. Text fields are
 *   '' when the selector matches nothing; photoUrl is undefined in that case.
 */
async function getProfileData(profileUrl) {
  console.log(`Scraping profile URL: ${profileUrl}`);

  const html = (await client.scrape(profileUrl, { browser: false })).content;
  const $ = cheerio.load(html);

  // Shared "<title>, <detail>" layout used by trainings and certifications.
  const titleAndDetail = (entry) =>
    `${entry.find('.black').text()}, ${entry.find('.br').text()}`;

  // Publications also carry a link; a missing href becomes an empty string
  // instead of the literal text "undefined".
  const publicationEntry = (entry) => {
    const title = entry.find('.black');
    return `${title.text()}: ${title.attr('href') || ''}, ${entry.find('.br').text()}`;
  };

  return {
    url: profileUrl,
    firstName: $('.user-name-first').text(),
    lastName: $('.user-name-last').text(),
    credentials: $('.user-name-credentials').text(),
    photoUrl: $('.profile-photo > img').attr('src'),
    speciality: $('.profile-head-subtitle').text(),
    subSpeciality: $('.user-subspecialty').text(),
    address: $('.profile-contact-information-office-line-item').first().text().trim(),
    // Strip the label first, then trim, so the whitespace that separated the
    // label from the number does not survive at the start of the value.
    phone: $('.office-info-telephone').text().replace('Phone', '').trim(),
    fax: $('.office-info-fax').text().replace('Fax', '').trim(),
    educationAndTrainings: collectEntries(
      $,
      $('ul.training > li > .profile-section-wrapper-text'),
      titleAndDetail,
    ),
    certifications: collectEntries(
      $,
      $('.certification-info > ul > li > .profile-section-wrapper-text'),
      titleAndDetail,
    ),
    publications: collectEntries(
      $,
      $('.publication-lists > div > ul > li'),
      publicationEntry,
    ),
  };
}
// GitHub page footer: Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment