Skip to content

Instantly share code, notes, and snippets.

@kami4ka
Created November 8, 2021 18:14
Show Gist options
  • Save kami4ka/2276b16f2e6428961696162092712f38 to your computer and use it in GitHub Desktop.
Save kami4ka/2276b16f2e6428961696162092712f38 to your computer and use it in GitHub Desktop.
Scrape doctors info from Doximity using ScrapingAnt
/**
* Get data from Doximity
*
* ScrapingAnt allows you to scrape for free using proxy servers
* Current setup uses browser-less scraping to save time and API credits
*
* npm install @scrapingant/scrapingant-client
* npm install cheerio
**/
const cheerio = require('cheerio');
const ScrapingAnt = require('@scrapingant/scrapingant-client');
// ScrapingAnt API key handed to the client below. The value here is a
// placeholder and presumably has to be replaced with a real key before running.
const API_KEY = '<SCRAPINANT_API_KEY>';
// Site base URL (note the trailing slash); used to turn the hrefs found on
// listing pages into absolute URLs.
const ROOT = 'https://www.doximity.com/';
// First directory listing page to scrape. Hard-coded to the oncology
// specialty; could be parameterized to substitute another specialty.
const INITIAL_URL = 'https://www.doximity.com/directory/md/specialty/oncology';
// Single ScrapingAnt client instance shared by every scrape call in this file.
const client = new ScrapingAnt({ apiKey: API_KEY });
// Script entry point. `main` is a function declaration further down, so it is
// hoisted and callable here. The resolved array of profile records is printed
// with console.log; any rejection is printed with console.error.
main()
.then(console.log)
.catch(console.error);
/**
 * Run the whole scrape: gather every profile URL reachable from INITIAL_URL,
 * then fetch the profiles one at a time, in listing order.
 *
 * @returns {Promise<Object[]>} One record per profile, in the order the URLs
 *   were discovered. The records can be exported to CSV or XLS with a
 *   third-party library.
 */
async function main() {
  const profileUrls = await getProfileUrls(INITIAL_URL);
  console.log(`Profiles count: ${profileUrls.length}`);

  // Profiles are requested strictly one after another; each request is awaited
  // before the next one starts.
  const records = [];
  for (let index = 0; index < profileUrls.length; index += 1) {
    const record = await getProfileData(profileUrls[index]);
    records.push(record);
  }
  return records;
}
/**
 * Collect the absolute URL of every profile linked from a directory listing,
 * following the "next page" link until there is none left.
 *
 * Hrefs are resolved against ROOT with the WHATWG URL parser rather than by
 * string concatenation, so root-relative hrefs ("/pub/x") do not produce a
 * double slash and already-absolute hrefs are kept intact.
 *
 * @param {string} initialUrl - URL of the first listing page to scrape.
 * @returns {Promise<string[]>} Absolute profile URLs, in page order.
 */
async function getProfileUrls(initialUrl) {
  const urls = [];
  // Guards against a pagination link that points back to a page already seen,
  // which would otherwise keep the loop running forever.
  const visitedPages = new Set();
  let pageUrl = initialUrl;

  while (pageUrl && !visitedPages.has(pageUrl)) {
    visitedPages.add(pageUrl);
    console.log(`Scraping list URL: ${pageUrl}`);

    const html = (await client.scrape(pageUrl, { browser: false })).content;
    const $ = cheerio.load(html);

    for (const profileCard of $('.list-4-col > li > a')) {
      const href = $(profileCard).attr('href');
      // An anchor without an href has nothing to scrape; skip it instead of
      // emitting a bogus ".../undefined" URL.
      if (href) {
        urls.push(new URL(href, ROOT).href);
      }
    }

    const nextPageHref = $('a.next_page').attr('href');
    pageUrl = nextPageHref ? new URL(nextPageHref, ROOT).href : null;
  }

  return urls;
}
/**
 * Scrape a single profile page and extract the doctor's details.
 *
 * List-like sections (education/training, certifications, publications) are
 * flattened to one string each: entries are separated by " | ", and within an
 * entry the ".black" text is followed by ", " and the ".br" text. Publication
 * entries additionally carry ": <href>" after the title when the title element
 * has an href.
 *
 * @param {string} profileUrl - Absolute URL of the profile page.
 * @returns {Promise<Object>} Record with the keys url, firstName, lastName,
 *   credentials, photoUrl, speciality, subSpeciality, address, phone, fax,
 *   educationAndTrainings, certifications and publications. Sections missing
 *   from the page yield empty strings (photoUrl is undefined when absent).
 */
async function getProfileData(profileUrl) {
  console.log(`Scraping profile URL: ${profileUrl}`);

  const html = (await client.scrape(profileUrl, { browser: false })).content;
  const $ = cheerio.load(html);

  // Text of an office contact field with its visible label stripped. The label
  // is removed before trimming so whitespace between label and value is dropped.
  const labelledText = (selector, label) => $(selector).text().replace(label, '').trim();

  // "<title>, <details>" for an education or certification entry.
  const formatTitledEntry = (element) => {
    const entry = $(element);
    return `${entry.find('.black').text()}, ${entry.find('.br').text()}`;
  };

  // "<title>: <link>, <details>" for a publication; the link part is omitted
  // when the title has no href, rather than printing the text "undefined".
  const formatPublication = (element) => {
    const entry = $(element);
    const title = entry.find('.black');
    const link = title.attr('href');
    const heading = link ? `${title.text()}: ${link}` : title.text();
    return `${heading}, ${entry.find('.br').text()}`;
  };

  // Format every element matched by the selector and join the results with
  // " | " (no dangling separator after the last entry).
  const joinEntries = (selector, format) => $(selector).toArray().map(format).join(' | ');

  return {
    url: profileUrl,
    firstName: $('.user-name-first').text(),
    lastName: $('.user-name-last').text(),
    credentials: $('.user-name-credentials').text(),
    photoUrl: $('.profile-photo > img').attr('src'),
    speciality: $('.profile-head-subtitle').text(),
    subSpeciality: $('.user-subspecialty').text(),
    // Only the first office line item is used as the address.
    address: $('.profile-contact-information-office-line-item').first().text().trim(),
    phone: labelledText('.office-info-telephone', 'Phone'),
    fax: labelledText('.office-info-fax', 'Fax'),
    educationAndTrainings: joinEntries(
      'ul.training > li > .profile-section-wrapper-text',
      formatTitledEntry,
    ),
    certifications: joinEntries(
      '.certification-info > ul > li > .profile-section-wrapper-text',
      formatTitledEntry,
    ),
    publications: joinEntries('.publication-lists > div > ul > li', formatPublication),
  };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment