Created
May 12, 2021 08:12
-
-
Save javipus/6841e9bb8865e7f410e52401ed366397 to your computer and use it in GitHub Desktop.
Adapted from https://github.com/edenbaus/webscrape-Psychology-today/blob/master/scraping-psych-today.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Scraping Psychology Today for Mental Health Professionals in a given city" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Load libraries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"from selenium import webdriver\n", | |
"import time\n", | |
"import lxml\n", | |
"import pandas as pd\n", | |
"from bs4 import BeautifulSoup" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Parameters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# This is the path to the driver executable. You must have downloaded it before running this notebook,\n", | |
"# and the placeholder below must be replaced with its location.\n", | |
"# Some download links (working as of 2021/05/12):\n", | |
"# - Geckodriver (Firefox): https://github.com/mozilla/geckodriver/releases\n", | |
"# - Chromedriver: https://chromedriver.chromium.org/downloads\n", | |
"executable_path = \"PATH_TO_DRIVER\"\n", | |
"# WebDriver class (not an instance); `scrape_me` calls it with `executable_path=executable_path`.\n", | |
"# Change this if you're using a different driver, e.g. `webdriver.Chrome`\n", | |
"driver_ = webdriver.Firefox\n", | |
"\n", | |
"# URL to start scraping from (placeholder: replace it). It must contain a list of therapists in a given city.\n", | |
"# Examples:\n", | |
"# - NYC: https://www.psychologytoday.com/us/therapists/ny/new-york\n", | |
"# - London: https://www.psychologytoday.com/gb/counselling/eng/london\n", | |
"# - Buenos Aires: https://www.psychologytoday.com/ar/psicologos/ba/buenos-aires\n", | |
"main_url = \"CITY_URL\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Let's get scraping!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def scrape_me(url=main_url):\n", | |
" driver = driver_(executable_path=executable_path)\n", | |
" ret = [] #generating empty list\n", | |
" while True:\n", | |
" time.sleep(1) #added a 1 second sleep to limit bot detection\n", | |
" driver.get(url) #open the url in selenium\n", | |
" \n", | |
" # grab the content with beautifulsoup for parsing\n", | |
" soup = BeautifulSoup(driver.page_source,'lxml')\n", | |
" # main table contains all doctors and some extra stuff\n", | |
" main_table = soup.findAll('div',{'class':'col-12 col-sm-12 col-md-12 col-lg-10 push-lg-2 results-column'})[0]\n", | |
" # select doctors in main table\n", | |
" docs = main_table.findAll('div',{'class':'row'}, recursive=False)\n", | |
" \n", | |
" #building a list of dictionaries - filled with doctor info\n", | |
" for doc in docs:\n", | |
" basic_info = doc.div.div.attrs\n", | |
" text_info = doc.div.div.findAll('div', recursive=False)[2]\n", | |
"\n", | |
" ret += [{\n", | |
" 'phone': basic_info['data-phone'].replace(' ', ''),\n", | |
" 'url': basic_info['data-profile-url'],\n", | |
" }]\n", | |
" try: \n", | |
" url = soup.find('a', {'class': 'btn btn-default btn-next'}, recursive=True).attrs['href']\n", | |
" except Exception as e:\n", | |
" print(e)\n", | |
" break\n", | |
" print(\"Scraping Complete!\")\n", | |
" return ret" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Clean up scraped data and save it to CSV" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parse_therapist(url, driver):\n", | |
" # Load profile page\n", | |
" driver.get(url)\n", | |
" soup = BeautifulSoup(driver.page_source, 'lxml')\n", | |
" \n", | |
" ret = {}\n", | |
" \n", | |
" time.sleep(1)\n", | |
" \n", | |
" def get_text(node):\n", | |
" try:\n", | |
" return node.text.strip()\n", | |
" except AttributeError:\n", | |
" return\n", | |
" \n", | |
" # Location info\n", | |
" location = soup.find('div', {'class': 'location-address-phone'})\n", | |
" ret['address'] = get_text(location.find('span', {'itemprop': 'streetAddress'}))\n", | |
" ret['postalcode'] = get_text(location.find('span', {'itemprop': 'postalcode'}))\n", | |
" ret['city'] = get_text(location.find('span', {'itemprop': 'addressLocality'}))\n", | |
" \n", | |
" # Specialties and style\n", | |
" specialties = soup.find('ul', {'class': 'specialties-list'})\n", | |
" if specialties:\n", | |
" ret['specialties'] = [get_text(item) for item in specialties.findAll('li')]\n", | |
" else:\n", | |
" ret['specialties'] = None\n", | |
" \n", | |
" style = soup.find('div', {'class': 'attributes-treatment-orientation'})\n", | |
" if style:\n", | |
" ret['style'] = [get_text(item) for item in style.findAll('li')]\n", | |
" else:\n", | |
" ret['style'] = None\n", | |
" \n", | |
" # Price\n", | |
" ret['price'] = get_text(soup.find('div', {'class': 'finances-office'}))\n", | |
" \n", | |
" # Text\n", | |
" ret['text'] = get_text(soup.find('div', {'class': 'section profile-personalstatement'}))\n", | |
" \n", | |
" return ret" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Crawl all pages\n", | |
"docs = scrape_me()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Parse every therapist\n", | |
"driver = webdriver.Firefox(executable_path=executable_path)\n", | |
"df = []\n", | |
"for doc in docs:\n", | |
" info = parse_therapist(doc['url'], driver=driver)\n", | |
" df.append({**doc, **info})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Cast to dataframe and save\n", | |
"# `df` is rebound here from the list of per-therapist dicts to a DataFrame\n", | |
"df = pd.DataFrame(df)\n", | |
"# Relative path: the file is written to the current working directory.\n", | |
"# NOTE(review): with pandas defaults the row index is written as the first column - confirm this is wanted\n", | |
"df.to_csv(\"data.csv\")" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment