Skip to content

Instantly share code, notes, and snippets.

@javipus
Created May 12, 2021 08:12
Show Gist options
  • Save javipus/6841e9bb8865e7f410e52401ed366397 to your computer and use it in GitHub Desktop.
Save javipus/6841e9bb8865e7f410e52401ed366397 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scraping Psychology Today for Mental Health Professionals in a given city"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from selenium import webdriver\n",
"import time\n",
"import lxml\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is the path to the driver executable. You must have downloaded it before running this notebook.\n",
"# Some download links (working as of 2021/05/12):\n",
"# - Geckodriver (Firefox): https://github.com/mozilla/geckodriver/releases\n",
"# - Chromedriver: https://chromedriver.chromium.org/downloads\n",
"executable_path = \"PATH_TO_DRIVER\"  # placeholder: replace with the driver's path on your machine\n",
"# Change this if you're using a different driver, e.g. `webdriver.Chrome`\n",
"# (must match the executable above: geckodriver <-> Firefox, chromedriver <-> Chrome)\n",
"driver_ = webdriver.Firefox\n",
"\n",
"# URL to start scraping from. It must contain a list of therapists in a given city.\n",
"# Examples:\n",
"# - NYC: https://www.psychologytoday.com/us/therapists/ny/new-york\n",
"# - London: https://www.psychologytoday.com/gb/counselling/eng/london\n",
"# - Buenos Aires: https://www.psychologytoday.com/ar/psicologos/ba/buenos-aires\n",
"main_url = \"CITY_URL\"  # placeholder: replace with one of the listing URLs above"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's get scraping!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def scrape_me(url=main_url):\n",
"    \"\"\"Crawl therapist listing pages starting at `url`.\n",
"\n",
"    Follows the 'next page' button until it disappears, collecting the\n",
"    phone number and profile URL from every therapist card on each page.\n",
"\n",
"    Returns a list of dicts with keys 'phone' and 'url'.\n",
"    \"\"\"\n",
"    driver = driver_(executable_path=executable_path)\n",
"    ret = []  # accumulated therapist records\n",
"    try:\n",
"        while True:\n",
"            time.sleep(1)  # 1 second sleep to limit bot detection\n",
"            driver.get(url)  # open the url in selenium\n",
"\n",
"            # grab the content with beautifulsoup for parsing\n",
"            soup = BeautifulSoup(driver.page_source, 'lxml')\n",
"            # main table contains all doctors and some extra stuff\n",
"            main_table = soup.findAll('div', {'class': 'col-12 col-sm-12 col-md-12 col-lg-10 push-lg-2 results-column'})[0]\n",
"            # select doctors in main table\n",
"            docs = main_table.findAll('div', {'class': 'row'}, recursive=False)\n",
"\n",
"            # build a list of dictionaries filled with doctor info\n",
"            for doc in docs:\n",
"                basic_info = doc.div.div.attrs\n",
"                ret.append({\n",
"                    'phone': basic_info['data-phone'].replace(' ', ''),\n",
"                    'url': basic_info['data-profile-url'],\n",
"                })\n",
"            try:\n",
"                # follow the 'next page' button; when it is missing, we are on the last page\n",
"                url = soup.find('a', {'class': 'btn btn-default btn-next'}, recursive=True).attrs['href']\n",
"            except Exception as e:\n",
"                print(e)\n",
"                break\n",
"    finally:\n",
"        driver.quit()  # always release the browser, even if scraping fails mid-crawl\n",
"    print(\"Scraping Complete!\")\n",
"    return ret"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clean up scraped data and save it to CSV"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def parse_therapist(url, driver):\n",
"    \"\"\"Scrape a single therapist profile page.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    url : str\n",
"        Profile URL, as collected by `scrape_me`.\n",
"    driver : selenium webdriver\n",
"        An already-open browser instance, reused across calls.\n",
"\n",
"    Returns a dict with address, postalcode, city, specialties, style,\n",
"    price and personal-statement text (None where a field is missing).\n",
"    \"\"\"\n",
"    # Load profile page; sleep right after the request to rate-limit the crawl\n",
"    driver.get(url)\n",
"    time.sleep(1)\n",
"    soup = BeautifulSoup(driver.page_source, 'lxml')\n",
"\n",
"    ret = {}\n",
"\n",
"    def get_text(node):\n",
"        # Return the stripped text of a tag, or None if the tag was not found\n",
"        try:\n",
"            return node.text.strip()\n",
"        except AttributeError:\n",
"            return\n",
"\n",
"    # Location info (guard: some profiles have no address block at all)\n",
"    location = soup.find('div', {'class': 'location-address-phone'})\n",
"    if location:\n",
"        ret['address'] = get_text(location.find('span', {'itemprop': 'streetAddress'}))\n",
"        ret['postalcode'] = get_text(location.find('span', {'itemprop': 'postalcode'}))\n",
"        ret['city'] = get_text(location.find('span', {'itemprop': 'addressLocality'}))\n",
"    else:\n",
"        ret['address'] = ret['postalcode'] = ret['city'] = None\n",
"\n",
"    # Specialties and treatment style (each list may be absent)\n",
"    specialties = soup.find('ul', {'class': 'specialties-list'})\n",
"    ret['specialties'] = [get_text(item) for item in specialties.findAll('li')] if specialties else None\n",
"\n",
"    style = soup.find('div', {'class': 'attributes-treatment-orientation'})\n",
"    ret['style'] = [get_text(item) for item in style.findAll('li')] if style else None\n",
"\n",
"    # Price\n",
"    ret['price'] = get_text(soup.find('div', {'class': 'finances-office'}))\n",
"\n",
"    # Personal statement text\n",
"    ret['text'] = get_text(soup.find('div', {'class': 'section profile-personalstatement'}))\n",
"\n",
"    return ret"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Crawl all listing pages and collect phone + profile URL for each therapist\n",
"docs = scrape_me()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parse every therapist profile collected above.\n",
"# One browser instance is reused for all profiles and closed afterwards.\n",
"driver = webdriver.Firefox(executable_path=executable_path)\n",
"try:\n",
"    df = []\n",
"    for doc in docs:\n",
"        info = parse_therapist(doc['url'], driver=driver)\n",
"        df.append({**doc, **info})  # merge listing fields with profile fields\n",
"finally:\n",
"    driver.quit()  # always release the browser, even if a profile fails to parse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cast to dataframe and save\n",
"df = pd.DataFrame(df)\n",
"# index=False avoids writing the meaningless RangeIndex as an extra unnamed CSV column\n",
"df.to_csv(\"data.csv\", index=False)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment