javipus · May 12, 2021 08:12
diff --git a/scraping-psych-today.ipynb b/scraping-psych-today.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scraping Psychology Today for Mental Health Professionals in a given city"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from selenium import webdriver\n",
    "import time\n",
    "import lxml\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is the path to the driver executable. You must have downloaded it before running this notebook.\n",
    "# Some download links (working as of 2021/05/12):\n",
    "# - Geckodriver (Firefox): https://github.com/mozilla/geckodriver/releases\n",
    "# - Chromedriver: https://chromedriver.chromium.org/downloads\n",
    "executable_path = \"PATH_TO_DRIVER\"\n",
    "# Change this if you're using a different driver, e.g. `webdriver.Chrome`\n",
    "driver_ = webdriver.Firefox\n",
    "\n",
    "# URL to start scraping from. It must contain a list of therapists in a given city.\n",
    "# Examples:\n",
    "# - NYC: https://www.psychologytoday.com/us/therapists/ny/new-york\n",
    "# - London: https://www.psychologytoday.com/gb/counselling/eng/london\n",
    "# - Buenos Aires: https://www.psychologytoday.com/ar/psicologos/ba/buenos-aires\n",
    "main_url = \"CITY_URL\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's get scraping!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_me(url=main_url):\n",
    "    \"\"\"Crawl the paginated listing and collect every therapist's phone and profile URL.\n",
    "\n",
    "    Opens a browser with `driver_`, loads `url`, reads each result row, then\n",
    "    follows the 'next' button until the page has none. The browser is always\n",
    "    closed before returning, even if a page fails to parse.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Listing page to start from. Defaults to `main_url`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One dict per listed therapist, with keys 'phone' and 'url'.\n",
    "    \"\"\"\n",
    "    results_class = 'col-12 col-sm-12 col-md-12 col-lg-10 push-lg-2 results-column'\n",
    "    driver = driver_(executable_path=executable_path)\n",
    "    ret = []\n",
    "    try:\n",
    "        while True:\n",
    "            time.sleep(1)    # 1 second pause between pages to limit bot detection\n",
    "            driver.get(url)\n",
    "\n",
    "            # grab the content with beautifulsoup for parsing\n",
    "            soup = BeautifulSoup(driver.page_source, 'lxml')\n",
    "            # main table contains all doctors and some extra stuff\n",
    "            main_table = soup.find('div', {'class': results_class})\n",
    "            if main_table is None:\n",
    "                print(f\"No results table found at {url}\")\n",
    "                break\n",
    "\n",
    "            # select doctors in main table: its direct child rows\n",
    "            for doc in main_table.findAll('div', {'class': 'row'}, recursive=False):\n",
    "                basic_info = doc.div.div.attrs\n",
    "                ret.append({\n",
    "                    'phone': basic_info['data-phone'].replace(' ', ''),\n",
    "                    'url': basic_info['data-profile-url'],\n",
    "                })\n",
    "\n",
    "            # stop when the page has no 'next' button to follow\n",
    "            next_button = soup.find('a', {'class': 'btn btn-default btn-next'})\n",
    "            if next_button is None or not next_button.get('href'):\n",
    "                break\n",
    "            url = next_button['href']\n",
    "    finally:\n",
    "        driver.quit()    # do not leave a browser process behind\n",
    "    print(\"Scraping Complete!\")\n",
    "    return ret"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean up scraped data and save it to csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_therapist(url, driver):\n",
    "    \"\"\"Load one therapist profile page and extract its details.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Profile page to load.\n",
    "    driver : selenium webdriver instance\n",
    "        Already-open browser session used to fetch the page.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Keys 'address', 'postalcode', 'city', 'specialties', 'style',\n",
    "        'price' and 'text'. 'specialties' and 'style' are lists of strings;\n",
    "        the rest are strings. Any value is None when its element is not\n",
    "        found on the page.\n",
    "    \"\"\"\n",
    "    # Load profile page\n",
    "    driver.get(url)\n",
    "    soup = BeautifulSoup(driver.page_source, 'lxml')\n",
    "\n",
    "    time.sleep(1)    # pause between profile requests\n",
    "\n",
    "    def get_text(node):\n",
    "        # stripped text of `node`, or None when the node was not found\n",
    "        try:\n",
    "            return node.text.strip()\n",
    "        except AttributeError:\n",
    "            return None\n",
    "\n",
    "    def get_items(container):\n",
    "        # stripped text of every <li> under `container`, or None when it was not found\n",
    "        if container is None:\n",
    "            return None\n",
    "        return [get_text(item) for item in container.findAll('li')]\n",
    "\n",
    "    ret = {}\n",
    "\n",
    "    # Location info; guard against profiles without a location block\n",
    "    location = soup.find('div', {'class': 'location-address-phone'})\n",
    "    location_fields = [\n",
    "        ('address', 'streetAddress'),\n",
    "        ('postalcode', 'postalcode'),\n",
    "        ('city', 'addressLocality'),\n",
    "    ]\n",
    "    for key, itemprop in location_fields:\n",
    "        span = location.find('span', {'itemprop': itemprop}) if location is not None else None\n",
    "        ret[key] = get_text(span)\n",
    "\n",
    "    # Specialties and style\n",
    "    ret['specialties'] = get_items(soup.find('ul', {'class': 'specialties-list'}))\n",
    "    ret['style'] = get_items(soup.find('div', {'class': 'attributes-treatment-orientation'}))\n",
    "\n",
    "    # Price\n",
    "    ret['price'] = get_text(soup.find('div', {'class': 'finances-office'}))\n",
    "\n",
    "    # Text\n",
    "    ret['text'] = get_text(soup.find('div', {'class': 'section profile-personalstatement'}))\n",
    "\n",
    "    return ret"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Crawl all pages\n",
    "docs = scrape_me()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse every therapist, reusing one browser session for all profiles.\n",
    "# Uses the `driver_` chosen in the Parameters cell so a non-Firefox driver works here too.\n",
    "driver = driver_(executable_path=executable_path)\n",
    "df = []\n",
    "try:\n",
    "    for doc in docs:\n",
    "        info = parse_therapist(doc['url'], driver=driver)\n",
    "        # merge the listing fields (phone, url) with the profile fields\n",
    "        df.append({**doc, **info})\n",
    "finally:\n",
    "    driver.quit()    # always close the browser, even if a profile fails to parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast to dataframe and save\n",
    "df = pd.DataFrame(df)\n",
    "df.to_csv(\"data.csv\")"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Scraping Psychology Today for Mental Health Professionals in a given city"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Load libraries"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"from selenium import webdriver\n",
	"import time\n",
	"import lxml\n",
	"import pandas as pd\n",
	"from bs4 import BeautifulSoup"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Parameters"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# This is the path to the driver executable. You must have downloaded it before running this notebook.\n",
	"# Some download links (working as of 2021/05/12):\n",
	"# - Geckodriver (Firefox): https://github.com/mozilla/geckodriver/releases\n",
	"# - Chromedriver: https://chromedriver.chromium.org/downloads\n",
	"executable_path = \"PATH_TO_DRIVER\"\n",
	"# Change this if you're using a different driver, e.g. `webdriver.Chrome`\n",
	"driver_ = webdriver.Firefox\n",
	"\n",
	"# URL to start scraping from. It must contain a list of therapists in a given city.\n",
	"# Examples:\n",
	"# - NYC: https://www.psychologytoday.com/us/therapists/ny/new-york\n",
	"# - London: https://www.psychologytoday.com/gb/counselling/eng/london\n",
	"# - Buenos Aires: https://www.psychologytoday.com/ar/psicologos/ba/buenos-aires\n",
	"main_url = \"CITY_URL\""
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Let's get scraping!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def scrape_me(url=main_url):\n",
	"    \"\"\"Crawl the paginated listing and collect every therapist's phone and profile URL.\n",
	"\n",
	"    Opens a browser with `driver_`, loads `url`, reads each result row, then\n",
	"    follows the 'next' button until the page has none. The browser is always\n",
	"    closed before returning, even if a page fails to parse.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    url : str\n",
	"        Listing page to start from. Defaults to `main_url`.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    list of dict\n",
	"        One dict per listed therapist, with keys 'phone' and 'url'.\n",
	"    \"\"\"\n",
	"    results_class = 'col-12 col-sm-12 col-md-12 col-lg-10 push-lg-2 results-column'\n",
	"    driver = driver_(executable_path=executable_path)\n",
	"    ret = []\n",
	"    try:\n",
	"        while True:\n",
	"            time.sleep(1)    # 1 second pause between pages to limit bot detection\n",
	"            driver.get(url)\n",
	"\n",
	"            # grab the content with beautifulsoup for parsing\n",
	"            soup = BeautifulSoup(driver.page_source, 'lxml')\n",
	"            # main table contains all doctors and some extra stuff\n",
	"            main_table = soup.find('div', {'class': results_class})\n",
	"            if main_table is None:\n",
	"                print(f\"No results table found at {url}\")\n",
	"                break\n",
	"\n",
	"            # select doctors in main table: its direct child rows\n",
	"            for doc in main_table.findAll('div', {'class': 'row'}, recursive=False):\n",
	"                basic_info = doc.div.div.attrs\n",
	"                ret.append({\n",
	"                    'phone': basic_info['data-phone'].replace(' ', ''),\n",
	"                    'url': basic_info['data-profile-url'],\n",
	"                })\n",
	"\n",
	"            # stop when the page has no 'next' button to follow\n",
	"            next_button = soup.find('a', {'class': 'btn btn-default btn-next'})\n",
	"            if next_button is None or not next_button.get('href'):\n",
	"                break\n",
	"            url = next_button['href']\n",
	"    finally:\n",
	"        driver.quit()    # do not leave a browser process behind\n",
	"    print(\"Scraping Complete!\")\n",
	"    return ret"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Clean up scraped data and save it to csv"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def parse_therapist(url, driver):\n",
	"    \"\"\"Load one therapist profile page and extract its details.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    url : str\n",
	"        Profile page to load.\n",
	"    driver : selenium webdriver instance\n",
	"        Already-open browser session used to fetch the page.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    dict\n",
	"        Keys 'address', 'postalcode', 'city', 'specialties', 'style',\n",
	"        'price' and 'text'. 'specialties' and 'style' are lists of strings;\n",
	"        the rest are strings. Any value is None when its element is not\n",
	"        found on the page.\n",
	"    \"\"\"\n",
	"    # Load profile page\n",
	"    driver.get(url)\n",
	"    soup = BeautifulSoup(driver.page_source, 'lxml')\n",
	"\n",
	"    time.sleep(1)    # pause between profile requests\n",
	"\n",
	"    def get_text(node):\n",
	"        # stripped text of `node`, or None when the node was not found\n",
	"        try:\n",
	"            return node.text.strip()\n",
	"        except AttributeError:\n",
	"            return None\n",
	"\n",
	"    def get_items(container):\n",
	"        # stripped text of every <li> under `container`, or None when it was not found\n",
	"        if container is None:\n",
	"            return None\n",
	"        return [get_text(item) for item in container.findAll('li')]\n",
	"\n",
	"    ret = {}\n",
	"\n",
	"    # Location info; guard against profiles without a location block\n",
	"    location = soup.find('div', {'class': 'location-address-phone'})\n",
	"    location_fields = [\n",
	"        ('address', 'streetAddress'),\n",
	"        ('postalcode', 'postalcode'),\n",
	"        ('city', 'addressLocality'),\n",
	"    ]\n",
	"    for key, itemprop in location_fields:\n",
	"        span = location.find('span', {'itemprop': itemprop}) if location is not None else None\n",
	"        ret[key] = get_text(span)\n",
	"\n",
	"    # Specialties and style\n",
	"    ret['specialties'] = get_items(soup.find('ul', {'class': 'specialties-list'}))\n",
	"    ret['style'] = get_items(soup.find('div', {'class': 'attributes-treatment-orientation'}))\n",
	"\n",
	"    # Price\n",
	"    ret['price'] = get_text(soup.find('div', {'class': 'finances-office'}))\n",
	"\n",
	"    # Text\n",
	"    ret['text'] = get_text(soup.find('div', {'class': 'section profile-personalstatement'}))\n",
	"\n",
	"    return ret"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Crawl all pages\n",
	"docs = scrape_me()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Parse every therapist, reusing one browser session for all profiles.\n",
	"# Uses the `driver_` chosen in the Parameters cell so a non-Firefox driver works here too.\n",
	"driver = driver_(executable_path=executable_path)\n",
	"df = []\n",
	"try:\n",
	"    for doc in docs:\n",
	"        info = parse_therapist(doc['url'], driver=driver)\n",
	"        # merge the listing fields (phone, url) with the profile fields;\n",
	"        # a plain {doc, info} would be a set of dicts and raise TypeError\n",
	"        df.append({**doc, **info})\n",
	"finally:\n",
	"    driver.quit()    # always close the browser, even if a profile fails to parse"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Cast to dataframe and save\n",
	"df = pd.DataFrame(df)\n",
	"df.to_csv(\"data.csv\")"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}