Scrape HCPCS codes from http://www.icd9data.com/HCPCS/ and https://www.hcpcsdata.com/Codes
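The script below walks each site's index pages down to one page per code, so a full scrape is deliberately slow (it sleeps between requests). The get_hcpcs_codes function defined in it can also be called directly for a single year; a minimal sketch, assuming the file is saved as hcpcs_scrape.py and importable from the working directory:

from hcpcs_scrape import get_hcpcs_codes

# Pull only the 2016 codes, waiting 10 seconds between page loads
df = get_hcpcs_codes(2016, sleep_time=10)
df.to_csv('hcpcs_2016.csv')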
#! /usr/bin/env python3
"""
---------------------------------------------------------------------
Program: hcpcs_scrape.py
Author:  Kyle Barron <[email protected]>
Created: 2/5/2018
Updated: 2/5/2018
Purpose: Scrape HCPCS codes from the internet
"""
import requests
import pandas as pd
import lxml.html as LH
from bs4 import BeautifulSoup
from time import sleep


def main(outpath='hcpcs.csv'):
    """Scrape HCPCS codes for all years where the data exists on these sites.

    Args:
        outpath: path to export final csv file of codes
    """
    data_list = []
    for year in range(2007, 2018):
        data_list.append(get_hcpcs_codes(year))

    data = pd.concat(data_list)
    data.to_csv(outpath)


def get_hcpcs_codes(year, sleep_time=5):
    """Scrape HCPCS codes from
    http://www.icd9data.com/HCPCS/ and https://www.hcpcsdata.com/Codes.

    Args:
        year: Year of codes to scrape. http://www.icd9data.com/HCPCS/ seems to
            have data from 2007 through 2016. https://www.hcpcsdata.com/Codes
            has data for 2017/2018.
        sleep_time: Number of seconds to wait between page loads. A greater
            number puts less pressure on the servers that run these sites.

    Returns:
        DataFrame with HCPCS code, description, and year.
    """
    if year > 2016:
        # https://www.hcpcsdata.com/Codes lists code groups on the top-level
        # page, code ranges on each group page, and one code per leaf page.
        top_level = 'https://www.hcpcsdata.com/Codes'
        page = requests.get(top_level)
        sleep(sleep_time)
        sub_level_links = LH.fromstring(page.content).xpath('//tr/td/a/@href')
        sub_level_links = [
            'https://www.hcpcsdata.com' + x for x in sub_level_links
        ]

        all_leaf_links = []
        for sub_level in sub_level_links:
            page = requests.get(sub_level)
            sleep(sleep_time)
            leaf_links = LH.fromstring(page.content).xpath('//tr/td/a/@href')
            leaf_links = ['https://www.hcpcsdata.com' + x for x in leaf_links]
            all_leaf_links.extend(leaf_links)

        all_codes = {}
        for i, leaf in enumerate(all_leaf_links, start=1):
            page = requests.get(leaf)
            sleep(sleep_time)
            soup = BeautifulSoup(page.content, 'lxml')
            code = soup.find(class_='identifier16').get_text()
            title = soup.find('h5').get_text()
            all_codes[code] = title

            if i % 20 == 0:
                msg = f'Finished scraping page {i} of {len(all_leaf_links)}'
                msg += f' for year {year}'
                print(msg)

        df = pd.DataFrame.from_dict(all_codes, orient='index')
        df.index.rename('HCPCS Code', inplace=True)
        df.columns = ['Description']

    elif 2007 <= year <= 2016:
        # http://www.icd9data.com/HCPCS/ has a per-year index with code-range
        # links; each range page links to one page per code.
        top_level = f'http://www.icd9data.com/HCPCS/{year}/default.htm'
        page = requests.get(top_level)
        sleep(sleep_time)
        sub_level_links = LH.fromstring(
            page.content).xpath('//ul[@class="codeList"]/li/a/@href')
        sub_level_links = [
            'http://www.icd9data.com' + x for x in sub_level_links
        ]

        all_leaf_links = []
        for sub_level in sub_level_links:
            page = requests.get(sub_level)
            sleep(sleep_time)
            leaf_links = LH.fromstring(
                page.content).xpath('//ul[@class="hcpcs"]/li/span/a/@href')
            leaf_links = ['http://www.icd9data.com' + x for x in leaf_links]
            all_leaf_links.extend(leaf_links)

        all_codes = {}
        for i, leaf in enumerate(all_leaf_links, start=1):
            page = requests.get(leaf)
            sleep(sleep_time)
            soup = BeautifulSoup(page.content, 'lxml')
            code = soup.find(class_='identifier').get_text()
            title = soup.find('dd').get_text()
            all_codes[code] = title

            if i % 20 == 0:
                print(f'Finished scraping page {i} of {len(all_leaf_links)}')

        df = pd.DataFrame.from_dict(all_codes, orient='index')
        df.index.rename('HCPCS Code', inplace=True)
        df.columns = ['Description']

    else:
        raise ValueError(f'Codes for {year} not provided')

    df['Year'] = year
    return df


if __name__ == '__main__':
    main()
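To reproduce the full CSV, a minimal invocation would look like the following, assuming the dependencies are installed from PyPI (the package names below are the usual ones; the gist does not pin versions):

pip install requests pandas lxml beautifulsoup4
python3 hcpcs_scrape.py

This loops over 2007 through 2017 and writes hcpcs.csv (HCPCS Code, Description, Year) to the working directory.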