Skip to content

Instantly share code, notes, and snippets.

@jacobdanovitch
Created January 23, 2020 21:59
Show Gist options
  • Save jacobdanovitch/b588227a7e4bf833632f60da6f198606 to your computer and use it in GitHub Desktop.
Save jacobdanovitch/b588227a7e4bf833632f60da6f198606 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"\n",
"from datetime import datetime\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"today = datetime.today().strftime('%D').replace('/', '-')\n",
"folder = f'dumps/cbc_data-{today}/'\n",
"\n",
"os.makedirs(folder, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"name_index = pd.read_csv('name_index.csv')\n",
"tables = pd.read_csv('home_profiles.csv')\n",
"staffing = pd.read_csv('staffing2017.csv')#.fillna(-1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"pdfs = pd.read_csv(\"inspection_data.csv\").drop(columns=['empty', 'error']) # home_inspection_data_no_text\n",
"\n",
"dt = pd.to_datetime(pdfs['Inspection Report Date'])\n",
"pdfs['Inspection Report Date'] = dt\n",
"pdfs = pdfs[dt.dt.year >= 2014]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dup = pdfs.text.str.contains('Amended Inspection Summary')\n",
"pdfs = pdfs.drop(columns=\"text\")\n",
"\n",
"url = \"http://publicreporting.ltchomes.net/en-ca/File.aspx?\"\n",
"duplicates = pdfs[dup].assign(\n",
" filename=lambda x: x.filename.str.replace('pdfs/', ''),\n",
" url=lambda x: url + x.filename.str.replace('pdfs/', '').str.replace('.pdf', '')\n",
")\n",
"duplicates.to_csv(folder + f'dropped.csv', index=False)\n",
"\n",
"pdfs = pdfs[~dup]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"expanded = pdfs.merge(tables, how='left', on='pageid')\n",
"expanded.to_csv(folder + f'expanded.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"stats = pdfs.groupby([pdfs.pageid, pdfs['Inspection Report Date'].dt.year]) \\\n",
" .agg('sum') \\\n",
" .reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"output = pd.merge(stats, tables, how='left', on='pageid')\n",
"\n",
"output['Licensed Beds'] = output['Licensed Beds'].str.extract('(\\d+)')\n",
"output = output.merge(name_index, how='left', on='pageid')\n",
"output = output.merge(staffing, how='left', left_on='name', right_on='LTC Home Name')\n",
"\n",
"output = output.fillna({'LTC Home Name': output.name}).drop(columns='name')\n",
"output = output[~output['LTC Home Name'].isnull()]\n",
"\n",
"output = output[['LTC Home Name', *filter(lambda x: x!='LTC Home Name', output.columns)]]\n",
"output.to_csv(folder+'joined.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/jacobgdt/CBCScraping/dumps/cbc_data-01-23-20.zip'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shutil.make_archive(folder[:-1], 'zip', folder[:-1])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment