Created
December 5, 2015 20:56
-
-
Save thekensta/804c01a36582a8a02c74 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np\n", | |
"\n", | |
"%matplotlib inline\n", | |
"\n", | |
"from collections import Counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['../Prepped data/17. CRS Service Users 2014/All service users.csv',\n", | |
" '../Prepped data/17. CRS Service Users 2014/Carer age.csv',\n", | |
" '../Prepped data/17. CRS Service Users 2014/Carer Ethnicity.csv',\n", | |
" '../Prepped data/17. CRS Service Users 2014/Carer Gender.csv',\n", | |
" '../Prepped data/17. CRS Service Users 2014/Carer Language.csv',\n", | |
" '../Prepped data/17. CRS Service Users 2014/Carer Partnership.csv']" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import glob\n", | |
"glob.glob(\"../Prepped data/17. CRS Service Users 2014/*\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read in all the raw survey files\n", | |
"datasets = {fin: pd.read_csv(\"../Prepped data/\" +fin) \n", | |
" for fin in glob.glob(\"../Prepped data/17. CRS Service Users 2014/*\")}\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load the data from Data set 8\n", | |
"\n", | |
"Conver the really wide columns into a narrow set with sex / age/ ethnicity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"data8 = pd.read_csv(\"../Prepped data/08.1 Ethnicity Sex Age England & Wales/8. Ethnicity Sex Age EW.csv\")\n", | |
"#data8.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Get all the columns and look for the pivot and value columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"366 columns in total\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"['LAD12CD',\n", | |
" 'Local.Authority.District.12',\n", | |
" 'Counties.&.UAs.code',\n", | |
" 'Counties.&.Unitary.Authorities',\n", | |
" 'Locality.cost.centre',\n", | |
" 'Locality']" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"d8c = [c for c in data8.columns]\n", | |
"print(len(d8c), \"columns in total\")\n", | |
"d8c[:6]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Melt the dataset to narrow format \n", | |
"Then add columns for sex / age / ethnicity " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"data8melt = pd.melt(data8, id_vars=d8c[:6], value_vars=d8c[6:])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(348, 366)\n", | |
"(125280, 8)\n" | |
] | |
} | |
], | |
"source": [ | |
"print(data8.shape)\n", | |
"print(data8melt.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('Age.65.and.over', 72),\n", | |
" ('All.categories:.Age', 72),\n", | |
" ('Age.0.to.24', 72),\n", | |
" ('Age.25.to.49', 72),\n", | |
" ('Age.50.to.64', 72)]" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def parse_col_name(cname):\n", | |
" \"\"\"Split a column into sex, age and ethnicity fields. \n", | |
" \n", | |
" >>> parse_col_name('Sex:.All.persons;.Age:.All.categories:.Age;.Ethnic.Group:.White:.English/Welsh/Scottish/Northern.Irish/British')\n", | |
" ('All.persons',\n", | |
" 'All.categories:.Age',\n", | |
" 'White:.English/Welsh/Scottish/Northern.Irish/British')\n", | |
" \n", | |
" \"\"\"\n", | |
" # All multi fields have sex / age / erthnicity\n", | |
" fields = cname.split(\";\")\n", | |
" sex_, age_, ethnic_ = fields\n", | |
" sex = sex_.split(\":\", 1)[1]\n", | |
" age = age_.split(\":\", 1)[1]\n", | |
" ethnic = ethnic_.split(\":\", 1)[1]\n", | |
" # Remove leading '.' on all columns\n", | |
" return sex[1:], age[1:], ethnic[1:]\n", | |
"\n", | |
"\n", | |
"sexes, ages, ethnics = [], [], []\n", | |
"for col in d8c:\n", | |
" # Ignore all the columns used for pivoting\n", | |
" if col.startswith(\"Sex\"):\n", | |
" s, a, e = parse_col_name(col)\n", | |
" sexes.append(s)\n", | |
" ages.append(a)\n", | |
" ethnics.append(e)\n", | |
"\n", | |
"# Run to show counts of what is in each\n", | |
"c = Counter()\n", | |
"c.update(ages) # sexes / ethnics\n", | |
"c.most_common()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The long, ugly column names in the wide table are now a single column\n", | |
"\n", | |
"Build a DataFrame with columns [\"long name\", \"sex\", \"age\", \"ethnicity\"] and then join\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(125280, 8)\n", | |
"(125280, 12)\n", | |
"['LAD12CD', 'Local.Authority.District.12', 'Counties.&.UAs.code', 'Counties.&.Unitary.Authorities', 'Locality.cost.centre', 'Locality', 'variable', 'value', 'age', 'colname', 'ethnic', 'sex']\n" | |
] | |
} | |
], | |
"source": [ | |
"## Columns Lookup\n", | |
"column_lookup = pd.DataFrame({\"colname\": [c for c in d8c if c.startswith(\"Sex\")], \n", | |
" \"sex\": sexes, \n", | |
" \"age\": ages, \n", | |
" \"ethnic\": ethnics})\n", | |
"\n", | |
"\n", | |
"\n", | |
"data8_sex_age_eth = pd.merge(data8melt, column_lookup, left_on=\"variable\", right_on=\"colname\")\n", | |
"\n", | |
"print(data8melt.shape)\n", | |
"print(data8_sex_age_eth.shape)\n", | |
"\n", | |
"print([c for c in data8_sex_age_eth.columns])\n", | |
"cols_of_interest = ['LAD12CD', 'Local.Authority.District.12', 'Counties.&.UAs.code', \n", | |
" 'Counties.&.Unitary.Authorities', 'Locality.cost.centre', 'Locality', 'variable', 'value', \n", | |
" 'age', 'ethnic', 'sex']\n", | |
"\n", | |
"# Filter out columns and save to file\n", | |
"data8_sex_age_eth_final = data8_sex_age_eth[cols_of_interest]\n", | |
"data8_sex_age_eth_final.to_csv(\"../workingdata/data8_with_age_sex_ethnic.csv\", index=False, sep=\",\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# data8melt.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Users of the Service Survey" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#\n", | |
"# Load all the data and turn into narrow format\n", | |
"# \n", | |
"\n", | |
"## Working directory is\n", | |
"#\n", | |
"# /+\n", | |
"# /notebooks/<this file>\n", | |
"# | \n", | |
"# /Prepped data/\n", | |
"# |\n", | |
"# /<survey 17>/\n", | |
"# | \n", | |
"# + <data.csv>\n", | |
"#\n", | |
"# There is one CSV per survey\n", | |
"#\n", | |
"datasets = {fin: pd.read_csv(\"../Prepped data/\" +fin) \n", | |
" for fin in glob.glob(\"../Prepped data/17. CRS Service Users 2014/*\")}\n", | |
"\n", | |
"\n", | |
"# Locality needs to be based on the \"Locality CRS 2014\" columns\n", | |
"# This needs to be split according to the ratio in locations-remap.csv (in version control)\n", | |
"def single_narrow_dataset(datasets, locality_field = \"Locality.CRS.2014\"):\n", | |
" \n", | |
" # Read in a list of files from a directory and create a single DataFrame\n", | |
" df_list = []\n", | |
" \n", | |
" for key, df in datasets.items():\n", | |
" \n", | |
" # Convert non-locality columns in a single column\n", | |
" df_ = pd.melt(df, id_vars=locality_field, \n", | |
" value_vars=[c for c in df.columns[2:]], \n", | |
" value_name=\"value\")\n", | |
" # Select the columns we are interested in\n", | |
" df_.columns = [\"Location\", \"Variable\", \"Count\"]\n", | |
" # Create a dataset name from the original CSV\n", | |
" dimname = os.path.basename(key)[:-4]\n", | |
" df_[\"Dimension\"] = dimname\n", | |
" df_[[\"Location\", \"Dimension\", \"Variable\", \"Count\"]]\n", | |
" df_list.append(df_)\n", | |
" \n", | |
" # Stack all the columns \n", | |
" clean_df = pd.concat(df_list)\n", | |
" # Get rid of all the non-total fields\n", | |
" full_data = clean_df[clean_df.Variable != \"Grand.Total\"]\n", | |
" \n", | |
" # Re-map Avon, Gloucestershire & Wiltshire \n", | |
" # Banes, Swindon, Wiltshire and Care Services 0.274\n", | |
" # and\n", | |
" # Bristol, Gloucestershire and South Gloucestershire 72.6 \n", | |
" LocationLookup = pd.read_csv(\"../workingdata/locations-remap.csv\")\n", | |
" full_data = pd.merge(full_data, LocationLookup, on=\"Location\")\n", | |
" \n", | |
" # Set all the Nans to 0\n", | |
" full_data[\"FullCount\"] = full_data[\"Count\"].fillna(0)\n", | |
" \n", | |
" # Aggregate by Location and Variable\n", | |
" full_data = (full_data\n", | |
" .groupby([\"NewLocation\", \"Variable\", \"Dimension\"])\n", | |
" [\"FullCount\"]\n", | |
" .sum()\n", | |
" .reset_index())\n", | |
" \n", | |
" # Get a within segment proportion\n", | |
" full_data[\"FullProp\"] = (full_data\n", | |
" .groupby([\"Dimension\", \"NewLocation\"])\n", | |
" [\"FullCount\"]\n", | |
" .transform(lambda x: x/x.sum()))\n", | |
" return full_data[[]]\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"full_data = single_narrow_dataset(datasets)\n", | |
"full_data.to_csv(\"../workingdata/known_data_by_question_response2.csv\", index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment