Skip to content

Instantly share code, notes, and snippets.

@thekensta
Created December 5, 2015 20:56
Show Gist options
  • Save thekensta/804c01a36582a8a02c74 to your computer and use it in GitHub Desktop.
Save thekensta/804c01a36582a8a02c74 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"%matplotlib inline\n",
"\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['../Prepped data/17. CRS Service Users 2014/All service users.csv',\n",
" '../Prepped data/17. CRS Service Users 2014/Carer age.csv',\n",
" '../Prepped data/17. CRS Service Users 2014/Carer Ethnicity.csv',\n",
" '../Prepped data/17. CRS Service Users 2014/Carer Gender.csv',\n",
" '../Prepped data/17. CRS Service Users 2014/Carer Language.csv',\n",
" '../Prepped data/17. CRS Service Users 2014/Carer Partnership.csv']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import glob\n",
"glob.glob(\"../Prepped data/17. CRS Service Users 2014/*\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Read in all the raw survey files\n",
"datasets = {fin: pd.read_csv(\"../Prepped data/\" +fin) \n",
" for fin in glob.glob(\"../Prepped data/17. CRS Service Users 2014/*\")}\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load the data from Data set 8\n",
"\n",
"Conver the really wide columns into a narrow set with sex / age/ ethnicity"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data8 = pd.read_csv(\"../Prepped data/08.1 Ethnicity Sex Age England & Wales/8. Ethnicity Sex Age EW.csv\")\n",
"#data8.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get all the columns and look for the pivot and value columns"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"366 columns in total\n"
]
},
{
"data": {
"text/plain": [
"['LAD12CD',\n",
" 'Local.Authority.District.12',\n",
" 'Counties.&.UAs.code',\n",
" 'Counties.&.Unitary.Authorities',\n",
" 'Locality.cost.centre',\n",
" 'Locality']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d8c = [c for c in data8.columns]\n",
"print(len(d8c), \"columns in total\")\n",
"d8c[:6]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Melt the dataset to narrow format \n",
"Then add columns for sex / age / ethnicity "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data8melt = pd.melt(data8, id_vars=d8c[:6], value_vars=d8c[6:])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(348, 366)\n",
"(125280, 8)\n"
]
}
],
"source": [
"print(data8.shape)\n",
"print(data8melt.shape)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('Age.65.and.over', 72),\n",
" ('All.categories:.Age', 72),\n",
" ('Age.0.to.24', 72),\n",
" ('Age.25.to.49', 72),\n",
" ('Age.50.to.64', 72)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def parse_col_name(cname):\n",
" \"\"\"Split a column into sex, age and ethnicity fields. \n",
" \n",
" >>> parse_col_name('Sex:.All.persons;.Age:.All.categories:.Age;.Ethnic.Group:.White:.English/Welsh/Scottish/Northern.Irish/British')\n",
" ('All.persons',\n",
" 'All.categories:.Age',\n",
" 'White:.English/Welsh/Scottish/Northern.Irish/British')\n",
" \n",
" \"\"\"\n",
" # All multi fields have sex / age / erthnicity\n",
" fields = cname.split(\";\")\n",
" sex_, age_, ethnic_ = fields\n",
" sex = sex_.split(\":\", 1)[1]\n",
" age = age_.split(\":\", 1)[1]\n",
" ethnic = ethnic_.split(\":\", 1)[1]\n",
" # Remove leading '.' on all columns\n",
" return sex[1:], age[1:], ethnic[1:]\n",
"\n",
"\n",
"sexes, ages, ethnics = [], [], []\n",
"for col in d8c:\n",
" # Ignore all the columns used for pivoting\n",
" if col.startswith(\"Sex\"):\n",
" s, a, e = parse_col_name(col)\n",
" sexes.append(s)\n",
" ages.append(a)\n",
" ethnics.append(e)\n",
"\n",
"# Run to show counts of what is in each\n",
"c = Counter()\n",
"c.update(ages) # sexes / ethnics\n",
"c.most_common()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The long, ugly column names in the wide table are now a single column\n",
"\n",
"Build a DataFrame with columns [\"long name\", \"sex\", \"age\", \"ethnicity\"] and then join\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(125280, 8)\n",
"(125280, 12)\n",
"['LAD12CD', 'Local.Authority.District.12', 'Counties.&.UAs.code', 'Counties.&.Unitary.Authorities', 'Locality.cost.centre', 'Locality', 'variable', 'value', 'age', 'colname', 'ethnic', 'sex']\n"
]
}
],
"source": [
"## Columns Lookup\n",
"column_lookup = pd.DataFrame({\"colname\": [c for c in d8c if c.startswith(\"Sex\")], \n",
" \"sex\": sexes, \n",
" \"age\": ages, \n",
" \"ethnic\": ethnics})\n",
"\n",
"\n",
"\n",
"data8_sex_age_eth = pd.merge(data8melt, column_lookup, left_on=\"variable\", right_on=\"colname\")\n",
"\n",
"print(data8melt.shape)\n",
"print(data8_sex_age_eth.shape)\n",
"\n",
"print([c for c in data8_sex_age_eth.columns])\n",
"cols_of_interest = ['LAD12CD', 'Local.Authority.District.12', 'Counties.&.UAs.code', \n",
" 'Counties.&.Unitary.Authorities', 'Locality.cost.centre', 'Locality', 'variable', 'value', \n",
" 'age', 'ethnic', 'sex']\n",
"\n",
"# Filter out columns and save to file\n",
"data8_sex_age_eth_final = data8_sex_age_eth[cols_of_interest]\n",
"data8_sex_age_eth_final.to_csv(\"../workingdata/data8_with_age_sex_ethnic.csv\", index=False, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# data8melt.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Users of the Service Survey"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Load all the data and turn into narrow format\n",
"# \n",
"\n",
"## Working directory is\n",
"#\n",
"# /+\n",
"# /notebooks/<this file>\n",
"# | \n",
"# /Prepped data/\n",
"# |\n",
"# /<survey 17>/\n",
"# | \n",
"# + <data.csv>\n",
"#\n",
"# There is one CSV per survey\n",
"#\n",
"datasets = {fin: pd.read_csv(\"../Prepped data/\" +fin) \n",
" for fin in glob.glob(\"../Prepped data/17. CRS Service Users 2014/*\")}\n",
"\n",
"\n",
"# Locality needs to be based on the \"Locality CRS 2014\" columns\n",
"# This needs to be split according to the ratio in locations-remap.csv (in version control)\n",
"def single_narrow_dataset(datasets, locality_field = \"Locality.CRS.2014\"):\n",
" \n",
" # Read in a list of files from a directory and create a single DataFrame\n",
" df_list = []\n",
" \n",
" for key, df in datasets.items():\n",
" \n",
" # Convert non-locality columns in a single column\n",
" df_ = pd.melt(df, id_vars=locality_field, \n",
" value_vars=[c for c in df.columns[2:]], \n",
" value_name=\"value\")\n",
" # Select the columns we are interested in\n",
" df_.columns = [\"Location\", \"Variable\", \"Count\"]\n",
" # Create a dataset name from the original CSV\n",
" dimname = os.path.basename(key)[:-4]\n",
" df_[\"Dimension\"] = dimname\n",
" df_[[\"Location\", \"Dimension\", \"Variable\", \"Count\"]]\n",
" df_list.append(df_)\n",
" \n",
" # Stack all the columns \n",
" clean_df = pd.concat(df_list)\n",
" # Get rid of all the non-total fields\n",
" full_data = clean_df[clean_df.Variable != \"Grand.Total\"]\n",
" \n",
" # Re-map Avon, Gloucestershire & Wiltshire \n",
" # Banes, Swindon, Wiltshire and Care Services 0.274\n",
" # and\n",
" # Bristol, Gloucestershire and South Gloucestershire 72.6 \n",
" LocationLookup = pd.read_csv(\"../workingdata/locations-remap.csv\")\n",
" full_data = pd.merge(full_data, LocationLookup, on=\"Location\")\n",
" \n",
" # Set all the Nans to 0\n",
" full_data[\"FullCount\"] = full_data[\"Count\"].fillna(0)\n",
" \n",
" # Aggregate by Location and Variable\n",
" full_data = (full_data\n",
" .groupby([\"NewLocation\", \"Variable\", \"Dimension\"])\n",
" [\"FullCount\"]\n",
" .sum()\n",
" .reset_index())\n",
" \n",
" # Get a within segment proportion\n",
" full_data[\"FullProp\"] = (full_data\n",
" .groupby([\"Dimension\", \"NewLocation\"])\n",
" [\"FullCount\"]\n",
" .transform(lambda x: x/x.sum()))\n",
" return full_data[[]]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"full_data = single_narrow_dataset(datasets)\n",
"full_data.to_csv(\"../workingdata/known_data_by_question_response2.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment