Created
August 2, 2013 14:58
-
-
Save tkb/6140520 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "unpivoting-csv-pandas" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#Unpivoting CSV data with pandas\n", | |
"by [Tariq Khokhar](twitter.com/tkb)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Our first job is to read the CSV file we want into a pandas dataframe." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df = pd.read_csv('lifeexpectancy-pivot.csv')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's see which columns and values we have" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.columns" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"Index([Country, Country Code, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df.values" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 28, | |
"text": [ | |
"array([['Afghanistan', 'AFG', '31.1', ..., '47.9', '48.3', '48.7'],\n", | |
" ['Albania', 'ALB', '62.3', ..., '76.8', '76.9', '77.0'],\n", | |
" ['Algeria', 'DZA', '47.0', ..., '72.6', '72.9', '73.1'],\n", | |
" ..., \n", | |
" ['Sub-Saharan Africa (developing only)', 'SSA', '40.5', ..., '53.7',\n", | |
" '54.2', '54.7'],\n", | |
" ['South Asia', 'SAS', '43.3', ..., '65.0', '65.4', '65.7'],\n", | |
" ['High income', 'HIC', '68.4', ..., '78.3', '78.6', '78.8']], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 28 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We want to \"melt\" this by holding the country and country code columns and have two new columns that will contain the year and life expectancy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le = pd.melt(df, id_vars=['Country','Country Code']) " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"OK, let's see what that did" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le.columns" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 25, | |
"text": [ | |
"Index([Country, Country Code, variable, value], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le.values" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 27, | |
"text": [ | |
"array([['Afghanistan', 'AFG', '1960', '31.1'],\n", | |
" ['Albania', 'ALB', '1960', '62.3'],\n", | |
" ['Algeria', 'DZA', '1960', '47.0'],\n", | |
" ..., \n", | |
" ['Sub-Saharan Africa (developing only)', 'SSA', '2011', '54.7'],\n", | |
" ['South Asia', 'SAS', '2011', '65.7'],\n", | |
" ['High income', 'HIC', '2011', '78.8']], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 27 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Great - so we can now just rename \"variable\" and \"value\" and then sort by country" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le2 = le.rename(columns={ 'variable':'year','value':'life_expectancy'}).sort('Country')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le2.columns" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 32, | |
"text": [ | |
"Index([Country, Country Code, year, life_expectancy], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le2.values" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 33, | |
"text": [ | |
"array([['Afghanistan', 'AFG', '1984', '40.5'],\n", | |
" ['Afghanistan', 'AFG', '1986', '41.1'],\n", | |
" ['Afghanistan', 'AFG', '2011', '48.7'],\n", | |
" ..., \n", | |
" ['Zimbabwe', 'ZWE', '1988', '61.5'],\n", | |
" ['Zimbabwe', 'ZWE', '1994', '55.0'],\n", | |
" ['Zimbabwe', 'ZWE', '2010', '49.9']], dtype=object)" | |
] | |
} | |
], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Finally, we export a CSV" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"le2.to_csv('lifeexpectancy-unpivoted.csv', sep=',', index=False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 35 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment