Created
July 28, 2014 16:23
-
-
Save catethos/ab979acf3f7c818828a1 to your computer and use it in GitHub Desktop.
dplyr in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:3dc5f3d60796427b79e67432cd5917d24ddffaed9465d883f5e86d48eef949b6" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"dplyr inspired data manipulation" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"[dplyr](https://github.com/hadley/dplyr) is a new R package that provides a consistent set of interfaces for efficiently manipulating data easily. For example, the following code add a _c_ column to the dataframe _df_ which equals to the sum of columns _a_ and _b_.\n", | |
"\n", | |
"<blockquote>\n", | |
"mutate(df, c=a+b)\n", | |
"</blockquote>\n", | |
"\n", | |
"I wish to implement a similar interface in Python. The following code is a naive implementation of the _mutate_ function. It is simply a proof of concept and I have perhap used all the wrong tools to do it, for example using regex for parsing and exec to run the code. Anyway, I hope that someone has better idea to do this." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import re\n", | |
"import pandas as pd" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"function_regex = re.compile(r\"(?P<function>^[a-z]*)\\((?P<argument>[a-z]*)\\)$\")\n", | |
"operator_regex = re.compile(r\"(\\+|-|\\*|/)\")\n", | |
"column_regex = re.compile(r\"(\\w_?)+\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def m(token):\n", | |
" column = column_regex.match(token)\n", | |
" operator = operator_regex.match(token)\n", | |
" function = function_regex.match(token)\n", | |
" if function:\n", | |
" return \"df['{}'].apply({})\".format(function.group(\"argument\"),function.group(\"function\"))\n", | |
" elif operator:\n", | |
" return token\n", | |
" elif column:\n", | |
" return \"df['{}']\".format(token)\n", | |
" \n", | |
"def transform(f):\n", | |
" lhs,rhs = f.split(\"=\")\n", | |
" lhs = lhs.strip()\n", | |
" rhs = rhs.strip()\n", | |
" tokens = rhs.split(\" \")\n", | |
" return \"df['{}'] = \".format(lhs) + \"\".join([m(x) for x in tokens])\n", | |
"\n", | |
"def mutate(df,formula):\n", | |
" exec(transform(formula),\n", | |
" globals(),\n", | |
" {\"df\":df})\n", | |
" " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Example:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dataframe = pd.DataFrame.from_dict({\"a\":[1,2,3],\"b\":[2,4,6]})" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dataframe" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> 4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> 6</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3 rows \u00d7 2 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
" a b\n", | |
"0 1 2\n", | |
"1 2 4\n", | |
"2 3 6\n", | |
"\n", | |
"[3 rows x 2 columns]" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mutate(dataframe,\n", | |
" \"c = a + b\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dataframe" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> 4</td>\n", | |
" <td> 6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> 6</td>\n", | |
" <td> 9</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3 rows \u00d7 3 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
" a b c\n", | |
"0 1 2 3\n", | |
"1 2 4 6\n", | |
"2 3 6 9\n", | |
"\n", | |
"[3 rows x 3 columns]" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def addone(x):\n", | |
" return x+1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mutate(dataframe,\n", | |
" \"d = addone(b)\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dataframe" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" <th>d</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 3</td>\n", | |
" <td> 3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> 4</td>\n", | |
" <td> 6</td>\n", | |
" <td> 5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> 6</td>\n", | |
" <td> 9</td>\n", | |
" <td> 7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3 rows \u00d7 4 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
" a b c d\n", | |
"0 1 2 3 3\n", | |
"1 2 4 6 5\n", | |
"2 3 6 9 7\n", | |
"\n", | |
"[3 rows x 4 columns]" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mutate(dataframe,\n", | |
" \"e = addone(b) + c\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dataframe" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>a</th>\n", | |
" <th>b</th>\n", | |
" <th>c</th>\n", | |
" <th>d</th>\n", | |
" <th>e</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 3</td>\n", | |
" <td> 3</td>\n", | |
" <td> 6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> 4</td>\n", | |
" <td> 6</td>\n", | |
" <td> 5</td>\n", | |
" <td> 11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> 6</td>\n", | |
" <td> 9</td>\n", | |
" <td> 7</td>\n", | |
" <td> 16</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3 rows \u00d7 5 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 21, | |
"text": [ | |
" a b c d e\n", | |
"0 1 2 3 3 6\n", | |
"1 2 4 6 5 11\n", | |
"2 3 6 9 7 16\n", | |
"\n", | |
"[3 rows x 5 columns]" | |
] | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment