Created
September 17, 2012 18:47
-
-
Save wesm/3739027 to your computer and use it in GitHub Desktop.
Self merge example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "SelfMerge" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"pd.set_printoptions(notebook_repr_html=False)\n", | |
"\n", | |
"np.random.seed(1)\n", | |
"\n", | |
"df = pd.DataFrame({'idx': np.arange(10).repeat(5),\n", | |
" 'x': np.random.randn(50),\n", | |
" 'y': np.tile(list('abcde'), 10),\n", | |
" 'ok': np.random.binomial(1, 0.9, 50)})\n", | |
"\n", | |
"n = pd.DataFrame({'y': list('abcde'), 'y1': list('bcdea')})\n", | |
"\n", | |
"df" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
" idx ok x y\n", | |
"0 0 0 1.624345 a\n", | |
"1 0 1 -0.611756 b\n", | |
"2 0 1 -0.528172 c\n", | |
"3 0 1 -1.072969 d\n", | |
"4 0 1 0.865408 e\n", | |
"5 1 1 -2.301539 a\n", | |
"6 1 0 1.744812 b\n", | |
"7 1 1 -0.761207 c\n", | |
"8 1 1 0.319039 d\n", | |
"9 1 1 -0.249370 e\n", | |
"10 2 1 1.462108 a\n", | |
"11 2 1 -2.060141 b\n", | |
"12 2 1 -0.322417 c\n", | |
"13 2 1 -0.384054 d\n", | |
"14 2 1 1.133769 e\n", | |
"15 3 1 -1.099891 a\n", | |
"16 3 1 -0.172428 b\n", | |
"17 3 0 -0.877858 c\n", | |
"18 3 1 0.042214 d\n", | |
"19 3 1 0.582815 e\n", | |
"20 4 1 -1.100619 a\n", | |
"21 4 0 1.144724 b\n", | |
"22 4 1 0.901591 c\n", | |
"23 4 1 0.502494 d\n", | |
"24 4 1 0.900856 e\n", | |
"25 5 1 -0.683728 a\n", | |
"26 5 0 -0.122890 b\n", | |
"27 5 1 -0.935769 c\n", | |
"28 5 1 -0.267888 d\n", | |
"29 5 1 0.530355 e\n", | |
"30 6 1 -0.691661 a\n", | |
"31 6 1 -0.396754 b\n", | |
"32 6 1 -0.687173 c\n", | |
"33 6 1 -0.845206 d\n", | |
"34 6 0 -0.671246 e\n", | |
"35 7 1 -0.012665 a\n", | |
"36 7 1 -1.117310 b\n", | |
"37 7 0 0.234416 c\n", | |
"38 7 1 1.659802 d\n", | |
"39 7 0 0.742044 e\n", | |
"40 8 1 -0.191836 a\n", | |
"41 8 1 -0.887629 b\n", | |
"42 8 0 -0.747158 c\n", | |
"43 8 1 1.692455 d\n", | |
"44 8 1 0.050808 e\n", | |
"45 9 1 -0.636996 a\n", | |
"46 9 1 0.190915 b\n", | |
"47 9 0 2.100255 c\n", | |
"48 9 1 0.120159 d\n", | |
"49 9 1 0.617203 e" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
" y y1\n", | |
"0 a b\n", | |
"1 b c\n", | |
"2 c d\n", | |
"3 d e\n", | |
"4 e a" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mask = df.ok == 0\n", | |
"\n", | |
"ny1 = n.set_index('y')['y1']\n", | |
"df['mapping'] = df['y'].map(ny1)\n", | |
"df\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
" idx ok x y mapping\n", | |
"0 0 0 1.624345 a b\n", | |
"1 0 1 -0.611756 b c\n", | |
"2 0 1 -0.528172 c d\n", | |
"3 0 1 -1.072969 d e\n", | |
"4 0 1 0.865408 e a\n", | |
"5 1 1 -2.301539 a b\n", | |
"6 1 0 1.744812 b c\n", | |
"7 1 1 -0.761207 c d\n", | |
"8 1 1 0.319039 d e\n", | |
"9 1 1 -0.249370 e a\n", | |
"10 2 1 1.462108 a b\n", | |
"11 2 1 -2.060141 b c\n", | |
"12 2 1 -0.322417 c d\n", | |
"13 2 1 -0.384054 d e\n", | |
"14 2 1 1.133769 e a\n", | |
"15 3 1 -1.099891 a b\n", | |
"16 3 1 -0.172428 b c\n", | |
"17 3 0 -0.877858 c d\n", | |
"18 3 1 0.042214 d e\n", | |
"19 3 1 0.582815 e a\n", | |
"20 4 1 -1.100619 a b\n", | |
"21 4 0 1.144724 b c\n", | |
"22 4 1 0.901591 c d\n", | |
"23 4 1 0.502494 d e\n", | |
"24 4 1 0.900856 e a\n", | |
"25 5 1 -0.683728 a b\n", | |
"26 5 0 -0.122890 b c\n", | |
"27 5 1 -0.935769 c d\n", | |
"28 5 1 -0.267888 d e\n", | |
"29 5 1 0.530355 e a\n", | |
"30 6 1 -0.691661 a b\n", | |
"31 6 1 -0.396754 b c\n", | |
"32 6 1 -0.687173 c d\n", | |
"33 6 1 -0.845206 d e\n", | |
"34 6 0 -0.671246 e a\n", | |
"35 7 1 -0.012665 a b\n", | |
"36 7 1 -1.117310 b c\n", | |
"37 7 0 0.234416 c d\n", | |
"38 7 1 1.659802 d e\n", | |
"39 7 0 0.742044 e a\n", | |
"40 8 1 -0.191836 a b\n", | |
"41 8 1 -0.887629 b c\n", | |
"42 8 0 -0.747158 c d\n", | |
"43 8 1 1.692455 d e\n", | |
"44 8 1 0.050808 e a\n", | |
"45 9 1 -0.636996 a b\n", | |
"46 9 1 0.190915 b c\n", | |
"47 9 0 2.100255 c d\n", | |
"48 9 1 0.120159 d e\n", | |
"49 9 1 0.617203 e a" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"merged = pd.merge(df, df, left_on=['idx', 'mapping'],\n", | |
" right_on=['idx', 'y'], sort=False)\n", | |
"merged" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
" idx ok_x x_x y_x mapping_x ok_y x_y y_y mapping_y\n", | |
"0 0 0 1.624345 a b 1 -0.611756 b c\n", | |
"1 0 1 -0.611756 b c 1 -0.528172 c d\n", | |
"2 0 1 -0.528172 c d 1 -1.072969 d e\n", | |
"3 0 1 -1.072969 d e 1 0.865408 e a\n", | |
"4 0 1 0.865408 e a 0 1.624345 a b\n", | |
"5 1 1 -2.301539 a b 0 1.744812 b c\n", | |
"6 1 0 1.744812 b c 1 -0.761207 c d\n", | |
"7 1 1 -0.761207 c d 1 0.319039 d e\n", | |
"8 1 1 0.319039 d e 1 -0.249370 e a\n", | |
"9 1 1 -0.249370 e a 1 -2.301539 a b\n", | |
"10 2 1 1.462108 a b 1 -2.060141 b c\n", | |
"11 2 1 -2.060141 b c 1 -0.322417 c d\n", | |
"12 2 1 -0.322417 c d 1 -0.384054 d e\n", | |
"13 2 1 -0.384054 d e 1 1.133769 e a\n", | |
"14 2 1 1.133769 e a 1 1.462108 a b\n", | |
"15 3 1 -1.099891 a b 1 -0.172428 b c\n", | |
"16 3 1 -0.172428 b c 0 -0.877858 c d\n", | |
"17 3 0 -0.877858 c d 1 0.042214 d e\n", | |
"18 3 1 0.042214 d e 1 0.582815 e a\n", | |
"19 3 1 0.582815 e a 1 -1.099891 a b\n", | |
"20 4 1 -1.100619 a b 0 1.144724 b c\n", | |
"21 4 0 1.144724 b c 1 0.901591 c d\n", | |
"22 4 1 0.901591 c d 1 0.502494 d e\n", | |
"23 4 1 0.502494 d e 1 0.900856 e a\n", | |
"24 4 1 0.900856 e a 1 -1.100619 a b\n", | |
"25 5 1 -0.683728 a b 0 -0.122890 b c\n", | |
"26 5 0 -0.122890 b c 1 -0.935769 c d\n", | |
"27 5 1 -0.935769 c d 1 -0.267888 d e\n", | |
"28 5 1 -0.267888 d e 1 0.530355 e a\n", | |
"29 5 1 0.530355 e a 1 -0.683728 a b\n", | |
"30 6 1 -0.691661 a b 1 -0.396754 b c\n", | |
"31 6 1 -0.396754 b c 1 -0.687173 c d\n", | |
"32 6 1 -0.687173 c d 1 -0.845206 d e\n", | |
"33 6 1 -0.845206 d e 0 -0.671246 e a\n", | |
"34 6 0 -0.671246 e a 1 -0.691661 a b\n", | |
"35 7 1 -0.012665 a b 1 -1.117310 b c\n", | |
"36 7 1 -1.117310 b c 0 0.234416 c d\n", | |
"37 7 0 0.234416 c d 1 1.659802 d e\n", | |
"38 7 1 1.659802 d e 0 0.742044 e a\n", | |
"39 7 0 0.742044 e a 1 -0.012665 a b\n", | |
"40 8 1 -0.191836 a b 1 -0.887629 b c\n", | |
"41 8 1 -0.887629 b c 0 -0.747158 c d\n", | |
"42 8 0 -0.747158 c d 1 1.692455 d e\n", | |
"43 8 1 1.692455 d e 1 0.050808 e a\n", | |
"44 8 1 0.050808 e a 1 -0.191836 a b\n", | |
"45 9 1 -0.636996 a b 1 0.190915 b c\n", | |
"46 9 1 0.190915 b c 0 2.100255 c d\n", | |
"47 9 0 2.100255 c d 1 0.120159 d e\n", | |
"48 9 1 0.120159 d e 1 0.617203 e a\n", | |
"49 9 1 0.617203 e a 1 -0.636996 a b" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"result = df[mask]\n", | |
"result['x'] = merged[mask]['x_y'].values\n", | |
"result" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
" idx ok x y mapping\n", | |
"0 0 0 -0.611756 a b\n", | |
"6 1 0 -0.761207 b c\n", | |
"17 3 0 0.042214 c d\n", | |
"21 4 0 0.901591 b c\n", | |
"26 5 0 -0.935769 b c\n", | |
"34 6 0 -0.691661 e a\n", | |
"37 7 0 1.659802 c d\n", | |
"39 7 0 -0.012665 e a\n", | |
"42 8 0 1.692455 c d\n", | |
"47 9 0 0.120159 c d" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment