Skip to content

Instantly share code, notes, and snippets.

@wesm
Created September 17, 2012 18:47
Show Gist options
  • Save wesm/3739027 to your computer and use it in GitHub Desktop.
Save wesm/3739027 to your computer and use it in GitHub Desktop.
Self merge example
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "SelfMerge"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"pd.set_printoptions(notebook_repr_html=False)\n",
"\n",
"np.random.seed(1)\n",
"\n",
"df = pd.DataFrame({'idx': np.arange(10).repeat(5),\n",
" 'x': np.random.randn(50),\n",
" 'y': np.tile(list('abcde'), 10),\n",
" 'ok': np.random.binomial(1, 0.9, 50)})\n",
"\n",
"n = pd.DataFrame({'y': list('abcde'), 'y1': list('bcdea')})\n",
"\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 7,
"text": [
" idx ok x y\n",
"0 0 0 1.624345 a\n",
"1 0 1 -0.611756 b\n",
"2 0 1 -0.528172 c\n",
"3 0 1 -1.072969 d\n",
"4 0 1 0.865408 e\n",
"5 1 1 -2.301539 a\n",
"6 1 0 1.744812 b\n",
"7 1 1 -0.761207 c\n",
"8 1 1 0.319039 d\n",
"9 1 1 -0.249370 e\n",
"10 2 1 1.462108 a\n",
"11 2 1 -2.060141 b\n",
"12 2 1 -0.322417 c\n",
"13 2 1 -0.384054 d\n",
"14 2 1 1.133769 e\n",
"15 3 1 -1.099891 a\n",
"16 3 1 -0.172428 b\n",
"17 3 0 -0.877858 c\n",
"18 3 1 0.042214 d\n",
"19 3 1 0.582815 e\n",
"20 4 1 -1.100619 a\n",
"21 4 0 1.144724 b\n",
"22 4 1 0.901591 c\n",
"23 4 1 0.502494 d\n",
"24 4 1 0.900856 e\n",
"25 5 1 -0.683728 a\n",
"26 5 0 -0.122890 b\n",
"27 5 1 -0.935769 c\n",
"28 5 1 -0.267888 d\n",
"29 5 1 0.530355 e\n",
"30 6 1 -0.691661 a\n",
"31 6 1 -0.396754 b\n",
"32 6 1 -0.687173 c\n",
"33 6 1 -0.845206 d\n",
"34 6 0 -0.671246 e\n",
"35 7 1 -0.012665 a\n",
"36 7 1 -1.117310 b\n",
"37 7 0 0.234416 c\n",
"38 7 1 1.659802 d\n",
"39 7 0 0.742044 e\n",
"40 8 1 -0.191836 a\n",
"41 8 1 -0.887629 b\n",
"42 8 0 -0.747158 c\n",
"43 8 1 1.692455 d\n",
"44 8 1 0.050808 e\n",
"45 9 1 -0.636996 a\n",
"46 9 1 0.190915 b\n",
"47 9 0 2.100255 c\n",
"48 9 1 0.120159 d\n",
"49 9 1 0.617203 e"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 8,
"text": [
" y y1\n",
"0 a b\n",
"1 b c\n",
"2 c d\n",
"3 d e\n",
"4 e a"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mask = df.ok == 0\n",
"\n",
"ny1 = n.set_index('y')['y1']\n",
"df['mapping'] = df['y'].map(ny1)\n",
"df\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 9,
"text": [
" idx ok x y mapping\n",
"0 0 0 1.624345 a b\n",
"1 0 1 -0.611756 b c\n",
"2 0 1 -0.528172 c d\n",
"3 0 1 -1.072969 d e\n",
"4 0 1 0.865408 e a\n",
"5 1 1 -2.301539 a b\n",
"6 1 0 1.744812 b c\n",
"7 1 1 -0.761207 c d\n",
"8 1 1 0.319039 d e\n",
"9 1 1 -0.249370 e a\n",
"10 2 1 1.462108 a b\n",
"11 2 1 -2.060141 b c\n",
"12 2 1 -0.322417 c d\n",
"13 2 1 -0.384054 d e\n",
"14 2 1 1.133769 e a\n",
"15 3 1 -1.099891 a b\n",
"16 3 1 -0.172428 b c\n",
"17 3 0 -0.877858 c d\n",
"18 3 1 0.042214 d e\n",
"19 3 1 0.582815 e a\n",
"20 4 1 -1.100619 a b\n",
"21 4 0 1.144724 b c\n",
"22 4 1 0.901591 c d\n",
"23 4 1 0.502494 d e\n",
"24 4 1 0.900856 e a\n",
"25 5 1 -0.683728 a b\n",
"26 5 0 -0.122890 b c\n",
"27 5 1 -0.935769 c d\n",
"28 5 1 -0.267888 d e\n",
"29 5 1 0.530355 e a\n",
"30 6 1 -0.691661 a b\n",
"31 6 1 -0.396754 b c\n",
"32 6 1 -0.687173 c d\n",
"33 6 1 -0.845206 d e\n",
"34 6 0 -0.671246 e a\n",
"35 7 1 -0.012665 a b\n",
"36 7 1 -1.117310 b c\n",
"37 7 0 0.234416 c d\n",
"38 7 1 1.659802 d e\n",
"39 7 0 0.742044 e a\n",
"40 8 1 -0.191836 a b\n",
"41 8 1 -0.887629 b c\n",
"42 8 0 -0.747158 c d\n",
"43 8 1 1.692455 d e\n",
"44 8 1 0.050808 e a\n",
"45 9 1 -0.636996 a b\n",
"46 9 1 0.190915 b c\n",
"47 9 0 2.100255 c d\n",
"48 9 1 0.120159 d e\n",
"49 9 1 0.617203 e a"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"merged = pd.merge(df, df, left_on=['idx', 'mapping'],\n",
" right_on=['idx', 'y'], sort=False)\n",
"merged"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 10,
"text": [
" idx ok_x x_x y_x mapping_x ok_y x_y y_y mapping_y\n",
"0 0 0 1.624345 a b 1 -0.611756 b c\n",
"1 0 1 -0.611756 b c 1 -0.528172 c d\n",
"2 0 1 -0.528172 c d 1 -1.072969 d e\n",
"3 0 1 -1.072969 d e 1 0.865408 e a\n",
"4 0 1 0.865408 e a 0 1.624345 a b\n",
"5 1 1 -2.301539 a b 0 1.744812 b c\n",
"6 1 0 1.744812 b c 1 -0.761207 c d\n",
"7 1 1 -0.761207 c d 1 0.319039 d e\n",
"8 1 1 0.319039 d e 1 -0.249370 e a\n",
"9 1 1 -0.249370 e a 1 -2.301539 a b\n",
"10 2 1 1.462108 a b 1 -2.060141 b c\n",
"11 2 1 -2.060141 b c 1 -0.322417 c d\n",
"12 2 1 -0.322417 c d 1 -0.384054 d e\n",
"13 2 1 -0.384054 d e 1 1.133769 e a\n",
"14 2 1 1.133769 e a 1 1.462108 a b\n",
"15 3 1 -1.099891 a b 1 -0.172428 b c\n",
"16 3 1 -0.172428 b c 0 -0.877858 c d\n",
"17 3 0 -0.877858 c d 1 0.042214 d e\n",
"18 3 1 0.042214 d e 1 0.582815 e a\n",
"19 3 1 0.582815 e a 1 -1.099891 a b\n",
"20 4 1 -1.100619 a b 0 1.144724 b c\n",
"21 4 0 1.144724 b c 1 0.901591 c d\n",
"22 4 1 0.901591 c d 1 0.502494 d e\n",
"23 4 1 0.502494 d e 1 0.900856 e a\n",
"24 4 1 0.900856 e a 1 -1.100619 a b\n",
"25 5 1 -0.683728 a b 0 -0.122890 b c\n",
"26 5 0 -0.122890 b c 1 -0.935769 c d\n",
"27 5 1 -0.935769 c d 1 -0.267888 d e\n",
"28 5 1 -0.267888 d e 1 0.530355 e a\n",
"29 5 1 0.530355 e a 1 -0.683728 a b\n",
"30 6 1 -0.691661 a b 1 -0.396754 b c\n",
"31 6 1 -0.396754 b c 1 -0.687173 c d\n",
"32 6 1 -0.687173 c d 1 -0.845206 d e\n",
"33 6 1 -0.845206 d e 0 -0.671246 e a\n",
"34 6 0 -0.671246 e a 1 -0.691661 a b\n",
"35 7 1 -0.012665 a b 1 -1.117310 b c\n",
"36 7 1 -1.117310 b c 0 0.234416 c d\n",
"37 7 0 0.234416 c d 1 1.659802 d e\n",
"38 7 1 1.659802 d e 0 0.742044 e a\n",
"39 7 0 0.742044 e a 1 -0.012665 a b\n",
"40 8 1 -0.191836 a b 1 -0.887629 b c\n",
"41 8 1 -0.887629 b c 0 -0.747158 c d\n",
"42 8 0 -0.747158 c d 1 1.692455 d e\n",
"43 8 1 1.692455 d e 1 0.050808 e a\n",
"44 8 1 0.050808 e a 1 -0.191836 a b\n",
"45 9 1 -0.636996 a b 1 0.190915 b c\n",
"46 9 1 0.190915 b c 0 2.100255 c d\n",
"47 9 0 2.100255 c d 1 0.120159 d e\n",
"48 9 1 0.120159 d e 1 0.617203 e a\n",
"49 9 1 0.617203 e a 1 -0.636996 a b"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"result = df[mask]\n",
"result['x'] = merged[mask]['x_y'].values\n",
"result"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 11,
"text": [
" idx ok x y mapping\n",
"0 0 0 -0.611756 a b\n",
"6 1 0 -0.761207 b c\n",
"17 3 0 0.042214 c d\n",
"21 4 0 0.901591 b c\n",
"26 5 0 -0.935769 b c\n",
"34 6 0 -0.691661 e a\n",
"37 7 0 1.659802 c d\n",
"39 7 0 -0.012665 e a\n",
"42 8 0 1.692455 c d\n",
"47 9 0 0.120159 c d"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment