Last active
February 2, 2016 09:34
-
-
Save bgbg/d8d46f974cf871e58c78 to your computer and use it in GitHub Desktop.
Another triple_apply peculiarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO] Using MetricMock instead of real metrics, mode is: QA\n" | |
] | |
} | |
], | |
"source": [ | |
"import time\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import sframe as gl" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The data below is an edge table. It contains three types of vertices: A, B, and C. The edge type is determined by the types of its source and destination vertices: \"A_A\", \"C_B\", etc. There are 150 edges." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Overwriting test_edges.csv\n" | |
] | |
} | |
], | |
"source": [ | |
"%%writefile test_edges.csv\n", | |
"__src_id,__dst_id,edge_type,weight\n", | |
"\"C_56\",\"C_36\",\"C_C\",1\n", | |
"\"C_53\",\"B_20\",\"C_B\",1\n", | |
"\"B_12\",\"A_02\",\"B_A\",1\n", | |
"\"B_00\",\"A_00\",\"B_A\",1\n", | |
"\"C_02\",\"B_04\",\"C_B\",1\n", | |
"\"C_80\",\"B_31\",\"C_B\",1\n", | |
"\"C_80\",\"C_28\",\"C_C\",1\n", | |
"\"B_34\",\"A_06\",\"B_A\",1\n", | |
"\"B_33\",\"A_06\",\"B_A\",1\n", | |
"\"B_35\",\"A_06\",\"B_A\",1\n", | |
"\"C_02\",\"B_05\",\"C_B\",1\n", | |
"\"B_37\",\"A_06\",\"B_A\",1\n", | |
"\"C_13\",\"B_57\",\"C_B\",1\n", | |
"\"C_53\",\"C_16\",\"C_C\",1\n", | |
"\"C_53\",\"B_21\",\"C_B\",1\n", | |
"\"C_02\",\"B_03\",\"C_B\",1\n", | |
"\"C_47\",\"C_41\",\"C_C\",1\n", | |
"\"C_15\",\"C_36\",\"C_C\",1\n", | |
"\"C_81\",\"C_36\",\"C_C\",1\n", | |
"\"C_78\",\"C_36\",\"C_C\",1\n", | |
"\"C_99\",\"C_36\",\"C_C\",1\n", | |
"\"C_38\",\"B_46\",\"C_B\",1\n", | |
"\"C_81\",\"B_40\",\"C_B\",1\n", | |
"\"C_15\",\"B_40\",\"C_B\",1\n", | |
"\"C_78\",\"B_40\",\"C_B\",1\n", | |
"\"C_96\",\"B_55\",\"C_B\",1\n", | |
"\"C_31\",\"B_55\",\"C_B\",1\n", | |
"\"C_73\",\"B_25\",\"C_B\",1\n", | |
"\"C_81\",\"B_55\",\"C_B\",1\n", | |
"\"C_20\",\"B_55\",\"C_B\",1\n", | |
"\"C_25\",\"B_55\",\"C_B\",1\n", | |
"\"C_86\",\"C_16\",\"C_C\",1\n", | |
"\"C_50\",\"B_14\",\"C_B\",1\n", | |
"\"C_01\",\"C_16\",\"C_C\",1\n", | |
"\"C_81\",\"B_39\",\"C_B\",1\n", | |
"\"C_81\",\"B_38\",\"C_B\",1\n", | |
"\"C_78\",\"B_38\",\"C_B\",1\n", | |
"\"C_12\",\"C_41\",\"C_C\",1\n", | |
"\"C_73\",\"B_24\",\"C_B\",1\n", | |
"\"C_61\",\"C_41\",\"C_C\",1\n", | |
"\"C_19\",\"A_05\",\"C_A\",1\n", | |
"\"C_92\",\"B_55\",\"C_B\",1\n", | |
"\"C_74\",\"B_55\",\"C_B\",1\n", | |
"\"C_35\",\"C_16\",\"C_C\",1\n", | |
"\"C_87\",\"C_16\",\"C_C\",1\n", | |
"\"C_71\",\"C_16\",\"C_C\",1\n", | |
"\"C_65\",\"B_56\",\"C_B\",1\n", | |
"\"C_65\",\"C_54\",\"C_C\",1\n", | |
"\"C_87\",\"C_54\",\"C_C\",1\n", | |
"\"C_65\",\"C_11\",\"C_C\",1\n", | |
"\"C_100\",\"C_41\",\"C_C\",1\n", | |
"\"C_91\",\"C_36\",\"C_C\",1\n", | |
"\"C_27\",\"B_28\",\"C_B\",1\n", | |
"\"C_05\",\"C_50\",\"C_C\",1\n", | |
"\"C_05\",\"B_15\",\"C_B\",1\n", | |
"\"C_34\",\"B_55\",\"C_B\",1\n", | |
"\"B_36\",\"A_06\",\"B_A\",1\n", | |
"\"C_79\",\"C_29\",\"C_C\",1\n", | |
"\"C_66\",\"C_16\",\"C_C\",1\n", | |
"\"C_82\",\"C_16\",\"C_C\",1\n", | |
"\"C_05\",\"B_14\",\"C_B\",1\n", | |
"\"C_51\",\"C_16\",\"C_C\",1\n", | |
"\"C_70\",\"C_16\",\"C_C\",1\n", | |
"\"C_21\",\"C_16\",\"C_C\",1\n", | |
"\"C_58\",\"B_48\",\"C_B\",1\n", | |
"\"C_69\",\"B_30\",\"C_B\",1\n", | |
"\"B_27\",\"A_04\",\"B_A\",1\n", | |
"\"C_26\",\"C_54\",\"C_C\",1\n", | |
"\"C_06\",\"B_06\",\"C_B\",1\n", | |
"\"C_39\",\"B_29\",\"C_B\",1\n", | |
"\"C_06\",\"B_07\",\"C_B\",1\n", | |
"\"C_58\",\"C_41\",\"C_C\",1\n", | |
"\"C_17\",\"B_12\",\"C_B\",1\n", | |
"\"C_72\",\"C_36\",\"C_C\",1\n", | |
"\"C_22\",\"B_20\",\"C_B\",1\n", | |
"\"C_23\",\"C_36\",\"C_C\",1\n", | |
"\"C_16\",\"B_20\",\"C_B\",1\n", | |
"\"C_16\",\"B_22\",\"C_B\",1\n", | |
"\"C_22\",\"B_22\",\"C_B\",1\n", | |
"\"C_55\",\"B_28\",\"C_B\",1\n", | |
"\"B_08\",\"A_01\",\"B_A\",1\n", | |
"\"B_09\",\"A_01\",\"B_A\",1\n", | |
"\"C_29\",\"C_50\",\"C_C\",1\n", | |
"\"C_16\",\"B_19\",\"C_B\",1\n", | |
"\"C_62\",\"B_55\",\"C_B\",1\n", | |
"\"B_32\",\"A_06\",\"B_A\",1\n", | |
"\"C_60\",\"B_55\",\"C_B\",1\n", | |
"\"C_57\",\"B_54\",\"C_B\",1\n", | |
"\"C_22\",\"C_16\",\"C_C\",1\n", | |
"\"C_52\",\"C_16\",\"C_C\",1\n", | |
"\"C_29\",\"B_14\",\"C_B\",1\n", | |
"\"C_16\",\"B_21\",\"C_B\",1\n", | |
"\"C_98\",\"C_16\",\"C_C\",1\n", | |
"\"C_18\",\"B_23\",\"C_B\",1\n", | |
"\"C_16\",\"B_18\",\"C_B\",1\n", | |
"\"C_63\",\"B_17\",\"C_B\",1\n", | |
"\"C_48\",\"C_36\",\"C_C\",1\n", | |
"\"C_48\",\"B_42\",\"C_B\",1\n", | |
"\"C_04\",\"C_08\",\"C_C\",1\n", | |
"\"C_44\",\"C_29\",\"C_C\",1\n", | |
"\"C_07\",\"C_16\",\"C_C\",1\n", | |
"\"C_85\",\"C_16\",\"C_C\",1\n", | |
"\"C_45\",\"B_14\",\"C_B\",1\n", | |
"\"C_37\",\"B_26\",\"C_B\",1\n", | |
"\"C_89\",\"B_41\",\"C_B\",1\n", | |
"\"C_48\",\"B_38\",\"C_B\",1\n", | |
"\"C_40\",\"B_38\",\"C_B\",1\n", | |
"\"C_93\",\"B_02\",\"C_B\",1\n", | |
"\"C_43\",\"B_10\",\"C_B\",1\n", | |
"\"C_84\",\"C_36\",\"C_C\",1\n", | |
"\"C_03\",\"C_50\",\"C_C\",1\n", | |
"\"C_84\",\"B_28\",\"C_B\",1\n", | |
"\"B_13\",\"A_03\",\"B_A\",1\n", | |
"\"C_09\",\"B_11\",\"C_B\",1\n", | |
"\"C_88\",\"B_43\",\"C_B\",1\n", | |
"\"C_10\",\"B_55\",\"C_B\",1\n", | |
"\"C_03\",\"C_42\",\"C_C\",1\n", | |
"\"C_101\",\"C_16\",\"C_C\",1\n", | |
"\"C_46\",\"C_16\",\"C_C\",1\n", | |
"\"C_03\",\"B_16\",\"C_B\",1\n", | |
"\"C_03\",\"C_24\",\"C_C\",1\n", | |
"\"C_76\",\"C_16\",\"C_C\",1\n", | |
"\"C_97\",\"C_16\",\"C_C\",1\n", | |
"\"C_88\",\"B_45\",\"C_B\",1\n", | |
"\"C_77\",\"B_45\",\"C_B\",1\n", | |
"\"B_51\",\"A_07\",\"B_A\",1\n", | |
"\"C_88\",\"B_44\",\"C_B\",1\n", | |
"\"C_41\",\"B_49\",\"C_B\",1\n", | |
"\"C_64\",\"C_36\",\"C_C\",1\n", | |
"\"C_75\",\"B_28\",\"C_B\",1\n", | |
"\"B_02\",\"A_00\",\"B_A\",1\n", | |
"\"C_68\",\"B_11\",\"C_B\",1\n", | |
"\"C_94\",\"B_55\",\"C_B\",1\n", | |
"\"C_32\",\"B_55\",\"C_B\",1\n", | |
"\"C_14\",\"B_55\",\"C_B\",1\n", | |
"\"C_95\",\"C_16\",\"C_C\",1\n", | |
"\"C_41\",\"B_47\",\"C_B\",1\n", | |
"\"C_30\",\"C_16\",\"C_C\",1\n", | |
"\"C_90\",\"C_16\",\"C_C\",1\n", | |
"\"C_49\",\"C_16\",\"C_C\",1\n", | |
"\"C_67\",\"C_00\",\"C_C\",1\n", | |
"\"C_41\",\"B_48\",\"C_B\",1\n", | |
"\"C_41\",\"B_50\",\"C_B\",1\n", | |
"\"C_59\",\"B_53\",\"C_B\",1\n", | |
"\"C_83\",\"B_01\",\"C_B\",1\n", | |
"\"C_33\",\"C_54\",\"C_C\",1\n", | |
"\"C_64\",\"C_54\",\"C_C\",1\n", | |
"\"C_41\",\"B_52\",\"C_B\",1\n", | |
"\"C_41\",\"B_51\",\"C_B\",1\n", | |
"\"C_67\",\"B_02\",\"C_B\",1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
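"We can check the validity of edge types with the following function: an edge type is valid when it equals the first character of the source vertex ID and the first character of the destination vertex ID, joined by an underscore (e.g. \"C_B\" for an edge from \"C_53\" to \"B_20\")." | |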
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def validate_edge_types(gr):\n", | |
" '''Print True or False, depending on edge type validity\n", | |
" Also, return a binary array of valid edge types\n", | |
" '''\n", | |
" sel = gl.SArray([\"%s_%s\" % (r['__src_id'][0], r['__dst_id'][0]) == r['edge_type'] for r in gr.edges])\n", | |
" print(np.all(sel))\n", | |
" return sel" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
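"The following function simulates some activity: it divides each edge's `weight` by 1.0, which leaves the value numerically unchanged, and it never touches `edge_type`." | |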
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def do_stuff(src, edge, dst):\n", | |
" edge['weight'] /= 1.0\n", | |
" return src, edge, dst" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's load the data, create a graph and test edge type validity:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[INFO] Start server at: ipc:///tmp/graphlab_server-54687 - Server binary: /Users/boris/anaconda/lib/python2.7/site-packages/sframe/unity_server - Server log: /tmp/sframe_server_1454405531.log\n", | |
"[INFO] GraphLab Server Version: 1.6\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"PROGRESS: Finished parsing file /Users/boris/temp/test_edges.csv\n", | |
"PROGRESS: Parsing completed. Parsed 100 lines in 0.022644 secs.\n", | |
"------------------------------------------------------\n", | |
"Inferred types from first line of file as \n", | |
"column_type_hints=[str,str,str,int]\n", | |
"If parsing fails due to incorrect types, you can correct\n", | |
"the inferred type list above and pass it to read_csv in\n", | |
"the column_type_hints argument\n", | |
"------------------------------------------------------\n", | |
"PROGRESS: Finished parsing file /Users/boris/temp/test_edges.csv\n", | |
"PROGRESS: Parsing completed. Parsed 150 lines in 0.009042 secs.\n", | |
"True\n" | |
] | |
} | |
], | |
"source": [ | |
"gg = gl.SGraph(edges=gl.load_sframe('test_edges.csv'))\n", | |
"validate_edge_types(gg);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Good, now let's `triple_apply`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"gg_after = gg.triple_apply(do_stuff, ['weight'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
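"The input graph is still valid, but the resulting one is not. The rows printed below are the edges of the resulting graph whose `edge_type` no longer matches their source and destination vertices:" | |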
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"True\n", | |
"False\n", | |
"+----------+----------+-----------+--------+\n", | |
"| __src_id | __dst_id | edge_type | weight |\n", | |
"+----------+----------+-----------+--------+\n", | |
"| C_73 | B_24 | C_C | 1 |\n", | |
"| C_12 | C_41 | C_B | 1 |\n", | |
"| C_05 | B_14 | C_C | 1 |\n", | |
"| C_82 | C_16 | C_B | 1 |\n", | |
"| B_32 | A_06 | C_B | 1 |\n", | |
"| C_57 | B_54 | B_A | 1 |\n", | |
"| C_03 | B_16 | C_C | 1 |\n", | |
"| C_101 | C_16 | C_B | 1 |\n", | |
"| B_02 | A_00 | C_B | 1 |\n", | |
"| C_75 | B_28 | B_A | 1 |\n", | |
"+----------+----------+-----------+--------+\n", | |
"[? rows x 4 columns]\n", | |
"Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.\n", | |
"You can use len(sf) to force materialization.\n" | |
] | |
} | |
], | |
"source": [ | |
"validate_edge_types(gg)\n", | |
"sel = validate_edge_types(gg_after)\n", | |
"print(gg_after.edges[1-sel])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Testing a workaround\n", | |
"\n", | |
"It [has been suggested](https://github.com/dato-code/SFrame/issues/157#issue-130537095) that including all the edge data columns in the `mutated_fields` argument prevents the problem, at the expense of efficiency. Let's test it." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"gg_workaround = gg.triple_apply(do_stuff, ['edge_type', 'weight'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"True\n" | |
] | |
} | |
], | |
"source": [ | |
"_ = validate_edge_types(gg_workaround)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment