Last active
February 23, 2018 06:56
-
-
Save adam704a/b976deee4a8f86906d5971cf3749b33f to your computer and use it in GitHub Desktop.
Paring the sample file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1741506" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sum(1 for line in open('globe.csv'))\n", | |
"# 870,753 = 1741506/2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1745425" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sum(1 for line in open('smart.csv'))\n", | |
"# 872,712.5 = 1745425/2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"smart1 bad is 667413\n", | |
"smart1 good is 203340\n", | |
"globe1 bad is 549565\n", | |
"globe1 good is 321188\n" | |
] | |
} | |
], | |
"source": [ | |
"smart1_bad = sum(1 for line in open('smart1_bad.csv'))\n", | |
"smart1_good = sum(1 for line in open('smart1_good.csv'))\n", | |
"globe1_bad = sum(1 for line in open('globe1_bad.csv'))\n", | |
"globe1_good = sum(1 for line in open('globe1_good.csv'))\n", | |
"print(\"smart1 bad is \"+ str(smart1_bad))\n", | |
"print(\"smart1 good is \"+ str(smart1_good))\n", | |
"print(\"globe1 bad is \"+ str(globe1_bad))\n", | |
"print(\"globe1 good is \"+ str(globe1_good))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"smart is 1745425\n", | |
"smart 1 is 872712\n", | |
"smart 2 is 872713\n" | |
] | |
} | |
], | |
"source": [ | |
"smart = sum(1 for line in open('smart.csv'))\n", | |
"smart1 = sum(1 for line in open('smart1.csv'))\n", | |
"smart2 = sum(1 for line in open('smart2.csv'))\n", | |
"\n", | |
"print(\"smart is \"+ str(smart))\n", | |
"print(\"smart 1 is \"+ str(smart1))\n", | |
"print(\"smart 2 is \"+ str(smart2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1745425" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"num_lines\n", | |
"# 1745425 = 872,712 + 872,713" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Good prefixes from https://docs.google.com/spreadsheets/d/1tlvcmpeFMN5ZHgim8TZkLTwTwhIvjbFY0ZUl7Lpabow/edit#gid=571114283\n", | |
"\n", | |
"good_globe=[\n", | |
"63905,\n", | |
"63915,\n", | |
"63916,\n", | |
"63926,\n", | |
"63927,\n", | |
"63997,\n", | |
"63995]\n", | |
"\n", | |
"good_smart=[\n", | |
"63908,\n", | |
"63919,\n", | |
"63920,\n", | |
"63930,\n", | |
"63939,\n", | |
"63998,\n", | |
"63999]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"true\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"mine = \"639080000704\"\n", | |
"prefix = mine[:5]\n", | |
"\n", | |
"if int(prefix) in good_smart:\n", | |
" print(\"true\")\n", | |
"else:\n", | |
" print(\"untrue\") " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Split Globe and filter on the good list" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import csv, itertools\n", | |
"\n", | |
"# as globe1 and globe2\n", | |
"with open('globe1_bad.csv', 'w') as bad_file:\n", | |
" with open('globe1_good.csv', 'w') as good_file:\n", | |
" with open('globe.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 870753):\n", | |
" #print row[0].strip('+')\n", | |
" #print(row[0])\n", | |
" clean = row[0].strip('+')\n", | |
" prefix = clean[:5]\n", | |
" if int(prefix) in good_globe:\n", | |
" good_file.write(clean+'\\n')\n", | |
" else:\n", | |
" bad_file.write(clean+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# as globe1 and globe2\n", | |
"with open('globe2_bad.csv', 'w') as bad_file:\n", | |
" with open('globe2_good.csv', 'w') as good_file:\n", | |
" with open('globe.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 870753, None):\n", | |
" #print row[0].strip('+')\n", | |
" #print(row[0])\n", | |
" clean = row[0].strip('+')\n", | |
" prefix = clean[:5]\n", | |
" if int(prefix) in good_globe:\n", | |
" good_file.write(clean+'\\n')\n", | |
" else:\n", | |
" bad_file.write(clean+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Regular Split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# as globe1 and globe2\n", | |
"with open('globe1.csv', 'w') as newfile:\n", | |
" with open('globe.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 870753):\n", | |
" newfile.write(row[0]+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# as globe1 and globe2\n", | |
"with open('globe2.csv', 'w') as newfile:\n", | |
" with open('globe.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 870753, None):\n", | |
" newfile.write(row[0]+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Split Smart and filter on the good list" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv, itertools\n", | |
"\n", | |
"# as smart and smart2\n", | |
"with open('smart1_bad.csv', 'w') as bad_file:\n", | |
" with open('smart1_good.csv', 'w') as good_file:\n", | |
" with open('smart.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 872712):\n", | |
" clean = row[0].strip('+')\n", | |
" prefix = clean[:5]\n", | |
" if int(prefix) in good_smart:\n", | |
" good_file.write(clean+'\\n')\n", | |
" else:\n", | |
" bad_file.write(clean+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# as smart and smart2\n", | |
"with open('smart2_bad.csv', 'w') as bad_file:\n", | |
" with open('smart2_good.csv', 'w') as good_file:\n", | |
" with open('smart.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 872712, None):\n", | |
" clean = row[0].strip('+')\n", | |
" prefix = clean[:5]\n", | |
" if int(prefix) in good_smart:\n", | |
" good_file.write(clean+'\\n')\n", | |
" else:\n", | |
" bad_file.write(clean+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Regular Split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv, itertools\n", | |
"\n", | |
"# as globe1 and globe2\n", | |
"with open('smart1.csv', 'w') as newfile:\n", | |
" with open('smart.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 872712):\n", | |
" #print(row[0])\n", | |
" newfile.write(row[0]+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"# as globe1 and globe2\n", | |
"with open('smart2.csv', 'w') as newfile:\n", | |
" with open('smart.csv') as origfile:\n", | |
" for row in itertools.islice(csv.reader(origfile), 872712, None):\n", | |
" #print(row[0])\n", | |
" newfile.write(row[0]+'\\n')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment