Skip to content

Instantly share code, notes, and snippets.

@adam704a
Last active February 23, 2018 06:56
Show Gist options
  • Save adam704a/b976deee4a8f86906d5971cf3749b33f to your computer and use it in GitHub Desktop.
Save adam704a/b976deee4a8f86906d5971cf3749b33f to your computer and use it in GitHub Desktop.
Paring the sample file
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1741506"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(1 for line in open('globe.csv'))\n",
"# 870,753 = 1741506/2"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1745425"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(1 for line in open('smart.csv'))\n",
"# 872,712.5 = 1745425/2"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"smart1 bad is 667413\n",
"smart1 good is 203340\n",
"globe1 bad is 549565\n",
"globe1 good is 321188\n"
]
}
],
"source": [
"smart1_bad = sum(1 for line in open('smart1_bad.csv'))\n",
"smart1_good = sum(1 for line in open('smart1_good.csv'))\n",
"globe1_bad = sum(1 for line in open('globe1_bad.csv'))\n",
"globe1_good = sum(1 for line in open('globe1_good.csv'))\n",
"print(\"smart1 bad is \"+ str(smart1_bad))\n",
"print(\"smart1 good is \"+ str(smart1_good))\n",
"print(\"globe1 bad is \"+ str(globe1_bad))\n",
"print(\"globe1 good is \"+ str(globe1_good))\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"smart is 1745425\n",
"smart 1 is 872712\n",
"smart 2 is 872713\n"
]
}
],
"source": [
"smart = sum(1 for line in open('smart.csv'))\n",
"smart1 = sum(1 for line in open('smart1.csv'))\n",
"smart2 = sum(1 for line in open('smart2.csv'))\n",
"\n",
"print(\"smart is \"+ str(smart))\n",
"print(\"smart 1 is \"+ str(smart1))\n",
"print(\"smart 2 is \"+ str(smart2))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1745425"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_lines\n",
"# 1745425 = 872,712 + 872,713"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Good prefixes from https://docs.google.com/spreadsheets/d/1tlvcmpeFMN5ZHgim8TZkLTwTwhIvjbFY0ZUl7Lpabow/edit#gid=571114283\n",
"\n",
"good_globe=[\n",
"63905,\n",
"63915,\n",
"63916,\n",
"63926,\n",
"63927,\n",
"63997,\n",
"63995]\n",
"\n",
"good_smart=[\n",
"63908,\n",
"63919,\n",
"63920,\n",
"63930,\n",
"63939,\n",
"63998,\n",
"63999]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"true\n"
]
}
],
"source": [
"\n",
"mine = \"639080000704\"\n",
"prefix = mine[:5]\n",
"\n",
"if int(prefix) in good_smart:\n",
" print(\"true\")\n",
"else:\n",
" print(\"untrue\") "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split Globe and filter on the good list"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import csv, itertools\n",
"\n",
"# as globe1 and globe2\n",
"with open('globe1_bad.csv', 'w') as bad_file:\n",
" with open('globe1_good.csv', 'w') as good_file:\n",
" with open('globe.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 870753):\n",
" #print row[0].strip('+')\n",
" #print(row[0])\n",
" clean = row[0].strip('+')\n",
" prefix = clean[:5]\n",
" if int(prefix) in good_globe:\n",
" good_file.write(clean+'\\n')\n",
" else:\n",
" bad_file.write(clean+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# as globe1 and globe2\n",
"with open('globe2_bad.csv', 'w') as bad_file:\n",
" with open('globe2_good.csv', 'w') as good_file:\n",
" with open('globe.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 870753, None):\n",
" #print row[0].strip('+')\n",
" #print(row[0])\n",
" clean = row[0].strip('+')\n",
" prefix = clean[:5]\n",
" if int(prefix) in good_globe:\n",
" good_file.write(clean+'\\n')\n",
" else:\n",
" bad_file.write(clean+'\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Regular Split"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# as globe1 and globe2\n",
"with open('globe1.csv', 'w') as newfile:\n",
" with open('globe.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 870753):\n",
" newfile.write(row[0]+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# as globe1 and globe2\n",
"with open('globe2.csv', 'w') as newfile:\n",
" with open('globe.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 870753, None):\n",
" newfile.write(row[0]+'\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split Smart and filter on the good list"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import csv, itertools\n",
"\n",
"# as smart and smart2\n",
"with open('smart1_bad.csv', 'w') as bad_file:\n",
" with open('smart1_good.csv', 'w') as good_file:\n",
" with open('smart.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 872712):\n",
" clean = row[0].strip('+')\n",
" prefix = clean[:5]\n",
" if int(prefix) in good_smart:\n",
" good_file.write(clean+'\\n')\n",
" else:\n",
" bad_file.write(clean+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# as smart and smart2\n",
"with open('smart2_bad.csv', 'w') as bad_file:\n",
" with open('smart2_good.csv', 'w') as good_file:\n",
" with open('smart.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 872712, None):\n",
" clean = row[0].strip('+')\n",
" prefix = clean[:5]\n",
" if int(prefix) in good_smart:\n",
" good_file.write(clean+'\\n')\n",
" else:\n",
" bad_file.write(clean+'\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Regular Split"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv, itertools\n",
"\n",
"# as globe1 and globe2\n",
"with open('smart1.csv', 'w') as newfile:\n",
" with open('smart.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 872712):\n",
" #print(row[0])\n",
" newfile.write(row[0]+'\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"# as globe1 and globe2\n",
"with open('smart2.csv', 'w') as newfile:\n",
" with open('smart.csv') as origfile:\n",
" for row in itertools.islice(csv.reader(origfile), 872712, None):\n",
" #print(row[0])\n",
" newfile.write(row[0]+'\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment