Created
February 24, 2014 12:45
-
-
Save olp-cs/9187724 to your computer and use it in GitHub Desktop.
Test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "exploring_a_single_data_file" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import json\n", | |
"path = 'data/usagov_bitly_data2012-05-21-1337634399.txt'\n", | |
"records = [json.loads(line) for line in open(path)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Display a couple of records\n", | |
"records[0:2]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 32, | |
"text": [ | |
"[{u'a': u'Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_3 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7E18 Safari/528.16',\n", | |
" u'al': u'en-us',\n", | |
" u'c': u'US',\n", | |
" u'cy': u'Chesapeake',\n", | |
" u'g': u'JKZUHq',\n", | |
" u'gr': u'VA',\n", | |
" u'h': u'J8ZPYk',\n", | |
" u'hc': 1337629186,\n", | |
" u'hh': u'go.nasa.gov',\n", | |
" u'l': u'nasatwitter',\n", | |
" u'll': [36.755798, -76.292801],\n", | |
" u'nk': 1,\n", | |
" u'r': u'http://t.co/JEY40vW4',\n", | |
" u't': 1337634399,\n", | |
" u'tz': u'America/New_York',\n", | |
" u'u': u'http://www.nasa.gov/mission_pages/hinode/eclipse_120520.html'},\n", | |
" {u'a': u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5',\n", | |
" u'al': u'en-US,en;q=0.8',\n", | |
" u'c': u'US',\n", | |
" u'cy': u'O Fallon',\n", | |
" u'g': u'vNJS4H',\n", | |
" u'gr': u'MO',\n", | |
" u'h': u'u0uD9q',\n", | |
" u'hc': 1319563556,\n", | |
" u'hh': u'1.usa.gov',\n", | |
" u'l': u'o_4us71ccioa',\n", | |
" u'll': [38.8251, -90.728897],\n", | |
" u'nk': 1,\n", | |
" u'r': u'direct',\n", | |
" u't': 1337634399,\n", | |
" u'tz': u'America/Chicago',\n", | |
" u'u': u'https://www.nysdot.gov/rexdesign/design/community.gif'}]" | |
] | |
} | |
], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Display the user agent from the first record\n", | |
"records[0][\"a\"]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 33, | |
"text": [ | |
"u'Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_3 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7E18 Safari/528.16'" | |
] | |
} | |
], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Utility function: get counts for each element from a collection\n", | |
"\n", | |
"from collections import defaultdict\n", | |
"\n", | |
"def get_counts(sequence):\n", | |
" counts = defaultdict(int) # values will initialize to 0\n", | |
" for x in sequence:\n", | |
" count[x] += 1\n", | |
" return counts" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 34 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# How a time zone looks like\n", | |
"records[0]['tz']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 35, | |
"text": [ | |
"u'America/New_York'" | |
] | |
} | |
], | |
"prompt_number": 35 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Check if a time zone is listed for the record\n", | |
"def time_zone_listed(record):\n", | |
" return 'tz' in record\n", | |
" \n", | |
"time_zone_listed(records[0])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 36, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 36 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# See where the timezone is not listed \n", | |
"[item for item in records if not time_zone_listed(item)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 38, | |
"text": [ | |
"[{u'_heartbeat_': 1337634451},\n", | |
" {u'_heartbeat_': 1337634482},\n", | |
" {u'_heartbeat_': 1337634512},\n", | |
" {u'_heartbeat_': 1337634541},\n", | |
" {u'_heartbeat_': 1337634571},\n", | |
" {u'_heartbeat_': 1337634601},\n", | |
" {u'_heartbeat_': 1337634631},\n", | |
" {u'_heartbeat_': 1337634661},\n", | |
" {u'_heartbeat_': 1337634691},\n", | |
" {u'_heartbeat_': 1337634721},\n", | |
" {u'_heartbeat_': 1337634751},\n", | |
" {u'_heartbeat_': 1337634781},\n", | |
" {u'_heartbeat_': 1337634811},\n", | |
" {u'_heartbeat_': 1337634841},\n", | |
" {u'_heartbeat_': 1337634871},\n", | |
" {u'_heartbeat_': 1337634901},\n", | |
" {u'_heartbeat_': 1337634931},\n", | |
" {u'_heartbeat_': 1337634961},\n", | |
" {u'_heartbeat_': 1337634991},\n", | |
" {u'_heartbeat_': 1337635021},\n", | |
" {u'_heartbeat_': 1337635051},\n", | |
" {u'_heartbeat_': 1337635081},\n", | |
" {u'_heartbeat_': 1337635112},\n", | |
" {u'_heartbeat_': 1337635141},\n", | |
" {u'_heartbeat_': 1337635171},\n", | |
" {u'_heartbeat_': 1337635201},\n", | |
" {u'_heartbeat_': 1337635231},\n", | |
" {u'_heartbeat_': 1337635261},\n", | |
" {u'_heartbeat_': 1337635291},\n", | |
" {u'_heartbeat_': 1337635321},\n", | |
" {u'_heartbeat_': 1337635351},\n", | |
" {u'_heartbeat_': 1337635381},\n", | |
" {u'_heartbeat_': 1337635411},\n", | |
" {u'_heartbeat_': 1337635441},\n", | |
" {u'_heartbeat_': 1337635471},\n", | |
" {u'_heartbeat_': 1337635501},\n", | |
" {u'_heartbeat_': 1337635531},\n", | |
" {u'_heartbeat_': 1337635561},\n", | |
" {u'_heartbeat_': 1337635591},\n", | |
" {u'_heartbeat_': 1337635621},\n", | |
" {u'_heartbeat_': 1337635651},\n", | |
" {u'_heartbeat_': 1337635681},\n", | |
" {u'_heartbeat_': 1337635711},\n", | |
" {u'_heartbeat_': 1337635741},\n", | |
" {u'_heartbeat_': 1337635771},\n", | |
" {u'_heartbeat_': 1337635801},\n", | |
" {u'_heartbeat_': 1337635831},\n", | |
" {u'_heartbeat_': 1337635861},\n", | |
" {u'_heartbeat_': 1337635891},\n", | |
" {u'_heartbeat_': 1337635921},\n", | |
" {u'_heartbeat_': 1337635951},\n", | |
" {u'_heartbeat_': 1337635981},\n", | |
" {u'_heartbeat_': 1337636011},\n", | |
" {u'_heartbeat_': 1337636041},\n", | |
" {u'_heartbeat_': 1337636071},\n", | |
" {u'_heartbeat_': 1337636101},\n", | |
" {u'_heartbeat_': 1337636131},\n", | |
" {u'_heartbeat_': 1337636161},\n", | |
" {u'_heartbeat_': 1337636191},\n", | |
" {u'_heartbeat_': 1337636221}]" | |
] | |
} | |
], | |
"prompt_number": 38 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"time_zones = [item['tz'] for item in records if time_zone_listed(item)]\n", | |
"time_zones[0:3]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 42, | |
"text": [ | |
"[u'America/New_York', u'America/Chicago', u'America/New_York']" | |
] | |
} | |
], | |
"prompt_number": 42 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from collections import Counter\n", | |
"\n", | |
"Counter(time_zones).most_common(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 46, | |
"text": [ | |
"[(u'America/Chicago', 643),\n", | |
" (u'America/New_York', 571),\n", | |
" (u'', 521),\n", | |
" (u'America/Los_Angeles', 315),\n", | |
" (u'Europe/London', 135),\n", | |
" (u'America/Denver', 77),\n", | |
" (u'Europe/Amsterdam', 32),\n", | |
" (u'America/Phoenix', 32),\n", | |
" (u'Europe/Madrid', 29),\n", | |
" (u'America/Rainy_River', 26)]" | |
] | |
} | |
], | |
"prompt_number": 46 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment