Last active
December 28, 2015 18:41
-
-
Save bbengfort/c21012d8af009c953f21 to your computer and use it in GitHub Desktop.
Import and wrangling of the Big Tweet Dump from @murphsp1.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# big_tweet_import | |
# Imports the tweet-dump into MongoDB | |
# | |
# Author: Benjamin Bengfort <[email protected]> | |
# Created: Thu Aug 28 07:37:41 2014 -0400 | |
# | |
# Copyright (C) 2014 Bengfort.com | |
# For license information, see LICENSE.txt | |
# | |
# ID: utils.py [] [email protected] $ | |
""" | |
Import and wrangling of Big Tweet Dump from Sean. | |
""" | |
########################################################################## | |
## Imports | |
########################################################################## | |
import json
import os
import time
from collections import Counter
from datetime import datetime
from operator import itemgetter

import pymongo
########################################################################## | |
## Module Constants | |
########################################################################## | |
BASE_DIR = os.path.abspath(os.path.dirname(__file__)) | |
DUMP_PATH = os.path.join(BASE_DIR, "big_tweet_dump.json") | |
JSONDATE = "%Y-%m-%dT%H:%M:%SZ" | |
########################################################################## | |
## Helper functions | |
########################################################################## | |
def utcstr(dt, fmt=JSONDATE):
    """
    Formats a datetime as a string expressed in UTC.

    Returns None when ``dt`` is falsy. Otherwise the UTC fields reported by
    ``dt.utctimetuple()`` (year through second; microseconds are dropped)
    are rendered with ``fmt``, which defaults to JSONDATE.
    """
    if not dt:
        return None

    # Build the result straight from the UTC time tuple. Passing the tuple
    # through time.mktime() and datetime.fromtimestamp() would interpret it
    # in the machine's local timezone (with tm_isdst=0), which can shift the
    # value by an hour whenever local daylight saving time is in effect.
    return datetime(*dt.utctimetuple()[:6]).strftime(fmt)
def parse_date(dt, fmts=None):
    """
    Attempts to parse a date using a series of formats.

    ``dt`` is the date string to parse. ``fmts`` is an iterable of strptime
    format strings tried in order (a single format string is also accepted);
    when omitted or empty, only JSONDATE is tried.

    Returns the datetime produced by the first format that matches, or None
    if no format matches.
    """
    if not fmts:
        fmts = (JSONDATE,)
    elif isinstance(fmts, (str, type(u""))):
        # A bare string is one format, not an iterable of characters.
        fmts = (fmts,)

    for fmt in fmts:
        try:
            return datetime.strptime(dt, fmt)
        except ValueError:
            # This format did not match; fall through to the next one.
            continue
    return None
def mongodb(**kwargs):
    """
    Opens a MongoDB client and returns the selected Database object.

    Recognized keyword arguments (others are ignored):
      host     -- server hostname, defaults to 'localhost'
      port     -- server port, defaults to 27017
      database -- database name, defaults to 'tweet-corpus'
    """
    connection = pymongo.MongoClient(
        kwargs.get('host', 'localhost'),
        kwargs.get('port', 27017),
    )
    return connection[kwargs.get('database', 'tweet-corpus')]
def dotkeys(obj, parent=None):
    """
    Generates every key of a nested mapping in dot notation, depth first.

    Each key of ``obj`` is yielded (prefixed with ``parent`` and a dot when
    ``parent`` is truthy), immediately followed by the dotted keys of its
    value whenever that value exposes a callable ``items`` attribute.
    """
    for name, value in obj.items():
        path = "%s.%s" % (parent, name) if parent else name
        yield path

        # Only mapping-like values (anything with a callable .items) are
        # descended into; lists and scalars are treated as leaves.
        if callable(getattr(value, 'items', None)):
            for subpath in dotkeys(value, path):
                yield subpath
########################################################################## | |
## Import Utility | |
########################################################################## | |
def load_data(path=DUMP_PATH, **kwargs):
    """
    Inserts every item of the JSON dump at ``path`` into the ``tweets``
    collection of the database returned by ``mongodb(**kwargs)``.

    The whole dump is parsed into memory with ``json.load`` and the parsed
    items are then inserted one at a time.
    """
    # NOTE(review): Collection.insert is deprecated in newer PyMongo
    # releases -- confirm against the installed version.
    collection = mongodb(**kwargs).tweets

    with open(path, 'r') as handle:
        records = json.load(handle)

    for record in records:
        collection.insert(record)
########################################################################## | |
## Inspection Utility | |
########################################################################## | |
def inspect(**kwargs):
    """
    Reports how often each dotted key occurs across the ``tweets`` collection.

    Connects with ``mongodb(**kwargs)``, tallies the keys produced by
    ``dotkeys`` for every document, and returns a list of ``(key, percent)``
    tuples sorted by key. ``percent`` is the key's tally divided by the
    collection's document count, times 100.
    """
    collection = mongodb(**kwargs).tweets
    total = collection.count()

    tally = Counter()
    for document in collection.find():
        tally.update(dotkeys(document))

    rows = [
        (field, (float(hits) / total) * 100)
        for field, hits in tally.items()
    ]
    rows.sort(key=itemgetter(0))
    return rows
if __name__ == "__main__":
    # Print one line per dotted key: the percentage of tweets containing the
    # key (right-aligned, one decimal place) followed by the key itself.
    # print is called with a single parenthesized argument so the line
    # parses under both Python 2 (print statement) and Python 3 (function).
    for item in inspect():
        print("{1: >5.1f}%: {0}".format(*item))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100.0%: _id | |
100.0%: contributors | |
100.0%: coordinates | |
4.6%: coordinates.coordinates | |
4.6%: coordinates.type | |
100.0%: created_at | |
100.0%: entities | |
100.0%: entities.hashtags | |
9.9%: entities.media | |
100.0%: entities.symbols | |
100.0%: entities.urls | |
100.0%: entities.user_mentions | |
9.9%: extended_entities | |
9.9%: extended_entities.media | |
100.0%: favorite_count | |
100.0%: favorited | |
100.0%: geo | |
4.6%: geo.coordinates | |
4.6%: geo.type | |
100.0%: id | |
100.0%: id_str | |
100.0%: in_reply_to_screen_name | |
100.0%: in_reply_to_status_id | |
100.0%: in_reply_to_status_id_str | |
100.0%: in_reply_to_user_id | |
100.0%: in_reply_to_user_id_str | |
100.0%: lang | |
100.0%: place | |
4.9%: place.attributes | |
0.0%: place.attributes.street_address | |
4.9%: place.bounding_box | |
4.9%: place.bounding_box.coordinates | |
4.9%: place.bounding_box.type | |
4.9%: place.contained_within | |
4.9%: place.country | |
4.9%: place.country_code | |
4.9%: place.full_name | |
4.9%: place.id | |
4.9%: place.name | |
4.9%: place.place_type | |
4.9%: place.url | |
48.5%: possibly_sensitive | |
100.0%: retweet_count | |
100.0%: retweeted | |
26.6%: retweeted_status | |
26.6%: retweeted_status.contributors | |
26.6%: retweeted_status.coordinates | |
0.6%: retweeted_status.coordinates.coordinates | |
0.6%: retweeted_status.coordinates.type | |
26.6%: retweeted_status.created_at | |
26.6%: retweeted_status.entities | |
26.6%: retweeted_status.entities.hashtags | |
5.7%: retweeted_status.entities.media | |
26.6%: retweeted_status.entities.symbols | |
26.6%: retweeted_status.entities.urls | |
26.6%: retweeted_status.entities.user_mentions | |
5.7%: retweeted_status.extended_entities | |
5.7%: retweeted_status.extended_entities.media | |
26.6%: retweeted_status.favorite_count | |
26.6%: retweeted_status.favorited | |
26.6%: retweeted_status.geo | |
0.6%: retweeted_status.geo.coordinates | |
0.6%: retweeted_status.geo.type | |
26.6%: retweeted_status.id | |
26.6%: retweeted_status.id_str | |
26.6%: retweeted_status.in_reply_to_screen_name | |
26.6%: retweeted_status.in_reply_to_status_id | |
26.6%: retweeted_status.in_reply_to_status_id_str | |
26.6%: retweeted_status.in_reply_to_user_id | |
26.6%: retweeted_status.in_reply_to_user_id_str | |
26.6%: retweeted_status.lang | |
26.6%: retweeted_status.place | |
0.6%: retweeted_status.place.attributes | |
0.6%: retweeted_status.place.bounding_box | |
0.6%: retweeted_status.place.bounding_box.coordinates | |
0.6%: retweeted_status.place.bounding_box.type | |
0.6%: retweeted_status.place.contained_within | |
0.6%: retweeted_status.place.country | |
0.6%: retweeted_status.place.country_code | |
0.6%: retweeted_status.place.full_name | |
0.6%: retweeted_status.place.id | |
0.6%: retweeted_status.place.name | |
0.6%: retweeted_status.place.place_type | |
0.6%: retweeted_status.place.url | |
12.3%: retweeted_status.possibly_sensitive | |
26.6%: retweeted_status.retweet_count | |
26.6%: retweeted_status.retweeted | |
0.0%: retweeted_status.scopes | |
0.0%: retweeted_status.scopes.followers | |
0.0%: retweeted_status.scopes.place_ids | |
26.6%: retweeted_status.source | |
26.6%: retweeted_status.text | |
26.6%: retweeted_status.truncated | |
26.6%: retweeted_status.user | |
26.6%: retweeted_status.user.contributors_enabled | |
26.6%: retweeted_status.user.created_at | |
26.6%: retweeted_status.user.default_profile | |
26.6%: retweeted_status.user.default_profile_image | |
26.6%: retweeted_status.user.description | |
26.6%: retweeted_status.user.entities | |
26.6%: retweeted_status.user.entities.description | |
26.6%: retweeted_status.user.entities.description.urls | |
17.2%: retweeted_status.user.entities.url | |
17.2%: retweeted_status.user.entities.url.urls | |
26.6%: retweeted_status.user.favourites_count | |
26.6%: retweeted_status.user.follow_request_sent | |
26.6%: retweeted_status.user.followers_count | |
26.6%: retweeted_status.user.following | |
26.6%: retweeted_status.user.friends_count | |
26.6%: retweeted_status.user.geo_enabled | |
26.6%: retweeted_status.user.id | |
26.6%: retweeted_status.user.id_str | |
26.6%: retweeted_status.user.is_translation_enabled | |
26.6%: retweeted_status.user.is_translator | |
26.6%: retweeted_status.user.lang | |
26.6%: retweeted_status.user.listed_count | |
26.6%: retweeted_status.user.location | |
26.6%: retweeted_status.user.name | |
26.6%: retweeted_status.user.notifications | |
26.6%: retweeted_status.user.profile_background_color | |
26.6%: retweeted_status.user.profile_background_image_url | |
26.6%: retweeted_status.user.profile_background_image_url_https | |
26.6%: retweeted_status.user.profile_background_tile | |
21.6%: retweeted_status.user.profile_banner_url | |
26.6%: retweeted_status.user.profile_image_url | |
26.6%: retweeted_status.user.profile_image_url_https | |
26.6%: retweeted_status.user.profile_link_color | |
26.6%: retweeted_status.user.profile_sidebar_border_color | |
26.6%: retweeted_status.user.profile_sidebar_fill_color | |
26.6%: retweeted_status.user.profile_text_color | |
26.6%: retweeted_status.user.profile_use_background_image | |
26.6%: retweeted_status.user.protected | |
26.6%: retweeted_status.user.screen_name | |
26.6%: retweeted_status.user.statuses_count | |
26.6%: retweeted_status.user.time_zone | |
26.6%: retweeted_status.user.url | |
26.6%: retweeted_status.user.utc_offset | |
26.6%: retweeted_status.user.verified | |
100.0%: source | |
100.0%: text | |
100.0%: truncated | |
100.0%: user | |
100.0%: user.contributors_enabled | |
100.0%: user.created_at | |
100.0%: user.default_profile | |
100.0%: user.default_profile_image | |
100.0%: user.description | |
100.0%: user.entities | |
100.0%: user.entities.description | |
100.0%: user.entities.description.urls | |
49.0%: user.entities.url | |
49.0%: user.entities.url.urls | |
100.0%: user.favourites_count | |
100.0%: user.follow_request_sent | |
100.0%: user.followers_count | |
100.0%: user.following | |
100.0%: user.friends_count | |
100.0%: user.geo_enabled | |
100.0%: user.id | |
100.0%: user.id_str | |
100.0%: user.is_translation_enabled | |
100.0%: user.is_translator | |
100.0%: user.lang | |
100.0%: user.listed_count | |
100.0%: user.location | |
100.0%: user.name | |
100.0%: user.notifications | |
100.0%: user.profile_background_color | |
100.0%: user.profile_background_image_url | |
100.0%: user.profile_background_image_url_https | |
100.0%: user.profile_background_tile | |
66.5%: user.profile_banner_url | |
100.0%: user.profile_image_url | |
100.0%: user.profile_image_url_https | |
100.0%: user.profile_link_color | |
100.0%: user.profile_sidebar_border_color | |
100.0%: user.profile_sidebar_fill_color | |
100.0%: user.profile_text_color | |
100.0%: user.profile_use_background_image | |
100.0%: user.protected | |
100.0%: user.screen_name | |
100.0%: user.statuses_count | |
100.0%: user.time_zone | |
100.0%: user.url | |
100.0%: user.utc_offset | |
100.0%: user.verified |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment