Created
February 8, 2014 01:51
-
-
Save ptwobrussell/8875470 to your computer and use it in GitHub Desktop.
Improvements to Example 6-13 that use regular expressions to search by email address, rather than requiring an exact string match on the From: field of a JSONified mbox.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import pymongo # pip install pymongo | |
from bson import json_util # Comes with pymongo | |
import re | |
# The sender to search for. A bare address is enough here
# (as opposed to a full header value like "Coursera <[email protected]>").
FROM = "[email protected]"

# Connect using PyMongo's default client settings, then pick out the
# "mbox" collection of the "enron" database.
client = pymongo.MongoClient()
db = client["enron"]
mbox = db["mbox"]
# Get the recipient lists for each message sent from FROM.
#
# FROM is escaped so that it is matched literally: left unescaped, the "."
# in an address is a regex wildcard, and characters such as "+" or "[" would
# change the meaning of the pattern. No surrounding ".*" is needed because
# a MongoDB regex match is unanchored.
#
# Several distinct From: header values can match the pattern (for example a
# bare address and a "Name <address>" form), so all matching messages are
# grouped under one constant _id. Grouping by "$From" and keeping only the
# first group would silently drop every other variant.
_sender_groups = db.mbox.aggregate([
    {"$match": {"From": re.compile(re.escape(FROM), re.IGNORECASE)}},
    {"$project": {"From": 1, "To": 1}},
    {"$group": {"_id": None, "recipients": {"$addToSet": "$To"}}},
])

# PyMongo 2.x returns a dict with a 'result' key; PyMongo 3+ returns a
# cursor over the result documents. Normalize both to a plain list.
if isinstance(_sender_groups, dict):
    _sender_groups = _sender_groups['result']
else:
    _sender_groups = list(_sender_groups)

# When no message matches, $group emits nothing; fall back to an empty list
# instead of raising IndexError.
recipients_per_message = \
    _sender_groups[0]['recipients'] if _sender_groups else []
# Flatten the lists of recipients into one flat list
all_recipients = []
for to_list in recipients_per_message:
    all_recipients.extend(to_list)
# Count the recipients in each list and order the counts ascending
recipients_per_message_totals = sorted(map(len, recipients_per_message))
# Demonstrate how to use $unwind followed by $group to collapse
# the recipient lists into a single list (with no duplicates
# per the $addToSet operator).
#
# FROM is escaped so that it is matched literally (an unescaped "." would
# act as a regex wildcard). No surrounding ".*" is needed because a MongoDB
# regex match is unanchored.
#
# The group _id is the constant string "From" (no "$" prefix), so every
# unwound recipient lands in one group regardless of how the From: header
# was spelled.
_unwound_groups = db.mbox.aggregate([
    {"$match": {"From": re.compile(re.escape(FROM), re.IGNORECASE)}},
    {"$project": {"From": 1, "To": 1}},
    {"$unwind": "$To"},
    {"$group": {"_id": "From", "recipients": {"$addToSet": "$To"}}},
])

# PyMongo 2.x returns a dict with a 'result' key; PyMongo 3+ returns a
# cursor over the result documents. Normalize both to a plain list.
if isinstance(_unwound_groups, dict):
    _unwound_groups = _unwound_groups['result']
else:
    _unwound_groups = list(_unwound_groups)

# When no message matches, $group emits nothing; fall back to an empty list
# instead of raising IndexError.
unique_recipients = \
    _unwound_groups[0]['recipients'] if _unwound_groups else []
# Report the results. Each call passes one pre-formatted value to print(...),
# which is valid both as a Python 2 print statement and as a Python 3
# function call, and produces the same text as the original
# comma-separated print statements.
print(all_recipients)
print("Num total recipients on all messages: {0}".format(len(all_recipients)))
print("Num recipients for each message: {0}".format(recipients_per_message_totals))
print("Num unique recipients {0}".format(len(unique_recipients)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment