tyleha · January 11, 2016 01:55
diff --git a/gmail_heatmap.py b/gmail_heatmap.py
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib.dates as dates
 import matplotlib.gridspec as gridspec
 from datetime import timedelta, datetime, date

 import GmailAccount # my package

 # Log in through the author's GmailAccount helper.
 # NOTE(review): `password` is not defined anywhere in the visible code; it
 # has to be supplied before this line runs.
 gmail = GmailAccount(password=password, username='[email protected]')
 gmail.login()

 # Search window, expressed in days counted back from today.
 daysback = 6000  # start of the window (6000 days is roughly 16 years); adjust freely
 notsince = 0  # end of the window; 0 means "up to today"
 since = (date.today() - timedelta(days=daysback)).strftime("%d-%b-%Y")
 before = (date.today() - timedelta(days=notsince)).strftime("%d-%b-%Y")

 # Query strings handed to the mail helper: the header fields to fetch for
 # each message, and the sent-date range that selects the messages.
 ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'
 SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(bf=before, si=since)

 # Run the search against the "All Mail" folder and fetch the results.
 received = gmail.load_parse_query(
     folder='"[Gmail]/All Mail"',
     search_query=SEARCH,
     fetch_query=ALL_HEADERS,
 )
                                  
                                  
 def scrub_email(headers):
     """Return one email's headers as a dict keyed by lowercased header name.

     IMAP sometimes returns header names with varying capitalization, so
     every name is lowercased to give all emails the same keys.

     headers: iterable of (name, value) pairs.
     Returns a dict; when a name occurs more than once the last value wins.
     """
     return {title.lower(): value for title, value in headers}

 # One row per fetched email, one column per (lowercased) header name.
 df = pd.DataFrame([scrub_email(message._headers) for message in received])

 # Parse date strings remaining naive across multiple timezones
 def try_parse_date(d):
     """Parse an email date string into a timezone-naive, local-time Timestamp.

     A date string without timezone information is assumed to be UTC (IMAP is
     not perfect; some emails carry no timezone). The instant is then
     converted to the wall-clock time of where the author lived: US/Eastern
     before 2010-09-01, US/Pacific from then on. Finally the timezone is
     dropped so that all timestamps can be compared as naive local times.

     d: the raw date value (normally a string).
     Returns a naive pd.Timestamp, or np.nan when the value cannot be parsed
     so that pandas can drop the email later.
     """
     try:
         ts = pd.Timestamp(d)
         if pd.isnull(ts):
             # Missing value (parsed to NaT): treat it like an unparseable date.
             return np.nan
         if ts.tz is None:
             ts = ts.tz_localize('UTC')
         if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
             ts = ts.tz_convert('US/Eastern')
         else:
             ts = ts.tz_convert('US/Pacific')
         # tz_localize(None) strips the timezone but keeps the local wall time.
         return ts.tz_localize(None)
     except (ValueError, TypeError, OverflowError):
         # Only parsing/conversion failures are swallowed; anything else
         # (e.g. KeyboardInterrupt or a programming error) still propagates.
         return np.nan

 df['timestamp'] = df.date.map(try_parse_date)
 # Remove any emails whose date could not be parsed (their timestamp is NaN).
 df = df.dropna(subset=['timestamp']).copy()
 # Guarantee a real datetime column so the index built below supports to_period().
 df['timestamp'] = pd.to_datetime(df['timestamp'])

 df['hour'] = df.timestamp.map(lambda x: x.hour)
 freq = 'M' # could also be 'W' (week) or 'D' (day), but month looks nice.
 # Index every email by the period it falls in; the timestamp column is kept as well.
 df = df.set_index('timestamp', drop=False)
 df.index = df.index.to_period(freq)


 mindate = df.timestamp.min()
 maxdate = df.timestamp.max()
 pr = pd.period_range(mindate, maxdate, freq=freq)
 # Initialize a new heatmap dataframe whose indices are Periods of time:
 # one row per period, one column per hour of the day (0-23), all zeros.
 hm = pd.DataFrame(np.zeros([len(pr), 24]), index=pr)

 for period in pr:
     # Periods without any email keep their row of zeros.
     if period in df.index:
         # Count this period's emails per hour of the day. The assignment
         # aligns the counts on hm's hour columns; hours that do not occur
         # in the counts become NaN. (.loc replaces the removed .ix indexer.)
         hm.loc[period] = df.loc[[period]].hour.value_counts()

 # Hours without email inside a populated period were set to NaN above;
 # turn them back into zeros.
 hm.fillna(0, inplace=True)


 ### Set up figure
 fig = plt.figure(figsize=(12,8))
 # 2x2 grid: a tall top row and a narrow right-hand column.
 gs = gridspec.GridSpec(2, 2, height_ratios=[4,1], width_ratios=[20,1],)
 gs.update(wspace=0.05)

 ### Plot our heatmap
 ax = plt.subplot(gs[0])
 # pcolor wants cell *edges*: one more x value than periods and one more y
 # value than hours. Without the closing x edge the last period is not drawn
 # (or pcolor rejects the dimensions).
 edges = [p.start_time.to_pydatetime() for p in pr]
 edges.append((pr[-1] + 1).start_time.to_pydatetime()) # add last fencepost
 x = dates.date2num(edges)
 t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
 t.append(datetime(2000, 1, 2, 0, 0, 0)) # add last fencepost
 y = dates.date2num(t)
 cm = plt.get_cmap('Oranges')
 # hm is (periods x hours); transpose so that hours run along the y axis.
 heat = plt.pcolor(x, y, hm.transpose().values, cmap=cm)

 ### Now format our axes to be human-readable
 ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
 ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
 ax.set_yticks(y[::2])
 # Tick every 12th period start; the closing edge is not a period start.
 ax.set_xticks(x[:-1:12])
 ax.set_xlim([x[0], x[-1]])
 ax.set_ylim([y[0], y[-1]])
 ax.tick_params(axis='x', pad=14, length=10, direction='inout')

 ### Add a color bar for the heatmap in the narrow top-right cell
 plt.colorbar(heat, cax=plt.subplot(gs[1]))

 ### Total email count per period, drawn in the bottom-left cell
 ax2 = plt.subplot(gs[2])
 total_email = df.groupby(level=0).hour.count()
 # Plot against plain matplotlib date numbers (the start of each period)
 # instead of handing a PeriodIndex to the deprecated plot_date().
 period_starts = dates.date2num(total_email.index.to_timestamp().to_pydatetime())
 ax2.plot(period_starts, total_email.values, '-', linewidth=1.5, color=cm(0.999))
 ax2.xaxis_date()
 ax2.fill_between(period_starts, 0, total_email.values, color=cm(0.5))

 # Ticks go on the top edge, every 12th period, without labels.
 ax2.xaxis.tick_top()
 ax2.set_xticks(period_starts[::12])
 ax2.xaxis.set_ticklabels([])
 ax2.tick_params(axis='x', pad=14, length=10, direction='inout')
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import matplotlib.dates as dates
	import matplotlib.gridspec as gridspec
	from datetime import timedelta, datetime, date

	import GmailAccount # my package

	# Log in through the author's GmailAccount helper.
	# NOTE(review): `password` is not defined anywhere in the visible code; it
	# has to be supplied before this line runs.
	gmail = GmailAccount(password=password, username='[email protected]')
	gmail.login()

	# Search window, expressed in days counted back from today.
	daysback = 6000  # start of the window (6000 days is roughly 16 years); adjust freely
	notsince = 0  # end of the window; 0 means "up to today"
	since = (date.today() - timedelta(days=daysback)).strftime("%d-%b-%Y")
	before = (date.today() - timedelta(days=notsince)).strftime("%d-%b-%Y")

	# Query strings handed to the mail helper: the header fields to fetch for
	# each message, and the sent-date range that selects the messages.
	ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'
	SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(bf=before, si=since)

	# Run the search against the "All Mail" folder and fetch the results.
	received = gmail.load_parse_query(
	    folder='"[Gmail]/All Mail"',
	    search_query=SEARCH,
	    fetch_query=ALL_HEADERS,
	)


	def scrub_email(headers):
	    """Return one email's headers as a dict keyed by lowercased header name.

	    IMAP sometimes returns header names with varying capitalization, so
	    every name is lowercased to give all emails the same keys.

	    headers: iterable of (name, value) pairs.
	    Returns a dict; when a name occurs more than once the last value wins.
	    """
	    return {title.lower(): value for title, value in headers}

	# One row per fetched email, one column per (lowercased) header name.
	df = pd.DataFrame([scrub_email(message._headers) for message in received])

	# Parse date strings remaining naive across multiple timezones
	def try_parse_date(d):
	    """Parse an email date string into a timezone-naive, local-time Timestamp.

	    A date string without timezone information is assumed to be UTC (IMAP is
	    not perfect; some emails carry no timezone). The instant is then
	    converted to the wall-clock time of where the author lived: US/Eastern
	    before 2010-09-01, US/Pacific from then on. Finally the timezone is
	    dropped so that all timestamps can be compared as naive local times.

	    d: the raw date value (normally a string).
	    Returns a naive pd.Timestamp, or np.nan when the value cannot be parsed
	    so that pandas can drop the email later.
	    """
	    try:
	        ts = pd.Timestamp(d)
	        if pd.isnull(ts):
	            # Missing value (parsed to NaT): treat it like an unparseable date.
	            return np.nan
	        if ts.tz is None:
	            ts = ts.tz_localize('UTC')
	        if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
	            ts = ts.tz_convert('US/Eastern')
	        else:
	            ts = ts.tz_convert('US/Pacific')
	        # tz_localize(None) strips the timezone but keeps the local wall time.
	        return ts.tz_localize(None)
	    except (ValueError, TypeError, OverflowError):
	        # Only parsing/conversion failures are swallowed; anything else
	        # (e.g. KeyboardInterrupt or a programming error) still propagates.
	        return np.nan

	df['timestamp'] = df.date.map(try_parse_date)
	# Remove any emails whose date could not be parsed (their timestamp is NaN).
	df = df.dropna(subset=['timestamp']).copy()
	# Guarantee a real datetime column so the index built below supports to_period().
	df['timestamp'] = pd.to_datetime(df['timestamp'])

	df['hour'] = df.timestamp.map(lambda x: x.hour)
	freq = 'M' # could also be 'W' (week) or 'D' (day), but month looks nice.
	# Index every email by the period it falls in; the timestamp column is kept as well.
	df = df.set_index('timestamp', drop=False)
	df.index = df.index.to_period(freq)


	mindate = df.timestamp.min()
	maxdate = df.timestamp.max()
	pr = pd.period_range(mindate, maxdate, freq=freq)
	# Initialize a new heatmap dataframe whose indices are Periods of time:
	# one row per period, one column per hour of the day (0-23), all zeros.
	hm = pd.DataFrame(np.zeros([len(pr), 24]), index=pr)

	for period in pr:
	    # Periods without any email keep their row of zeros.
	    if period in df.index:
	        # Count this period's emails per hour of the day. The assignment
	        # aligns the counts on hm's hour columns; hours that do not occur
	        # in the counts become NaN. (.loc replaces the removed .ix indexer.)
	        hm.loc[period] = df.loc[[period]].hour.value_counts()

	# Hours without email inside a populated period were set to NaN above;
	# turn them back into zeros.
	hm.fillna(0, inplace=True)


	### Set up figure
	fig = plt.figure(figsize=(12,8))
	# 2x2 grid: a tall top row and a narrow right-hand column.
	gs = gridspec.GridSpec(2, 2, height_ratios=[4,1], width_ratios=[20,1],)
	gs.update(wspace=0.05)

	### Plot our heatmap
	ax = plt.subplot(gs[0])
	# pcolor wants cell *edges*: one more x value than periods and one more y
	# value than hours. Without the closing x edge the last period is not drawn
	# (or pcolor rejects the dimensions).
	edges = [p.start_time.to_pydatetime() for p in pr]
	edges.append((pr[-1] + 1).start_time.to_pydatetime()) # add last fencepost
	x = dates.date2num(edges)
	t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
	t.append(datetime(2000, 1, 2, 0, 0, 0)) # add last fencepost
	y = dates.date2num(t)
	cm = plt.get_cmap('Oranges')
	# hm is (periods x hours); transpose so that hours run along the y axis.
	heat = plt.pcolor(x, y, hm.transpose().values, cmap=cm)

	### Now format our axes to be human-readable
	ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
	ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
	ax.set_yticks(y[::2])
	# Tick every 12th period start; the closing edge is not a period start.
	ax.set_xticks(x[:-1:12])
	ax.set_xlim([x[0], x[-1]])
	ax.set_ylim([y[0], y[-1]])
	ax.tick_params(axis='x', pad=14, length=10, direction='inout')

	### Add a color bar for the heatmap in the narrow top-right cell
	plt.colorbar(heat, cax=plt.subplot(gs[1]))

	### Total email count per period, drawn in the bottom-left cell
	ax2 = plt.subplot(gs[2])
	total_email = df.groupby(level=0).hour.count()
	# Plot against plain matplotlib date numbers (the start of each period)
	# instead of handing a PeriodIndex to the deprecated plot_date().
	period_starts = dates.date2num(total_email.index.to_timestamp().to_pydatetime())
	ax2.plot(period_starts, total_email.values, '-', linewidth=1.5, color=cm(0.999))
	ax2.xaxis_date()
	ax2.fill_between(period_starts, 0, total_email.values, color=cm(0.5))

	# Ticks go on the top edge, every 12th period, without labels.
	ax2.xaxis.tick_top()
	ax2.set_xticks(period_starts[::12])
	ax2.xaxis.set_ticklabels([])
	ax2.tick_params(axis='x', pad=14, length=10, direction='inout')