|
#!/usr/bin/env python |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
|
|
# Column layout of the tab-separated export, which carries no header row.
# The tr_/ar_/al_ prefixes presumably stand for track / artist / album.
column_names = [
    'date', 'tr_name', 'ar_name', 'al_name',
    'tr_duration', 'tr_listeners',
    'tr_playcount',
    'tr_t0', 'tr_t1', 'tr_t2', 'tr_t3', 'tr_t4',
    'tr_mbid', 'ar_yf', 'ar_yt',
    'ar_ctry', 'ar_listeners', 'ar_playcount',
    'ar_t0', 'ar_t1', 'ar_t2', 'ar_t3', 'ar_t4',
    'ar_mbid',
    'al_releasedate', 'al_playcount',
    'al_listeners', 'al_mbid',
]

# Alternative input (gzipped export), kept for reference:
#   pd.read_csv('./exported_tracks.new.txt.gz', compression='gzip', ...)
df = pd.read_csv(
    './exported_tracks.txt',
    sep='\t',
    header=None,
    names=column_names,
    parse_dates=False,
)

# Move the 'date' column (read as seconds since the epoch) into the index.
epoch_seconds = df.pop('date')
df.index = pd.to_datetime(epoch_seconds, unit='s')


def _scale_duration(raw):
    """Divide by 60, then by 1000 (presumably milliseconds -> minutes)."""
    return raw / 60 / 1000


def _blank_to_nan(value):
    """Map the single-space placeholder to NaN; pass anything else through."""
    if value == ' ':
        return np.nan
    return value


# Applied element-wise so the arithmetic matches the scalar expression exactly.
df['tr_duration'] = df['tr_duration'].apply(_scale_duration)

# Blank release dates become NaN first so that the strict format parse below
# only ever sees real date strings (note the leading space in the format).
release_dates = df['al_releasedate'].apply(_blank_to_nan)
df['al_releasedate'] = pd.to_datetime(release_dates, format=' %d %b %Y, %H:%M')
|
|
|
def _daily_top_tag_counts(frame, tag_column, top_n=30):
    """Return per-day counts of the most frequent values in `tag_column`.

    `frame` must have a datetime index; rows are grouped by calendar day.
    For each day only the `top_n` most frequent values are kept.

    Returns a DataFrame with one row per day and one column per tag value;
    (day, tag) pairs that did not make a day's top list are 0.
    """
    # pd.Grouper(freq=...) replaces pd.TimeGrouper, which newer pandas
    # releases no longer provide.
    per_day = frame.groupby(pd.Grouper(freq='D'))[tag_column]
    top_counts = per_day.apply(lambda tags: tags.value_counts().head(top_n))
    return top_counts.unstack(level=1).fillna(0)


# Sum the daily counts over the five artist tag slots (ar_t0 .. ar_t4).
# fill_value=0 treats a (day, tag) cell missing from one operand as zero;
# cells missing from both operands stay NaN.
b = _daily_top_tag_counts(df, 'ar_t0')
for tag_column in ('ar_t1', 'ar_t2', 'ar_t3', 'ar_t4'):
    b = b.add(_daily_top_tag_counts(df, tag_column), fill_value=0)

# Transpose so that rows are tags and columns are days.
b_t = b.T

# Rank tags by their total over all days and keep the 30 largest, in
# ascending order. The helper column 'svalue' is dropped from the plot frame
# but remains on b_t.
b_t['svalue'] = b_t.sum(axis=1).values
# sort_values(by=...) replaces the deprecated sort_index(by=...).
b_t_plot = b_t.sort_values(by='svalue', ascending=True).drop('svalue', axis=1)[-30:]
|
|
|
# Render b_t_plot (rows = tags, columns = days) as a series of heatmap PNGs.
# A 30-column window slides over the columns in steps of 5; each window is
# written to './beat<N>.png', with N counting up from 0.
current_plot = 0

cols_max = len(b_t_plot.columns)

# Shared colour ceiling so every image uses the same scale. It does not
# depend on the window, so it is computed once instead of per iteration.
colour_max = b_t_plot.max().max() / 4

for slice_value_tip in range(0, cols_max, 5):
    slice_value_top = slice_value_tip + 30
    df_temp = b_t_plot.iloc[:, slice_value_tip:slice_value_top]
    # Progress output: window start, window end, total number of columns.
    print('%s %s %s' % (slice_value_tip, slice_value_top, cols_max))

    fig, ax = plt.subplots()
    fig.set_size_inches(7, 7)

    ax.pcolor(df_temp,
              cmap=plt.cm.YlGn,
              alpha=0.8,
              vmin=0,
              vmax=colour_max)

    # Turn off the frame.
    ax.set_frame_on(False)

    # x ticks sit at the centre of each column. y ticks sit 0.1 into each
    # row, and their labels are top-aligned (va='top' below).
    ax.set_yticks(np.arange(df_temp.shape[0]) + 0.1, minor=False)
    ax.set_xticks(np.arange(df_temp.shape[1]) + 0.5, minor=False)

    # Table-like display: first row at the top, column labels above the grid.
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # Columns are labelled with the year of their date, rows with the tag.
    ax.set_xticklabels(df_temp.columns.map(lambda x: x.year),
                       minor=False, size='small')
    ax.set_yticklabels(df_temp.index, minor=False, va='top', size='small')

    ax.grid(False)

    # Rotate the column labels.
    plt.xticks(rotation=90)

    # Hide the tick marks on all four sides while keeping the labels.
    # tick_params is used instead of the deprecated Tick.tick1On / tick2On
    # attributes, which recent matplotlib versions no longer honour.
    ax.tick_params(axis='both', which='major',
                   bottom=False, top=False, left=False, right=False)

    fig.savefig('./beat' + str(current_plot) + '.png')
    current_plot = current_plot + 1

    # Close the figure rather than just clearing it: plt.subplots() opens a
    # new figure every iteration, and a cleared figure stays registered with
    # pyplot, so memory would grow with the number of plots.
    plt.close(fig)