|  | import numpy as np | 
        
          |  | import matplotlib.pyplot as plt | 
        
          |  | import matplotlib.dates as mdates | 
        
          |  | import pandas as pd | 
        
          |  | from datetime import datetime, timedelta | 
        
          |  | import re | 
        
          |  | import sys | 
        
          |  |  | 
        
          |  | def read_jrnl_sport(jrnlfile, colormap='Pastel1'): | 
        
          |  | # convert datafile to Pandas DataFrame | 
        
          |  | df = pd.read_csv(jrnlfile, sep='@', names=['tmp', 'sport']) | 
        
          |  |  | 
        
          |  | # extract date and copy to `date` column | 
        
          |  | temp = df.tmp.str.extract('.*\[(.*)\].*') | 
        
          |  | df['date'] = temp.astype('datetime64[ns]') | 
        
          |  |  | 
        
          |  | # set date as index | 
        
          |  | df.set_index('date', inplace=True) | 
        
          |  |  | 
        
          |  | # extract description and copy to `desc` column | 
        
          |  | df['desc'] = df['tmp'].str.slice(13) | 
        
          |  |  | 
        
          |  | # set `sport` column to categorical | 
        
          |  | df['sport'] = df.sport.astype('category') | 
        
          |  |  | 
        
          |  | # make `sport_id` column with numerical ids | 
        
          |  | # corresponding to categories | 
        
          |  | df['sport_id'] = df.sport.cat.codes | 
        
          |  |  | 
        
          |  | # assign one color to each sport activity | 
        
          |  | allsports = df.sport.cat.categories | 
        
          |  | list_of_colors = plt.cm.get_cmap(colormap).colors | 
        
          |  | cdict = dict(zip(np.unique(allsports), list_of_colors)) | 
        
          |  |  | 
        
          |  | # remove `tmp` column | 
        
          |  | return df.drop(['tmp'], axis=1), cdict | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def select_period(df, year=None, date=None, days=30, forward=False): | 
        
          |  | ''' | 
        
          |  | Select period either by specifying a single year | 
        
          |  | or a specific date with an interval in days. | 
        
          |  |  | 
        
          |  | > df_sel = select_period(df, year=2021) | 
        
          |  |  | 
        
          |  | Select the last 30 days starting now: | 
        
          |  | > df_sel = select_period(df, days=30) | 
        
          |  |  | 
        
          |  | Select the last 30 days starting on a specific date: | 
        
          |  | > df_sel = select_period(df, date='2020-06-19', days=30) | 
        
          |  | ''' | 
        
          |  | if year is not None: | 
        
          |  | mask = df.index.year == year | 
        
          |  | else: | 
        
          |  | if date is None: | 
        
          |  | date = datetime.now() | 
        
          |  | else: | 
        
          |  | date = datetime.strptime(date, '%Y-%m-%d') | 
        
          |  | if forward: | 
        
          |  | start_date = date | 
        
          |  | end_date = date + timedelta(days=days) | 
        
          |  | else: | 
        
          |  | start_date = date - timedelta(days=days) | 
        
          |  | end_date = date | 
        
          |  | mask = (df.index >= start_date) & (df.index <= end_date) | 
        
          |  | return df.loc[mask] | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def get_runs(df): | 
        
          |  | #---------------------------------- | 
        
          |  | # FIRST, get trail runs | 
        
          |  | trails = df[df.sport=='TRAIL'] | 
        
          |  |  | 
        
          |  | # extracts distances from `desc` column for trail runs | 
        
          |  | kilometers = [] | 
        
          |  | for i, r in trails.iterrows(): | 
        
          |  | kk = re.search(r'([0-9]*[.])?[0-9]+km', r.desc) | 
        
          |  | if kk is not None: | 
        
          |  | kilometers.append(kk.group().replace('km','')) | 
        
          |  | else: | 
        
          |  | kilometers.append(None) | 
        
          |  | kkarr = np.asarray(kilometers, dtype=np.float) | 
        
          |  | trails = trails.assign(distance=kkarr) | 
        
          |  |  | 
        
          |  | #---------------------------------- | 
        
          |  | # SECOND, get normal runs | 
        
          |  | runs = df[df.sport=='CORSA'] | 
        
          |  |  | 
        
          |  | # extracts time and distances from `desc` column for runs | 
        
          |  | minutes = [] | 
        
          |  | kilometers = [] | 
        
          |  | for i, r in runs.iterrows(): | 
        
          |  | mm = re.search(r'([0-9]*[.])?[0-9]+m', r.desc) | 
        
          |  | kk = re.search(r'([0-9]*[.])?[0-9]+km', r.desc) | 
        
          |  | if mm is not None: | 
        
          |  | minutes.append(mm.group().replace('m','')) | 
        
          |  | else: | 
        
          |  | minutes.append(None) | 
        
          |  | if kk is not None: | 
        
          |  | kilometers.append(kk.group().replace('km','')) | 
        
          |  | else: | 
        
          |  | kilometers.append(None) | 
        
          |  | mmarr = np.asarray(minutes, dtype=np.float) | 
        
          |  | kkarr = np.asarray(kilometers, dtype=np.float) | 
        
          |  | runs = runs.assign(distance=kkarr) | 
        
          |  | runs = runs.assign(time=mmarr) | 
        
          |  |  | 
        
          |  | # set up some filters | 
        
          |  | nodist = pd.isna(runs.distance) | 
        
          |  | notime = pd.isna(runs.time) | 
        
          |  | both = ~notime & ~nodist | 
        
          |  |  | 
        
          |  | # calculate speed for each activity | 
        
          |  | # it will be NaN where either speed or distance is not registered | 
        
          |  | runs['speed'] = runs['time'] / runs['distance'] | 
        
          |  |  | 
        
          |  | # calculate avg speed in minutes/km | 
        
          |  | avgspeed = runs['speed'].mean() | 
        
          |  |  | 
        
          |  | # fill in runs where only distance has been recorded | 
        
          |  | # time is based on average speed, and speed = average speed | 
        
          |  | onlydist = notime & ~nodist | 
        
          |  | runs.loc[onlydist, 'time'] = runs[onlydist]['distance'] * avgspeed | 
        
          |  | runs.loc[onlydist, 'speed'] = avgspeed | 
        
          |  |  | 
        
          |  | # fill in runs where only time has been recorded | 
        
          |  | # time is based on average speed, and speed = average speed | 
        
          |  | onlytime = ~notime & nodist | 
        
          |  | runs.loc[onlytime, 'distance'] = runs[onlytime]['time'] / avgspeed | 
        
          |  | runs.loc[onlytime, 'speed'] = avgspeed | 
        
          |  |  | 
        
          |  | #---------------------------------- | 
        
          |  | # LAST, append trails to the end of runs | 
        
          |  | # speed and time is of no interest for trail runs | 
        
          |  | # sorted on dates | 
        
          |  | return runs.append(trails).sort_index() | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def plot_sportpie(df, cdict, ax=None): | 
        
          |  | def label_pie_slice(val): | 
        
          |  | return f'{val/100*len(df):.0f} ({val:.0f}%)' | 
        
          |  |  | 
        
          |  | grouped = df.groupby(df['sport'].cat.remove_unused_categories(), sort=False) | 
        
          |  | sports = grouped['sport'].count().index.values.to_list() | 
        
          |  |  | 
        
          |  | date_min = df.index.min().strftime('%Y-%m-%d') | 
        
          |  | date_max = df.index.max().strftime('%Y-%m-%d') | 
        
          |  | if ax is None: | 
        
          |  | f, ax = plt.subplots(constrained_layout=True, num=1) | 
        
          |  | grouped.size().plot(kind='pie', autopct=label_pie_slice, | 
        
          |  | colors=[cdict[key] for key in sports], ax=ax) | 
        
          |  | ax.set_title('START: {}\nEND: {}'.format(date_min, date_max), fontsize='x-large') | 
        
          |  | ax.set_ylabel('') | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def plot_sportbar(df, cdict, ax=None): | 
        
          |  | date_min = df.index.min().strftime('%Y-%m-%d') | 
        
          |  | date_max = df.index.max().strftime('%Y-%m-%d') | 
        
          |  | sports = df['sport'].cat.categories.to_list() | 
        
          |  |  | 
        
          |  | if ax is None: | 
        
          |  | f, ax = plt.subplots(constrained_layout=True, num=2) | 
        
          |  | df['sport'].value_counts(sort=False).plot(kind='bar', rot=45, ax=ax, | 
        
          |  | color=[cdict[key] for key in sports]) | 
        
          |  | ax.set_title('START: {}\nEND: {}'.format(date_min, date_max), fontsize='x-large') | 
        
          |  | ax.set_ylabel('Number of activities') | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def plot_runs_hist(df, ax=None): | 
        
          |  | # calculate statistics | 
        
          |  | mean = df.distance.mean() | 
        
          |  | p25 = df.distance.describe().loc['25%'] | 
        
          |  | p75 = df.distance.describe().loc['75%'] | 
        
          |  |  | 
        
          |  | # selects only trail runs | 
        
          |  | tt = df[df.sport=='TRAIL'] | 
        
          |  |  | 
        
          |  | # set plot options | 
        
          |  | opt = dict(lw=2, alpha=0.5) | 
        
          |  | opt_tr = dict(marker=2, color='g', ls='none', ms=10, label='Trail Runs') | 
        
          |  |  | 
        
          |  | # make plot | 
        
          |  | if ax is None: | 
        
          |  | _, ax = plt.subplots(constrained_layout=True, figsize=(8,4), num=3) | 
        
          |  | df.distance.plot.hist(bins=50, color='k', alpha=0.25, ax=ax, label='') | 
        
          |  | ax.axvline(mean, color='r', ls='--', label='Mean: {:.1f} km'.format(mean), **opt) | 
        
          |  | ax.axvline(p25, color='r', ls=':', label='P25: {:.1f} km'.format(p25), **opt) | 
        
          |  | ax.axvline(p75, color='r', ls=':', label='P75: {:.1f} km'.format(p75), **opt) | 
        
          |  | if tt.shape[0] > 0: | 
        
          |  | ax.plot(tt.distance, np.zeros(tt.shape[0]), **opt_tr) | 
        
          |  | ax.legend() | 
        
          |  | ax.set_xlabel('Distance (km)') | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def plot_runs_stats(df): | 
        
          |  | # trim input dataframe to first sunday | 
        
          |  | nn = df.index.day_name()=='Sunday' | 
        
          |  | start_weekly_stats = df[nn].iloc[[0]].index.to_pydatetime()[0] | 
        
          |  | mm = df.index >= start_weekly_stats | 
        
          |  | weekly_runs = df[mm].resample('W', closed='left').sum() | 
        
          |  | avg_weekd = weekly_runs.distance.mean() | 
        
          |  | max_weekd = weekly_runs.distance.max() | 
        
          |  |  | 
        
          |  | # calculate stats | 
        
          |  | runs = df.shape[0] | 
        
          |  | weeks = (df.index[-1]-df.index[0]).days//7 | 
        
          |  | freq = runs/weeks | 
        
          |  | avgdst = df.describe().loc['mean', 'distance'] | 
        
          |  | pb_dst = df.describe().loc['max', 'distance'] | 
        
          |  | totdst = df['distance'].sum() | 
        
          |  | tmp0 = df.describe().loc['mean', 'speed'] | 
        
          |  | avgs_min = int(tmp0) | 
        
          |  | avgs_sec = np.round((tmp0 - int(tmp0))*60) | 
        
          |  | avgspd = "{:.0f}'{:.0f}\"".format(avgs_min, avgs_sec) | 
        
          |  |  | 
        
          |  | tmp1 = df.describe().loc['min', 'speed'] | 
        
          |  | pbs_min = int(tmp1) | 
        
          |  | pbs_sec = np.round((tmp1 - int(tmp1))*60) | 
        
          |  | pb_spd = "{:.0f}'{:.0f}\"".format(pbs_min, pbs_sec) | 
        
          |  |  | 
        
          |  |  | 
        
          |  | textstr = f''' | 
        
          |  | Runs: {runs} | 
        
          |  | Frequency: {freq:.1f} runs/week | 
        
          |  | Avg distance: {avgdst:.1f} km | 
        
          |  | Avg speed: {avgspd} min/km | 
        
          |  | PB distance: {pb_dst} km | 
        
          |  | PB speed: {pb_spd} mins/km | 
        
          |  | Total distance: {totdst:.0f} km | 
        
          |  |  | 
        
          |  | Avg weekly: {avg_weekd:.1f} km | 
        
          |  | Max weekly: {max_weekd:.1f} km | 
        
          |  | ''' | 
        
          |  |  | 
        
          |  | opt = dict(color='k', marker='_', ls='none') | 
        
          |  | f, ax = plt.subplots(nrows=3, num=4, | 
        
          |  | sharex=True, figsize=(10, 6)) | 
        
          |  | # first subplot: TIME | 
        
          |  | ax[0].bar(df.index, height=df.time, width=1.5, alpha=0.2, color='b') | 
        
          |  | ax[0].set_ylabel('Time (min)', color='b') | 
        
          |  | ax[0].tick_params(axis='y', colors='b') | 
        
          |  | axwt = ax[0].twinx() | 
        
          |  | axwt.plot(weekly_runs.index, weekly_runs.time, **opt) | 
        
          |  | axwt.set_ylabel('Weekly Avg', color='k') | 
        
          |  | axwt.tick_params(axis='y', colors='k') | 
        
          |  |  | 
        
          |  | # second subplot: DISTANCES | 
        
          |  | ax[1].bar(df.index, height=df.distance, width=1.5, alpha=0.2, color='g') | 
        
          |  | ax[1].set_ylabel('Distance (km)', color='g') | 
        
          |  | ax[1].tick_params(axis='y', colors='g') | 
        
          |  | axwd = ax[1].twinx() | 
        
          |  | axwd.plot(weekly_runs.index, weekly_runs.distance, **opt) | 
        
          |  | axwd.set_ylabel('Weekly Avg', color='k') | 
        
          |  | axwd.tick_params(axis='y', colors='k') | 
        
          |  |  | 
        
          |  | # third subplot: SPEED | 
        
          |  | ax[2].bar(df.index, height=df.speed, width=1.5, alpha=0.2, color='r') | 
        
          |  | ax[2].plot(df.rolling('7d').mean().speed, '-r') | 
        
          |  | ax[2].set_ylabel('Speed (min/km)', color='r') | 
        
          |  | ax[2].tick_params(axis='y', colors='r') | 
        
          |  | ax[2].tick_params(axis='x', rotation=45) | 
        
          |  | ax[2].set_ylim(np.floor(df.speed.min()), np.ceil(df.speed.max())) | 
        
          |  | # turn on horizontal gridlines | 
        
          |  | for aa in ax: | 
        
          |  | aa.grid(axis='y') | 
        
          |  |  | 
        
          |  | # add stats | 
        
          |  | props = dict(boxstyle='round', facecolor='white', alpha=0.7) | 
        
          |  | plt.figtext(0.78, 0.5, textstr, va='center', bbox=props) | 
        
          |  | plt.subplots_adjust(right=0.7, left=0.1, top=0.95, bottom=0.1) | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def make_plots(df, cdict): | 
        
          |  | plot_sportpie(df, cdict) | 
        
          |  | plot_sportbar(df, cdict) | 
        
          |  | runs = get_runs(df) | 
        
          |  | plot_runs_hist(runs) | 
        
          |  | plot_runs_stats(runs) | 
        
          |  | plt.show() | 
        
          |  |  | 
        
          |  |  | 
        
          |  | if len(sys.argv) == 1: | 
        
          |  | print('Usage: python', sys.argv[0], '[year YYYY | days DD | all]') | 
        
          |  | sys.exit(1) | 
        
          |  |  | 
        
          |  | if sys.argv[1]=='year': | 
        
          |  | df, cdict = read_jrnl_sport('~/GOOGLEDRIVE/Apps/journal_sport.txt') | 
        
          |  | selected_period = select_period(df, year=int(sys.argv[2])) | 
        
          |  | make_plots(selected_period, cdict) | 
        
          |  |  | 
        
          |  | elif sys.argv[1]=='days': | 
        
          |  | df, cdict = read_jrnl_sport('~/GOOGLEDRIVE/Apps/journal_sport.txt') | 
        
          |  | selected_period = select_period(df, days=int(sys.argv[2])) | 
        
          |  | make_plots(selected_period, cdict) | 
        
          |  |  | 
        
          |  | elif sys.argv[1]=='all': | 
        
          |  | df, cdict = read_jrnl_sport('~/GOOGLEDRIVE/Apps/journal_sport.txt') | 
        
          |  | make_plots(df, cdict) | 
  
Examples of output plots
Pie chart with all activities in selected period:
Histogram of running distances:
Duration, distance and speed of runs; the grey markers on the duration and distance plots mark weekly totals, while the continuous red line on the speed plot is a rolling average over a 7-day window:
Distances are in kilometers and speeds in minutes per kilometer, because that's what I'm used to.