Skip to content

Instantly share code, notes, and snippets.

@raggleton
Last active April 13, 2016 17:29
Show Gist options
  • Save raggleton/a7566f07beb6f9964b82a24b0edea48b to your computer and use it in GitHub Desktop.
Save raggleton/a7566f07beb6f9964b82a24b0edea48b to your computer and use it in GitHub Desktop.
Change MAIN_TEX_FILE and START_DATE to suit yourself
#!/usr/bin/env python
"""
Go through TeX files and count words, plot things.
TODO:
- only count commits where tex file changed?
"""
import os
import numpy as np
import matplotlib.pyplot as plt
from subprocess import check_output, check_call
from itertools import izip
import datetime
# import matplotlib.dates as mdates
from sys import platform as _platform
# Top-level TeX file passed to texcount by the script body below.
MAIN_TEX_FILE = 'thesis.tex'
# Only commits with a timestamp later than this date are plotted.
START_DATE = datetime.datetime(2016, 3, 29)
def get_wordcount(main_file, include_bib=True):
    """Return the word count texcount reports for a TeX document.

    texcount is invoked with ``-1 -sum -inc`` on `main_file`, and its output
    is parsed as a single integer.

    main_file: path to the top-level TeX file.
    include_bib: when true, ``-incbib`` is added so that the bibliography is
        included in the count.

    Raises subprocess.CalledProcessError if texcount exits with a non-zero
    status, and ValueError if its output cannot be parsed as an integer.
    """
    cmds = ['texcount', '-1', '-sum', '-inc']
    if include_bib:
        cmds.append('-incbib')
    # Nothing is appended when the flag is off: an empty-string placeholder
    # would reach texcount as an extra, non-option argument.
    cmds.append(main_file)
    return int(check_output(cmds).strip())
def get_git_username():
    """Return the ``user.name`` value from git config, whitespace-stripped."""
    config_cmd = ['git', 'config', '--get', 'user.name']
    raw_name = check_output(config_cmd)
    return raw_name.strip()
def get_git_current_hash(short=False):
    """Return the hash of HEAD; the abbreviated form when `short` is true."""
    rev_parse_cmd = ['git', 'rev-parse']
    if short:
        rev_parse_cmd.append('--short')
    rev_parse_cmd.append('HEAD')
    return check_output(rev_parse_cmd).strip()
def get_git_commit_hashes(author, short=False):
    """Return the hashes of all commits by `author`, oldest first.

    author: value passed to ``git log --author``.
    short: when true, abbreviated hashes (``%h``) are returned instead of
        full ones (``%H``).
    """
    if short:
        hash_fmt = r'%h'
    else:
        hash_fmt = r'%H'
    pretty_opt = '--pretty=format:"%s"' % hash_fmt
    author_opt = '--author=%s' % author
    log_output = check_output(['git', 'log', pretty_opt, author_opt, '--reverse'])

    # The format string wraps every hash in literal double quotes; drop them.
    commit_hashes = []
    for line in log_output.splitlines():
        commit_hashes.append(line.strip('"'))
    return commit_hashes
def get_git_commit_timestamp(commit_hash):
    """Return the ``%ct`` timestamp git reports for `commit_hash`, as a string."""
    show_cmd = ['git', 'show', '-s', '--pretty=format:"%ct"', commit_hash]
    quoted_timestamp = check_output(show_cmd)
    # The format string wraps the value in literal double quotes; drop them.
    return quoted_timestamp.strip('"')
def _git_current_ref():
    """Return the checked-out branch name, or the commit hash if HEAD is detached."""
    ref = check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
    if ref == 'HEAD':
        # Detached HEAD: there is no branch name to return to, use the hash.
        ref = check_output(['git', 'rev-parse', 'HEAD']).strip()
    return ref


def _git_stash_count():
    """Return the number of entries currently on the git stash stack."""
    return len(check_output(['git', 'stash', 'list']).splitlines())


def get_wordcount_history(main_file, last_recorded_hash=None):
    """Return lists of commit hashes, timestamps, and wordcounts.

    Each commit by the current git user is checked out in turn (oldest first)
    and the word count of `main_file` is measured at that commit. Uncommitted
    changes are stashed beforehand; afterwards the original branch is checked
    out again and the stash is restored.

    main_file: top-level TeX file handed to get_wordcount.
    last_recorded_hash: if given and present in the history, only commits
        after it are processed; if it is not in the history, all commits are
        processed.

    Returns a tuple ``(hashes, timestamps, wordcounts)`` of equal-length lists.
    Raises subprocess.CalledProcessError if any git or texcount call fails.
    """
    # Ask git for the current ref directly: parsing `git branch` output only
    # works when the current branch happens to be listed first.
    curr_ref = _git_current_ref()

    # `git stash` exits successfully even when there is nothing to save, so
    # compare the stack size to find out whether an entry was really created.
    stashes_before = _git_stash_count()
    check_call(['git', 'stash'])  # stash current changes
    stash_created = _git_stash_count() > stashes_before

    try:
        hashes = get_git_commit_hashes(get_git_username())  # commits to checkout

        # figure out which commits we need to loop over
        if last_recorded_hash:
            if last_recorded_hash in hashes:
                hashes = hashes[hashes.index(last_recorded_hash) + 1:]
            else:
                print('last_recorded_hash not in hash history, doing all commits')

        # checkout each, get wordcount & timestamp for that commit
        timestamps, wordcounts = [], []
        for ghash in hashes:
            check_call(['git', 'checkout', '-q', ghash])
            timestamps.append(get_git_commit_timestamp(ghash))
            wordcounts.append(get_wordcount(main_file))
            print('hash: %s timestamp: %s wordcount: %s'
                  % (ghash, timestamps[-1], wordcounts[-1]))
    finally:
        # get back to where we started
        check_call(['git', 'checkout', curr_ref])
        if stash_created:
            # Only restore a stash this call created; `pop` also removes the
            # entry on success so repeated runs do not pile up stashes.
            print('Applying stashed changes...')
            check_call(['git', 'stash', 'pop'])
    return hashes, timestamps, wordcounts
def make_recarray(hashes, timestamps, wordcounts):
    """Make numpy recarray from lists.

    hashes, timestamps, wordcounts: parallel sequences; row i of the result is
        ``(hashes[i], timestamps[i], wordcounts[i])``. Timestamps and word
        counts may be ints or numeric strings and are converted with int().

    Returns a recarray with fields ``hash`` (S40), ``timestamp`` (i8) and
    ``wordcount`` (i8). Empty input yields an empty recarray of that dtype.
    """
    dtypes = [('hash', 'S40'), ('timestamp', 'i8'), ('wordcount', 'i8')]
    records = [(h, int(ts), int(wc))
               for h, ts, wc in zip(hashes, timestamps, wordcounts)]
    # Build a plain structured array first: np.rec.array() inspects the first
    # element of a list argument and therefore fails on an empty list.
    return np.rec.array(np.array(records, dtype=dtypes))
def update_recarray(store, hashes, timestamps, wordcounts):
    """Return a recarray holding `store` followed by rows built from the lists.

    The new rows are created with make_recarray and appended after the
    existing ones; the result is given `store`'s dtype.
    """
    appended_rows = make_recarray(hashes, timestamps, wordcounts)
    combined = np.concatenate([store, appended_rows])
    return np.rec.array(combined, dtype=store.dtype)
def write_recarray_to_file(store, csv_filename):
    """Write `store` to `csv_filename` as comma-separated text.

    The field names of `store` form the header line. The first column is
    formatted with ``%s`` and the other two with ``%u``.
    """
    column_names = ','.join(store.dtype.names)
    column_formats = ['%s', '%u', '%u']
    np.savetxt(csv_filename, store,
               fmt=column_formats,
               delimiter=',',
               header=column_names)
def plot_wordcount_vs_time(store):
    """Plot word count against commit time and save the figure as a PDF.

    Only rows whose timestamp is later than START_DATE are drawn. The figure
    title shows the time of the last row in `store`.

    Returns the name of the file written ('wordcount.pdf').
    """
    # START_DATE as seconds since the epoch, so it can be compared with the
    # stored timestamps.
    epoch = datetime.datetime.utcfromtimestamp(0)
    cutoff = (START_DATE - epoch).total_seconds()
    recent = store.timestamp > cutoff

    # matplotlib needs datetime objects rather than raw epoch seconds.
    # TODO: get the right timezone, currently these are treated as UTC, not BST
    commit_times = []
    for seconds in store.timestamp[recent]:
        commit_times.append(datetime.datetime.utcfromtimestamp(seconds))

    fig, ax = plt.subplots()
    ax.plot_date(commit_times, store.wordcount[recent], 'o-')
    ax.set_xlabel('Date')
    ax.set_ylabel('Word count')
    # Optional: a matplotlib.dates WeekdayLocator/MonthLocator with a matching
    # DateFormatter could be set on ax.xaxis here for weekly or monthly ticks.
    ax.grid(True)
    fig.autofmt_xdate()

    newest_commit = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
    plt.suptitle("As of last commit on %s" % newest_commit.strftime(r"%c"))

    filename = 'wordcount.pdf'
    plt.savefig(filename)
    plt.clf()
    return filename
def open_pdf(pdf_filename):
    """Open a PDF file using system's default PDF viewer.

    Linux uses ``xdg-open``, OS X uses ``open`` and Windows uses
    os.startfile. On any other platform nothing is done.

    Raises subprocess.CalledProcessError if the launcher command fails
    (Linux / OS X) or OSError if the file cannot be opened (Windows).
    """
    if _platform.startswith("linux"):
        # linux
        check_call(["xdg-open", pdf_filename])
    elif _platform == "darwin":
        # OS X
        check_call(["open", pdf_filename])
    elif _platform == "win32":
        # Windows: `start` is a cmd.exe built-in, not an executable, so it
        # cannot be launched via check_call without a shell. os.startfile
        # opens the file with its associated application instead.
        os.startfile(pdf_filename)
if __name__ == "__main__":
    print('Current word count: %s' % get_wordcount(MAIN_TEX_FILE))

    # Get our data - either from CSV, or go through old commits
    csv_filename = 'word_count_history.csv'
    store = None
    last_recorded_hash = None
    generate_data = True

    if os.path.isfile(csv_filename) and os.stat(csv_filename).st_size > 0:
        # A CSV with a single data row is read back as a 0-d record array,
        # which can be neither indexed with [-1] nor concatenated; force 1-d.
        store = np.atleast_1d(np.recfromcsv(csv_filename))
        # check if we already have a word count for this commit,
        # if not recheck commits - we've prob missed others
        if get_git_current_hash() in store.hash:
            generate_data = False
        else:
            last_recorded_hash = store.hash[-1]

    if generate_data:
        hashes, timestamps, wordcounts = get_wordcount_history(MAIN_TEX_FILE,
                                                               last_recorded_hash)
        # NOTE(review): if last_recorded_hash was not found in the history,
        # every commit is returned and appended to the existing store, which
        # looks like it would duplicate rows - confirm.
        if hashes:
            if store is None:
                store = make_recarray(hashes, timestamps, wordcounts)
            else:
                store = update_recarray(store, hashes, timestamps, wordcounts)
            write_recarray_to_file(store, csv_filename)

    if store is None or len(store) == 0:
        # Neither the CSV nor the git history produced any rows.
        raise SystemExit('No word count data available, nothing to plot')

    # Now do any analysis and plotting
    pdf_filename = plot_wordcount_vs_time(store)
    open_pdf(pdf_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment