Last active
April 13, 2016 17:29
-
-
Save raggleton/a7566f07beb6f9964b82a24b0edea48b to your computer and use it in GitHub Desktop.
Change MAIN_TEX_FILE and START_DATE to suit yourself
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Go through the commits of a git repository, count the words in the TeX files
at each commit, and plot the word count against time.

TODO:
- only count commits where a TeX file changed?
"""
import os | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from subprocess import check_output, check_call | |
from itertools import izip | |
import datetime | |
# import matplotlib.dates as mdates | |
from sys import platform as _platform | |
# Top-level TeX file whose words are counted (passed to get_wordcount).
MAIN_TEX_FILE = 'thesis.tex'
# Commits with a timestamp at or before this date are left out of the plot.
START_DATE = datetime.datetime(year=2016, month=3, day=29)
def get_wordcount(main_file, include_bib=True):
    """Get wordcount using main TeX file and the files it includes.

    Runs ``texcount -1 -sum -inc`` on `main_file` and converts its output
    to an int.

    main_file: path of the top-level TeX file.
    include_bib: flag to include the bibliography in the word count.

    Raises CalledProcessError if texcount exits with a non-zero status, and
    ValueError if its output cannot be converted to an int.
    """
    cmds = ['texcount', '-1', '-sum', '-inc']
    if include_bib:
        cmds.append('-incbib')
    # Only add the option when it is wanted: an empty-string argument would
    # still be handed to texcount, which may take it for a file name.
    cmds.append(main_file)
    return int(check_output(cmds).strip())
def get_git_username():
    """Return the user.name value reported by `git config`."""
    config_cmd = ['git', 'config', '--get', 'user.name']
    raw_name = check_output(config_cmd)
    return raw_name.strip()
def get_git_current_hash(short=False):
    """Return the hash of HEAD; the `short` flag requests the abbreviated form."""
    rev_parse = ['git', 'rev-parse']
    if short:
        rev_parse.append('--short')
    rev_parse.append('HEAD')
    return check_output(rev_parse).strip()
def get_git_commit_hashes(author, short=False):
    """Return the commit hashes of the given author as a list.

    The `short` flag selects abbreviated hashes. The list is in the order
    printed by `git log --reverse`, i.e. oldest commit first.
    """
    if short:
        pretty_opt = '--pretty=format:"%h"'
    else:
        pretty_opt = '--pretty=format:"%H"'
    log_cmd = ['git', 'log', pretty_opt, '--author=%s' % author, '--reverse']
    log_output = check_output(log_cmd)
    # No shell is involved, so the quotes in the format string reach git
    # verbatim and come back around every hash; remove them again here.
    return [line.strip('"') for line in log_output.splitlines()]
def get_git_commit_timestamp(commit_hash):
    """Return the %ct timestamp that `git show` prints for commit_hash.

    The value is returned as text, with the surrounding quotes removed.
    """
    show_cmd = ['git', 'show', '-s', '--pretty=format:"%ct"', commit_hash]
    quoted_stamp = check_output(show_cmd)
    return quoted_stamp.strip('"')
def get_wordcount_history(main_file, last_recorded_hash=None):
    """Return lists of commit hashes, timestamps, and wordcounts.

    Checks out, oldest first, the commits of the configured git user and
    runs the word count on each of them. Local changes are stashed first;
    afterwards the original checkout and the stashed changes are restored,
    also when one of the commands fails part-way.

    main_file: main TeX file passed to get_wordcount().
    last_recorded_hash: if given and present in the history, only commits
        after it are processed; otherwise all commits are processed.

    Raises CalledProcessError if one of the git/texcount commands fails.
    """
    # Ask git what is checked out rather than parsing `git branch`, whose
    # second token is only the current branch when that branch is listed first.
    original_ref = check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
    if original_ref in ('HEAD', b'HEAD'):
        # Detached HEAD: there is no branch name, so return to the commit itself
        original_ref = check_output(['git', 'rev-parse', 'HEAD']).strip()

    # `git stash` saves nothing when there are no local changes; compare the
    # stash list before and after so that we only restore what we stashed.
    stash_list_cmd = ['git', 'stash', 'list']
    stashes_before = check_output(stash_list_cmd)
    check_call(['git', 'stash'])  # stash current changes
    made_stash = check_output(stash_list_cmd) != stashes_before

    try:
        hashes = get_git_commit_hashes(get_git_username())  # get commits to checkout

        # figure out which commits we need to loop over
        if last_recorded_hash:
            if last_recorded_hash in hashes:
                start = hashes.index(last_recorded_hash)
                hashes = hashes[start + 1:]
            else:
                print('last_recorded_hash not in hash history, doing all commits')

        # checkout each, get wordcount & timestamp for that commit
        timestamps, wordcounts = [], []
        for ghash in hashes:
            check_call(['git', 'checkout', '-q', ghash])
            timestamps.append(get_git_commit_timestamp(ghash))
            wordcounts.append(get_wordcount(main_file))
            print('hash: %s timestamp: %s wordcount: %s'
                  % (ghash, timestamps[-1], wordcounts[-1]))
    finally:
        # get back to where we started
        check_call(['git', 'checkout', original_ref])
        if made_stash:
            print('Applying stashed changes...')
            # pop (not apply) so the temporary stash entry does not pile up
            check_call(['git', 'stash', 'pop'])
    return hashes, timestamps, wordcounts
def make_recarray(hashes, timestamps, wordcounts):
    """Make numpy recarray from lists.

    hashes, timestamps, wordcounts: parallel sequences, one entry per commit.
    Timestamps and wordcounts may be ints or numeric strings; they are
    converted with int().

    Returns a recarray with the fields hash (S40), timestamp (i8) and
    wordcount (i8). Empty input gives a recarray with zero rows.
    """
    dtypes = [('hash', 'S40'), ('timestamp', 'i8'), ('wordcount', 'i8')]
    records = [(h, int(ts), int(wc))
               for h, ts, wc in zip(hashes, timestamps, wordcounts)]
    if not records:
        # np.rec.array() looks at the first element of a list to decide how
        # to build the array, so an empty list has to be handled separately.
        return np.rec.array(np.empty(0, dtype=dtypes), dtype=dtypes)
    return np.rec.array(records, dtype=dtypes)
def update_recarray(store, hashes, timestamps, wordcounts):
    """Return a new recarray: `store` followed by rows made from the lists.

    The result is given the dtype of `store`.
    """
    extra_rows = make_recarray(hashes, timestamps, wordcounts)
    combined = np.concatenate([store, extra_rows])
    return np.rec.array(combined, dtype=store.dtype)
def write_recarray_to_file(store, csv_filename):
    """Write recarray `store` to `csv_filename` as comma-separated text.

    The header line holds the field names of `store`; the first column is
    written as a string and the other two as unsigned integers.
    """
    header_line = ','.join(store.dtype.names)
    column_formats = ['%s', '%u', '%u']
    np.savetxt(csv_filename, store,
               fmt=column_formats,
               delimiter=',',
               header=header_line)
def plot_wordcount_vs_time(store):
    """Plot word count against commit time and save the plot as a PDF.

    store: recarray with `timestamp` and `wordcount` fields. Only entries
    with a timestamp after START_DATE are plotted; the title always refers
    to the last entry of `store`.

    Returns the name of the PDF file that was written.
    """
    # Express START_DATE as seconds since the epoch so that it can be
    # compared with the stored timestamps
    epoch = datetime.datetime.utcfromtimestamp(0)
    cutoff = (START_DATE - epoch).total_seconds()
    recent = store.timestamp > cutoff

    # matplotlib wants datetime objects here, not bare numbers
    # TODO: get the right timezone, currently the times are treated as UTC, not BST
    commit_dates = []
    for stamp in store.timestamp[recent]:
        commit_dates.append(datetime.datetime.utcfromtimestamp(stamp))
    counts = store.wordcount[recent]

    fig, ax = plt.subplots()
    ax.plot_date(commit_dates, counts, 'o-')
    ax.set_xlabel('Date')
    ax.set_ylabel('Word count')
    ax.grid(True)
    fig.autofmt_xdate()

    newest = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
    plt.suptitle("As of last commit on %s" % newest.strftime(r"%c"))

    filename = 'wordcount.pdf'
    plt.savefig(filename)
    plt.clf()
    return filename
def open_pdf(pdf_filename):
    """Open a PDF file using system's default PDF viewer.

    Handles Linux (xdg-open), OS X (open) and Windows (os.startfile); on any
    other platform nothing is done.

    Raises CalledProcessError if the opener command exits with an error.
    """
    if _platform.startswith("linux"):
        # linux
        check_call(["xdg-open", pdf_filename])
    elif _platform == "darwin":
        # OS X
        check_call(["open", pdf_filename])
    elif _platform == "win32":
        # Windows: `start` is a cmd.exe builtin rather than a program, so it
        # cannot be launched by check_call without a shell. os.startfile
        # opens the file with its associated application instead.
        os.startfile(pdf_filename)
def main():
    """Print the current word count, update the stored history, and plot it.

    The history is kept in 'word_count_history.csv'. It is (re)generated from
    the git history when the file is missing or empty, and extended when the
    current commit is not in it yet.
    """
    print('Current word count: %s' % get_wordcount(MAIN_TEX_FILE))

    # Get our data - either from CSV, or go through old commits
    csv_filename = 'word_count_history.csv'
    generate_data = False
    last_recorded_hash = None
    store = None

    if not os.path.isfile(csv_filename) or os.stat(csv_filename).st_size == 0:
        generate_data = True
    else:
        # A file with a single data row is read back as a 0-d array, which
        # cannot be indexed with [-1]; make sure we always have a 1-d array.
        store = np.atleast_1d(np.recfromcsv(csv_filename))
        # check if we already have a word count for this commit,
        # if not recheck commits - we've prob missed others
        if get_git_current_hash() not in store.hash:
            generate_data = True
            last_recorded_hash = store.hash[-1]

    if generate_data:
        hashes, timestamps, wordcounts = get_wordcount_history(MAIN_TEX_FILE,
                                                               last_recorded_hash)
        if last_recorded_hash is None:
            store = make_recarray(hashes, timestamps, wordcounts)
        elif hashes:
            # only extend the store when there really are new commits
            store = update_recarray(store, hashes, timestamps, wordcounts)
        write_recarray_to_file(store, csv_filename)

    # Now do any analysis and plotting
    pdf_filename = plot_wordcount_vs_time(store)
    open_pdf(pdf_filename)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment