raggleton · April 13, 2016 17:29
diff --git a/word_counter.py b/word_counter.py
 #!/usr/bin/env python

 """
 Go through TeX files and count words, plot things.

 TODO:
 - only count commits where tex file changed?
 """


 import os
 import numpy as np
 import matplotlib.pyplot as plt
 from subprocess import check_output, check_call
 from itertools import izip
 import datetime
 # import matplotlib.dates as mdates
 from sys import platform as _platform


 MAIN_TEX_FILE = 'thesis.tex'
 START_DATE = datetime.datetime(2016, 3, 29)


 def get_wordcount(main_file, include_bib=True):
    """Return the word count that ``texcount`` reports for a TeX document.

    Runs ``texcount -1 -sum -inc [-incbib] main_file`` and parses the
    output as a single integer.

    main_file: path of the main TeX file handed to texcount.
    include_bib: when True, ``-incbib`` is passed so that the bibliography
        is included in the count.

    Raises OSError if texcount cannot be executed, CalledProcessError if it
    exits with a non-zero status, and ValueError if its output is not an
    integer.
    """
    cmds = ['texcount', '-1', '-sum', '-inc']
    if include_bib:
        # Only add the flag when it is wanted: an empty-string placeholder
        # would be handed to texcount as an extra (nameless) argument.
        cmds.append('-incbib')
    cmds.append(main_file)
    return int(check_output(cmds).strip())


 def get_git_username():
    """Return git's configured ``user.name``, stripped of whitespace."""
    config_cmd = ['git', 'config', '--get', 'user.name']
    raw_name = check_output(config_cmd)
    return raw_name.strip()


 def get_git_current_hash(short=False):
    """Return the hash of the commit that HEAD points at.

    short: when True, ``--short`` is passed to ``git rev-parse``.
    """
    rev_parse = ['git', 'rev-parse']
    if short:
        rev_parse.append('--short')
    rev_parse.append('HEAD')
    return check_output(rev_parse).strip()


 def get_git_commit_hashes(author, short=False):
    """List the hashes of all commits by `author`, oldest first.

    author: value passed to ``git log --author``.
    short: when True, abbreviated hashes (``%h``) are requested instead of
        full ones (``%H``).
    """
    if short:
        placeholder = r'%h'
    else:
        placeholder = r'%H'
    log_cmd = ['git', 'log']
    log_cmd.append('--pretty=format:"%s"' % placeholder)
    log_cmd.append('--author=%s' % author)
    log_cmd.append('--reverse')
    log_lines = check_output(log_cmd).splitlines()
    # every line comes back wrapped in the literal quotes from the format
    return [line.strip('"') for line in log_lines]


 def get_git_commit_timestamp(commit_hash):
    """Return the commit's timestamp (git's ``%ct``) as a string.

    commit_hash: hash of the commit to query with ``git show``.

    The surrounding whitespace and the literal quotes from the format
    string are removed from git's output.

    Raises CalledProcessError if git exits with a non-zero status.
    """
    cmds = ['git', 'show', '-s', '--pretty=format:"%ct"', commit_hash]
    output = check_output(cmds)
    if not isinstance(output, str):
        # Python 3 returns bytes; decode so the string stripping below works
        output = output.decode('utf-8')
    # Remove whitespace first: a trailing newline would otherwise shield
    # the closing quote from strip('"').
    return output.strip().strip('"')


 def get_wordcount_history(main_file, last_recorded_hash=None):
    """Return lists of commit hashes, timestamps, and wordcounts.

    Checks out, one at a time, each commit authored by the configured git
    user (oldest first) and runs the word count on `main_file` there.
    Uncommitted changes are stashed beforehand and restored afterwards; the
    starting branch (or commit, when HEAD is detached) is checked out again
    even if a step fails part-way through.

    main_file: TeX file handed to get_wordcount().
    last_recorded_hash: if given and present in the history, only the
        commits after it are processed; otherwise all of them are.

    Returns the tuple (hashes, timestamps, wordcounts).
    Raises CalledProcessError if a git or texcount command fails.
    """
    # Ask git directly which branch HEAD is on. The second token of
    # `git branch` output is only the current branch when that branch
    # happens to be listed first.
    curr_branch = check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
    if curr_branch == b'HEAD':  # b'' literal matches on both Python 2 and 3
        # detached HEAD: remember the commit itself so we can return to it
        curr_branch = get_git_current_hash()

    # `git stash` creates no entry when there is nothing to stash, so compare
    # the number of stash entries before and after to know whether there is
    # anything of ours to restore at the end.
    stash_depth = len(check_output(['git', 'stash', 'list']).splitlines())
    check_call(['git', 'stash'])  # stash current changes
    stashed = len(check_output(['git', 'stash', 'list']).splitlines()) > stash_depth

    try:
        hashes = get_git_commit_hashes(get_git_username())  # get commits to checkout
        # figure out which commits we need to loop over
        if last_recorded_hash:
            if last_recorded_hash in hashes:
                start = hashes.index(last_recorded_hash)
                hashes = hashes[start + 1:]
            else:
                print('last_recorded_hash not in hash history, doing all commits')

        # checkout each, get wordcount & timestamp for that commit
        timestamps, wordcounts = [], []
        for ghash in hashes:
            check_call(['git', 'checkout', '-q', ghash])
            timestamps.append(get_git_commit_timestamp(ghash))
            wordcounts.append(get_wordcount(main_file))
            print('hash: %s timestamp: %s wordcount: %s'
                  % (ghash, timestamps[-1], wordcounts[-1]))
    finally:
        # get back to where we started
        check_call(['git', 'checkout', curr_branch])
        if stashed:
            print('Applying stashed changes...')
            # pop (not apply) so the stash entry does not pile up on every run
            check_call(['git', 'stash', 'pop'])

    return hashes, timestamps, wordcounts


 def make_recarray(hashes, timestamps, wordcounts):
    """Make a numpy recarray with one record per commit.

    hashes, timestamps, wordcounts: parallel sequences; element i of each
        goes into record i. If the lengths differ, the extra trailing
        entries are ignored.

    Returns a recarray with fields ``hash`` (S40), ``timestamp`` (i8) and
    ``wordcount`` (i8). Empty inputs give an empty recarray.
    """
    dtypes = [('hash', 'S40'), ('timestamp', 'i8'), ('wordcount', 'i8')]
    # Builtin zip (unlike itertools.izip) exists on both Python 2 and 3.
    rows = list(zip(hashes, timestamps, wordcounts))
    # Build a plain structured array first: np.rec.array() looks at obj[0]
    # when handed a list, so it raises IndexError when there are no rows.
    return np.rec.array(np.array(rows, dtype=dtypes))


 def update_recarray(store, hashes, timestamps, wordcounts):
    """Return a new recarray: `store` followed by records made from the lists.

    store: existing recarray; its dtype is used for the result.
    hashes, timestamps, wordcounts: parallel sequences describing the
        records to append (see make_recarray()).

    `store` itself is not modified. When there is nothing to append, a copy
    of `store` is returned.
    """
    if len(hashes) == 0:
        # Nothing new to add; skip building an empty array to concatenate.
        return np.rec.array(store, dtype=store.dtype)
    new_store = make_recarray(hashes, timestamps, wordcounts)
    return np.rec.array(np.concatenate([store, new_store]), dtype=store.dtype)


 def write_recarray_to_file(store, csv_filename):
    """Write `store` to `csv_filename` as comma-separated text.

    The field names of `store` form the header line. Each record is
    written as one string column followed by two unsigned-integer columns.
    """
    column_formats = ['%s', '%u', '%u']
    header_line = ','.join(store.dtype.names)
    np.savetxt(csv_filename, store, fmt=column_formats, delimiter=',',
               header=header_line)


 def plot_wordcount_vs_time(store):
    """Plot word count against commit time and save the figure as a PDF.

    Only records whose timestamp is later than START_DATE are drawn. The
    figure title shows the time of the last record in `store`.

    Returns the name of the file that was written.
    """
    # START_DATE as seconds since the epoch, comparable with the timestamps
    epoch = datetime.datetime.utcfromtimestamp(0)
    cutoff = (START_DATE - epoch).total_seconds()
    recent = store.timestamp > cutoff
    # matplotlib is given datetime objects rather than raw epoch seconds
    # TODO: get the right timezone, currently treated as UTC not BST
    plot_times = []
    for stamp in store.timestamp[recent]:
        plot_times.append(datetime.datetime.utcfromtimestamp(stamp))
    plot_counts = store.wordcount[recent]

    fig, ax = plt.subplots()
    ax.plot_date(plot_times, plot_counts, 'o-')
    ax.set_xlabel('Date')
    ax.set_ylabel('Word count')

    # mondays = mdates.WeekdayLocator(byweekday=mdates.MONDAY)
    # mondaysFmt = mdates.DateFormatter("%d %b")

    # months = mdates.MonthLocator()
    # monthsFmt = mdates.DateFormatter("%b %y")

    # ax.xaxis.set_major_locator(mondays)
    # ax.xaxis.set_major_formatter(mondaysFmt)

    ax.grid(True)
    fig.autofmt_xdate()
    newest = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
    plt.suptitle("As of last commit on %s" % newest.strftime(r"%c"))
    filename = 'wordcount.pdf'
    plt.savefig(filename)
    plt.clf()
    return filename


 def open_pdf(pdf_filename):
    """Open a PDF file using system's default PDF viewer.

    Uses ``xdg-open`` on Linux, ``open`` on OS X and ``os.startfile`` on
    Windows. On any other platform nothing is opened and a message is
    printed instead.

    Raises OSError if the opener cannot be started and CalledProcessError
    if ``xdg-open``/``open`` exits with a non-zero status.
    """
    if _platform.startswith("linux"):
        # linux
        check_call(["xdg-open", pdf_filename])
    elif _platform == "darwin":
        # OS X
        check_call(["open", pdf_filename])
    elif _platform == "win32":
        # Windows: `start` is a cmd.exe built-in rather than a program, so
        # it cannot be launched through check_call without a shell.
        os.startfile(pdf_filename)
    else:
        print('Do not know how to open %s on platform %s' % (pdf_filename, _platform))


 if __name__ == "__main__":
    # Script entry point: print the current word count, bring the CSV
    # history up to date with the git log, then plot it and open the plot.

    print('Current word count: %s' % get_wordcount(MAIN_TEX_FILE))

    # Get our data - either from CSV, or go through old commits
    csv_filename = 'word_count_history.csv'
    generate_data = False
    last_recorded_hash = None
    if not os.path.isfile(csv_filename) or os.stat(csv_filename).st_size == 0:
        generate_data = True
    else:
        # A CSV holding a single record is read back as a 0-d array, which
        # supports neither `in` nor [-1]; force it to be 1-d.
        store = np.atleast_1d(np.recfromcsv(csv_filename))
        if len(store) == 0:
            # header but no records: regenerate everything from scratch
            generate_data = True
        elif get_git_current_hash() not in store.hash:
            # no word count for this commit yet,
            # recheck commits - we've prob missed others
            generate_data = True
            last_recorded_hash = store.hash[-1]

    if generate_data:
        hashes, timestamps, wordcounts = get_wordcount_history(MAIN_TEX_FILE, last_recorded_hash)
        if last_recorded_hash is None:
            store = make_recarray(hashes, timestamps, wordcounts)
        else:
            store = update_recarray(store, hashes, timestamps, wordcounts)
        write_recarray_to_file(store, csv_filename)

    # Now do any analysis and plotting
    pdf_filename = plot_wordcount_vs_time(store)
    open_pdf(pdf_filename)
	#!/usr/bin/env python

	"""
	Go through TeX files and count words, plot things.

	TODO:
	- only count commits where tex file changed?
	"""


	import os
	import numpy as np
	import matplotlib.pyplot as plt
	from subprocess import check_output, check_call
	from itertools import izip
	import datetime
	# import matplotlib.dates as mdates
	from sys import platform as _platform


	MAIN_TEX_FILE = 'thesis.tex'
	START_DATE = datetime.datetime(2016, 3, 29)


	def get_wordcount(main_file, include_bib=True):
	    """Return the word count that ``texcount`` reports for a TeX document.

	    Runs ``texcount -1 -sum -inc [-incbib] main_file`` and parses the
	    output as a single integer.

	    main_file: path of the main TeX file handed to texcount.
	    include_bib: when True, ``-incbib`` is passed so that the bibliography
	        is included in the count.

	    Raises OSError if texcount cannot be executed, CalledProcessError if it
	    exits with a non-zero status, and ValueError if its output is not an
	    integer.
	    """
	    cmds = ['texcount', '-1', '-sum', '-inc']
	    if include_bib:
	        # Only add the flag when it is wanted: an empty-string placeholder
	        # would be handed to texcount as an extra (nameless) argument.
	        cmds.append('-incbib')
	    cmds.append(main_file)
	    return int(check_output(cmds).strip())


	def get_git_username():
	    """Return git's configured ``user.name``, stripped of whitespace.

	    Raises CalledProcessError if ``git config`` exits with a non-zero
	    status.
	    """
	    # The body is indented under the def; without that the function
	    # does not parse.
	    config_cmd = ['git', 'config', '--get', 'user.name']
	    return check_output(config_cmd).strip()


	def get_git_current_hash(short=False):
	    """Return the hash of the commit that HEAD points at.

	    short: when True, ``--short`` is passed to ``git rev-parse``.

	    Raises CalledProcessError if git exits with a non-zero status.
	    """
	    # The body is indented under the def; without that the function
	    # does not parse.
	    rev_parse = ['git', 'rev-parse']
	    if short:
	        rev_parse.append('--short')
	    rev_parse.append('HEAD')
	    return check_output(rev_parse).strip()


	def get_git_commit_hashes(author, short=False):
	    """Get all commit hashes for given author, oldest first.

	    author: value passed to ``git log --author``.
	    short: when True, abbreviated hashes (``%h``) are requested instead of
	        full ones (``%H``).

	    Raises CalledProcessError if git exits with a non-zero status.
	    """
	    # The body is indented under the def; without that the function
	    # does not parse.
	    hash_fmt = r'%h' if short else r'%H'
	    cmds = ['git', 'log', '--pretty=format:"%s"' % hash_fmt,
	            '--author=%s' % author, '--reverse']
	    # every line comes back wrapped in the literal quotes from the format
	    return [x.strip('"') for x in check_output(cmds).splitlines()]


	def get_git_commit_timestamp(commit_hash):
	    """Return the commit's timestamp (git's ``%ct``) as a string.

	    commit_hash: hash of the commit to query with ``git show``.

	    The surrounding whitespace and the literal quotes from the format
	    string are removed from git's output.

	    Raises CalledProcessError if git exits with a non-zero status.
	    """
	    cmds = ['git', 'show', '-s', '--pretty=format:"%ct"', commit_hash]
	    output = check_output(cmds)
	    if not isinstance(output, str):
	        # Python 3 returns bytes; decode so the string stripping below works
	        output = output.decode('utf-8')
	    # Remove whitespace first: a trailing newline would otherwise shield
	    # the closing quote from strip('"').
	    return output.strip().strip('"')


	def get_wordcount_history(main_file, last_recorded_hash=None):
	    """Return lists of commit hashes, timestamps, and wordcounts.

	    Checks out, one at a time, each commit authored by the configured git
	    user (oldest first) and runs the word count on `main_file` there.
	    Uncommitted changes are stashed beforehand and restored afterwards; the
	    starting branch (or commit, when HEAD is detached) is checked out again
	    even if a step fails part-way through.

	    main_file: TeX file handed to get_wordcount().
	    last_recorded_hash: if given and present in the history, only the
	        commits after it are processed; otherwise all of them are.

	    Returns the tuple (hashes, timestamps, wordcounts).
	    Raises CalledProcessError if a git or texcount command fails.
	    """
	    # Ask git directly which branch HEAD is on. The second token of
	    # `git branch` output is only the current branch when that branch
	    # happens to be listed first.
	    curr_branch = check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
	    if curr_branch == b'HEAD':  # b'' literal matches on both Python 2 and 3
	        # detached HEAD: remember the commit itself so we can return to it
	        curr_branch = get_git_current_hash()

	    # `git stash` creates no entry when there is nothing to stash, so compare
	    # the number of stash entries before and after to know whether there is
	    # anything of ours to restore at the end.
	    stash_depth = len(check_output(['git', 'stash', 'list']).splitlines())
	    check_call(['git', 'stash'])  # stash current changes
	    stashed = len(check_output(['git', 'stash', 'list']).splitlines()) > stash_depth

	    try:
	        hashes = get_git_commit_hashes(get_git_username())  # get commits to checkout
	        # figure out which commits we need to loop over
	        if last_recorded_hash:
	            if last_recorded_hash in hashes:
	                start = hashes.index(last_recorded_hash)
	                hashes = hashes[start + 1:]
	            else:
	                print('last_recorded_hash not in hash history, doing all commits')

	        # checkout each, get wordcount & timestamp for that commit
	        timestamps, wordcounts = [], []
	        for ghash in hashes:
	            check_call(['git', 'checkout', '-q', ghash])
	            timestamps.append(get_git_commit_timestamp(ghash))
	            wordcounts.append(get_wordcount(main_file))
	            print('hash: %s timestamp: %s wordcount: %s'
	                  % (ghash, timestamps[-1], wordcounts[-1]))
	    finally:
	        # get back to where we started
	        check_call(['git', 'checkout', curr_branch])
	        if stashed:
	            print('Applying stashed changes...')
	            # pop (not apply) so the stash entry does not pile up on every run
	            check_call(['git', 'stash', 'pop'])

	    return hashes, timestamps, wordcounts


	def make_recarray(hashes, timestamps, wordcounts):
	    """Make a numpy recarray with one record per commit.

	    hashes, timestamps, wordcounts: parallel sequences; element i of each
	        goes into record i. If the lengths differ, the extra trailing
	        entries are ignored.

	    Returns a recarray with fields ``hash`` (S40), ``timestamp`` (i8) and
	    ``wordcount`` (i8). Empty inputs give an empty recarray.
	    """
	    dtypes = [('hash', 'S40'), ('timestamp', 'i8'), ('wordcount', 'i8')]
	    # Builtin zip (unlike itertools.izip) exists on both Python 2 and 3.
	    rows = list(zip(hashes, timestamps, wordcounts))
	    # Build a plain structured array first: np.rec.array() looks at obj[0]
	    # when handed a list, so it raises IndexError when there are no rows.
	    return np.rec.array(np.array(rows, dtype=dtypes))


	def update_recarray(store, hashes, timestamps, wordcounts):
	    """Return a new recarray: `store` followed by records made from the lists.

	    store: existing recarray; its dtype is used for the result.
	    hashes, timestamps, wordcounts: parallel sequences describing the
	        records to append (see make_recarray()).

	    `store` itself is not modified. When there is nothing to append, a copy
	    of `store` is returned.
	    """
	    if len(hashes) == 0:
	        # Nothing new to add; skip building an empty array to concatenate.
	        return np.rec.array(store, dtype=store.dtype)
	    new_store = make_recarray(hashes, timestamps, wordcounts)
	    return np.rec.array(np.concatenate([store, new_store]), dtype=store.dtype)


	def write_recarray_to_file(store, csv_filename):
	    """Save recarray to file as CSV.

	    The field names of `store` form the header line. Each record is
	    written as one string column followed by two unsigned-integer columns.
	    """
	    # The body is indented under the def; without that the function
	    # does not parse.
	    np.savetxt(csv_filename, store, delimiter=',',
	               header=','.join(store.dtype.names),
	               fmt=['%s', '%u', '%u'])


	def plot_wordcount_vs_time(store):
	    """Make plot of wordcount vs time and save it as a PDF.

	    Only records whose timestamp is later than START_DATE are drawn. The
	    figure title shows the time of the last record in `store`.

	    Returns the name of the file that was written.
	    """
	    # Ignore everything before a certain date, convert to epoch time
	    start = (START_DATE - datetime.datetime.utcfromtimestamp(0)).total_seconds()
	    mask = store.timestamp > start
	    # matplotlib is given datetime objects rather than raw epoch seconds
	    # TODO: get the right timezone, currently treated as UTC not BST
	    timestamps_dt = [datetime.datetime.utcfromtimestamp(x) for x in store.timestamp[mask]]

	    fig, ax = plt.subplots()
	    ax.plot_date(timestamps_dt, store.wordcount[mask], 'o-')
	    ax.set_xlabel('Date')
	    ax.set_ylabel('Word count')

	    # mondays = mdates.WeekdayLocator(byweekday=mdates.MONDAY)
	    # mondaysFmt = mdates.DateFormatter("%d %b")

	    # months = mdates.MonthLocator()
	    # monthsFmt = mdates.DateFormatter("%b %y")

	    # ax.xaxis.set_major_locator(mondays)
	    # ax.xaxis.set_major_formatter(mondaysFmt)

	    ax.grid(True)
	    fig.autofmt_xdate()
	    last_commit = datetime.datetime.utcfromtimestamp(store.timestamp[-1])
	    last_commit = last_commit.strftime(r"%c")
	    plt.suptitle("As of last commit on %s" % last_commit)
	    filename = 'wordcount.pdf'
	    plt.savefig(filename)
	    plt.clf()
	    return filename


	def open_pdf(pdf_filename):
	    """Open a PDF file using system's default PDF viewer.

	    Uses ``xdg-open`` on Linux, ``open`` on OS X and ``os.startfile`` on
	    Windows. On any other platform nothing is opened and a message is
	    printed instead.

	    Raises OSError if the opener cannot be started and CalledProcessError
	    if ``xdg-open``/``open`` exits with a non-zero status.
	    """
	    if _platform.startswith("linux"):
	        # linux
	        check_call(["xdg-open", pdf_filename])
	    elif _platform == "darwin":
	        # OS X
	        check_call(["open", pdf_filename])
	    elif _platform == "win32":
	        # Windows: `start` is a cmd.exe built-in rather than a program, so
	        # it cannot be launched through check_call without a shell.
	        os.startfile(pdf_filename)
	    else:
	        print('Do not know how to open %s on platform %s' % (pdf_filename, _platform))


	if __name__ == "__main__":
	    # Script entry point: print the current word count, bring the CSV
	    # history up to date with the git log, then plot it and open the plot.

	    print('Current word count: %s' % get_wordcount(MAIN_TEX_FILE))

	    # Get our data - either from CSV, or go through old commits
	    csv_filename = 'word_count_history.csv'
	    generate_data = False
	    last_recorded_hash = None
	    if not os.path.isfile(csv_filename) or os.stat(csv_filename).st_size == 0:
	        generate_data = True
	    else:
	        # A CSV holding a single record is read back as a 0-d array, which
	        # supports neither `in` nor [-1]; force it to be 1-d.
	        store = np.atleast_1d(np.recfromcsv(csv_filename))
	        if len(store) == 0:
	            # header but no records: regenerate everything from scratch
	            generate_data = True
	        elif get_git_current_hash() not in store.hash:
	            # no word count for this commit yet,
	            # recheck commits - we've prob missed others
	            generate_data = True
	            last_recorded_hash = store.hash[-1]

	    if generate_data:
	        hashes, timestamps, wordcounts = get_wordcount_history(MAIN_TEX_FILE, last_recorded_hash)
	        if last_recorded_hash is None:
	            store = make_recarray(hashes, timestamps, wordcounts)
	        else:
	            store = update_recarray(store, hashes, timestamps, wordcounts)
	        write_recarray_to_file(store, csv_filename)

	    # Now do any analysis and plotting
	    pdf_filename = plot_wordcount_vs_time(store)
	    open_pdf(pdf_filename)