Last active
November 14, 2016 20:18
-
-
Save deparkes/f51b5eaf35bdde3a0c00 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 20 11:41:59 2015
Find out from which years you cited most publications in your thesis or
dissertation.
https://xkcd.com/208/
May need to somehow account for 'missing' years
http://pandas.pydata.org/pandas-docs/stable/missing_data.html
@author: deparkes
"""
import re | |
import pandas as pd | |
from collections import Counter | |
import matplotlib.pyplot as plt | |
import matplotlib | |
import numpy as np | |
# Purely cosmetic: switch matplotlib to its bundled 'ggplot' style sheet.
plt.style.use('ggplot')
# Years harvested from the bbl file; filled in by the parsing loop.
publication_years = []

# Enter a cut-off for what the regex search will consider a year. This is
# to help reduce false positives from e.g. page numbers in the bbl file.
max_year = 2020

# Enter the limits you wish to plot to. Useful if you only cite a few papers
# from more than a few decades ago.
plot_max = 2015
plot_min = 1950

# The bbl file you wish to check.
bbl_file = "thesis.bbl"

# Output name (without extension) shared by the data file and the figure.
out_name = 'RefCountThesis'

# Exactly which regular expression you need will depend on the formatting of
# your bbl file. http://pythex.org/ is useful for checking python regular
# expressions.
#
# Default pattern (me, robin): a 4-digit number, optionally followed by one or
# more closing brackets, then a full stop. The year is captured in group 1.
# NOTE: the "|$" alternative also matches the bare end of a line, so search()
# always returns a match object; on lines without a year group(1) is None and
# callers must check for that.
# A raw string is used so that "\)" and "\." reach the regex engine verbatim
# instead of being treated as (invalid) string escape sequences.
regex = re.compile(r"([0-9]{4})\)*\.|$")

# Alternative pattern (james): 4-digit numbers in parentheses.
# regex = re.compile(r"\(([0-9]{4})\)")
# Scan the bbl file one line at a time and keep the first year-like number
# the pattern finds on each line.
with open(bbl_file) as bbl:
    for entry in bbl:
        match = regex.search(entry)
        # Skip lines with no match or where the year group did not participate
        if not match or match.group(1) is None:
            continue
        year = int(match.group(1))
        # Discard numbers at or above the cut-off
        if year < max_year:
            publication_years.append(year)
# Tally how many times each publication year occurs
year_counts = Counter(sorted(publication_years))

# Turn the tally into a data frame (one row per year), order the rows by
# year, then move the year out of the index into an ordinary column
df = (
    pd.DataFrame.from_dict(year_counts, orient='index')
    .sort_index()
    .reset_index()
)

# Name the two resulting columns
df.columns = ["year", "count"]
# Fill in missing years
# see also:
# http://stackoverflow.com/questions/30322693/pandas-dataframe-how-to-find-missing-years-in-a-timeseries
# http://stackoverflow.com/questions/25909984/missing-data-insert-rows-in-pandas-and-fill-with-nan

# First, create the full range of years that we want.
# np.arange excludes its stop value, so add 1 to keep the latest year;
# without it the most recent year's count is dropped by the reindex below.
year_range = np.arange(df.year.min(), df.year.max() + 1)

# Make the year column the index
df = df.set_index("year")

# Reindex the dataframe, using the full range of years.
# By default reindex will place NA/NaN in locations that have no value in the
# previous/original index. In this case we want zeros rather than NA, so we use
# the fill_value=0 option
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html
df = df.reindex(year_range, fill_value=0)

# Finally we reset the index to go back to a 'normal' pandas index
# see also here: http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
df.reset_index(inplace=True)
# Dump the per-year counts to a tab-separated text file
data_path = out_name + '.dat'
df.to_csv(data_path, sep='\t')

# Draw count against year as a line, restricted to the plot limits
# specified earlier
x_limits = [plot_min, plot_max]
ax = df.plot(x="year", y="count", kind='line', xlim=x_limits)
ax.figure.show()
ax.set_xlabel("Publication Year")
ax.set_ylabel("Frequency")
# Drop the legend attached to the axes by df.plot
ax.legend_.remove()

# Write the current figure out as a PNG
fig = plt.gcf()
#fig.set_size_inches(18.5, 10.5)
image_path = out_name + '.png'
plt.savefig(image_path, bbox_inches='tight', dpi=600)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment