Last active
May 20, 2019 09:22
-
-
Save kumo/f5d2a5832e4268b723eefb60d5e6143a to your computer and use it in GitHub Desktop.
Python script that generates author and editor statistics for a collection of files named in the format "DATE - Author 1, Author 2-Editor.doc".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate author/editor statistics for the files in the current directory
# whose names follow the pattern "DATE - Author 1, Author 2-Editor.ext".
# The statements below are valid on both Python 2.7 and Python 3.
import csv
import datetime
import glob
import re
import sys
from collections import Counter


def split_credits(credits):
    """Split an ``Author 1, Author 2-Editor`` string into editor and authors.

    The editor is whatever follows the LAST dash, so hyphenated author names
    such as ``Anne-Marie`` stay intact. Author names are separated by commas,
    spaces or ``&``; empty tokens and purely numeric tokens (the ``1`` in
    ``Author 1``) are dropped.

    Args:
        credits: the part of the file name after ``DATE - ``, without the
            file extension.

    Returns:
        A tuple ``(editor, names)``. ``editor`` is ``None`` when there is no
        dash or nothing but whitespace after it; ``names`` is a list of
        author tokens in order of appearance.
    """
    author_part, dash, editor = credits.rpartition('-')
    if not dash:
        # No dash at all: the whole string is the author list.
        author_part, editor = credits, None
    else:
        editor = editor.strip() or None
    names = [token for token in re.split('[, &]', author_part)
             if token and not token.isdigit()]
    return editor, names


def tally_credits(files):
    """Count editor and author occurrences across a list of file names.

    Names that do not contain the `` - `` separator are reported on stdout
    and skipped. Files without an editor are reported but their authors are
    still counted.

    Args:
        files: iterable of file names (not paths with directories).

    Returns:
        A tuple ``(editors, authors)`` of two ``Counter`` objects.
    """
    editors = Counter()
    authors = Counter()
    for filename in files:
        # Strip only the final extension; dates or names may contain dots.
        stem = filename.rsplit('.', 1)[0]
        info_parts = stem.split(' - ')
        if len(info_parts) < 2:
            # Spacing matches what the original print statement emitted.
            print("Skipping  %s " % (info_parts,))
            continue
        editor, names = split_credits(info_parts[1])
        if editor is None:
            print("Can't find editor in:  %s" % filename)
        else:
            editors[editor] += 1
        authors.update(names)
    return editors, authors


def write_stats_csv(results, path):
    """Write ``author,count`` rows to *path*, most frequent author first.

    The file is opened the way the csv module requires on each major Python
    version (binary on 2, ``newline=''`` on 3) so that no blank lines appear
    between rows on Windows.

    Args:
        results: a ``Counter`` mapping author name to count.
        path: destination file name; an existing file is overwritten.
    """
    if sys.version_info[0] < 3:
        csvfile = open(path, 'wb')
    else:
        csvfile = open(path, 'w', newline='')
    with csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['author', 'count'])
        writer.writerows(results.most_common())


def main():
    """Scan the current directory, print the tallies and export a CSV."""
    files = glob.glob('*.*')
    print("There are  %d  files." % len(files))
    editors, results = tally_credits(files)
    print(editors)
    print(results)
    for name, count in results.most_common():
        print("%3i | %s" % (count, name))
    # Export a CSV file named after today's date, e.g. 20190520-stats.csv.
    stamp = datetime.datetime.today().strftime('%Y%m%d')
    write_stats_csv(results, stamp + "-stats.csv")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For example, if there are 2 files whose author lists are "Nuala, Tama" and "Nuala & Edgeworth",
this script should count Nuala twice, and Tama and Edgeworth once each.