Last active
September 4, 2015 18:30
-
-
Save c-forster/caf3389c74fddffdfcd3 to your computer and use it in GitHub Desktop.
Reduces (Amended) HathiTrust Fiction Metadata to the Parameters of the BSPF Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# Extract and summarize the gender breakdown for data comparable to | |
# that reported by Raven et al in *The English Novel 1770-1829: A | |
# Bibliographical Survey of Prose Fiction Published in the British Isles* | |
# As a practical matter, this means: | |
# - works published between 1770 and 1830 | |
# - published in England or Scotland or Ireland | |
# - individual works (remove duplicates, and count multivol works only once) | |
import csv, sys | |
import re | |
import operator | |
import string | |
# Character class matching any single character from string.punctuation;
# substituting '' for each match strips punctuation from a string.
puncutationRegex = re.compile(
    '[%s]' % re.escape(string.punctuation)
)

# Place-of-publication codes that count as "British Isles":
#   enk = England, stk = Scotland, ie = Ireland, wlk = Wales
# NOTE(review): the MARC country list also appears to define 'nik'
# (Northern Ireland), which is not included here -- confirm whether
# that omission is intended.
BritishIsles = ['enk', 'stk', 'ie', 'wlk']
def fieldsToWorkID(author, title):
    """
    Build a simplified identifier string for a work from its author
    and title.

    Punctuation (as matched by puncutationRegex) is removed from each
    field, the two cleaned fields are joined with a hyphen, every run
    of whitespace becomes a single underscore, and the result is
    lower-cased.
    """
    cleaned = [puncutationRegex.sub('', field) for field in (author, title)]
    # split() with no argument also drops leading/trailing whitespace.
    tokens = '-'.join(cleaned).split()
    return '_'.join(tokens).lower()
def main(filename='fiction_metadata-amended.csv'):
    """
    Print a per-year gender summary of the works in *filename* as CSV.

    The input is read with csv.DictReader and must provide the columns
    'date', 'author', 'title', 'gender' and 'place'.  A row is counted
    when all of the following hold:
      - its date is an integer year between 1770 and 1830 inclusive;
      - its place code is listed in BritishIsles;
      - its title contains neither 'works' nor 'novels' (compared in
        lower case); this is an attempt to prevent counting
        "Collected Works of ..." and similar, and so restrict
        ourselves (like the BSPF) to "new" works;
      - no earlier row produced the same fieldsToWorkID(author, title),
        so duplicates and the volumes of a multivolume work are
        counted only once.

    Rows whose date is missing or not a plain integer are skipped
    rather than aborting the run: they cannot fall inside the date
    range anyway.

    Output is written to sys.stdout: a header row, then one row per
    year in ascending order holding the total number of counted works
    and how many of them have the gender value 'male', 'female',
    'undetected' or 'namemissing'.
    """
    # Work IDs already counted.  A set keeps each duplicate check O(1)
    # instead of scanning an ever-growing list.
    seen_work_ids = set()
    # Dictionary with years (as ints) as keys, and lists of genders,
    # one entry per counted work, as values.
    works = {}

    with open(filename, 'rt') as f:
        for row in csv.DictReader(f):
            try:
                year = int(row['date'])
            except (TypeError, ValueError):
                # Blank, missing or non-numeric date: not countable.
                continue
            if not 1770 <= year <= 1830:
                continue
            if row['place'] not in BritishIsles:
                continue

            title = row['title']
            lowered_title = title.lower()
            if 'works' in lowered_title or 'novels' in lowered_title:
                continue

            work_id = fieldsToWorkID(row['author'], title)
            if work_id in seen_work_ids:
                continue
            seen_work_ids.add(work_id)
            works.setdefault(year, []).append(row['gender'])

    writer = csv.writer(sys.stdout)
    writer.writerow(['year', 'totalWorks', 'male', 'female',
                     'undetected', 'namemissing'])
    for year in sorted(works):
        genders = works[year]
        writer.writerow([
            year,
            len(genders),
            genders.count('male'),
            genders.count('female'),
            genders.count('undetected'),
            genders.count('namemissing'),
        ])
# When executed as a script, run the summary on the default input file.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment