Created
January 8, 2019 15:37
-
-
Save triplingual/1677b0f290c0cdbc7c27d8002d55d49d to your computer and use it in GitHub Desktop.
Python script to extract the data needed by the (older) @agoldst/dfrtopics package from the (newer) JSTOR DfR metadata XML files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract dfrtopics-style citation metadata from JSTOR DfR article XML files.

Reads every ``*.xml`` file in the working path (first CLI argument, or the
current directory when no argument is given) and appends one tab-separated
row per article to ``citations.tsv``.

Columns written (in order):
    id, doi, title, author, journaltitle, volume, issue, pubdate,
    pagerange, publisher, type, reviewed-work, abstract
"""
import csv
import glob
import os
import re
import sys
import xml.etree.ElementTree as ET

# JATS article-type attribute -> dfrtopics type code.
_TYPE_CODES = {
    "research-article": "fla",
    "book-review": "brv",
    "misc": "mis",
    "editorial": "edi",
    "news": "nws",
}


def _squash(text):
    """Collapse newlines, tabs, and runs of spaces in *text* to single spaces."""
    if text is None:
        return ""
    return re.sub(" +", " ", text.replace("\n", " ").replace("\t", " "))


def _find_text(root, path):
    """Return the text of the first element at *path*, or "" if absent/empty."""
    node = root.find(path)
    if node is None or node.text is None:
        return ""
    return node.text


def _extract_doi(root):
    """Return the article DOI, deriving one from the JSTOR id if needed."""
    doinode = root.find(".//front/article-meta/article-id[@pub-id-type='doi']")
    if doinode is not None and doinode.text:
        return doinode.text
    jstornode = root.find(".//front/article-meta/article-id[@pub-id-type='jstor']")
    if jstornode is not None and jstornode.text:
        return "10.2307/" + jstornode.text
    # BUG FIX: the original left `doi` unset (NameError) -- or stale from the
    # previous file -- when neither id node was present.
    return ""


def _extract_authors(root):
    """Return a list of 'Given Surname' author names, in document order."""
    names = []
    group = root.find(".//front/article-meta/contrib-group")
    if group is None:
        return names
    for contrib in group.findall("contrib"):
        namenode = contrib.find("string-name")
        if namenode is None:
            continue  # guard: original raised AttributeError here
        given = namenode.find("given-names")
        if given is not None:
            surname = namenode.find("surname")
            names.append(
                (given.text or "")
                + " "
                + ((surname.text or "") if surname is not None else "")
            )
        elif namenode.text:
            # Unstructured name: use the raw string-name text.
            names.append(namenode.text)
    return names


def _extract_pubdate(root):
    """Return the publication date as 'YYYY-MM-DD' (parts may be empty)."""
    parts = [
        _find_text(root, ".//front/article-meta/pub-date/" + tag)
        for tag in ("year", "month", "day")
    ]
    return "-".join(parts)


def _extract_pagerange(root):
    """Return the page range, falling back to 'fpage-lpage' when needed."""
    node = root.find(".//front/article-meta/page-range")
    if node is not None:
        return node.text or ""
    fpagenode = root.find(".//front/article-meta/fpage")
    lpagenode = root.find(".//front/article-meta/lpage")
    if fpagenode is not None and lpagenode is not None:
        # BUG FIX: the original tested the child-element count instead of the
        # node text, so leaf fpage/lpage nodes always yielded "" here.
        return (fpagenode.text or "") + "-" + (lpagenode.text or "")
    return ""


def _extract_reviewed_works(root):
    """Return 'Title| Author, Author' strings for each reviewed <product>."""
    works = []
    for product in root.findall(".//front/article-meta/product"):
        source = product.find("source")
        if source is None:
            continue  # guard: original crashed when <product> had no <source>
        if len(source):
            # Title nested in formatting markup: serialize the first child and
            # strip the bold/italic tags. tostring() returns bytes by default
            # in Python 3, so request a unicode string.
            title = ET.tostring(source[0], encoding="unicode").replace("\n", "")
            for tag in ("<bold>", "</bold>", "<italic>", "</italic>"):
                title = title.replace(tag, "")
            title = re.sub(" +", " ", title).lstrip(" ")
        else:
            title = (source.text or "").lstrip(" ")
        names = []
        for namenode in product.findall("string-name"):
            if len(namenode) == 0:
                if namenode.text is not None:
                    names.append(namenode.text)
            else:
                first = namenode.find("given-names")
                family = namenode.find("surname")
                names.append(
                    ((first.text or "") if first is not None else "")
                    + " "
                    + ((family.text or "") if family is not None else "")
                )
        works.append(title + "| " + ", ".join(names))
    return works


def _extract_abstract(root):
    """Return the abstract text with newlines and <p> markers removed."""
    node = root.find(".//front/article-meta/abstract")
    if node is None or node.text is None:
        return ""
    abstract = node.text.replace("\n", " ")
    # BUG FIX: the original called .replace() without assigning the result.
    return abstract.replace("<p>", "").replace("</p>", "")


def extract_row(root):
    """Return the 13-column dfrtopics citation row for one parsed article.

    *root* is the parsed <article> Element. Missing metadata produces empty
    strings rather than exceptions.
    """
    doi = _extract_doi(root)
    title = _squash(
        _find_text(root, ".//front/article-meta/title-group/article-title")
    )
    return [
        doi,  # id appears to be the same as the DOI in DfR data
        doi,
        title,
        ", ".join(_extract_authors(root)),
        _find_text(root, ".//front/journal-meta/journal-title-group/journal-title"),
        _find_text(root, ".//front/article-meta/volume"),
        _find_text(root, ".//front/article-meta/issue"),
        _extract_pubdate(root),
        _extract_pagerange(root),
        _squash(_find_text(root, ".//front/journal-meta/publisher/publisher-name")),
        # `article_type` replaces the original local named `type`, which
        # shadowed the builtin. Unknown types map to "".
        _TYPE_CODES.get(root.get("article-type"), ""),
        ", ".join(_extract_reviewed_works(root)),
        _extract_abstract(root),
    ]


def main(argv=None):
    """Process every XML file in the target path, appending to citations.tsv."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Looking for XML files in current directory . . .")
        print("")
        pathname = os.getcwd()
    else:
        print("Using " + argv[1] + " as working path for XML files.")
        pathname = argv[1].rstrip("/")
    # Open once in append mode (the original reopened the file per article;
    # the resulting file contents are identical). newline="" lets the csv
    # module control line endings.
    with open("citations.tsv", "a", newline="", encoding="utf-8") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t", quotechar='"')
        for articlepath in glob.glob(pathname + "/*.xml"):
            print(articlepath)
            root = ET.parse(articlepath).getroot()
            writer.writerow(extract_row(root))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment