Skip to content

Instantly share code, notes, and snippets.

@triplingual
Created January 8, 2019 15:37
Show Gist options
  • Save triplingual/1677b0f290c0cdbc7c27d8002d55d49d to your computer and use it in GitHub Desktop.
Save triplingual/1677b0f290c0cdbc7c27d8002d55d49d to your computer and use it in GitHub Desktop.
# Python script that extracts the data needed by @agoldst/dfrtopics (older) from JSTOR DFR metadata XML files (newer).
# columns to create and populate:
# id, doi, title, author, journaltitle, volume, issue, pubdate,
# pagerange, publisher, type, reviewed-work, abstract
import xml.etree.ElementTree
import unicodecsv as csv # need to install unicodecsv, tho
import re
import os
import sys
import glob
# ---------------------------------------------------------------------------
# Field-extraction helpers
# ---------------------------------------------------------------------------

# XPath prefixes shared by the lookups below.
ARTICLE_META = ".//front/article-meta/"
JOURNAL_META = ".//front/journal-meta/"

# Value of the root element's "article-type" attribute -> type code written
# to the TSV.  Unlisted article types are written as an empty string.
ARTICLE_TYPE_CODES = {
    "research-article": "fla",
    "book-review": "brv",
    "misc": "mis",
    "editorial": "edi",
    "news": "nws",
}


def _collapse_whitespace(text):
    """Turn newlines and tabs into spaces and squeeze runs of spaces to one."""
    text = text.replace("\n", " ").replace("\t", " ")
    return re.sub(" +", " ", text)


def _text(node):
    """Return ``node.text``, or "" when the node or its text is missing."""
    if node is None or node.text is None:
        return ""
    return node.text


def _all_text(node):
    """Return the text of *node* and all of its descendants, markup removed.

    Unlike ``node.text`` this keeps text that sits inside or after inline
    formatting elements such as <italic>, <bold> or <p>.
    """
    if node is None:
        return ""
    return "".join(node.itertext())


def _person_name(namenode):
    """Format a <string-name> element as "Given Surname".

    Falls back to the element's own text when it has neither <given-names>
    nor <surname>; returns "" when nothing usable is present.
    """
    if namenode is None:
        return ""
    given = _text(namenode.find("given-names"))
    family = _text(namenode.find("surname"))
    if given or family:
        # strip() avoids a stray space when only one of the parts exists.
        return (given + " " + family).strip()
    return _text(namenode)


def _extract_doi(root):
    """Return the DOI, else "10.2307/" + the JSTOR id, else ""."""
    doi = _text(root.find(ARTICLE_META + "article-id[@pub-id-type='doi']"))
    if doi:
        return doi
    jstor = _text(root.find(ARTICLE_META + "article-id[@pub-id-type='jstor']"))
    if jstor:
        return "10.2307/" + jstor
    # Always return a value so one article can never inherit another's DOI.
    return ""


def _extract_authors(root):
    """Return the list of contributor names from the first <contrib-group>."""
    group = root.find(ARTICLE_META + "contrib-group")
    if group is None:
        return []
    names = [_person_name(contrib.find("string-name"))
             for contrib in group.findall("contrib")]
    return [name for name in names if name]


def _extract_pubdate(root):
    """Return "year-month-day", truncated to the parts that are present.

    All three parts present gives the same "Y-M-D" string as before; a
    missing day gives "Y-M", a missing month gives "Y", no year gives "".
    """
    base = ARTICLE_META + "pub-date/"
    year = _text(root.find(base + "year"))
    if not year:
        return ""
    parts = [year]
    month = _text(root.find(base + "month"))
    if month:
        parts.append(month)
        day = _text(root.find(base + "day"))
        if day:
            parts.append(day)
    return "-".join(parts)


def _extract_pagerange(root):
    """Return <page-range> if it has text, else "<fpage>-<lpage>"."""
    pagerange = _text(root.find(ARTICLE_META + "page-range"))
    if pagerange:
        return pagerange
    # sometimes no page-range node
    fpage = _text(root.find(ARTICLE_META + "fpage"))
    lpage = _text(root.find(ARTICLE_META + "lpage"))
    if fpage and lpage:
        return fpage + "-" + lpage
    # Only one end known (or neither): report what there is, never a bare "-".
    return fpage or lpage


def _extract_reviewed_works(root):
    """Return one "title| name, name" string per <product> element."""
    works = []
    for product in root.findall(ARTICLE_META + "product"):
        # The <source> title is often wrapped in <italic>/<bold>; take the
        # text of the whole subtree instead of serialising and un-tagging it.
        worktitle = _collapse_whitespace(_all_text(product.find("source"))).strip()
        names = [_person_name(node) for node in product.findall("string-name")]
        names = [name for name in names if name]
        if worktitle or names:
            works.append(worktitle + "| " + ", ".join(names))
    return works


def citation_row(root):
    """Build the TSV row for one parsed article.

    *root* is the root element of a metadata XML file.  The returned list has
    13 string fields, in this order:

        id, doi, title, author, journaltitle, volume, issue, pubdate,
        pagerange, publisher, type, reviewed-work, abstract

    ``id`` and ``doi`` carry the same value.  Every field that cannot be found
    in the XML is returned as an empty string instead of raising.
    """
    doi = _extract_doi(root)
    title = _collapse_whitespace(
        _all_text(root.find(ARTICLE_META + "title-group/article-title")))
    journaltitle = _text(
        root.find(JOURNAL_META + "journal-title-group/journal-title"))
    publisher = _collapse_whitespace(
        _text(root.find(JOURNAL_META + "publisher/publisher-name")))
    # Abstract text normally lives inside <p> children, so gather the subtree.
    abstract = _collapse_whitespace(
        _all_text(root.find(ARTICLE_META + "abstract"))).strip()
    return [
        doi,  # id (seems to be same as DOI)
        doi,
        title,
        ", ".join(_extract_authors(root)),  # comma separated if >1
        journaltitle,
        _text(root.find(ARTICLE_META + "volume")),
        _text(root.find(ARTICLE_META + "issue")),
        _extract_pubdate(root),
        _extract_pagerange(root),
        publisher,
        ARTICLE_TYPE_CODES.get(root.get("article-type"), ""),
        ", ".join(_extract_reviewed_works(root)),
        abstract,
    ]


# ---------------------------------------------------------------------------
# Script body
# ---------------------------------------------------------------------------

# Working directory: first command-line argument, or the current directory.
# print() is called with a single argument so it behaves the same on
# Python 2 and Python 3.
if len(sys.argv) < 2:
    print("Looking for XML files in current directory . . .")
    print("")
    pathname = os.getcwd()
else:
    print("Using " + sys.argv[1] + " as working path for XML files.")
    pathname = sys.argv[1].rstrip("/")

# Get filenames from system
metadatafilenames = glob.glob(pathname + "/*.xml")

# Start looping through metadata files
for articlepath in metadatafilenames:
    print(articlepath)
    e = xml.etree.ElementTree.parse(articlepath).getroot()
    # Write to CSV file.  The file is opened in append mode for every article,
    # so rows already written survive a failure on a later file; re-running
    # the script adds to an existing citations.tsv rather than replacing it.
    with open('citations.tsv', 'ab') as tsvfile:
        metadatawriter = csv.writer(tsvfile, delimiter='\t', quotechar='"')
        metadatawriter.writerow(citation_row(e))
# End looping through metadata files
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment