Created
March 30, 2019 19:37
-
-
Save aryamccarthy/18faf5056343cca9a9fed82d95c80bff to your computer and use it in GitHub Desktop.
parse that TACL metadata
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
""" | |
Convert MIT Press XML files for TACL to Anthology XML. | |
""" | |
import logging | |
import xml.etree.ElementTree as etree | |
from pathlib import Path | |
from typing import List, Optional, Tuple | |
__version__ = '0.1' | |
# Module-level logger. When the file is executed as a script, __name__ is
# '__main__', so the file stem is used as the logger name instead; when the
# file is imported, the regular module name is used.
# (The comparison string previously had a trailing space, '__main__ ', which
# never matched, so the file-stem branch was unreachable.)
log = logging.getLogger(
    Path(__file__).stem if __name__ == '__main__' else __name__
)
def parse_args():
    """Parse command line arguments.

    Returns:
        The ``argparse.Namespace`` produced by the parser, with:

        * ``tacl_year_root`` -- the FOLDER positional argument as a ``Path``,
          resolved to an absolute path;
        * ``outfile`` -- the value given to ``--outfile``/``-o``, or the
          binary stdout stream when the option is omitted;
        * ``verbose`` -- a ``logging`` level: ``DEBUG`` with ``-v``,
          ``WARNING`` with ``-q``, ``INFO`` otherwise.
    """
    import argparse
    # ``sys`` is otherwise only imported inside the ``__main__`` guard of this
    # file; importing it here keeps the --outfile default from raising
    # NameError when this function is called from an importing module.
    import sys

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('tacl_year_root', metavar='FOLDER', type=Path)
    parser.add_argument('--outfile', '-o', default=sys.stdout.buffer,
                        help='Output XML file (default stdout)')

    # -v and -q write to the same destination and cannot be combined.
    verbosity = parser.add_mutually_exclusive_group()
    verbosity.add_argument('-v', '--verbose', action='store_const',
                           const=logging.DEBUG, default=logging.INFO)
    verbosity.add_argument('-q', '--quiet', dest='verbose',
                           action='store_const', const=logging.WARNING)

    parser.add_argument('--version', action='version',
                        version=f'%(prog)s v{__version__}')

    args = parser.parse_args()
    args.tacl_year_root = args.tacl_year_root.resolve()  # Get absolute path.
    return args
def get_volume_info(xml: Path) -> etree.Element:
    """Build the ``<paper>`` entry that describes the volume as a whole.

    Nothing is read from the file contents; the information comes from the
    file name. The stem is split on "." and the last piece is used as the
    volume number, the second piece as the year.

    Args:
        xml: Path of the volume-level XML file.

    Returns:
        A ``paper`` element with id "1000" containing a ``title`` child
        ("Transactions of the Association for Computational Linguistics,
        Volume N") and a ``year`` child.

    Raises:
        IndexError: if the file stem contains no "." (there is then no
            second piece to use as the year).
    """
    log.info("Getting volume info from %s", xml)
    # So far, their XML for the volume doesn't play nicely with xml.etree. Thus, we hack.
    stem_parts = xml.stem.split(".")
    volume_text = stem_parts[-1]

    paper = etree.Element('paper')
    paper.attrib['id'] = "1000"  # hard-code because there's only one collection.

    title_text = "Transactions of the Association for Computational Linguistics"
    title = etree.SubElement(paper, 'title')
    title.text = "{}, Volume {}".format(title_text, volume_text)

    year = etree.SubElement(paper, 'year')
    year.text = stem_parts[1]
    return paper
def get_paperid(xml: Path) -> str:
    """Derive the paper number from an article XML file name.

    The file stem must end in four decimal digits. The last three of them are
    prefixed with "1" to form the returned id.

    Args:
        xml: Path of the article XML file; only its stem is inspected.

    Returns:
        A four-character string: "1" followed by the last three digits of
        the stem.

    Raises:
        AssertionError: if the stem does not end in four digits 0-9. The
            stem is the exception argument.
    """
    basename = xml.stem
    suffix = basename[-4:]
    # Raised explicitly rather than through an ``assert`` statement so that
    # the check still runs when Python is started with -O.
    if len(suffix) != 4 or any(ch not in "0123456789" for ch in suffix):
        raise AssertionError(basename)
    return "1" + suffix[-3:]  # TACL is always QXX-1YYY.
def get_title(xml_front_node: etree.Element) -> str:
    """Return the article title found under ``article-meta/title-group``.

    The text of the ``article-title`` element and of all its descendants is
    concatenated in document order, the same way ``get_abstract`` flattens
    the abstract. Reading only ``.text`` would stop at the first child
    element and cut off titles that contain inline markup.

    Args:
        xml_front_node: The ``front`` element of an article.

    Returns:
        The title as plain text; an empty string if the element has no text.

    Raises:
        AttributeError: if ``article-meta``, ``title-group`` or
            ``article-title`` is missing.
    """
    article_meta = xml_front_node.find('article-meta')
    title_group = article_meta.find('title-group')
    article_title = title_group.find('article-title')
    return "".join(article_title.itertext())
def get_year(xml_front_node: etree.Element) -> str:
    """Return the text of the ``year`` inside the first ``pub-date``.

    The lookup goes ``article-meta`` -> first ``pub-date`` -> first ``year``;
    a missing element along the way raises ``AttributeError``.
    """
    first_pub_date = xml_front_node.find('article-meta').find('pub-date')
    return first_pub_date.find('year').text
def get_abstract(xml_front_node: etree.Element) -> str:
    """Return the abstract as plain text.

    All text inside ``article-meta/abstract``, including text nested in child
    elements, is concatenated in document order and stripped of leading and
    trailing whitespace. A missing element raises ``AttributeError``.
    """
    abstract_node = xml_front_node.find('article-meta').find('abstract')
    text_pieces = abstract_node.itertext()
    return "".join(text_pieces).strip()
def get_authors(xml_front_node: etree.Element) -> List[Tuple[str, str]]:
    """List the authors as ``(given names, surname)`` pairs.

    Every ``contrib`` directly under ``article-meta/contrib-group`` yields
    one pair, read from its ``string-name`` child, in document order. A
    missing element along the way raises ``AttributeError``.
    """
    contribs = (
        xml_front_node.find('article-meta')
        .find('contrib-group')
        .findall('contrib')
    )
    name_nodes = (contrib.find('string-name') for contrib in contribs)
    return [
        (node.find('given-names').text, node.find('surname').text)
        for node in name_nodes
    ]
def get_pages(xml_front_node: etree.Element) -> Tuple[str, str]:
    """Return the ``(first page, last page)`` texts from ``article-meta``.

    Both ``fpage`` and ``lpage`` are looked up as direct children of
    ``article-meta``; a missing element raises ``AttributeError``.
    """
    meta = xml_front_node.find('article-meta')
    first_page, last_page = (meta.find(tag) for tag in ('fpage', 'lpage'))
    return first_page.text, last_page.text
def process_xml(xml: Path) -> Optional[etree.Element]:
    """Convert one article XML file into an Anthology ``<paper>`` element.

    The paper id is derived from the file name; title, authors, year,
    abstract and page range are read from the file's ``front`` element and
    appended to the result as children, in that order.

    Args:
        xml: Path of the article XML file.

    Returns:
        The populated ``paper`` element. The annotation is ``Optional`` and
        the caller checks for ``None``, but this implementation always
        returns an element.
    """
    # Use the module logger (not the root logger) like the rest of the file.
    log.info("Reading {}".format(xml))

    paper = etree.Element('paper')
    paper.attrib['id'] = get_paperid(xml)

    front = etree.parse(xml).getroot().find('front')

    etree.SubElement(paper, 'title').text = get_title(front)

    for given_names, surname in get_authors(front):
        author = etree.SubElement(paper, 'author')
        etree.SubElement(author, 'first').text = given_names
        etree.SubElement(author, 'last').text = surname

    etree.SubElement(paper, 'year').text = get_year(front)
    etree.SubElement(paper, 'abstract').text = get_abstract(front)

    pages = etree.SubElement(paper, 'pages')
    pages.text = "–".join(get_pages(front))  # en-dash, not hyphen!

    return paper
if __name__ == '__main__':
    # Script entry point: build one <volume> element from every article found
    # under the given folder and write it out as XML.
    import sys

    # NOTE: parse_args() contains an f-string, so interpreters older than 3.6
    # fail when compiling this file, before this check can run.
    if sys.version_info < (3,6):
        sys.stderr.write("Python >=3.6 required.\n")
        sys.exit(1)

    args = parse_args()
    logging.basicConfig(level=args.verbose)

    # The volume id is a journal prefix plus the last two digits of the year,
    # both taken from the folder name.
    root_stem = args.tacl_year_root.stem
    prefix = "Q" if "tacl" in root_stem else "J"  # J for CL, Q for TACL.
    year_suffix = root_stem.split(".")[1][-2:]  # Feels hacky, too.
    volume_id = prefix + year_suffix

    volume = etree.Element('volume')
    volume.attrib['id'] = volume_id

    # The volume-level entry comes first. Fail with a readable message rather
    # than an IndexError when no volume file is present.
    volume_xmls = list(args.tacl_year_root.glob("tacl.20*.*/tacl.20*.*.xml"))
    if not volume_xmls:
        log.error("No volume XML matching tacl.20*.*/tacl.20*.*.xml in %s",
                  args.tacl_year_root)
        sys.exit(1)
    volume.append(get_volume_info(volume_xmls[0]))

    for xml in sorted(args.tacl_year_root.glob("tacl_a_*/*.xml")):
        # Logged instead of printed: stdout is the default destination for
        # the XML output, so printing here would corrupt it.
        log.debug("Processing %s", xml)

        pdf = xml.with_suffix(".pdf")
        if not pdf.is_file():
            # Reported, but the paper is still converted.
            log.error("Missing pdf for " + xml.name)

        papernode = process_xml(xml)
        if papernode is None:
            continue

        url = etree.SubElement(papernode, 'url')
        url.text = "http://www.aclweb.org/anthology/{}-{}".format(
            volume_id, papernode.attrib['id'])
        volume.append(papernode)

    # Insert whitespace by hand so the serialized XML is broken into lines.
    for paper in volume:
        for field in paper:
            field.tail = '\n '
        if len(paper):
            paper.text = '\n '
            paper[-1].tail = '\n '
        paper.tail = '\n\n '
    if len(volume):
        volume.text = '\n '
        volume[-1].tail = '\n'
    volume.tail = '\n'

    et = etree.ElementTree(volume)
    et.write(args.outfile, encoding="UTF-8", xml_declaration=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment