kilian-gebhardt · March 12, 2020 12:18
diff --git a/ingest_tsv.py b/ingest_tsv.py
 #!/usr/bin/env python3
 """
 Takes a conference TSV file and, optionally, a conference metadata
 file, and creates the Anthology XML in the ACL Anthology repository.o
 This file can then be added to the repo and committed.

 Example usage:

 - First, fork [the ACL Anthology](https://github.com/acl-org/acl-anthology) to your Github account, and clone it to your drive
 - Grab one of the [conference TSV files](https://drive.google.com/drive/u/0/folders/1hC7FDlsXWVM2HSYgdluz01yotdEd0zW8)
 - Export the [conference list file](https://docs.google.com/spreadsheets/d/1fpxmdV_BPwR6BQHyU9VJQxXeSOmy4__5nQCHBEviyAw/edit#gid=0) to conference-meta.tsv

 Then run it as

    scripts/ingest_tsv.py --anthology /path/to/anthology eamt/eamt.1997.tsv conference-metadata.tsv

 this will create a file `/path/to/anthology/data/xml/1997.eamt.xml`.
 You can then commit this to your Anthology repo, push to your Github, and create a PR.

 Author: Matt Post
 """


 import csv
 import lxml.etree as etree
 import os
 import ssl
 import subprocess
 import sys
 import urllib.request

 from anthology.utils import make_nested, make_simple_element, build_anthology_id, indent


 def download(remote_path, local_path):
    if os.path.exists(local_path):
        print(f"{local_path} already exists, not re-downloading", file=sys.stderr)
        return True
    try:
        print(
            f"-> Downloading file from {remote_path} to {local_path}", file=sys.stderr
        )
        with urllib.request.urlopen(remote_path) as url, open(
                local_path, mode="wb"
        ) as input_file_fh:
            input_file_fh.write(url.read())
    except ssl.SSLError:
        raise Exception(f"Could not download {path}")

    return True


 def extract_pages(source_path , page_range, local_path):
    if os.path.exists(local_path):
        print(f"{local_path} already exists, not re-extracting", file=sys.stderr)
        return True
    if not os.path.exists(source_path):
        print(f"{source_path} does not exists", file=sys.stderr)
        raise Exception(f"Could not extract pdf")
    try:
        page_range = ' A'.join(page_range.split(','))
        print(
            f"-> Extracting pages {page_range} from {source_path} to {local_path}", file=sys.stderr
        )
        command = [f"pdftk A={source_path} cat {page_range} output {local_path}"]
        print(command)
        subprocess.check_call(command, shell=True)
    except ssl.SSLError:
        raise Exception(f"Could not extract pdf")

    return True


 def main(args):
    code, year, _ = os.path.basename(args.tsv_file.name).split(".")

    collection_id = f"{year}.{code}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id})
    )

    volume_id = "1"
    volume = make_simple_element("volume", attrib={"id": volume_id})
    tree.getroot().insert(0, volume)

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        if row["Conference code"] == collection_id:
            if row["Completed"] == "FALSE":
                print("Warning: Conference {collection_id} is not marked as completed, can't ingest.")
                sys.exit(1)

            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)
            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = ' '.join(editor_name.split()[:-1]), editor_name.split()[-1]
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}", file=sys.stderr)
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    
    if args.frontmatter:
        start_id = 0
    else:
        start_id = 1

    # Create entries for all the papers
    for paperid, row in enumerate(csv.DictReader(args.tsv_file, delimiter='\t'), start_id):
        if paperid == 0:

            title_text = row["Title"]
            pages = row["Pagenumbers"]
            try: 
                pdf = row["Pdf"]
            except KeyError:
                pdf = None
            paper = make_simple_element(
                "frontmatter",
                parent=volume
            )
            frontmatter = make_simple_element("frontmatter", parent=volume)
            make_simple_element("pages", pages, parent=frontmatter)
            
            url = f"{collection_id}-{volume_id}.{paperid}"
            pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
            make_simple_element("url", url, parent=frontmatter)
            if not pdf is None:
                download(pdf, pdf_local_path)
            else:
                pdf_pages = row["pages in pdf"]
                extract_pages(args.proceedings, pdf_pages, pdf_local_path) 
            continue

        title_text = row["Title"]
        author_list = row["Authors"].split(" and ")
        pages = row["Pagenumbers"]
        
        try: 
            pdf = row["Pdf"]
        except KeyError:
            pdf = None

        paper = make_simple_element(
            "paper",
            attrib={"id": str(paperid)},
            parent=volume
        )

        make_simple_element("title", title_text, parent=paper)
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            print(author_name)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else: 
                first, last = ' '.join(author_name.split()[:-1]), author_name.split()[-1]
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        make_simple_element("pages", pages, parent=paper)

        url = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
        make_simple_element("url", url, parent=paper)
        if not pdf is None:
            download(pdf, pdf_local_path)
        else:
            pdf_pages = row["pages in pdf"]
            extract_pages(args.proceedings, pdf_pages, pdf_local_path) 

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            extension = row["Presentation"].split(".")[-1]
            filename = f"{collection_id}-{volume_id}.{paperid}.Presentation.{extension}"
            make_simple_element(
                "attachment",
                filename,
                attrib={"type": "presentation"}
            )
            download(row["Presentation"], os.path.join(collection_id, filename))

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(
        args.anthology, "data", "xml", f"{collection_id}.xml"
    )
    tree.write(
        collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
    )


 if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('tsv_file', type=argparse.FileType("r"))
    parser.add_argument('meta_file', type=argparse.FileType("r"),
                        help="Path to conference metadata file")
    parser.add_argument('--anthology', default=f"{os.environ.get('HOME')}/code/acl-anthology",
                        help="Path to Anthology repo (cloned from https://github.com/acl-org/acl-anthology)")

    parser.add_argument('--proceedings', help="Path to PDF with conference proceedings")
    parser.add_argument('--frontmatter', action="store_true")
    args = parser.parse_args()

    main(args)
	#!/usr/bin/env python3
	"""
	Takes a conference TSV file and, optionally, a conference metadata
	file, and creates the Anthology XML in the ACL Anthology repository.o
	This file can then be added to the repo and committed.

	Example usage:

	- First, fork [the ACL Anthology](https://github.com/acl-org/acl-anthology) to your Github account, and clone it to your drive
	- Grab one of the [conference TSV files](https://drive.google.com/drive/u/0/folders/1hC7FDlsXWVM2HSYgdluz01yotdEd0zW8)
	- Export the [conference list file](https://docs.google.com/spreadsheets/d/1fpxmdV_BPwR6BQHyU9VJQxXeSOmy4__5nQCHBEviyAw/edit#gid=0) to conference-meta.tsv

	Then run it as

	scripts/ingest_tsv.py --anthology /path/to/anthology eamt/eamt.1997.tsv conference-metadata.tsv

	this will create a file `/path/to/anthology/data/xml/1997.eamt.xml`.
	You can then commit this to your Anthology repo, push to your Github, and create a PR.

	Author: Matt Post
	"""


	import csv
	import lxml.etree as etree
	import os
	import ssl
	import subprocess
	import sys
	import urllib.request

	from anthology.utils import make_nested, make_simple_element, build_anthology_id, indent


	def download(remote_path, local_path):
	if os.path.exists(local_path):
	print(f"{local_path} already exists, not re-downloading", file=sys.stderr)
	return True
	try:
	print(
	f"-> Downloading file from {remote_path} to {local_path}", file=sys.stderr
	)
	with urllib.request.urlopen(remote_path) as url, open(
	local_path, mode="wb"
	) as input_file_fh:
	input_file_fh.write(url.read())
	except ssl.SSLError:
	raise Exception(f"Could not download {path}")

	return True


	def extract_pages(source_path , page_range, local_path):
	if os.path.exists(local_path):
	print(f"{local_path} already exists, not re-extracting", file=sys.stderr)
	return True
	if not os.path.exists(source_path):
	print(f"{source_path} does not exists", file=sys.stderr)
	raise Exception(f"Could not extract pdf")
	try:
	page_range = ' A'.join(page_range.split(','))
	print(
	f"-> Extracting pages {page_range} from {source_path} to {local_path}", file=sys.stderr
	)
	command = [f"pdftk A={source_path} cat {page_range} output {local_path}"]
	print(command)
	subprocess.check_call(command, shell=True)
	except ssl.SSLError:
	raise Exception(f"Could not extract pdf")

	return True


	def main(args):
	code, year, _ = os.path.basename(args.tsv_file.name).split(".")

	collection_id = f"{year}.{code}"

	tree = etree.ElementTree(
	make_simple_element("collection", attrib={"id": collection_id})
	)

	volume_id = "1"
	volume = make_simple_element("volume", attrib={"id": volume_id})
	tree.getroot().insert(0, volume)

	# Create the metadata for the paper
	meta = None
	for row in csv.DictReader(args.meta_file, delimiter="\t"):
	if row["Conference code"] == collection_id:
	if row["Completed"] == "FALSE":
	print("Warning: Conference {collection_id} is not marked as completed, can't ingest.")
	sys.exit(1)

	meta = make_simple_element("meta", parent=volume)
	make_simple_element("booktitle", row["Conference title"], parent=meta)
	make_simple_element("publisher", row["Publisher"], parent=meta)
	make_simple_element("address", row["Location"], parent=meta)
	make_simple_element("month", row["Dates held"], parent=meta)
	make_simple_element("year", row["Year"], parent=meta)
	if row["Editors"] != "" and "?" not in row["Editors"]:
	editors = row["Editors"].split(" and ")
	for editor_name in editors:
	editor = make_simple_element("editor", parent=meta)
	if ", " in editor_name:
	last, first = editor_name.split(", ")
	else:
	first, last = ' '.join(editor_name.split()[:-1]), editor_name.split()[-1]
	make_simple_element("first", first, parent=editor)
	make_simple_element("last", last, parent=editor)
	break
	else:
	print(f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}", file=sys.stderr)
	sys.exit(1)

	if not os.path.exists(collection_id):
	print(f"Creating {collection_id}", file=sys.stderr)
	os.makedirs(collection_id)


	if args.frontmatter:
	start_id = 0
	else:
	start_id = 1

	# Create entries for all the papers
	for paperid, row in enumerate(csv.DictReader(args.tsv_file, delimiter='\t'), start_id):
	if paperid == 0:

	title_text = row["Title"]
	pages = row["Pagenumbers"]
	try:
	pdf = row["Pdf"]
	except KeyError:
	pdf = None
	paper = make_simple_element(
	"frontmatter",
	parent=volume
	)
	frontmatter = make_simple_element("frontmatter", parent=volume)
	make_simple_element("pages", pages, parent=frontmatter)

	url = f"{collection_id}-{volume_id}.{paperid}"
	pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
	make_simple_element("url", url, parent=frontmatter)
	if not pdf is None:
	download(pdf, pdf_local_path)
	else:
	pdf_pages = row["pages in pdf"]
	extract_pages(args.proceedings, pdf_pages, pdf_local_path)
	continue

	title_text = row["Title"]
	author_list = row["Authors"].split(" and ")
	pages = row["Pagenumbers"]

	try:
	pdf = row["Pdf"]
	except KeyError:
	pdf = None

	paper = make_simple_element(
	"paper",
	attrib={"id": str(paperid)},
	parent=volume
	)

	make_simple_element("title", title_text, parent=paper)
	for author_name in author_list:
	if author_name == "":
	continue
	author = make_simple_element("author", parent=paper)
	print(author_name)
	if ", " in author_name:
	last, first = author_name.split(", ")
	else:
	first, last = ' '.join(author_name.split()[:-1]), author_name.split()[-1]
	make_simple_element("first", first, parent=author)
	make_simple_element("last", last, parent=author)

	make_simple_element("pages", pages, parent=paper)

	url = f"{collection_id}-{volume_id}.{paperid}"
	pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
	make_simple_element("url", url, parent=paper)
	if not pdf is None:
	download(pdf, pdf_local_path)
	else:
	pdf_pages = row["pages in pdf"]
	extract_pages(args.proceedings, pdf_pages, pdf_local_path)

	if "Abstract" in row:
	make_simple_element("abstract", row["Abstract"], parent=paper)

	if "Presentation" in row:
	extension = row["Presentation"].split(".")[-1]
	filename = f"{collection_id}-{volume_id}.{paperid}.Presentation.{extension}"
	make_simple_element(
	"attachment",
	filename,
	attrib={"type": "presentation"}
	)
	download(row["Presentation"], os.path.join(collection_id, filename))

	indent(tree.getroot())

	# Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
	collection_file = os.path.join(
	args.anthology, "data", "xml", f"{collection_id}.xml"
	)
	tree.write(
	collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
	)


	if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser()
	parser.add_argument('tsv_file', type=argparse.FileType("r"))
	parser.add_argument('meta_file', type=argparse.FileType("r"),
	help="Path to conference metadata file")
	parser.add_argument('--anthology', default=f"{os.environ.get('HOME')}/code/acl-anthology",
	help="Path to Anthology repo (cloned from https://github.com/acl-org/acl-anthology)")

	parser.add_argument('--proceedings', help="Path to PDF with conference proceedings")
	parser.add_argument('--frontmatter', action="store_true")
	args = parser.parse_args()

	main(args)
No results found