Created
August 21, 2016 02:07
-
-
Save sobolevnrm/412763ebae5424a92d3239898b615e2a to your computer and use it in GitHub Desktop.
A very simple RIS file parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Process RIS format following the standard at", | |
http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """ | |
import re | |
# Two-character RIS field tags mapped to a human-readable description.
# "TY" opens a record and "ER" closes it; the remaining tags describe
# fields inside a record.
ALLOWED_TAGS = {
    "TY": "Record start",
    "ER": "Record end",
    "A2": "Secondary author",
    "A3": "Tertiary Author",
    "A4": "Subsidiary Author",
    "AB": "Abstract",
    "AD": "Author Address",
    "AN": "Accession Number",
    "AU": "Author",
    "C1": "Custom 1",
    "C2": "Custom 2",
    "C3": "Custom 3",
    "C4": "Custom 4",
    "C5": "Custom 5",
    "C6": "Custom 6",
    "C7": "Custom 7",
    "C8": "Custom 8",
    "CA": "Caption",
    "CN": "Call Number",
    "CY": "Place Published",
    "DA": "Date",
    "DB": "Name of Database",
    "DO": "DOI",
    "DP": "Database Provider",
    "ET": "Edition",
    "J2": "Alternate Title",
    "KW": "Keywords",
    "L1": "File Attachments",
    "L4": "Figure",
    "LA": "Language",
    "LB": "Label",
    "IS": "Number",
    "M3": "Type of Work",
    "N1": "Notes",
    "NV": "Number of Volumes",
    "OP": "Original Publication",
    "PB": "Publisher",
    "PY": "Year",
}
# RIS reference-type codes (the value carried by a "TY" field) mapped to a
# human-readable description.
REFERENCE_TYPES = {
    "ABST": "Abstract",
    "ADVS": "Audiovisual material",
    "ART": "Art Work",
    "BILL": "Bill/Resolution",
    "BOOK": "Book, Whole",
    "CASE": "Case",
    "CHAP": "Book chapter",
    "COMP": "Computer program",
    "CONF": "Conference proceeding",
    "CTLG": "Catalog",
    "DATA": "Data file",
    "ELEC": "Electronic Citation",
    "GEN": "Generic",
    "HEAR": "Hearing",
    "ICOMM": "Internet Communication",
    "INPR": "In Press",
    "JFULL": "Journal (full)",
    "JOUR": "Journal",
    "MAP": "Map",
    "MGZN": "Magazine article",
    "MPCT": "Motion picture",
    "MUSIC": "Music score",
    "NEWS": "Newspaper",
    "PAMP": "Pamphlet",
    "PAT": "Patent",
    "PCOMM": "Personal communication",
    "RPRT": "Report",
    "SER": "Serial (Book, Monograph)",
    "SLIDE": "Slide",
    "SOUND": "Sound recording",
    "STAT": "Statute",
    "THES": "Thesis/Dissertation",
    # Key was previously misspelled "UNBILl" (lower-case final letter), which
    # made lookups of the real type code fail.
    "UNBILL": "Unenacted bill/resolution",
    "UNPB": "Unpublished work",
    "VIDEO": "Video recording",
}
class RIS:
    """ RIS file structure.

    Parsed records are collected in ``self.records`` as a list of
    dictionaries keyed by RIS tag.  The values of the tags AU, AD, KW and N1
    are lists of strings (one entry per occurrence); every other tag maps to
    a single string.
    """

    # Tags that may appear several times within one record.
    _REPEATED_TAGS = ("AU", "AD", "KW", "N1")
    # "<TAG> - <value>": an upper-case letter followed by an upper-case
    # letter or digit, then a dash surrounded by optional spaces.
    _LINE_PATTERN = re.compile(r"^([A-Z][A-Z0-9]) *- *(.*)")

    def __init__(self, in_file=None):
        """ Initialize and parse input.

        in_file -- optional iterable of text lines (e.g. an open file).
                   When given and truthy it is parsed immediately.
        """
        self.records = []
        # Parser state lives on the instance so that process_field() can be
        # called even before parse() has run.
        self.current_tag = None  # reset by parse(); not otherwise used here
        self.current_record = None
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """ Parse input file.

        in_file -- iterable of text lines.  Blank lines are skipped.

        Completed records are appended to ``self.records``.  Raises
        ValueError (carrying the offending line) for a non-blank line that
        is not of the form "TAG - value", and propagates the ValueError
        raised by process_field().
        """
        self.current_tag = None
        self.current_record = None
        for raw_line in in_file:
            line = raw_line.strip()
            if not line:
                continue
            match = self._LINE_PATTERN.match(line)
            if match is None:
                raise ValueError(line)
            tag, field = match.groups()
            self.process_field(tag, field)

    def process_field(self, tag, field):
        """ Process RIS file field.

        tag   -- two-character RIS tag
        field -- text following the tag

        "TY" starts a new record and "ER" stores the current one in
        ``self.records``.  Raises ValueError when a non-repeatable tag
        occurs twice in a record, or when any tag other than "TY" is seen
        while no record is open.
        """
        if tag == "TY":
            self.current_record = {tag: field}
            return
        if self.current_record is None:
            # Previously this surfaced as a TypeError (or, for "ER", as a
            # None entry in self.records).
            raise ValueError("Tag %s found outside of a record" % tag)
        if tag == "ER":
            self.records.append(self.current_record)
            self.current_record = None
        elif tag in self._REPEATED_TAGS:
            self.current_record.setdefault(tag, []).append(field)
        elif tag in self.current_record:
            error_str = "Duplicate tag: %s" % tag
            raise ValueError(error_str)
        else:
            self.current_record[tag] = field
def main(path="collaborations.ris"):
    """ Test the code.

    path -- RIS file to parse; defaults to "collaborations.ris" in the
            current working directory.

    Parses the file with RIS and pretty-prints the resulting records.
    """
    import pprint

    with open(path, "rt") as ris_file:
        ris = RIS(ris_file)
    # Parsing is complete once RIS() returns, so the file can be closed
    # before printing.
    pp = pprint.PrettyPrinter()
    pp.pprint(ris.records)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment