Created
June 12, 2014 16:15
-
-
Save jdunic/557be1599861ade0de05 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import glob | |
| import csv | |
| import itertools | |
| from os.path import basename | |
| from collections import OrderedDict | |
| from datetime import datetime | |
| # | |
def read_ref(file_name):
    """Parse a Web of Science export file into a list of record dicts.

    Each record maps two-letter WoS field tags (e.g. 'AU', 'TI') to
    string values.  Continuation lines (first token not 2 chars long)
    are folded into the value of the most recently seen tag.  Unwanted
    tags are dropped, and the newline separators inside multi-line
    'AF'/'AU' values are flattened to ','/';' respectively.

    Parameters
    ----------
    file_name : str
        Path to a Web of Science plain-text reference export.

    Returns
    -------
    list of dict
        One dict per record terminated by an 'ER' line.  A trailing
        record with no closing 'ER' line is discarded, as in the
        original implementation.
    """
    all_records = []

    with open(file_name, 'r') as ofile:
        record = {}
        last_key = ''
        # Skip the two header lines at the start of the file.
        # NOTE: the original also called ofile.readline() after each
        # 'ER' to "skip the next line", but the file had already been
        # fully consumed by readlines(), so that call was a no-op; the
        # blank line after 'ER' is skipped by the empty-line check.
        lines = ofile.readlines()[2:]

        for line in lines:
            line = line.strip()

            if line == 'ER':
                # End of record: store it and start a fresh one.
                all_records.append(record)
                record = {}
                last_key = ''
                continue
            elif line == '':
                continue

            parts = line.split(' ')

            if len(parts[0]) == 2:
                # A two-character first token is a new field tag.
                key = parts[0]
                last_key = key
                record[key] = ' '.join(parts[1:])
            else:
                # Continuation of the previous field's value.
                if last_key in ('TI', 'SO', 'SP'):
                    # Titles/sources wrap across lines; rejoin with a space.
                    record[last_key] = '%s %s' % (record[last_key], line)
                else:
                    record[last_key] = '%s\n %s' % (record[last_key], line)

    for r in all_records:
        # Drop fields that are never used downstream.
        for unwanted in ('OI', 'IS', 'RI', 'DI', 'CT', 'CL', 'CY', 'SP'):
            r.pop(unwanted, None)
        # Flatten multi-line author fields onto one line.
        if 'AF' in r:
            r['AF'] = r['AF'].replace('\n', ',')
        if 'AU' in r:
            r['AU'] = r['AU'].replace('\n', ';')

    return all_records
| # | |
# Gather every Web of Science export in the references folder.
path = '/Users/jillian/Desktop/WebOfSci_references/*.txt'
files = glob.glob(path)

masterlist = []

# Tag each parsed record with the species name taken from the file
# name (the '.txt' extension stripped), then pool all records into
# one flat list.
for ref_file in files:
    species = str(basename(ref_file)[:-4])
    records = read_ref(ref_file)
    for record in records:
        record['species'] = species
    masterlist.extend(records)

# Timestamp used to give the output CSV a unique, readable name.
time_stamp = datetime.now().strftime('%b%d_%H%M')

file_name = 'reference_list_%s.csv' % time_stamp
# Map WoS two-letter field tags to human-readable column names.
renaming_dict = {'AU': 'authors', 'CA': 'group_authors', 'BA': 'book_authors',
                 'GP': 'book_group_authors', 'PY': 'year', 'TI': 'title',
                 'PT': 'pub_type', 'SO': 'source', 'VL': 'volume', 'IS': 'issue',
                 'AR': 'article_number', 'BP': 'beginning_page', 'EP': 'end_page',
                 'PD': 'pub_date', 'ED': 'editors', 'CT': 'conference_title',
                 'CY': 'conference_year', 'PU': 'publisher', 'PI': 'publisher_city',
                 'SN': 'ISSN', 'BN': 'ISBN', 'DI': 'DOI', 'PG': 'page_count',
                 'UT': 'unique_article_identifier'}
# Assign each record a sequential ID and rename its keys in place.
refID = 0
for d in masterlist:
    refID += 1
    d['refID'] = refID
    # Iterate over a snapshot of the items: the original used
    # d.iteritems() (removed in Python 3) and deleted keys while
    # iterating the live dict, which raises RuntimeError.
    for key, value in list(d.items()):
        if key in renaming_dict:
            d[renaming_dict[key]] = value
            del d[key]
# Column order for the output CSV: manual screening/bookkeeping
# columns first, then the bibliographic fields.  The values are all
# None and unused — only the key order matters, since DictWriter
# takes the keys as its fieldnames.
_csv_columns = [
    'refID', 'SentTo', 'DataExtracted', 'FullTextAvailable',
    'PredationInfo', 'PredPreySpecies_in_GOM', 'OtherInteractions',
    'Notes', 'FilteredOnTitle', 'Repeat', 'species', 'authors',
    'group_authors', 'book_authors', 'book_group_authors', 'year',
    'title', 'pub_type', 'source', 'volume', 'issue', 'article_number',
    'beginning_page', 'end_page', 'pub_date', 'editors',
    'conference_title', 'conference_year', 'publisher',
    'publisher_city', 'ISSN', 'ISBN', 'DOI', 'page_count',
    'unique_article_identifier',
]
ordered_fieldnames = OrderedDict.fromkeys(_csv_columns)
| # | |
# Write the pooled records out; keys not in the header are dropped.
# NOTE: the original opened in 'wb' (a Python 2 csv idiom that raises
# TypeError under Python 3); 'w' with newline='' is the Python 3 form.
with open(file_name, 'w', newline='') as new_file:
    dict_writer = csv.DictWriter(new_file, fieldnames=ordered_fieldnames,
                                 extrasaction='ignore')
    dict_writer.writeheader()
    dict_writer.writerows(masterlist)

# Second pass: flag repeated references (same title) in a cleaned
# copy of the CSV.  The original attempt referenced undefined names
# (`title`, `key`, `id`), used dict.has_key() (removed in Python 3),
# inspected only the first 10 rows, and never wrote the clean file.
# See: http://stackoverflow.com/questions/1733166/marking-duplicates-in-a-csv-file
seen_titles = {}
with open(file_name, 'r', newline='') as fin:
    reader = csv.DictReader(fin)
    with open('reference_list_%s_clean.csv' % time_stamp, 'w', newline='') as fout:
        writer = csv.DictWriter(fout, fieldnames=ordered_fieldnames)
        writer.writeheader()
        for row in reader:
            title = row.get('title', '')
            if title in seen_titles:
                # Mark as a repeat of the first record with this title.
                row['Repeat'] = 'YES'
            else:
                seen_titles[title] = row.get('refID')
            writer.writerow(row)
# SO suggested solution I was attempting
# See: http://stackoverflow.com/questions/1733166/marking-duplicates-in-a-csv-file
# NOTE(review): Stack Overflow template the loop above was modelled
# on; the hard-coded Windows paths mean it only runs where
# c:\temp\input.csv exists.  Modernised in place: dict.has_key() was
# removed in Python 3, and the original shadowed the builtins `map`
# and `id`.
phone_to_id = {}
with open(r'c:\temp\input.csv', 'r') as fin:
    reader = csv.reader(fin)
    with open(r'c:\temp\output.csv', 'w') as fout:
        writer = csv.writer(fout)
        # omit this if the file has no header row
        writer.writerow(next(reader))
        for row in reader:
            (row_id, name, phone, ref, discard) = row
            if phone in phone_to_id:
                # Duplicate phone: point at the first row's id.
                ref = phone_to_id[phone]
                discard = "YES"
            else:
                phone_to_id[phone] = row_id
            writer.writerow((row_id, name, phone, ref, discard))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment