Skip to content

Instantly share code, notes, and snippets.

@jdunic
Created June 12, 2014 16:15
Show Gist options
  • Select an option

  • Save jdunic/557be1599861ade0de05 to your computer and use it in GitHub Desktop.

Select an option

Save jdunic/557be1599861ade0de05 to your computer and use it in GitHub Desktop.
import csv
import glob
import itertools
import os
import sys
from collections import OrderedDict
from datetime import datetime
from os.path import basename
#
def read_ref(file_name):
    """Parse a tagged reference export into a list of record dicts.

    The input looks like a Web of Science plain-text export: two header
    lines, then records made of lines that start with a two-character
    field tag (e.g. ``AU``, ``TI``) followed by the value.  Long values
    wrap onto indented continuation lines, and each record ends with a
    line containing only ``ER``.

    Args:
        file_name: Path of the export file to read.

    Returns:
        A list with one dict per ``ER``-terminated record, mapping field
        tag to value.  Continuation lines of ``TI``, ``SO`` and ``SP`` are
        joined with a single space; for every other tag they are joined
        with ``"\\n "``.  The tags ``OI``, ``IS``, ``RI``, ``DI``, ``CT``,
        ``CL``, ``CY`` and ``SP`` are removed, and the line breaks inside
        ``AF`` / ``AU`` are replaced by ``","`` / ``";"`` respectively.
        Fields that follow the last ``ER`` line are not returned.
    """
    # Tags whose wrapped lines are glued back together with a plain space.
    space_joined = ('TI', 'SO', 'SP')
    # Tags discarded from every record before it is returned.
    dropped = ('OI', 'IS', 'RI', 'DI', 'CT', 'CL', 'CY', 'SP')

    with open(file_name, 'r') as ofile:
        # The first two lines are file-level header lines, not record data.
        # Slicing (instead of pop(0) twice) also tolerates shorter files.
        lines = ofile.readlines()[2:]

    all_records = []
    record = {}
    last_key = ''

    for raw in lines:
        line = raw.strip()

        if line == 'ER':
            # End of record: store it and start a fresh one.
            all_records.append(record)
            record = {}
            last_key = ''
            continue
        if not line:
            continue

        key, _, rest = line.partition(' ')

        # A field tag is two characters long AND starts in column 0.
        # Checking the un-stripped line keeps an indented continuation that
        # happens to begin with a two-letter word ("in", "of", ...) from
        # being mistaken for a new tag.
        if len(key) == 2 and not raw[0].isspace():
            record[key] = rest
            last_key = key
        elif last_key in record:
            separator = ' ' if last_key in space_joined else '\n '
            record[last_key] = record[last_key] + separator + line
        # Otherwise this is a continuation with no preceding tag in the
        # current record; there is no field to attach it to, so skip it.

    for rec in all_records:
        for tag in dropped:
            rec.pop(tag, None)
        # Multi-line author lists become single-line, delimiter-separated.
        if 'AF' in rec:
            rec['AF'] = rec['AF'].replace('\n', ',')
        if 'AU' in rec:
            rec['AU'] = rec['AU'].replace('\n', ';')

    return all_records
#
# Merge every exported reference file into one CSV.  Each input file's base
# name (minus the ".txt" suffix) is recorded as that record's species.
path = '/Users/jillian/Desktop/WebOfSci_references/*.txt'
# glob order is arbitrary; sorting keeps the refID numbering reproducible.
files = sorted(glob.glob(path))

# Flat list of record dicts from all input files.
masterlist = []
for f in files:
    species = str(basename(f)[:-4])
    dict_list = read_ref(f)
    for d in dict_list:
        d['species'] = species
    masterlist.extend(dict_list)

time_stamp = datetime.now().strftime('%b%d_%H%M')

file_name = 'reference_list_%s.csv' % time_stamp

# Change dictionary key names so that they are readable.
# NOTE(review): read_ref() removes the IS, DI, CT and CY tags, so the
# 'issue', 'DOI', 'conference_title' and 'conference_year' columns are
# always blank in the output -- confirm whether that is intended.
renaming_dict = {
    'AU': 'authors', 'CA': 'group_authors', 'BA': 'book_authors',
    'GP': 'book_group_authors', 'PY': 'year', 'TI': 'title',
    'PT': 'pub_type', 'SO': 'source', 'VL': 'volume', 'IS': 'issue',
    'AR': 'article_number', 'BP': 'beginning_page', 'EP': 'end_page',
    'PD': 'pub_date', 'ED': 'editors', 'CT': 'conference_title',
    'CY': 'conference_year', 'PU': 'publisher', 'PI': 'publisher_city',
    'SN': 'ISSN', 'BN': 'ISBN', 'DI': 'DOI', 'PG': 'page_count',
    'UT': 'unique_article_identifier',
}

# Number the records from 1 and rename their tags in place.
refID = 0
for d in masterlist:
    refID += 1
    d['refID'] = refID
    # Iterate over a snapshot of the keys: adding and deleting entries while
    # iterating the live dict can skip keys (and therefore lose columns).
    for key in list(d):
        if key in renaming_dict:
            d[renaming_dict[key]] = d.pop(key)

# Column order of the output CSV.  The first columns after refID are not
# produced by this script and are written empty; record keys that are not
# listed here are left out of the CSV (extrasaction='ignore' below).
ordered_fieldnames = OrderedDict.fromkeys([
    'refID', 'SentTo', 'DataExtracted', 'FullTextAvailable',
    'PredationInfo', 'PredPreySpecies_in_GOM', 'OtherInteractions',
    'Notes', 'FilteredOnTitle', 'Repeat', 'species', 'authors',
    'group_authors', 'book_authors', 'book_group_authors', 'year',
    'title', 'pub_type', 'source', 'volume', 'issue', 'article_number',
    'beginning_page', 'end_page', 'pub_date', 'editors',
    'conference_title', 'conference_year', 'publisher', 'publisher_city',
    'ISSN', 'ISBN', 'DOI', 'page_count', 'unique_article_identifier',
])

# The csv module wants a binary file on Python 2 but a text file opened
# with newline='' on Python 3.
if sys.version_info[0] >= 3:
    new_file = open(file_name, 'w', newline='')
else:
    new_file = open(file_name, 'wb')

with new_file:
    dict_writer = csv.DictWriter(new_file, fieldnames=ordered_fieldnames,
                                 extrasaction='ignore')
    dict_writer.writeheader()
    dict_writer.writerows(masterlist)
# Copy the merged CSV to "<name>_clean.csv", flagging records whose title
# already appeared in an earlier row.
# NOTE(review): the draft this replaces never wrote any rows and failed on an
# undefined `title`.  Storing the refID of the first occurrence in the
# 'Repeat' column is an assumed mapping of the draft's `ref`/`discard`
# variables onto this file's columns -- confirm before relying on it.
first_ref_by_title = {}
with open(file_name, 'r') as fin:
    reader = csv.DictReader(fin)
    with open('reference_list_%s_clean.csv' % time_stamp, 'w') as fout:
        writer = csv.DictWriter(fout, fieldnames=ordered_fieldnames)
        writer.writeheader()
        for row in reader:
            title = row['title']
            # Rows without a title are never treated as duplicates of each
            # other; they are copied through unchanged.
            if title:
                if title in first_ref_by_title:
                    row['Repeat'] = first_ref_by_title[title]
                else:
                    first_ref_by_title[title] = row['refID']
            writer.writerow(row)
# SO suggested solution I was attempting
# See: http://stackoverflow.com/questions/1733166/marking-duplicates-in-a-csv-file
# Reference snippet: copies input.csv to output.csv and, for every row whose
# phone value was seen in an earlier row, sets `ref` to the id of that first
# row and `discard` to "YES".  Each data row must have exactly five columns.
# NOTE(review): the paths are hard-coded, so this runs (and fails on a missing
# file) every time the script is executed -- consider removing it once the
# title-based version above is settled.
first_id_by_phone = {}
with open(r'c:\temp\input.csv', 'r') as fin:
    reader = csv.reader(fin)
    with open(r'c:\temp\output.csv', 'w') as fout:
        writer = csv.writer(fout)
        # omit this if the file has no header row
        header = next(reader, None)
        if header is not None:
            writer.writerow(header)
        for row in reader:
            (row_id, name, phone, ref, discard) = row
            if phone in first_id_by_phone:
                ref = first_id_by_phone[phone]
                discard = "YES"
            else:
                first_id_by_phone[phone] = row_id
            writer.writerow((row_id, name, phone, ref, discard))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment