Created
June 12, 2014 16:15
-
-
Save jdunic/557be1599861ade0de05 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import glob | |
| import csv | |
| import itertools | |
| from os.path import basename | |
| from collections import OrderedDict | |
| from datetime import datetime | |
| # | |
def read_ref(file_name):
    """Parse a Web of Science export file into a list of record dicts.

    Each record maps two-letter WoS field tags (e.g. 'AU', 'TI') to
    string values.  Continuation lines (first token not 2 chars long)
    are folded into the value of the most recently seen tag.  Unwanted
    tags are dropped, and the newline separators inside multi-line
    'AF'/'AU' values are flattened to ','/';' respectively.

    Parameters
    ----------
    file_name : str
        Path to a Web of Science plain-text reference export.

    Returns
    -------
    list of dict
        One dict per record terminated by an 'ER' line.  A trailing
        record with no closing 'ER' line is discarded, as in the
        original implementation.
    """
    all_records = []

    with open(file_name, 'r') as ofile:
        record = {}
        last_key = ''
        # Skip the two header lines at the start of the file.
        # NOTE: the original also called ofile.readline() after each
        # 'ER' to "skip the next line", but the file had already been
        # fully consumed by readlines(), so that call was a no-op; the
        # blank line after 'ER' is skipped by the empty-line check.
        lines = ofile.readlines()[2:]

        for line in lines:
            line = line.strip()

            if line == 'ER':
                # End of record: store it and start a fresh one.
                all_records.append(record)
                record = {}
                last_key = ''
                continue
            elif line == '':
                continue

            parts = line.split(' ')

            if len(parts[0]) == 2:
                # A two-character first token is a new field tag.
                key = parts[0]
                last_key = key
                record[key] = ' '.join(parts[1:])
            else:
                # Continuation of the previous field's value.
                if last_key in ('TI', 'SO', 'SP'):
                    # Titles/sources wrap across lines; rejoin with a space.
                    record[last_key] = '%s %s' % (record[last_key], line)
                else:
                    record[last_key] = '%s\n %s' % (record[last_key], line)

    for r in all_records:
        # Drop fields that are never used downstream.
        for unwanted in ('OI', 'IS', 'RI', 'DI', 'CT', 'CL', 'CY', 'SP'):
            r.pop(unwanted, None)
        # Flatten multi-line author fields onto one line.
        if 'AF' in r:
            r['AF'] = r['AF'].replace('\n', ',')
        if 'AU' in r:
            r['AU'] = r['AU'].replace('\n', ';')

    return all_records
| # | |
# Gather every Web of Science export in the references folder.
path = '/Users/jillian/Desktop/WebOfSci_references/*.txt'
files = glob.glob(path)

masterlist = []

# Tag each parsed record with the species name taken from the file
# name (the '.txt' extension stripped), then pool all records into
# one flat list.
for ref_file in files:
    species = str(basename(ref_file)[:-4])
    records = read_ref(ref_file)
    for record in records:
        record['species'] = species
    masterlist.extend(records)

# Timestamp used to give the output CSV a unique, readable name.
time_stamp = datetime.now().strftime('%b%d_%H%M')

file_name = 'reference_list_%s.csv' % time_stamp
# Map WoS two-letter field tags to human-readable column names.
renaming_dict = {'AU': 'authors', 'CA': 'group_authors', 'BA': 'book_authors',
                 'GP': 'book_group_authors', 'PY': 'year', 'TI': 'title',
                 'PT': 'pub_type', 'SO': 'source', 'VL': 'volume', 'IS': 'issue',
                 'AR': 'article_number', 'BP': 'beginning_page', 'EP': 'end_page',
                 'PD': 'pub_date', 'ED': 'editors', 'CT': 'conference_title',
                 'CY': 'conference_year', 'PU': 'publisher', 'PI': 'publisher_city',
                 'SN': 'ISSN', 'BN': 'ISBN', 'DI': 'DOI', 'PG': 'page_count',
                 'UT': 'unique_article_identifier'}
# Assign each record a sequential ID and rename its keys in place.
refID = 0
for d in masterlist:
    refID += 1
    d['refID'] = refID
    # Iterate over a snapshot of the items: the original used
    # d.iteritems() (removed in Python 3) and deleted keys while
    # iterating the live dict, which raises RuntimeError.
    for key, value in list(d.items()):
        if key in renaming_dict:
            d[renaming_dict[key]] = value
            del d[key]
# Column order for the output CSV: manual screening/bookkeeping
# columns first, then the bibliographic fields.  The values are all
# None and unused — only the key order matters, since DictWriter
# takes the keys as its fieldnames.
_csv_columns = [
    'refID', 'SentTo', 'DataExtracted', 'FullTextAvailable',
    'PredationInfo', 'PredPreySpecies_in_GOM', 'OtherInteractions',
    'Notes', 'FilteredOnTitle', 'Repeat', 'species', 'authors',
    'group_authors', 'book_authors', 'book_group_authors', 'year',
    'title', 'pub_type', 'source', 'volume', 'issue', 'article_number',
    'beginning_page', 'end_page', 'pub_date', 'editors',
    'conference_title', 'conference_year', 'publisher',
    'publisher_city', 'ISSN', 'ISBN', 'DOI', 'page_count',
    'unique_article_identifier',
]
ordered_fieldnames = OrderedDict.fromkeys(_csv_columns)
| # | |
# Write the pooled records out; keys not in the header are dropped.
# NOTE: the original opened in 'wb' (a Python 2 csv idiom that raises
# TypeError under Python 3); 'w' with newline='' is the Python 3 form.
with open(file_name, 'w', newline='') as new_file:
    dict_writer = csv.DictWriter(new_file, fieldnames=ordered_fieldnames,
                                 extrasaction='ignore')
    dict_writer.writeheader()
    dict_writer.writerows(masterlist)

# Second pass: flag repeated references (same title) in a cleaned
# copy of the CSV.  The original attempt referenced undefined names
# (`title`, `key`, `id`), used dict.has_key() (removed in Python 3),
# inspected only the first 10 rows, and never wrote the clean file.
# See: http://stackoverflow.com/questions/1733166/marking-duplicates-in-a-csv-file
seen_titles = {}
with open(file_name, 'r', newline='') as fin:
    reader = csv.DictReader(fin)
    with open('reference_list_%s_clean.csv' % time_stamp, 'w', newline='') as fout:
        writer = csv.DictWriter(fout, fieldnames=ordered_fieldnames)
        writer.writeheader()
        for row in reader:
            title = row.get('title', '')
            if title in seen_titles:
                # Mark as a repeat of the first record with this title.
                row['Repeat'] = 'YES'
            else:
                seen_titles[title] = row.get('refID')
            writer.writerow(row)
# SO suggested solution I was attempting
# See: http://stackoverflow.com/questions/1733166/marking-duplicates-in-a-csv-file
# NOTE(review): Stack Overflow template the loop above was modelled
# on; the hard-coded Windows paths mean it only runs where
# c:\temp\input.csv exists.  Modernised in place: dict.has_key() was
# removed in Python 3, and the original shadowed the builtins `map`
# and `id`.
phone_to_id = {}
with open(r'c:\temp\input.csv', 'r') as fin:
    reader = csv.reader(fin)
    with open(r'c:\temp\output.csv', 'w') as fout:
        writer = csv.writer(fout)
        # omit this if the file has no header row
        writer.writerow(next(reader))
        for row in reader:
            (row_id, name, phone, ref, discard) = row
            if phone in phone_to_id:
                # Duplicate phone: point at the first row's id.
                ref = phone_to_id[phone]
                discard = "YES"
            else:
                phone_to_id[phone] = row_id
            writer.writerow((row_id, name, phone, ref, discard))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment