jdunic · November 15, 2013 17:26
diff --git a/csv_cleanup_tutorial.py b/csv_cleanup_tutorial.py
 #!/usr/bin/env python
 # This top line tells the computer that your script is an executable
 
 # Let's save this file as xxxxx_cleanup.py
 
 # Today we have a datafile in csv (comma separated values) format which is not
 # clean. There are inconsistencies in text formats, 
 
 # Base python comes with a lot of capabilities, but just like in R, we have to
 # import different packages or "modules" in python to allow us to do more. We 
 # will load these at the top of the script so that we can call on them later.
 
 import csv  # lets us to read, write, and manipulate csv files
 import logging  # lets us track events, particularly errors while the script is running 
 import collections  # lets us store objects in a particular order (among other things)
 from datetime import datetime  # lets us manipulate dates and times (which are objects)
 from itertools import izip
 
 # Here we are setting up our logging for tracking errors.
 logging.basicConfig(level=logging.INFO)
 
 
 # We can create the string (set of characters) time_stamp that can later be 
 # appended to filenames, or written into our new csvs. This allows us to keep 
 # track of when log files or the 'cleaned' csvs were created. Particularly if we
 # run this script multiple times on the same data.
 time_stamp = datetime.now().strftime('%b%d_%H:%M')
 # For your reference below are the datetime formats that I have used in this 
 # example. For a full list of formats see: # <http://docs.python.org/2/library/time.html>
 # %b = Locale's abbreviated month name
 # %d = Day of the month as a decimal number [01,31]
 # %H = Hour (24-hour clock) as a decimal number [00,23]
 # %M = Minute as a decimal number [00,59]
 
 # Example of a common type of error. Storing dates and times as datetime objects
 # can help you maintain data integrity, because only valid formats will be accepted.
 # This is an opportunity for quality control and prevention of ambiguous date 
 # formats e.g., 01/05/12 
 def date_cleaning(date_str):
 	date = None
 	date = datetime.strptime(date_str, "%Y")
 	return date
 
 # To allow our script to be flexible, we can define the variable "filename" at 
 # the top of the document. This will be the name of the original file we wish to 
 # clean. By doing this we only need to worry about changing the filename in one 
 # place if we would like to clean a different file.
 
 files = ["/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover.csv",
 		 "/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover_KB.csv"#,
 		 #"/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover_Kodiak.csv"
 		 ]
 
 sites_file = "/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/sites.csv"
 
 # If we want to have all of the errors printed to a file, which might be 
 # particularly useful if we are cleaning a large dataset, we can change our 
 # logging configuration. Here I am setting the variable LOG_FILENAME. You will 
 # see that there is a complete path to where I want the file "XX_cleanup_logging_datetime.txt"
 # to be saved. You will notice that there is a "%s" in the filename, and that
 # the filename is followed by a "% time_stamp". This tells Python that I want 
 # the time_stamp string that I defined above to be appended to my filename.
 LOG_FILENAME = "/Users/jillian/Desktop/data_cleanup/log_files/filename_cleanup_logging_%s." % time_stamp
 
 
 sites = {}
 with open(sites_file, 'rU') as csvfile:
 	csvreader = csv.DictReader(csvfile)
 	for row in csvreader:
 		sites[row['site_code']] = {
 			'longitude': row['longitude'],
 			'latitude': row['latitude'],
 			'name': row['site']
 		}
 
 # As an example you can do this in your terminal:
 #string_with_variable = "It's cat o'clock: %s!" % time_stamp
 #print string_with_variable	
 # header_dict = collections.OrderedDict(izip(fieldnames, fieldnames))
 # header_list = header_dict.values()
 header = ['AREA', 'YEAR', 'SITE', 'DEPTH STRATA', 'SIZE', 'REPLICATE', 'kelp', 
 		  'Agarum clathratum', 'L. sacch.', 'L. bongar.', 'L. yezoensis', 
 		  'juv. Laminaria', 'Cymenthere', 'Costaria', '% fleshy algae total', 
 		  '(reds)', '(browns)', '(greens)', '%upright  corallines', 
 		  '%encrusting corallines', '%encrusting invertebrates', '%rock', 
 		  'Pycnopodia', 'Crossaster', 'Orthasterias', 'Dermasterias', 'Henricia', 
 		  'Tonicella', 'Mopalia', 'Chryptochiton', 'crinoid', 'scallop', 
 		  'S. droebachiensis', 'cucumber', 'Fusitriton', 'Desmarestia',
 		  'Nereocystis', 'Urticina', 'Halocynthia', 'Alaria marginata', 
 		  'Parastichopus', 'L. setchellii', 'SITE NAME', 'SITE LONG', 'SITE LAT']
 
 clean_filename = "./cleaned_%s.csv" % time_stamp
 with open(clean_filename, 'wb') as clean_file:
 	writer = csv.writer(clean_file, delimiter = ",")
 	writer.writerow(header)
 number = 0
 with open('clean_filename', 'a') as clean_file:  # we use 'a' because we want to append data to this file
 	writer = csv.DictWriter(clean_file, fieldnames=header)
 	for f in files:  # I want to read the data from each of the three files that we defined in 'files'
 		number += 1
 		with open(f, 'rU') as csvfile:
 			next(csvfile) # This is a way to skip the first line of a csv
 			csvreader = csv.DictReader(csvfile)
 			fieldnames = csvreader.fieldnames
 			line_number = 0  # Keeping track of line numbers can be useful when debugging and cleaning
 			for row in csvreader:
 				line_number += 1
 				for h in header:
 					if h not in row:
 						row[h] = 0
 					else:
 						try:
 							if h == 'YEAR':
 								row[h] = date_cleaning(row[h])
 						except ValueError as e:
 							logging.warning("filename: %s, line: %s, error: %s" %
 								(f, line_number, e))

 						if h not in ('AREA', 'SITE', 'SIZE', 'YEAR', 'REPLICATE', 
 								     'DEPTH STRATA', 'SITE NAME', 'SITE LONG', 
 								     'SITE LAT') and row[h] != (None or ''):
 							try:
 								row[h] = int(row[h])
 							except ValueError as e:
 								logging.warning("value: %r, line: %s, error: %s" % 
 									(row[h], line_number, e))
 						else:
 							row[h] = 0
 								#logging.warning("filename: %s, line: %s, error: %s" %
 									#(f, line_number, e))
 				writer.writerow(row)

 """
 						if h == 'SITE':
 							site = sites[row[h]]
 							row['SITE NAME'] = site['name']
 							row['SITE LONG'] = site['longitude']
 							row['SITE LAT'] = site['latitude']
 """

 """
 					if h in row: # cleanup row:
 						try: 
 							if h == 'YEAR':
 								row[h] = date_cleaning(row[h])
 						except ValueError as e:
 							logging.warning("filename: %s, line: %s, error: %s" %
 								(f, line_number, e))
 						if h == 'SITE':
 							site = sites[row[h]]
 							row['SITE NAME'] = site['name']
 							row['SITE LONG'] = site['longitude']
 							row['SITE LAT'] = site['latitude']
 						if h != ('AREA' or 'SITE' or 'SIZE' or 'YEAR' or 
 							 'SITE NAME' or 'SITE LONG' or 'SITE LAT'):
 							try:
 								int(h)
 							except ValueError as e:
 								logging.warning("filename: %s, line: %s, error: %s" %
 									(f, line_number, e))
 					else:
 						row[h] = '0'

 				writer.writerow(row)
 """
 
 """
 # Next we tell python that we want to open the file called 'filename' whenever 
 # we call 'csvfile'
 number = 0
 for f in files:
 	number += 1
 	rows = {}
 	with open(f, 'rU') as csvfile: 
 		next(csvfile)  # This is a way to skip the first line of a csv
 		csvreader = csv.DictReader(csvfile)
 		fieldnames = list(set(csvreader.fieldnames) | set(fieldnames))  # Gets a list of the fieldnames (skipped first row)
 
 with open('clean_filename', 'wb') as clean_file:
 	writer = csv.writer(open(clean_filename, 'wb'), dialect='excel')
 
 # We can use what we learned earlier about including variables in strings to
 # set the cleaned output file name. There is a slight difference to what we did
 # before, can you see what it is?
 clean_filename = "/Users/jillian/Desktop/data_cleanup/cleaned_%s.csv" % time_stamp
 cleaner = "your_name"
 
 # Here we are defining the file that we will be writing to
 with open('clean_filename', 'wb') as clean_file:
 	writer = csv.writer(clean_file, delimiter = ",")
 	writer.writerow(fieldnames)
 
 
 # This statement writer.writerow() tells Python to write some array into the 
 # file that we defined as writer. Unlike in R, periods are special characters in
 # Python and are used to separate an object (here writer) from an attribute or 
 # method (here writerow) of that object. This difference between R and Python is 
 # one reason why I use snake_case (or CamelCase) for naming variables even in 
 # rather than separating words with '.'. 
 writer.writerow(["You can include a descriptive string here"])
 writer.writerow(["'Cleaned by: %s, on %s' will show up in the next line." %
 				 (cleaner, time_stamp)])
 writer.writerow([\"""If you would like to write multiple lines of text but keep
 				    it on the same row and continue to follow the 80 character
 				    format in your code, you can use triple quotation marks \"""])
 writer.writerow([''])  # I add this to leave a blank line

 filename = "/Users/jillian/Desktop/data_cleanup/cleaned_Nov14_19:53.csv"
 with open(filename, 'rU') as csvfile:
    csvreader = csv.DictReader(csvfile)
    fieldnames = sorted(csvreader.fieldnames)

 """
	#!/usr/bin/env python
	# This top line tells the computer that your script is an executable

	# Let's save this file as xxxxx_cleanup.py

	# Today we have a datafile in csv (comma separated values) format which is not
	# clean. There are inconsistencies in text formats,

	# Base python comes with a lot of capabilities, but just like in R, we have to
	# import different packages or "modules" in python to allow us to do more. We
	# will load these at the top of the script so that we can call on them later.

	import csv # lets us to read, write, and manipulate csv files
	import logging # lets us track events, particularly errors while the script is running
	import collections # lets us store objects in a particular order (among other things)
	from datetime import datetime # lets us manipulate dates and times (which are objects)
	from itertools import izip

	# Here we are setting up our logging for tracking errors.
	logging.basicConfig(level=logging.INFO)


	# We can create the string (set of characters) time_stamp that can later be
	# appended to filenames, or written into our new csvs. This allows us to keep
	# track of when log files or the 'cleaned' csvs were created. Particularly if we
	# run this script multiple times on the same data.
	time_stamp = datetime.now().strftime('%b%d_%H:%M')
	# For your reference below are the datetime formats that I have used in this
	# example. For a full list of formats see: # <http://docs.python.org/2/library/time.html>
	# %b = Locale's abbreviated month name
	# %d = Day of the month as a decimal number [01,31]
	# %H = Hour (24-hour clock) as a decimal number [00,23]
	# %M = Minute as a decimal number [00,59]

	# Example of a common type of error. Storing dates and times as datetime objects
	# can help you maintain data integrity, because only valid formats will be accepted.
	# This is an opportunity for quality control and prevention of ambiguous date
	# formats e.g., 01/05/12
	def date_cleaning(date_str):
	date = None
	date = datetime.strptime(date_str, "%Y")
	return date

	# To allow our script to be flexible, we can define the variable "filename" at
	# the top of the document. This will be the name of the original file we wish to
	# clean. By doing this we only need to worry about changing the filename in one
	# place if we would like to clean a different file.

	files = ["/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover.csv",
	"/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover_KB.csv"#,
	#"/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover_Kodiak.csv"
	]

	sites_file = "/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/sites.csv"

	# If we want to have all of the errors printed to a file, which might be
	# particularly useful if we are cleaning a large dataset, we can change our
	# logging configuration. Here I am setting the variable LOG_FILENAME. You will
	# see that there is a complete path to where I want the file "XX_cleanup_logging_datetime.txt"
	# to be saved. You will notice that there is a "%s" in the filename, and that
	# the filename is followed by a "% time_stamp". This tells Python that I want
	# the time_stamp string that I defined above to be appended to my filename.
	LOG_FILENAME = "/Users/jillian/Desktop/data_cleanup/log_files/filename_cleanup_logging_%s." % time_stamp


	sites = {}
	with open(sites_file, 'rU') as csvfile:
	csvreader = csv.DictReader(csvfile)
	for row in csvreader:
	sites[row['site_code']] = {
	'longitude': row['longitude'],
	'latitude': row['latitude'],
	'name': row['site']
	}

	# As an example you can do this in your terminal:
	#string_with_variable = "It's cat o'clock: %s!" % time_stamp
	#print string_with_variable
	# header_dict = collections.OrderedDict(izip(fieldnames, fieldnames))
	# header_list = header_dict.values()
	header = ['AREA', 'YEAR', 'SITE', 'DEPTH STRATA', 'SIZE', 'REPLICATE', 'kelp',
	'Agarum clathratum', 'L. sacch.', 'L. bongar.', 'L. yezoensis',
	'juv. Laminaria', 'Cymenthere', 'Costaria', '% fleshy algae total',
	'(reds)', '(browns)', '(greens)', '%upright corallines',
	'%encrusting corallines', '%encrusting invertebrates', '%rock',
	'Pycnopodia', 'Crossaster', 'Orthasterias', 'Dermasterias', 'Henricia',
	'Tonicella', 'Mopalia', 'Chryptochiton', 'crinoid', 'scallop',
	'S. droebachiensis', 'cucumber', 'Fusitriton', 'Desmarestia',
	'Nereocystis', 'Urticina', 'Halocynthia', 'Alaria marginata',
	'Parastichopus', 'L. setchellii', 'SITE NAME', 'SITE LONG', 'SITE LAT']

	clean_filename = "./cleaned_%s.csv" % time_stamp
	with open(clean_filename, 'wb') as clean_file:
	writer = csv.writer(clean_file, delimiter = ",")
	writer.writerow(header)
	number = 0
	with open('clean_filename', 'a') as clean_file: # we use 'a' because we want to append data to this file
	writer = csv.DictWriter(clean_file, fieldnames=header)
	for f in files: # I want to read the data from each of the three files that we defined in 'files'
	number += 1
	with open(f, 'rU') as csvfile:
	next(csvfile) # This is a way to skip the first line of a csv
	csvreader = csv.DictReader(csvfile)
	fieldnames = csvreader.fieldnames
	line_number = 0 # Keeping track of line numbers can be useful when debugging and cleaning
	for row in csvreader:
	line_number += 1
	for h in header:
	if h not in row:
	row[h] = 0
	else:
	try:
	if h == 'YEAR':
	row[h] = date_cleaning(row[h])
	except ValueError as e:
	logging.warning("filename: %s, line: %s, error: %s" %
	(f, line_number, e))

	if h not in ('AREA', 'SITE', 'SIZE', 'YEAR', 'REPLICATE',
	'DEPTH STRATA', 'SITE NAME', 'SITE LONG',
	'SITE LAT') and row[h] != (None or ''):
	try:
	row[h] = int(row[h])
	except ValueError as e:
	logging.warning("value: %r, line: %s, error: %s" %
	(row[h], line_number, e))
	else:
	row[h] = 0
	#logging.warning("filename: %s, line: %s, error: %s" %
	#(f, line_number, e))
	writer.writerow(row)

	"""
	if h == 'SITE':
	site = sites[row[h]]
	row['SITE NAME'] = site['name']
	row['SITE LONG'] = site['longitude']
	row['SITE LAT'] = site['latitude']
	"""

	"""
	if h in row: # cleanup row:
	try:
	if h == 'YEAR':
	row[h] = date_cleaning(row[h])
	except ValueError as e:
	logging.warning("filename: %s, line: %s, error: %s" %
	(f, line_number, e))
	if h == 'SITE':
	site = sites[row[h]]
	row['SITE NAME'] = site['name']
	row['SITE LONG'] = site['longitude']
	row['SITE LAT'] = site['latitude']
	if h != ('AREA' or 'SITE' or 'SIZE' or 'YEAR' or
	'SITE NAME' or 'SITE LONG' or 'SITE LAT'):
	try:
	int(h)
	except ValueError as e:
	logging.warning("filename: %s, line: %s, error: %s" %
	(f, line_number, e))
	else:
	row[h] = '0'

	writer.writerow(row)
	"""

	"""
	# Next we tell python that we want to open the file called 'filename' whenever
	# we call 'csvfile'
	number = 0
	for f in files:
	number += 1
	rows = {}
	with open(f, 'rU') as csvfile:
	next(csvfile) # This is a way to skip the first line of a csv
	csvreader = csv.DictReader(csvfile)
	fieldnames = list(set(csvreader.fieldnames) \| set(fieldnames)) # Gets a list of the fieldnames (skipped first row)

	with open('clean_filename', 'wb') as clean_file:
	writer = csv.writer(open(clean_filename, 'wb'), dialect='excel')

	# We can use what we learned earlier about including variables in strings to
	# set the cleaned output file name. There is a slight difference to what we did
	# before, can you see what it is?
	clean_filename = "/Users/jillian/Desktop/data_cleanup/cleaned_%s.csv" % time_stamp
	cleaner = "your_name"

	# Here we are defining the file that we will be writing to
	with open('clean_filename', 'wb') as clean_file:
	writer = csv.writer(clean_file, delimiter = ",")
	writer.writerow(fieldnames)


	# This statement writer.writerow() tells Python to write some array into the
	# file that we defined as writer. Unlike in R, periods are special characters in
	# Python and are used to separate an object (here writer) from an attribute or
	# method (here writerow) of that object. This difference between R and Python is
	# one reason why I use snake_case (or CamelCase) for naming variables even in
	# rather than separating words with '.'.
	writer.writerow(["You can include a descriptive string here"])
	writer.writerow(["'Cleaned by: %s, on %s' will show up in the next line." %
	(cleaner, time_stamp)])
	writer.writerow([\"""If you would like to write multiple lines of text but keep
	it on the same row and continue to follow the 80 character
	format in your code, you can use triple quotation marks \"""])
	writer.writerow(['']) # I add this to leave a blank line

	filename = "/Users/jillian/Desktop/data_cleanup/cleaned_Nov14_19:53.csv"
	with open(filename, 'rU') as csvfile:
	csvreader = csv.DictReader(csvfile)
	fieldnames = sorted(csvreader.fieldnames)

	"""
No results found