Skip to content

Instantly share code, notes, and snippets.

@jdunic
Created November 15, 2013 17:26
Show Gist options
  • Select an option

  • Save jdunic/3c0ee32628f3edf8727e to your computer and use it in GitHub Desktop.

Select an option

Save jdunic/3c0ee32628f3edf8727e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# This top line tells the computer that your script is an executable
# Let's save this file as xxxxx_cleanup.py
# Today we have a datafile in csv (comma separated values) format which is not
# clean. There are inconsistencies in text formats,
# Base python comes with a lot of capabilities, but just like in R, we have to
# import different packages or "modules" in python to allow us to do more. We
# will load these at the top of the script so that we can call on them later.
import csv # lets us to read, write, and manipulate csv files
import logging # lets us track events, particularly errors while the script is running
import collections # lets us store objects in a particular order (among other things)
from datetime import datetime # lets us manipulate dates and times (which are objects)
from itertools import izip
# Here we are setting up our logging for tracking errors.
logging.basicConfig(level=logging.INFO)
# We can create the string (set of characters) time_stamp that can later be
# appended to filenames, or written into our new csvs. This allows us to keep
# track of when log files or the 'cleaned' csvs were created. Particularly if we
# run this script multiple times on the same data.
time_stamp = datetime.now().strftime('%b%d_%H:%M')
# For your reference below are the datetime formats that I have used in this
# example. For a full list of formats see: # <http://docs.python.org/2/library/time.html>
# %b = Locale's abbreviated month name
# %d = Day of the month as a decimal number [01,31]
# %H = Hour (24-hour clock) as a decimal number [00,23]
# %M = Minute as a decimal number [00,59]
# Example of a common type of error. Storing dates and times as datetime objects
# can help you maintain data integrity, because only valid formats will be accepted.
# This is an opportunity for quality control and prevention of ambiguous date
# formats e.g., 01/05/12
def date_cleaning(date_str):
date = None
date = datetime.strptime(date_str, "%Y")
return date
# To allow our script to be flexible, we can define the variable "filename" at
# the top of the document. This will be the name of the original file we wish to
# clean. By doing this we only need to worry about changing the filename in one
# place if we would like to clean a different file.
files = ["/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover.csv",
"/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover_KB.csv"#,
#"/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/Gulf of Alaska all sites 2003 percent cover_Kodiak.csv"
]
sites_file = "/Users/jillian/Desktop/data_cleanup/konar_GulfOfAlaska/sites.csv"
# If we want to have all of the errors printed to a file, which might be
# particularly useful if we are cleaning a large dataset, we can change our
# logging configuration. Here I am setting the variable LOG_FILENAME. You will
# see that there is a complete path to where I want the file "XX_cleanup_logging_datetime.txt"
# to be saved. You will notice that there is a "%s" in the filename, and that
# the filename is followed by a "% time_stamp". This tells Python that I want
# the time_stamp string that I defined above to be appended to my filename.
LOG_FILENAME = "/Users/jillian/Desktop/data_cleanup/log_files/filename_cleanup_logging_%s." % time_stamp
sites = {}
with open(sites_file, 'rU') as csvfile:
csvreader = csv.DictReader(csvfile)
for row in csvreader:
sites[row['site_code']] = {
'longitude': row['longitude'],
'latitude': row['latitude'],
'name': row['site']
}
# As an example you can do this in your terminal:
#string_with_variable = "It's cat o'clock: %s!" % time_stamp
#print string_with_variable
# header_dict = collections.OrderedDict(izip(fieldnames, fieldnames))
# header_list = header_dict.values()
header = ['AREA', 'YEAR', 'SITE', 'DEPTH STRATA', 'SIZE', 'REPLICATE', 'kelp',
'Agarum clathratum', 'L. sacch.', 'L. bongar.', 'L. yezoensis',
'juv. Laminaria', 'Cymenthere', 'Costaria', '% fleshy algae total',
'(reds)', '(browns)', '(greens)', '%upright corallines',
'%encrusting corallines', '%encrusting invertebrates', '%rock',
'Pycnopodia', 'Crossaster', 'Orthasterias', 'Dermasterias', 'Henricia',
'Tonicella', 'Mopalia', 'Chryptochiton', 'crinoid', 'scallop',
'S. droebachiensis', 'cucumber', 'Fusitriton', 'Desmarestia',
'Nereocystis', 'Urticina', 'Halocynthia', 'Alaria marginata',
'Parastichopus', 'L. setchellii', 'SITE NAME', 'SITE LONG', 'SITE LAT']
clean_filename = "./cleaned_%s.csv" % time_stamp
with open(clean_filename, 'wb') as clean_file:
writer = csv.writer(clean_file, delimiter = ",")
writer.writerow(header)
number = 0
with open('clean_filename', 'a') as clean_file: # we use 'a' because we want to append data to this file
writer = csv.DictWriter(clean_file, fieldnames=header)
for f in files: # I want to read the data from each of the three files that we defined in 'files'
number += 1
with open(f, 'rU') as csvfile:
next(csvfile) # This is a way to skip the first line of a csv
csvreader = csv.DictReader(csvfile)
fieldnames = csvreader.fieldnames
line_number = 0 # Keeping track of line numbers can be useful when debugging and cleaning
for row in csvreader:
line_number += 1
for h in header:
if h not in row:
row[h] = 0
else:
try:
if h == 'YEAR':
row[h] = date_cleaning(row[h])
except ValueError as e:
logging.warning("filename: %s, line: %s, error: %s" %
(f, line_number, e))
if h not in ('AREA', 'SITE', 'SIZE', 'YEAR', 'REPLICATE',
'DEPTH STRATA', 'SITE NAME', 'SITE LONG',
'SITE LAT') and row[h] != (None or ''):
try:
row[h] = int(row[h])
except ValueError as e:
logging.warning("value: %r, line: %s, error: %s" %
(row[h], line_number, e))
else:
row[h] = 0
#logging.warning("filename: %s, line: %s, error: %s" %
#(f, line_number, e))
writer.writerow(row)
"""
if h == 'SITE':
site = sites[row[h]]
row['SITE NAME'] = site['name']
row['SITE LONG'] = site['longitude']
row['SITE LAT'] = site['latitude']
"""
"""
if h in row: # cleanup row:
try:
if h == 'YEAR':
row[h] = date_cleaning(row[h])
except ValueError as e:
logging.warning("filename: %s, line: %s, error: %s" %
(f, line_number, e))
if h == 'SITE':
site = sites[row[h]]
row['SITE NAME'] = site['name']
row['SITE LONG'] = site['longitude']
row['SITE LAT'] = site['latitude']
if h != ('AREA' or 'SITE' or 'SIZE' or 'YEAR' or
'SITE NAME' or 'SITE LONG' or 'SITE LAT'):
try:
int(h)
except ValueError as e:
logging.warning("filename: %s, line: %s, error: %s" %
(f, line_number, e))
else:
row[h] = '0'
writer.writerow(row)
"""
"""
# Next we tell python that we want to open the file called 'filename' whenever
# we call 'csvfile'
number = 0
for f in files:
number += 1
rows = {}
with open(f, 'rU') as csvfile:
next(csvfile) # This is a way to skip the first line of a csv
csvreader = csv.DictReader(csvfile)
fieldnames = list(set(csvreader.fieldnames) | set(fieldnames)) # Gets a list of the fieldnames (skipped first row)
with open('clean_filename', 'wb') as clean_file:
writer = csv.writer(open(clean_filename, 'wb'), dialect='excel')
# We can use what we learned earlier about including variables in strings to
# set the cleaned output file name. There is a slight difference to what we did
# before, can you see what it is?
clean_filename = "/Users/jillian/Desktop/data_cleanup/cleaned_%s.csv" % time_stamp
cleaner = "your_name"
# Here we are defining the file that we will be writing to
with open('clean_filename', 'wb') as clean_file:
writer = csv.writer(clean_file, delimiter = ",")
writer.writerow(fieldnames)
# This statement writer.writerow() tells Python to write some array into the
# file that we defined as writer. Unlike in R, periods are special characters in
# Python and are used to separate an object (here writer) from an attribute or
# method (here writerow) of that object. This difference between R and Python is
# one reason why I use snake_case (or CamelCase) for naming variables even in
# rather than separating words with '.'.
writer.writerow(["You can include a descriptive string here"])
writer.writerow(["'Cleaned by: %s, on %s' will show up in the next line." %
(cleaner, time_stamp)])
writer.writerow([\"""If you would like to write multiple lines of text but keep
it on the same row and continue to follow the 80 character
format in your code, you can use triple quotation marks \"""])
writer.writerow(['']) # I add this to leave a blank line
filename = "/Users/jillian/Desktop/data_cleanup/cleaned_Nov14_19:53.csv"
with open(filename, 'rU') as csvfile:
csvreader = csv.DictReader(csvfile)
fieldnames = sorted(csvreader.fieldnames)
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment