Skip to content

Instantly share code, notes, and snippets.

@nickpettican
Created August 24, 2016 12:04
Show Gist options
  • Save nickpettican/592985404ad475d6e39a6364132f4e1e to your computer and use it in GitHub Desktop.
Save nickpettican/592985404ad475d6e39a6364132f4e1e to your computer and use it in GitHub Desktop.
Cleans the data for a HavasLynx assignment so that it can be manipulated in Excel and Tableau.
#!/usr/bin/env python
import os
import csv
# Directory holding the input files; the output CSV is written here too.
# Raw string so the Windows backslashes can never be read as escape
# sequences (a plain "\U" is a syntax error on Python 3).
DATADIR = r"C:\Users\User\Downloads\HavasLynx"
# Input data table; it is read as whitespace-separated columns.
DATAFILE = "tin00028.tsv"
# Lookup file used to turn 2-letter country codes into country names.
EUCODES = "2letterEU.txt"
# Name of the CSV file produced for Excel / Tableau.
OUTFILE = "internetuse_EU.csv"
def parse(datafile):
    """Read a text file into a list of whitespace-split rows.

    Args:
        datafile: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of whitespace-separated tokens on that line. A blank line yields
        an empty list.

    Raises:
        IOError: If the file cannot be opened (OSError on Python 3).
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(datafile, 'r') as handle:
        # split() with no argument already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        return [line.split() for line in handle]
def outputfile(datadone, outfile):
    """Write the cleaned rows to a fully-quoted CSV file (e.g. for Excel).

    Args:
        datadone: Iterable of rows, each row an iterable of field values.
        outfile: Path of the CSV file to create or overwrite.

    Raises:
        IOError: If the file cannot be opened for writing (OSError on
            Python 3).
    """
    import sys  # local import: only needed for the interpreter check below

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; 'wb' on Python 3 raises TypeError on the
    # first row written.
    if sys.version_info[0] >= 3:
        output = open(outfile, 'w', newline='')
    else:
        output = open(outfile, 'wb')
    with output:
        out = csv.writer(output, quoting=csv.QUOTE_ALL, lineterminator='\n')
        out.writerows(datadone)
def cleancountries(eucountries):
    """Reduce each parsed lookup row to a [first token, last token] pair.

    clean() treats element 0 of each pair as the 2-letter country code and
    element 1 as the name to output.

    Args:
        eucountries: List of token lists, as returned by parse().

    Returns:
        A list of two-element lists, one per non-empty input row.
    """
    # parse() returns [] for blank lines; indexing those would raise
    # IndexError, so they are skipped.
    return [[row[0], row[-1]] for row in eucountries if row]
def cleaninput(data):
    """Keep only the I_IU3 rows, reduced to [2-letter code, last value].

    Args:
        data: List of token lists, as returned by parse().

    Returns:
        A list of two-element lists: the last two characters of the row's
        first token and the row's last token. Rows whose first token does
        not contain "I_IU3" are dropped, as are empty rows.
    """
    return [
        # The last two characters of the first token are used as the
        # country code; the last token is the value that is kept.
        [row[0][-2:], row[-1]]
        for row in data
        # Empty rows (blank lines) are skipped rather than indexed.
        if row and "I_IU3" in row[0]
    ]
def clean(data, eucountries_clean):
    """Replace the 2-letter codes with country names (for Tableau).

    Args:
        data: Parsed rows of the data file, as returned by parse().
        eucountries_clean: [code, name] pairs from cleancountries().

    Returns:
        A list of rows, each holding the country name followed by the
        value for every code that occurs in the row's 2-letter code.
        Rows whose value is not all digits, or that match no code, are
        left out.
    """
    datadone = []
    for row in cleaninput(data):
        code = row[0]
        value = row[-1]
        matched = []
        for entry in eucountries_clean:
            # Substring test on the code, then a digits-only test on the
            # value, so non-numeric values are dropped.
            if entry[0] in code and value.isdigit():
                matched.extend([entry[1], value])
        if matched:
            datadone.append(matched)
    return datadone
def main():
    """Run the pipeline: parse the inputs, clean them, write the CSV.

    Reads DATAFILE and EUCODES from DATADIR and writes OUTFILE into the
    same directory. Any failure prints a message and stops the run, so a
    later stage never runs on results an earlier stage failed to produce.
    """
    # Joining constant path components cannot fail, so no try is needed.
    datafile = os.path.join(DATADIR, DATAFILE)
    eucodes = os.path.join(DATADIR, EUCODES)
    outfile = os.path.join(DATADIR, OUTFILE)

    try:
        data = parse(datafile)
        eucountries = parse(eucodes)
        eucountries_clean = cleancountries(eucountries)
        datadone = clean(data, eucountries_clean)
    except (IOError, OSError) as err:
        # Missing or unreadable input file.
        print("\nOops, problem opening files\nCheck directory and file name.\n")
        print("Details: %s" % err)
        return
    except Exception as err:
        # Top-level boundary: report malformed data instead of a traceback,
        # while still letting KeyboardInterrupt / SystemExit through.
        print("\nOops, problem parsing or cleaning the data.\n")
        print("Details: %s" % err)
        return

    try:
        outputfile(datadone, outfile)
    except (IOError, OSError, csv.Error) as err:
        print("\nSomething went wrong outputting the file...\n")
        print("Details: %s" % err)
        return

    print("\n* Success! *\n")
# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment