Skip to content

Instantly share code, notes, and snippets.

@johnjosephhorton
Created October 2, 2011 03:23
Show Gist options
  • Save johnjosephhorton/1256984 to your computer and use it in GitHub Desktop.
Save johnjosephhorton/1256984 to your computer and use it in GitHub Desktop.
Get data from a poorly formatted BLS table using Python
import urllib2
import csv
# Zero-based index of the first line of the downloaded file that is kept
# (used as the start of the slice applied to the file's lines).
FIRST_LINE = 11
# Slice end for the kept lines; exclusive, so the last line kept is
# LAST_LINE - 1.
LAST_LINE = 38
def get_level(l):
    """Return the number of leading space characters in the line `l`.

    The indentation width is what the rest of the script uses to infer how
    deeply a heading is nested in the table.

    Only the space character counts as indentation; tabs and line endings
    do not. An empty string yields 0 (a plain enumerate-and-break loop would
    leave its index unbound and raise here), and a string made up solely of
    spaces yields its full length.
    """
    return len(l) - len(l.lstrip(" "))
def clean_line(l, sep=" "):
    """Split one raw table line into its non-empty, stripped fields.

    Carriage returns and newlines are removed first, then the line is split
    on `sep`. Each piece is stripped of surrounding whitespace and pieces
    that end up empty are dropped, so runs of separators never produce
    empty fields.

    Parameters:
        l:   the raw line, possibly ending in "\\r" and/or "\\n".
        sep: the separator to split on. The default, a single space, splits
             multi-word labels into individual words; pass a wider
             separator such as two spaces to keep such labels in one piece.

    Returns:
        A list of field strings; empty for a blank line.
    """
    text = l.replace("\r", "").replace("\n", "")
    # Strip before filtering: a piece holding only whitespace (for example a
    # lone tab, or a single space when splitting on two spaces) must not
    # survive as an empty field.
    stripped = [piece.strip() for piece in text.split(sep)]
    return [field for field in stripped if field]
# Download the table and keep only the slice of lines selected by
# FIRST_LINE / LAST_LINE. The handle is closed even if reading fails.
f = urllib2.urlopen("ftp://ftp.bls.gov/pub/suppl/empsit.tab1.txt")
try:
    # Blank lines carry no heading, so drop them here; otherwise taking
    # row[0] below would raise IndexError on an empty row.
    lines = [line for line in list(f)[FIRST_LINE:LAST_LINE] if line.strip()]
finally:
    f.close()

# Real lists (not lazy iterators) because each of these is traversed more
# than once further down.
levels = [get_level(line) for line in lines]      # leading-space count per row
data_rows = [clean_line(line) for line in lines]  # fields per row
headings = [row[0] for row in data_rows]          # first field labels the row

# Lookup tables keyed by heading: row position in the table, and
# indentation width. If a heading text occurs twice, the later row wins.
d_order = dict((heading, position) for position, heading in enumerate(headings))
d_level = dict(zip(headings, levels))
def one_up(heading):
    """Return the parent of `heading`, or None if it has no parent.

    The parent is the closest heading that appears earlier in `headings`
    and whose indentation level (from `d_level`) is strictly smaller than
    that of `heading`. Headings at the same or a deeper level are skipped.

    Reads the module-level `headings`, `d_order` and `d_level`; raises
    KeyError if `heading` is not a key of those tables.
    """
    heading_level = d_level[heading]
    # Scan upwards from the row just above `heading`; the first shallower
    # row encountered is the enclosing one.
    for candidate in reversed(headings[:d_order[heading]]):
        if d_level[candidate] < heading_level:
            return candidate
    # No shallower row above: `heading` is a top-level entry.
    return None
def crumb_trail(heading):
    """Return the path from the outermost ancestor down to `heading`.

    The result is a list that starts with the top-level ancestor and ends
    with `heading` itself; a heading without a parent yields `[heading]`.
    Ancestors are found by repeatedly calling `one_up`.
    """
    trail = [heading]
    # Call one_up once per ancestor (each call scans the rows above), and
    # collect child-to-root before flipping to root-to-child order.
    parent = one_up(heading)
    while parent is not None:
        trail.append(parent)
        parent = one_up(parent)
    trail.reverse()
    return trail
# Breadcrumb path (outermost ancestor ... heading) for every row, in table
# order.
crumb_trails = [crumb_trail(heading) for heading in headings]
# Length of the longest path; 0 when the table slice produced no rows.
max_depth = max([len(trail) for trail in crumb_trails]) if crumb_trails else 0

# Rank each distinct indentation width: the shallowest width maps to column
# 0, the next one to column 1, and so on. The widths are sorted explicitly
# because set iteration order is arbitrary and must not decide which column
# a level lands in.
hier = dict((level, rank) for rank, level in enumerate(sorted(set(levels))))


def get_hier(heading):
    """Return the hierarchy column index for `heading`'s indentation width."""
    return hier[d_level[heading]]


# One "level_N" column per distinct indentation width. Rows are filled by
# rank (see get_hier), so sizing by the number of ranks guarantees every
# index is valid. Levels strictly decrease along a trail, so no trail is
# longer than this, and the count equals max_depth whenever every
# indentation step is used along the deepest path.
n_levels = len(hier)
header = ["level_%s" % i for i in range(n_levels)] + [
    "depth", "level", "normal_seasonal_movement", "estimated_over_month_change",
    "sa_adjusted_over_month_change"]

# The with-block closes bls.csv even if writing a row fails.
with open("bls.csv", "w") as g:
    out = csv.writer(g)
    out.writerow(header)
    for trail, data_row in zip(crumb_trails, data_rows):
        # Place each ancestor (and the heading itself) in the column that
        # matches its indentation rank; unused columns stay None, which the
        # csv writer emits as empty cells.
        industry = [None] * n_levels
        indices = [get_hier(name) for name in trail]
        depth = max(indices)
        for name, i in zip(trail, indices):
            industry[i] = name
        # NOTE(review): the header names five trailing columns, but only
        # four values are written here (depth plus the first three fields
        # of the row, the first of which is the heading text). Confirm the
        # intended slice against the actual table layout.
        out.writerow(industry + [depth] + data_row[:3])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment