H-1B wages analysis
""" | |
Analyze USA h1b salaries | |
data from http://www.foreignlaborcert.doleta.gov/quarterlydata.cfm | |
specifically: | |
http://www.foreignlaborcert.doleta.gov/pdf/quarter_2_2012/PW_FY2012_Q2.csv | |
""" | |
import re
import csv
import random
from collections import namedtuple
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
TOPN = 20
def xkcd_colors(cache=[]):
    """
    Colors from the xkcd color survey, filtered down to the darker ones.
    http://blog.xkcd.com/2010/05/03/color-survey-results/
    http://xkcd.com/color/rgb.txt
    """
    # the mutable default argument doubles as a cache across calls
    if len(cache) > 0:
        return cache
    with open('rgb.txt') as fhand:
        text = fhand.read()
    colors = re.findall(r'#([0-9a-f]+)', text)
    not_saturated = [i for i in colors if not saturated(i)]
    cache.extend(not_saturated)
    return cache
def saturated(color):
    """
    True when the summed RGB brightness is above 510 (out of 255 * 3 = 765),
    i.e. the color is too light to read against a white background.
    """
    r, g, b = [int(i, 16) for i in [color[:2], color[2:4], color[4:6]]]
    lux = r + g + b
    return lux > 510
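# Quick sanity checks: pure white is over the 510 brightness cutoff,
# pure black is not.
assert saturated('ffffff')
assert not saturated('000000')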
def load():
    global tech_jobs, all_jobs
    c = csv.reader(open('PW_FY2012_Q2.csv'))
    header = next(c)
    wage_i = header.index('PWD_WAGE_RATE')
    field_i = header.index('PWD_SOC_TITLE')
    job_i = header.index('PW_JOB_TITLE')
    # PRIMARY_WORKSITE_STATE or EMPLOYER_STATE ?
    state_i = header.index('PRIMARY_WORKSITE_STATE')
    Row = namedtuple('Row', 'pay, state, field, job')
    all_jobs = []
    for row in c:
        field = row[field_i]
        job = row[job_i]
        state = row[state_i]
        pay_str = row[wage_i]
        try:
            pay = float(pay_str)
        except ValueError:
            # not a number
            continue
        if pay < 10000:
            # under min wage, a bug
            continue
        tup = Row(pay, state.strip().lower(), field.strip().lower(), job.strip().lower())
        all_jobs.append(tup)
    print 'len jobs', len(all_jobs)
    tech_jobs = []
    for tup in all_jobs:
        pay, state, field, job = tup
        job_name = field + ' - ' + job
        if re.findall(r'(program|software|computer)', job_name.lower()):
            tech_jobs.append(tup)
    print 'len tech jobs', len(tech_jobs)
    # namedtuples sort by their first field, so this orders by pay
    tech_jobs = sorted(tech_jobs)
    #data = np.array(new_data)
    #wages = data[:,1].astype(float)
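# Each parsed row becomes something like (hypothetical values):
#   Row(pay=95000.0, state='california',
#       field='software developers, applications', job='software engineer')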
def print_ps(wages):
    # wages must already be sorted in ascending order
    percentiles = 75, 50, 25, 10, 5, 1, 0.1
    for p in percentiles:
        p_i = int(len(wages) * (100 - p) / 100.0)
        min_wage = wages[p_i]
        print '%s%% earn more than $%0.0fK' % (p, min_wage / 1000)
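# Example (hypothetical numbers), with an already-sorted list of wages:
#   print_ps(sorted([52000.0, 60000.0, 75000.0, 88000.0, 120000.0]))
# prints one "X% earn more than $YK" line per entry in `percentiles`.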
def state_ps(state=None):
    # tech_jobs is sorted by pay and filtering preserves that order,
    # which is what print_ps() expects
    if state is None:
        wages = [i.pay for i in tech_jobs]
    else:
        print state, 'state'
        wages = [i.pay for i in tech_jobs if i.state.lower().strip() == state]
    print len(wages)
    print_ps(wages)
def print_percentiles():
    state_ps()
    state_ps('new york')
    state_ps('florida')
    state_ps('california')
    #print '-------'
    #for i in tech_jobs[int(len(tech_jobs) * .999):]:
    #    print i
COLOR_SCHEMES = {
    'oldschool': ((59,76,76), (125,140,116), (217,175,95), (127,92,70), (51,36,35)),
    'citrus': ((34,51,49), (70,102,66), (153,142,61), (229,156,44), (255,116,37)),
    'goldfish': ((229,106,0), (204,199,148), (153,145,124), (88,89,86), (48,49,51)),
    'audacity': ((181,40,65), (255,192,81), (255,137,57), (232,95,77), (89,0,81)),
}
DEFAULT_STYLE = '''
.tag_item {
    text-decoration: none;
    font-weight: bold;
    white-space: nowrap;
}
.tag_item:hover {
    background-color: #eee;
}
'''
def html_tag_cloud(name_sizes, fn, max_size=100, min_size=5, css_class="tag_item", style=DEFAULT_STYLE, count_fmt="{}"):
    #colors = COLOR_SCHEMES['audacity']
    sizes = np.array([i[1] for i in name_sizes])
    # scale the largest tag to max_size pixels (avoid in-place *= so an
    # integer array isn't truncated mid-calculation)
    normalizer = 1.0 * max_size / sizes.max()
    sizes = sizes * normalizer
    #offset = min_size - sizes.min()
    #sizes += offset
    sizes = sizes.astype(int)
    with open(fn, 'w') as fhand:
        droplet_template = '''<a href="" class="{css_class}" title="{count}" style="font-size: {size}px; color: #{color};">{text}</a> '''
        fhand.write('<style>%s</style>' % style)
        for i, (name, count) in enumerate(name_sizes):
            #r, g, b = random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)
            #color = '%x%x%x' % (r, g, b) #random.choice(colors)
            color = random.choice(xkcd_colors())
            #lux = random.randint(0, 600)
            #r, g, b =
            line = droplet_template.format(size=sizes[i], text=name, count=count_fmt.format(count), color=color, style=style, css_class=css_class)
            fhand.write(line)
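# Example usage (hypothetical counts; needs rgb.txt for xkcd_colors()):
#   html_tag_cloud([('software developer', 120), ('accountant', 35)],
#                  'example_cloud.html')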
d3_fmt = '''
<body>
<style>
svg {
    cursor: default;
}
</style>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script src="d3.layout.cloud.js"></script>
<div id="all_frequencies"></div>
<div id="all_wages"></div>
<script>
var fill = d3.scale.category20();
var width = 600;
var height = 600;
var tooltip = d3.select("body")
    .append("div")
    .style("position", "absolute")
    .style("z-index", "10")
    .style("background-color", "#fff")
    .style("border", "1px solid #000")
    .style("padding", "5px")
    .style("visibility", "hidden")
    .text("a simple tooltip");
var suffix = "%s";
var words = [
%s
];
var normalize_size = 90.0 / d3.max(words, function(d) { return d.o_size; });
var cloud = d3.layout.cloud()
    .words(words)
    .size([width, height])
    .timeInterval(10)
    .font("Impact")
    .fontSize(function(d) { return d.o_size * normalize_size; })
    .rotate(function(d) { return ~~(Math.random() * 5) * 10 - 20; })
    .padding(1)
    .on("end", draw)
    .start();
function draw(words) {
    d3.select("body").append("svg")
        .attr("width", width)
        .attr("height", height)
        .append("g")
        .attr("transform", "translate(" + (width/2) + "," + (height/2) + ")")
        .selectAll("text")
        .data(words)
        .enter().append("text")
        .style("font-size", function(d) { return d.size + "px"; })
        .style("font-family", "Impact")
        .style("fill", function(d, i) { return fill(i); })
        .attr("text-anchor", "middle")
        .attr("transform", function(d) {
            return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
        })
        .text(function(d) { return d.text; })
        .on("mouseover", function(d) {
            d3.select(this).style("opacity", 0.7);
            tooltip.text(d.o_size + " - " + d.text + " " + suffix);
            //tooltip.text(d.tip);
            tooltip.style("visibility", "visible");
        })
        .on("mousemove", function() {
            return tooltip.style("top", (d3.event.pageY - 10) + "px").style("left", (d3.event.pageX + 10) + "px");
        })
        .on("mouseout", function() {
            d3.select(this).style("opacity", 1.0);
            tooltip.style("visibility", "hidden");
        });
}
</script>
</body>
'''
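# The template above expects a local copy of d3.layout.cloud.js (Jason Davies'
# d3-cloud word-cloud layout, https://github.com/jasondavies/d3-cloud) next to
# the generated HTML files, and loads d3 v3 from d3js.org.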
def d3_tag_cloud(name_sizes, fn, max_size=90, suffix='usd'):
    #names = [i[0] for i in name_sizes]
    #sizes = np.array([i[1] for i in name_sizes])
    #normalizer = 1.0 * max_size / sizes.max()
    #sizes *= normalizer
    #name_sizes = zip(names, sizes)
    word_fmt = '{text: "%s", size: 1, o_size: %f, tip: "%s"}'
    with open(fn, 'w') as fhand:
        words_str = ','.join([word_fmt % (name, size, "%s %s" % (size, suffix)) for name, size in name_sizes])
        html = d3_fmt % (suffix, words_str)
        fhand.write(html)
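# Example (hypothetical medians) - writes tech_wages.html with one word per
# job title, sized by its median wage:
#   d3_tag_cloud([('software developer', 92000.0), ('systems analyst', 78000.0)],
#                'tech_wages.html')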
def tag_cloud(jobs, set_name='job'):
    job_counts = Counter([i.job for i in jobs])
    #job_frequencies = sorted(job_counts.iteritems(), key=lambda x: x[1], reverse=True)
    #print 'len jobs', len(job_frequencies)
    #job_frequencies = job_frequencies[:TOPN]
    job_frequencies = job_counts.most_common(TOPN)
    # ignore rare jobs
    print 1
    pop_jobs = set(i[0] for i in job_frequencies)
    # collect every wage seen for each of the popular job titles
    job_pays = {}
    for tup in jobs:
        if tup.job not in pop_jobs:
            continue
        wages = job_pays.get(tup.job, [])
        wages.append(tup.pay)
        job_pays[tup.job] = wages
    print 2
    # median wage per popular job title, highest first
    job_meds = []
    for name, wages in job_pays.items():
        job_meds.append((name, np.median(wages)))
    job_meds = sorted(job_meds, key=lambda x: x[1], reverse=True)
    d3_tag_cloud(job_frequencies, set_name + '_frequencies.html', suffix='')
    d3_tag_cloud(job_meds, set_name + '_wages.html')
    #from pytagcloud import create_tag_image, make_tags
    #from pytagcloud.lang.counter import get_tag_counts
    #html_tag_cloud(job_frequencies, set_name + '_frequencies.html')
    #html_tag_cloud(job_meds, set_name + '_wages.html', count_fmt='${}')
    #YOUR_TEXT = "A tag cloud is a visual representation for text data, typically\
    #used to depict keyword metadata on websites, or to visualize free form text."
    #tags = make_tags(get_tag_counts(YOUR_TEXT), maxsize=120)
    #print 3
    #tags = make_tags(job_frequencies, maxsize=80)
    #create_tag_image(tags, 'job_frequencies.png', size=(900, 600), fontname='Lobster')
    #print 4
    #tags = make_tags(job_meds, maxsize=80)
    #create_tag_image(tags, 'job_median_pay.png', size=(900, 600), fontname='Lobster')
def anomalies():
    global all_jobs
    # the most common exact wage values in the full data set
    pay_freqs = Counter([i.pay for i in all_jobs])
    return pay_freqs.most_common(TOPN)
def main():
    load()
    print_percentiles()
    tag_cloud(all_jobs, 'all')
    tag_cloud(tech_jobs, 'tech')
if __name__ == '__main__':
    main()
'''
trunc = 1.0 / 200
bottom = wages[int(len(wages) * trunc)]
top = wages[-int(len(wages) * trunc)]
#plt.hist(wages, range=[bottom, top])
span = top - bottom
n_bins = 100
bins = np.zeros(n_bins)
for w in wages:
    percentile_i = int(n_bins * (w - bottom) / span)
    if percentile_i < 0:
        percentile_i = 0
    elif percentile_i >= n_bins:
        percentile_i = n_bins - 1
    bins[percentile_i] += 1
plt.bar([i * span / n_bins + bottom for i in range(n_bins)], bins, width=span/n_bins)
'''
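# A simpler sketch of the same clipped histogram using matplotlib's built-in
# binning (assumes `wages` is a sorted list of floats, as print_ps expects):
#   trunc = int(len(wages) / 200)
#   bottom, top = wages[trunc], wages[-trunc - 1]
#   plt.hist(wages, bins=100, range=(bottom, top))
#   plt.show()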