Created
July 3, 2012 02:13
-
-
Save charleslparker/3037105 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Build a county-level demographic CSV from US Census data and upload it to
# BigML.

# Stop on the first failing command, on use of an unset variable, or when any
# stage of a pipeline fails, instead of carrying on with incomplete data.
set -euo pipefail

# Set credentials for BigML and for the US Census.
# Each value may be supplied through the environment; otherwise the defaults
# below are used. The **** defaults look like redacted placeholders and
# presumably have to be replaced with real keys before running.
BIGML_USERNAME="${BIGML_USERNAME:-bigml}"
BIGML_API_KEY="${BIGML_API_KEY:-****}"
CENSUS_API_KEY="${CENSUS_API_KEY:-****}"
# Authentication fragment appended to the BigML request URL as a query string.
BIGML_AUTH="username=${BIGML_USERNAME};api_key=${BIGML_API_KEY}"
# Download demographic data into files. Have to do it this way
# because the census API doesn't seem to allow returns of more
# than eight columns (at least, not with this many rows).

#######################################
# Download one group of census variables for every county.
# Globals:   CENSUS_API_KEY (read)
# Arguments: $1 - comma-separated list of census variable names
#            $2 - file that receives the raw API response
# Returns:   does not return on failure; exits the script with status 1
#######################################
fetch_acs5() {
  local variables=$1
  local outfile=$2
  local url="http://thedataweb.rm.census.gov/data/2010/acs5?get=${variables}&for=county:*&key=${CENSUS_API_KEY}"

  # --fail makes curl exit non-zero on an HTTP error, so an error page is
  # never saved and later parsed as if it were data.
  if ! curl --fail "$url" > "$outfile"; then
    printf 'error: could not download census data into %s\n' "$outfile" >&2
    exit 1
  fi
}

# Median income for various levels of education.
fetch_acs5 "B20004_002E,B20004_003E,B20004_004E,B20004_005E,B20004_006E" inc.tmp

# Female educational attainment up to associate's degrees. The original note
# calls these percentages; the normalization step later in this script divides
# them by the first column, so they are presumably raw counts - confirm
# against the census variable definitions.
fetch_acs5 "B15002_019E,B15002_028E,B15002_029E,B15002_030E,B15002_031E" fem1.tmp

# Female educational attainment for bachelor's degrees and above.
fetch_acs5 "B15002_032E,B15002_033E,B15002_034E,B15002_035E" fem2.tmp

# The number of people in poverty along with the total number
# for whom poverty was determined.
fetch_acs5 "B17001_001E,B17001_002E" pov.tmp

# Male educational attainment up to associate's degrees (same layout as the
# female file: the first column is the one later used as the total).
fetch_acs5 "B15002_002E,B15002_011E,B15002_012E,B15002_013E,B15002_014E" male1.tmp

# Male educational attainment for bachelor's degrees and above.
fetch_acs5 "B15002_015E,B15002_016E,B15002_017E,B15002_018E" male2.tmp
# Join the two downloaded halves of each gender's educational data line by
# line, separated by a comma, so that every row of the combined file carries
# all of that gender's columns.
for gender in fem male; do
  paste -d "," "${gender}1.tmp" "${gender}2.tmp" > "${gender}-edu-cat.tmp"
done
# The awk programs below split each line on runs of the characters ] [ , "
# which strips the JSON punctuation around the values. Every line starts with
# punctuation, so $1 is empty and the first value is $2. The separator is
# single-quoted so the shell cannot expand it as a filename pattern.
#
# All of them skip the first line (the header). A new header is added at the
# end of the script.

#######################################
# Normalize one gender's education columns by the total.
# Columns are also summed so the result is the share of the population with,
# e.g., "high school or greater" education rather than just "high school".
# Field layout: $2 total, $3-$6 high school up to associate's, $9-$12
# bachelor's and above. $7 and $8 are skipped (presumably the state and
# county codes returned with each row).
# Arguments: $1 - pasted education file for one gender
# Outputs:   one line per data row with four comma-separated values: high
#            school or greater, associates or greater, bachelors or greater,
#            graduate or greater. A row whose total is zero or not a number
#            yields four empty values, so awk does not abort on a division by
#            zero and rows stay aligned with the other files.
#######################################
education_shares() {
  awk -F '[][,"]+' 'NR > 1 {
    total = $2 + 0
    if (total == 0) {
      print ",,,"
      next
    }
    graduate = $10 + $11 + $12
    bachelors = $9 + graduate
    associates = $6 + bachelors
    high_school = $3 + $4 + $5 + associates
    row = (high_school / total) "," (associates / total) "," (bachelors / total) "," (graduate / total)
    print row
  }' "$1"
}

#######################################
# Clean the brackets and header out of the income file.
# Arguments: $1 - raw income file
# Outputs:   the five income values of each data row, comma-separated
#######################################
income_columns() {
  awk -F '[][,"]+' 'NR > 1 { print $2 "," $3 "," $4 "," $5 "," $6 }' "$1"
}

#######################################
# Compute the poverty variable: people in poverty ($3) / total ($2).
# Arguments: $1 - raw poverty file
# Outputs:   one ratio per data row; an empty line when the total is zero or
#            not a number
#######################################
poverty_rate() {
  awk -F '[][,"]+' 'NR > 1 {
    total = $2 + 0
    if (total == 0) {
      print ""
      next
    }
    print $3 / total
  }' "$1"
}

education_shares fem-edu-cat.tmp > fem-edu.tmp
# Do the same thing with the male education stats.
education_shares male-edu-cat.tmp > male-edu.tmp
income_columns inc.tmp > income.tmp
poverty_rate pov.tmp > poverty.tmp
# Create a new header with better column names. The order matches the files
# pasted together below: five income columns, four female education columns,
# four male education columns, then poverty.
CSV_HEADER="Income: Less than High School,Income: High School,Income: Associates,Income: Bachelors,Income: Graduate,Education: Female - High School,Education: Female - Associate's,Education: Female - Bachelor's,Education: Female - Graduate,Education: Male - High School,Education: Male - Associate's,Education: Male - Bachelor's,Education: Male - Graduate,Poverty"
printf '%s\n' "$CSV_HEADER" > head.tmp
# Concatenate all of the demographic attributes to a single file, joining the
# per-attribute files line by line with commas.
paste -d "," income.tmp fem-edu.tmp male-edu.tmp poverty.tmp > data.tmp
# Concatenate the header with the rest of the data
cat head.tmp data.tmp > census-data.csv
#######################################
# Remove the temporary files this script creates in the current directory.
# Only those files are named, so unrelated *.tmp files that happen to live in
# the same directory are left alone. -f keeps a missing file from being an
# error.
# Returns: exit status of rm
#######################################
remove_tmp_files() {
  rm -f -- \
    inc.tmp fem1.tmp fem2.tmp pov.tmp male1.tmp male2.tmp \
    fem-edu-cat.tmp male-edu-cat.tmp \
    fem-edu.tmp male-edu.tmp income.tmp poverty.tmp \
    head.tmp data.tmp
}

remove_tmp_files
# Create a BigML datasource with the resulting file by posting it as the
# multipart form field "file". The URL is quoted because it contains "?" and
# ";" which must reach curl literally rather than be interpreted by the shell.
: "${BIGML_AUTH:?BIGML_AUTH must be set to the BigML authentication string}"
curl "https://bigml.io/andromeda/source?${BIGML_AUTH}" -F "file=@census-data.csv"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment