Last active
December 12, 2015 08:19
-
-
Save natematias/4743564 to your computer and use it in GitHub Desktop.
Scrape baby names from the infochimps US census site download
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scraper requires the SSA (US Social Security Administration) baby-names dataset scraped by infochimps:
# www.infochimps.com/datasets/popular-baby-names-by-year-top-1000-us-social-security-administr
| require 'nokogiri' | |
# Walk the yearly top-1000 pages (1880-2009) and print two CSV rows to stdout
# for every table row found: one for the male name and one for the female name.
# Row format: sex,year,rank,name,count
(1880..2009).each do |year|
  # File.read opens, reads and closes the file in a single call, so no file
  # handle is left open for each of the 130 yearly pages.
  html = File.read("babynames/top-1000-#{year}-num.html")
  doc = Nokogiri::HTML(html)

  doc.search("table[@bordercolor='#aaabbb'] tr[@align='right']").each do |row|
    cells = row.search('td')
    rank = cells[0].inner_html

    # Cell layout: rank, male name, male count, female name, female count.
    # Counts contain thousands separators (","), which would break the CSV
    # columns, so they are stripped before printing.
    print ["M", year, rank, cells[1].inner_html, cells[2].inner_html.delete(",")].join(",") + "\n"
    print ["F", year, rank, cells[3].inner_html, cells[4].inner_html.delete(",")].join(",") + "\n"
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'csv' | |
# Writes every key/value pair of +hash+ as one two-column row of a CSV file.
#
# @param filename [String] path of the CSV file to create (opened in "wb" mode)
# @param hash [#each] collection yielding (key, value) pairs, e.g. name => total
# @return [Object] whatever +hash.each+ returns (the collection itself for a Hash)
def write_names_csv(filename, hash)
  CSV.open(filename, "wb") do |out|
    hash.each { |key, total| out << [key, total] }
  end
end
# Weight applied to a name count depending on the year it was recorded.
#
# Years inside 1960..1980 count fully (weight 1). Outside that window the
# weight drops by 0.1 for every decade bucket (year / 10) separating the year
# from the nearest edge of the window, and never goes below 0.0.
#
# The distance is measured from the *nearest* window edge: 1980 for later
# years, 1960 for earlier ones. Measuring from the opposite edge yields
# weights above 1 that grow with distance and jump at the window boundaries.
#
# @param year [Integer, String] the year; converted with +to_i+
# @return [Integer, Float] 1 inside the window, otherwise a Float in 0.0..1.0
def year_modifier(year)
  year = year.to_i
  return 1 if year.between?(1960, 1980)

  # Integer division groups years into decade buckets (1990..1999 -> 199).
  decades_away =
    if year > 1980
      year / 10 - 1980 / 10
    else
      1960 / 10 - year / 10
    end

  # Floor at zero so years far from the window cannot produce a negative
  # weight, which would subtract from a name's total.
  [1.0 - decades_away / 10.0, 0.0].max
end
# Sum the year-weighted counts per name, keeping one tally per sex, then
# write each tally to its own CSV file.
# Columns read from babynames.csv: [0] sex ("M"/"F"), [1] year, [3] name,
# [4] count. Column [2] is not used here (presumably the rank -- confirm
# against whatever produced babynames.csv).
totals = { "M" => {}, "F" => {} }

CSV.foreach("babynames.csv") do |fields|
  weighted = fields[4].to_i * year_modifier(fields[1].to_i)

  tally = totals[fields[0]]
  next if tally.nil? # rows whose first column is neither "M" nor "F" are ignored

  name = fields[3]
  tally[name] = tally.fetch(name, 0) + weighted
end

write_names_csv "female_names.csv", totals["F"]
write_names_csv "male_names.csv", totals["M"]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment