Last active
June 2, 2016 18:27
-
-
Save adamlutz/e2a7f4ded0e5b94aecb70a6f90e7fd2b to your computer and use it in GitHub Desktop.
mechanized cancer.gov search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'csv'
require 'uri'

require 'mechanize'
require 'pry'
require 'ruby-progressbar'
# Mechanize session used for the form scrape below and for every request
# issued later in the script.
agent = Mechanize.new do |mech|
  mech.user_agent_alias = 'Mac Safari'
end

# field name => list of [value, label] pairs offered by the risk form
query_options = {}
biopsy_options = {}

agent.get('http://www.cancer.gov/bcrisktool/Default.aspx') do |page|
  form = page.form_with(:name => 'risk')
  raise "form named 'risk' not found on the risk tool page" if form.nil?

  form.fields.each do |field|
    # Only fields that expose a list of options can be enumerated; anything
    # else is skipped instead of raising NoMethodError.
    next unless field.respond_to?(:options)

    choices = field.options.flatten.map { |option| [option.value, option.text] }
    # Drop placeholder entries and the '< 35' age choice.
    query_options[field.name] = choices.reject do |value, label|
      value == '' || label == "Select" || label == '< 35'
    end
  end

  # post-process / simplify full universe of options to shorten length of
  # script run-time:
  query_options.delete('history')
  query_options.delete('genetics')
  # sub-race choices are queried as additional values of 'race'
  query_options['race'] += query_options['subrace']
  query_options.delete('subrace')

  # move positive matching biopsy options to separate loop
  # biopsy_options['previous_biopsies'] = query_options['previous_biopsies']
  # biopsy_options['biopsy_with_hyperplasia'] = query_options['biopsy_with_hyperplasia']
  biopsy_options['ever_had_biopsy'] = [ ["1", "Yes"] ]
  query_options['ever_had_biopsy'] = biopsy_options['ever_had_biopsy'] #[ ["0", "No"] ]
  # query_options['previous_biopsies'] = [ ["0", "No"] ]
  # query_options['biopsy_with_hyperplasia'] = [ ["0", "No"] ]
end
# Expands a hash of candidate lists into every combination of one candidate
# per key (the Cartesian product of the value lists).
#
# @param hsh [Hash{Object => Array}] key => list of candidates for that key
# @return [Array<Hash>] one hash per combination, with the same keys as +hsh+;
#   an empty array when +hsh+ is empty or any candidate list is empty
def permutation_hash(hsh)
  # Without this guard an empty hash would call #product on nil.
  return [] if hsh.empty?

  keys = hsh.keys
  first, *rest = hsh.values
  first.product(*rest).map { |combo| keys.zip(combo).to_h }
end
# Builds the RiskAssessment.aspx URL for one combination of form choices.
#
# Parameter names and values are form-encoded, so a value containing spaces,
# '&' or '=' can no longer corrupt the query string.
#
# @param query_options [Hash{String => Array}] field name => [value, label];
#   only the first element (the submitted value) goes into the URL
# @return [String] the request URL, always starting with genetics=0
def build_url(query_options)
  pairs = query_options.map { |name, choice| [name, choice.first] }
  'http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?' +
    URI.encode_www_form([%w[genetics 0], *pairs])
end
# Extracts every percentage figure from the result summary list of a page.
#
# Whitespace is not normalised first: the pattern only matches a number that
# is immediately followed by '%', so collapsing whitespace around it cannot
# change the matches.
#
# @param doc [#xpath] parsed document; the node set returned for the
#   gray-bg list must respond to #text
# @return [Array<String>] the numbers, without the '%' sign, in document
#   order (e.g. ["1.3", "0.9"]); empty when none are found
def parsed_percentages(doc)
  summary_text = doc.xpath("//ul[@class='gray-bg']").text
  summary_text.scan(/([0-9]*\.[0-9]+|[0-9]+)%/).flatten
end
# Request the risk assessment for every combination of form choices and write
# one CSV row per combination.
sample = permutation_hash(query_options)
# sample = sample.slice(0,20)
puts "about to mechanize #{sample.count} requests to cancer.gov!"

progressbar = ProgressBar.create(:format         => '%a %bᗧ%i %p%% %t',
                                 :progress_mark  => ' ',
                                 :remainder_mark => '・',
                                 :total          => sample.count,
                                 :starting_at    => 0)

# Form fields written to the CSV, in column order. The same list drives both
# the header and each row so the two cannot drift apart.
param_columns = %w[
  current_age
  age_at_menarche
  age_at_first_live_birth
  related_with_breast_cancer
  ever_had_biopsy
  previous_biopsies
  biopsy_with_hyperplasia
  race
]
risk_columns = ["risk %", "average risk %", "to age 90 risk %", "average to age 90 risk %"]

CSV.open("with_previous_biopsy_results.csv", "w") do |csv|
  csv << param_columns + risk_columns + ["url"]

  sample.each do |unique_params|
    url = build_url(unique_params)
    doc = agent.get(url).parser

    percentages = parsed_percentages(doc)
    # Pad a short result so the url always lands in its own column.
    missing = risk_columns.size - percentages.size
    percentages += [nil] * missing if missing > 0

    # Each choice is a [value, label] pair; the label (last) goes in the CSV.
    labels = param_columns.map { |column| unique_params[column].last }
    csv << labels + percentages + [url] #, summary_text, informational_text # , "summary text", "long-winded informational_text"

    progressbar.increment
  end
end
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=35&age_at_menarche=99&age_at_first_live_birth=99&ever_had_biopsy=0&previous_biopsies=0&biopsy_with_hyperplasia=0&related_with_breast_cancer=99&race=2
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=36&age_at_menarche=13&age_at_first_live_birth=0&ever_had_biopsy=0&previous_biopsies=0&biopsy_with_hyperplasia=0&related_with_breast_cancer=0&race=8
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=35&age_at_menarche=99&age_at_first_live_birth=99&related_with_breast_cancer=99&ever_had_biopsy=0&previous_biopsies=1&biopsy_with_hyperplasia=0&race=7
#http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=36&age_at_menarche=13&age_at_first_live_birth=0&ever_had_biopsy=0&previous_biopsies=1&biopsy_with_hyperplasia=0&related_with_breast_cancer=0&race=8'
# doc = agent.get(url).parser
# info_text_raw = doc.xpath("//p").text
# end_of_info_str = " Home | Contact Us | Policies | Accessibility U.S. Department of Health and Human Services | National Institutes of Health | National Cancer Institute | USA.gov NIH…Turning Discovery Into Health®"
# informational_text = info_text_raw.slice(info_text_raw.index("Based")..-1).strip.gsub(/\r/," ").gsub(/\n/," ").gsub(/\s+/, ' ').chomp(end_of_info_str)
# with sub-race
# http://www.cancer.gov/bcrisktool/RiskAssessment.aspx?genetics=0&current_age=36&age_at_menarche=13&age_at_first_live_birth=0&ever_had_biopsy=0&previous_biopsies=1&biopsy_with_hyperplasia=0&related_with_breast_cancer=0&race=8&asian=It%20has%20been%20observed%20that%20recent%20immigrants%20from%20rural%20Asia%20may%20have%20a%20lower%20risk%20of%20breast%20cancer%20than%20calculated.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment