nogweii · December 16, 2015 22:39
diff --git a/massage_data.rb b/massage_data.rb
 #!/usr/bin/ruby

 # The StackOverflow data set is available as an XLSX (Microsoft 2007+ Excel
 # Spreadsheet) file, which means that most readers will 'helpfully' convert the
 # data into various formats.
 #
 # Also, the data itself is presented as a range of numbers, and for my
 # statistical analysis I'm using simple numbers, so I take the mean of the
 # range.

 require 'csv'

 # Parse the excel spreadsheet file, converted to CSV by LibreOffice
 stack_overflow_responses = CSV.read("Stack Overflow Survey Responses.csv")

 # The total number of responses is the number of rows, less 2. (The first 2 rows
 # are just headers)
 total_number_of_responses = stack_overflow_responses.length - 2

 # Grab the headers before playing with the data to avoid corruption
 header = stack_overflow_responses[0]
 # ..and then delete them, to avoid any trip ups from unexpected input
 stack_overflow_responses.shift ; stack_overflow_responses.shift

 # Takes a string formatted like a date: MM/DD/YYYY, and returns the mean of the
 # range Excel interpreted as a date. This means the average of (MM + DD) from
 # the date string. Also accepts a simple number (determined by the lack of a '/'
 # in the passed parameter) and runs to_i. Anything else, nil included, returns a
 # 0.
 #
 # A range given in the form of '$1,000 - $5,000' will also be appropriately
 # parsed.
 def datestamp_to_mean(stamp)
    stamp.delete! " ,<$"
    if stamp.nil?
        0
    elsif stamp.index '/'
        date_numbers = stamp.split('/').map {|n| n.delete(',').to_i }
        (date_numbers[0] + date_numbers[1]) / 2
    elsif stamp.index '-'
        range = stamp.split('-').map(&:to_i)
        (range[0] + range[1]) / 2
    else
        stamp.to_i
    end
 end

 # The columns in a given response row, that I care about:
 # 0 = Location
 # 2 = Age
 # 3 = Years experience, either as a small number (<50) or the date's timestamp
 # 5 = Number of employees
 # 100 = Salary

 # Go through each response, and remove any responses that didn't answer all of
 # the questions I'm looking for
 stack_overflow_responses.reject! do |response|
    [0, 2, 3, 5, 100].map {|index| response[index].nil? }.any?
 end

 # Modify each column, parsing it and converting to the mean
 stack_overflow_responses.map! do |response|
    [response[0], datestamp_to_mean(response[2]),  datestamp_to_mean(response[3]), datestamp_to_mean(response[5]), datestamp_to_mean(response[100])]
 end

 CSV.open('stack_overflow_responses.r.csv', 'wb') do |csv|
    csv << header.values_at(0,2,3,5,100)
    stack_overflow_responses.each do |entry|
        csv << entry
    end
 end

 puts "Number of total responses"
 puts total_number_of_responses

 puts "Number of usable responses"
 puts stack_overflow_responses.length
	#!/usr/bin/ruby

	# The StackOverflow data set is available as an XLSX (Microsoft 2007+ Excel
	# Spreadsheet) file, which means that most readers will 'helpfully' convert the
	# data into various formats.
	#
	# Also, the data itself is presented as a range of numbers, and for my
	# statistical analysis I'm using simple numbers, so I take the mean of the
	# range.

	require 'csv'

	# Parse the excel spreadsheet file, converted to CSV by LibreOffice
	stack_overflow_responses = CSV.read("Stack Overflow Survey Responses.csv")

	# The total number of responses is the number of rows, less 2. (The first 2 rows
	# are just headers)
	total_number_of_responses = stack_overflow_responses.length - 2

	# Grab the headers before playing with the data to avoid corruption
	header = stack_overflow_responses[0]
	# ..and then delete them, to avoid any trip ups from unexpected input
	stack_overflow_responses.shift ; stack_overflow_responses.shift

	# Takes a string formatted like a date: MM/DD/YYYY, and returns the mean of the
	# range Excel interpreted as a date. This means the average of (MM + DD) from
	# the date string. Also accepts a simple number (determined by the lack of a '/'
	# in the passed parameter) and runs to_i. Anything else, nil included, returns a
	# 0.
	#
	# A range given in the form of '$1,000 - $5,000' will also be appropriately
	# parsed.
	def datestamp_to_mean(stamp)
	stamp.delete! " ,<$"
	if stamp.nil?
	0
	elsif stamp.index '/'
	date_numbers = stamp.split('/').map {\|n\| n.delete(',').to_i }
	(date_numbers[0] + date_numbers[1]) / 2
	elsif stamp.index '-'
	range = stamp.split('-').map(&:to_i)
	(range[0] + range[1]) / 2
	else
	stamp.to_i
	end
	end

	# The columns in a given response row, that I care about:
	# 0 = Location
	# 2 = Age
	# 3 = Years experience, either as a small number (<50) or the date's timestamp
	# 5 = Number of employees
	# 100 = Salary

	# Go through each response, and remove any responses that didn't answer all of
	# the questions I'm looking for
	stack_overflow_responses.reject! do \|response\|
	[0, 2, 3, 5, 100].map {\|index\| response[index].nil? }.any?
	end

	# Modify each column, parsing it and converting to the mean
	stack_overflow_responses.map! do \|response\|
	[response[0], datestamp_to_mean(response[2]), datestamp_to_mean(response[3]), datestamp_to_mean(response[5]), datestamp_to_mean(response[100])]
	end

	CSV.open('stack_overflow_responses.r.csv', 'wb') do \|csv\|
	csv << header.values_at(0,2,3,5,100)
	stack_overflow_responses.each do \|entry\|
	csv << entry
	end
	end

	puts "Number of total responses"
	puts total_number_of_responses

	puts "Number of usable responses"
	puts stack_overflow_responses.length
No results found