Last active
August 12, 2021 17:35
-
-
Save ejlangev/59436f6a087c9e2768294a7709c4f074 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'pp' | |
require 'ostruct' | |
# Parse the query export into one record per line.
#
# After every `{`, `}` and `"` character is removed, each line is split at its
# first comma: the text before it becomes `loan_file_id`, and the remainder is
# split on commas into activity names. File.foreach closes the file once the
# last line has been read, so no handle is left open.
data = File.foreach('Query Results.csv').filter_map do |line|
  loan_file_id, activity_list = line.tr('{}"', '').strip.split(',', 2)
  next if loan_file_id.nil? # nothing left after stripping: skip blank lines

  # A line without a comma has no activity list; treat it as zero activities.
  all_activities = activity_list.to_s.split(',')
  OpenStruct.new(
    loan_file_id: loan_file_id,
    all_activities: all_activities,
    all_unique_activities: all_activities.uniq
  )
end
# Tally how many records list each activity at least once, and collect the
# activities whose tally reaches the total number of records.
activity_frequency_map = {}
common_subset = []
loan_count = data.size

# Flattening the per-record unique lists visits activities in the same order
# as iterating record by record.
data.flat_map(&:all_unique_activities).each do |activity|
  occurrences = activity_frequency_map.fetch(activity, 0) + 1
  activity_frequency_map[activity] = occurrences
  common_subset.push(activity) if occurrences == loan_count
end

puts "Common Subset of Activities: #{common_subset.size}"
# Write one "name,count" row per entry of activity_frequency_map to
# activity_frequencies.csv. Mode 'w' already creates the file or truncates an
# existing one to zero length, so no separate File.truncate call is needed.
File.open('activity_frequencies.csv', 'w') do |f|
  activity_frequency_map.each_pair do |name, value|
    f.puts("#{name},#{value}")
  end
end
# Write one row per record to per_loan_data.csv with the columns:
# loan_file_id, number of activities, number of unique activities, number of
# activities named AnswerAdHocQuestions or AdHocDocumentCollection, and number
# of activities whose name contains "AdHoc".
# Mode 'w' already truncates an existing file, so File.truncate is not needed.
File.open('per_loan_data.csv', 'w') do |f|
  # Built once instead of once per activity of every record.
  borrower_facing_names = %w[AnswerAdHocQuestions AdHocDocumentCollection]

  data.each do |d|
    borrower_facing_ad_hoc = d.all_activities.count { |n| borrower_facing_names.include?(n) }
    all_ad_hoc = d.all_activities.count { |n| n.include?('AdHoc') }
    f.puts("#{d.loan_file_id},#{d.all_activities.size},#{d.all_unique_activities.size},#{borrower_facing_ad_hoc},#{all_ad_hoc}")
  end
end
# For each threshold, print how many activities occur on at most that fraction
# of records, and how many records contain at least one such activity, each
# with its share of the respective total.
loan_total = data.size
activity_total = activity_frequency_map.size

[0.01, 0.03, 0.05, 0.08, 0.10].each do |percent|
  # NOTE: the comparison is inclusive (<=) even though the printed labels say "<".
  rare_activities = activity_frequency_map.select { |_name, loans| (loans.to_f / loan_total) <= percent }
  # Hash lookups avoid building an intersection array per record just to test
  # whether it is empty.
  loans_with_rare_activities = data.count do |d|
    d.all_unique_activities.any? { |name| rare_activities.key?(name) }
  end
  puts "# of activities that happen on < #{percent * 100.0}% of loans: #{rare_activities.size} (#{(rare_activities.size * 100.0 / activity_total).round(2)}%)"
  puts "# of loans with an activity that happens < #{percent * 100.0}% of the time: #{loans_with_rare_activities} (#{(loans_with_rare_activities * 100.0 / loan_total).round(2)}%)"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment