Created
June 11, 2012 16:22
-
-
Save plotti/2910991 to your computer and use it in GitHub Desktop.
Smart interest groups matching
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of list places (rows after the header) read from each category's member list.
MAX = 200
# Merge threshold: two categories are merged when the number of members they
# share exceeds MAX * THRESHOLD (e.g. 0.2 = 20 % of members are shared).
THRESHOLD = 0.2

# Output CSV files; both file names embed the two settings above.
outfile = CSV.open("data/partitions#{MAX}_#{THRESHOLD}.csv", "wb")
final_partition = CSV.open("data/final_partitions#{MAX}_#{THRESHOLD}.csv", "wb")

# Header row of the partitions file.
outfile << [
  "Name",
  "Original Category",
  "Original Category Place",
  "Assigned Category",
  "Assigned Category Place",
  "Competing Categories",
  "Details"
]
# Members of every community, keyed by project name:
#   members[project_name][member_name] = {:rank => ..., :count => ...}
# :rank is the 1-based row position in the project's sorted members CSV,
# :count is the third CSV column parsed as an integer.
members = {}
@@communities.each do |community|
  project = Project.find(community)
  puts "Reading in project #{project.name}"
  # Skip the header row and keep at most MAX rows. Slicing a completely empty
  # file yields nil, so fall back to an empty list instead of failing below.
  rows = FasterCSV.read("#{RAILS_ROOT}/data/#{project.name}_sorted_members.csv")[1..MAX] || []
  ranked = {}
  rows.each_with_index do |member, index|
    # Blacklisted members are left out but still occupy their list place,
    # so the ranks of the following members are not shifted.
    next if BLACKLIST.include?(member[0])
    ranked[member[0]] = {:rank => index + 1, :count => member[2].to_i}
  end
  members[project.name] = ranked
end
# Maps the name of every group that has been absorbed (an original project or
# an intermediate merged group) to the name of the merged group that replaced it.
merged = {}

# First step: unite partitions that have a high overlap of members.
# For each community that has not been merged yet, find the group in `members`
# sharing the most member names with it. If more than MAX * THRESHOLD names are
# shared, both groups are removed from `members` and replaced by one combined
# group named "<project>_<other group>".
@@communities.each do |community|
  project = Project.find(community)
  # A project that was already absorbed no longer has its own entry in members.
  next if merged[project.name]

  own_members = members[project.name]

  # Find the group with the largest member overlap (never compare with yourself).
  # The strict ">" keeps the first group encountered when several tie.
  max_overlap_count, max_group = 0, ""
  members.each do |name, group|
    next if name == project.name
    overlap_count = (group.keys & own_members.keys).count
    if overlap_count > max_overlap_count
      max_overlap_count, max_group = overlap_count, name
    end
  end

  next unless max_overlap_count > MAX * THRESHOLD

  puts "Merged #{project.name} with #{max_group}"
  merged_name = "#{project.name}_#{max_group}"
  other_members = members[max_group]

  # Add the counts and merge the members; a member present in only one of the
  # two groups contributes 0 for the other.
  combined = {}
  (own_members.keys + other_members.keys).uniq.each do |member|
    own = own_members[member]
    other = other_members[member]
    total = (own ? own[:count] : 0) + (other ? other[:count] : 0)
    combined[member] = {:rank => 0, :count => total}
  end

  # Sort once by descending count and take only the first MAX members, since
  # the categories would otherwise grow with every merge.
  ranked = combined.sort { |a, b| b[1][:count] <=> a[1][:count] }.first(MAX)
  # Store the 1-based position as rank for faster lookup later on.
  ranked.each_with_index { |pair, index| pair[1][:rank] = index + 1 }
  members[merged_name] = Hash[ranked]

  # Record where the merged group is stored, and repoint every earlier merge
  # that ended in one of the two absorbed groups to the new merged group.
  [project.name, max_group].each do |entry|
    members.delete(entry)
    merged[entry] = merged_name
    merged.each do |key, value|
      merged[key] = merged_name if value == entry
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment