Last active
May 14, 2020 01:13
-
-
Save brianmcgue/3b46fac075dc14adc44184fdff133eac to your computer and use it in GitHub Desktop.
An attempt to determine how correlated the search results are between the 5.x and 7.x schemas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'json' | |
require 'elastic-site-search' # gem install elastic-site-search | |
require 'pry' | |
# Compares how two Site Search engines rank the results of the same query.
#
# Results are matched between engines by the hash of their 'title' field, and
# the agreement between the two rankings is reported as a Kendall or Spearman
# rank correlation coefficient.
class RankingComparer
  API_ENDPOINT = 'http://localhost:3002/api/v1/'
  # NOTE(review): this credential is committed in source; consider loading it
  # from the environment instead.
  API_KEY = 'uQTxqyzMRxYMXgnnxVU1'
  PAGE_SIZE = 40

  attr_reader :base_engine_name, :new_engine_name

  # @param base_engine_name [String] slug of the engine used as the reference ranking
  # @param new_engine_name [String] slug of the engine whose ranking is evaluated
  # @raise [RuntimeError] if either slug is not among the client's engines
  def initialize(base_engine_name, new_engine_name)
    @base_engine_name = base_engine_name
    @new_engine_name = new_engine_name
    validate_engines!
  end

  # Lazily configures the Elastic::SiteSearch module and memoizes a client.
  #
  # @return [Elastic::SiteSearch::Client]
  def client
    return @client if defined?(@client)

    Elastic::SiteSearch.api_key = API_KEY
    Elastic::SiteSearch.endpoint = API_ENDPOINT
    @client = Elastic::SiteSearch::Client.new
  end

  # Ensures both configured engine slugs are reported by the client.
  #
  # @raise [RuntimeError] naming the first slug that is not available
  def validate_engines!
    available_engines = client.engines.map { |engine| engine['slug'] }
    [base_engine_name, new_engine_name].each do |engine_name|
      raise "#{engine_name} is not an available engine" unless available_engines.include?(engine_name)
    end
  end

  # Fetches one page of results and maps each result to its overall rank.
  #
  # @param engine_name [String] slug of the engine to search
  # @param query [String] search query
  # @param page [Integer] 1-based page number
  # @return [Hash{Integer => Integer}] title hash => 1-based rank across pages
  def massaged_results(engine_name, query, page: 1)
    results = client.search(engine_name, query, :page => page, :per_page => PAGE_SIZE)
    # ranks continue across pages: page 2 starts at PAGE_SIZE + 1
    page_offset = PAGE_SIZE * (page - 1)
    results.records['page'].each_with_index.each_with_object({}) do |(result, idx), memo|
      memo[result['title'].hash] = idx + 1 + page_offset
    end
  end

  # Orders two ranks, where a lower number is a better rank.
  #
  # A nil rank means the result was not found in the fetched pages, i.e. it
  # ranks below every result that was found, so nil compares as the larger
  # (worse) rank.
  #
  # @param result1 [Integer, nil]
  # @param result2 [Integer, nil]
  # @return [Integer] 1 if result1 ranks worse than result2, -1 if it does not,
  #   0 if both are missing and their order is unknown
  def compare_kendall_order(result1, result2)
    return 0 if result1.nil? && result2.nil?
    return 1 if result1.nil?
    return -1 if result2.nil?

    result1 > result2 ? 1 : -1
  end

  # based on https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient
  #
  # Compares the first page of the base engine against the first two pages of
  # the new engine.
  #
  # @param query [String] search query
  # @return [Float, String] the coefficient, or a message when it cannot be computed
  def kendall_rank_correlation(query)
    base_results = massaged_results(base_engine_name, query)
    new_results = massaged_results(new_engine_name, query)
    return "No results in the base engine for #{query.inspect}" if base_results.empty?
    return "No results in the new engine for #{query.inspect}" if new_results.empty?
    # a single result forms no pairs, which would divide zero by zero
    return "Need at least two results in the base engine for #{query.inspect}" if base_results.size < 2

    new_results = with_additional_pages(new_results, query, [2])
    pairs = base_results.keys.combination(2).to_a

    # Each concordant pair counts +1 and each discordant pair -1.
    #
    # A zero order means both results of the pair are missing from the top two
    # pages of the new result set. Technically they could still be concordant
    # if they are in the same order further down, but we "dock" points and err
    # on the side of worse correlation by counting the pair as discordant. The
    # other option would be to not count it at all.
    numerator = pairs.sum do |title_hash1, title_hash2|
      base_order = compare_kendall_order(base_results[title_hash1], base_results[title_hash2])
      new_order = compare_kendall_order(new_results[title_hash1], new_results[title_hash2])
      !new_order.zero? && base_order == new_order ? 1 : -1
    end
    numerator / pairs.size.to_f
  end

  # based on https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
  #
  # Compares the first page of the base engine against the first three pages
  # of the new engine.
  #
  # @param query [String] search query
  # @return [Float, String] the coefficient, or a message when it cannot be computed
  def spearman_rank_correlation(query)
    base_results = massaged_results(base_engine_name, query)
    new_results = massaged_results(new_engine_name, query)
    return "No results in the base engine for #{query.inspect}" if base_results.empty?
    return "No results in the new engine for #{query.inspect}" if new_results.empty?
    # with a single result the denominator n * (n^2 - 1) is zero
    return "Need at least two results in the base engine for #{query.inspect}" if base_results.size < 2

    new_results = with_additional_pages(new_results, query, [2, 3])
    diff_square_sum = base_results.sum do |title_hash, base_rank|
      new_rank = new_results[title_hash]
      if new_rank.nil?
        # bail out of the whole method: a missing rank has no usable difference
        return "Result ##{base_rank} in base engine is not ranked " \
               "in the top #{new_results.size} for new engine"
      end
      (base_rank - new_rank)**2
    end

    # previously, the "worst_new_rank" was used to determine the denominator
    # because if a result from the first set isn't in the first page of the
    # second set, then just using the page size wouldn't accurately reflect
    # the number of items being looked at. Technically, however, neither does
    # "worst_new_rank"... so the base result count is used instead because it
    # makes the denominator smaller and thus the fraction larger, so we err on
    # the side of caution
    result_count = base_results.size
    denominator = result_count * ((result_count**2) - 1)
    numerator = 6 * diff_square_sum
    1 - (numerator / denominator.to_f)
  end

  private

  # Extends a rank map with further pages of the new engine's results.
  #
  # When a title appears on more than one page, the rank from the earliest
  # page is kept so that both correlation methods see the same (best) rank.
  #
  # @param ranks [Hash{Integer => Integer}] ranks already fetched
  # @param query [String] search query
  # @param pages [Array<Integer>] page numbers to fetch, in order
  # @return [Hash{Integer => Integer}] a new hash; ranks is not mutated
  def with_additional_pages(ranks, query, pages)
    pages.each_with_object(ranks.dup) do |page, merged|
      page_ranks = massaged_results(new_engine_name, query, :page => page)
      merged.merge!(page_ranks) { |_title_hash, earlier_rank, _later_rank| earlier_rank }
    end
  end
end
# Driver: prints both correlation coefficients for a fixed set of queries.
# Swap in the commented-out line to compare the other engine pair instead.
# ranking_comparer = RankingComparer.new('satisfy090', 'satisfy-running')
ranking_comparer = RankingComparer.new('satisfy-running', 'satisfy7')

# section heading => method that computes the coefficient
metrics = {
  'Kendall:' => :kendall_rank_correlation,
  'Spearman:' => :spearman_rank_correlation
}

# group heading => queries reported under that heading
query_groups = {
  'single word:' => %w[shorts singlet shirt cotton jacket hat technology],
  'more descriptive:' => ['long trail shorts', 'moth eaten shirt', 'short distance 8']
}

metrics.each_with_index do |(metric_heading, metric), position|
  # blank line between metric sections, but not before the first one
  puts unless position.zero?
  puts metric_heading
  query_groups.each do |group_heading, queries|
    puts group_heading
    queries.each do |query|
      puts "#{ranking_comparer.public_send(metric, query)} '#{query}'"
    end
  end
end
### OUTPUT 5.x => 7.x | |
# Kendall: | |
# single word: | |
# 0.839572192513369 'shorts' | |
# 0.9563025210084034 'singlet' | |
# 0.7411095305832148 'shirt' | |
# 0.4789915966386555 'cotton' | |
# 0.3 'jacket' | |
# 0.26666666666666666 'hat' | |
# 0.6096096096096096 'technology' | |
# more descriptive: | |
# 0.5334281650071123 'long trail shorts' | |
# 0.5585585585585585 'moth eaten shirt' | |
# 0.7927927927927928 'short distance 8' | |
# | |
# Spearman: | |
# single word: | |
# 0.9413292589763178 'shorts' | |
# 0.9911764705882353 'singlet' | |
# 0.7794069373016741 'shirt' | |
# 0.3680672268907563 'cotton' | |
# 0.37570356472795496 'jacket' | |
# 0.6183864915572233 'hat' | |
# 0.7051920341394026 'technology' | |
# more descriptive: | |
# 0.6456942772732246 'long trail shorts' | |
# 0.7178757705073495 'moth eaten shirt' | |
# 0.8307254623044097 'short distance 8' | |
# ## OUTPUT 0.90 => 5.x | |
# Kendall: | |
# single word: | |
# 0.36541889483065954 'shorts' | |
# 0.8050420168067227 'singlet' | |
# 0.8122332859174964 'shirt' | |
# 0.6893939393939394 'cotton' | |
# 0.5294117647058824 'jacket' | |
# 0.9738562091503268 'hat' | |
# 0.7207207207207207 'technology' | |
# more descriptive: | |
# 0.2857142857142857 'long trail shorts' | |
# -0.2132132132132132 'moth eaten shirt' | |
# 0.18618618618618618 'short distance 8' | |
# | |
# Spearman: | |
# single word: | |
# 0.2551566080977846 'shorts' | |
# 0.8061624649859944 'singlet' | |
# 0.8582995951417004 'shirt' | |
# 0.6649398395721925 'cotton' | |
# 0.826625386996904 'jacket' | |
# 0.9958720330237358 'hat' | |
# 0.8518255097202465 'technology' | |
# more descriptive: | |
# Result #23 in base engine is not ranked in the top 113 for new engine 'long trail shorts' | |
# Result #37 in base engine is not ranked in the top 57 for new engine 'moth eaten shirt' | |
# 0.02987197724039825 'short distance 8' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment