Skip to content

Instantly share code, notes, and snippets.

@tondrej
Created June 23, 2018 09:55
Show Gist options
  • Save tondrej/7a3525971b6439c603a18b1e70fa00b7 to your computer and use it in GitHub Desktop.
Save tondrej/7a3525971b6439c603a18b1e70fa00b7 to your computer and use it in GitHub Desktop.
SQL cosine similarity, "king - man + woman = queen"
-- king - man + woman
with q (
id,
w
) as (
select w.id, +1.0 from fasttext.word as w where (w.word = 'king')
union
select w.id, -1.0 from fasttext.word as w where (w.word = 'man')
union
select w.id, +1.0 from fasttext.word as w where (w.word = 'woman')
),
qv (
i,
value
) as (
select
v.i,
sum(v.value * q.w) as value
from q
join fasttext.vector as v on
(v.modelId = 1) and
(v.wordId = q.id) and
(v.entryType = 0)
group by
v.i
),
tt (
wordId,
sim
) as (
select top 5
v2.wordId,
sum(v1.value * v2.value) / (sqrt(sum(v1.value * v1.value)) * sqrt(sum(v2.value * v2.value))) as sim
from qv as v1
join fasttext.vector as v2 on
(v2.modelId = 1) and
(v2.wordId not in (select id from q)) and
(v2.i = v1.i) and
(v2.entryType = 0)
group by
v2.wordId
order by
2 desc
)
select
w.word,
tt.sim
from tt
join fasttext.word as w on
(w.id = tt.wordId)
order by
tt.sim desc;
/*------------------------
word sim
-------------------------------------------------- ----------------------
queen 0.779682014249644
regnant 0.755401848735527
consort 0.743375493476356
daughter 0.723103235814238
throne 0.721994349834325
(5 rows affected)
------------------------*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment