Created
January 17, 2022 22:09
-
-
Save datasciencemonkey/f6f18bcb67f9a4573ba96a36cca5ca87 to your computer and use it in GitHub Desktop.
compute document similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Compare sentences by cosine similarity of pooled document embeddings.

Stacks Flair forward/backward character-LM embeddings with BERT word
embeddings, pools them into one vector per sentence, and prints the cosine
similarity between a baseline sentence and two candidate utterances.
"""
from flair.data import Sentence
from flair.embeddings import (
    DocumentPoolEmbeddings,
    FlairEmbeddings,
    StackedEmbeddings,
    TransformerWordEmbeddings,
)
from sklearn.metrics.pairwise import cosine_similarity


def _embedding_row(sentence):
    """Return ``sentence.embedding`` as a 2-D NumPy array with a single row.

    scikit-learn expects array-likes of shape (n_samples, n_features).  The
    embedding is detached and moved to the CPU explicitly because implicit
    tensor-to-NumPy conversion fails for tensors that live on a GPU or that
    still require grad.
    """
    return sentence.embedding.detach().cpu().numpy().reshape(1, -1)


# init Flair embeddings
flair_forward_embedding = FlairEmbeddings("multi-forward")
flair_backward_embedding = FlairEmbeddings("multi-backward")
transformer_embedding = TransformerWordEmbeddings("bert-base-uncased")

# %%
baseline = Sentence("machine learning using the user history data")
utterance = Sentence("Run autoML job using the user's past data")
utterance2 = Sentence("Run sql query on user table")

# Concatenate the three word-level embeddings, then pool the word vectors
# into a single document vector per sentence.
stacked_embeddings = StackedEmbeddings(
    [flair_forward_embedding, flair_backward_embedding, transformer_embedding]
)
document_embeddings = DocumentPoolEmbeddings([stacked_embeddings])

# %%
# embed() stores the resulting vector on each Sentence in place.
document_embeddings.embed(utterance)
document_embeddings.embed(utterance2)
document_embeddings.embed(baseline)

# %%
# Bind and print the results so they are visible when the file is run as a
# plain script, not only when the cell is evaluated interactively.
baseline_vs_utterance = cosine_similarity(
    _embedding_row(baseline), _embedding_row(utterance)
)
print(baseline_vs_utterance)

# %%
baseline_vs_utterance2 = cosine_similarity(
    _embedding_row(baseline), _embedding_row(utterance2)
)
print(baseline_vs_utterance2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment