Created
May 26, 2020 20:38
-
-
Save Felflare/edbd285b56846450688cc0ca9f84a369 to your computer and use it in GitHub Desktop.
This snippet of code demonstrates a cross-language sentence-embedding system for similarity search and matching that beats LASER embeddings, based on [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/pdf/2004.09813.pdf) by Nils Reimers and Iryna Gurevych.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!git clone [email protected]:UKPLab/sentence-transformers.git\n", | |
"!cd sentence-transformers\n", | |
"!pip install ." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import scipy\n", | |
"import numpy as np\n", | |
"from sentence_transformers import models, SentenceTransformer\n", | |
"\n", | |
"model = SentenceTransformer('distiluse-base-multilingual-cased')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Corpus with example sentences, some sentences were swapped to French, Italian and German\n", | |
"corpus = ['Un homme mange de la nourriture.', #FR 'A man is eating food.',\n", | |
" 'A man is eating a piece of bread.',\n", | |
" 'Das Mädchen trägt ein Baby.', #DE 'The girl is carrying a baby.',\n", | |
" 'A man is riding a horse.',\n", | |
" 'An elderly man is enjoying dinner.',\n", | |
" 'Amis partageant du vin dans un restaurant.', #FR 'Friends sharing wine at a restaurant.',\n", | |
" 'A woman is playing violin.',\n", | |
" 'A child is learning to play a base guitar.',\n", | |
" 'Due uomini hanno spinto i carrelli attraverso i boschi.', #IT 'Two men pushed carts through the woods.',\n", | |
" 'A man is riding a white horse on an enclosed ground.',\n", | |
" 'Una scimmia suona la batteria.', #IT 'A monkey is playing drums.',\n", | |
" 'A cheetah is running behind its prey.']\n", | |
"\n", | |
"corpus_embeddings = model.encode(corpus)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Sample queries to find similar sentences to, some sentences were translated to Russian and German.\n", | |
"queries = ['A man is eating pasta.', \n", | |
" 'Кто-то в костюме гориллы играет на барабане', #RU 'Someone in a gorilla costume is playing a set of drums.', \n", | |
" 'Ein Gepard jagt Beute über ein Feld.'] #DE 'A cheetah chases prey on across a field.']\n", | |
"query_embeddings = model.encode(queries)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Calculate Cosine similarity of query against each sentence i\n", | |
"closest_n = 3\n", | |
"for query, query_embedding in zip(queries, query_embeddings):\n", | |
" distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n", | |
"\n", | |
" results = zip(range(len(distances)), distances)\n", | |
" results = sorted(results, key=lambda x: x[1])\n", | |
"\n", | |
" print(\"\\n======================\\n\")\n", | |
" print(\"Query:\", query)\n", | |
" print(\"\\nTop 3 most similar sentences in corpus:\")\n", | |
"\n", | |
" for idx, distance in results[0:closest_n]:\n", | |
" print(corpus[idx].strip(), \"(Score: %.4f)\" % (1-distance))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "transformers", | |
"language": "python", | |
"name": "transformers" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment