Skip to content

Instantly share code, notes, and snippets.

@Felflare
Created May 26, 2020 20:38
Show Gist options
  • Save Felflare/edbd285b56846450688cc0ca9f84a369 to your computer and use it in GitHub Desktop.
Save Felflare/edbd285b56846450688cc0ca9f84a369 to your computer and use it in GitHub Desktop.
This snippet of code demonstrates a cross-language sentence embedding system used for similarity search and matching, which beats LASER embeddings. It is based on [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/pdf/2004.09813.pdf) by Nils Reimers and Iryna Gurevych.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install sentence-transformers from source.\n",
"# Clone over HTTPS so that no SSH key is required.\n",
"!git clone https://github.com/UKPLab/sentence-transformers.git\n",
"# Each `!` command runs in its own subshell, so a `!cd` would not carry over to the next line;\n",
"# point pip at the checkout directly, and use %pip so the package is installed into this kernel's environment.\n",
"%pip install ./sentence-transformers"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import the submodule explicitly: a bare `import scipy` does not guarantee that\n",
"# `scipy.spatial.distance` (used below for cosine distances) is loaded.\n",
"import scipy.spatial.distance\n",
"import numpy as np\n",
"from sentence_transformers import models, SentenceTransformer\n",
"\n",
"# Load the multilingual sentence-embedding model by name.\n",
"model = SentenceTransformer('distiluse-base-multilingual-cased')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Example corpus; a few entries were swapped for French, Italian and German translations.\n",
"# The trailing comment on a translated entry gives its language tag and the English original.\n",
"corpus = [\n",
"    'Un homme mange de la nourriture.',  # FR: 'A man is eating food.'\n",
"    'A man is eating a piece of bread.',\n",
"    'Das Mädchen trägt ein Baby.',  # DE: 'The girl is carrying a baby.'\n",
"    'A man is riding a horse.',\n",
"    'An elderly man is enjoying dinner.',\n",
"    'Amis partageant du vin dans un restaurant.',  # FR: 'Friends sharing wine at a restaurant.'\n",
"    'A woman is playing violin.',\n",
"    'A child is learning to play a base guitar.',\n",
"    'Due uomini hanno spinto i carrelli attraverso i boschi.',  # IT: 'Two men pushed carts through the woods.'\n",
"    'A man is riding a white horse on an enclosed ground.',\n",
"    'Una scimmia suona la batteria.',  # IT: 'A monkey is playing drums.'\n",
"    'A cheetah is running behind its prey.',\n",
"]\n",
"\n",
"# Embed every corpus sentence with the model.\n",
"corpus_embeddings = model.encode(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Sample queries to match against the corpus; two of them were translated to Russian and German.\n",
"queries = [\n",
"    'A man is eating pasta.',\n",
"    'Кто-то в костюме гориллы играет на барабане',  # RU: 'Someone in a gorilla costume is playing a set of drums.'\n",
"    'Ein Gepard jagt Beute über ein Feld.',  # DE: 'A cheetah chases prey on across a field.'\n",
"]\n",
"\n",
"# Embed every query with the same model used for the corpus.\n",
"query_embeddings = model.encode(queries)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rank every corpus sentence by cosine similarity to each query and print the best matches.\n",
"closest_n = 3  # number of matches to display per query\n",
"\n",
"# One cdist call for all queries: row i holds the cosine distances of query i to every corpus sentence.\n",
"# Cosine distance is 1 - cosine similarity, so a smaller distance means a more similar sentence.\n",
"distance_matrix = scipy.spatial.distance.cdist(query_embeddings, corpus_embeddings, \"cosine\")\n",
"\n",
"for query, distances in zip(queries, distance_matrix):\n",
"    # Pair each corpus index with its distance and sort ascending (most similar first).\n",
"    results = sorted(enumerate(distances), key=lambda x: x[1])\n",
"\n",
"    print(\"\\n======================\\n\")\n",
"    print(\"Query:\", query)\n",
"    # Report the configured count rather than a hard-coded number.\n",
"    print(\"\\nTop %d most similar sentences in corpus:\" % closest_n)\n",
"\n",
"    for idx, distance in results[:closest_n]:\n",
"        # Convert the distance back into a similarity score for display.\n",
"        print(corpus[idx].strip(), \"(Score: %.4f)\" % (1 - distance))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "transformers",
"language": "python",
"name": "transformers"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment