Last active
June 19, 2023 05:23
-
-
Save mlr/9d7c2d4cacfb854f1f69c370b48708b9 to your computer and use it in GitHub Desktop.
Example of creating text embeddings using Gensim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prerequisite:
# Download the word2vec-google-news-300 model from Hugging Face:
# https://huggingface.co/fse/word2vec-google-news-300/tree/main
# Download word2vec-google-news-300.model and word2vec-google-news-300.model.vectors.npy.
# Place them in the same folder as this file and run the script from that
# folder, because the model path used below is relative.
import string

from gensim.models import KeyedVectors
# Load the KeyedVectors model.
# The path is relative, so it is resolved against the current working
# directory at run time.
# NOTE(review): the prerequisites list a companion ".model.vectors.npy" file;
# presumably it must sit next to the ".model" file for loading to succeed.
model_path = "word2vec-google-news-300.model"
model = KeyedVectors.load(model_path)
def search_string_to_vector(search_string, model):
    """Return the average vector of the words in ``search_string``.

    The string is lower-cased and split on whitespace. Each token is first
    looked up in ``model`` exactly as written; if it is absent, the lookup is
    retried with leading and trailing ASCII punctuation removed, so a token
    such as "world!" falls back to "world". Tokens that are still not found
    are skipped.

    Args:
        search_string: Text to convert into a single vector.
        model: Object supporting ``word in model`` and ``model[word]``
            (this script passes a gensim ``KeyedVectors`` instance).

    Returns:
        The sum of the found vectors divided by their count, or ``None``
        when no token of the string is present in ``model``.
    """
    vectors = []
    for token in search_string.lower().split():
        # Prefer the exact token so anything that matched before still does.
        if token in model:
            vectors.append(model[token])
            continue
        # Retry without surrounding punctuation ("world!" -> "world").
        stripped = token.strip(string.punctuation)
        if stripped and stripped in model:
            vectors.append(model[stripped])

    if not vectors:
        return None
    return sum(vectors) / len(vectors)
# Example usage: convert a sample string to a vector and print the result.
search_string = "Hello World!"
print("Search string:", search_string)
search_vector = search_string_to_vector(search_string, model)
if search_vector is not None:
    print("Search vector:", search_vector)
    print("Vector length:", len(search_vector))
else:
    # search_string_to_vector returns None when no word was found in the model.
    print("No vector representation available for the search string.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment