Created
February 15, 2024 20:35
-
-
Save gd3kr/c4c0687a5f7e91b1a84bcacea6500011 to your computer and use it in GitHub Desktop.
compute embeddings for tweets in tweets.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
a simple script that reads tweets inside a json file, uses openai to compute embeddings and creates two files, metadata.tsv and output.tsv, which cam be used to visualise the tweets and their embeddings in TensorFlow Projector (https://projector.tensorflow.org/) | |
""" | |
# obtain tweets.json from https://gist.github.com/gd3kr/948296cf675469f5028911f8eb276dbc | |
import pandas as pd | |
import json | |
from openai import OpenAI | |
import os

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to live in the source file. The inline placeholder is kept as a fallback for
# anyone who pastes their key directly into this line; an unset or empty
# environment variable falls through to it.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or "<INSERT OPENAI API KEY HERE>"
)
import numpy as np | |
def sanitize_for_json(text):
    """Return *text* serialised as a JSON document string.

    A plain string comes back wrapped in double quotes, with quotes and
    control characters (newlines, tabs, ...) escaped. Because ``json.dumps``
    is called with its defaults, non-ASCII characters are escaped as well,
    so the result is pure ASCII. Any other JSON-serialisable value is
    turned into its JSON text.
    """
    encoded = json.dumps(text)
    return encoded
# Read the tweets from tweets.json. The file is decoded explicitly as UTF-8
# rather than with the platform's default encoding, which on some systems
# would garble or reject emoji and other non-ASCII characters.
with open('tweets.json', 'r', encoding='utf-8') as tweets_file:
    tweets = json.load(tweets_file)
# Accumulators filled chunk by chunk further down: the embedding vectors and
# the matching metadata labels.
embeddings, metadata = [], []

# Replace every tweet, in place, with its JSON-encoded form.
for position, raw_tweet in enumerate(tweets):
    tweets[position] = sanitize_for_json(raw_tweet)
# Number of tweets sent to the embeddings endpoint per request.
CHUNK_SIZE = 500

# Embed the tweets chunk by chunk.
for start in range(0, len(tweets), CHUNK_SIZE):
    chunk = tweets[start:start + CHUNK_SIZE]
    try:
        response = client.embeddings.create(
            input=chunk,
            model="text-embedding-3-small",
        )
        chunk_embeddings = np.array([item.embedding for item in response.data])
        embeddings.extend(chunk_embeddings)
        print(chunk_embeddings.shape)  # (n, 1536)
        # Collect the metadata labels for this chunk (written to disk later).
        # Newlines are flattened to spaces so each label stays on one TSV row.
        chunk_no_newlines = [tweet.replace('\n', ' ') for tweet in chunk]
        metadata.extend(chunk_no_newlines)
    except Exception as e:
        # Best effort: report the failed chunk and carry on with the next one.
        # Both lists are only extended inside the try, so a failed API call
        # adds nothing to either and they stay aligned.
        print(f"Error occurred while generating embeddings: {e}")
# Convert the list of all embeddings into a data frame and save it as a TSV
# file without an index column or header row.
embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv('output.tsv', sep='\t', index=False, header=False)

# Convert the list of all metadata labels into a data frame and save it the
# same way, so the rows of metadata.tsv line up with the rows of output.tsv.
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv('metadata.tsv', sep='\t', index=False, header=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment