Created July 30, 2023 16:53
Create embeddings from tweets, then semantically search them.
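Both scripts below rely on spaCy's large English pipeline, whose static word vectors serve as the tweet embeddings. As a minimal sketch of what those vectors look like (assuming the model was installed once with "python -m spacy download en_core_web_lg"):

import spacy

# One-time setup (assumed): python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

doc = nlp("create embeddings from tweets")
print(doc.vector.shape)  # (300,) -- the document vector is the mean of its token vectors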
import json
import os
import pickle
import zipfile

import spacy

# File paths
zip_path = r"pathtotwitterarchive.zip"  # Path to zipped twitter archive
extract_path = r"somepath\twitter_data"  # Path to extract twitter data
embeddings_path = r"somepath\tweet_embeddings.pkl"  # Path to save embeddings

# Load the large English model in spaCy
print("Loading spaCy model...")
nlp = spacy.load('en_core_web_lg')

# Extract the .zip file
print("Starting extraction of zip file...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Finished extraction of zip file.")

# Load the Twitter archive: tweets.js is JSON prefixed with a JavaScript assignment
print("Starting loading of Twitter archive...")
with open(os.path.join(extract_path, 'data', 'tweets.js'), 'r', encoding='utf-8') as f:
    data = f.read().replace('window.YTD.tweets.part0 = ', '')
raw_archive = json.loads(data)
print("Finished loading of Twitter archive.")

# Extract the actual tweet objects from the raw data
archive = [item['tweet'] for item in raw_archive]

# Generate embeddings
print("Starting generation of embeddings...")
tweets = []
for item in archive:
    # Extract text and URL
    text = item['full_text']
    url = f"https://twitter.com/i/web/status/{item['id_str']}"
    # Generate a vector using spaCy
    vector = nlp(text).vector
    # Append (text, url, vector) to the list of tweets
    tweets.append((text, url, vector))
print("Finished generation of embeddings.")

# Save embeddings to a file
print("Starting saving of embeddings to a file...")
with open(embeddings_path, 'wb') as f:
    pickle.dump(tweets, f)
print("Finished saving of embeddings to a file.")

print("Script completed.")
import pickle
from operator import itemgetter

import spacy
from colorama import Fore, Style
from sklearn.metrics.pairwise import cosine_similarity

# Define a minimum length for tweets
min_tweet_length = 30

# Load the large English model in spaCy
nlp = spacy.load('en_core_web_lg')

# Load the embeddings
embeddings_path = r"somepath\tweet_embeddings.pkl"  # Path to embeddings
with open(embeddings_path, 'rb') as f:
    tweets = pickle.load(f)

# Keep only tweets that are long enough and mention no other handles
tweets = [tweet for tweet in tweets
          if len(tweet[0]) >= min_tweet_length and tweet[0].count('@') == 0]

# Get user input
text = input("Enter the text for a new tweet: ")

# Calculate the embedding of the input text
input_vector = nlp(text).vector

# Calculate similarities with the existing tweets
similarities = [(tweet[0], tweet[1], cosine_similarity([input_vector], [tweet[2]]))
                for tweet in tweets]

# Sort by similarity, highest first
similarities.sort(key=itemgetter(2), reverse=True)

# Print the 20 most similar tweets
print("The 20 most similar tweets are:")
for i in range(20):
    print(f"{i+1}. {Fore.LIGHTBLUE_EX}{similarities[i][0]}{Style.RESET_ALL} (Similarity: {similarities[i][2][0][0]})")
    print(f"URL: {similarities[i][1]}\n\n")