# Refik Anadol Studio - 2022
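#
# Pipeline (as implemented below): read a plain-text corpus, clean and
# lemmatize it with spaCy, detect frequent bigrams with Gensim Phrases,
# train a Word2Vec model, and export the vocabulary and embeddings to a
# CSV (presumably for a downstream UMAP projection, per the output file name).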
import re
import spacy
import pandas as pd
import multiprocessing
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser


def read_file(path_to_file):
    with open(path_to_file, "r") as f:
        file = f.read().replace("\n", " ")
    file = file.strip().split(".")
    return file


def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word;
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return " ".join(txt)


def briefing(file):
    brief_cleaning = (re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in file)
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
    df_clean = pd.DataFrame({"clean": txt})
    df_clean = df_clean.dropna().drop_duplicates()
    print("Brief cleaning done")
    return df_clean


def do_bigram(df_clean):
    sent = [row.split() for row in df_clean["clean"]]
    phrases = Phrases(sent, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sent]
    return sentences


def show_freqs(sentences):
    word_freq = defaultdict(int)
    for sent in sentences:
        for i in sent:
            word_freq[i] += 1
    # print(len(word_freq))
    print(sorted(word_freq, key=word_freq.get, reverse=True)[:10])


def train_model(sentences):
    cores = multiprocessing.cpu_count()  # Count the number of cores in a computer
    w2v_model = Word2Vec(
        min_count=2,
        window=2,
        vector_size=300,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=cores - 1,
    )
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(
        sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1
    )
    # Normalize the vectors to unit length; init_sims is deprecated in Gensim 4.x
    w2v_model.init_sims(replace=True)
    return w2v_model


def export_embeddings(w2v_model):
    words = list(w2v_model.wv.index_to_key)
    words_wp = []
    embeddings_wp = []
    for word in words:
        embeddings_wp.append(w2v_model.wv[word])
        words_wp.append(word)
    print(len(words_wp))
    return words_wp, embeddings_wp


def create_dataset(words_wp, embeddings_wp):
    # with open('metadata.csv', 'w', newline='') as f:
    #     output = csv.writer(f)
    #     output.writerow(words_wp)
    # with open('embeddings.csv', 'w') as f:
    #     write = csv.writer(f)
    #     write.writerow(range(len(embeddings_wp[0])))
    #     write.writerows(embeddings_wp)
    dataset = pd.DataFrame(embeddings_wp)
    dataset["words"] = pd.Series(words_wp)
    return dataset


def write_dataset(dataset, file_name):
    # Write only the embedding columns plus the "words" column;
    # dropping the pandas row index keeps the header aligned with the data rows
    dataset.to_csv(file_name, index=False)


def main():
    path_to_file = "./italian.txt"
    file = read_file(path_to_file)
    df_clean = briefing(file)
    sentences = do_bigram(df_clean)
    show_freqs(sentences)
    w2v_model = train_model(sentences)
    words_wp, embeddings_wp = export_embeddings(w2v_model)
    dataset = create_dataset(words_wp, embeddings_wp)
    write_dataset(dataset, file_name="./italian-umap-dataset.csv")
    w2v_model.wv.save_word2vec_format("word2vec.model")


if __name__ == "__main__":
    main()