# Refik Anadol Studio - 2022

import re
import multiprocessing
from collections import defaultdict

import pandas as pd
import spacy
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

def read_file(path_to_file):
    # Read the whole corpus, flatten newlines, and split it into rough sentences on ".".
    with open(path_to_file, "r") as f:
        file = f.read().replace("\n", " ")
        file = file.strip().split(".")
    return file

def cleaning(doc):
    # Lemmatizes and removes stopwords.
    # doc needs to be a spaCy Doc object.
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word;
    # if a sentence is only one or two words long,
    # the benefit for the training is very small.
    if len(txt) > 2:
        return " ".join(txt)

def briefing(file):
    # Keep only letters and apostrophes, lowercase, then lemmatize with spaCy.
    brief_cleaning = (re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in file)
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
    df_clean = pd.DataFrame({"clean": txt})
    df_clean = df_clean.dropna().drop_duplicates()
    print("Brief cleaning done")
    return df_clean

def do_bigram(df_clean):
    # Detect frequent word pairs with gensim Phrases and merge them into single
    # tokens (e.g. "new" + "york" -> "new_york") before training.
    sent = [row.split() for row in df_clean["clean"]]
    phrases = Phrases(sent, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sent]
    return sentences
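
# A minimal illustration (not part of the original script): a Phraser can be
# indexed with any token list, and pairs that were frequent enough in the corpus
# come back joined with "_". The example tokens and output here are hypothetical.
def demo_bigram(bigram):
    tokens = ["buona", "sera", "signore"]
    print(bigram[tokens])  # e.g. ['buona_sera', 'signore'] if the pair was learned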

def show_freqs(sentences):
    # Print the ten most frequent tokens as a quick sanity check on the corpus.
    word_freq = defaultdict(int)
    for sent in sentences:
        for i in sent:
            word_freq[i] += 1
    # print(len(word_freq))
    print(sorted(word_freq, key=word_freq.get, reverse=True)[:10])

def train_model(sentences):
    cores = multiprocessing.cpu_count()  # Count the number of cores in a computer
    w2v_model = Word2Vec(
        min_count=2,
        window=2,
        vector_size=300,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=cores - 1,
    )
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(
        sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1
    )
    # L2-normalize the vectors in place; init_sims() lives on the KeyedVectors
    # object and is deprecated in gensim 4.x.
    w2v_model.wv.init_sims(replace=True)
    return w2v_model
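
# A minimal sketch (not in the original script) of querying the trained model
# with gensim's most_similar; the query word is an arbitrary example and must
# exist in the learned vocabulary.
def demo_similar(w2v_model, word="amore"):
    if word in w2v_model.wv:
        for neighbour, score in w2v_model.wv.most_similar(word, topn=5):
            print(f"{neighbour}\t{score:.3f}")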

def export_embeddings(w2v_model):
    # Collect every vocabulary word together with its trained vector.
    words = list(w2v_model.wv.index_to_key)
    words_wp = []
    embeddings_wp = []
    for word in words:
        embeddings_wp.append(w2v_model.wv[word])
        words_wp.append(word)
    print(len(words_wp))
    return words_wp, embeddings_wp

def create_dataset(words_wp, embeddings_wp):
    # Build one DataFrame with the embedding columns plus a trailing "words" column.
    # with open('metadata.csv', 'w', newline='') as f:
    #     output = csv.writer(f)
    #     output.writerow(words_wp)
    # with open('embeddings.csv', 'w') as f:
    #     write = csv.writer(f)
    #     write.writerow(range(len(embeddings_wp[0])))
    #     write.writerows(embeddings_wp)
    dataset = pd.DataFrame(embeddings_wp)
    dataset["words"] = pd.Series(words_wp)
    return dataset

def write_dataset(dataset, file_name):
    # Write the embeddings + words table without the pandas row index.
    dataset.to_csv(file_name, index=False)
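
# A minimal sketch of projecting the exported table to 2-D, assuming the
# third-party umap-learn package is installed. The function name, CSV path,
# and column handling are illustrative, not part of the original pipeline.
def project_with_umap(csv_path="./italian-umap-dataset.csv"):
    import umap  # pip install umap-learn

    df = pd.read_csv(csv_path)
    vectors = df.drop(columns=["words"]).values
    coords = umap.UMAP(n_components=2).fit_transform(vectors)
    return pd.DataFrame({"word": df["words"], "x": coords[:, 0], "y": coords[:, 1]})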

def main():
    path_to_file = "./italian.txt"
    file = read_file(path_to_file)
    df_clean = briefing(file)
    sentences = do_bigram(df_clean)
    show_freqs(sentences)
    w2v_model = train_model(sentences)
    words_wp, embeddings_wp = export_embeddings(w2v_model)
    dataset = create_dataset(words_wp, embeddings_wp)
    write_dataset(dataset, file_name="./italian-umap-dataset.csv")
    w2v_model.wv.save_word2vec_format("word2vec.model")
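
# A minimal sketch (not in the original script) showing how the file written by
# save_word2vec_format above could be loaded back for later reuse.
def reload_vectors(path="word2vec.model"):
    from gensim.models import KeyedVectors

    return KeyedVectors.load_word2vec_format(path)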

if __name__ == "__main__":
    main()