@monajalal · Created October 31, 2016 07:25
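# Compute the pairwise Word Mover's Distance (WMD) between comment files:
# train Word2Vec on the text8 corpus, strip each file down to lowercase
# alphabetic tokens (no stopwords, swear words, or words shorter than 3
# characters, verbs lemmatized), then write one CSV row per file pair:
# file1 number, file2 number, WMD.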
import gensim
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import logging
import re
import itertools
import glob
import csv
from nltk.stem.wordnet import WordNetLemmatizer
import os
import os.path
import time
start_time = time.time()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Stream sentences from the text8 dump and train Word2Vec with 16 worker threads
sentences = word2vec.Text8Corpus("/home/mona/mscoco/text8")
model = word2vec.Word2Vec(sentences, workers=16)
#model.init_sims(replace = True)
model_name = "text8_data"
model.save(model_name)
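# To reload the trained model in a later session (a sketch, assuming the
# save path above):
#   model = word2vec.Word2Vec.load("text8_data")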
#model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#model.init_sims(replace=True)
#model_name = "Google_News"
#model.save(model_name)
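# Note: load_word2vec_format as called above is the gensim 0.x/1.x API; in
# newer gensim the loader lives on gensim.models.KeyedVectors instead.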
stopwords = set(nltk.corpus.stopwords.words('english'))  # set for fast membership tests
path = "/home/mona/computer_vision/imgur/tiny_comments/*.txt"
files = glob.glob(path)
csv_file_complete = open("tiny_graph_text8.csv", "w", newline="")  # output: one row per file pair
stat_csv_file = open("tiny_stat.csv", "r")  # maps comment file names to file numbers
csv_reader = csv.reader(stat_csv_file)
lemmatizer = WordNetLemmatizer()
list_of_rows = []
# Load the swear-word blacklist, normalized the same way as the comment text
with open('swear_words_uniq.txt') as swear_words_file:
    swear_words = swear_words_file.read()
swear_words = re.sub("[^a-zA-Z]", ' ', swear_words).lower().split()
# Build the file-name -> file-number lookup once; tiny_stat.csv is read a single time
tail_to_numbers = {row[1]: row[0] for row in csv_reader}

# Compare every unordered pair of comment files
for file1, file2 in itertools.combinations(files, 2):
    with open(file1) as f1:
        f1_text = f1.read()
    f1_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f1_text, flags=re.MULTILINE)  # drop lines that are bare URLs
    f1_words = re.sub("[^a-zA-Z]", ' ', f1_text).lower().split()
    lemmatized_f1_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f1_words if w not in stopwords]
    cleaned_f1_words = [w for w in lemmatized_f1_words if w not in swear_words and len(w) > 2]
    with open(file2) as f2:
        f2_text = f2.read()
    f2_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f2_text, flags=re.MULTILINE)  # drop bare URLs, mirroring file1
    f2_words = re.sub("[^a-zA-Z]", ' ', f2_text).lower().split()
    lemmatized_f2_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f2_words if w not in stopwords]
    cleaned_f2_words = [w for w in lemmatized_f2_words if w not in swear_words and len(w) > 2]
    f1_head, f1_tail = os.path.split(file1)
    f2_head, f2_tail = os.path.split(file2)
    try:
        file1_file_number = tail_to_numbers[f1_tail]
        file2_file_number = tail_to_numbers[f2_tail]
    except KeyError as e:
        print(e)  # file name missing from tiny_stat.csv; skip this pair
        continue
    else:
        # Word Mover's Distance: the minimum cumulative distance the embedded
        # words of one document must travel to match the other's (Kusner et al., 2015)
        row_complete = [file1_file_number.strip(), file2_file_number.strip(),
                        model.wmdistance(cleaned_f1_words, cleaned_f2_words)]
        list_of_rows.append(row_complete)
        print(len(list_of_rows))  # progress: pairs processed so far
# Dump all pairwise distances to the output CSV
a_complete = csv.writer(csv_file_complete, delimiter=',')
for row in list_of_rows:
    print(row)
    a_complete.writerow(row)
csv_file_complete.close()
print("--- %s seconds ---" % (time.time() - start_time))