Created October 31, 2016 07:25
import gensim
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import logging
import re
import itertools
import glob
import csv
from nltk.stem.wordnet import WordNetLemmatizer
import os
import time

start_time = time.time()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train a word2vec model on the text8 corpus and save it for later reuse.
sentences = word2vec.Text8Corpus("/home/mona/mscoco/text8")
model = word2vec.Word2Vec(sentences, workers=16)
#model.init_sims(replace=True)
model_name = "text8_data"
model.save(model_name)

# Alternative: use the pretrained Google News vectors instead of text8.
#model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#model.init_sims(replace=True)
#model_name = "Google_News"
#model.save(model_name)
# Sets give O(1) membership tests in the filtering loops below.
stop_words = set(stopwords.words('english'))
path = "/home/mona/computer_vision/imgur/tiny_comments/*.txt"
files = glob.glob(path)
csv_file_complete = open("tiny_graph_text8.csv", "w", newline="")  # text mode with newline="" for the csv module
lemmatizer = WordNetLemmatizer()
list_of_rows = []

with open('swear_words_uniq.txt') as swear_words_file:
    swear_words = set(re.sub("[^a-zA-Z]", ' ', swear_words_file.read()).lower().split())

# Map each comment file name to its file number, built once from the stats CSV
# (instead of re-reading and rewinding the CSV on every file pair).
with open("tiny_stat.csv") as stat_csv_file:
    tail_to_numbers = {row[1]: row[0] for row in csv.reader(stat_csv_file)}
for file1, file2 in itertools.combinations(files, 2):
    with open(file1) as f1:
        f1_text = f1.read()
    # Strip lines that are bare URLs before tokenizing.
    f1_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f1_text, flags=re.MULTILINE)
    f1_words = re.sub("[^a-zA-Z]", ' ', f1_text).lower().split()
    lemmatized_f1_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f1_words if w not in stop_words]
    cleaned_f1_words = [w for w in lemmatized_f1_words if w not in swear_words and len(w) > 2]

    with open(file2) as f2:
        f2_text = f2.read()
    f2_words = re.sub("[^a-zA-Z]", ' ', f2_text).lower().split()
    lemmatized_f2_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f2_words if w not in stop_words]
    cleaned_f2_words = [w for w in lemmatized_f2_words if w not in swear_words and len(w) > 2]

    f1_head, f1_tail = os.path.split(file1)
    f2_head, f2_tail = os.path.split(file2)
    try:
        file1_file_number = tail_to_numbers[f1_tail]
        file2_file_number = tail_to_numbers[f2_tail]
    except KeyError as e:
        print(e)
        continue
    else:
        # Word Mover's Distance between the two cleaned comment documents.
        row_complete = [file1_file_number.strip(), file2_file_number.strip(),
                        model.wmdistance(cleaned_f1_words, cleaned_f2_words)]
        list_of_rows.append(row_complete)
        print(len(list_of_rows))
a_complete = csv.writer(csv_file_complete, delimiter=',')
for row in list_of_rows:
    print(row)
    a_complete.writerow(row)
csv_file_complete.close()
print("--- %s seconds ---" % (time.time() - start_time))