import sys
import gensim
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import logging
import re
import itertools
import glob
from collections import defaultdict
import csv
from nltk.stem.wordnet import WordNetLemmatizer
import os
import os.path
import time
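# Download the NLTK corpora used below: stopwords for filtering and WordNet for lemmatization.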
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
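# Train a word2vec model on the text8 corpus; its word vectors are what
# model.wmdistance() later uses to compute Word Mover's Distance between captions.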
start_time = time.time()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus("text8")
model = word2vec.Word2Vec(sentences)
#model.init_sims(replace = True)
model_name = "text8_data"
model.save(model_name)
setup_time = time.time()
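# Alternative: load the pretrained GoogleNews vectors instead of training on text8.
# Word2Vec.load_word2vec_format is the older gensim API; newer gensim releases expose
# the equivalent loader as gensim.models.KeyedVectors.load_word2vec_format.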
#model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#model.init_sims(replace=True)
#model_name = "Google_News"
#model.save(model_name)
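# Inputs: sys.argv[1] is a CSV of graph edges (pairs of node IDs); all_stat.csv maps
# each node ID to a caption file name under all_captions/. One WMD value per edge is
# written to "graph_mscoco" + edge_file_name. The output file is opened in "wb" mode
# for the Python 2 csv module; on Python 3 use open(graph_file_name, "w", newline="").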
edge_file_name = sys.argv[1]
path = "all_captions/*.txt"
files = glob.glob(path)
graph_file_name = "graph_mscoco" + edge_file_name
csv_file_complete = open(graph_file_name, "wb")
stat_csv_file = open("all_stat.csv", "r")
csv_reader = csv.reader(stat_csv_file)
edge_file = open(edge_file_name, "r")
edge_reader = csv.reader(edge_file)
lemmatizer = WordNetLemmatizer()
list_of_rows = []
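# Load the swear-word list; these tokens are filtered out of the captions before
# computing distances.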
with open('swear_words_uniq.txt') as swear_words_file:
    swear_words = swear_words_file.read()
    swear_words = re.sub("[^a-zA-Z]", ' ', swear_words).lower().split()
    swear_words_file.close()
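# For each edge, look up the two caption files, strip URLs, lowercase, lemmatize verbs,
# drop stopwords/swear words/short tokens, and compute the Word Mover's Distance.
# Note: model.wmdistance may require the pyemd package in older gensim versions.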
#for file1, file2 in itertools.combinations(files, 2):
for edge_row in edge_reader:
    stat_csv_file.seek(0)
    for stat_row in csv_reader:
        if edge_row[0] == stat_row[0]:
            file1_last = stat_row[1]
        if edge_row[1] == stat_row[0]:
            file2_last = stat_row[1]
    file1 = "all_captions/" + file1_last
    file2 = "all_captions/" + file2_last
    with open(file1) as f1:
        f1_text = f1.read()
        f1_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f1_text, flags=re.MULTILINE)
        f1_words = re.sub("[^a-zA-Z]", ' ', f1_text).lower().split()
        lemmatized_f1_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f1_words if w not in stopwords]
        cleaned_f1_words = [w for w in lemmatized_f1_words if w not in swear_words and len(w) > 2]
        f1.close()
    with open(file2) as f2:
        f2_text = f2.read()
        f2_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f2_text, flags=re.MULTILINE)
        f2_words = re.sub("[^a-zA-Z]", ' ', f2_text).lower().split()
        lemmatized_f2_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f2_words if w not in stopwords]
        cleaned_f2_words = [w for w in lemmatized_f2_words if w not in swear_words and len(w) > 2]
        f2.close()
    row_complete = [edge_row[0], edge_row[1], model.wmdistance(cleaned_f1_words, cleaned_f2_words)]
    list_of_rows.append(row_complete)
    #print(row_complete)
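# Write one row per edge: node1, node2, WMD between the two cleaned captions.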
print(len(list_of_rows))
a_complete = csv.writer(csv_file_complete, delimiter=',')
for row in list_of_rows:
    print(row)
    a_complete.writerow(row)
csv_file_complete.close()
print("--- %s setup time ---" % (setup_time - start_time))
print("--- %s seconds ---" % (time.time() - start_time))