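# Compute Word Mover's Distance (WMD) between pairs of MS COCO caption files,
# using a word2vec model trained on the text8 corpus.
#
# Usage (inferred from the code; exact input formats are assumptions):
#   python this_script.py <edge_file.csv>
# where <edge_file.csv> lists pairs of node ids, all_stat.csv maps node ids to
# caption file names, captions live in all_captions/*.txt, and the result is
# written to graph_mscoco<edge_file.csv> as "node1,node2,wmd" rows.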
import sys
import csv
import glob
import itertools  # only needed by the commented-out all-pairs loop below
import logging
import re
import time

import gensim
import nltk
from gensim.models import word2vec
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK data the preprocessing steps rely on.
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(nltk.corpus.stopwords.words('english'))
start_time = time.time()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train word2vec on the text8 corpus and save the model for later reuse.
sentences = word2vec.Text8Corpus("text8")
model = word2vec.Word2Vec(sentences)
#model.init_sims(replace=True)
model_name = "text8_data"
model.save(model_name)
setup_time = time.time()
# Alternative: load the pretrained Google News vectors instead of training on
# text8 (in gensim >= 1.0 this lives at gensim.models.KeyedVectors.load_word2vec_format).
#model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#model.init_sims(replace=True)
#model_name = "Google_News"
#model.save(model_name)
edge_file_name = sys.argv[1]
path = "all_captions/*.txt"
files = glob.glob(path)
graph_file_name = "graph_mscoco" + edge_file_name
# Text mode with newline="" is what csv.writer expects on Python 3.
csv_file_complete = open(graph_file_name, "w", newline="")
stat_csv_file = open("all_stat.csv", "r")
csv_reader = csv.reader(stat_csv_file)
edge_file = open(edge_file_name, "r")
edge_reader = csv.reader(edge_file)
lemmatizer = WordNetLemmatizer()
list_of_rows = []

# Load the swear-word blacklist; a set makes membership tests cheap.
with open('swear_words_uniq.txt') as swear_words_file:
    swear_words = swear_words_file.read()
swear_words = set(re.sub("[^a-zA-Z]", ' ', swear_words).lower().split())
#for file1, file2 in itertools.combinations(files, 2):
for edge_row in edge_reader:
    # Look up the caption file name for each endpoint of this edge
    # (assumes every edge id appears in all_stat.csv).
    stat_csv_file.seek(0)
    for stat_row in csv_reader:
        if edge_row[0] == stat_row[0]:
            file1_last = stat_row[1]
        if edge_row[1] == stat_row[0]:
            file2_last = stat_row[1]
    file1 = "all_captions/" + file1_last
    file2 = "all_captions/" + file2_last
    # Strip URLs, keep letters only, lowercase, drop stopwords, lemmatize as
    # verbs, then filter out swear words and very short tokens.
    with open(file1) as f1:
        f1_text = f1.read()
    f1_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f1_text, flags=re.MULTILINE)
    f1_words = re.sub("[^a-zA-Z]", ' ', f1_text).lower().split()
    lemmatized_f1_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f1_words if w not in stopwords]
    cleaned_f1_words = [w for w in lemmatized_f1_words if w not in swear_words and len(w) > 2]
    with open(file2) as f2:
        f2_text = f2.read()
    f2_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', f2_text, flags=re.MULTILINE)
    f2_words = re.sub("[^a-zA-Z]", ' ', f2_text).lower().split()
    lemmatized_f2_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f2_words if w not in stopwords]
    cleaned_f2_words = [w for w in lemmatized_f2_words if w not in swear_words and len(w) > 2]
    # Word Mover's Distance between the two cleaned caption documents
    # (in gensim 4.x this is model.wv.wmdistance).
    row_complete = [edge_row[0], edge_row[1], model.wmdistance(cleaned_f1_words, cleaned_f2_words)]
    list_of_rows.append(row_complete)
    #print(row_complete)
print(len(list_of_rows))

# Write one CSV row per edge: node id 1, node id 2, WMD.
a_complete = csv.writer(csv_file_complete, delimiter=',')
for row in list_of_rows:
    print(row)
    a_complete.writerow(row)
csv_file_complete.close()
edge_file.close()
stat_csv_file.close()
print("--- %s setup time ---" % (setup_time - start_time))
print("--- %s seconds ---" % (time.time() - start_time))