Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Created November 4, 2018 20:34
Show Gist options
  • Select an option

  • Save bowbowbow/765801647952949959746f71d269e105 to your computer and use it in GitHub Desktop.

Select an option

Save bowbowbow/765801647952949959746f71d269e105 to your computer and use it in GitHub Desktop.
import json
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
def get_train(end):
    """Build POS-tagged training sentences from the Korea Herald JSON dumps.

    Reads ./data/koreaherald_1517_0.json through ..._{end}.json. Each article
    body is sentence-tokenized, word-tokenized, lowercased, and POS-tagged;
    the whole article becomes one training "sentence" of "token/POS" strings
    (e.g. "president/NN"), the format the Word2Vec step below expects.

    Args:
        end: Index of the last JSON file to read (inclusive).

    Returns:
        list[list[str]]: one inner list of "token/POS" strings per article.
    """
    train = []
    for i in range(0, end + 1):
        with open('./data/koreaherald_1517_{}.json'.format(i)) as f:
            data = json.load(f)
        # Keys observed in the dump (note the leading space on most of them):
        # [' author', ' body', ' description', ' time', 'title', ' section']
        total = len(data[' body'])
        # Per-file counter: the original accumulated progress across files but
        # compared it against the current file's article count, so the printed
        # "x/y" could show x > y from the second file onward.
        progress = 0
        for article_id in data[' body']:
            progress += 1
            if progress % 10 == 0:
                print('progress: {}/{}'.format(progress, total))
            body = data[' body'][article_id]
            arr = []
            for sent in sent_tokenize(body):
                tokens = [token.lower() for token in word_tokenize(sent)]
                for word, pos in nltk.pos_tag(tokens):
                    arr.append('{}/{}'.format(word, pos))
            train.append(arr)
    return train
if __name__ == "__main__":
embedding_size = 200
train = get_train(7)
embedding_model = Word2Vec(train, size=embedding_size, window=5, min_count=5)
embedding_model.save("./embedding.bin")
word1 = "president/NN"
word2 = "moon/RBS"
print("similar words11 of {}:".format(word1))
print(embedding_model.most_similar(word1))
print("similar words of {}:".format(word2))
print(embedding_model.most_similar(word2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment