Created
February 17, 2020 22:59
-
-
Save heronyang/3d481011132824c9bd736914b6a40481 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Loads a textfile, builds a Word2Vec model, and prints similarity of words. | |
""" | |
import urllib.request | |
import nltk | |
from nltk.tokenize import sent_tokenize, word_tokenize | |
from gensim.models import Word2Vec | |
# Source: Project Gutenberg's Alice's Adventures in Wonderland. | |
CORPUS_TEXT_URL = 'https://www.gutenberg.org/files/11/11.txt' | |
def main():
    """Train a Word2Vec model on the corpus and print word similarities.

    Downloads the tokenizer data, fetches and tokenizes the corpus, trains a
    25-dimensional model (sg=1, window of 5, keeping every word), and prints
    the similarity score of 'alice' with 'king' and with 'tree'.
    """
    setup()
    # Gets training data from corpus.
    data = build_data(get_corpus())
    # Trains the word2vec model. gensim 4.0 renamed the `size` keyword to
    # `vector_size`; try the new name first and fall back to the old one so
    # the script runs on either major version.
    params = dict(min_count=1, window=5, sg=1)
    try:
        model = Word2Vec(data, vector_size=25, **params)
    except TypeError:
        model = Word2Vec(data, size=25, **params)
    # Prints the similarity between 'alice' and two other words. The lookup
    # goes through the KeyedVectors (`model.wv`): the model-level
    # `similarity` method was deprecated and then removed in gensim 4.0.
    word_vectors = model.wv
    print(word_vectors.similarity('alice', 'king'))
    print(word_vectors.similarity('alice', 'tree'))
def setup():
    """Download the NLTK sentence-tokenizer data needed for tokenization.

    Both 'punkt' and 'punkt_tab' are fetched: older NLTK releases load the
    former, newer releases load the latter.
    """
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)
def get_corpus():
    """Fetch the corpus text and return it as one whitespace-normalized string.

    Returns:
        The document at CORPUS_TEXT_URL decoded as UTF-8, with every run of
        whitespace (including newlines) collapsed to a single space.
    """
    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding fails.
    with urllib.request.urlopen(CORPUS_TEXT_URL) as response:
        text = response.read().decode("utf-8")
    # str.split() with no argument already splits on newlines and any other
    # whitespace, so no separate newline replacement is needed.
    return ' '.join(text.split())
def build_data(corpus):
    """Turn raw corpus text into lowercased, tokenized sentences.

    Args:
        corpus: The full corpus text as a single string.

    Returns:
        A list with one entry per sentence, each entry being the list of that
        sentence's tokens converted to lowercase.
    """
    sentences = []
    for sentence in sent_tokenize(corpus):
        tokens = word_tokenize(sentence)
        sentences.append([token.lower() for token in tokens])
    return sentences
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment