Last active
February 27, 2019 12:33
-
-
Save rostegg/f78badb940bb9f8a4bf7dc8092220955 to your computer and use it in GitHub Desktop.
Script for generating a glossary from text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
import json | |
import requests | |
import nltk | |
import random | |
import time | |
# Oxford Dictionaries API credentials: id and key from
# https://developer.oxforddictionaries.com/
# Both are sent as request headers by get_word_definition. They are empty
# here -- presumably they must be filled in before the API accepts requests.
app_id = ''
app_key = ''
# Entry-lookup URL template; %s is replaced with the word being looked up.
url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/en/%s'
# A word is kept only when its length is strictly greater than this value.
min_word_length = 7
# Number of "word - definition" lines main() aims to write to result.txt.
words_count = 50
# models - https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/ | |
def is_valid(pos):
    """Return True when a part-of-speech tag starts with VB, JJ or NN.

    Only the first two characters of the tag are compared, so every
    variant of those tags (for example ``VBD`` or ``NNS``) is accepted.
    """
    return pos[:2] in ('VB', 'JJ', 'NN')
def get_word_definition(word, timeout=10):
    """Fetch the first definition of *word* from the dictionary API.

    Args:
        word: The word to look up; it is substituted into the module-level
            ``url`` template.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The first definition of the first sense of the first entry of the
        first lexical entry of the first result in the JSON response.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (for example an unknown word or bad
            credentials).
        KeyError, IndexError: When the response JSON lacks the expected
            fields.
    """
    request_url = url % word
    response = requests.get(
        request_url,
        headers={'app_id': app_id, 'app_key': app_key},
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError later.
    response.raise_for_status()
    payload = response.json()
    first_sense = payload['results'][0]['lexicalEntries'][0]['entries'][0]['senses'][0]
    return first_sense['definitions'][0]
def clear_text(text):
    """Remove every character that is neither an ASCII letter nor whitespace.

    All whitespace (spaces, tabs, newlines) is preserved. Keeping only the
    space character would glue the last word of one line to the first word
    of the next, producing bogus tokens such as ``alphabeta``.

    Args:
        text: The raw input text.

    Returns:
        The text with digits, punctuation and other non-letter,
        non-whitespace characters deleted.
    """
    return re.sub(r'[^A-Za-z\s]+', '', text)
def main(argv):
    """Build a glossary from ``input.txt`` and write it to ``result.txt``.

    The text is cleaned, tokenized and POS-tagged with NLTK. Words accepted
    by ``is_valid`` and longer than ``min_word_length`` are lower-cased and
    de-duplicated. Definitions are then looked up until ``words_count``
    lines of the form ``word - definition`` have been written.

    Words whose lookup fails are reported on stderr and skipped. The process
    exits with status 1 when there are too few candidate words, or when the
    candidates run out before enough definitions were found.

    Args:
        argv: Command-line arguments; currently unused.
    """
    # On the very first run the NLTK data has to be fetched once,
    # e.g. by calling nltk.download().
    with open('input.txt') as f:
        text = f.read()
    text = clear_text(text)
    tokenized = nltk.word_tokenize(text)
    valid_words = [word for (word, pos) in nltk.pos_tag(tokenized)
                   if is_valid(pos)]
    candidates = {word.lower() for word in valid_words
                  if len(word) > min_word_length}
    print(('\tFinded %s words...') % (len(candidates)))
    if len(candidates) < words_count:
        print("\tInsufficient number of words..")
        # sys.exit is always available; the bare exit() helper comes from
        # the site module and may be missing.
        sys.exit(1)

    # Count down on a local so the module-level setting is left untouched.
    remaining = words_count
    with open('result.txt', 'w', encoding='utf-8') as f:
        # Stop when the candidates run out instead of crashing on an
        # empty-set pop().
        while remaining > 0 and candidates:
            word = candidates.pop()
            try:
                definition = get_word_definition(word)
            except (requests.RequestException, LookupError,
                    ValueError, TypeError) as err:
                # Best effort: skip words that cannot be resolved, but say
                # so, and let KeyboardInterrupt/SystemExit propagate.
                print('\tSkipping %r: %s' % (word, err), file=sys.stderr)
                continue
            result = '%s - %s' % (word, definition)
            f.write(result + '\n')
            print('\t' + result)
            remaining -= 1

    if remaining > 0:
        print('\tOnly %d of %d definitions could be retrieved.'
              % (words_count - remaining, words_count), file=sys.stderr)
        sys.exit(1)
# Run the glossary generator only when this file is executed as a script,
# passing the command-line arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment