Skip to content

Instantly share code, notes, and snippets.

@rostegg
Last active February 27, 2019 12:33
Show Gist options
  • Save rostegg/f78badb940bb9f8a4bf7dc8092220955 to your computer and use it in GitHub Desktop.
Save rostegg/f78badb940bb9f8a4bf7dc8092220955 to your computer and use it in GitHub Desktop.
Script for generating a glossary from text
import sys
import re
import json
import requests
import nltk
import random
import time
# id and key from https://developer.oxforddictionaries.com/
# Both are empty strings here; presumably they must be filled in with real
# credentials before running. get_word_definition sends them as the
# 'app_id' / 'app_key' request headers.
app_id = ''
app_key = ''
# Entries endpoint template; %s is replaced with the word being looked up.
# NOTE(review): this targets API v1 - confirm that version is still served.
url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/en/%s'
# main() keeps a word only when len(word) > min_word_length (strictly longer).
min_word_length = 7
# Number of glossary entries main() tries to write to result.txt.
words_count = 50
# POS tag reference: https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
def is_valid(pos):
    """Return True when the first two characters of *pos* are 'VB', 'JJ' or 'NN'."""
    prefix = pos[:2]
    return prefix in ('VB', 'JJ', 'NN')
def get_word_definition(word):
    """Fetch the first dictionary definition of *word*.

    Sends a GET request to the module-level ``url`` template, authenticated
    with the ``app_id`` / ``app_key`` headers, and returns the first
    definition of the first sense of the first entry in the JSON response.

    Args:
        word: The word to look up; substituted into the ``url`` template.

    Returns:
        The value at
        ``results[0].lexicalEntries[0].entries[0].senses[0].definitions[0]``
        in the response body.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (e.g. unknown word, bad credentials).
        KeyError, IndexError: If the response lacks the expected structure.
        ValueError: If the response body is not valid JSON.
    """
    request_url = url % word
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(
        request_url,
        headers={'app_id': app_id, 'app_key': app_key},
        timeout=10,
    )
    # Fail fast on 4xx/5xx instead of trying to parse an error page as an
    # entry and surfacing an unrelated KeyError / JSON error.
    r.raise_for_status()
    response = r.json()
    first_entry = response['results'][0]['lexicalEntries'][0]['entries'][0]
    return first_entry['senses'][0]['definitions'][0]
def clear_text(text):
    """Strip everything except ASCII letters and whitespace from *text*.

    Whitespace (including newlines and tabs) is preserved so that words on
    adjacent lines stay separated; removing line breaks would fuse the last
    word of one line with the first word of the next (``"foo\\nbar"`` ->
    ``"foobar"``).

    Args:
        text: The raw input string.

    Returns:
        *text* with every character that is neither ``A-Z``/``a-z`` nor
        whitespace removed.
    """
    return re.sub(r'[^A-Za-z\s]+', '', text)
def main(argv):
    """Build a glossary from ``input.txt`` and write it to ``result.txt``.

    Reads ``input.txt``, cleans it with ``clear_text``, tokenizes and
    POS-tags it with NLTK, and keeps the lower-cased words whose tag passes
    ``is_valid`` and whose length exceeds ``min_word_length``. It then looks
    up definitions until ``words_count`` entries have been written, each as
    ``"<word> - <definition>"`` on its own line. Words whose lookup fails
    are skipped.

    Args:
        argv: Command-line arguments; currently unused.

    Raises:
        SystemExit: With status 1 when fewer than ``words_count`` candidate
            words are found.
    """
    # First run only: call nltk.download() to fetch the NLTK data that
    # word_tokenize / pos_tag need.
    # nltk.download()
    with open('input.txt') as f:
        text = clear_text(f.read())

    tokenized = nltk.word_tokenize(text)
    valid_words = [
        word for word, pos in nltk.pos_tag(tokenized) if is_valid(pos)
    ]
    valid_words_set = {
        x.lower() for x in valid_words if len(x) > min_word_length
    }
    print(('\tFinded %s words...') % (len(valid_words_set)))
    if len(valid_words_set) < words_count:
        print("\tInsufficient number of words..")
        sys.exit(1)

    # Count down on a local copy so the module-level setting is not mutated.
    remaining = words_count
    with open('result.txt', 'w', encoding='utf-8') as f:
        # Also stop when the candidates run out: failed lookups consume words
        # without producing entries, so the set can empty before the quota is
        # met, and set.pop() on an empty set raises KeyError.
        while remaining > 0 and valid_words_set:
            word = valid_words_set.pop()
            try:
                definition = get_word_definition(word)
            except (requests.RequestException, KeyError, IndexError,
                    TypeError, ValueError) as err:
                # Best effort: skip words without a usable definition, but
                # say why so systematic failures (e.g. bad credentials) are
                # visible. KeyboardInterrupt is deliberately not caught.
                print('\tSkipped %s: %r' % (word, err), file=sys.stderr)
                continue
            result = '%s - %s' % (word, definition)
            print('\t' + result)
            f.write(result + '\n')
            remaining -= 1

    if remaining:
        print('\tRan out of words; %s definitions missing.' % remaining,
              file=sys.stderr)
# Script entry point: pass the command-line arguments (without the program
# name) to main() when the file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment