Skip to content

Instantly share code, notes, and snippets.

@PandaWhoCodes
Created January 27, 2020 18:31
Show Gist options
  • Save PandaWhoCodes/06131ae294f5c1785cc1816bf3d3686e to your computer and use it in GitHub Desktop.
Save PandaWhoCodes/06131ae294f5c1785cc1816bf3d3686e to your computer and use it in GitHub Desktop.
Generate QA
from nltk.corpus import wordnet as wn
from textblob import TextBlob
# from greenteapress_scraper import get_text
from pytutor_scrapper import scrape
import re
# import wikipedia
class Article:
"""Retrieves and analyzes wikipedia articles"""
def __init__(self, data):
self.title = data["title"]
self.page = data["text"]
# print("Got the content")
self.summary = TextBlob(self.page)
def generate_trivia_sentences(self):
sentences = self.summary.sentences
# Remove the first sentence - it's never a good one
del sentences[0]
trivia_sentences = []
for sentence in sentences:
trivia = self.evaluate_sentence(sentence)
if trivia:
trivia_sentences.append(trivia)
return trivia_sentences
def get_similar_words(self, word):
# In the absence of a better method, take the first synset
synsets = wn.synsets(word, pos='n')
# If there aren't any synsets, return an empty list
if len(synsets) == 0:
return []
else:
synset = synsets[0]
# Get the hypernym for this synset (again, take the first)
hypernym = synset.hypernyms()[0]
# Get some hyponyms from this hypernym
hyponyms = hypernym.hyponyms()
# Take the name of the first lemma for the first 8 hyponyms
similar_words = []
for hyponym in hyponyms:
similar_word = hyponym.lemmas()[0].name().replace('_', ' ')
if similar_word != word:
similar_words.append(similar_word)
if len(similar_words) == 8:
break
return similar_words
def evaluate_sentence(self, sentence):
if sentence.tags[0][1] == 'RB' or len(sentence.words) < 6:
# This sentence starts with an adverb or is less than five words long
# and probably won't be a good fit
return None
tag_map = {word.lower(): tag for word, tag in sentence.tags}
replace_nouns = []
for word, tag in sentence.tags:
# For now, only blank out non-proper nouns that don't appear in the article title
if tag == 'NN' and word not in self.title:
# Is it in a noun phrase? If so, blank out the last two words in that phrase
for phrase in sentence.noun_phrases:
if phrase[0] == '\'':
# If it starts with an apostrophe, ignore it
# (this is a weird error that should probably
# be handled elsewhere)
break
if word in phrase:
# Blank out the last two words in this phrase
[replace_nouns.append(phrase_word) for phrase_word in phrase.split()[-2:]]
break
# If we couldn't find the word in any phrases,
# replace it on its own
if len(replace_nouns) == 0:
replace_nouns.append(word)
break
if len(replace_nouns) == 0:
# Return none if we found no words to replace
return None
trivia = {
'title': self.title,
'url': "www.google.com",
'answer': ' '.join(replace_nouns)
}
if len(replace_nouns) == 1:
# If we're only replacing one word, use WordNet to find similar words
trivia['similar_words'] = self.get_similar_words(replace_nouns[0])
else:
# If we're replacing a phrase, don't bother - it's too unlikely to make sense
trivia['similar_words'] = []
# Blank out our replace words (only the first occurrence of the word in the sentence)
replace_phrase = ' '.join(replace_nouns)
blanks_phrase = ('__________ ' * len(replace_nouns)).strip()
expression = re.compile(re.escape(replace_phrase), re.IGNORECASE)
sentence = expression.sub(blanks_phrase, str(sentence), count=1)
trivia['question'] = sentence
return trivia
article = scrape()
a = Article(article)
for questions in a.generate_trivia_sentences():
print("QUESTION: ",questions["question"])
print("\nANSWER: ",questions["answer"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment