Created
January 27, 2020 18:31
-
-
Save PandaWhoCodes/06131ae294f5c1785cc1816bf3d3686e to your computer and use it in GitHub Desktop.
Generate QA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import wordnet as wn | |
from textblob import TextBlob | |
# from greenteapress_scraper import get_text | |
from pytutor_scrapper import scrape | |
import re | |
# import wikipedia | |
class Article:
    """Turn the text of one article into fill-in-the-blank trivia questions.

    ``data`` must provide a ``"title"`` and a ``"text"`` entry. The text is
    wrapped in a ``TextBlob`` so it can be split into sentences and
    part-of-speech tagged.
    """

    def __init__(self, data):
        """Store the article title and text and build the blob used later."""
        self.title = data["title"]
        self.page = data["text"]
        self.summary = TextBlob(self.page)

    def generate_trivia_sentences(self):
        """Return a list of trivia dicts, one per usable sentence.

        The first sentence is always skipped. Sentences for which
        ``evaluate_sentence`` returns nothing are left out.
        """
        # Skip the first sentence - it's never a good one. Slicing (rather
        # than ``del sentences[0]``) leaves the blob's own list untouched, so
        # repeated calls see the same sentences, and an article with no
        # sentences yields an empty result instead of raising IndexError.
        trivia_sentences = []
        for sentence in self.summary.sentences[1:]:
            trivia = self.evaluate_sentence(sentence)
            if trivia:
                trivia_sentences.append(trivia)
        return trivia_sentences

    def get_similar_words(self, word):
        """Return up to 8 WordNet siblings of *word* to use as distractors.

        Siblings are the hyponyms of the first hypernym of the first noun
        synset of *word*. An empty list is returned when WordNet has no noun
        synset for the word or that synset has no hypernym.
        """
        # In the absence of a better method, take the first synset
        synsets = wn.synsets(word, pos='n')
        if not synsets:
            return []

        # Get the hypernym for this synset (again, take the first). Some
        # synsets have none, so guard before indexing.
        hypernyms = synsets[0].hypernyms()
        if not hypernyms:
            return []

        # Take the name of the first lemma for the first 8 hyponyms
        similar_words = []
        for hyponym in hypernyms[0].hyponyms():
            similar_word = hyponym.lemmas()[0].name().replace('_', ' ')
            if similar_word != word:
                similar_words.append(similar_word)
            if len(similar_words) == 8:
                break
        return similar_words

    def evaluate_sentence(self, sentence):
        """Build a trivia dict from *sentence*, or return ``None``.

        *sentence* must expose ``tags`` (``(word, tag)`` pairs), ``words`` and
        ``noun_phrases``, and ``str(sentence)`` must give its text.

        The returned dict has the keys ``title``, ``url``, ``answer``,
        ``similar_words`` and ``question``; ``question`` is the sentence text
        with the first occurrence of the answer replaced by blanks.
        ``None`` is returned when the sentence has no tags, starts with an
        adverb, has fewer than six words, or contains no suitable noun.
        """
        tags = sentence.tags
        if not tags or tags[0][1] == 'RB' or len(sentence.words) < 6:
            # Untagged, starts with an adverb, or fewer than six words long:
            # probably won't be a good fit
            return None

        replace_nouns = []
        for word, tag in tags:
            # For now, only blank out non-proper nouns that don't appear in
            # the article title
            if tag == 'NN' and word not in self.title:
                # Is it in a noun phrase? If so, blank out the last two words
                # in that phrase
                for phrase in sentence.noun_phrases:
                    if phrase.startswith('\''):
                        # If it starts with an apostrophe, stop looking
                        # (this is a weird error that should probably
                        # be handled elsewhere)
                        break
                    if word in phrase:
                        replace_nouns.extend(phrase.split()[-2:])
                        break

                # If we couldn't find the word in any phrases,
                # replace it on its own
                if not replace_nouns:
                    replace_nouns.append(word)
                # Only the first qualifying noun is ever considered
                break

        if not replace_nouns:
            # Return none if we found no words to replace
            return None

        answer = ' '.join(replace_nouns)
        trivia = {
            'title': self.title,
            'url': "www.google.com",
            'answer': answer,
        }

        if len(replace_nouns) == 1:
            # If we're only replacing one word, use WordNet to find similar words
            trivia['similar_words'] = self.get_similar_words(replace_nouns[0])
        else:
            # If we're replacing a phrase, don't bother - it's too unlikely
            # to make sense
            trivia['similar_words'] = []

        # Blank out our replace words (only the first occurrence in the
        # sentence), one blank per replaced word
        blanks_phrase = ' '.join(['__________'] * len(replace_nouns))
        expression = re.compile(re.escape(answer), re.IGNORECASE)
        trivia['question'] = expression.sub(blanks_phrase, str(sentence), count=1)
        return trivia
# Driver: scrape one article, build an Article from the scraped data and
# print every generated question followed by its answer.
scraped = scrape()
trivia_source = Article(scraped)
for item in trivia_source.generate_trivia_sentences():
    question_text = item["question"]
    answer_text = item["answer"]
    print("QUESTION: ", question_text)
    print("\nANSWER: ", answer_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment