Created
March 7, 2012 23:35
-
-
Save ktibb/1997267 to your computer and use it in GitHub Desktop.
RWET Midterm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
import BeautifulSoup | |
import re | |
import nltk | |
from random import choice | |
#import context_free | |
# Shared script state:
#   part  - maps a part-of-speech tag to a "|"-separated string of words
#   words - scratch list, rebound later to the word list of one tag
#   final - collects one "TAG->word|word|..." summary string per tag
part = {}
words = []
final = []

# Download the page. The handle is closed explicitly because Python 2's
# urllib.urlopen result cannot be used in a `with` statement, and leaving it
# open leaks the underlying socket until garbage collection.
response = urllib.urlopen('http://www.oprah.com/relationships/What-Kind-of-Woman-Watches-Porn-Researchers-Find-Answers')
try:
    html = response.read()
finally:
    response.close()

soup = BeautifulSoup.BeautifulSoup(html)

# Only the first <div class="arial14"> is scanned (presumably the article
# body -- confirm against the page markup). `find` returns None when no such
# div exists.
texts = soup.find("div", {"class": "arial14"})
def visible(element):
    """Return True if *element* is text a reader would see on the page.

    An element is rejected when its parent tag is one of the non-rendered
    containers (style, script, the document root, head, title) or when its
    string form is an HTML comment.

    element -- a parsed node exposing ``parent.name`` and convertible with
               ``str()``.
    """
    hidden_parents = ('style', 'script', '[document]', 'head', 'title')
    if element.parent.name in hidden_parents:
        return False
    # re.DOTALL lets "." cross newlines, so comments that span several lines
    # are recognised as comments instead of slipping through as visible text.
    if re.match('<!--.*-->', str(element), re.DOTALL):
        return False
    return True
# Keep only the children of `texts` that `visible` accepts.
visible_texts = filter(visible, texts)

# Raw strings that carry no words and are skipped outright.
skipped_lines = ['\n', ' <br /> ', '\bu', '<i>', '</i>', '']

for tag in visible_texts:
    line = tag.string
    if line is None or line in skipped_lines:
        continue

    # No manual punctuation clean-up is done on the line: any token that
    # contains a non-word character is discarded in the loop below.
    tagged = nltk.pos_tag(nltk.word_tokenize(line))

    # Gather all the words of the same part of speech into `part`
    # (POS tag as key, "|"-separated words as value).
    for word, pos in tagged:
        if re.search(r'\W', word) or re.search(r"http", word):
            continue
        if pos not in part:
            part[pos] = word
        elif word not in part[pos].split("|"):
            # Compare against the individual words already stored for this
            # tag; comparing against the joined strings would almost never
            # match and would let duplicates pile up.
            part[pos] += "|" + word

# Record one "TAG->word|word|..." summary string per part of speech.
for tag_name in part:
    final.append(tag_name + "->" + part[tag_name])
    words = part[tag_name].split("|")
# Combine the words together in a Nostradamus poem: `nostradamus` lists the
# part-of-speech tag of every word slot, in order.
nostradamus = ['IN', 'CD','NNS','CC','CD','NNS', 'NNP', 'MD', 'VB', 'DT', 'NN', 'NNP', 'NNP', 'VBZ', 'MD', 'VB','IN','PRP$','NN', 'TO', 'VB', 'RP', 'DT', 'NNS', 'CC', 'RB', 'DT', 'NN', 'NN', 'MD', 'VB', 'DT', 'NN', 'CD', 'NNS', 'IN', 'DT', 'NN', 'MD', 'VB', 'NNP', 'MD', 'VB', 'CC', 'VBN', 'VB', 'IN', 'DT', 'JJ', 'NNS', 'IN', 'DT', 'NN', 'DT', 'JJ', 'NN', 'MD', 'VB', 'IN', 'CD', 'NNS', 'DT', 'NN', 'MD', 'VB', 'CD', 'NNS', 'TO', 'VB', 'DT', 'JJ', 'NN', 'MD', 'VB', 'DT', 'NNP', 'DT', 'JJ', 'NN', 'MD', 'VB', 'NN', 'IN', 'CD', 'NNS', 'CC', 'CD', 'NNS', 'NNP', 'MD', 'VB', 'DT', 'NN', 'NN', 'DT', 'NNS', 'MD', 'VB', 'IN', 'PRP$', 'NN', 'TO', 'VB', 'RP', 'DT', 'NNS', 'CC', 'RB']

# `part` maps a tag to a "|"-separated string of words carrying that tag.
poem_words = []
for part_of_speech in nostradamus:
    candidates = part.get(part_of_speech)
    if not candidates:
        # The scraped text produced no word for this tag; skip the slot
        # instead of aborting the whole poem with a KeyError.
        continue
    # Pick a random word that has the current part of speech.
    poem_words.append(choice(candidates.split("|")))

# Each word is preceded by one space, so a non-empty poem starts with a space.
poem = "".join(" " + word for word in poem_words)
print(poem)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment