Created
April 9, 2012 14:57
-
-
Save ktibb/2344051 to your computer and use it in GitHub Desktop.
Not Working: feeding webtext into Markov code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
import urllib
from random import choice

import BeautifulSoup

#import context_free
import markov
# Download the page and make sure the connection is released even if the
# read fails.
response = urllib.urlopen('http://www.george-orwell.org/1984/0.html')
try:
    html = response.read()
finally:
    response.close()

soup = BeautifulSoup.BeautifulSoup(html)

# Walk the nested layout tables down to one specific node of the page.
# NOTE(review): this path is tied to the exact markup of the page and will
# break if the layout changes; soup.findAll(text=True) is the
# layout-independent way to collect every text node.
texts = (
    soup.html.body.table.contents[1]
    .td.contents[3]
    .tr.contents[3]
    .table.contents[5]
    .tr.td.table.tr.td.tr.td.table.contents[3]
    .td.contents[4]
)
def visible(element):
    """Return True when *element* is text a reader of the page would see.

    Nodes whose parent tag is ``style``, ``script``, ``[document]``,
    ``head`` or ``title`` are rejected, and so are nodes whose string form
    is an HTML comment.

    :param element: node exposing ``parent.name``; it is converted with
        ``str()`` for the comment check.
    :returns: ``False`` for hidden nodes and comments, ``True`` otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # DOTALL lets "." cross newlines, so comments that span several lines
    # are recognised as comments too.
    if re.match(r'<!--.*-->', str(element), re.DOTALL):
        return False
    return True
# Keep only the children of the selected node that pass visible().  A list
# comprehension always yields an indexable list.
visible_texts = [node for node in texts if visible(node)]
# NOTE(review): assumes the first visible child is a tag that holds the
# chapter text and supports getText() -- confirm against the page.
text = visible_texts[0].getText()

generator = markov.MarkovGenerator(n=5, max=500)

# Iterating a string directly yields one character at a time, so every
# "line" fed would be a single character.  Split the text into real lines
# before feeding the generator.
for line in text.splitlines():
    generator.feed(line.encode('utf-8'))

# Number of texts to generate and print.
for _ in range(1):
    print(generator.generate())
def feed(self, text):
    """Record the n-grams of one piece of text in the model.

    The text is split with ``self.tokenize``.  If it yields fewer than
    ``self.n`` tokens nothing is stored.  Otherwise the first ``self.n``
    tokens are appended to ``self.beginnings`` and, for every run of
    ``self.n`` consecutive tokens that has a successor, that successor is
    appended to the list kept under the run in ``self.ngrams``.

    :param text: the text to learn from; passed unchanged to
        ``self.tokenize``.
    :returns: ``None``.
    """
    tokens = self.tokenize(text)
    order = self.n
    # discard this line if it's too short
    if len(tokens) < order:
        return
    # store the first ngram of this line
    self.beginnings.append(tuple(tokens[:order]))
    for start in range(len(tokens) - order):
        gram = tuple(tokens[start:start + order])
        follower = tokens[start + order]  # the element after the gram
        # setdefault creates the list the first time a gram is seen and
        # reuses it afterwards
        self.ngrams.setdefault(gram, []).append(follower)
# generate a text from the information in self.ngrams
def generate(self):
    """Build one text by walking the stored n-grams at random.

    Starts from a randomly chosen entry of ``self.beginnings`` and appends
    a randomly chosen successor of the last ``self.n`` tokens, at most
    ``self.max`` times, stopping early when those tokens have no entry in
    ``self.ngrams``.

    :returns: whatever ``self.concatenate`` returns for the token list.
    :raises IndexError: if ``self.beginnings`` is empty.
    """
    from random import choice

    if not self.beginnings:
        raise IndexError(
            "no line beginnings recorded; feed() text with at least "
            "n tokens before calling generate()"
        )
    # get a random line beginning; convert to a list.
    current = choice(self.beginnings)
    output = list(current)
    for _ in range(self.max):
        if current not in self.ngrams:
            break
        output.append(choice(self.ngrams[current]))
        # the last N entries of the output are the lookup key for the
        # next iteration of the loop
        current = tuple(output[-self.n:])
    return self.concatenate(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment