ktibb · April 9, 2012 14:57
diff --git a/Web_to_Markov.py b/Web_to_Markov.py
 import urllib
 import BeautifulSoup
 import re
 from random import choice
 #import context_free
 import sys
 import markov


 # Download the page. The response is closed explicitly so the connection is
 # released even if the read fails.
 response = urllib.urlopen('http://www.george-orwell.org/1984/0.html')
 try:
     html = response.read()
 finally:
     response.close()
 soup = BeautifulSoup.BeautifulSoup(html)

 # Walk the page's nested layout tables down to a single node; presumably
 # this is the node that holds the chapter text (confirm against the page).
 # NOTE(review): the path is tied to the exact markup of the page and will
 # break if the site's layout changes.
 texts = soup.html.body.table.contents[1].td.contents[3].tr.contents[3].table.contents[5].tr.td.table.tr.td.tr.td.table.contents[3].td.contents[4]

 def visible(element):
     """Tell whether *element* should be kept as readable page text.

     Returns False when the element's parent tag is one of the names in
     ``hidden_parents``, or when the element's string form begins with an
     HTML comment that opens and closes on one line. Returns True otherwise.
     """
     hidden_parents = ('style', 'script', '[document]', 'head', 'title')
     if element.parent.name in hidden_parents:
         return False
     # '.' does not match a newline, so only single-line comments are caught.
     looks_like_comment = re.match('<!--.*-->', str(element)) is not None
     return not looks_like_comment

 # Keep only the children of the scraped node that pass the visible() filter.
 # A list comprehension is used so the result can be indexed.
 visible_texts = [node for node in texts if visible(node)]
 text = visible_texts[0].getText()

 generator = markov.MarkovGenerator(n=5, max=500)
 # Feed whole lines rather than iterating the string directly: looping over
 # a string yields one character at a time, and a feed() like the one shown
 # in this file drops any input shorter than n tokens, which left the model
 # empty and made generate() fail.
 # NOTE(review): if the scraped text contains no line breaks this feeds it
 # as one long line, which still trains the model.
 for line in text.splitlines():
     generator.feed(line.encode('utf-8'))

 # Emit one generated passage.
 print(generator.generate())

    
 def feed(self, text):
     """Add the n-grams of one line of text to the model.

     The line is split with ``self.tokenize``. Its first ``self.n`` tokens
     are appended to ``self.beginnings``, and every run of ``self.n``
     consecutive tokens is mapped in ``self.ngrams`` to the list of tokens
     seen directly after it.

     Lines with fewer than ``self.n`` tokens are ignored. Returns None.
     """
     tokens = self.tokenize(text)
     # discard this line if it's too short
     if len(tokens) < self.n:
         return

     # store the first ngram of this line
     self.beginnings.append(tuple(tokens[:self.n]))

     for i in range(len(tokens) - self.n):
         gram = tuple(tokens[i:i + self.n])
         following = tokens[i + self.n]  # the element after the gram

         # This bookkeeping must run once per gram, i.e. inside the loop;
         # outside it only the last gram of the line would be recorded.
         # If we've already seen this ngram, append; otherwise start a
         # new list for it.
         if gram in self.ngrams:
             self.ngrams[gram].append(following)
         else:
             self.ngrams[gram] = [following]
        
        
        
 # generate a text from the information in self.ngrams
 def generate(self):
     """Build one text from the n-grams stored in ``self.ngrams``.

     Starts from a randomly chosen entry of ``self.beginnings`` and keeps
     appending a randomly chosen follower of the last ``self.n`` tokens.
     Stops after ``self.max`` appended tokens, or earlier when the current
     n-gram has no recorded follower.

     Returns whatever ``self.concatenate`` produces for the token list.
     Raises IndexError (from ``random.choice``) if ``self.beginnings`` is
     empty.
     """
     from random import choice

     # get a random line beginning; convert to a list.
     current = choice(self.beginnings)
     output = list(current)

     for _ in range(self.max):
         if current not in self.ngrams:
             break
         output.append(choice(self.ngrams[current]))
         # the last N entries of the output are the lookup key for the
         # next iteration of the loop
         current = tuple(output[-self.n:])

     # Join only after the loop has finished; returning from inside the
     # loop would cut the text off after a single appended token.
     return self.concatenate(output)
	import urllib
	import BeautifulSoup
	import re
	from random import choice
	#import context_free
	import sys
	import markov


	# Download the page. The response is closed explicitly so the connection
	# is released even if the read fails.
	response = urllib.urlopen('http://www.george-orwell.org/1984/0.html')
	try:
		html = response.read()
	finally:
		response.close()
	soup = BeautifulSoup.BeautifulSoup(html)

	# Walk the page's nested layout tables down to a single node; presumably
	# this is the node that holds the chapter text (confirm against the page).
	# NOTE(review): the path is tied to the exact markup of the page and will
	# break if the site's layout changes.
	texts = soup.html.body.table.contents[1].td.contents[3].tr.contents[3].table.contents[5].tr.td.table.tr.td.tr.td.table.contents[3].td.contents[4]

	def visible(element):
		"""Tell whether *element* should be kept as readable page text.

		Returns False when the element's parent tag is one of 'style',
		'script', '[document]', 'head' or 'title', or when the element's
		string form begins with an HTML comment that opens and closes on one
		line. Returns True otherwise.
		"""
		# The body's indentation had been stripped, leaving the function
		# unparseable; the nesting below follows the if/elif/return order.
		if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
			return False
		elif re.match('<!--.*-->', str(element)):
			return False
		return True

	# Keep only the children of the scraped node that pass the visible()
	# filter. A list comprehension is used so the result can be indexed.
	visible_texts = [node for node in texts if visible(node)]
	text = visible_texts[0].getText()

	generator = markov.MarkovGenerator(n=5, max=500)
	# Feed whole lines rather than iterating the string directly: looping
	# over a string yields one character at a time, and a feed() like the one
	# shown in this file drops any input shorter than n tokens, which left
	# the model empty and made generate() fail.
	# NOTE(review): if the scraped text contains no line breaks this feeds it
	# as one long line, which still trains the model.
	for line in text.splitlines():
		generator.feed(line.encode('utf-8'))

	# Emit one generated passage.
	print(generator.generate())


	def feed(self, text):
		"""Add the n-grams of one line of text to the model.

		The line is split with ``self.tokenize``. Its first ``self.n`` tokens
		are appended to ``self.beginnings``, and every run of ``self.n``
		consecutive tokens is mapped in ``self.ngrams`` to the list of tokens
		seen directly after it.

		Lines with fewer than ``self.n`` tokens are ignored. Returns None.
		"""
		tokens = self.tokenize(text)
		# discard this line if it's too short
		if len(tokens) < self.n:
			return

		# store the first ngram of this line
		beginning = tuple(tokens[:self.n])
		self.beginnings.append(beginning)

		for i in range(len(tokens) - self.n):
			gram = tuple(tokens[i:i + self.n])
			follower = tokens[i + self.n]  # get the element after the gram

			# Runs once per gram, so it belongs inside the loop.
			# If we've already seen this ngram, append; otherwise, set the
			# value for this key as a new list.
			if gram in self.ngrams:
				self.ngrams[gram].append(follower)
			else:
				self.ngrams[gram] = [follower]



	# generate a text from the information in self.ngrams
	def generate(self):
		"""Build one text from the n-grams stored in ``self.ngrams``.

		Starts from a randomly chosen entry of ``self.beginnings`` and keeps
		appending a randomly chosen follower of the last ``self.n`` tokens.
		Stops after ``self.max`` appended tokens, or earlier when the current
		n-gram has no recorded follower.

		Returns whatever ``self.concatenate`` produces for the token list.
		Raises IndexError (from ``random.choice``) if ``self.beginnings`` is
		empty.
		"""
		from random import choice

		# get a random line beginning; convert to a list.
		current = choice(self.beginnings)
		output = list(current)

		for i in range(self.max):
			if current in self.ngrams:
				possible_next = self.ngrams[current]
				picked = choice(possible_next)
				output.append(picked)
				# get the last N entries of the output; we'll use this to
				# look up an ngram in the next iteration of the loop
				current = tuple(output[-self.n:])
			else:
				break

		# Joined after the loop so the whole chain is returned.
		output_str = self.concatenate(output)
		return output_str