Created
March 31, 2015 18:09
-
-
Save jpotts18/dd9c1faf98b193e7dc46 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import RegexpTokenizer | |
from nltk.collocations import * | |
from bs4 import BeautifulSoup | |
class HTMLDocumentParser(object):
    """Parse an HTML file and extract its text features (meta description,
    title, headers, paragraphs) for NLTK-based keyword analysis.

    On construction the file at *path* is read, parsed, and summarized
    into ``self.doc`` (see :meth:`process_document`).
    """

    def __init__(self, path):
        # Read raw bytes and decode explicitly so the same line works on
        # Python 2 and 3; the context manager closes the handle (the
        # original leaked the open file object).
        with open(path, 'rb') as handle:
            self.html = handle.read().decode('utf-8', errors='ignore')
        # Name the parser explicitly: without it BeautifulSoup picks
        # whichever parser happens to be installed, so results could
        # differ from machine to machine.
        self.soup = BeautifulSoup(self.html, 'html.parser')
        self.doc = self.process_document()

    def find_meta_description(self):
        """Return the lower-cased meta description text, or '' if absent."""
        desc = self.soup.findAll(attrs={"name": "description"})
        # Guard before indexing: pages with no meta description made the
        # original raise IndexError on desc[0] (its None-check came too
        # late to help). The original's .encode('utf-8') is dropped: on
        # Python 3 it produced bytes that leaked a "b'...'" repr into the
        # composite string.
        if desc and desc[0].get('content'):
            return desc[0]['content'].lower()
        return ''

    def find_title(self):
        """Return the lower-cased <title> text, or '' if missing."""
        title_tag = self.soup.find('title')
        # Guard: find() returns None when there is no <title>; the
        # original called get_text on it and crashed with AttributeError.
        if title_tag is not None:
            return title_tag.get_text(strip=True).lower()
        return ''

    def find_headers(self):
        """Return the text of every h1-h6 header, space-joined."""
        headers = []
        # One loop instead of six copy-pasted extend() calls; same
        # document order (all h1 tags, then all h2 tags, ...).
        for level in range(1, 7):
            headers.extend(self.soup.findAll('h%d' % level))
        return " ".join(h.get_text(strip=True) for h in headers)

    def find_paragraphs(self):
        """Return the lower-cased text of every <p> tag, space-joined."""
        paragraphs = self.soup.findAll('p')
        return " ".join(p.get_text(strip=True).lower() for p in paragraphs)

    def preprocess_words(self, string):
        """Tokenize *string* into lower-cased word tokens with English
        stopwords removed; return them as one space-joined string."""
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(string.lower())
        # Build the stopword set once: the original called
        # stopwords.words('english') for every single token, an
        # accidental O(tokens x stopwords) scan.
        stops = set(stopwords.words('english'))
        return " ".join(t for t in tokens if t not in stops)

    def bigram_collocations(self, freq_filter=3, n_best=10):
        """Print the *n_best* bigram collocations ranked by PMI, keeping
        only bigrams that occur at least *freq_filter* times."""
        tokens = nltk.word_tokenize(self.doc['composite'])
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(freq_filter)
        # print(x) with a single argument behaves identically on
        # Python 2 and 3.
        print(finder.nbest(bigram_measures.pmi, n_best))

    def trigram_collocations(self, raw_freq=2):
        """Print the top *raw_freq* trigram collocations by raw frequency."""
        tokens = nltk.word_tokenize(self.doc['composite'])
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(tokens)
        print(finder.nbest(trigram_measures.raw_freq, raw_freq))

    def plot_frequency(self, num=50, cumulative=False):
        """Plot the frequency distribution of the *num* most common
        tokens of the cleaned composite text (opens a matplotlib window)."""
        tokens = nltk.word_tokenize(self.doc['composite_cleaned'])
        freq_dist = nltk.FreqDist(tokens)
        freq_dist.plot(num, cumulative=cumulative)

    def process_document(self):
        """Extract all text features and return them as a dict.

        Keys: 'meta_description', 'title', 'headers', 'paragraphs',
        'composite' (all four concatenated), and 'composite_cleaned'
        (composite with stopwords removed).
        """
        doc = {}
        doc['meta_description'] = self.find_meta_description()
        doc['title'] = self.find_title()
        doc['headers'] = self.find_headers()
        doc['paragraphs'] = self.find_paragraphs()
        doc['composite'] = "%s %s %s %s" % (
            doc['meta_description'], doc['title'],
            doc['headers'], doc['paragraphs'])
        doc['composite_cleaned'] = self.preprocess_words(doc['composite'])
        return doc
# Parse one document and report its collocations and token frequencies.
# NOTE(review): the original first parsed terms.html and immediately
# overwrote the result without using it — a full dead parse of that
# file. Only the document actually analyzed is kept.
doc = HTMLDocumentParser('data/html/us.flukecal.com/about/fluke-calibration-brands.html')
doc.trigram_collocations()
doc.bigram_collocations()
doc.plot_frequency()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.