import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from bs4 import BeautifulSoup
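
# Setup note (assumption: the corpora are not yet installed locally): the NLTK
# data used below must be downloaded once before running, e.g.
#
#   import nltk
#   nltk.download('stopwords')  # used by preprocess_words
#   nltk.download('punkt')      # used by nltk.word_tokenize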

class HTMLDocumentParser:
    def __init__(self, path):
        # Read the page as UTF-8, ignoring undecodable bytes.
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            self.html = f.read()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        self.doc = self.process_document()

    def find_meta_description(self):
        # Guard against pages without a <meta name="description"> tag.
        desc = self.soup.find('meta', attrs={'name': 'description'})
        if desc is not None and desc.get('content'):
            return desc['content'].lower()
        return ''

    def find_title(self):
        # Guard against pages without a <title> tag.
        title = self.soup.find('title')
        if title is not None:
            return title.get_text(strip=True).lower()
        return ''

    def find_headers(self):
        # Collect the text of all heading tags, h1 through h6.
        headers = []
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            headers.extend(self.soup.find_all(tag))
        return " ".join(h.get_text(strip=True) for h in headers)

    def find_paragraphs(self):
        paragraphs = self.soup.find_all('p')
        return " ".join(p.get_text(strip=True).lower() for p in paragraphs)

    def preprocess_words(self, string):
        # Tokenize on word characters and drop English stopwords.
        # Building the stopword set once avoids re-reading the corpus per token.
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(string.lower())
        stop = set(stopwords.words('english'))
        return " ".join(t for t in tokens if t not in stop)

    def bigram_collocations(self, freq_filter=3, n_best=10):
        # Rank bigrams by pointwise mutual information (PMI).
        tokens = nltk.word_tokenize(self.doc['composite'])
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        # Ignore bigrams seen fewer than freq_filter times.
        finder.apply_freq_filter(freq_filter)
        print(finder.nbest(bigram_measures.pmi, n_best))

    def trigram_collocations(self, n_best=2):
        # Rank trigrams by raw frequency; n_best is the number of results
        # to print (the original parameter was misleadingly named raw_freq).
        tokens = nltk.word_tokenize(self.doc['composite'])
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(tokens)
        print(finder.nbest(trigram_measures.raw_freq, n_best))

    def plot_frequency(self, num=50, cumulative=False):
        # Plot the num most frequent tokens; FreqDist.plot requires matplotlib.
        tokens = nltk.word_tokenize(self.doc['composite_cleaned'])
        freq_dist = nltk.FreqDist(tokens)
        freq_dist.plot(num, cumulative=cumulative)

    def process_document(self):
        # Assemble the extracted fields plus a combined "composite" text
        # and a stopword-filtered "composite_cleaned" variant.
        doc = {}
        doc['meta_description'] = self.find_meta_description()
        doc['title'] = self.find_title()
        doc['headers'] = self.find_headers()
        doc['paragraphs'] = self.find_paragraphs()
        doc['composite'] = "%s %s %s %s" % (
            doc['meta_description'], doc['title'], doc['headers'], doc['paragraphs'])
        doc['composite_cleaned'] = self.preprocess_words(doc['composite'])
        return doc

# Example usage (paths are the sample pages from the original gist; the first
# is commented out since the second assignment would immediately overwrite it):
# doc = HTMLDocumentParser('data/html/us.flukecal.com/terms.html')
doc = HTMLDocumentParser('data/html/us.flukecal.com/about/fluke-calibration-brands.html')
doc.trigram_collocations()
doc.bigram_collocations()
doc.plot_frequency()
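
# The processed fields are also available directly on the instance, e.g.:
#
#   print(doc.doc['title'])
#   print(doc.doc['composite_cleaned'][:200])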