Created
March 31, 2015 18:09
-
-
Save jpotts18/dd9c1faf98b193e7dc46 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import RegexpTokenizer | |
from nltk.collocations import * | |
from bs4 import BeautifulSoup | |
class HTMLDocumentParser(object):
    """Parse an HTML file and extract its text features (meta description,
    title, headers, paragraphs) for NLTK-based keyword analysis.

    On construction the file at *path* is read, parsed, and summarized
    into ``self.doc`` (see :meth:`process_document`).
    """

    def __init__(self, path):
        # Read raw bytes and decode explicitly so the same line works on
        # Python 2 and 3; the context manager closes the handle (the
        # original leaked the open file object).
        with open(path, 'rb') as handle:
            self.html = handle.read().decode('utf-8', errors='ignore')
        # Name the parser explicitly: without it BeautifulSoup picks
        # whichever parser happens to be installed, so results could
        # differ from machine to machine.
        self.soup = BeautifulSoup(self.html, 'html.parser')
        self.doc = self.process_document()

    def find_meta_description(self):
        """Return the lower-cased meta description text, or '' if absent."""
        desc = self.soup.findAll(attrs={"name": "description"})
        # Guard before indexing: pages with no meta description made the
        # original raise IndexError on desc[0] (its None-check came too
        # late to help). The original's .encode('utf-8') is dropped: on
        # Python 3 it produced bytes that leaked a "b'...'" repr into the
        # composite string.
        if desc and desc[0].get('content'):
            return desc[0]['content'].lower()
        return ''

    def find_title(self):
        """Return the lower-cased <title> text, or '' if missing."""
        title_tag = self.soup.find('title')
        # Guard: find() returns None when there is no <title>; the
        # original called get_text on it and crashed with AttributeError.
        if title_tag is not None:
            return title_tag.get_text(strip=True).lower()
        return ''

    def find_headers(self):
        """Return the text of every h1-h6 header, space-joined."""
        headers = []
        # One loop instead of six copy-pasted extend() calls; same
        # document order (all h1 tags, then all h2 tags, ...).
        for level in range(1, 7):
            headers.extend(self.soup.findAll('h%d' % level))
        return " ".join(h.get_text(strip=True) for h in headers)

    def find_paragraphs(self):
        """Return the lower-cased text of every <p> tag, space-joined."""
        paragraphs = self.soup.findAll('p')
        return " ".join(p.get_text(strip=True).lower() for p in paragraphs)

    def preprocess_words(self, string):
        """Tokenize *string* into lower-cased word tokens with English
        stopwords removed; return them as one space-joined string."""
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(string.lower())
        # Build the stopword set once: the original called
        # stopwords.words('english') for every single token, an
        # accidental O(tokens x stopwords) scan.
        stops = set(stopwords.words('english'))
        return " ".join(t for t in tokens if t not in stops)

    def bigram_collocations(self, freq_filter=3, n_best=10):
        """Print the *n_best* bigram collocations ranked by PMI, keeping
        only bigrams that occur at least *freq_filter* times."""
        tokens = nltk.word_tokenize(self.doc['composite'])
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(freq_filter)
        # print(x) with a single argument behaves identically on
        # Python 2 and 3.
        print(finder.nbest(bigram_measures.pmi, n_best))

    def trigram_collocations(self, raw_freq=2):
        """Print the top *raw_freq* trigram collocations by raw frequency."""
        tokens = nltk.word_tokenize(self.doc['composite'])
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(tokens)
        print(finder.nbest(trigram_measures.raw_freq, raw_freq))

    def plot_frequency(self, num=50, cumulative=False):
        """Plot the frequency distribution of the *num* most common
        tokens of the cleaned composite text (opens a matplotlib window)."""
        tokens = nltk.word_tokenize(self.doc['composite_cleaned'])
        freq_dist = nltk.FreqDist(tokens)
        freq_dist.plot(num, cumulative=cumulative)

    def process_document(self):
        """Extract all text features and return them as a dict.

        Keys: 'meta_description', 'title', 'headers', 'paragraphs',
        'composite' (all four concatenated), and 'composite_cleaned'
        (composite with stopwords removed).
        """
        doc = {}
        doc['meta_description'] = self.find_meta_description()
        doc['title'] = self.find_title()
        doc['headers'] = self.find_headers()
        doc['paragraphs'] = self.find_paragraphs()
        doc['composite'] = "%s %s %s %s" % (
            doc['meta_description'], doc['title'],
            doc['headers'], doc['paragraphs'])
        doc['composite_cleaned'] = self.preprocess_words(doc['composite'])
        return doc
# Parse one document and report its collocations and token frequencies.
# NOTE(review): the original first parsed terms.html and immediately
# overwrote the result without using it — a full dead parse of that
# file. Only the document actually analyzed is kept.
doc = HTMLDocumentParser('data/html/us.flukecal.com/about/fluke-calibration-brands.html')
doc.trigram_collocations()
doc.bigram_collocations()
doc.plot_frequency()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.