Created
April 4, 2018 07:59
-
-
Save navin-mohan/3491e8c38f507742c49768173eee5363 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from html.parser import HTMLParser
class TextExtractionParser(HTMLParser):
    """HTML parser that collects the text nodes of a document.

    Feed markup with ``feed()``; every text chunk the parser reports is
    appended to ``word_list``, and ``iterwords()`` yields the lowercase
    words found in those chunks.
    """

    # A word is one lowercase ASCII letter followed by at least one more
    # word character, so single-character tokens are never matched.
    # Compiled once here instead of being looked up for every text chunk.
    _WORD_PATTERN = re.compile(r'[a-z]\w+')

    def __init__(self, *args, **kwargs):
        """Create the parser; all arguments are forwarded to ``HTMLParser``."""
        super().__init__(*args, **kwargs)
        # Raw text chunks (not individual words), in the order encountered.
        self.word_list = []

    def handle_data(self, data):
        """Record a text chunk; invoked by ``HTMLParser`` for each text node."""
        self.word_list.append(data)

    def iterwords(self):
        """Yield every word in the collected text, lowercased, in order.

        Chunks are lowercased before matching, so uppercase letters in the
        source markup still start a word.
        """
        for sentence in self.word_list:
            for match in self._WORD_PATTERN.finditer(sentence.lower()):
                yield match.group(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment