Created
February 12, 2012 12:58
-
-
Save jgraham/1808356 to your computer and use it in GitHub Desktop.
HTMLParser backed by html5lib
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tokenizer | |
from constants import tokenTypes, tagTokenTypes | |
class HTMLParser(object):
    """Callback-style HTML parser driven by ``tokenizer.HTMLTokenizer``.

    Subclass and override the public ``handle_*`` hooks to receive events.
    Each token produced by the tokenizer is routed to a private ``_handle_*``
    adapter, which unpacks the token dict and calls the matching public hook.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard the current tokenizer, if any."""
        self._tokenizer = None

    def feed(self, data):
        """Tokenize *data* and dispatch every resulting token.

        A new tokenizer is created on every call, so each ``feed`` is
        processed independently of any previous one.
        """
        self._tokenizer = tokenizer.HTMLTokenizer(data)
        self._process()

    def close(self):
        """Do nothing; all processing happens inside ``feed``."""
        pass

    def getpos(self):
        """Return the ``position`` attribute of the tokenizer's stream.

        Requires a prior call to ``feed``; before that ``_tokenizer`` is
        ``None`` and this raises ``AttributeError``.
        """
        # NOTE(review): if ``stream.position`` is a method rather than a
        # property, this returns the bound method and needs a call -- confirm
        # against the tokenizer's input stream implementation.
        return self._tokenizer.stream.position

    def _process(self):
        """Pull tokens from the tokenizer and call the handler for each type."""
        handlers = dict((tokenTypes[key], value) for key, value in [
            ("Doctype", self._handle_decl),
            ("Characters", self._handle_data),
            ("SpaceCharacters", self._handle_data),
            ("StartTag", self._handle_starttag),
            ("EndTag", self._handle_endtag),
            ("EmptyTag", self._handle_starttag),
            ("Comment", self._handle_comment),
            ("ParseError", None)])
        for token in self._tokenizer:
            # Unknown token types still raise KeyError, as before.
            handler = handlers[token["type"]]
            # ParseError is registered with no handler: skip it instead of
            # attempting to call None.
            if handler is not None:
                handler(token)

    def _handle_decl(self, token):
        """Build a ``DOCTYPE <name>`` string and pass it to ``handle_decl``."""
        data = "DOCTYPE %s" % token["name"]
        # TODO: add in all the extra data here
        self.handle_decl(data)

    def handle_decl(self, data):
        """Hook called with the declaration text; override in subclasses."""
        pass

    def _handle_data(self, token):
        """Pass the token's ``data`` to ``handle_data``."""
        self.handle_data(token["data"])

    def handle_data(self, data):
        """Hook called with character data; override in subclasses."""
        pass

    def _handle_starttag(self, token):
        """Dispatch a start tag (or self-closing tag) to the public hooks.

        Before dispatching, the tokenizer state is switched for ``script``
        and ``style`` elements so that their content is tokenized using the
        tokenizer's script-data / rawtext states.
        """
        name = token["name"]
        if name == "script":
            self._tokenizer.state = self._tokenizer.scriptDataState
        elif name == "style":
            self._tokenizer.state = self._tokenizer.rawtextState

        if token["selfClosing"]:
            self.handle_startendtag(name, token["data"])
        else:
            self.handle_starttag(name, token["data"])

    def handle_starttag(self, name, attrs):
        """Hook called for a start tag; override in subclasses."""
        pass

    def handle_startendtag(self, name, attrs):
        """Hook for a self-closing tag.

        The default reports it as a start tag immediately followed by an
        end tag.
        """
        self.handle_starttag(name, attrs)
        # handle_endtag accepts only the tag name; attrs must not be passed.
        self.handle_endtag(name)

    def _handle_endtag(self, token):
        """Pass the token's ``name`` to ``handle_endtag``."""
        self.handle_endtag(token["name"])

    def handle_endtag(self, name):
        """Hook called for an end tag; override in subclasses."""
        pass

    def _handle_comment(self, token):
        """Pass the token's ``data`` to ``handle_comment``."""
        self.handle_comment(token["data"])

    def handle_comment(self, data):
        """Hook called with comment text; override in subclasses."""
        pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment