Skip to content

Instantly share code, notes, and snippets.

@copyninja
Created October 22, 2010 11:44
Show Gist options
  • Save copyninja/640410 to your computer and use it in GitHub Desktop.
Save copyninja/640410 to your computer and use it in GitHub Desktop.
Trying to write an HTML parser using the Python 3 html.parser module together with pyquery and lxml
#!/usr/local/bin/python3
import sys
from urllib.request import Request,urlopen
from urllib.error import URLError,HTTPError
from html.parser import HTMLParser,HTMLParseError
from pyquery import PyQuery as pq
class MyHTMLParser(HTMLParser):
    """Collect the text of heading, paragraph, span and list elements.

    Character data that occurs inside any tracked tag is accumulated in
    ``self.buffer`` and appended to ``contents.txt``.
    """

    # Tags whose character data should be captured.  The original defined
    # ul/ol flags in reset() but never set them in handle_starttag(); they
    # are tracked consistently now.
    _TRACKED = ("h1", "h2", "h3", "p", "span", "ul", "ol", "li")

    def reset(self):
        """Clear the text buffer and every per-tag state flag."""
        HTMLParser.reset(self)
        self.buffer = ""
        # One boolean attribute per tracked tag (self.h1, self.h2, ...),
        # preserving the original attribute interface.
        for tag in self._TRACKED:
            setattr(self, tag, False)

    def handle_starttag(self, tag, attrs):
        """Mark *tag* as currently open when it is one we track."""
        if tag in self._TRACKED:
            setattr(self, tag, True)

    def handle_data(self, data):
        """Capture character data seen inside any tracked tag.

        Fixes two defects in the original condition: ``h3`` was missing,
        and ``span`` was tested twice.
        """
        if any(getattr(self, tag) for tag in self._TRACKED):
            self.process_buffer(data)

    def handle_endtag(self, tag):
        """Clear the flag for the tag that actually closed.

        The original cleared whichever flag happened to be True first in
        an elif chain, which mis-tracked nested elements such as a
        <span> inside a <p>.
        """
        if tag in self._TRACKED:
            setattr(self, tag, False)

    def process_buffer(self, data):
        """Accumulate *data* in memory and append it to contents.txt.

        The original opened the file in ``w+`` mode on every call, so
        each chunk truncated the previous one and only the final
        fragment survived; append mode fixes that, and the context
        manager guarantees the handle is closed.
        """
        self.buffer += data
        with open("contents.txt", "a", encoding="utf-8") as fp:
            fp.write(data)
def cleanup(page):
    """Strip navigation and chrome sections from a Wikipedia *page*.

    Uses pyquery to remove each unwanted selector in turn, then returns
    the remaining markup wrapped in a single ``<div>``.

    :param page: raw HTML of the page as a string.
    :returns: cleaned HTML fragment as a string.
    """
    document = pq(page)
    # To remove any other section, add its class or id selector below,
    # comma separated.  NOTE: the original was missing the comma between
    # ".topicon" and "div#f-poweredbyico", merging them into one selector
    # that matched nothing.
    unwanted_sections_list = """
    div#jump-to-nav, div.top, div#column-one, div#siteNotice, div#purl, div#head,div#footer, div#head-base, div#page-base, div#stub, div#noprint,
    div#disambig,div.NavFrame,#colophon,.editsection,.toctoggle,.tochidden,.catlinks,.navbox,.sisterproject,.ambox,
    .toccolours,.topicon,div#f-poweredbyico,div#f-copyrightico,div#featured-star,li#f-viewcount,
    li#f-about,li#f-disclaimer,li#f-privacy,.portal, #footer, #mw-head
    """
    # Remove each selector individually; pyquery's remove() takes one
    # selector at a time.
    for section in unwanted_sections_list.split(","):
        document.remove(section.strip())
    return document.wrap('<div></div>').html()
if __name__ == "__main__":
    # Fetch the article, strip the page chrome with cleanup(), then let
    # MyHTMLParser extract the interesting text into contents.txt.
    request = Request(url="http://kn.wikipedia.org/wiki/Karnataka")
    request.add_header("User-Agent", "Python Crawler")
    try:
        # Context manager closes the HTTP response deterministically
        # (the original leaked the connection object).
        with urlopen(request) as response:
            page = response.read().decode("utf-8")
        myparser = MyHTMLParser()
        myparser.feed(cleanup(page))
        myparser.close()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print("Something Went wrong, Error Code {}".format(e.code))
        sys.exit(1)
    except URLError as u:
        print("Something Went Wrong, Reason {}".format(u.reason))
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment