Skip to content

Instantly share code, notes, and snippets.

@AlfredoTigolo
Last active February 4, 2021 19:51
Show Gist options
  • Save AlfredoTigolo/1254cf8251ae66e586050b5b63ebb3db to your computer and use it in GitHub Desktop.
Save AlfredoTigolo/1254cf8251ae66e586050b5b63ebb3db to your computer and use it in GitHub Desktop.
sample python html parser
# Fetch a web page, report the HTTP result code, and print the links found in its HTML.
# Source: https://www.guru99.com/accessing-internet-data-with-python.html
# Source: https://www.pythoncentral.io/html-parser/
# Source: https://docs.python.org/3/library/html.parser.html
# Source: https://docs.python.org/3/library/html.entities.html#module-html.entities
# Beautiful Soup: https://realpython.com/beautiful-soup-web-scraper-python/
#from cis247 import InputBox
#from cis247 import MessageBox
from cis247.TestHTMLParser import TestHTMLParser
import urllib.request
# Page to fetch; swap in the alternative URL below to scan a different site.
strUrl = 'http://kuyakuya.freeshell.org/church'
#strUrl = 'http://isschristian.org/live'

# The context manager closes the HTTP connection even if reading fails.
with urllib.request.urlopen ( strUrl ) as html_page:
    print ("result code: " + str (html_page.getcode()))
    raw_body = html_page.read()
    # Use the charset declared in the Content-Type header when there is one.
    charset = html_page.headers.get_content_charset() or 'utf-8'

# read() returns bytes. str(bytes) would produce the "b'...'" repr (escaped
# newlines and quotes) rather than the markup itself, so decode it properly.
try:
    html_text = raw_body.decode ( charset, errors='replace' )
except LookupError:
    # The server announced a codec Python does not know; fall back to UTF-8.
    html_text = raw_body.decode ( 'utf-8', errors='replace' )

parser = TestHTMLParser()
parser.feed ( html_text )
# close() forces processing of any buffered data, so it must run before the
# collected results are read.
parser.close()
print ( "Links", parser.lsLinks )
# Source https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser
from html.entities import name2codepoint
class TestHTMLParser(HTMLParser):
    """HTML parser that prints every parse event and records what it saw.

    After feeding markup, the following per-instance lists hold the results:

    * ``lsStartTags``    -- name of every start tag, in document order.
    * ``lsEndTags``      -- name of every end tag, in document order.
    * ``lsStartEndTags`` -- name of every self-closing tag (``<br/>``).
    * ``lsComments``     -- text of every comment.
    * ``lsLinks``        -- every ``(name, value)`` attribute tuple found on
      an ``<a>`` tag.

    Note: with the ``HTMLParser`` default ``convert_charrefs=True`` the base
    class converts character references itself, so ``handle_entityref`` and
    ``handle_charref`` are only invoked when the parser is constructed with
    ``convert_charrefs=False``.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser; arguments are passed through to HTMLParser."""
        super().__init__(*args, **kwargs)
        # Initializing lists. They are created per instance: as class
        # attributes they would be shared, so a second parser would start
        # with (and add to) the results of the first.
        self.lsStartTags = []
        self.lsEndTags = []
        self.lsStartEndTags = []
        self.lsComments = []
        self.lsLinks = []

    def handle_starttag(self, tag, attrs):
        """Print and record a start tag; collect attributes of ``<a>`` tags."""
        print("Start tag:", tag)
        # Record the tag exactly once, whether or not it has attributes.
        self.lsStartTags.append(tag)
        for attr in attrs:
            print(" attr:", attr)
            if tag == 'a':
                self.lsLinks.append(attr)

    def handle_endtag(self, tag):
        """Print and record an end tag."""
        print("Encountered an end tag :", tag)
        self.lsEndTags.append(tag)

    def handle_startendtag(self, startendTag, attr):
        """Print the attributes of a self-closing tag and record its name.

        Overriding this hook replaces the base-class behaviour of calling
        handle_starttag() and handle_endtag(), so self-closing tags appear
        only in ``lsStartEndTags``.
        """
        print("Encountered an startendTag :", attr)
        self.lsStartEndTags.append(startendTag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Data :", data)

    def handle_comment(self, data):
        """Print and record the text of a comment."""
        print("Comment :", data)
        self.lsComments.append(data)

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        Unknown names are printed back as the raw reference instead of
        raising KeyError.
        """
        codepoint = name2codepoint.get(name)
        c = chr(codepoint) if codepoint is not None else '&' + name + ';'
        print("Named ent:", c)

    def handle_charref(self, name):
        """Print the character for a numeric reference (``&#62;``, ``&#x3E;``).

        Both ``x`` and ``X`` hex prefixes are accepted; a reference that does
        not map to a valid code point is printed back as the raw reference.
        """
        try:
            if name.startswith(('x', 'X')):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            c = '&#' + name + ';'
        print("Num ent :", c)

    def handle_decl(self, data):
        """Print a declaration such as ``<!DOCTYPE html>``."""
        print("Decl :", data)
#parser = TestHTMLParser()
#parser.feed('<html><head><title>Test</title></head>'
#'<body><h1>Parse me!</h1></body></html>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment