Last active
February 4, 2021 19:51
-
-
Save AlfredoTigolo/1254cf8251ae66e586050b5b63ebb3db to your computer and use it in GitHub Desktop.
sample python html parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read data from the URL and print it.
# Source: https://www.guru99.com/accessing-internet-data-with-python.html
# Source: https://www.pythoncentral.io/html-parser/
# Source: https://docs.python.org/3/library/html.parser.html
# Source: https://docs.python.org/3/library/html.entities.html#module-html.entities
# Beautiful Soup: https://realpython.com/beautiful-soup-web-scraper-python/
#from cis247 import InputBox
#from cis247 import MessageBox
from cis247.TestHTMLParser import TestHTMLParser | |
import urllib.request | |
# Page to fetch; the commented-out line is an alternative test target.
strUrl = 'http://kuyakuya.freeshell.org/church'
#strUrl = 'http://isschristian.org/live'

# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(strUrl) as html_page:
    print("result code: " + str(html_page.getcode()))
    # read() returns bytes. Decode them with the charset the server declared
    # (UTF-8 when none is given) instead of calling str() on the bytes,
    # which would hand the parser the "b'...'" repr with escaped newlines.
    charset = html_page.headers.get_content_charset() or 'utf-8'
    html_text = html_page.read().decode(charset, errors='replace')

parser = TestHTMLParser()
parser.feed(html_text)
# close() forces the parser to process any text it is still buffering, so it
# has to run before the collected results are reported.
parser.close()

print("Links", parser.lsLinks)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source: https://docs.python.org/3/library/html.parser.html
from html.parser import HTMLParser | |
from html.entities import name2codepoint | |
class TestHTMLParser(HTMLParser):
    """HTML parser that prints every parse event and records what it saw.

    After feeding a document, the following instance attributes hold the
    collected results, in document order:

    * ``lsStartTags``    -- name of every start tag (``<p>``)
    * ``lsEndTags``      -- name of every end tag (``</p>``)
    * ``lsStartEndTags`` -- name of every self-closing tag (``<br/>``)
    * ``lsComments``     -- text of every comment
    * ``lsLinks``        -- every ``(name, value)`` attribute pair found on
      an ``<a>`` start tag
    """

    def __init__(self, *, convert_charrefs=True):
        """Create a parser with its own, empty result lists.

        ``convert_charrefs`` is passed straight to ``HTMLParser``.
        """
        super().__init__(convert_charrefs=convert_charrefs)
        # The lists are created per instance: as class attributes they would
        # be shared, so a second parser would start with the first one's data.
        self.lsStartTags = []
        self.lsEndTags = []
        self.lsStartEndTags = []
        self.lsComments = []
        self.lsLinks = []

    def handle_starttag(self, tag, attrs):
        """Print a start tag and its attributes; record the tag and any links."""
        print("Start tag:", tag)
        # Record the tag exactly once, independent of how many attributes it
        # carries (including none at all).
        self.lsStartTags.append(tag)
        for attr in attrs:
            print(" attr:", attr)
            if tag == 'a':
                self.lsLinks.append(attr)

    def handle_endtag(self, tag):
        """Print and record an end tag."""
        print("Encountered an end tag :", tag)
        self.lsEndTags.append(tag)

    def handle_startendtag(self, startendTag, attr):
        """Print the attributes of a self-closing tag and record its name.

        Because this hook is overridden, self-closing tags are not forwarded
        to ``handle_starttag`` / ``handle_endtag``.
        """
        print("Encountered an startendTag :", attr)
        self.lsStartEndTags.append(startendTag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Data :", data)

    def handle_comment(self, data):
        """Print and record the text of a comment."""
        print("Comment :", data)
        self.lsComments.append(data)

    def handle_entityref(self, name):
        """Print the character for a named entity such as ``&gt;``.

        Names missing from ``name2codepoint`` are printed back as the raw
        ``&name;`` reference instead of raising ``KeyError``.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            c = "&" + name + ";"
        else:
            c = chr(codepoint)
        print("Named ent:", c)

    def handle_charref(self, name):
        """Print the character for a numeric reference (``&#62;`` / ``&#x3E;``).

        The hex prefix is accepted in either case; a malformed or
        out-of-range number is printed back as the raw reference.
        """
        try:
            if name.startswith(('x', 'X')):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            c = "&#" + name + ";"
        print("Num ent :", c)

    def handle_decl(self, data):
        """Print a declaration such as ``<!DOCTYPE html>``."""
        print("Decl :", data)
#parser = TestHTMLParser()
#parser.feed('<html><head><title>Test</title></head>'
#'<body><h1>Parse me!</h1></body></html>')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment