Created
February 24, 2014 20:48
-
-
Save RobinDavid/9196709 to your computer and use it in GitHub Desktop.
HTML parser in Python to extract h1 text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from html.parser import HTMLParser #Import the parser | |
class HeadingParser(HTMLParser):
    """HTML parser that reports the text found inside ``<h1>`` elements.

    While a document is fed to the parser, every opening ``<h1>`` tag prints
    ``"Found a Heading 1"`` and every piece of text encountered before the
    matching ``</h1>`` is printed as well.

    In addition to printing, the full text of each completed heading is
    appended to ``self.headings`` so callers can use the result
    programmatically.

    Attributes:
        inHeading: True while the parser is between ``<h1>`` and ``</h1>``.
        headings: Text of every ``<h1>`` closed so far, in document order.
    """

    # Class-level default; the first assignment through ``self`` shadows it
    # with a per-instance value.
    inHeading = False

    def __init__(self, *args, **kwargs):
        """Initialise the base parser and the per-instance heading storage."""
        super().__init__(*args, **kwargs)
        self.headings = []
        # Text fragments of the heading currently being read; one heading may
        # arrive as several handle_data() calls when it contains nested tags.
        self._heading_parts = []

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Mark the start of a heading when an opening ``<h1>`` is seen."""
        if tag == "h1":
            self.inHeading = True
            self._heading_parts = []
            print("Found a Heading 1")

    def handle_data(self, data: str) -> None:
        """Print (and remember) text, but only while inside an ``<h1>``."""
        if self.inHeading:
            print(data)
            self._heading_parts.append(data)

    def handle_endtag(self, tag: str) -> None:
        """Leave heading mode when the closing ``</h1>`` is seen."""
        if tag == "h1":
            # Only record a heading that was actually opened; a stray </h1>
            # must not add an empty entry.
            if self.inHeading:
                self.headings.append("".join(self._heading_parts))
            self.inHeading = False
hParser = HeadingParser()  # Parser instance that reports every <h1> it meets

# Read the whole document up front. The context manager guarantees the file
# handle is released even if read() raises.
with open("file.html", "r") as file:
    html = file.read()

hParser.feed(html)  # Parse the markup held in "html"
# Flush text the parser is still buffering (e.g. heading text that is not
# followed by another tag before the end of the file).
hParser.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment