Skip to content

Instantly share code, notes, and snippets.

@matrixfox
Forked from cloudaice/htmlparser.py
Last active March 5, 2016 20:11
Show Gist options
  • Save matrixfox/f61fc1df5542472f1766 to your computer and use it in GitHub Desktop.
Save matrixfox/f61fc1df5542472f1766 to your computer and use it in GitHub Desktop.
htmlparser
from HTMLParser import HTMLParser
import urllib2
class MyHTMLParser(HTMLParser):
    """Collect the visible text of every ``<a href=...>`` link in a document.

    Feed HTML with ``feed()``.  When a closing ``</a>`` is seen, the text
    gathered since the opening ``<a href=...>`` has all of its whitespace
    removed and, if anything is left, is appended to ``data``.

    Attributes:
        data: list of collected link texts, in document order.
        href: truthy while inside an ``<a>`` element that has an ``href``
            attribute, i.e. while character data is being captured.
        linkname: text accumulated so far for the link being parsed.
    """

    def __init__(self):
        # Explicit base-class call instead of super(): the Python 2
        # HTMLParser imported by this file is an old-style class, so
        # super() cannot be used with it.
        HTMLParser.__init__(self)
        self.data = []
        self.href = False
        self.linkname = ''

    def handle_starttag(self, tag, attrs):
        """Begin capturing text when an ``<a>`` tag carrying ``href`` opens.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by the
        base parser; only the attribute names are inspected.
        """
        if tag == 'a' and any(name == 'href' for name, _ in attrs):
            self.href = True

    def handle_endtag(self, tag):
        """On ``</a>``, store the captured link text and reset the state."""
        if tag != 'a':
            return
        # split() with no argument splits on every whitespace run, so
        # joining the pieces on '' removes *all* whitespace (leading,
        # trailing and interior); no further strip() is needed.
        linkname = ''.join(self.linkname.split())
        if linkname:
            self.data.append(linkname)
        # Always reset, even for an empty link, so text following this
        # element is not attributed to it.
        self.linkname = ''
        self.href = False

    def handle_data(self, data):
        """Accumulate character data while inside a captured link."""
        if self.href:
            self.linkname += data

    def getresult(self):
        """Print each collected link text on its own line."""
        for value in self.data:
            # Parenthesised single-argument form prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print(value)
def main(url='http://flickr.com'):
    """Download ``url`` and print the text of every hyperlink on the page.

    Args:
        url: address of the page to fetch; defaults to the Flickr home
            page that the script originally hard-coded.
    """
    parser = MyHTMLParser()
    response = urllib2.urlopen(url)
    try:
        parser.feed(response.read())
    finally:
        # Release the HTTP connection even if reading or parsing fails.
        response.close()
    # close() makes the parser process any input it is still buffering,
    # so it has to run before the results are printed.
    parser.close()
    parser.getresult()
# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger a network request.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment