Skip to content

Instantly share code, notes, and snippets.

@clone1018
Created June 5, 2012 01:11
Show Gist options
  • Save clone1018/2871825 to your computer and use it in GitHub Desktop.
import re
class linkextractor:
    """Extract the ``href`` targets of ``<a>`` tags from an HTML string.

    This is a lightweight regex scan, not a real HTML parser: it looks only
    at opening ``<a ...>`` tags and does not decode HTML entities or resolve
    relative URLs.
    """

    # Matches an opening <a ...> tag and captures only the href value.
    #   group 1: double-quoted value, group 2: single-quoted value,
    #   group 3: unquoted value (runs up to whitespace or '>').
    # Requiring whitespace directly before "href" keeps look-alike attributes
    # such as "data-href" from matching, and "<a\s" keeps tags such as
    # <abbr> out. Other attributes may appear before or after href.
    _HREF_RE = re.compile(
        r'''<a\s+(?:[^>]*?\s+)?href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))''',
        re.IGNORECASE,
    )

    def __init__(self, htmlPage):
        """Store the HTML text to scan.

        htmlPage -- the page source as a text string.
        """
        self.htmlcode = htmlPage

    def getLinks(self):
        """Return the href values of all anchors, in document order.

        Surrounding quotes (single or double) are removed. Duplicates are
        kept. An empty page, or one without anchors, yields an empty list.
        """
        links = []
        for match in self._HREF_RE.finditer(self.htmlcode):
            # Exactly one of the three alternatives takes part in a match;
            # the other groups are None.
            for value in match.groups():
                if value is not None:
                    links.append(value)
                    break
        return links
if __name__ == "__main__":
    # Command-line entry point: print the links found in one HTML file.
    # Usage: <script> FILE
    import getopt
    import sys

    # No options are defined; getopt still rejects any "-x" style flag
    # (raising getopt.GetoptError) and returns the positional arguments.
    _opts, args = getopt.getopt(sys.argv[1:], "")
    if len(args) != 1:
        print("You must specify a file to process.")
        sys.exit(1)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Linkextractor is processing %s..." % args[0])
    # The context manager closes the file even if the read fails.
    with open(args[0], "rb") as html_file:
        htmlpage = html_file.read(500000)  # only the first 500000 bytes are scanned
    # On Python 3 a binary read yields bytes, which a str regex cannot
    # search, so decode there (UTF-8 is assumed; undecodable bytes are
    # replaced). On Python 2 the data is already a str and is left untouched.
    if not isinstance(htmlpage, str):
        htmlpage = htmlpage.decode("utf-8", "replace")
    le = linkextractor(htmlpage)
    print(le.getLinks())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment