clone1018 · June 5, 2012 01:11
diff --git a/gistfile1.py b/gistfile1.py
 import re

 class linkextractor:
    """Extract the href targets of anchor tags from an HTML string.

    The page is scanned with a regular expression rather than a real HTML
    parser, so this is a best-effort extractor: only the opening
    ``<a ...>`` tag is examined, and anchors inside comments or scripts
    are reported like any other.
    """

    # Matches an anchor's opening tag and captures the href value in one of
    # three groups: double-quoted, single-quoted, or unquoted.  "href" must
    # be preceded by whitespace so attributes such as data-href are ignored,
    # and the value stops at its closing quote so attributes that follow
    # href (target=..., class=...) are not swallowed into the link.
    _HREF_RE = re.compile(
        r"""<a\s(?:[^>]*?\s)?href\s*=\s*"""
        r"""(?:"([^"]*)"|'([^']*)'|([^\s>]+))""",
        re.IGNORECASE)

    def __init__(self,htmlPage):
        """Remember the HTML source to scan.

        htmlPage -- the page source as a string.
        """
        self.htmlcode = htmlPage

    def getLinks(self):
        """Return the href value of every anchor tag, in document order.

        Surrounding quotes are removed from each value.  Duplicate links
        are kept, and an empty list is returned when no anchor with an
        href is found.
        """
        # findall yields one (double, single, bare) tuple per anchor; at
        # most one member is non-empty, and all are empty for href="".
        return [dq or sq or bare
                for dq, sq, bare in self._HREF_RE.findall(self.htmlcode)]

 if __name__ == "__main__":
    # Script entry point: print the list of links found in the single HTML
    # file named on the command line.
    import getopt
    import sys

    # No options are defined, so getopt only separates out the positional
    # arguments (and raises GetoptError if any option is passed).
    _, args = getopt.getopt(sys.argv[1:], "")
    if len(args) != 1:
        print("You must specify a file to process.")
        sys.exit(1)
    print("Linkextractor is processing %s..." % args[0])

    # 'with' closes the file even if read() raises.  At most the first
    # 500000 bytes are read, so links beyond that point are not reported.
    with open(args[0], "rb") as htmlfile:
        htmlpage = htmlfile.read(500000)
    if not isinstance(htmlpage, str):
        # On Python 3 a binary read returns bytes, which the str regex
        # cannot search.  NOTE(review): UTF-8 is an assumption about the
        # input; undecodable bytes are replaced rather than raising.
        # On Python 2 the data is already str and this branch is skipped.
        htmlpage = htmlpage.decode("utf-8", "replace")

    le = linkextractor(htmlpage)
    print(le.getLinks())
	import re

	class linkextractor:
	    """Extract the href targets of anchor tags from an HTML string.

	    The page is scanned with a regular expression rather than a real
	    HTML parser, so this is a best-effort extractor: only the opening
	    ``<a ...>`` tag is examined, and anchors inside comments or scripts
	    are reported like any other.
	    """

	    # Matches an anchor's opening tag and captures the href value in one
	    # of three groups: double-quoted, single-quoted, or unquoted.  "href"
	    # must be preceded by whitespace so attributes such as data-href are
	    # ignored, and the value stops at its closing quote so attributes
	    # that follow href are not swallowed into the link.
	    _HREF_RE = re.compile(
	        r"""<a\s(?:[^>]*?\s)?href\s*=\s*"""
	        r"""(?:"([^"]*)"|'([^']*)'|([^\s>]+))""",
	        re.IGNORECASE)

	    def __init__(self,htmlPage):
	        """Remember the HTML source to scan.

	        htmlPage -- the page source as a string.
	        """
	        self.htmlcode = htmlPage

	    def getLinks(self):
	        """Return the href value of every anchor tag, in document order.

	        Surrounding quotes are removed from each value.  Duplicate links
	        are kept, and an empty list is returned when no anchor with an
	        href is found.
	        """
	        # findall yields one (double, single, bare) tuple per anchor; at
	        # most one member is non-empty, and all are empty for href="".
	        return [dq or sq or bare
	                for dq, sq, bare in self._HREF_RE.findall(self.htmlcode)]

	if __name__ == "__main__":
	    # Script entry point: print the list of links found in the single
	    # HTML file named on the command line.
	    import getopt
	    import sys

	    # No options are defined, so getopt only separates out the positional
	    # arguments (and raises GetoptError if any option is passed).
	    _, args = getopt.getopt(sys.argv[1:], "")
	    if len(args) != 1:
	        print("You must specify a file to process.")
	        sys.exit(1)
	    print("Linkextractor is processing %s..." % args[0])

	    # 'with' closes the file even if read() raises.  At most the first
	    # 500000 bytes are read, so links beyond that point are not reported.
	    with open(args[0], "rb") as htmlfile:
	        htmlpage = htmlfile.read(500000)
	    if not isinstance(htmlpage, str):
	        # On Python 3 a binary read returns bytes, which the str regex
	        # cannot search.  NOTE(review): UTF-8 is an assumption about the
	        # input; undecodable bytes are replaced rather than raising.
	        # On Python 2 the data is already str and this branch is skipped.
	        htmlpage = htmlpage.decode("utf-8", "replace")

	    le = linkextractor(htmlpage)
	    print(le.getLinks())