Skip to content

Instantly share code, notes, and snippets.

@kkroesch
Created June 30, 2016 10:34
Show Gist options
  • Save kkroesch/3b9fd010c716c5ab4fa631e94a06b102 to your computer and use it in GitHub Desktop.
Save kkroesch/3b9fd010c716c5ab4fa631e94a06b102 to your computer and use it in GitHub Desktop.
Finds references to external scripts and stylesheets in an HTML page and prints `curl` download commands for those files.
import sys
import argparse
from HTMLParser import HTMLParser
# Module-level accumulator: every asset URL found by AssetFinder is appended
# here, in the order the tags are encountered.
file_list = []


class AssetFinder(HTMLParser):
    """Collect asset URLs from ``<script src=...>`` and ``<link href=...>`` tags.

    Each URL found while HTML is fed to the parser is appended to the
    module-level ``file_list``.
    """

    # Maps each tag of interest to the attribute that carries its URL.
    _URL_ATTRIBUTE = {'script': 'src', 'link': 'href'}

    def handle_starttag(self, tag, attrs):
        """Record the URL attribute of a ``script`` or ``link`` start tag.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        wanted = self._URL_ATTRIBUTE.get(tag)
        if wanted is None:
            return
        for name, value in attrs:
            # A valueless attribute (``<script src>``) is reported with a
            # value of None and ``src=""`` with an empty string; neither
            # names a file that could be downloaded, so both are skipped.
            if name == wanted and value:
                file_list.append(value)

    def handle_endtag(self, tag):
        """Ignore end tags; only start tags are inspected for URLs."""

    def handle_data(self, data):
        """Ignore text content between tags."""
if __name__ == "__main__":
    # Command-line entry point: read the HTML file named by -i/--input and
    # print one curl command per asset URL collected in file_list.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--input', required=True)
    args = arg_parser.parse_args()

    # The context manager closes the file even if reading raises.
    with open(args.input, 'r') as html_file:
        html = html_file.read()

    parser = AssetFinder()
    parser.feed(html)

    # NOTE(review): curl's -l is --list-only; -L (follow redirects) may have
    # been intended here -- confirm before changing the emitted command.
    for url in file_list:
        # Single-argument print(...) with %-formatting emits the same text
        # as the Python 2 print statement and is also valid on Python 3.
        print("curl -lsO %s" % url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment