Last active
March 21, 2020 20:10
-
-
Save fherbine/59fa5bed4397fc2cbbfe796d49e273c4 to your computer and use it in GitHub Desktop.
Useful to remove all JS scripts from a web page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import datetime | |
import sys | |
from hashlib import md5 | |
import requests | |
import sh | |
from bs4 import BeautifulSoup | |
# Browser command used by `-o` when no usable browser name is supplied.
DEFAULT_WEBBROWSER = 'firefox'

# Destination of the cleaned page: /tmp/<md5 hex digest>.html, where the
# digest is taken over the textual form of the current local timestamp so
# that each run writes to a different file.
# NOTE(review): the name is derived from the clock, not from a secure
# temp-file API; consider `tempfile` if /tmp is shared with untrusted users.
DEFAULT_DST = (
    '/tmp/'
    + md5(str(datetime.datetime.now()).encode()).hexdigest()
    + '.html'
)
def dispatch_help():
    """Print the clean-js usage text (synopsis and options) to stdout."""
    # The leading and trailing empty entries reproduce the newline that
    # opens and the newline that closes the help text.
    usage_lines = (
        '',
        '======= clean-js =======',
        'remove-js and generate .html',
        'usage:',
        '`clean-js <source-url> [options]`',
        'options:',
        '-h or --help: dispatch usage',
        '-o [web-browser] open in a web-browser (default is firefox)',
        '',
    )
    print('\n'.join(usage_lines))
def remove_scripts(string):
    """Return an HTML document with every ``<script>`` element removed.

    Args:
        string: The HTML markup to clean, as text.

    Returns:
        The pretty-printed markup (``BeautifulSoup.prettify()``) of the
        parsed document, without any ``<script>`` element.
    """
    # Parse the markup as-is. Lower-casing the whole document first would
    # corrupt visible text, case-sensitive URLs and attribute values, and it
    # is not needed to catch `<SCRIPT>`: html.parser already reports tag
    # names in lower case.
    soup = BeautifulSoup(string, 'html.parser')
    for script in soup.find_all('script'):
        # The removed element is never reused, so destroy it outright.
        script.decompose()
    return soup.prettify()
if __name__ == '__main__':
    # Command-line entry point:
    #   1. fetch the page at <source-url>,
    #   2. strip its <script> elements,
    #   3. write the result to DEFAULT_DST,
    #   4. optionally open that file in a web browser (`-o [web-browser]`).

    # Work on a copy of the arguments (program name dropped) so that
    # sys.argv itself is never mutated.
    args = sys.argv[1:]
    if not args or '-h' in args or '--help' in args:
        dispatch_help()
        sys.exit(0)

    url = args.pop(0)

    try:
        # Without a timeout, requests may wait forever on a silent host.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print('clean-js: cannot fetch {}: {}'.format(url, err),
              file=sys.stderr)
        sys.exit(-1)

    if response.status_code not in (200, 201, 202, 203):
        # Say what went wrong before showing the usage text.
        print(
            'clean-js: {} answered with HTTP status {}'.format(
                url, response.status_code
            ),
            file=sys.stderr,
        )
        dispatch_help()
        sys.exit(-1)

    content = remove_scripts(response.text)

    # The file is only written, never read back; the explicit encoding keeps
    # the write independent of the locale's default encoding.
    with open(DEFAULT_DST, 'w', encoding='utf-8') as dst:
        dst.write(content)

    if '-o' in args:
        idx = args.index('-o')
        args.remove('-o')
        # The browser name is optional: once '-o' is removed, the name (if
        # any) sits at the index '-o' used to occupy.
        browser = args[idx] if idx < len(args) else DEFAULT_WEBBROWSER
        try:
            getattr(sh, browser)(DEFAULT_DST)
        except (sh.CommandNotFound, sh.ErrorReturnCode):
            # Best-effort fallback to the default browser when the requested
            # one is missing or fails. Only sh's own errors are caught, so
            # Ctrl-C and SystemExit still propagate.
            if browser == DEFAULT_WEBBROWSER:
                raise
            getattr(sh, DEFAULT_WEBBROWSER)(DEFAULT_DST)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment