Skip to content

Instantly share code, notes, and snippets.

@angeloped
Created March 6, 2021 20:24
Show Gist options
  • Save angeloped/a740ed3e09ffd246e2d835c06d647540 to your computer and use it in GitHub Desktop.
Save angeloped/a740ed3e09ffd246e2d835c06d647540 to your computer and use it in GitHub Desktop.
A simple Tor-wrapped (onion) .pdf web extractor written in Python. Works on Python 2 and Python 3.
#!/bin/python
import os
import sys
import urllib
import requests
from bs4 import BeautifulSoup
# A simple Tor-wrapped (onion) .pdf web extractor written in Python. Works on Python 2 and Python 3.
# Send both plain and TLS traffic through the SOCKS proxy on localhost:9050.
# The "socks5h" scheme asks the proxy to resolve hostnames, which .onion
# addresses need because they cannot be resolved by ordinary local DNS.
proxies = {
    scheme: "socks5h://localhost:9050"
    for scheme in ("http", "https")
}
try:  # Python 3
    from urllib.parse import unquote, urljoin
except ImportError:  # Python 2
    from urllib import unquote
    from urlparse import urljoin


def is_pdf_link(href):
    """Return True when ``href`` ends with ".pdf", ignoring letter case."""
    return href.lower().endswith(".pdf")


def resolve_link(base_url, href):
    """Resolve ``href`` against ``base_url`` the way a browser would.

    Relative, root-relative and absolute hrefs are all handled; plain string
    concatenation only worked for the first kind.
    """
    return urljoin(base_url, href)


def pdf_destination(directory, url):
    """Return the local path inside ``directory`` where ``url`` is saved.

    The file name is the last "/"-separated segment of the percent-decoded
    URL. Decoding happens before splitting so that an encoded "%2F" cannot
    smuggle extra path components into the file name.
    """
    name = os.path.basename(unquote(url).rsplit("/", 1)[-1])
    return os.path.join(directory, name)


def main(argv):
    """Download every .pdf linked from an index page into a directory.

    ``argv`` follows ``sys.argv``: ``argv[1]`` is the URL of the page to scan
    and ``argv[2]`` is an existing directory that receives the files. All
    requests go through the module-level ``proxies`` mapping.
    """
    if len(argv) != 3:
        print("./{0} <oniondir> <path>".format(argv[0]))
        return

    base_url, out_dir = argv[1], argv[2]
    if not os.path.isdir(out_dir):
        print("Output directory does not exist: {0}".format(out_dir))
        return

    page = requests.get(base_url, proxies=proxies).content
    soup = BeautifulSoup(page, 'html.parser')

    for link in soup.find_all('a', href=True):
        href = link['href']
        if not is_pdf_link(href):
            continue

        url = resolve_link(base_url, href)
        print("Downloading... ", url)
        try:
            response = requests.get(url, proxies=proxies)
            # Do not save an HTTP error page under a .pdf name.
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable file should not abort the remaining downloads.
            print("Failed! ", url, exc)
            continue

        with open(pdf_destination(out_dir, url), "wb") as f:
            f.write(response.content)
        print("Downloaded! ", url)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment