Created
April 29, 2023 15:35
-
-
Save narskidan/7554c7cc42d2d0cc3fc62340a2018fed to your computer and use it in GitHub Desktop.
Skeleton for Tor crawler (for Eamon)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
# This script assumes you already have Tor installed and running
# Snagged from StackOverflow, haven't tested it!
def get_tor_session(host='127.0.0.1', port=9150):
    """Return a ``requests`` session that sends its traffic through Tor.

    Args:
        host: Address the Tor SOCKS proxy listens on.
        port: Port the Tor SOCKS proxy listens on. Defaults to 9150 (the
            original author's setup); on your computer it's more likely 9050.

    Returns:
        A ``requests.Session`` whose ``http`` and ``https`` proxies both
        point at ``socks5h://host:port``.
    """
    # The "socks5h" scheme (rather than "socks5") asks the proxy to resolve
    # hostnames, per the requests SOCKS documentation.
    proxy = f'socks5h://{host}:{port}'

    # requests.Session() is the supported constructor; the lowercase
    # requests.session() alias is kept only for backwards compatibility.
    session = requests.Session()
    session.proxies = {'http': proxy, 'https': proxy}
    return session
# We start out with the URL for tor.taxi, a Tor link aggregator
seed_url = "http://tortaxi2dev6xjwbaydqzla77rrnth7yn2oqzjfmiuwn5h6vsk2a4syd.onion/"

# This regular expression will help us extract Tor links to crawl
# (the first draft was written by ChatGPT).
# The dot is escaped and the pattern is a raw string: an unescaped "." matches
# any character, so text such as "abcXonion" would be reported as a link.
onion_regex = re.compile(r"[a-z0-9]+\.onion")
def crawl(url, found_urls=None, dead_links=None, search_term="hacking",
          session=None, timeout=60):
    """Fetch one onion host and record whether its page mentions *search_term*.

    Args:
        url: Host to fetch, without a scheme (``http://`` is prepended).
        found_urls: List that ``url`` is appended to when the page contains
            ``search_term``. A fresh list is used when omitted.
        dead_links: List that ``url`` is appended to when the request fails.
            A fresh list is used when omitted.
        search_term: Substring looked for in the raw page HTML.
        session: Object whose ``get()`` performs the request; created with
            ``get_tor_session()`` when omitted.
        timeout: Timeout in seconds passed to ``session.get``.

    Returns:
        None. Results are reported by appending to the lists passed in.
    """
    # Use None sentinels instead of mutable defaults, which would be shared
    # (and silently grow) across every call that relies on them.
    if found_urls is None:
        found_urls = []
    if dead_links is None:
        dead_links = []
    if session is None:
        session = get_tor_session()

    try:
        # TODO: use beautiful soup to extract only visible text from page...
        # (including title and meta description of course)
        html = session.get('http://' + url, timeout=timeout).text
    except requests.RequestException:
        # URL is a dead link, mark it as such so we don't visit it again.
        # Only request failures are caught, so Ctrl-C and programming errors
        # still propagate instead of being recorded as dead links.
        dead_links.append(url)
        return

    # Eventually we'll replace all of these in-memory lists with proper databases...
    if search_term in html:
        print('Found search term:', url)
        found_urls.append(url)

    # TODO: crawl recursively over the links found in `html`.
    # Make sure to remove duplicates, and don't add dupes to found_urls
# Smoke test: fetch the seed page through Tor and print every onion
# hostname the regex finds in it.
session = get_tor_session()
seed_page = session.get(seed_url)
html = seed_page.text
links = onion_regex.findall(html)
print(links)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment