Last active
August 7, 2024 13:58
-
-
Save lopes/efeda0f0556e57a683db024124984af9 to your computer and use it in GitHub Desktop.
Connects to the Tor Project's exit-addresses page and parses the exit node data into JSON format. #python #tor #exitnodes #web #scraper #parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#torpids.py | |
''' | |
Connects to the Tor Project's exit-addresses page and parses the | |
exit node data into JSON format. | |
Data is fetched from: https://check.torproject.org/exit-addresses | |
Data is "Node ID-centric" in Tor Project's page, but here it is | |
"IP address-centric" to make it easier to use the data for | |
detection purposes. In other words, while in the Tor Project's
page we might have duplicate IP addresses with different Node IDs,
here we have unique IP addresses with a list of Node IDs -- | |
the Node IDs might be duplicated, not the IP addresses. | |
Having the JSON data, it should be easy to feed any dataset | |
or database with the exit nodes data for further use in analytics. | |
Author.: Joe Lopes <lopes.id> | |
Date...: 2024-07-04 | |
License: MIT | |
''' | |
from datetime import datetime | |
from urllib.request import urlopen | |
from re import compile, DOTALL | |
from json import dumps | |
from sys import stderr | |
# Tor Project's authoritative list of exit nodes and their addresses.
url = 'https://check.torproject.org/exit-addresses'

# One match per "ExitNode" stanza: node fingerprint, Published/LastStatus
# timestamps, and the one-or-more "ExitAddress" lines that follow.
re_exit_node = compile(r'(?P<node>ExitNode\s(?P<node_id>[\dA-Z]+)\nPublished\s(?P<node_published_ts>[\s\d:-]+)\nLastStatus\s(?P<node_status_ts>[\s\d:-]+)\n(?P<exit_addresses>ExitAddress\s[\d.\s:-]+\n?(ExitAddress\s[\d.\s:-]+\n?)*))', DOTALL)

# Splits one "ExitAddress <ip> <date> <time>" line.
# Fix: the previous class [-:\d\b]+ used \b, which inside [] is a BACKSPACE
# character (not a word boundary) and had no space, so only the date part of
# the timestamp was captured and the time-of-day was silently dropped.
re_exit_address = compile(r'ExitAddress\s(?P<address>[^\s]+)\s(?P<address_ts>[\d-]+\s[\d:]+)')

# Output document. Keyed by IP address (not node ID) to ease detection use.
tor_exit_nodes = {
    'timestamp': datetime.now().isoformat(),  # NOTE(review): naive local time; consider UTC
    'source': f'Generated by {__file__} based on {url}',
    'total_exit_addresses': 0,
    'exit_addresses': []
}

# IP addresses already emitted, to keep entries in the list above unique.
unique_addresses = set()
def fetch_nodes(url, timeout=30):
    '''Download the exit-addresses page and return its body as text.

    Returns the UTF-8-decoded page content, or None when the request
    fails for any reason; the error is printed to stderr so callers
    only need to test the return value for None.

    timeout (new, backward-compatible): seconds before giving up on a
    stalled connection — previously urlopen could block indefinitely.
    '''
    try:
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except Exception as e:
        # Broad catch is deliberate: any failure (DNS, HTTP, timeout,
        # decode) becomes a reported best-effort None.
        print(f'Error fetching nodes: {e}', file=stderr)
        return None
##
# MAIN
#
raw = fetch_nodes(url)
if not raw:
    # fetch_nodes() already reported the error on stderr.
    # SystemExit is always available; bare exit() relies on the site module.
    raise SystemExit(1)

# Index of address -> its entry in tor_exit_nodes['exit_addresses'], so a
# repeated address is handled in O(1) instead of re-scanning the whole
# list for every duplicate (the previous code was accidentally O(n^2)).
entry_by_address = {}

for match in re_exit_node.finditer(raw):
    # One node descriptor per ExitNode stanza; attached to every address
    # that stanza lists (content identical to building it per address).
    node = {
        'id': match.group('node_id'),
        'published': match.group('node_published_ts'),
        'last_status': match.group('node_status_ts')
    }
    for a in re_exit_address.finditer(match.group('exit_addresses')):
        address = a.group('address')
        record = {
            'timestamp': a.group('address_ts'),
            'exit_node': node
        }
        if address not in entry_by_address:
            # First sighting: create the address-centric entry.
            entry = {'address': address, 'exit_nodes': [record]}
            tor_exit_nodes['exit_addresses'].append(entry)
            entry_by_address[address] = entry
            unique_addresses.add(address)
        else:
            # Same IP seen under another node: extend its node list.
            entry_by_address[address]['exit_nodes'].append(record)

tor_exit_nodes['total_exit_addresses'] = len(tor_exit_nodes['exit_addresses'])
print(dumps(tor_exit_nodes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment