Last active
August 7, 2024 13:58
-
-
Save lopes/efeda0f0556e57a683db024124984af9 to your computer and use it in GitHub Desktop.
Connects to the Tor Project's exit-addresses page and parses the exit node data into JSON format. #python #tor #exitnodes #web #scraper #parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#torpids.py | |
''' | |
Connects to the Tor Project's exit-addresses page and parses the | |
exit node data into JSON format. | |
Data is fetched from: https://check.torproject.org/exit-addresses | |
Data is "Node ID-centric" in Tor Project's page, but here it is | |
"IP address-centric" to make it easier to use the data for | |
detection purposes. In other words, while in the Tor Project's
page we might have duplicate IP addresses with different Node IDs,
here we have unique IP addresses with a list of Node IDs -- | |
the Node IDs might be duplicated, not the IP addresses. | |
Having the JSON data, it should be easy to feed any dataset | |
or database with the exit nodes data for further use in analytics. | |
Author.: Joe Lopes <lopes.id> | |
Date...: 2024-07-04 | |
License: MIT | |
''' | |
from datetime import datetime | |
from urllib.request import urlopen | |
from re import compile, DOTALL | |
from json import dumps | |
from sys import stderr | |
# Tor Project's authoritative list of exit nodes and their addresses.
url = 'https://check.torproject.org/exit-addresses'

# One match per "ExitNode" stanza: node fingerprint, Published/LastStatus
# timestamps, and the one-or-more "ExitAddress" lines that follow.
re_exit_node = compile(r'(?P<node>ExitNode\s(?P<node_id>[\dA-Z]+)\nPublished\s(?P<node_published_ts>[\s\d:-]+)\nLastStatus\s(?P<node_status_ts>[\s\d:-]+)\n(?P<exit_addresses>ExitAddress\s[\d.\s:-]+\n?(ExitAddress\s[\d.\s:-]+\n?)*))', DOTALL)

# Splits one "ExitAddress <ip> <date> <time>" line.
# Fix: the previous class [-:\d\b]+ used \b, which inside [] is a BACKSPACE
# character (not a word boundary) and had no space, so only the date part of
# the timestamp was captured and the time-of-day was silently dropped.
re_exit_address = compile(r'ExitAddress\s(?P<address>[^\s]+)\s(?P<address_ts>[\d-]+\s[\d:]+)')

# Output document. Keyed by IP address (not node ID) to ease detection use.
tor_exit_nodes = {
    'timestamp': datetime.now().isoformat(),  # NOTE(review): naive local time; consider UTC
    'source': f'Generated by {__file__} based on {url}',
    'total_exit_addresses': 0,
    'exit_addresses': []
}

# IP addresses already emitted, to keep entries in the list above unique.
unique_addresses = set()
def fetch_nodes(url, timeout=30):
    '''Download the exit-addresses page and return its body as text.

    Returns the UTF-8-decoded page content, or None when the request
    fails for any reason; the error is printed to stderr so callers
    only need to test the return value for None.

    timeout (new, backward-compatible): seconds before giving up on a
    stalled connection — previously urlopen could block indefinitely.
    '''
    try:
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except Exception as e:
        # Broad catch is deliberate: any failure (DNS, HTTP, timeout,
        # decode) becomes a reported best-effort None.
        print(f'Error fetching nodes: {e}', file=stderr)
        return None
##
# MAIN
#
raw = fetch_nodes(url)
if not raw:
    # fetch_nodes() already reported the error on stderr.
    # SystemExit is always available; bare exit() relies on the site module.
    raise SystemExit(1)

# Index of address -> its entry in tor_exit_nodes['exit_addresses'], so a
# repeated address is handled in O(1) instead of re-scanning the whole
# list for every duplicate (the previous code was accidentally O(n^2)).
entry_by_address = {}

for match in re_exit_node.finditer(raw):
    # One node descriptor per ExitNode stanza; attached to every address
    # that stanza lists (content identical to building it per address).
    node = {
        'id': match.group('node_id'),
        'published': match.group('node_published_ts'),
        'last_status': match.group('node_status_ts')
    }
    for a in re_exit_address.finditer(match.group('exit_addresses')):
        address = a.group('address')
        record = {
            'timestamp': a.group('address_ts'),
            'exit_node': node
        }
        if address not in entry_by_address:
            # First sighting: create the address-centric entry.
            entry = {'address': address, 'exit_nodes': [record]}
            tor_exit_nodes['exit_addresses'].append(entry)
            entry_by_address[address] = entry
            unique_addresses.add(address)
        else:
            # Same IP seen under another node: extend its node list.
            entry_by_address[address]['exit_nodes'].append(record)

tor_exit_nodes['total_exit_addresses'] = len(tor_exit_nodes['exit_addresses'])
print(dumps(tor_exit_nodes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment