Skip to content

Instantly share code, notes, and snippets.

@mibere
Forked from kimbo/scrape-doh-providers.py
Last active December 25, 2021 14:21
Show Gist options
  • Save mibere/c715a25b9b5c049ec8059f0f86ffe272 to your computer and use it in GitHub Desktop.
Save mibere/c715a25b9b5c049ec8059f0f86ffe272 to your computer and use it in GitHub Desktop.
Scrape DoH provider URLs from cURL's wiki page (see https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS)
#!/usr/bin/env python
#
# Scrape DoH provider URLs from cURL's DNS-over-HTTPS wiki (https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS.md).
#
# Example usage: ./scrape_doh_providers.py '"{} - {}".format(o["url"], o["name"])'
#
import argparse
import re
import urllib.request
# Matches an https URL.  findall() returns (hostname, port, path) tuples in
# which the port still carries its leading ':' and absent parts are ''.
HTTPS_URL_RE = re.compile(
    r'https://'
    r'(?P<hostname>[0-9a-zA-Z._~-]+)'
    r'(?P<port>:[0-9]+)?'
    r'(?P<path>[0-9a-zA-Z._~/-]+)?'
)

# Matches either a Markdown link "[text](target)" (groups 2 and 3 hold the
# text and the target) or, failing that, any text at all (group 4).
PROVIDER_RE = re.compile(r'(\[([^\]]+)\]\(([^)]+))\)|(.*)')

# Hostnames / URLs that are not DoH URLs.
do_not_include = [
    'my.nextdns.io',
    'blog.cloudflare.com',
    'https://blog.cloudflare.com/welcome-hidden-resolver',
    'https://my.nextdns.io/start',
]
def get_doh_providers():
    """Yield DoH endpoints scraped from the provider table of cURL's wiki.

    Downloads the raw Markdown of the DNS-over-HTTPS wiki page and walks it
    line by line.  The first line starting with ``|`` is taken as the table
    header and skipped; every later ``|`` line is parsed as a row whose first
    cell names the provider (plain text or a ``[name](website)`` link) and
    whose second cell is searched for ``https://`` URLs.  Scanning stops at
    the first line starting with ``#`` once the table has been seen.

    Yields:
        dict: one entry per URL found, with the keys

        * ``name`` -- provider name, or None when the first cell has none
        * ``website`` -- link target of the first cell, or None if it is
          not a Markdown link
        * ``url`` -- the URL reassembled as ``https://host[:port][path]``
        * ``hostname`` -- host part of the URL
        * ``port`` -- port digits as a string; ``'443'`` when the URL
          does not specify one
        * ``path`` -- path part of the URL, ``''`` when absent

    Raises:
        urllib.error.URLError: if the wiki page cannot be fetched.
    """
    found_table = False
    with urllib.request.urlopen('https://raw.githubusercontent.com/wiki/curl/curl/DNS-over-HTTPS.md') as fp:
        for raw_line in fp:
            line = raw_line.decode()

            if not line.startswith('|'):
                # The first heading after the table marks the end of the
                # section we care about.
                if found_table and line.startswith('#'):
                    break
                continue

            if not found_table:
                # First table line is the header row: remember and skip it.
                found_table = True
                continue

            cols = line.split('|')
            if len(cols) < 3:
                # No URL column on this row.
                continue

            # PROVIDER_RE falls back to matching any text, so findall()
            # always returns at least one tuple, even for an empty cell.
            provider_match = PROVIDER_RE.findall(cols[1].strip())[0]
            website = provider_match[2] or None
            provider_name = provider_match[1] or provider_match[3] or None
            if provider_name is not None:
                # Drop anything from the first '[' onwards, then trailing
                # spaces.  rstrip() cannot raise on an all-space name the
                # way indexing [-1] in a loop could.
                provider_name = re.sub(r'([^[]+)\s?(.*)', r'\1', provider_name)
                provider_name = provider_name.rstrip(' ') or None

            for hostname, port, path in HTTPS_URL_RE.findall(cols[2]):
                if hostname in do_not_include:
                    continue
                # The regex captures the port with its leading ':'; strip it
                # so the URL is not rebuilt with '::' and 'port' holds only
                # the digits, consistent with the '443' default.
                port = port.lstrip(':')
                yield {
                    'name': provider_name,
                    'website': website,
                    'url': 'https://{}{}{}'.format(hostname, ':{}'.format(port) if port else '', path),
                    'hostname': hostname,
                    'port': port or '443',
                    'path': path,
                }
def main():
    """Print one line per scraped DoH provider, formatted by a user expression.

    The optional positional argument ``format`` is a Python expression that
    is evaluated once per provider with the provider dict bound to ``o``; it
    defaults to ``o["url"]``.

    example: ./scripts/scrape_doh_providers.py '"{} - {}".format(o["url"], o["name"])'
    """
    parser = argparse.ArgumentParser(description='A script to parse DoH provider URLs from cURL\'s wiki page!')
    parser.add_argument(
        'format',
        nargs='?',
        default='o["url"]',
        # The example must be a plain expression: eval() rejects a bare
        # starred form such as *(a, b) with a SyntaxError.
        help='Format of output. Example: \'"{} - {}".format(o["url"], o["name"])\'',
    )
    args = parser.parse_args()
    # The loop variable must stay named `o`: the user's expression refers to it.
    for o in get_doh_providers():
        # SECURITY: eval() executes arbitrary Python taken from the command
        # line.  That is this script's interface, but never pass it a format
        # string that comes from an untrusted source.
        print(eval(args.format))


if __name__ == '__main__':
    main()
@mibere
Copy link
Author

mibere commented May 31, 2020

Source: https://github.com/curl/curl/wiki/DNS-over-HTTPS
Script: https://gist.github.com/kimbo/dd65d539970e3a28a10628f15398247b

python3 scrape-doh-providers.py '"{}".format(o["hostname"])' > doh-servers.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment