Skip to content

Instantly share code, notes, and snippets.

@tdeebswihart
Last active January 2, 2023 01:42
Show Gist options
  • Save tdeebswihart/b9866f700ddab82ab35e69540226a29c to your computer and use it in GitHub Desktop.
Save tdeebswihart/b9866f700ddab82ab35e69540226a29c to your computer and use it in GitHub Desktop.
Compile a blocklist for DNSCloak on iOS from multiple sources
# starter blocklist that contains a few rules that will be used to minimize
# the hosts provided by the urls in the script
*-analytics.*
a.*
ad.*
ad[0-9].*
ads.*
adserv*.*
adserver.*
adtech.*
adtrack.*
adv.*
advert*.*
analytics.*
c.*
collector.*
counter.*
cpm.*
doubleciick.*
doubleclick.*
id.*
log.*
logger.*
logs.*
lp.*
marketing.*
metric.*
metrics.*
mkto-*.com
oascentral.*
openx.*
ox.*
pagerank*-*.*
pagerank*.*
pixel.*
protection.*police.com
pussl*.com
s.*
smetrics.*
sp.*
stat.*.*
stats.*.*
track.*
tracker.*
trackers.*
tracking.*
traffic.*
trk.*
webstats.*
"""
Designed for Pythonista 3
"""
import hashlib
import re
import requests
import os
# Remote hosts-format lists that are downloaded and merged.
sources = [
    'https://raw.githubusercontent.com/jawz101/MobileAdTrackers/master/hosts',
    'https://raw.githubusercontent.com/BlackJack8/iOSAdblockList/master/Hosts.txt',
    'https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Hosts/GoodbyeAds.txt',
]

# Blocklist file: read as the starter rule set and overwritten with the result.
bf = 'blocklist.txt'
# Optional local hosts-format file merged in alongside the remote sources.
hf = 'hosts'

# Tree of hostname labels, keyed from the TLD inwards (filled by addhost).
hs = {}
# Raw glob rules read from the blocklist file.
pats = set()
# The same glob rules compiled to regular expressions.
comp = set()
def parse_hostfile_line(l):
    """Return the hostname from one line of a hosts-format file.

    The hostname is taken to be the last whitespace-separated field before
    any ``#`` comment, so ``0.0.0.0 ads.example.com`` and a bare
    ``ads.example.com`` both give ``ads.example.com``.

    Args:
        l: A single line of text, with or without its trailing newline.

    Returns:
        The hostname, or ``''`` when the line is blank or only a comment.
    """
    # Drop any trailing comment first so that its words are never mistaken
    # for the hostname.
    before_comment = l.split('#', 1)[0]
    # split() with no argument splits on any run of whitespace, so tab
    # separated hosts files are handled as well as space separated ones.
    fields = before_comment.split()
    return fields[-1] if fields else ''
def addhost(h, hs):
    """Insert hostname *h* into the nested label tree *hs* in place.

    The hostname is split on dots and stored from the last label to the
    first, one dict level per label. The node reached for the full hostname
    is tagged with a ``'*'`` key to record that the name itself was listed.

    Args:
        h: Hostname such as ``'ads.example.com'``.
        hs: Dict of dicts holding the tree; it is mutated.
    """
    node = hs
    for label in h.split('.')[::-1]:
        # Descend into the child for this label, creating it when missing.
        node = node.setdefault(label, {})
    node['*'] = {}
def gethosts(hs, l=0):
    """Yield the hostnames stored in the nested label tree *hs*.

    A node with no children, or one carrying the ``'*'`` marker, is emitted
    as is and its children are not visited: per the original author's note,
    in DNSCloak an entry such as ``ads.foo`` blocks the domain and all of
    its subdomains, so the longer names add nothing.

    Args:
        hs: Dict of dicts keyed by hostname label, outermost label first.
        l: Recursion depth; passed along to recursive calls but otherwise
            unused.

    Yields:
        Dotted hostnames, with any leading or trailing dots removed from
        the names assembled out of child results.
    """
    for label, children in hs.items():
        if not children or '*' in children:
            yield label
            continue
        yield from (
            f'{child}.{label}'.strip('.')
            for child in gethosts(children, l=l + 1)
        )
# Number of input lines consumed so far, across all inputs.
count = 0
# Download every remote list and merge its hosts into the label tree.
for source in sources:
    try:
        res = requests.get(source, timeout=5)
    except requests.RequestException as err:
        # Best effort: an unreachable source is reported and skipped so the
        # remaining sources can still be processed.
        print(f'skipping {source}: {err}')
        continue
    if res.status_code != 200:
        print(f'skipping {source}: HTTP {res.status_code}')
        continue
    for line in res.text.split('\n'):
        # Comment lines are skipped, as they are for the local hosts file.
        if line.startswith('#'):
            continue
        addhost(parse_hostfile_line(line), hs)
        count += 1
# Merge the optional local hosts file, ignoring lines that begin with '#'.
if os.path.exists(hf):
    with open(hf, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                count += 1
                addhost(parse_hostfile_line(line), hs)
# Read the existing blocklist: glob rules become compiled patterns used to
# prune hosts later, plain hostnames are merged into the label tree.
if os.path.exists(bf):
    with open(bf, 'r') as f:
        content = f.read()
    for h in content.splitlines():
        h = h.strip()
        # Skip blank lines and any line that carries a comment.
        if not h or '#' in h:
            continue
        count += 1
        if '*' in h or '[' in h:
            h = h.replace('(', '').replace(')', '')
            pats.add(h)
            # Translate the glob to a regex: escape literal dots first, then
            # turn each '*' into a lazy wildcard. Bracket classes such as
            # '[0-9]' pass through unchanged.
            comp.add(re.compile(h.replace('.', r'\.').replace('*', '.*?')))
        else:
            addhost(h, hs)
print(f'have {len(comp)} patterns and {count} hosts')

# Keep only dotted hostnames that no compiled pattern already covers.
hosts = {
    h
    for h in gethosts(hs)
    if '.' in h
    and '#' not in h
    and not any(pat.match(h) is not None for pat in comp)
}

# Carry the glob rules themselves over into the output.
for p in pats:
    # NOTE(review): the prefix tested here is '.*', not '*.' - confirm which
    # form of rule this was meant to trim.
    if p.startswith('.*'):
        p = p[2:]
    if '.' in p:
        hosts.add(p)

print(f'Compacted it down to block {len(hosts)} hosts')
body = '\n'.join(sorted(hosts))
with open(bf, 'w') as f:
    f.write(body)
""" set doms = True to look at the most common domains set it to false to find the most common parts of the hostname"""
from collections import Counter
import sys
from pprint import pprint as pp
# Usage: <script> [blocklist file] [spot]
counter = Counter()
# Optional index, counted from the end of `words`, of the single item to tally.
spot = None
# True: tally domains; False: tally the individual hostname labels.
doms = True

name = sys.argv[1] if len(sys.argv) > 1 else 'blocklist.txt'
print(name)
if len(sys.argv) > 2:
    spot = int(sys.argv[2])

with open(name, 'r') as f:
    for line in f:
        chunks = line.strip().split('.')
        # Lines without a dot carry nothing to tally.
        if len(chunks) == 1:
            continue
        if doms:
            # Drop the first label when there are more than two, otherwise
            # keep the whole name.
            words = '.'.join(chunks[1:] if len(chunks) > 2 else chunks)
        else:
            # Every label except the last one.
            words = chunks[:-1]
        if spot is not None:
            # NOTE(review): with doms = True, `words` is a string, so this
            # selects a single character - confirm that is intended.
            try:
                key = words[-spot]
            except IndexError:
                # Name too short to have an item at that position; skip it
                # rather than abort the whole run.
                continue
            counter[key] += 1
        elif isinstance(words, list):
            counter.update(words)
        else:
            counter[words] += 1

pp(counter.most_common(20))
for k, _ in counter.most_common(30):
    print(k)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment