Last active
January 2, 2023 01:42
-
-
Save tdeebswihart/b9866f700ddab82ab35e69540226a29c to your computer and use it in GitHub Desktop.
Compile a blocklist for DNSCloak on iOS from multiple sources
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Starter blocklist: a few wildcard rules that are used to minimize
# the list of hosts provided by the URLs in the script.
*-analytics.* | |
a.* | |
ad.* | |
ad[0-9].* | |
ads.* | |
adserv*.* | |
adserver.* | |
adtech.* | |
adtrack.* | |
adv.* | |
advert*.* | |
analytics.* | |
c.* | |
collector.* | |
counter.* | |
cpm.* | |
doubleciick.* | |
doubleclick.* | |
id.* | |
log.* | |
logger.* | |
logs.* | |
lp.* | |
marketing.* | |
metric.* | |
metrics.* | |
mkto-*.com | |
oascentral.* | |
openx.* | |
ox.* | |
pagerank*-*.* | |
pagerank*.* | |
pixel.* | |
protection.*police.com | |
pussl*.com | |
s.* | |
smetrics.* | |
sp.* | |
stat.*.* | |
stats.*.* | |
track.* | |
tracker.* | |
trackers.* | |
tracking.* | |
traffic.* | |
trk.* | |
webstats.* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Designed for Pythonista 3 | |
""" | |
import hashlib | |
import re | |
import requests | |
import os | |
# Remote hosts-format lists that the script fetches with requests.
sources = [
    'https://raw.githubusercontent.com/jawz101/MobileAdTrackers/master/hosts',
    'https://raw.githubusercontent.com/BlackJack8/iOSAdblockList/master/Hosts.txt',
    'https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Hosts/GoodbyeAds.txt',
]

# Blocklist file: read for existing rules, then overwritten with the result.
bf = 'blocklist.txt'
# Local hosts file that is merged in when it exists.
hf = 'hosts'

# Nested dict of blocked hosts, keyed label by label starting at the TLD.
hs = dict()
# Wildcard rules as text (pats) and as compiled regexes (comp).
pats, comp = set(), set()
def parse_hostfile_line(l):
    """Return the hostname from one line of a hosts file.

    The hostname is the last whitespace-separated field of the line, so both
    ``0.0.0.0 ads.example.com`` and a bare ``ads.example.com`` yield
    ``ads.example.com``. Fields may be separated by spaces or tabs, and
    anything from a ``#`` onwards is treated as a comment and ignored.

    Args:
        l: A single line of text, with or without its trailing newline.

    Returns:
        The hostname, or an empty string when the line is blank or holds
        only a comment.
    """
    # Drop the comment first so its words are never mistaken for a host.
    fields = l.split('#', 1)[0].split()
    return fields[-1] if fields else ''
def addhost(h, hs):
    """Record host *h* in the nested dict *hs*, which is modified in place.

    The host is split on dots and stored one label per nesting level,
    starting with the last label (the TLD). The innermost level receives a
    ``'*'`` key marking that the host itself was added.

    Args:
        h: Hostname such as ``'ads.example.com'``.
        hs: Nested dict of labels to extend.
    """
    node = hs
    for label in h.split('.')[::-1]:
        # Reuse the existing level for this label or create an empty one.
        node = node.setdefault(label, {})
    node['*'] = {}
def gethosts(hs, l=0):
    """Yield the shortest hosts stored in the nested label dict *hs*.

    A level is emitted as soon as it has no children or carries a ``'*'``
    marker; deeper entries below it are not visited. This relies on DNSCloak
    treating a rule such as ``ads.foo`` as blocking that domain and all of
    its subdomains, so the parent entry already covers its children.

    Args:
        hs: Nested dict of labels, TLD at the outermost level.
        l: Recursion depth; passed along but not otherwise used.

    Yields:
        Dotted hostnames with leading and trailing dots stripped.
    """
    for label, children in hs.items():
        if not children or '*' in children:
            yield label
            continue
        yield from (
            f'{rest}.{label}'.strip('.')
            for rest in gethosts(children, l=l + 1)
        )
# Number of input lines (remote lists, local hosts file, existing blocklist)
# fed into the compaction below; reported in the first status message.
count = 0

# 1. Remote hosts lists. Fetching is best-effort: a source that cannot be
#    downloaded is reported and skipped instead of aborting the run.
for source in sources:
    try:
        res = requests.get(source, timeout=5)
    except requests.RequestException as err:
        print(f'skipping {source}: {err}')
        continue
    if res.status_code != 200:
        print(f'skipping {source}: HTTP {res.status_code}')
        continue
    for line in res.text.splitlines():
        # Skip comment lines, exactly as for the local hosts file below;
        # otherwise the last word of a comment would be stored as a host.
        if line.startswith('#'):
            continue
        addhost(parse_hostfile_line(line), hs)
        count += 1

# 2. Optional local hosts file.
if os.path.exists(hf):
    with open(hf, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            count += 1
            addhost(parse_hostfile_line(line), hs)

# 3. Existing blocklist: wildcard rules become patterns, the rest are hosts.
if os.path.exists(bf):
    with open(bf, 'r') as f:
        content = f.read()
    for h in content.splitlines():
        if '#' in h:
            continue
        count += 1
        if '*' in h or '[' in h:
            h = h.replace('(', '').replace(')', '')
            pats.add(h)
            # Translate the rule to a regex: literal dots, '*' as a lazy
            # wildcard. Character classes such as [0-9] pass through as-is.
            comp.add(re.compile(h.replace('.', r'\.').replace('*', '.*?')))
        else:
            addhost(h, hs)

print(f'have {len(comp)} patterns and {count} hosts')

# Keep only the hosts that no wildcard rule already covers.
hosts = set()
for h in gethosts(hs):
    if '.' not in h or '#' in h:
        continue
    if not any(pat.match(h) for pat in comp):
        hosts.add(h)

# The wildcard rules themselves are written back so they survive the rewrite.
for p in pats:
    if p.startswith('.*'):
        p = p[2:]
    if '.' in p:
        hosts.add(p)

print(f'Compacted it down to block {len(hosts)} hosts')
body = '\n'.join(sorted(hosts))
with open(bf, 'w') as f:
    f.write(body)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" set doms = True to look at the most common domains set it to false to find the most common parts of the hostname""" | |
from collections import Counter | |
import sys | |
from pprint import pprint as pp | |
counter = Counter()
# Optional second command-line argument: tally only the item at this
# position, counted from the end of `words`, for each line.
spot = None
# True: count domains (the hostname minus its first label when it has more
# than two labels, otherwise the whole hostname).
# False: count the individual hostname labels, excluding the last one.
doms = True

# First command-line argument: file to analyse.
name = sys.argv[1] if len(sys.argv) > 1 else 'blocklist.txt'
print(name)
if len(sys.argv) > 2:
    spot = int(sys.argv[2])

with open(name, 'r') as f:
    for line in f:
        chunks = line.strip().split('.')
        if len(chunks) == 1:
            # No dot on this line, so there is nothing to count.
            continue
        if doms:
            words = '.'.join(chunks[1:] if len(chunks) > 2 else chunks)
        else:
            words = chunks[:-1]
        if spot is not None:
            # NOTE(review): with doms = True, `words` is a string, so this
            # selects a single character - presumably `spot` is meant to be
            # used with doms = False; confirm.
            try:
                key = words[-spot]
            except IndexError:
                # Line has fewer parts than `spot`; skip it rather than
                # abort the whole run.
                continue
            counter[key] += 1
        elif isinstance(words, list):
            counter.update(words)
        else:
            counter[words] += 1

pp(counter.most_common(20))
for k, _ in counter.most_common(30):
    print(k)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment