tdeebswihart · January 2, 2023 01:42
diff --git a/blocklist.txt b/blocklist.txt
 # starter blocklist that contains a few rules that will be used to minimize
 # the hosts provided by the urls in the script
 *-analytics.*
 a.*
 ad.*
 ad[0-9].*
 ads.*
 adserv*.*
 adserver.*
 adtech.*
 adtrack.*
 adv.*
 advert*.*
 analytics.*
 c.*
 collector.*
 counter.*
 cpm.*
 doubleciick.*
 doubleclick.*
 id.*
 log.*
 logger.*
 logs.*
 lp.*
 marketing.*
 metric.*
 metrics.*
 mkto-*.com
 oascentral.*
 openx.*
 ox.*
 pagerank*-*.*
 pagerank*.*
 pixel.*
 protection.*police.com
 pussl*.com
 s.*
 smetrics.*
 sp.*
 stat.*.*
 stats.*.*
 track.*
 tracker.*
 trackers.*
 tracking.*
 traffic.*
 trk.*
 webstats.*
diff --git a/dnscloak_blocklist_compile.py b/dnscloak_blocklist_compile.py
 """
 Designed for Pythonista 3
 """

 import hashlib
 import re
 import requests
 import os

 # Remote hosts-format lists that get downloaded and merged.
 sources = [
     "https://raw.githubusercontent.com/jawz101/MobileAdTrackers/master/hosts",
     "https://raw.githubusercontent.com/BlackJack8/iOSAdblockList/master/Hosts.txt",
     "https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Hosts/GoodbyeAds.txt",
 ]
 bf = "blocklist.txt"  # blocklist file: read as input, then overwritten with the result
 hf = "hosts"  # optional local hosts file, merged in when it exists
 hs = dict()  # trie of hostname labels, keyed from the rightmost label inwards
 pats = set()  # raw wildcard patterns taken from the blocklist file
 comp = set()  # compiled regular expressions built from those patterns


 def parse_hostfile_line(l):
     """Return the hostname field of one hosts-file line.

     The fields of the line may be separated by any run of spaces or
     tabs, and the line may end with a ``#`` comment.

     Args:
         l: One raw line, with or without its trailing newline.

     Returns:
         The last whitespace-separated field that precedes any ``#``
         comment, or ``''`` when the line is blank or comment-only.
     """
     # Cut the comment off first so its words are never taken for the host.
     fields = l.split('#', 1)[0].split()
     return fields[-1] if fields else ''


 def addhost(h, hs):
     """Insert hostname *h* into the label trie *hs*, mutating it in place.

     The name is split on '.' and stored right to left, so
     'ads.example.com' ends up at hs['com']['example']['ads'].  The node
     of the leftmost label is given an empty '*' child.
     """
     node = hs
     for label in h.split('.')[::-1]:
         node = node.setdefault(label, {})
     node['*'] = {}


 def gethosts(hs, l=0):
     """Yield the hostnames stored in the label trie *hs*.

     A node is emitted as soon as it has no children or carries a '*'
     child; nothing below such a node is visited.  Per the original
     author's note, in dnscloak an entry such as 'ads.foo' blocks the
     domain and all of its subdomains, so deeper entries are redundant.

     Args:
         hs: Nested dict mapping a label to the dict of labels to its left.
         l: Recursion depth; passed along on recursion, otherwise unused.

     Yields:
         Dotted names, with leading/trailing dots stripped at each level.
     """
     for label, children in hs.items():
         if children and '*' not in children:
             for tail in gethosts(children, l=l + 1):
                 yield f'{tail}.{label}'.strip('.')
         else:
             yield label


 # Download every remote list and feed each of its lines into the trie.
 count = 0
 for source in sources:
     # Best effort: a list that cannot be fetched is reported and skipped so
     # the remaining sources and the local files are still processed.
     try:
         res = requests.get(source, timeout=5)
     except requests.RequestException as err:
         print(f'skipping {source}: {err}')
         continue
     if res.status_code != 200:
         print(f'skipping {source}: HTTP {res.status_code}')
         continue
     for line in res.text.split('\n'):
         addhost(parse_hostfile_line(line), hs)
         count += 1

 # Merge the optional local hosts file, skipping lines that start with '#'.
 if os.path.exists(hf):
     with open(hf, 'r') as f:
         for line in f:
             if line.startswith('#'):
                 continue
             count += 1
             addhost(parse_hostfile_line(line), hs)
 # Read the blocklist: wildcard/character-class lines become regex patterns,
 # every other line is added to the trie as a plain host.
 if os.path.exists(bf):
     with open(bf, 'r') as f:
         content = f.read()
     for h in content.splitlines():
         if '#' in h:
             continue
         count += 1
         if '*' in h or '[' in h:
             h = h.replace('(', '').replace(')', '')
             pats.add(h)
             # Literal dots are escaped and '*' becomes a lazy wildcard;
             # bracket classes such as [0-9] pass through unchanged.
             comp.add(re.compile(h.replace('.', r'\.').replace('*', '.*?')))
         else:
             addhost(h, hs)
 # Keep only the hosts that no pattern already covers, add the patterns
 # themselves, and write the sorted result back to the blocklist file.
 print(f'have {len(comp)} patterns and {count} hosts')
 hosts = set()
 for h in set(gethosts(hs)):
     if '.' in h and '#' not in h:
         if not any(pat.match(h) is not None for pat in comp):
             hosts.add(h)
 for p in pats:
     trimmed = p[2:] if p.startswith('.*') else p
     if '.' in trimmed:
         hosts.add(trimmed)
 print(f'Compacted it down to block {len(hosts)} hosts')
 with open(bf, 'w') as f:
     f.write('\n'.join(sorted(hosts)))
diff --git a/top_words_or_domains.py b/top_words_or_domains.py
 """Set doms = True to look at the most common domains; set it to False to find the most common parts of the hostname."""

 from collections import Counter
 import sys
 from pprint import pprint as pp

 # Count how often each domain (doms = True) or hostname part (doms = False)
 # occurs in the file named by argv[1] (default 'blocklist.txt').
 counter = Counter()
 spot = None  # optional argv[2]: count only the item at index -spot
 doms = True
 name = sys.argv[1] if len(sys.argv) > 1 else 'blocklist.txt'
 print(name)
 if len(sys.argv) > 2:
     spot = int(sys.argv[2])

 with open(name, 'r') as f:
     for line in f:
         chunks = line.strip().split('.')
         if len(chunks) == 1:
             continue
         if doms:
             # Drop the leftmost label when there are more than two.
             words = '.'.join(chunks[1:] if len(chunks) > 2 else chunks)
         else:
             words = chunks[:-1]
         if spot is not None:
             # NOTE(review): with doms = True `words` is a string, so this
             # indexes a single character - confirm that is intended.
             try:
                 counter[words[-spot]] += 1
             except IndexError:
                 # The line has fewer parts than requested; skip it
                 # instead of aborting the whole run.
                 continue
         elif isinstance(words, list):
             counter.update(words)
         else:
             counter[words] += 1

 pp(counter.most_common(20))
 for k, _ in counter.most_common(30):
     print(k)
	# starter blocklist that contains a few rules that will be used to minimize
	# the hosts provided by the urls in the script
	*-analytics.*
	a.*
	ad.*
	ad[0-9].*
	ads.*
	adserv*.*
	adserver.*
	adtech.*
	adtrack.*
	adv.*
	advert*.*
	analytics.*
	c.*
	collector.*
	counter.*
	cpm.*
	doubleciick.*
	doubleclick.*
	id.*
	log.*
	logger.*
	logs.*
	lp.*
	marketing.*
	metric.*
	metrics.*
	mkto-*.com
	oascentral.*
	openx.*
	ox.*
	pagerank*-*.*
	pagerank*.*
	pixel.*
	protection.*police.com
	pussl*.com
	s.*
	smetrics.*
	sp.*
	stat.*.*
	stats.*.*
	track.*
	tracker.*
	trackers.*
	tracking.*
	traffic.*
	trk.*
	webstats.*
	"""
	Designed for Pythonista 3
	"""

	import hashlib
	import re
	import requests
	import os

	# Remote hosts-format lists that get downloaded and merged.
	sources = [
	    "https://raw.githubusercontent.com/jawz101/MobileAdTrackers/master/hosts",
	    "https://raw.githubusercontent.com/BlackJack8/iOSAdblockList/master/Hosts.txt",
	    "https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Hosts/GoodbyeAds.txt",
	]
	bf = "blocklist.txt"  # blocklist file: read as input, then overwritten with the result
	hf = "hosts"  # optional local hosts file, merged in when it exists
	hs = dict()  # trie of hostname labels, keyed from the rightmost label inwards
	pats = set()  # raw wildcard patterns taken from the blocklist file
	comp = set()  # compiled regular expressions built from those patterns


	def parse_hostfile_line(l):
	    """Return the hostname field of one hosts-file line.

	    The fields of the line may be separated by any run of spaces or
	    tabs, and the line may end with a ``#`` comment.

	    Args:
	        l: One raw line, with or without its trailing newline.

	    Returns:
	        The last whitespace-separated field that precedes any ``#``
	        comment, or ``''`` when the line is blank or comment-only.
	    """
	    # Cut the comment off first so its words are never taken for the host.
	    fields = l.split('#', 1)[0].split()
	    return fields[-1] if fields else ''


	def addhost(h, hs):
	    """Insert hostname *h* into the label trie *hs*, mutating it in place.

	    The name is split on '.' and stored right to left, so
	    'ads.example.com' ends up at hs['com']['example']['ads'].  The node
	    of the leftmost label is given an empty '*' child.
	    """
	    node = hs
	    for label in reversed(h.split('.')):
	        node = node.setdefault(label, {})
	    node['*'] = {}


	def gethosts(hs, l=0):
	    """Yield the hostnames stored in the label trie *hs*.

	    A node is emitted as soon as it has no children or carries a '*'
	    child; nothing below such a node is visited.  Per the original
	    author's note, in dnscloak an entry such as 'ads.foo' blocks the
	    domain and all of its subdomains, so deeper entries are redundant.

	    Args:
	        hs: Nested dict mapping a label to the dict of labels to its left.
	        l: Recursion depth; passed along on recursion, otherwise unused.

	    Yields:
	        Dotted names, with leading/trailing dots stripped at each level.
	    """
	    for p, sub in hs.items():
	        if len(sub) == 0 or '*' in sub:
	            yield p
	        else:
	            for s in gethosts(sub, l=l + 1):
	                yield f'{s}.{p}'.strip('.')


	# Download every remote list and feed each of its lines into the trie.
	count = 0
	for source in sources:
	    # Best effort: a list that cannot be fetched is reported and skipped so
	    # the remaining sources and the local files are still processed.
	    try:
	        res = requests.get(source, timeout=5)
	    except requests.RequestException as err:
	        print(f'skipping {source}: {err}')
	        continue
	    if res.status_code != 200:
	        print(f'skipping {source}: HTTP {res.status_code}')
	        continue
	    for line in res.text.split('\n'):
	        addhost(parse_hostfile_line(line), hs)
	        count += 1

	# Merge the optional local hosts file, skipping lines that start with '#'.
	if os.path.exists(hf):
	    with open(hf, 'r') as f:
	        for line in f:
	            if line.startswith('#'):
	                continue
	            count += 1
	            addhost(parse_hostfile_line(line), hs)
	# Read the blocklist: wildcard/character-class lines become regex patterns,
	# every other line is added to the trie as a plain host.
	if os.path.exists(bf):
	    with open(bf, 'r') as f:
	        content = f.read()
	    for h in content.splitlines():
	        if '#' in h:
	            continue
	        count += 1
	        if '*' in h or '[' in h:
	            h = h.replace('(', '').replace(')', '')
	            pats.add(h)
	            # Literal dots are escaped and '*' becomes a lazy wildcard;
	            # bracket classes such as [0-9] pass through unchanged.
	            comp.add(re.compile(h.replace('.', r'\.').replace('*', '.*?')))
	        else:
	            addhost(h, hs)
	# Keep only the hosts that no pattern already covers, add the patterns
	# themselves, and write the sorted result back to the blocklist file.
	hosts = []
	print(f'have {len(comp)} patterns and {count} hosts')
	for h in set(gethosts(hs)):
	    if '.' not in h or '#' in h:
	        continue
	    if not any(pat.match(h) is not None for pat in comp):
	        hosts.append(h)
	for p in pats:
	    if p.startswith('.*'):
	        p = p[2:]
	    if '.' in p:
	        hosts.append(p)
	hosts = set(hosts)
	print(f'Compacted it down to block {len(hosts)} hosts')
	body = '\n'.join(sorted(hosts))
	with open(bf, 'w') as f:
	    f.write(body)
	"""Set doms = True to look at the most common domains; set it to False to find the most common parts of the hostname."""

	from collections import Counter
	import sys
	from pprint import pprint as pp

	# Count how often each domain (doms = True) or hostname part (doms = False)
	# occurs in the file named by argv[1] (default 'blocklist.txt').
	counter = Counter()
	spot = None  # optional argv[2]: count only the item at index -spot
	doms = True
	if len(sys.argv) > 1:
	    name = sys.argv[1]
	else:
	    name = 'blocklist.txt'
	print(name)
	if len(sys.argv) > 2:
	    spot = int(sys.argv[2])

	with open(name, 'r') as f:
	    for line in f:
	        chunks = line.strip().split('.')
	        if len(chunks) == 1:
	            continue
	        if doms:
	            # Drop the leftmost label when there are more than two.
	            if len(chunks) > 2:
	                words = '.'.join(chunks[1:])
	            else:
	                words = '.'.join(chunks)
	        else:
	            words = chunks[:-1]
	        if spot is not None:
	            # NOTE(review): with doms = True `words` is a string, so this
	            # indexes a single character - confirm that is intended.
	            try:
	                counter[words[-spot]] += 1
	            except IndexError:
	                # The line has fewer parts than requested; skip it
	                # instead of aborting the whole run.
	                continue
	        elif isinstance(words, list):
	            for word in words:
	                counter[word] += 1
	        else:
	            counter[words] += 1

	pp(counter.most_common(20))
	for k, _ in counter.most_common(30):
	    print(k)