Skip to content

Instantly share code, notes, and snippets.

@ngadmini
Last active October 16, 2024 16:19
Show Gist options
  • Save ngadmini/8bcabcb0b31f02619e7b638065621f75 to your computer and use it in GitHub Desktop.
Save ngadmini/8bcabcb0b31f02619e7b638065621f75 to your computer and use it in GitHub Desktop.
tlds_validation.sh AND invalid_tlds_remover.sh
#!/usr/bin/env bash
# TAGS
# invalid_tlds_remover.sh v2.59
# AUTHOR
# [email protected]
# https://gist.github.com/ngadmini
# https://github.com/ngadmini
# TL;DR
# https://github.com/ngadmini/Grabbing-Blacklist-for-Bind9-RPZ/blob/196230c16a285a9c46748622749be98d9712543d/libs/grab_duplic.sh#L94
# Delete from the blacklist files every domain whose last label is not a TLD
# known to IANA. Work files (iana.tlds, false.tlds, invalid.tlds) are written
# next to this script; the blacklist files are edited in place.
set -e
export LC_ALL=C LC_COLLATE=C
PATH=/usr/local/bin:/usr/bin:/bin:${PATH}
_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
_SRC=~/Documents/rpz_10.x # change to your workdir
_TLD="http://data.iana.org/TLD/tlds-alpha-by-domain.txt"
# REPLACE txt.{adult,malware,publicite,redirector,trust+} WITH YOUR files
_FILES=("${_SRC}"/txt.{adult,malware,publicite,redirector,trust+})
clear
printf "\n[INFO] starting %s ...\n" "${0##*/}"
cd "${_DIR}"
# Remove only the work files of a previous run; anything else that happens to
# live beside the script is left alone.
rm -f -- iana.tlds false.tlds invalid.tlds
# Fail before touching anything if one of the blacklist files is missing.
for _F in "${_FILES[@]}"; do
  if [[ ! -f ${_F} ]]; then
    printf "[FAIL] %s not found, exiting\n" "${_F}"
    exit 1
  fi
done
printf "[INFO] download %s\n" "${_TLD}"
# -f: an HTTP error yields no body instead of an error page.
curl -fs "${_TLD}" | sed '/#/d;s/[A-Z]/\L&/g' > iana.tlds
# An empty control list would mark every TLD as invalid and the in-place sed
# below would then delete every domain, so stop here.
if [[ ! -s iana.tlds ]]; then
  printf "[FAIL] could not download %s, exiting\n" "${_TLD}"
  exit 1
fi
printf "[INFO] save as iana.tlds. use it's as a control to validating TLDs\n"
printf "[INFO] searching invalid TLDs\n"
# Last dot-separated label of every line that contains a dot, de-duplicated.
sort -us "${_FILES[@]}" \
  | sed -E '/[.]/!d' | awk -F. '{print $NF}' | sort -u -s > false.tlds
# Keep the labels of false.tlds that do not appear in iana.tlds.
awk 'FILENAME == ARGV[1] && FNR==NR{a[$1];next} !($1 in a)' iana.tlds false.tlds > invalid.tlds
if [[ -s invalid.tlds ]]; then
  printf "[INFO] creating regex of invalid TLDs\n"
  # Escape ERE metacharacters and the address delimiter, then join the labels
  # with '|'. paste also handles a single-line list, where "sed N" would quit
  # before wrapping the label into a delete command.
  _RE=$(sed 's#[][\\/.^$*+?(){}|]#\\&#g' invalid.tlds | paste -sd'|' -)
  # Resulting sed script: /\.(tld1|tld2|...)$/d
  printf '/\\.(%s)$/d\n' "${_RE}" > invalid.tlds
  printf "[INFO] removing invalid TLDs\n"
  sed -E -i -f invalid.tlds "${_FILES[@]}"
else
  printf "[INFO] not found invalid TLDS\n"
fi
printf "[DONE] bye!\n\n"
exit 0
#!/usr/bin/env bash
# TAGS
# tlds_validation.sh v2.27
# https://gist.github.com/ngadmini/8bcabcb0b31f02619e7b638065621f75
# AUTHOR
# [email protected]
# TL;DR
# capture invalid TLDs in ${_url2}, except:
# - domains with non-ASCII characters: [^\x00-\x7F]
# - IPv4 addresses and domains ending with a port number: '[:.]?\d+$'
# to check those exceptions:
# ~ LC_COLLATE=C grep -Pn '([^\x00-\x7F]|[A-Z]|[:.]?\d+$)' domains_isp
# ~ LC_COLLATE=C grep -Pc '([^[:ascii:]]|[A-Z]|[:.]?\d+$)' domains_isp
# Abort on the first failing command; collate bytewise.
set -e
LC_COLLATE=C
export LC_COLLATE
PATH="/usr/local/bin:/usr/bin:/bin:${PATH}"
# Absolute path of the directory holding this script.
_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Remote inputs: the IANA TLD list and the domain list to be validated.
_url1='http://data.iana.org/TLD/tlds-alpha-by-domain.txt'
_url2='https://trustpositif.kominfo.go.id/assets/db/domains_isp'
# sed program for _url1: drop lines containing '#', lowercase A-Z.
_reg1='/#/d;s/[A-Z]/\L&/g'
# sed program for _url2: lowercase A-Z, drop lines ending in ':' or '.'
# followed by digits.
_reg2='s/[A-Z]/\L&/g;/\(:\|\.\)[0-9]\{1,\}$/d'
# Print the names of the tld.* directories found directly in the current
# directory, one per line, without the leading "./", sorted.
f_tld() {
  find . -maxdepth 1 -type d -name "tld.*" \
    | sed -r 's|\./||' \
    | sort
}
# Succeed when a curl command can be resolved; otherwise print a failure
# notice and return 1.
f_curl() {
  command -v curl > /dev/null 2>&1 && return 0
  printf "[FAIL] curl not installed, exiting\n"
  return 1
}
# Probe ${_url1} and ${_url2} with a HEAD request and make sure each answers
# with a 2xx status.
# Globals: _url1, _url2 (read)
# Returns: 0 when both answer 2xx; 1 (after naming the file) on the first
#          URL that does not.
f_check() {
  local url code
  printf "[INFO] check availability remote files\n"
  for url in "${_url1}" "${_url2}"; do
    # -k mirrors the download step, which also skips certificate checks.
    # curl reports 000 when no HTTP response was received at all; "|| true"
    # keeps "set -e" from aborting before that case is reported below.
    code=$(curl -k -so /dev/null -Iw "%{http_code}" "${url}") || true
    if [[ ! ${code} =~ ^2[0-9]{2}$ ]]; then
      printf "[INFO] remote files: %s NOT available, exiting\n" "${url##*/}"
      return 1
    fi
  done
}
# Compare invalid_tlds.txt of the two tld.* directories reported by f_tld and
# tell whether the list changed; on a change print a diff from the older
# directory to the newer one.
# Globals: ar_tld (reset and refilled from f_tld)
# Returns: 0; does nothing unless exactly two tld.* directories exist.
f_changing() {
  local old_dir new_dir
  local -a ar_old=() ar_new=()
  unset ar_tld
  mapfile -t ar_tld < <(f_tld)
  [[ ${#ar_tld[@]} = 2 ]] || return 0
  # Order the pair by modification time. With identical mtimes the f_tld
  # order is kept, so the comparison still runs on real data.
  old_dir=${ar_tld[0]}
  new_dir=${ar_tld[1]}
  if [[ ${old_dir} -nt ${new_dir} ]]; then
    old_dir=${ar_tld[1]}
    new_dir=${ar_tld[0]}
  fi
  mapfile -t ar_old < "${old_dir}/invalid_tlds.txt"
  mapfile -t ar_new < "${new_dir}/invalid_tlds.txt"
  if [[ ${ar_old[*]} == "${ar_new[*]}" ]]; then
    printf "[INFO] there are NO change in invalid TLDs\n"
  else
    printf "[INFO] there are many changes in invalid TLDs :\n"
    # Always old -> new, so "<" lines vanished and ">" lines are new. diff
    # exits 1 when the files differ; inside $(...) that cannot trip "set -e".
    printf "%s\n" "$(diff -- "${old_dir}/invalid_tlds.txt" "${new_dir}/invalid_tlds.txt")"
  fi
}
# Main flow: download the IANA TLD list and the domain list, work out which
# TLDs of the domain list are unknown to IANA, and leave all result files in a
# fresh tld.XXXX directory beside this script.
printf "\n[INFO] starting TLDs validation, target: %s\n" "${_url2##*/}"
f_curl
# f_tld and f_changing look at the current directory and the result directory
# is created in ${_DIR}, so work from there regardless of the caller's cwd.
cd "${_DIR}"
mapfile -t ar_tld < <(f_tld)
# check if there are more than 1 tld.* dir. remove except the latest.
if [[ ${#ar_tld[@]} -gt 1 ]]; then
  # Oldest first by modification time; "head -n -1" drops the newest entry,
  # which is the one to keep. One array element per directory so that rm gets
  # each name as its own argument.
  mapfile -t ar_older < <(find . -maxdepth 1 -type d -name "tld.*" -printf '%T+ %f\n' \
    | sort | head -n -1 | cut -d' ' -f2-)
  printf "[INFO] there are %s dir, please removing the older: %s\n" "${#ar_tld[@]}" "${ar_older[*]}"
  read -r -p "do you want to remove it's ? [Y/n] " answer
  case ${answer:0:1} in
    y|Y|"") printf "[INFO] removing dir: %s\n" "${ar_older[*]}"
      rm -rf -- "${ar_older[@]}";;
    *) printf "\n"
      exit 1;;
  esac
fi
f_check
# No -t here: combined with -p it would make mktemp prefer $TMPDIR over
# ${_DIR} whenever TMPDIR is set.
_tlds=$(mktemp -d -p "${_DIR}" tld.XXXX)
cd "${_tlds}"
printf "[INFO] identifying invalid TLDs\n"
curl -fs "${_url1}" | sed "${_reg1}" > tlds-alpha-by-domain.txt
# With an empty control list every TLD would be reported as invalid.
if [[ ! -s tlds-alpha-by-domain.txt ]]; then
  printf "[FAIL] could not download %s, exiting\n" "${_url1##*/}"
  exit 1
fi
# --retry-all-errors only takes effect together with --retry.
curl --retry 3 --retry-all-errors -f -k -sO "${_url2}"
# Last dot-separated label of every remaining line, ASCII-only, de-duplicated.
sed "${_reg2}" "${_url2##*/}" | awk -F. '{print $NF}' | sort -u | grep -Pv "[^\x00-\x7F]" > tlds_trust.txt
# Both inputs are duplicate-free, so "uniq -d" yields their intersection ...
sort {tlds_trust,tlds-alpha-by-domain}.txt | uniq -d > valid_tlds.txt
# ... and "uniq -u" yields the labels of tlds_trust.txt that are not valid.
sort {valid_tlds,tlds_trust}.txt | uniq -u > invalid_tlds.txt
printf "[INFO] there are %'d invalid TLDs AND " "$(wc -l < invalid_tlds.txt)"
# regex.txt: a sed delete command of the form /\.\(tld1\|tld2\|...\)$/d
sed ':a;N;$!ba;s/\n/\\\|/g' invalid_tlds.txt | sed 's/^/\/\\.\\\(/;s/$/\\\)\$\/d/' > regex.txt
# Turn every label into the grep pattern \.label$ (original kept as .bak).
sed -i.bak 's/$/\$/g;s/^/\\\./g' invalid_tlds.txt
printf "%'d domains with invalid TLDs\n" "$(grep -c -f invalid_tlds.txt "${_url2##*/}")"
cd "${_DIR}"
f_changing
printf "[HINT] please recheck with 'grep -f %s/invalid_tlds.txt %s'\n" "${_tlds##*/}" "${_tlds##*/}/${_url2##*/}"
printf " and check dir %s for complete result\nbye!\n\n" "${_tlds##*/}"
exit 0
@ngadmini
Copy link
Author

ngadimin@Q4OS:~/Documents/TLDs/tlds$ ./tlds_validation.sh 

[INFO] starting TLDs validation, target: domains_isp
[INFO] check availability remote files
[INFO] identifying invalid TLDs
[INFO] there are 188 invalid TLDs AND 458 domains with invalid TLDs
[INFO] there are NO change in invalid TLDs
[HINT] please recheck with 'grep -f tld.Q4xL/invalid_tlds.txt tld.Q4xL/domains_isp'
       and check dir tld.Q4xL for complete result
bye!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment