Last active
October 16, 2024 16:19
-
-
Save ngadmini/8bcabcb0b31f02619e7b638065621f75 to your computer and use it in GitHub Desktop.
tlds_validation.sh AND invalid_tlds_remover.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# TAGS
#   invalid_tlds_remover.sh v2.59
# AUTHOR
#   [email protected]
#   https://gist.github.com/ngadmini
#   https://github.com/ngadmini
# TL;DR
#   https://github.com/ngadmini/Grabbing-Blacklist-for-Bind9-RPZ/blob/196230c16a285a9c46748622749be98d9712543d/libs/grab_duplic.sh#L94
#
# WHAT IT DOES
#   Downloads the IANA TLD list, collects the last dot-separated label of every
#   line in the source lists, and deletes IN PLACE every line whose last label
#   is not an IANA TLD.  Work files (iana.tlds, false.tlds, invalid.tlds) are
#   written next to this script.
# REQUIRES
#   curl and GNU sed (uses \L and -i).

# pipefail: a failed download must not be hidden by the sed that follows it.
set -eo pipefail
export LC_COLLATE=C
export LC_ALL=C
PATH=/usr/local/bin:/usr/bin:/bin:${PATH}

_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
_SRC=~/Documents/rpz_10.x   # change to your workdir
_TLD="https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
# REPLACE txt.{adult,malware,publicite,redirector,trust+} WITH YOUR files
_LISTS=("${_SRC}"/txt.{adult,malware,publicite,redirector,trust+})

# Print a [FAIL] line on stderr and abort.
f_die() {
  printf '[FAIL] %s\n' "$*" >&2
  exit 1
}

# Cosmetic only: without a terminal `clear` fails and must not stop the run.
clear 2> /dev/null || true
printf "\n[INFO] starting %s ...\n" "${0##*/}"
cd "${_DIR}"

# Refuse to start when a source list is missing, before anything is touched.
for _list in "${_LISTS[@]}"; do
  [[ -f ${_list} ]] || f_die "source file not found: ${_list}"
done

# Remove only the work files of a previous run.
rm -f -- iana.tlds false.tlds invalid.tlds

printf "[INFO] download %s\n" "${_TLD}"
# Drop the comment line and lower-case the list.
curl -fsS "${_TLD}" | sed '/#/d;s/[A-Z]/\L&/g' > iana.tlds
# An empty or bogus control list would mark every TLD as invalid and wipe the
# source lists, so insist on a well-known entry before going on.
grep -qx 'com' iana.tlds || f_die "downloaded TLD list looks wrong, nothing changed"
printf "[INFO] save as iana.tlds. use it's as a control to validating TLDs\n"

printf "[INFO] searching invalid TLDs\n"
# Last label of every line that contains a dot, de-duplicated.
awk -F. 'NF > 1 { print $NF }' "${_LISTS[@]}" | sort -u > false.tlds
# Labels seen in the lists that are absent from the IANA list.
awk 'NR == FNR { known[$1]; next } !($1 in known)' iana.tlds false.tlds > invalid.tlds

if [[ -s invalid.tlds ]]; then
  printf "[INFO] creating regex of invalid TLDs\n"
  # Turn the list into one sed command:  /\.(tld1|tld2|...)$/d
  #  - `$!N;$!ba` instead of `N;$!ba`: GNU sed quits at `N` on the last line,
  #    so a one-line file was never converted by the old idiom;
  #  - regex metacharacters and `/` coming from the data are escaped first so
  #    they cannot break the generated script.
  sed -i ':a;$!N;$!ba;s#[][\/.^$*+?(){}|]#\\&#g;s#\n#|#g;s#^#/\\.(#;s#$#)$/d#' invalid.tlds
  printf "[INFO] removing invalid TLDs\n"
  sed -E -i -f invalid.tlds "${_LISTS[@]}"
else
  printf "[INFO] not found invalid TLDS\n"
fi
printf "[DONE] bye!\n\n"
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# TAGS
#   tlds_validation.sh v2.27
#   https://gist.github.com/ngadmini/8bcabcb0b31f02619e7b638065621f75
# AUTHOR
#   [email protected]
# TL;DR
#   capture invalid TLDs in ${_url2}, except:
#   - domains with non-ASCII characters: [^\x00-\x7F]
#   - IPv4 addresses and domains ending with a port number: '[:.]?\d+$'
#   to inspect those exceptions:
#   ~ LC_COLLATE=C grep -Pn '([^\x00-\x7F]|[A-Z]|[:.]?\d+$)' domains_isp
#   ~ LC_COLLATE=C grep -Pc '([^[:ascii:]]|[A-Z]|[:.]?\d+$)' domains_isp
set -e
export LC_COLLATE=C
PATH=/usr/local/bin:/usr/bin:/bin:${PATH}

# Directory holding this script.
_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# sed program for the IANA list: drop lines containing '#', lower-case the rest
# (\L is a GNU sed extension).
_reg1='/#/d;s/[A-Z]/\L&/g'
# sed program for the target list: lower-case, then drop lines ending in
# ":<digits>" or ".<digits>".
_reg2='s/[A-Z]/\L&/g;/\(:\|\.\)[0-9]\{1,\}$/d'
# Control list of valid TLDs.
_url1="https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
# List whose TLDs are validated.
_url2="https://trustpositif.kominfo.go.id/assets/db/domains_isp"
readonly _DIR _reg1 _reg2 _url1 _url2
# f_tld: list the tld.* directories located directly in the current directory.
# Outputs: one bare directory name per line (no leading "./"), sorted.
f_tld() {
  find . -maxdepth 1 -type d -name "tld.*" -printf '%f\n' | sort
}
# f_curl: make sure curl can be found in PATH.
# Outputs: a [FAIL] line on stderr when curl is missing.
# Returns: 0 when curl is available, 1 otherwise.
f_curl() {
  command -v curl > /dev/null 2>&1 && return 0
  printf "[FAIL] curl not installed, exiting\n" >&2
  return 1
}
# f_check: verify that both remote files answer before anything is downloaded.
# Globals: _url1, _url2 (read)
# Outputs: progress on stdout; the name of the first unavailable file on stderr.
# Returns: 0 when both URLs answer with a 2xx status, 1 otherwise.
f_check() {
  local _X _code
  printf "[INFO] check availability remote files\n"
  for _X in "${_url1}" "${_url2}"; do
    # -k matches the download of ${_url2}, which also skips certificate
    # verification, so the probe is not stricter than the download itself.
    # curl prints 000 when it got no HTTP response at all.
    _code=$(curl -k -s -o /dev/null -I -w "%{http_code}" "${_X}") || true
    if [[ ${_code} == 000 ]]; then
      # No answer to HEAD: retry as a one-byte GET before giving up, in case
      # the server only refuses HEAD requests.
      _code=$(curl -k -s -o /dev/null -r 0-0 -w "%{http_code}" "${_X}") || true
    fi
    if ! [[ ${_code} =~ ^2[0-9]{2}$ ]]; then
      printf "[INFO] remote files: %s NOT available, exiting\n" "${_X##*/}" >&2
      return 1
    fi
  done
}
# f_changing: when exactly two tld.* directories exist in the current
# directory, report whether their invalid_tlds.txt files differ and, if so,
# print the diff from the older result to the newer one.
# Globals: ar_tld (rewritten with the output of f_tld)
# Outputs: an [INFO] verdict (plus the diff) on stdout; [FAIL] on stderr.
# Returns: 0 normally (also when there is nothing to compare), 1 when the two
#          files cannot be compared.
f_changing() {
  local _old _new _delta _rc=0
  unset ar_tld
  mapfile -t ar_tld < <(f_tld)
  [[ ${#ar_tld[@]} -eq 2 ]] || return 0

  # Order the pair by directory mtime.  When neither is newer (equal
  # timestamps) keep the order given by f_tld instead of leaving the pair
  # undefined, which used to be reported as "NO change".
  if [[ ${ar_tld[0]} -nt ${ar_tld[1]} ]]; then
    _old=${ar_tld[1]}
    _new=${ar_tld[0]}
  else
    _old=${ar_tld[0]}
    _new=${ar_tld[1]}
  fi

  # diff exits 0 = identical, 1 = different, >1 = trouble (e.g. missing file).
  _delta=$(diff -- "${_old}/invalid_tlds.txt" "${_new}/invalid_tlds.txt") || _rc=$?
  case ${_rc} in
    0)
      printf "[INFO] there are NO change in invalid TLDs\n"
      ;;
    1)
      printf "[INFO] there are many changes in invalid TLDs :\n"
      printf "%s\n" "${_delta}"
      ;;
    *)
      printf "[FAIL] cannot compare %s and %s\n" "${_old}" "${_new}" >&2
      return 1
      ;;
  esac
}
# ---- main flow --------------------------------------------------------------
# 1. offer to remove all but the newest tld.* result directory
# 2. download the IANA list and the target list into a fresh tld.XXXX directory
# 3. write the valid / invalid TLD lists, regex.txt and print the counts
# 4. compare with the previous result (f_changing)

# A failed download must not be hidden by the filter that follows it.
set -o pipefail
# Result directories are created beside the script, so look for them there
# regardless of where the script was started from.
cd "${_DIR}"

printf "\n[INFO] starting TLDs validation, target: %s\n" "${_url2##*/}"
f_curl
mapfile -t ar_tld < <(f_tld)
# check if there are more than 1 tld.* dir. remove except the latest.
if [[ ${#ar_tld[@]} -gt 1 ]]; then
  # Every tld.* directory except the most recently modified one, oldest first.
  # Kept as an array so that each directory is removed individually.
  mapfile -t ar_older < <(
    find . -maxdepth 1 -type d -name "tld.*" -printf '%T+ %f\n' \
      | sort \
      | head -n "$((${#ar_tld[@]} - 1))" \
      | sed 's/^[^ ]* //'
  )
  printf "[INFO] there are %s dir, please removing the older: %s\n" "${#ar_tld[@]}" "${ar_older[*]}"
  read -r -p "do you want to remove it's ? [Y/n] " answer
  case ${answer:0:1} in
    y | Y | "")
      printf "[INFO] removing dir: %s\n" "${ar_older[*]}"
      rm -rf -- "${ar_older[@]}"
      ;;
    *)
      printf "\n"
      exit 1
      ;;
  esac
fi

f_check

# The template is given as a full path: with the deprecated -t option mktemp
# prefers $TMPDIR over the directory passed with -p.
_tlds=$(mktemp -d "${_DIR}/tld.XXXX")
# Do not leave a half-filled result directory behind when a later step fails;
# it would be picked up as a previous result by the next run.
_ok=0
trap '[[ ${_ok} -eq 1 ]] || rm -rf -- "${_tlds:?}"' EXIT
cd "${_tlds}"

printf "[INFO] identifying invalid TLDs\n"
curl -fsS "${_url1}" | sed "${_reg1}" > tlds-alpha-by-domain.txt
if [[ ! -s tlds-alpha-by-domain.txt ]]; then
  printf "[FAIL] %s is empty, exiting\n" "${_url1##*/}" >&2
  exit 1
fi
# --retry-all-errors only takes effect together with --retry.
# NOTE(review): -k disables TLS certificate verification for this download;
# drop it once the server presents a certificate curl can verify.
curl --retry 3 --retry-all-errors -f -k -sO "${_url2}"

# Last label of every remaining entry, ASCII-only, de-duplicated.
sed "${_reg2}" "${_url2##*/}" | awk -F. '{print $NF}' | sort -u | grep -Pv "[^\x00-\x7F]" > tlds_trust.txt
# Labels present in both lists are valid ...
sort {tlds_trust,tlds-alpha-by-domain}.txt | uniq -d > valid_tlds.txt
# ... and labels of the target list that are not valid are invalid.
sort {valid_tlds,tlds_trust}.txt | uniq -u > invalid_tlds.txt

printf "[INFO] there are %'d invalid TLDs AND " "$(wc -l < invalid_tlds.txt)"
# regex.txt: a single sed command of the form  /\.\(tld1\|tld2\|...\)$/d
sed ':a;N;$!ba;s/\n/\\\|/g' invalid_tlds.txt | sed 's/^/\/\\.\\\(/;s/$/\\\)\$\/d/' > regex.txt
# invalid_tlds.txt becomes a grep pattern file (one "\.tld$" per line);
# the plain list is kept as invalid_tlds.txt.bak.
sed -i.bak 's/$/\$/g;s/^/\\\./g' invalid_tlds.txt
printf "%'d domains with invalid TLDs\n" "$(grep -c -f invalid_tlds.txt "${_url2##*/}")"

cd "${_DIR}"
f_changing
printf "[HINT] please recheck with 'grep -f %s/invalid_tlds.txt %s'\n" "${_tlds##*/}" "${_tlds##*/}/${_url2##*/}"
printf " and check dir %s for complete result\nbye!\n\n" "${_tlds##*/}"
_ok=1
exit 0
Author
ngadmini
commented
Mar 28, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment