Skip to content

Instantly share code, notes, and snippets.

@miglen
Created October 6, 2024 07:52
Show Gist options
  • Save miglen/fb178236835745c5772426f946dd28e4 to your computer and use it in GitHub Desktop.
Phishing feeds collector
#!/bin/bash
#
# Phishing feeds collector.
# Downloads a set of public phishing blocklists into ${folder_prefix},
# normalises them into all_domains.txt / all_urls.txt, and prints summary
# statistics (line counts, top domains / primary domains / TLDs).
#
# Requires: curl, jq, gunzip, and standard POSIX text tools.

# -u catches typo'd variables. Deliberately NOT using -e / -o pipefail:
# fetching is best-effort and one dead feed should not abort the whole run.
set -u

folder_prefix="./feeds/" # could be date-rotated, e.g. "./feeds/$(date +%Y/%m/%d)"
mkdir -p "${folder_prefix}"
rm -f "${folder_prefix}"/*.txt

# --- Fetch raw feeds (best-effort; -L follows redirects, -s is silent) ------
curl -L -s https://phishing.army/download/phishing_army_blocklist_extended.txt -o "${folder_prefix}/phishing_army_domains.txt"
curl -L -s https://hole.cert.pl/domains/domains.csv | cut -f 2 > "${folder_prefix}/cert_pl_domains.txt"
curl -L -s https://urlabuse.com/public/data/phishing_url.txt -o "${folder_prefix}/urlabuse_urls.txt"
curl -L -s https://threatview.io/Downloads/DOMAIN-High-Confidence-Feed.txt -o "${folder_prefix}/threatview_domains.txt"
curl -L -s https://openphish.com/feed.txt -o "${folder_prefix}/openphish_urls.txt"
curl -L -s "https://urlscan.io/api/v1/search/?q=task.tags:\"@phish_report\"&size=10000" | jq -r '.results[].task.url' > "${folder_prefix}/urlscan_urls.txt"
# BUG FIX: was "> phishtank_urls.txt" (missing folder prefix), so the file
# landed in the cwd and was never picked up by the aggregation below.
curl -L -s https://data.phishtank.com/data/online-valid.json.gz | gunzip | jq -r '.[].url' > "${folder_prefix}/phishtank_urls.txt"
curl -L -s https://raw.githubusercontent.com/mitchellkrogza/Phishing.Database/refs/heads/master/phishing-domains-ACTIVE.txt -o "${folder_prefix}/phishing_database_domains.txt"
curl -L -s https://github.com/mitchellkrogza/Phishing.Database/raw/refs/heads/master/phishing-links-ACTIVE.txt -o "${folder_prefix}/phishing_database_urls.txt"
curl -L -s https://phishstats.info/phish_score.csv | grep http | cut -d"," -f 3 | tr -d '"' > "${folder_prefix}/phishing_score_urls.txt"

# Everything below runs inside the feeds directory; bail out if cd fails so
# we never aggregate (or earlier, delete) files in the wrong place.
cd "${folder_prefix}" || exit 1

# --- Aggregate ---------------------------------------------------------------
# Domains: extract bare hostnames, drop plain IPv4 entries, strip a leading
# "www.", de-duplicate.
# BUG FIX: the original ended in "| > ./all_domains.txt" — piping into a bare
# redirection truncates the file and discards the data. Plain redirection now.
cat ./*domains.txt \
  | grep -Eo '([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}' \
  | grep -Ev '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' \
  | sed 's/^www\.//' \
  | sort -u > ./all_domains.txt

# URLs: keep http(s) lines, strip "www." after the scheme, de-duplicate, and
# drop docs.google.com entries (same "| > file" bug fixed here).
cat ./*urls.txt \
  | grep -Ei '^http' \
  | sed 's/\/\/www\./\/\//' \
  | sort -u \
  | grep -Ev '^https://docs.google.com/' > ./all_urls.txt

# --- Summary -----------------------------------------------------------------
# BUG FIX: headers were printed with unquoted `echo -ne ...`, which suppresses
# the trailing newline and fuses each header with the following data line.
echo ""
wc -l ./all*.txt

echo "top 10 domains from the urls"
# Domain-list lines contain no "/", so awk -F/ yields an empty $3 for them;
# skip blanks so they do not dominate the counts.
awk -F/ '$3 != "" {print $3}' ./all_*.txt \
  | grep -Ev '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' \
  | sort | uniq -c | sort -nr | head -10

echo "top 10 primary domains"
awk -F/ '$3 != "" {print $3}' ./all_urls.txt \
  | grep -Ev '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' \
  | awk -F. '{n=NF-1; print $(n) "." $(n+1)}' \
  | sort | uniq -c | sort -nr | head -10

echo "top 10 non-common domains"
awk -F/ '$3 != "" {print $3}' ./all_urls.txt \
  | grep -Ev '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' \
  | awk -F. '{n=NF-1; print $(n) "." $(n+1)}' \
  | grep -Ev '\.(com|net|org|gov|edu|mil|io)$' \
  | sort | uniq -c | sort -nr | head -10

echo "top 10 TLDs"
# BUG FIX: the original fed all_*.txt through awk -F/ '{print $3}', which
# reduced every domain-only line to an empty string — the domain feeds never
# contributed to the TLD counts. Combine the domain list with URL hostnames.
{ cat ./all_domains.txt; awk -F/ '$3 != "" {print $3}' ./all_urls.txt; } \
  | awk -F. 'NF > 1 {print $NF}' \
  | grep -Ev '^(com|net|org|gov|edu|mil|io)$' \
  | sort | uniq -c | sort -nr | head -10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment