-
-
Save bjornjohansen/94a73eeaac961d139f1bb7dcf2f40904 to your computer and use it in GitHub Desktop.
Finding WordPress in Alexa top 1 million sites, see http://crawler.wproll.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Scan a ranked host list for sites that appear to run WordPress.
#
# Input:   top-1m.csv  - one "rank,hostname" pair per line.
# Outputs: topwp.csv   - "rank,hostname" for every host detected as WordPress.
#          checked.csv - every hostname already probed; hosts listed here are
#                        skipped, so an interrupted run can be resumed.
#          html/<hostname>.txt - front-page body saved by the HTML probe.
#
# `set -e` is deliberately not used: grep exiting non-zero ("no match") is
# the normal outcome of most probes.

#######################################
# Run curl silently (-s), following redirects (-L), with a 5 second
# time limit (-m 5).
# Arguments: extra curl options followed by the URL
# Outputs:   whatever curl writes, stderr merged into stdout
#######################################
fetch() {
  # stdin is detached so curl can never consume the host list that the
  # calling loop is reading from.
  curl -s -L -m 5 "$@" </dev/null 2>&1
}

#######################################
# Probe a host for WordPress fingerprints, cheapest check first.
# Arguments: $1 - hostname to probe
# Outputs:   writes the front-page body to html/$1.txt when the HTML
#            probe is reached
# Returns:   0 if any fingerprint matched, 1 otherwise
#######################################
is_wordpress() {
  local host=$1

  # grep output is discarded with >/dev/null rather than -q so that grep
  # reads the whole stream; -q would exit on the first match and could cut
  # short the copy that tee is saving.
  # -F matches the fingerprints literally (a "." is a dot, not a wildcard).

  # Look for the WP REST API
  fetch --head "$host" | grep -F 'api.w.org' >/dev/null && return 0

  # Look for WP specifics in the HTML output
  fetch "$host" \
    | tee "html/$host.txt" \
    | grep -F \
      -e '/wp-content/' \
      -e '/wp-includes/' \
      -e '/wp-json/' \
      -e '/wp-embed.min.js' >/dev/null && return 0

  # Check the login cookie, see http://wordpress.stackexchange.com/a/54442
  fetch --head "$host/wp-login.php" \
    | grep -F '=WP+Cookie+check;' >/dev/null && return 0

  # Look for readme.html
  fetch "$host/readme.html" \
    | grep -F 'wordpress.org/support' >/dev/null && return 0

  # Look for WP.com/VIP sites
  fetch --head "$host" \
    | grep -F 'visit automattic.com/jobs' >/dev/null && return 0

  return 1
}

#######################################
# Walk top-1m.csv, probe every host not yet listed in checked.csv and
# record the results.
# Outputs: progress lines on stdout; appends to topwp.csv and checked.csv
# Returns: 1 if top-1m.csv cannot be read, otherwise the loop's status
#######################################
main() {
  # Lowercase names on purpose: Bash itself maintains a HOSTNAME variable,
  # which must not be overwritten by the loop.
  local pos host

  if [[ ! -r top-1m.csv ]]; then
    printf 'Cannot read top-1m.csv\n' >&2
    return 1
  fi

  touch checked.csv
  # tee in the HTML probe needs this directory to exist.
  mkdir -p html

  # `|| [[ -n "$pos" ]]` keeps a final line that lacks a trailing newline.
  while IFS=',' read -r pos host || [[ -n "$pos" ]]; do
    # Drop a trailing carriage return in case the list has CRLF line ends.
    host=${host%$'\r'}
    [[ -n "$host" ]] || continue

    if grep -qxF -- "$host" checked.csv; then
      printf '%s\n' "Skipping $host, already checked."
      continue
    fi

    if is_wordpress "$host"; then
      printf '%s\n' "$pos - $host is WP"
      printf '%s,%s\n' "$pos" "$host" >> topwp.csv
    else
      printf '%s\n' "$pos - $host is not WP"
    fi

    printf '%s\n' "$host" >> checked.csv
  done < top-1m.csv
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment