Last active
February 14, 2017 20:50
-
-
Save 0xAether/4241069 to your computer and use it in GitHub Desktop.
A bash function to scrape the wallpaper site "simpledesktops.com"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#######################################
# Scrape wallpapers from simpledesktops.com into the current directory.
# Arguments: $1 - number of index pages to scrape (clamped to 46).
# Outputs:   downloads one image file per wallpaper; progress on stdout.
# Returns:   1 on bad usage, 0 otherwise.
#######################################
simpledesks() {
  # Mimic Google Chrome 33.0.1750.117 on 64-bit Windows 8.1 so the site
  # serves the regular HTML pages.
  local useragent='Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36'
  local limit counter url name dl_url

  if [ "$#" -ne 1 ]; then
    echo 'Enter the number of pages you want to scrape as the only argument. Each page has (give or take) 28 images. The total number of pages at this time is 46.'
    return 1
  fi

  # BUG FIX: the original re-assigned limit=$1 *after* this clamp,
  # silently defeating the 46-page cap. Clamp once and keep it.
  if [ "$1" -gt 46 ]; then
    limit=46
  else
    limit=$1
  fi

  counter=1
  while [ "$counter" -le "$limit" ]; do
    # Each browse page lists wallpapers as <h2><a href="/browse/desktops/...">.
    for url in $(curl -s -A "$useragent" "http://simpledesktops.com/browse/$counter/" \
        | grep '<h2><a href="/browse/desktops' \
        | sed 's/\t*<h2><a href="\(.*\)">.*<\/a><\/h2>$/http:\/\/simpledesktops.com\1/'); do
      name=$(sed 's/^.*\/[0-9][0-9]\/\(.*\)\/$/\1/' <<< "$url")
      echo -n "Downloading $name..."
      # Skip wallpapers we already have (any 3-char extension) to avoid
      # re-downloading. (Original used a broken `[ $(ls | grep ...) ]` test.)
      if ! ls -1 | grep -q "^$name\....$"; then
        dl_url=$(curl -s "$url" | grep '<h2><a href="/download' \
          | sed 's/^\t*<h2><a href="\(.*\)">.*<\/a><\/h2>$/http:\/\/simpledesktops.com\1/')
        curl -s -L -o temp -A "$useragent" "$dl_url"
        # Detect the real image type with file(1); a few wallpapers are not
        # PNGs despite the site's naming. Extra overhead is acceptable here.
        mv temp "$name.$(file -b temp | cut -f1 -d' ' | tr '[A-Z]' '[a-z]' | sed 's/jpeg/jpg/')"
      fi
      echo ' Done'
    done
    counter=$((counter + 1))
  done
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment