Last active
April 28, 2025 08:14
-
-
Save za3k/adc34342419bc8703bebe9460af8016c to your computer and use it in GitHub Desktop.
download-e621.bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# e621 downloader. Maintains a mirror of e621 (just DB export + images, not the HTML).
# Suggested use is to run this from crontab once per day.
# This keeps every database snapshot indefinitely and also keeps deleted images forever (in a separate directory) because I'm a packrat. Feel free to modify it to delete old stuff.
#
# Written by zachary "za3k" vance. Released in the public domain.
#
# Dependencies: csvtools (davylandman), chronic (joeyh's moreutils, optional), wget, gzip, coreutils
#
# config
# Both settings can be supplied through the environment; the values after
# ":-" are the defaults used when the variable is unset or empty.
USERNAME="${USERNAME:-}"          # Your e621 username goes here
BASEDIR="${BASEDIR:-/data/e621}"  # Where on your local filesystem you want to store files (no spaces)
# output
#
# file/directory              what is it
#
# ${BASEDIR}/db               .csv.gz database files (from e621 daily dump)
# ${BASEDIR}/image-full       full-size images. smaller images and previews for video are not downloaded.
# ${BASEDIR}/image-deleted    images downloaded locally but later deleted from e621. please do not serve publicly
# ${BASEDIR}/.index-images.*  temp files used by this script
# Refuse to run without a username: it is embedded in the User-Agent of every
# request this script makes so the site admin can identify the client.
# The expansion is quoted so the test is well-formed for any value
# (empty, or containing spaces).
if [ -z "${USERNAME}" ]; then
  echo "Please enter your e621 username so the admin knows who you are." >&2
  exit 1
fi
# Decide how to run, based on the first argument and on PS1:
#   "magic" first argument -> we are the child re-executed under chronic.
#   empty PS1              -> treated as batch mode (cron, /etc/cron.daily).
#   otherwise              -> interactive; run normally with full output.
if [ "$1" == "magic" ]; then
  #echo "Magic mode"
  # Drop the marker argument so the remaining arguments are the caller's.
  shift
elif [ -z "$PS1" ]; then
  #echo "Non-interactive (cron) mode"
  # In batch mode don't output anything and return status 0 -- unless there is
  # some error. chronic (optional) provides exactly that behavior.
  # Look chronic up on PATH first; fall back to the traditional location.
  chronic_bin=$(command -v chronic || true)
  if [ -z "$chronic_bin" ] && [ -x /usr/bin/chronic ]; then
    chronic_bin=/usr/bin/chronic
  fi
  if [ -n "$chronic_bin" ]; then
    # Re-run this script through bash so it also works when started as
    # `bash script` or when the file is not executable. exec replaces this
    # process, so nothing after it runs on success.
    exec "$chronic_bin" "${BASH:-/bin/bash}" "$0" "magic" "$@"
  fi
else
  #echo "Interactive mode"
  :
fi
echo "Downloading the latest CSV database dump from the web... (may take a minute)"
# To deal with server timezone maybe being different than yours, as well as
# ongoing uploads, stay 1 day behind. The backslash bypasses any `date` alias.
DATE=$(\date -u -d "yesterday" "+%Y-%m-%d")
# Abort when the export cannot be fetched. Continuing with a missing or
# truncated posts file would produce an empty/partial list of available images,
# and the later steps would then treat local images as deleted upstream.
if ! wget --directory-prefix="${BASEDIR}/db" -c \
    "https://e621.net/db_export/"{pools,posts,tag_aliases,tag_implications,tags,wiki_pages}"-${DATE}.csv.gz" \
    --user-agent "za3k's download script - .csv.gz - ${USERNAME}" --no-verbose; then
  echo "Failed to download the database export for ${DATE}; aborting." >&2
  exit 1
fi
echo "Calculating the list of images to download from the local database copy..."
# .index-images.avail: one "<md5>.<file_ext>" per line for every post whose
# is_deleted column does not match "t", sorted for use with comm below.
# The pipeline runs in a subshell with pipefail so a failure in any stage
# (e.g. a missing or corrupt .csv.gz) is detected without changing the shell
# options of the rest of the script.
if ! (
  set -o pipefail
  zcat "${BASEDIR}/db/posts-${DATE}.csv.gz" \
    | csvgrep --maxfieldsize 1000000 -i -c is_deleted -m t \
    | csvcut --maxfieldsize 1000000 -c md5,file_ext \
    | tail -n+2 \
    | tr ',' '.' \
    | LC_ALL=C sort >"${BASEDIR}/.index-images.avail"
); then
  echo "Failed to build the list of available images from posts-${DATE}.csv.gz; aborting." >&2
  exit 1
fi
# An empty list would make every local image look deleted upstream; stop instead.
if [ ! -s "${BASEDIR}/.index-images.avail" ]; then
  echo "The list of available images is empty; refusing to continue." >&2
  exit 1
fi
if [ -e "${BASEDIR}/.index-images.have" ]; then
  echo "Using cached list of images on the filesystem from last download."
else
  echo "Calculating the list of images we have from the filesystem."
  # Create the directory first so the scan does not error on a first run.
  mkdir -p "${BASEDIR}/image-full"
  find "${BASEDIR}/image-full/" -type f -printf '%f\n' \
    | LC_ALL=C sort >"${BASEDIR}/.index-images.have"
fi
echo "Calculating images to download or delete by comparing the two lists."
# comm needs both inputs sorted in the collation it runs under, hence LC_ALL=C
# here as well as on the sorts above.
# -2 -3 keeps lines only in avail (to download); -1 -3 keeps lines only in have (to archive).
LC_ALL=C comm -2 -3 "${BASEDIR}/.index-images.avail" "${BASEDIR}/.index-images.have" >"${BASEDIR}/.index-images.new"
LC_ALL=C comm -1 -3 "${BASEDIR}/.index-images.avail" "${BASEDIR}/.index-images.have" >"${BASEDIR}/.index-images.del"
# Fetch every image listed in .index-images.new (one "<md5>.<ext>" per line).
if [ -s "${BASEDIR}/.index-images.new" ]; then
  echo "Downloading missing images direct from static.e621.net... (may take a long time)"
  # Turn each name into its URL (sharded by the first two pairs of md5 characters)
  # and stream the list to wget on stdin. IFS= and -r keep each line verbatim.
  # -nH -x --cut-dirs=1 drops the host name and the leading "data/" component
  # from the saved path, leaving <xx>/<yy>/<md5>.<ext> under image-full.
  # wget's exit status is deliberately not checked here: individual image
  # failures should not abort the rest of the run.
  while IFS= read -r image_name; do
    printf '%s\n' "https://static1.e621.net/data/${image_name:0:2}/${image_name:2:2}/${image_name}"
  done <"${BASEDIR}/.index-images.new" \
    | wget --directory-prefix="${BASEDIR}/image-full" -c --input-file=- \
        --user-agent "za3k's download script - images - ${USERNAME}" \
        -nH -x --cut-dirs=1 -nc --no-verbose
else
  echo "No new images were found."
fi
# For now don't delete images we have. Just move the images to a "deleted" directory instead of actually deleting them.
# .index-images.del lists one "<md5>.<ext>" per line; files live under
# <xx>/<yy>/ subdirectories named after the first two pairs of md5 characters.
if [ -s "${BASEDIR}/.index-images.del" ]; then
  echo "Moving deleted images to archive folder."
  while IFS= read -r image_name; do
    shard="${image_name:0:2}/${image_name:2:2}"
    src="${BASEDIR}/image-full/${shard}/${image_name}"
    dest_dir="${BASEDIR}/image-deleted/${shard}"
    # The list can name a file that is not on disk (e.g. a stale cache entry);
    # report it and carry on rather than letting mv fail.
    if [ ! -e "$src" ]; then
      echo "Warning: ${src} is listed locally but missing on disk; skipping." >&2
      continue
    fi
    mkdir -p -- "$dest_dir" && mv -- "$src" "${dest_dir}/${image_name}"
  done <"${BASEDIR}/.index-images.del"
else
  echo "No deleted images were found."
fi
echo "Cleaning up afterwards."
# The list of available images becomes next run's cache of what we have, which
# avoids a costly filesystem scan. Images whose download failed must not enter
# that cache or they would never be retried, so first collect the entries of
# .index-images.new that are still absent from image-full.
missing_index="${BASEDIR}/.index-images.missing"
: >"$missing_index"
if [ -s "${BASEDIR}/.index-images.new" ]; then
  while IFS= read -r image_name; do
    [ -e "${BASEDIR}/image-full/${image_name:0:2}/${image_name:2:2}/${image_name}" ] \
      || printf '%s\n' "$image_name"
  done <"${BASEDIR}/.index-images.new" >"$missing_index"
fi
if [ -s "$missing_index" ]; then
  echo "Some images failed to download; they will be retried on the next run." >&2
  # .index-images.new is a sorted subset of .index-images.avail (it was produced
  # by comm), so the missing list is sorted too and comm can subtract it.
  comm -2 -3 "${BASEDIR}/.index-images.avail" "$missing_index" >"${BASEDIR}/.index-images.have.tmp"
  mv -- "${BASEDIR}/.index-images.have.tmp" "${BASEDIR}/.index-images.have"
  rm -f -- "${BASEDIR}/.index-images.avail"
else
  mv -- "${BASEDIR}/.index-images.avail" "${BASEDIR}/.index-images.have" # Keep as a cache to avoid costly filesystem scan
fi
rm -f -- "${BASEDIR}/.index-images."{new,del,missing}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment