-
-
Save connectthefuture/94c7250e5746f3ef87a3d8b9f39522c9 to your computer and use it in GitHub Desktop.
download a site using wget
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#######################################
# Recursively download a site with wget, restricting the crawl to the host
# named in the given URL.
#
# The host is extracted from the URL with parameter expansion only:
#   scheme://user:password@host:port/path?query#fragment  ->  host
#
# Arguments:
#   $1 - URL to download (a bare "host/path" without a scheme is accepted)
# Outputs:
#   Whatever wget prints; usage/errors from this function go to stderr.
# Returns:
#   2 if the URL is missing or no host can be extracted from it,
#   otherwise the exit status of wget.
#######################################
wget_site() {
  local url="${1:-}"
  if [[ -z "${url}" ]]; then
    printf 'Usage: wget_site <url>\n' >&2
    return 2
  fi

  ## Remove the protocol part (http://, https://, ftp://, sftp://, scp://, ...)
  local domain="${url}"
  local scheme_re='^[A-Za-z][A-Za-z0-9+.-]*://'
  if [[ "${domain}" =~ ${scheme_re} ]]; then
    domain="${domain#*://}"
  fi

  ## Cut the path, query and fragment FIRST, so that a ':' or '@' appearing
  ## later in the URL can never be mistaken for the user:password@ separator.
  domain="${domain%%[/?#]*}"

  ## Remove the username and/or username:password part.
  domain="${domain##*@}"

  ## Remove the port; a bracketed IPv6 literal keeps its inner colons.
  if [[ "${domain}" == \[* ]]; then
    domain="${domain%%\]*}]"
  else
    domain="${domain%%:*}"
  fi

  if [[ -z "${domain}" ]]; then
    printf 'wget_site: cannot extract a host name from: %s\n' "${url}" >&2
    return 2
  fi

  # --no-clobber, -nc        : do not download again files that already exist
  #                            NOTE(review): wget appears to ignore -nc when -k
  #                            is also given - confirm against the wget manual
  # --adjust-extension, -E   : append .html to HTML files lacking the suffix
  # --span-hosts, -H         : retrieve across hosts (limited by --domains)
  # --convert-links, -k      : rewrite links so the local copy is browsable
  # --backup-converted, -K   : keep a copy of the original file before conversion
  # --recursive, -r          : recursive retrieval (--level/-l defaults to 5)
  # --page-requisites, -p    : download all files needed to display a page
  # --tries=number, -t       : number of tries
  # --timeout=seconds, -T    : set dns, connect and read to the same timeout
  # --limit-rate             : limit the download speed
  # --no-parent, -np         : never ascend to the parent directory
  # --max-redirect           : maximum redirections allowed per page
  # --domains, -D            : comma-separated list of accepted domains
  local -a wget_args=(
    '-nc' '-E' '-H' '-k' '-K' '-r' '-p'
    '-t' '1'
    '-T' '5'
    '--limit-rate=400k'
    '-np'
    '--max-redirect' '1'
  )

  ## Download only from the main domain.
  local -a wget_allow=('--domains' "${domain}")

  # '--' ends option parsing so a URL starting with '-' is not read as a flag.
  wget "${wget_args[@]}" "${wget_allow[@]}" -- "${url}"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment