@padeoe
Last active November 14, 2024 12:59
CLI tool for downloading Hugging Face models and datasets with aria2/wget + git

🤗 Hugging Face Model Downloader

Because the official huggingface-cli lacks multi-threaded download support and hf_transfer has inadequate error handling, this command-line tool uses wget or aria2 for LFS files and git clone for everything else.

Features

  • ⏯️ Resumable downloads: press Ctrl+C at any time and re-run the command to resume from where it left off.
  • 🚀 Multi-threaded download: uses multiple connections to speed up the transfer.
  • 🚫 File filtering: use --exclude or --include to skip or select files, saving time for models published in multiple formats (e.g., *.bin and *.safetensors).
  • 🔐 Auth support: for gated models that require a Hugging Face login, pass --hf_username and --hf_token to authenticate.
  • 🪞 Mirror site support: set the HF_ENDPOINT environment variable.
  • 🌍 Proxy support: set the HTTPS_PROXY environment variable.
  • 📦 Simple: depends only on git and aria2c/wget.
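For example, the mirror and proxy can be configured per shell before running the script. This is a minimal sketch: hf-mirror.com is one mirror endpoint (it appears in the comments below), and the proxy address is a placeholder you would replace with your own.

```shell
# Route downloads through a Hugging Face mirror (hf-mirror.com is one example endpoint)
export HF_ENDPOINT=https://hf-mirror.com
# Optionally tunnel traffic through a local HTTPS proxy (placeholder address)
export HTTPS_PROXY=http://127.0.0.1:7890
```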

Usage

First, download hfd.sh or clone this gist, then grant the script execute permission:

chmod a+x hfd.sh

You can create an alias for convenience:

alias hfd="$PWD/hfd.sh"

Usage Instructions:

$ ./hfd.sh -h
Usage:
  hfd <repo_id> [--include include_pattern1 include_pattern2 ...] [--exclude exclude_pattern1 exclude_pattern2 ...] [--hf_username username] [--hf_token token] [--tool aria2c|wget] [-x threads] [--dataset] [--local-dir path]

Description:
  Downloads a model or dataset from Hugging Face using the provided repo ID.

Parameters:
  repo_id        The Hugging Face repo ID in the format 'org/repo_name'.
  --include       (Optional) Patterns of files to include in the download. Supports multiple patterns.
  --exclude       (Optional) Patterns of files to exclude from the download. Supports multiple patterns.
  include/exclude_pattern   The patterns to match against file names; supports wildcards, e.g. '--exclude *.safetensors *.txt', '--include vae/*'.
  --hf_username   (Optional) Hugging Face username for authentication. **NOT EMAIL**.
  --hf_token      (Optional) Hugging Face token for authentication.
  --tool          (Optional) Download tool to use. Can be aria2c (default) or wget.
  -x              (Optional) Number of download threads for aria2c. Defaults to 4.
  --dataset       (Optional) Flag to indicate downloading a dataset.
  --local-dir     (Optional) Local directory path where the model or dataset will be stored.

Example:
  hfd bigscience/bloom-560m --exclude *.safetensors
  hfd meta-llama/Llama-2-7b --hf_username myuser --hf_token mytoken -x 4
  hfd lavita/medical-qa-shared-task-v1-toy --dataset

Download a model:

hfd bigscience/bloom-560m

Download a model that requires login:

Get a Hugging Face token from https://huggingface.co/settings/tokens, then:

hfd meta-llama/Llama-2-7b --hf_username YOUR_HF_USERNAME_NOT_EMAIL --hf_token YOUR_HF_TOKEN

Download a model and exclude certain files (e.g., .safetensors):

hfd bigscience/bloom-560m --exclude *.safetensors

Download with aria2c and multiple threads:

hfd bigscience/bloom-560m -x 8

Output: during the download, the generated download commands are printed; files skipped by --include/--exclude appear commented out:

$ hfd bigscience/bloom-560m --tool wget --exclude *.safetensors
...
Start Downloading lfs files, bash script:

wget -c https://huggingface.co/bigscience/bloom-560m/resolve/main/flax_model.msgpack
# wget -c https://huggingface.co/bigscience/bloom-560m/resolve/main/model.safetensors
wget -c https://huggingface.co/bigscience/bloom-560m/resolve/main/onnx/decoder_model.onnx
...
#!/usr/bin/env bash
# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
trap 'printf "${YELLOW}\nDownload interrupted. If you re-run the command, you can resume the download from the breakpoint.\n${NC}"; exit 1' INT
display_help() {
    cat << EOF
Usage:
  hfd <repo_id> [--include include_pattern1 include_pattern2 ...] [--exclude exclude_pattern1 exclude_pattern2 ...] [--hf_username username] [--hf_token token] [--tool aria2c|wget] [-x threads] [--dataset] [--local-dir path]

Description:
  Downloads a model or dataset from Hugging Face using the provided repo ID.

Parameters:
  repo_id         The Hugging Face repo ID in the format 'org/repo_name'.
  --include       (Optional) Patterns of files to include in the download. Supports multiple patterns.
  --exclude       (Optional) Patterns of files to exclude from the download. Supports multiple patterns.
  include/exclude_pattern   The patterns to match against file names; supports wildcards, e.g. '--exclude *.safetensors *.txt', '--include vae/*'.
  --hf_username   (Optional) Hugging Face username for authentication. NOT your email.
  --hf_token      (Optional) Hugging Face token for authentication.
  --tool          (Optional) Download tool to use: aria2c (default) or wget.
  -x              (Optional) Number of download threads for aria2c. Defaults to 4.
  --dataset       (Optional) Flag to indicate downloading a dataset.
  --local-dir     (Optional) Local directory path where the model or dataset will be stored.

Example:
  hfd bigscience/bloom-560m --exclude *.safetensors
  hfd meta-llama/Llama-2-7b --hf_username myuser --hf_token mytoken -x 4
  hfd lavita/medical-qa-shared-task-v1-toy --dataset
EOF
    exit 1
}
MODEL_ID=$1
shift
# Default values
TOOL="aria2c"
THREADS=4
HF_ENDPOINT=${HF_ENDPOINT:-"https://huggingface.co"}
INCLUDE_PATTERNS=()
EXCLUDE_PATTERNS=()
while [[ $# -gt 0 ]]; do
    case $1 in
        --include)
            shift
            # Collect patterns until the next option flag
            while [[ $# -gt 0 && ! $1 =~ ^- ]]; do
                INCLUDE_PATTERNS+=("$1")
                shift
            done
            ;;
        --exclude)
            shift
            while [[ $# -gt 0 && ! $1 =~ ^- ]]; do
                EXCLUDE_PATTERNS+=("$1")
                shift
            done
            ;;
        --hf_username) HF_USERNAME="$2"; shift 2 ;;
        --hf_token) HF_TOKEN="$2"; shift 2 ;;
        --tool) TOOL="$2"; shift 2 ;;
        -x) THREADS="$2"; shift 2 ;;
        --dataset) DATASET=1; shift ;;
        --local-dir) LOCAL_DIR="$2"; shift 2 ;;
        *) shift ;;
    esac
done
# Check if aria2, wget, curl, git, and git-lfs are installed
check_command() {
    if ! command -v "$1" &>/dev/null; then
        echo -e "${RED}$1 is not installed. Please install it first.${NC}"
        exit 1
    fi
}
# Mark the current repo as safe when using a shared filesystem such as Samba or NFS
ensure_ownership() {
    if git status 2>&1 | grep "fatal: detected dubious ownership in repository at" > /dev/null; then
        git config --global --add safe.directory "${PWD}"
        printf "${YELLOW}Detected dubious ownership in repository; marked ${PWD} as safe for git. Edit ~/.gitconfig if you want to reverse this.\n${NC}"
    fi
}
[[ "$TOOL" == "aria2c" ]] && check_command aria2c
[[ "$TOOL" == "wget" ]] && check_command wget
check_command curl; check_command git; check_command git-lfs
[[ -z "$MODEL_ID" || "$MODEL_ID" =~ ^-h ]] && display_help
if [[ -z "$LOCAL_DIR" ]]; then
    LOCAL_DIR="${MODEL_ID#*/}"
fi
if [[ "$DATASET" == 1 ]]; then
    MODEL_ID="datasets/$MODEL_ID"
fi
echo "Downloading to $LOCAL_DIR"
if [ -d "$LOCAL_DIR/.git" ]; then
    printf "${YELLOW}%s exists, skipping clone.\n${NC}" "$LOCAL_DIR"
    cd "$LOCAL_DIR" && ensure_ownership && GIT_LFS_SKIP_SMUDGE=1 git pull || { printf "${RED}Git pull failed.${NC}\n"; exit 1; }
else
    REPO_URL="$HF_ENDPOINT/$MODEL_ID"
    GIT_REFS_URL="${REPO_URL}/info/refs?service=git-upload-pack"
    echo "Testing GIT_REFS_URL: $GIT_REFS_URL"
    response=$(curl -s -o /dev/null -w "%{http_code}" "$GIT_REFS_URL")
    if [ "$response" == "401" ] || [ "$response" == "403" ]; then
        if [[ -z "$HF_USERNAME" || -z "$HF_TOKEN" ]]; then
            printf "${RED}HTTP Status Code: $response.\nThe repository requires authentication, but --hf_username and --hf_token were not passed. Please get a token from https://huggingface.co/settings/tokens.\nExiting.\n${NC}"
            exit 1
        fi
        REPO_URL="https://$HF_USERNAME:$HF_TOKEN@${HF_ENDPOINT#https://}/$MODEL_ID"
    elif [ "$response" != "200" ]; then
        printf "${RED}Unexpected HTTP Status Code: $response\n${NC}"
        printf "${YELLOW}Executing debug command: curl -v %s\nOutput:${NC}\n" "$GIT_REFS_URL"
        curl -v "$GIT_REFS_URL"; printf "\n${RED}Git clone failed.\n${NC}"; exit 1
    fi
    echo "GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL $LOCAL_DIR"
    GIT_LFS_SKIP_SMUDGE=1 git clone "$REPO_URL" "$LOCAL_DIR" && cd "$LOCAL_DIR" || { printf "${RED}Git clone failed.\n${NC}"; exit 1; }
    ensure_ownership
    # Empty the LFS pointer files so the downloaders start fresh instead of resuming on pointer content
    while IFS= read -r file; do
        truncate -s 0 "$file"
    done <<< "$(git lfs ls-files | cut -d ' ' -f 3-)"
fi
printf "\nStart Downloading lfs files, bash script:\ncd $LOCAL_DIR\n"
files=$(git lfs ls-files | cut -d ' ' -f 3-)
declare -a urls
file_matches_include_patterns() {
    local file="$1"
    for pattern in "${INCLUDE_PATTERNS[@]}"; do
        # $pattern is intentionally unquoted so it is treated as a glob
        if [[ "$file" == $pattern ]]; then
            return 0
        fi
    done
    return 1
}
file_matches_exclude_patterns() {
    local file="$1"
    for pattern in "${EXCLUDE_PATTERNS[@]}"; do
        if [[ "$file" == $pattern ]]; then
            return 0
        fi
    done
    return 1
}
while IFS= read -r file; do
    url="$HF_ENDPOINT/$MODEL_ID/resolve/main/$file"
    file_dir=$(dirname "$file")
    mkdir -p "$file_dir"
    if [[ "$TOOL" == "wget" ]]; then
        download_cmd="wget -c \"$url\" -O \"$file\""
        [[ -n "$HF_TOKEN" ]] && download_cmd="wget --header=\"Authorization: Bearer ${HF_TOKEN}\" -c \"$url\" -O \"$file\""
    else
        download_cmd="aria2c --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\""
        [[ -n "$HF_TOKEN" ]] && download_cmd="aria2c --header=\"Authorization: Bearer ${HF_TOKEN}\" --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\""
    fi
    # Skipped files are printed as commented-out commands so they can be run manually later
    if [[ ${#INCLUDE_PATTERNS[@]} -gt 0 ]]; then
        file_matches_include_patterns "$file" || { printf "# %s\n" "$download_cmd"; continue; }
    fi
    if [[ ${#EXCLUDE_PATTERNS[@]} -gt 0 ]]; then
        file_matches_exclude_patterns "$file" && { printf "# %s\n" "$download_cmd"; continue; }
    fi
    printf "%s\n" "$download_cmd"
    urls+=("$url|$file")
done <<< "$files"
for url_file in "${urls[@]}"; do
    IFS='|' read -r url file <<< "$url_file"
    printf "${YELLOW}Start downloading ${file}.\n${NC}"
    file_dir=$(dirname "$file")
    if [[ "$TOOL" == "wget" ]]; then
        if [[ -n "$HF_TOKEN" ]]; then
            wget --header="Authorization: Bearer ${HF_TOKEN}" -c "$url" -O "$file"
        else
            wget -c "$url" -O "$file"
        fi
    else
        if [[ -n "$HF_TOKEN" ]]; then
            aria2c --header="Authorization: Bearer ${HF_TOKEN}" --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c "$url" -d "$file_dir" -o "$(basename "$file")"
        else
            aria2c --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c "$url" -d "$file_dir" -o "$(basename "$file")"
        fi
    fi
    [[ $? -eq 0 ]] && printf "Downloaded %s successfully.\n" "$url" || { printf "${RED}Failed to download %s.\n${NC}" "$url"; exit 1; }
done
printf "${GREEN}Download completed successfully.\n${NC}"
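A note on how --include/--exclude match: the script compares each LFS file path against your patterns with bash's `[[ $file == $pattern ]]`, so a pattern must match the entire path. The sketch below (with hypothetical file names) shows why excluding a directory needs `onnx/*` rather than `onnx/`:

```shell
# Minimal sketch of the glob matching used by the script's
# file_matches_*_patterns helpers: the right-hand side is left
# unquoted so bash treats it as a glob pattern.
matches() {
  [[ "$1" == $2 ]]
}

matches "onnx/decoder_model.onnx" "onnx/*" && echo "onnx/* matches the file"
matches "onnx/decoder_model.onnx" "onnx/"  || echo "onnx/ does not match (no wildcard)"
matches "model.safetensors" "*.safetensors" && echo "*.safetensors matches"
```

So to skip everything under a directory, pass `--exclude onnx/*`, not `--exclude onnx/`.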
@zhang-ziang

Would it be possible to temporarily skip files that fail to download when fetching a dataset? I'm downloading a dataset with many files, and a single failed file seems to abort the whole process.

@RewindL

RewindL commented Sep 26, 2024

Is there a way to skip files that have already been downloaded? My dataset has nearly 400 files, and every time a download is interrupted and restarted, it re-requests from the first file, which can itself trigger another network timeout. With that many files it's not practical to exclude them one by one.

@padeoe
Author

padeoe commented Sep 26, 2024

@zhang-ziang @RewindL Request received, I'll make the change.

@zbximo

zbximo commented Oct 1, 2024

hfd gpt2 --exclude *.safetensors --tool wget
Downloading to gpt2
Testing GIT_REFS_URL: https://hf-mirror.com/gpt2/info/refs?service=git-upload-pack
GIT_LFS_SKIP_SMUDGE=1 git clone https://hf-mirror.com/gpt2 gpt2
Cloning into 'gpt2'...

Why does it always get stuck here? It happens no matter what I try to download.

@hl0737

hl0737 commented Oct 9, 2024

> @zhang-ziang @RewindL Request received, I'll make the change.

Hi, --exclude seems to have stopped working. Could you take a look when you get a chance? Thanks!

@padeoe
Author

padeoe commented Oct 10, 2024

> Hi, --exclude seems to have stopped working. Could you take a look when you get a chance? Thanks!

@hl0737 I tested --exclude and it works:

$ ./hfd.sh openai-community/gpt2 --exclude *.safetensors --exclude 6* --exclude f* --exclude onnx/*
Downloading to gpt2
gpt2 exists, Skip Clone.
Already up to date.

Start Downloading lfs files, bash script:
cd gpt2
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/64-8bits.tflite" -d "." -o "64-8bits.tflite"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/64-fp16.tflite" -d "." -o "64-fp16.tflite"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/64.tflite" -d "." -o "64.tflite"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/flax_model.msgpack" -d "." -o "flax_model.msgpack"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/model.safetensors" -d "." -o "model.safetensors"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/onnx/decoder_model.onnx" -d "onnx" -o "decoder_model.onnx"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/onnx/decoder_model_merged.onnx" -d "onnx" -o "decoder_model_merged.onnx"
# aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/onnx/decoder_with_past_model.onnx" -d "onnx" -o "decoder_with_past_model.onnx"
aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/pytorch_model.bin" -d "." -o "pytorch_model.bin"
aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/rust_model.ot" -d "." -o "rust_model.ot"
aria2c --console-log-level=error --file-allocation=none -x 4 -s 4 -k 1M -c "https://hf-mirror.com/openai-community/gpt2/resolve/main/tf_model.h5" -d "." -o "tf_model.h5"
Start downloading pytorch_model.bin.
[#1cf1e1 522MiB/522MiB(99%) CN:1 DL:35MiB]
Download Results:
gid   |stat|avg speed  |path/URI
======+====+===========+=======================================================
1cf1e1|OK  |    33MiB/s|./pytorch_model.bin

Status Legend:
(OK): download completed.
Downloaded https://hf-mirror.com/openai-community/gpt2/resolve/main/pytorch_model.bin successfully.
Start downloading rust_model.ot.
[#688a39 669MiB/669MiB(99%) CN:1 DL:5.4MiB]
Download Results:
gid   |stat|avg speed  |path/URI
======+====+===========+=======================================================
688a39|OK  |    25MiB/s|./rust_model.ot

Status Legend:
(OK): download completed.
Downloaded https://hf-mirror.com/openai-community/gpt2/resolve/main/rust_model.ot successfully.
Start downloading tf_model.h5.
[#c448c2 469MiB/474MiB(98%) CN:4 DL:10MiB]
Download Results:
gid   |stat|avg speed  |path/URI
======+====+===========+=======================================================
c448c2|OK  |    11MiB/s|./tf_model.h5

Status Legend:
(OK): download completed.
Downloaded https://hf-mirror.com/openai-community/gpt2/resolve/main/tf_model.h5 successfully.
Download completed successfully.

As shown above, I set multiple --exclude patterns, so the download only started from pytorch_model.bin.

@hl0737

hl0737 commented Oct 10, 2024

> @hl0737 I tested --exclude and it works: (full log quoted in the reply above)

@padeoe Yep, I tried it yesterday and it worked. The one time it didn't was probably because I wrote the wildcard wrong: to skip a folder dir I wrote dir/ instead of dir/*. Maybe; I haven't tested it.

Thanks for your reply!

PS: Please take an urgent look at the dataset download issue! datasets has now been updated to major version 3, but versions > 2.14.6 still have problems. Thank you! ❤️❤️

@Yancy456

Yancy456 commented Nov 5, 2024

Damn, this is genius. I'll write you a Python version sometime; this bash version is really hard to maintain.

@Yancy456

Yancy456 commented Nov 5, 2024

Downloading to /home/zhengxiao/dataroot/models/Llama2_7b_chat_hf/
Testing GIT_REFS_URL: https://hf-mirror.com/meta-llama/Llama-2-7b-chat-hf/info/refs?service=git-upload-pack
git clone https://zhengxiao:[email protected]/meta-llama/Llama-2-7b-chat-hf /home/zhengxiao/dataroot/models/Llama2_7b_chat_hf/
fatal: destination path '/home/zhengxiao/dataroot/models/Llama2_7b_chat_hf' already exists and is not an empty directory.
Git clone failed.

It says the destination directory already exists; how do I fix this?

Use a different location.
