|
#!/usr/bin/env bash |
|
# Color definitions |
|
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' # No Color |
|
|
|
trap 'printf "${YELLOW}\nDownload interrupted. You can resume by re-running the command.\n${NC}"; exit 1' INT |
|
|
|
display_help() { |
|
cat << EOF |
|
Usage: |
|
hfd <REPO_ID> [--include include_pattern1 include_pattern2 ...] [--exclude exclude_pattern1 exclude_pattern2 ...] [--hf_username username] [--hf_token token] [--tool aria2c|wget] [-x threads] [-j jobs] [--dataset] [--local-dir path] [--revision rev] |
|
|
|
Description: |
|
Downloads a model or dataset from Hugging Face using the provided repo ID. |
|
|
|
Arguments: |
|
REPO_ID The Hugging Face repo ID (Required) |
|
Format: 'org_name/repo_name' or legacy format (e.g., gpt2) |
|
Options: |
|
include/exclude_pattern The patterns to match against file path, supports wildcard characters. |
|
e.g., '--exclude *.safetensor *.md', '--include vae/*'. |
|
--include (Optional) Patterns to include files for downloading (supports multiple patterns). |
|
--exclude (Optional) Patterns to exclude files from downloading (supports multiple patterns). |
|
--hf_username (Optional) Hugging Face username for authentication (not email). |
|
--hf_token (Optional) Hugging Face token for authentication. |
|
--tool (Optional) Download tool to use: aria2c (default) or wget. |
|
-x (Optional) Number of download threads for aria2c (default: 4). |
|
-j (Optional) Number of concurrent downloads for aria2c (default: 5). |
|
--dataset (Optional) Flag to indicate downloading a dataset. |
|
--local-dir (Optional) Directory path to store the downloaded data. |
|
Defaults to the current directory with a subdirectory named 'repo_name' |
|
if REPO_ID is is composed of 'org_name/repo_name'. |
|
--revision (Optional) Model/Dataset revision to download (default: main). |
|
|
|
Example: |
|
hfd gpt2 |
|
hfd bigscience/bloom-560m --exclude *.safetensors |
|
hfd meta-llama/Llama-2-7b --hf_username myuser --hf_token mytoken -x 4 |
|
hfd lavita/medical-qa-shared-task-v1-toy --dataset |
|
hfd bartowski/Phi-3.5-mini-instruct-exl2 --revision 5_0 |
|
EOF |
|
exit 1 |
|
} |
|
|
|
[[ -z "$1" || "$1" =~ ^-h || "$1" =~ ^--help ]] && display_help |
|
|
|
REPO_ID=$1 |
|
shift |
|
|
|
# Default values |
|
TOOL="aria2c" |
|
THREADS=4 |
|
CONCURRENT=5 |
|
HF_ENDPOINT=${HF_ENDPOINT:-"https://huggingface.co"} |
|
INCLUDE_PATTERNS=() |
|
EXCLUDE_PATTERNS=() |
|
REVISION="main" |
|
|
|
validate_number() { |
|
[[ "$2" =~ ^[1-9][0-9]*$ && "$2" -le "$3" ]] || { printf "${RED}[Error] $1 must be 1-$3${NC}\n"; exit 1; } |
|
} |
|
|
|
# Argument parsing |
|
while [[ $# -gt 0 ]]; do |
|
case $1 in |
|
--include) shift; while [[ $# -gt 0 && ! ($1 =~ ^--) && ! ($1 =~ ^-[^-]) ]]; do INCLUDE_PATTERNS+=("$1"); shift; done ;; |
|
--exclude) shift; while [[ $# -gt 0 && ! ($1 =~ ^--) && ! ($1 =~ ^-[^-]) ]]; do EXCLUDE_PATTERNS+=("$1"); shift; done ;; |
|
--hf_username) HF_USERNAME="$2"; shift 2 ;; |
|
--hf_token) HF_TOKEN="$2"; shift 2 ;; |
|
--tool) |
|
case $2 in |
|
aria2c|wget) |
|
TOOL="$2" |
|
;; |
|
*) |
|
printf "%b[Error] Invalid tool. Use 'aria2c' or 'wget'.%b\n" "$RED" "$NC" |
|
exit 1 |
|
;; |
|
esac |
|
shift 2 |
|
;; |
|
-x) validate_number "threads (-x)" "$2" 10; THREADS="$2"; shift 2 ;; |
|
-j) validate_number "concurrent downloads (-j)" "$2" 10; CONCURRENT="$2"; shift 2 ;; |
|
--dataset) DATASET=1; shift ;; |
|
--local-dir) LOCAL_DIR="$2"; shift 2 ;; |
|
--revision) REVISION="$2"; shift 2 ;; |
|
*) display_help ;; |
|
esac |
|
done |
|
|
|
# Generate current command string |
|
generate_command_string() { |
|
local cmd_string="REPO_ID=$REPO_ID" |
|
cmd_string+=" TOOL=$TOOL" |
|
cmd_string+=" INCLUDE_PATTERNS=${INCLUDE_PATTERNS[*]}" |
|
cmd_string+=" EXCLUDE_PATTERNS=${EXCLUDE_PATTERNS[*]}" |
|
cmd_string+=" DATASET=${DATASET:-0}" |
|
cmd_string+=" HF_USERNAME=${HF_USERNAME:-}" |
|
cmd_string+=" HF_TOKEN=${HF_TOKEN:-}" |
|
cmd_string+=" HF_TOKEN=${HF_ENDPOINT:-}" |
|
cmd_string+=" REVISION=$REVISION" |
|
echo "$cmd_string" |
|
} |
|
|
|
# Check if aria2, wget, curl are installed |
|
check_command() { |
|
if ! command -v $1 &>/dev/null; then |
|
printf "%b%s is not installed. Please install it first.%b\n" "$RED" "$1" "$NC" |
|
exit 1 |
|
fi |
|
} |
|
|
|
check_command curl; check_command "$TOOL" |
|
|
|
LOCAL_DIR="${LOCAL_DIR:-${REPO_ID#*/}}" |
|
mkdir -p "$LOCAL_DIR/.hfd" |
|
|
|
if [[ "$DATASET" == 1 ]]; then |
|
METADATA_API_PATH="datasets/$REPO_ID" |
|
DOWNLOAD_API_PATH="datasets/$REPO_ID" |
|
CUT_DIRS=5 |
|
else |
|
METADATA_API_PATH="models/$REPO_ID" |
|
DOWNLOAD_API_PATH="$REPO_ID" |
|
CUT_DIRS=4 |
|
fi |
|
|
|
# Modify API URL, construct based on revision |
|
if [[ "$REVISION" != "main" ]]; then |
|
METADATA_API_PATH="$METADATA_API_PATH/revision/$REVISION" |
|
fi |
|
API_URL="$HF_ENDPOINT/api/$METADATA_API_PATH" |
|
|
|
METADATA_FILE="$LOCAL_DIR/.hfd/repo_metadata.json" |
|
|
|
# Fetch and save metadata |
|
fetch_and_save_metadata() { |
|
status_code=$(curl -L -s -w "%{http_code}" -o "$METADATA_FILE" ${HF_TOKEN:+-H "Authorization: Bearer $HF_TOKEN"} "$API_URL") |
|
RESPONSE=$(cat "$METADATA_FILE") |
|
if [ "$status_code" -eq 200 ]; then |
|
printf "%s\n" "$RESPONSE" |
|
else |
|
printf "%b[Error] Failed to fetch metadata from $API_URL. HTTP status code: $status_code.%b\n$RESPONSE\n" "${RED}" "${NC}" >&2 |
|
rm $METADATA_FILE |
|
exit 1 |
|
fi |
|
} |
|
|
|
check_authentication() { |
|
local response="$1" |
|
if command -v jq &>/dev/null; then |
|
local gated |
|
gated=$(echo "$response" | jq -r '.gated // false') |
|
if [[ "$gated" != "false" && ( -z "$HF_TOKEN" || -z "$HF_USERNAME" ) ]]; then |
|
printf "${RED}The repository requires authentication, but --hf_username and --hf_token is not passed. Please get token from https://huggingface.co/settings/tokens.\nExiting.\n${NC}" |
|
exit 1 |
|
fi |
|
else |
|
if echo "$response" | grep -q '"gated":[^f]' && [[ -z "$HF_TOKEN" || -z "$HF_USERNAME" ]]; then |
|
printf "${RED}The repository requires authentication, but --hf_username and --hf_token is not passed. Please get token from https://huggingface.co/settings/tokens.\nExiting.\n${NC}" |
|
exit 1 |
|
fi |
|
fi |
|
} |
|
|
|
if [[ ! -f "$METADATA_FILE" ]]; then |
|
printf "%bFetching repo metadata...%b\n" "$YELLOW" "$NC" |
|
RESPONSE=$(fetch_and_save_metadata) || exit 1 |
|
check_authentication "$RESPONSE" |
|
else |
|
printf "%bUsing cached metadata: $METADATA_FILE%b\n" "$GREEN" "$NC" |
|
RESPONSE=$(cat "$METADATA_FILE") |
|
check_authentication "$RESPONSE" |
|
fi |
|
|
|
should_regenerate_filelist() { |
|
local command_file="$LOCAL_DIR/.hfd/last_download_command" |
|
local current_command=$(generate_command_string) |
|
|
|
# If file list doesn't exist, regenerate |
|
if [[ ! -f "$LOCAL_DIR/$fileslist_file" ]]; then |
|
echo "$current_command" > "$command_file" |
|
return 0 |
|
fi |
|
|
|
# If command file doesn't exist, regenerate |
|
if [[ ! -f "$command_file" ]]; then |
|
echo "$current_command" > "$command_file" |
|
return 0 |
|
fi |
|
|
|
# Compare current command with saved command |
|
local saved_command=$(cat "$command_file") |
|
if [[ "$current_command" != "$saved_command" ]]; then |
|
echo "$current_command" > "$command_file" |
|
return 0 |
|
fi |
|
|
|
return 1 |
|
} |
|
|
|
fileslist_file=".hfd/${TOOL}_urls.txt" |
|
|
|
if should_regenerate_filelist; then |
|
# Remove existing file list if it exists |
|
[[ -f "$LOCAL_DIR/$fileslist_file" ]] && rm "$LOCAL_DIR/$fileslist_file" |
|
|
|
printf "%bGenerating file list...%b\n" "$YELLOW" "$NC" |
|
|
|
# Convert include and exclude patterns to regex |
|
INCLUDE_REGEX="" |
|
EXCLUDE_REGEX="" |
|
if ((${#INCLUDE_PATTERNS[@]})); then |
|
INCLUDE_REGEX=$(printf '%s\n' "${INCLUDE_PATTERNS[@]}" | sed 's/\./\\./g; s/\*/.*/g' | paste -sd '|' -) |
|
fi |
|
if ((${#EXCLUDE_PATTERNS[@]})); then |
|
EXCLUDE_REGEX=$(printf '%s\n' "${EXCLUDE_PATTERNS[@]}" | sed 's/\./\\./g; s/\*/.*/g' | paste -sd '|' -) |
|
fi |
|
|
|
# Check if jq is available |
|
if command -v jq &>/dev/null; then |
|
process_with_jq() { |
|
if [[ "$TOOL" == "aria2c" ]]; then |
|
printf "%s" "$RESPONSE" | jq -r \ |
|
--arg endpoint "$HF_ENDPOINT" \ |
|
--arg repo_id "$DOWNLOAD_API_PATH" \ |
|
--arg token "$HF_TOKEN" \ |
|
--arg include_regex "$INCLUDE_REGEX" \ |
|
--arg exclude_regex "$EXCLUDE_REGEX" \ |
|
--arg revision "$REVISION" \ |
|
' |
|
.siblings[] |
|
| select( |
|
.rfilename != null |
|
and ($include_regex == "" or (.rfilename | test($include_regex))) |
|
and ($exclude_regex == "" or (.rfilename | test($exclude_regex) | not)) |
|
) |
|
| [ |
|
($endpoint + "/" + $repo_id + "/resolve/" + $revision + "/" + .rfilename), |
|
" dir=" + (.rfilename | split("/")[:-1] | join("/")), |
|
" out=" + (.rfilename | split("/")[-1]), |
|
if $token != "" then " header=Authorization: Bearer " + $token else empty end, |
|
"" |
|
] |
|
| join("\n") |
|
' |
|
else |
|
printf "%s" "$RESPONSE" | jq -r \ |
|
--arg endpoint "$HF_ENDPOINT" \ |
|
--arg repo_id "$DOWNLOAD_API_PATH" \ |
|
--arg include_regex "$INCLUDE_REGEX" \ |
|
--arg exclude_regex "$EXCLUDE_REGEX" \ |
|
--arg revision "$REVISION" \ |
|
' |
|
.siblings[] |
|
| select( |
|
.rfilename != null |
|
and ($include_regex == "" or (.rfilename | test($include_regex))) |
|
and ($exclude_regex == "" or (.rfilename | test($exclude_regex) | not)) |
|
) |
|
| ($endpoint + "/" + $repo_id + "/resolve/" + $revision + "/" + .rfilename) |
|
' |
|
fi |
|
} |
|
result=$(process_with_jq) |
|
printf "%s\n" "$result" > "$LOCAL_DIR/$fileslist_file" |
|
else |
|
printf "%b[Warning] jq not installed, using grep/awk for metadata json parsing (slower). Consider installing jq for better parsing performance.%b\n" "$YELLOW" "$NC" |
|
process_with_grep_awk() { |
|
local include_pattern="" |
|
local exclude_pattern="" |
|
local output="" |
|
|
|
if ((${#INCLUDE_PATTERNS[@]})); then |
|
include_pattern=$(printf '%s\n' "${INCLUDE_PATTERNS[@]}" | sed 's/\./\\./g; s/\*/.*/g' | paste -sd '|' -) |
|
fi |
|
if ((${#EXCLUDE_PATTERNS[@]})); then |
|
exclude_pattern=$(printf '%s\n' "${EXCLUDE_PATTERNS[@]}" | sed 's/\./\\./g; s/\*/.*/g' | paste -sd '|' -) |
|
fi |
|
|
|
local files=$(printf '%s' "$RESPONSE" | grep -o '"rfilename":"[^"]*"' | awk -F'"' '{print $4}') |
|
|
|
if [[ -n "$include_pattern" ]]; then |
|
files=$(printf '%s\n' "$files" | grep -E "$include_pattern") |
|
fi |
|
if [[ -n "$exclude_pattern" ]]; then |
|
files=$(printf '%s\n' "$files" | grep -vE "$exclude_pattern") |
|
fi |
|
|
|
while IFS= read -r file; do |
|
if [[ -n "$file" ]]; then |
|
if [[ "$TOOL" == "aria2c" ]]; then |
|
output+="$HF_ENDPOINT/$DOWNLOAD_API_PATH/resolve/$REVISION/$file"$'\n' |
|
output+=" dir=$(dirname "$file")"$'\n' |
|
output+=" out=$(basename "$file")"$'\n' |
|
[[ -n "$HF_TOKEN" ]] && output+=" header=Authorization: Bearer $HF_TOKEN"$'\n' |
|
output+=$'\n' |
|
else |
|
output+="$HF_ENDPOINT/$DOWNLOAD_API_PATH/resolve/$REVISION/$file"$'\n' |
|
fi |
|
fi |
|
done <<< "$files" |
|
|
|
printf '%s' "$output" |
|
} |
|
|
|
result=$(process_with_grep_awk) |
|
printf "%s\n" "$result" > "$LOCAL_DIR/$fileslist_file" |
|
fi |
|
else |
|
printf "%bResume from file list: $LOCAL_DIR/$fileslist_file%b\n" "$GREEN" "$NC" |
|
fi |
|
|
|
# Perform download |
|
printf "${YELLOW}Starting download with $TOOL to $LOCAL_DIR...\n${NC}" |
|
|
|
cd "$LOCAL_DIR" |
|
if [[ "$TOOL" == "aria2c" ]]; then |
|
aria2c --console-log-level=error --file-allocation=none -x "$THREADS" -j "$CONCURRENT" -s "$THREADS" -k 1M -c -i "$fileslist_file" --save-session="$fileslist_file" |
|
elif [[ "$TOOL" == "wget" ]]; then |
|
wget -x -nH --cut-dirs="$CUT_DIRS" ${HF_TOKEN:+--header="Authorization: Bearer $HF_TOKEN"} --input-file="$fileslist_file" --continue |
|
fi |
|
|
|
if [[ $? -eq 0 ]]; then |
|
printf "${GREEN}Download completed successfully. Repo directory: $PWD\n${NC}" |
|
else |
|
printf "${RED}Download encountered errors.\n${NC}" |
|
exit 1 |
|
fi |
同样的问题