#!/bin/bash

################################################################################
# Script Name:   group-zipping.sh
# Version:       v1.0.0
# Date:          2023-04-02
# Description:   Compresses files sharing a common prefix before the first dot
#                in their filenames into one archive per prefix.
# Usage:         ./group-zipping.sh [-h] [-i input_folder] [-o output_folder]
#                [-f compression_format] [-p pattern] [-j max_jobs]
# Dependencies:  GNU Parallel, tar, gzip, bzip2, xz, zip, lzip, lrzip, p7zip-full
# Compatibility: Tested on Linux Ubuntu 22.04
# Limitations:   Large files may take a long time to compress
# Parameters:
#   -h, --help           Display this help message and exit.
#   -i, --input-folder   Folder containing files to compress (default: current directory).
#   -o, --output-folder  Folder to save compressed files (default: input folder).
#   -f, --format         Compression format (supported: gzip, bzip2, xz, zip, lzma,
#                        lzip, lrzip, 7zip; default: gzip).
#   -p, --pattern        Pattern to filter filenames in input folder (e.g., '*pattern*';
#                        default: all files).
#   -j, --jobs           Number of parallel jobs (default: 1).
# License:       MIT License
# Author:        Benito Zaragozí
# URL:           https://gist.github.com/benizar/e07d68c1b765df46f6911094aad2f921
# Contact:       benizar@gmail.com
################################################################################
|
# Compression formats accepted by -f/--format. The list is fixed, so it is
# marked read-only.
readonly supported_formats=("gzip" "bzip2" "xz" "zip" "lzma" "lzip" "lrzip" "7zip")

# Default values for the command-line options. Each one is overwritten when the
# matching option is supplied on the command line.
input_folder="."          # -i: folder scanned for files to compress
output_folder=""          # -o: empty means "use the input folder"
compression_format="gzip" # -f: one of supported_formats
pattern="*"               # -p: filename filter
max_jobs=1                # -j: number of parallel jobs
|
#######################################
# Print an error message on stderr, prefixed with "Error: ".
# Arguments: $1 - message text
# Outputs:   writes "Error: <message>" followed by a newline to stderr
#######################################
print_error() {
  printf 'Error: %s\n' "$1" >&2
}
|
#######################################
# Print the usage text on stdout and terminate the script with status 0.
# Arguments: none
# Outputs:   usage text on stdout
# Returns:   never returns; calls `exit 0`
#######################################
help() {
  # Quoted delimiter: the text is emitted literally, nothing is expanded.
  cat <<'EOF'
Usage: ./group-zipping.sh [-h] [-i input_folder] [-o output_folder] [-f compression_format] [-p pattern] [-j jobs]
Compress files that share a common prefix before the first dot in their filenames.

Options:
-h, --help Display this help message and exit.
-i, --input-folder The folder containing the files to compress (default: current directory).
-o, --output-folder The folder where compressed files will be saved (default: input folder).
-f, --format The compression format to use (supported: gzip, bzip2, xz, zip, lzma, lzip, lrzip, 7zip; default: gzip).
-p, --pattern The pattern to filter file names in the input folder (e.g., '*pattern*'; default: all files).
-j, --jobs The number of parallel jobs to run (default: 1).
EOF
  exit 0
}
|
#######################################
# Render a duration given in whole seconds as human-readable text.
# Arguments: $1 - duration in seconds (treated as 0 when missing or empty)
# Outputs:   one of the following on stdout, without a trailing newline:
#              "H hours, M minutes, S seconds"  when at least one hour
#              "M minutes, S seconds"           when at least one minute
#              "S seconds"                      otherwise
#######################################
format_time() {
  # Default to 0 so an empty argument does not break the arithmetic below.
  local total_seconds="${1:-0}"
  local hours=$((total_seconds / 3600))
  local minutes=$(((total_seconds % 3600) / 60))
  local seconds=$((total_seconds % 60))

  if ((hours > 0)); then
    printf '%d hours, %d minutes, %d seconds' "$hours" "$minutes" "$seconds"
  elif ((minutes > 0)); then
    printf '%d minutes, %d seconds' "$minutes" "$seconds"
  else
    printf '%d seconds' "$seconds"
  fi
}
|
#######################################
# Report a command-line usage error and terminate.
# Arguments: $1 - error message
# Outputs:   the error message and the usage text, both on stderr
# Returns:   never returns; exits with status 1
#######################################
usage_error() {
  print_error "$1"
  # help() always ends with `exit 0`. Running it in a subshell confines that
  # exit, so the script can still terminate with a failure status.
  (help) >&2
  exit 1
}

#######################################
# Make sure an option that takes a value was actually given one.
# Arguments: $1 - option name as typed by the user
#            $2 - number of arguments left, including the option itself
#######################################
require_value() {
  (($2 >= 2)) || usage_error "Option $1 requires an argument."
}

#######################################
# Parse the command-line arguments into the option globals.
# Globals:   input_folder, output_folder, compression_format, pattern,
#            max_jobs (written when the matching option is present)
# Arguments: the script's command-line arguments
# Outputs:   usage text when called without arguments or with -h/--help;
#            error message plus usage text on stderr for invalid input
# Returns:   exits 1 on an unknown option, a missing option value, or an
#            invalid job count
#######################################
parse_args() {
  # Without any argument the script only shows its usage text.
  if (($# == 0)); then
    help
  fi

  while (($# > 0)); do
    case "$1" in
      -h | --help)
        help
        ;;
      -i | --input-folder)
        require_value "$1" "$#"
        input_folder="$2"
        shift 2
        ;;
      -o | --output-folder)
        require_value "$1" "$#"
        output_folder="$2"
        shift 2
        ;;
      -f | --format)
        require_value "$1" "$#"
        compression_format="$2"
        shift 2
        ;;
      -p | --pattern)
        require_value "$1" "$#"
        pattern="$2"
        shift 2
        ;;
      -j | --jobs)
        require_value "$1" "$#"
        # Accept the job-count spellings GNU Parallel understands for -j:
        # N, +N, -N and N%.
        if [[ ! "$2" =~ ^[+-]?[0-9]+%?$ ]]; then
          usage_error "Invalid number of jobs: $2"
        fi
        max_jobs="$2"
        shift 2
        ;;
      *)
        usage_error "Unknown option: $1"
        ;;
    esac
  done
}

parse_args "$@"
|
# Fall back to the input folder when no output folder was requested.
if [[ -z "$output_folder" ]]; then
  output_folder="$input_folder"
fi

# Both folders must already exist; nothing here creates them.
if [[ ! -d "$input_folder" ]]; then
  print_error "Input folder does not exist: $input_folder"
  exit 1
fi

if [[ ! -d "$output_folder" ]]; then
  print_error "Output folder does not exist: $output_folder"
  exit 1
fi

# Fail early instead of once per archive if the archives cannot be written.
if [[ ! -w "$output_folder" ]]; then
  print_error "Output folder is not writable: $output_folder"
  exit 1
fi
|
# Check that the requested format is one of the supported formats.
format_supported=0
for candidate in "${supported_formats[@]}"; do
  if [[ "$compression_format" == "$candidate" ]]; then
    format_supported=1
    break
  fi
done

if ((!format_supported)); then
  print_error "Unsupported compression format: $compression_format"
  # help() always ends with `exit 0`. Run it in a subshell and send the usage
  # text to stderr so this failure still exits with a non-zero status.
  (help) >&2
  exit 1
fi

# Work out which programs the chosen format needs. Format names and binary
# names differ: "7zip" archives are written by `7z`, and every tar-based
# format needs `tar` in addition to the compressor itself.
case "$compression_format" in
  zip) required_tools=(zip) ;;
  7zip) required_tools=(7z) ;;
  *) required_tools=(tar "$compression_format") ;;
esac

for tool in "${required_tools[@]}"; do
  if ! command -v "$tool" > /dev/null 2>&1; then
    print_error "$tool command not found."
    exit 1
  fi
done
|
# Set of unique file prefixes found in the input folder. The keys are the
# prefixes; the values are unused.
declare -A prefix_map

#######################################
# Collect the prefixes (text before the first dot) of the regular files in a
# folder whose names match *<pattern>*.* .
# Globals:   prefix_map (cleared, then filled)
# Arguments: $1 - folder to scan (default: current directory)
#            $2 - shell glob pattern to look for in the file name (default: *)
#######################################
collect_prefixes() {
  local dir="${1:-.}"
  local filter="${2:-*}"
  local path name prefix

  prefix_map=()
  for path in "${dir}"/*; do
    # Skips directories, and the literal pattern left over when nothing matches.
    [[ -f "$path" ]] || continue
    name="${path##*/}"
    # ${filter} is deliberately unquoted: inside [[ ]] that makes it a glob
    # pattern. Quoted, its wildcards would be literal and the default "*"
    # would only match names containing an asterisk.
    [[ "$name" == *${filter}*.* ]] || continue
    prefix="${name%%.*}"
    # An empty key is not a valid associative-array subscript.
    [[ -n "$prefix" ]] || continue
    prefix_map["$prefix"]=1
  done
}

collect_prefixes "$input_folder" "$pattern"
|
#######################################
# Pack every regular file named "<prefix>.*" in the input folder into one
# archive "<output_folder>/<prefix>.<extension>".
#
# tar-based formats store the files under an inner folder named after the
# prefix. That layout is built from symlinks in a temporary folder, and tar
# follows the links (-h) so the archive holds the real file contents.
# zip and 7zip archives receive the files directly.
#
# Arguments: $1 - file prefix (text before the first dot of the file names)
#            $2 - compression format (gzip, bzip2, xz, zip, lzma, lzip,
#                 lrzip, 7zip)
#            $3 - input folder
#            $4 - output folder
# Outputs:   "Finished compressing <prefix> in <time>" on stdout on success;
#            diagnostics on stderr on failure
# Returns:   0 on success; 1 for an unsupported format, no matching files or
#            a failed mktemp; otherwise the exit status of the failing tool
#######################################
compress_files() {
  local prefix="$1"
  local compression_format="$2"
  local input_folder="$3"
  local output_folder="$4"
  local extension archive file temp_root
  local start_time end_time elapsed_time formatted_time
  local status=0
  local -a tar_options=() files=()

  # Map the format to its archive extension and, for tar, its compressor flag.
  case "$compression_format" in
    gzip)
      extension="tar.gz"
      tar_options=(-z)
      ;;
    bzip2)
      extension="tar.bz2"
      tar_options=(-j)
      ;;
    xz)
      extension="tar.xz"
      tar_options=(-J)
      ;;
    lzma)
      extension="tar.lzma"
      tar_options=(--lzma)
      ;;
    lzip)
      extension="tar.lz"
      tar_options=(--lzip)
      ;;
    lrzip)
      extension="tar.lrz"
      # GNU tar has no --lrzip option, so lrzip is run as an external filter.
      # NOTE(review): assumes lrzip compresses stdin to stdout when called
      # without file arguments - confirm on the target system.
      tar_options=(--use-compress-program=lrzip)
      ;;
    zip)
      extension="zip"
      ;;
    7zip)
      extension="7z"
      ;;
    *)
      echo "Unsupported compression format: $compression_format" >&2
      return 1
      ;;
  esac
  archive="${output_folder}/${prefix}.${extension}"

  start_time=$(date +%s)

  # Match on "<prefix>." exactly. A "*prefix*" glob would also pick up files
  # that merely contain the prefix, e.g. "banana.txt" for prefix "a".
  for file in "${input_folder}/${prefix}".*; do
    [[ -f "$file" ]] || continue
    # Never feed an archive left over from a previous run into itself.
    [[ "$file" -ef "$archive" ]] && continue
    files+=("$file")
  done

  if ((${#files[@]} == 0)); then
    echo "No files found for prefix ${prefix}" >&2
    return 1
  fi

  case "$compression_format" in
    zip)
      # -j stores the bare file names without their folder path.
      zip -j "$archive" "${files[@]}" > /dev/null || status=$?
      ;;
    7zip)
      # No `cd` here: changing directory would invalidate relative input
      # and output paths.
      7z a "$archive" "${files[@]}" > /dev/null || status=$?
      ;;
    *)
      temp_root=$(mktemp -d) || return 1
      mkdir -p "${temp_root}/${prefix}"
      for file in "${files[@]}"; do
        ln -s "$(realpath "$file")" "${temp_root}/${prefix}/${file##*/}"
      done
      tar "${tar_options[@]}" -chf "$archive" -C "$temp_root" -- "$prefix" \
        > /dev/null || status=$?
      # Remove the whole mktemp folder, not only the inner prefix folder.
      rm -rf -- "$temp_root"
      ;;
  esac

  if ((status != 0)); then
    echo "Failed to compress ${prefix} (exit status ${status})" >&2
    return "$status"
  fi

  end_time=$(date +%s)
  elapsed_time=$((end_time - start_time))
  formatted_time=$(format_time "$elapsed_time")
  echo "Finished compressing ${prefix} in ${formatted_time}"
}

# GNU Parallel runs each job in a new shell, so the functions it needs must be
# exported.
export -f compress_files format_time
|
#######################################
# Map a compression format name to the extension of the archive it produces.
# Arguments: $1 - compression format
# Outputs:   the extension, without leading dot or trailing newline, on stdout
# Returns:   0 for a known format, 1 otherwise (nothing is printed)
#######################################
archive_extension() {
  case "$1" in
    gzip) printf 'tar.gz' ;;
    bzip2) printf 'tar.bz2' ;;
    xz) printf 'tar.xz' ;;
    zip) printf 'zip' ;;
    lzma) printf 'tar.lzma' ;;
    lzip) printf 'tar.lz' ;;
    lrzip) printf 'tar.lrz' ;;
    7zip) printf '7z' ;;
    *) return 1 ;;
  esac
}

# Display the list of archives that will be created, one per prefix.
echo "The following files will be created:"
if extension=$(archive_extension "$compression_format"); then
  for prefix in "${!prefix_map[@]}"; do
    echo "${output_folder}/${prefix}.${extension}"
  done
fi
|
# Without any prefix there is nothing to compress, so there is nothing to
# confirm either.
if ((${#prefix_map[@]} == 0)); then
  echo "No matching files found; nothing to compress."
  exit 0
fi

# Get user confirmation to proceed. -r keeps backslashes in the answer literal.
# On end of input the answer stays empty and is rejected as invalid below.
read -r -p "Do you want to proceed? (y/n) " choice
case "$choice" in
  y | Y)
    echo "Starting the compression process."
    echo "Depending on the number of jobs, file sizes, and other parameters, you'll need to be patient (or you can stare at the output folder to keep an eye on the progress)."
    ;;
  n | N)
    exit 0
    ;;
  *)
    echo "Invalid choice"
    exit 1
    ;;
esac
|
# The whole run depends on GNU Parallel, so fail with a clear message if it is
# not installed.
if ! command -v parallel > /dev/null 2>&1; then
  print_error "parallel command not found."
  exit 1
fi

start_script_time=$(date +%s)

# Compress files in parallel using GNU Parallel: one compress_files job per
# prefix. The single-valued input sources repeat the format and folders for
# every job.
parallel_status=0
parallel --load 90% --nice 1 -j "${max_jobs}" --no-notice compress_files \
  ::: "${!prefix_map[@]}" \
  ::: "${compression_format}" \
  ::: "${input_folder}" \
  ::: "${output_folder}" || parallel_status=$?

# Print total script execution time.
end_script_time=$(date +%s)
elapsed_script_time=$((end_script_time - start_script_time))
formatted_script_time=$(format_time "$elapsed_script_time")
echo "Total script execution time: ${formatted_script_time}"

# Surface job failures instead of finishing with a success status.
if ((parallel_status != 0)); then
  print_error "Compression failed (parallel exit status ${parallel_status})."
  exit "$parallel_status"
fi