Last active
March 18, 2019 15:22
-
-
Save GuyPaddock/b9b27ad8764dce1b9b049a3a9558f736 to your computer and use it in GitHub Desktop.
Split a ZIP archive for Drupal Feeds Fetcher Archive + CSV into separate archives < 100 MB each (needed for Pantheon)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
## | |
# @file | |
# Asset Zip Splitter | |
# | |
# Splits a ZIP archive containing the following file structure into separate | |
# archives that contain no more than 100 MB each: | |
# - *.csv (CSV files that reference image files) | |
# - images/ (a folder of images referenced by the image files) | |
# | |
# This allows the file to be uploaded to a Drupal installation running on | |
# Pantheon that is using Feeds Fetcher Archive to process the CSV file. | |
# Custom code in the Drupal installation is required to handle import of the | |
# images themselves (convert filenames to file references). | |
# | |
# Each CSV file MUST contain a column named "Filename" that contains the
# filename of each image asset referenced. When constructing each "bin" (i.e.
# smaller ZIP file), this script automatically includes a copy of each CSV
# file in each ZIP file, but rows that reference files that are not in the bin
# are automatically omitted so that the CSV files inside each archive only
# reference files that are actually in the bin.
# | |
# Usage: | |
# ``` | |
# ./split_zip.sh <ZIP filename> [max file size] | |
# ``` | |
# | |
# If "max file size" is specified (in bytes), then the script will use that | |
# limit when splitting archives. For example: | |
# ``` | |
# ./split_zip.sh my.zip 200000000 | |
# ``` | |
# Would split an archive into ZIP files containing no more than 200 MB. | |
# | |
# The output from the script is saved next to the original archive, in a folder
# named after the archive with a "-split" suffix (e.g. "my-split/"), with each
# piece named relative to the original archive (e.g. "my-001.zip",
# "my-002.zip", etc).
# | |
# Copyright 2019 Inveniem. All rights reserved. | |
# | |
# @author Guy Elsmore-Paddock ([email protected]) | |
# @license GNU General Public License, version 3 or later | |
# | |
# Abort when an unset variable is expanded.
set -o nounset

# Abort as soon as a command returns a non-zero status.
#
# NOTE: `let` returns non-zero whenever its result is 0, so it does not play
# well with this option.
set -o errexit
################################################################################ | |
# Constants | |
################################################################################ | |
##
# The maximum size, in bytes, of a bin of files.
#
# Defaults to 100 MB. Can be overridden by providing a second parameter to the
# script.
#
readonly MAX_BIN_SIZE="${2:-100000000}"

##
# Path relative to the extract path that contains the images.
#
readonly RELATIVE_IMAGE_SOURCE_PATH="images"

##
# Suffix/path fragment used to build the name of the folder where the new ZIP
# files will be saved, relative to the folder that contains the ZIP file.
#
readonly RELATIVE_TARGET_PATH="split"
################################################################################ | |
# Main Functions | |
################################################################################ | |
##
# Print the command-line usage text to standard error and exit with status 1.
#
print_usage() {
  local -a usage_lines=(
    "Usage: ${0} <ZIP filename> [max file size]"
    ""
    "Split a ZIP file that contains a CSV file and images into multiple "
    "zip files that each contain no more than 'max file size' bytes of "
    "content each. If the file size is not specified, defaults to "
    "100,000,000 (100 MB)"
  )

  printf '%s\n' "${usage_lines[@]}" >&2

  exit 1
}
##
# Unzips the provided file to a temporary folder.
#
# The temporary folder is registered for deletion when the script exits.
#
# Output global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#
# @param string $1
#   The path to the ZIP file to extract.
#
# @return
#   Non-zero if the temporary folder could not be created.
#
unzip_source_file() {
  local zip_path="${1}"
  local extract_path

  # Declared separately from the assignment so that a failure to create the
  # folder is not masked by the exit status of `local`.
  if ! extract_path="$(create_tmp_dir)" || [[ ! -d "${extract_path}" ]]; then
    echo "ERROR: Could not create a temporary folder for extraction." >&2
    return 1
  fi

  delete_on_exit "${extract_path}"

  echo "Unpacking '${zip_path}'..."
  unzip "${zip_path}" -d "${extract_path}"
  echo "Done!"
  echo ""

  # Globals
  declare -g src_path="${extract_path}"
}
##
# Remove any existing output folder and then create it.
#
# The output folder lives next to the input ZIP file and is named
# "<ZIP name without extension>-<RELATIVE_TARGET_PATH>".
#
# Output global vars:
#   - $output_path: The absolute path to the folder that should contain output
#     files.
#
# @param string $1
#   The path to the ZIP file that the script is operating on.
#
# @return
#   Non-zero if the output path could not be resolved.
#
prepare_output_path() {
  local zip_filename="${1}"
  local zip_dir
  local zip_name
  local resolved_path

  zip_dir="$(dirname -- "${zip_filename}")"
  zip_name="$(zip_basename "${zip_filename}")"

  # Never continue with an empty path: it is about to be handed to `rm -rf`.
  if ! resolved_path="$(
    realpath "${zip_dir}/${zip_name}-${RELATIVE_TARGET_PATH}"
  )" || [[ -z "${resolved_path}" ]]; then
    echo "ERROR: Could not resolve the output path for '${zip_filename}'." >&2
    return 1
  fi

  # Globals
  declare -g output_path="${resolved_path}"

  echo "Clearing and creating output path '${output_path}'..."
  rm -rf -- "${output_path:?}"
  mkdir -p -- "${output_path}"
  echo "Done!"
  echo ""
}
##
# Ensure that all CSV files in the source path are properly formed and refer to
# only files that exist.
#
# This is a necessary step because the process of breaking up a large set of
# files and metadata into smaller sets makes it more difficult to detect when
# one or more assets are missing or malformed during import.
#
# Only CSV files at the top level of the source path are checked.
#
# Input global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#
sanity_check_csv_files() {
  local csv_filepath

  # `IFS= read -r` keeps backslashes and surrounding whitespace in paths
  # intact. Feeding the loop through process substitution (instead of a pipe)
  # keeps it in the current shell, so an `exit` raised by a failed check ends
  # the script itself rather than just a pipeline subshell.
  while IFS= read -r csv_filepath; do
    sanity_check_csv_file "${csv_filepath}"
  done < <(find "${src_path}" -maxdepth 1 -name '*.csv')
}
##
# Run every sanity check against a single CSV file.
#
# The file must have the expected columns and must only refer to files that
# exist.
#
# @param string $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_file() {
  local target_csv="${1}"
  local check

  printf '%s\n' "Sanity-checking CSV files..."

  for check in sanity_check_csv_columns sanity_check_csv_filenames; do
    "${check}" "${target_csv}"
  done

  printf '%s\n' "Done!" ""
}
##
# Verify that the provided CSV file has a column named "Filename".
#
# Several parts of this script rely on the existence of this column. When the
# column is missing, an error is reported and the script exits with status 2.
#
# @param string $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_columns() {
  local csv_path="${1}"
  local filename_column

  # The lookup's exit status is deliberately ignored; only its output matters.
  filename_column="$(csv_get_filename_column_index "${csv_path}")" || true

  # Happy path first: the column was found.
  if [[ -n "${filename_column}" ]]; then
    echo " - 'Filename' column is present. [success]"
    return
  fi

  {
    echo ""
    echo "ERROR: CSV file must contain a 'Filename' column: ${csv_path}"
  } >&2

  pause_and_exit 2
}
##
# Ensure that every asset referenced in the provided CSV file exists.
#
# This is a necessary step because the process of breaking up a large set of
# files and metadata into smaller sets makes it more difficult to detect when
# one or more assets are missing during import.
#
# Each value of the "Filename" column is resolved relative to $src_path. On
# the first missing file, the CSV file, the filename, and the row number are
# reported on standard error and the script exits with status 2.
#
# Input global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#
# @param string $1
#   The path to the CSV file to sanity check.
#
sanity_check_csv_filenames() {
  local csv_filepath="${1}"
  local image_filename_col_idx
  local csv_image_filename

  # Row 1 is the header, so the first data row is row 2.
  local current_row_number=2

  image_filename_col_idx="$(csv_get_filename_column_index "${csv_filepath}")"

  # `IFS= read -r` compares the value exactly as written in the CSV file
  # (backslashes and surrounding whitespace included), which is also how the
  # rows are matched when the bins are built. The loop is fed via process
  # substitution so that it runs in the current shell rather than a pipeline
  # subshell.
  while IFS= read -r csv_image_filename \
      || [[ -n "${csv_image_filename}" ]]; do
    if [[ ! -f "${src_path}/${csv_image_filename}" ]]; then
      {
        echo ""
        echo "ERROR: Missing image file."
        echo " - CSV file: ${csv_filepath}"
        echo " - Image file: ${csv_image_filename}"
        echo " - Row: ${current_row_number}"
      } >&2

      pause_and_exit 2
    fi

    current_row_number=$(( current_row_number + 1 ))
  done < <(
    csv_file_to_psv "${csv_filepath}" \
      | tail -n +2 \
      | cut --delimiter='|' --fields="${image_filename_col_idx}"
  )

  echo " - All referenced files exist. [success]"
}
##
# Determine the file sizes of all files in the image source folder.
#
# Requires GNU `stat` (for `--format`).
#
# Input global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#
# Output global vars:
#   - $image_filenames: An array of filenames, ordered largest to smallest file.
#   - $image_sizes: An associative array of filenames => sizes (in bytes).
#
read_source_file_sizes() {
  # Globals
  declare -ag image_filenames=()
  declare -Ag image_sizes=()

  local size
  local filename

  echo "Determining sizes of files..."

  # Each line is "<SIZE>|<FILENAME>". The size comes first so that:
  #   - the sort can be NUMERIC on the first field (a plain string sort would
  #     rank "9" above "100"); and
  #   - a "|" inside a filename ends up in the last `read` variable instead of
  #     truncating the name.
  # Ties are broken by filename so that the order is deterministic.
  while IFS='|' read -r size filename; do
    # Skip blank input (e.g. when the image folder contains no files).
    [[ -n "${filename}" ]] || continue

    image_filenames+=("${filename}")
    image_sizes["${filename}"]="${size}"
  done < <(
    find "${src_path}/${RELATIVE_IMAGE_SOURCE_PATH}" \
        -type f \
        -exec stat --format='%s|%n' '{}' '+' \
      | LC_ALL=C sort --field-separator='|' --key=1,1nr --key=2
  )

  echo "Done!"
  echo ""
}
##
# Copy files from the source path into "bins" of files that are each no larger
# than the maximum size, and turn every bin into a ZIP file.
#
# Bins are numbered sequentially, starting at 1. When all files have been
# handled, the location of the output is printed and the script exits.
#
# Input global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#   - $output_path: The absolute path to the folder that should contain output
#     files.
#   - $image_filenames: An array of filenames, ordered largest to smallest file.
#   - $image_sizes: An associative array of filenames => sizes.
#
# @param string $1
#   The path to the ZIP file that the script is operating on.
#
organize_files() {
  local zip_filename="${1}"
  local zip_prefix
  local sequence_number=1

  zip_prefix="$(zip_basename "${zip_filename}")"

  # Follow a greedy algorithm, taking the largest files that will fit in each
  # batch. Every pass removes the files it placed from $image_sizes.
  while [[ "${#image_sizes[@]}" -gt 0 ]]; do
    calculate_solution
    copy_and_zip_solution_files "${zip_prefix}" "${sequence_number}"

    sequence_number=$(( sequence_number + 1 ))
  done

  echo "Done!"
  echo ""
  echo "Output can be found in: ${output_path}"
  echo ""

  pause_and_exit
}
##
# Locate files that are small enough to fit in the current bin.
#
# Files are considered from largest to smallest, and each file that still fits
# is added to the bin. The total size of the bin is guaranteed to be no larger
# than the maximum total size, but may be smaller if the remaining files do not
# fit or there are no files left.
#
# Every file that is placed in the bin is removed from $image_sizes. If files
# remain but none of them fits in an empty bin, an error is reported and the
# script exits with status 3.
#
# Input global vars:
#   - $MAX_BIN_SIZE: The maximum size, in bytes, of a bin.
#   - $image_filenames: An array of filenames, ordered largest to smallest file.
#   - $image_sizes: An associative array of filenames => sizes.
#
# Output global vars:
#   - $current_solution: An array containing filenames of all files that will
#     fit in the current bin.
#
calculate_solution() {
  # Globals
  declare -ag current_solution=()

  local current_solution_size=0
  local filename
  local file_size
  local new_solution_size

  # NOTE: We can't loop over the hash keys because they are sorted by hash
  # code, not size.
  for filename in "${image_filenames[@]}"; do
    # Skip files that were already placed in an earlier bin.
    [[ -n "${image_sizes[${filename}]+_}" ]] || continue

    file_size="${image_sizes[${filename}]}"
    new_solution_size=$(( current_solution_size + file_size ))

    if (( new_solution_size <= MAX_BIN_SIZE )); then
      current_solution+=("${filename}")
      current_solution_size="${new_solution_size}"

      # Single quotes: `unset` expands the subscript itself, so the filename
      # is never subjected to globbing or word splitting.
      unset 'image_sizes[${filename}]'
    fi
  done

  # Files are left over, yet not even one of them fits into an empty bin.
  if [[ "${#image_sizes[@]}" -gt 0 && "${#current_solution[@]}" -eq 0 ]]; then
    {
      echo "ERROR: Cannot create small enough bin -- remaining files may be "
      echo "too large."
    } >&2

    pause_and_exit 3
  fi
}
##
# Build the bin for the current solution: create its folder, fill it with the
# solution's images and filtered CSV files, and compress it into a ZIP file.
#
# Input global vars:
#   - $output_path: The absolute path to the folder that should contain output
#     files.
#   - $current_solution: An array containing filenames of all files that will
#     fit in the current bin.
#
# @param string $1
#   The prefix to use for each ZIP file.
# @param integer $2
#   The current sequence number.
#
copy_and_zip_solution_files() {
  local archive_prefix="${1}"
  local bin_number="${2}"
  local padded_number
  local bin_dir

  # Zero-pad the sequence number to three digits (e.g. 7 => "007").
  printf -v padded_number "%03d" "${bin_number}" || true
  bin_dir="${output_path}/${padded_number}"

  mkdir -p "${bin_dir}"

  echo "=== Bin #${padded_number} === "

  copy_solution_images "${bin_dir}"
  copy_solution_csv_files "${bin_dir}"
  zip_solution_files "${bin_dir}" "${archive_prefix}" "${padded_number}"
}
##
# Copy all of the image files/assets for the current solution to the folder for
# the corresponding bin of files.
#
# Each file keeps its path relative to $src_path (e.g. "images/sub/a.jpg"), so
# that the bin mirrors the layout that the "Filename" column of the CSV files
# refers to, and so that equally-named files from different sub-folders do not
# overwrite each other.
#
# Input global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#   - $current_solution: An array containing filenames of all files that will
#     fit in the current bin.
#
# @param string $1
#   The target path where files are stored for the current solution bin.
#
copy_solution_images() {
  local bin_target_path="${1}"
  local filename
  local relative_path
  local target_dir

  echo "Select and copy images:"

  for filename in "${current_solution[@]}"; do
    relative_path="${filename#"${src_path}/"}"

    # Not located under $src_path (not expected): fall back to placing the
    # file directly inside the image folder of the bin.
    if [[ "${relative_path}" == "${filename}" ]]; then
      relative_path="${RELATIVE_IMAGE_SOURCE_PATH}/$(basename -- "${filename}")"
    fi

    target_dir="${bin_target_path}/$(dirname -- "${relative_path}")"

    mkdir -p -- "${target_dir}"
    cp -v -- "${filename}" "${target_dir}/"
  done

  echo ""
}
##
# Copy a filtered version of each CSV file to the folder for the corresponding
# bin of files.
#
# The header line is always copied. Data rows are copied verbatim, but only
# when the file named in their "Filename" column exists inside the bin; rows
# that refer to files that are not in the bin are filtered out.
#
# Input global vars:
#   - $src_path: The absolute path to the folder that contains the contents of
#     the zip file.
#
# @param string $1
#   The target path where files are stored for the current solution bin. The
#   images of the bin must already have been copied into it.
#
copy_solution_csv_files() {
  local bin_target_path="${1}"
  local csv_filepath
  local csv_filename
  local target_csv_filename
  local image_filename_col_idx
  local csv_line
  local csv_image_filename

  echo "Filter and copy CSV files:"

  while IFS= read -r csv_filepath; do
    image_filename_col_idx="$(csv_get_filename_column_index "${csv_filepath}")"
    csv_filename="$(basename -- "${csv_filepath}")"
    target_csv_filename="${bin_target_path}/${csv_filename}"

    echo " - Creating filtered copy of '${csv_filepath}' as '${target_csv_filename}'..."

    # Always copy header line
    head -n 1 -- "${csv_filepath}" > "${target_csv_filename}"

    # Filter out CSV lines that reference files not in this solution bin.
    #
    # `IFS= read -r` keeps each row byte-for-byte (no backslash processing, no
    # whitespace trimming); the `|| [[ -n ... ]]` part keeps a final row that
    # lacks a trailing newline.
    while IFS= read -r csv_line || [[ -n "${csv_line}" ]]; do
      csv_image_filename="$(
        csv_line_to_psv "${csv_line}" \
          | cut --delimiter='|' --fields="${image_filename_col_idx}"
      )"

      if [[ -n "${csv_image_filename}" \
          && -f "${bin_target_path}/${csv_image_filename}" ]]; then
        printf '%s\n' "${csv_line}" >> "${target_csv_filename}"
      fi
    done < <(tail -n +2 -- "${csv_filepath}")
  done < <(find "${src_path}" -maxdepth 1 -name '*.csv')

  echo ""
}
##
# Convert a bin (folder of files) into a ZIP file.
#
# The ZIP file is named "<prefix>-<sequence number>.zip" and is created in
# $output_path. Files are MOVED into the archive (`zip -m`), after which the
# emptied bin folder is removed.
#
# Input global vars:
#   - $output_path: The absolute path to the folder that should contain output
#     files.
#
# @param string $1
#   The target path containing the files to put into the ZIP archive.
# @param string $2
#   The ZIP file prefix.
# @param string $3
#   The three-digit bin sequence number.
#
zip_solution_files() {
  local bin_target_path="${1}"
  local zip_prefix="${2}"
  local bin_sequence_number="${3}"

  # $output_path is absolute, so the archive path stays valid after the `cd`
  # below.
  local target_zip_filename="${output_path}/${zip_prefix}-${bin_sequence_number}.zip"

  echo "Zipping bin as '${target_zip_filename}'..."

  # Run inside a sub-shell so that the working directory of the script itself
  # never changes (and no `cd -` is needed, which would print a stray path).
  (
    cd -- "${bin_target_path}" \
      && zip -9 -rm "${target_zip_filename}" ./
  )

  rmdir -- "${bin_target_path}"

  echo ""
}
################################################################################ | |
# Utility Functions | |
################################################################################ | |
##
# Create a temporary directory.
#
# The directory is created under $TMPDIR when that variable is set, and under
# /tmp otherwise.
#
# This function should be invoked in a sub-shell so that the directory path can
# be echo-ed into a local variable. For example:
#   local tmp_dir
#   tmp_dir=$(create_tmp_dir)
#
# If the directory should be removed when this script exits, be sure to call
# delete_on_exit and pass it the directory name. For example:
#   local tmp_dir
#   tmp_dir=$(create_tmp_dir)
#   delete_on_exit "${tmp_dir}"
#
# @return string
#   The path to the temporary folder. The exit status is non-zero (and nothing
#   is printed) when the folder could not be created.
#
create_tmp_dir() {
  local tmp_root="${TMPDIR:-/tmp}"

  # `mktemp` prints the new path itself; calling it last lets its exit status
  # become the status of this function instead of being swallowed.
  mktemp -d "${tmp_root%/}/split_zip.XXXXXXXXXX"
}
##
# Delete a file or folder when this script exits normally or abnormally.
#
# @param string $1
#   The path to the file/folder to delete. An empty path is ignored.
#
delete_on_exit() {
  local target="${1}"
  local quoted_target

  # Nothing sensible can be deleted for an empty path.
  [[ -n "${target}" ]] || return 0

  # The queued command is a string that is evaluated later on, so the path has
  # to be shell-quoted; otherwise spaces or shell meta-characters in it would
  # be re-interpreted at that point.
  printf -v quoted_target '%q' "${target}"

  add_on_exit "rm -rf -- ${quoted_target}"
}
# Credit:
# https://www.linuxjournal.com/content/use-bash-trap-statement-cleanup-temporary-files

##
# The commands to run when this script exits, in the order they were queued.
#
declare -a on_exit_items=()

##
# Queue-up a command to run when this script exits normally or abnormally.
#
# @param string $*
#   The command and arguments to queue-up. They are joined into one string
#   that is evaluated by the shell on exit, so arguments that may contain
#   spaces or special characters must already be shell-quoted by the caller.
#
add_on_exit() {
  on_exit_items+=("$*")

  # Setup traps on the first item added to the list.
  if [[ "${#on_exit_items[@]}" -eq 1 ]]; then
    trap dispatch_on_exit_items EXIT

    # Turn signals into a real exit (128 + signal number). The EXIT trap then
    # runs the queued commands exactly once, and the script does not carry on
    # after its temporary files have been removed.
    trap 'exit 129' HUP
    trap 'exit 130' INT
    trap 'exit 143' TERM
  fi
}
##
# Execute commands that were queued-up for when this script exits.
#
# Commands run in the order in which they were queued. Each command is a
# string that is evaluated by the shell.
#
dispatch_on_exit_items() {
  local queued_command

  # The `${array[@]+...}` form expands to nothing for an empty/unset list,
  # which keeps this safe under `set -u` on older versions of Bash.
  for queued_command in ${on_exit_items[@]+"${on_exit_items[@]}"}; do
    # Quoted, so that the command string is evaluated exactly as queued.
    # Clean-up is best-effort: one failing command must not prevent the
    # remaining ones from running.
    eval "${queued_command}" || true
  done
}
##
# Determine which column in a CSV file contains filenames.
#
# The column whose header is exactly "Filename" (ignoring surrounding
# whitespace and double quotes) is preferred. If there is no such column, the
# first column whose header merely contains "Filename" is used instead.
#
# @param string $1
#   The path to the CSV file.
#
# @return string
#   The 1-based index of the column, or nothing if no column matches.
#
csv_get_filename_column_index() {
  local csv_filepath="${1}"

  # Every "|"-separated header field is one awk record. `exit` still runs the
  # END block, hence the `exact` flag.
  csv_file_to_psv "${csv_filepath}" \
    | head -n 1 \
    | awk -v RS='|' '
        {
          name = $0
          gsub(/^[ \t\r\n"]+|[ \t\r\n"]+$/, "", name)
        }
        name == "Filename" { print NR; exact = 1; exit }
        !partial && /Filename/ { partial = NR }
        END { if (!exact && partial) print partial }
      '
}
##
# Print a Comma-Separated Values (CSV) file as Pipe-Separated Values (PSV).
#
# Only commas that separate fields are turned into pipes; commas inside quoted
# field values are left alone.
#
# Working with pipes makes it easier to handle data that contains embedded
# commas and/or quotes. The pipe symbol does not appear often (if at all) in
# data that Inveniem works with.
#
# @param string $1
#   The path to the CSV file.
#
csv_file_to_psv() {
  local -r source_csv="${1}"

  # From:
  # https://unix.stackexchange.com/a/450813
  #
  # Replaces one field-separating comma per pass and loops (`t1` jumps back to
  # label `1`) until a pass replaces nothing.
  local -r loop_label=':1'
  local -r comma_to_pipe='s/^(([^",]|"[^"]*")*),/\1|/;t1'

  sed -E -e "${loop_label}" -e "${comma_to_pipe}" "${source_csv}"
}
##
# Convert a single line of a Comma-Separated Values (CSV) file into a line of
# Pipe-Separated Values (PSV).
#
# Any interior commas (i.e. commas inside quoted field values) are ignored.
#
# This is needed so that it's easier to work with data that contains embedded
# commas and/or quotes. The pipe symbol does not appear often (if at all) in
# data that Inveniem works with.
#
# @param string $1
#   The CSV line to convert.
#
# @return string
#   The converted line.
#
csv_line_to_psv() {
  local csv_line="${1}"

  # From:
  # https://unix.stackexchange.com/a/450813
  #
  # `printf` rather than `echo`: `echo` would swallow a line that happens to
  # look like one of its options (e.g. "-n").
  printf '%s\n' "${csv_line}" \
    | sed -Ee :1 -e 's/^(([^",]|"[^"]*")*),/\1|/;t1'
}
##
# Convert a ZIP file path into just the name of the file without file extension.
#
# The ".zip" extension is matched case-insensitively (e.g. "MY.ZIP" => "MY").
# Names without that extension are returned unchanged, and so is a file that
# is named just ".zip".
#
# @param string $1
#   The path to the target ZIP file.
#
# @return string
#   The name of the file, without folder and without ".zip" extension.
#
zip_basename() {
  local zip_filename="${1}"
  local name

  # "--" protects against filenames that start with a dash.
  name="$(basename -- "${zip_filename}")" || return

  # "?*" requires at least one character in front of the extension.
  if [[ "${name}" == ?*.[zZ][iI][pP] ]]; then
    name="${name%.*}"
  fi

  printf '%s\n' "${name}"
}
##
# Pause with the message "Press ENTER to continue" before exiting the program.
#
# This makes it easier for users to invoke this script via a drag-and-drop
# approach instead of having to run it via CLI to see error output, in the event
# that what they gave the program has problems.
#
# The pause is skipped when there is no terminal to read from (e.g. when the
# script runs from a scheduler), so that the requested exit code is always the
# one that is returned.
#
# @param integer $1 [optional]
#   The exit code to return when the script exits. The default is 0 (success).
#
pause_and_exit() {
  local exit_code="${1:-0}"

  echo ""

  # Probe the terminal first: without one, the redirection below would fail
  # and -- under `set -e` -- end the script with status 1 instead of the
  # requested exit code.
  if { true < /dev/tty; } 2>/dev/null; then
    read -r -p "Press ENTER to continue." < /dev/tty || true
  fi

  exit "${exit_code}"
}
##
# Resolve a path to an absolute path without relying on a `realpath` command.
#
# PHP's or Perl's implementation is used when available. If neither is
# available, or if it yields nothing (PHP does so for a path that does not
# exist yet), the path is resolved with the shell itself: the folder part is
# resolved via `cd -P` and the last path component is appended as-is. This
# allows resolving a file or folder that is yet to be created inside an
# existing folder.
#
# @param string $1
#   The path to resolve.
#
# @return string
#   The absolute path. The exit status is non-zero when the folder that
#   contains the path does not exist.
#
_realpath_fallback() {
  local target="${1}"
  local resolved=""
  local parent

  if command -v php >/dev/null 2>&1; then
    resolved="$(
      php -r 'echo realpath($argv[1]);' -- "${target}" 2>/dev/null
    )" || resolved=""
  elif command -v perl >/dev/null 2>&1; then
    resolved="$(
      perl -e 'use Cwd "abs_path";print abs_path(shift)' "${target}" \
        2>/dev/null
    )" || resolved=""
  fi

  if [[ -z "${resolved}" ]]; then
    if [[ -d "${target}" ]]; then
      resolved="$(cd -P -- "${target}" 2>/dev/null && pwd -P)" || resolved=""
    elif parent="$(
      cd -P -- "$(dirname -- "${target}")" 2>/dev/null && pwd -P
    )"; then
      resolved="${parent%/}/$(basename -- "${target}")"
    fi
  fi

  if [[ -z "${resolved}" ]]; then
    echo "ERROR: Cannot resolve path: ${target}" >&2
    return 1
  fi

  printf '%s\n' "${resolved}"
}

##
# Shim for `realpath` on systems like OSX.
#
# Only defined when the system does not provide a `realpath` command.
#
command -v realpath >/dev/null 2>&1 || realpath() {
  _realpath_fallback "$@"
}
################################################################################ | |
# Main Script Body | |
################################################################################ | |
##
# Entry point: validate the command-line arguments and run every step.
#
# @param string $1
#   The path to the ZIP file to split.
# @param integer $2 [optional]
#   The maximum size, in bytes, of each bin (read through $MAX_BIN_SIZE).
#
main() {
  if [[ $# -lt 1 || $# -gt 2 ]]; then
    print_usage
  fi

  local zip_filename="${1}"

  # Reject sizes like "100MB", "0" or "-5" up front, instead of failing later
  # in the middle of the size arithmetic.
  if [[ ! "${MAX_BIN_SIZE}" =~ ^[1-9][0-9]*$ ]]; then
    {
      echo "ERROR: 'max file size' must be a positive whole number of bytes."
      echo ""
    } >&2

    print_usage
  fi

  if [[ ! -f "${zip_filename}" ]]; then
    {
      echo "ERROR: ZIP file not found: ${zip_filename}"
      echo ""
    } >&2

    print_usage
  fi

  unzip_source_file "${zip_filename}"
  prepare_output_path "${zip_filename}"
  sanity_check_csv_files
  read_source_file_sizes
  organize_files "${zip_filename}"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment