configure the ASVO client and conversion job parameters; see https://github.com/MWATelescope/manta-ray-client for how to obtain your API key
export MWA_ASVO_API_KEY=... # FIXME
export output=ms
export flag_edge_width=80
export avg_time_res=... # FIXME
export avg_freq_res=... # FIXME
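for illustration only, hypothetical averaging values (the FIXMEs above are deliberately left for you to choose; these numbers are not a recommendation)
# export avg_time_res=4   # seconds
# export avg_freq_res=40  # kHz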
define a location on scratch with enough storage to hold the downloaded data
export outdir="$DHOME/hopper"
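make sure the directory exists before anything is written into it:
mkdir -p "$outdir"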
define a list of obsids
export obslist="$outdir/obsids.csv"
# for example
# cat <<- 'EoF' > $obslist
# 1090701368
# EoF
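a quick sanity check of how many obsids will be submitted:
wc -l $obslist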
set up software: define how to run jq, rclone, and giant-squid
cat <<- 'EOF' > $outdir/prelude.sh
module load singularity
module load rclone
alias giant-squid='singularity exec -B $PWD docker://mwatelescope/giant-squid:latest /opt/cargo/bin/giant-squid'
alias jq='singularity exec docker://stedolan/jq jq'
# ... or ...
# download jq with
# mkdir -p $MYSOFTWARE/bin; cd $_
# wget https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64;
# chmod +x jq-linux-amd64;
# alias jq="$MYSOFTWARE/bin/jq-linux-amd64"
EOF
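the prelude then needs to be pulled into whichever shell runs the commands below; a sketch, assuming bash (the shopt line is only needed inside non-interactive scripts, which do not expand aliases by default):
shopt -s expand_aliases       # only needed in scripts; interactive shells expand aliases already
source "$outdir/prelude.sh"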
configure rclone with your acacia storage, see https://pawsey.atlassian.net/wiki/spaces/US/pages/51924486/Configuring+and+running+an+S3+client
export rclone_storage=... # acacia storage profile name from rclone config, e.g. mwasci
export rclone_bucket=... # acacia bucket name to upload files to
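before uploading anything it is worth confirming the remote actually works; a minimal sketch using the profile and bucket names set above:
rclone lsd ${rclone_storage}:                    # list the buckets the profile can see
rclone mkdir ${rclone_storage}:${rclone_bucket}  # create the bucket if it doesn't exist yet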
request conversion jobs
export params="output=${output},flag_edge_width=${flag_edge_width},avg_time_res=${avg_time_res},avg_freq_res=${avg_freq_res}"
giant-squid submit-conv --parameters=$params $obslist
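conversion can take a while; you can poll progress with the same list subcommand used below (recent giant-squid versions also have a wait subcommand, if yours provides it):
giant-squid list --types conversion -- $obslist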
list the jobs that are ready and download them from asvo in parallel, keeping each one as its tar file
export archive_suffix="_edg${flag_edge_width}_${avg_time_res}s_${avg_freq_res}kHz.${output}"
giant-squid list -j --types conversion --states ready -- $obslist \
| tee /dev/stderr \
| jq -r '.[]|[.obsid,.jobId,.files[0].fileUrl//"",.files[0].fileSize//"",.files[0].fileHash//""]|@tsv' \
| tee ready.tsv
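a small guard in case nothing is ready yet (sketch):
[ -s ready.tsv ] || echo "no conversion jobs ready yet, try again later"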
# schedule download jobs, one backgrounded srun per obsid so they run in parallel
while read -r obsid jobid url size hash; do
    echo "$(date -Is) downloading obsid=$obsid jobid=$jobid url=$url size=$size hash=$hash"
    srun -N 1 -c 1 --time=23:55:00 -o ${obsid}.log \
        wget "$url" -O"${outdir}/${obsid}${archive_suffix}.tar" --progress=dot:giga --wait=60 --random-wait \
        </dev/null &   # detach stdin so srun doesn't swallow the rest of ready.tsv
done < ready.tsv
wait
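the fileHash column can then be used to verify the tars; a sketch, assuming the hash asvo reports is a SHA-1 checksum (worth confirming against your giant-squid version before relying on it):
while read -r obsid jobid url size hash; do
    tar="${outdir}/${obsid}${archive_suffix}.tar"
    [ -f "$tar" ] || { echo "missing $tar"; continue; }
    actual=$(sha1sum "$tar" | cut -d' ' -f1)
    [ "$actual" = "$hash" ] || echo "hash mismatch for $tar (expected $hash, got $actual)"
done < ready.tsv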
wait a bit, delete and retry any downloads that failed, then upload them all to acacia.
# rclone copy takes a single source directory, so filter with --include rather than a shell glob
srun -N 1 -c 4 --time=23:55:00 rclone copy --include '*.tar' ${outdir}/ ${rclone_storage}:${rclone_bucket}/
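before deleting anything locally, a quick sketch to confirm the upload using rclone's own comparison:
rclone check --include '*.tar' ${outdir}/ ${rclone_storage}:${rclone_bucket}/ && echo "all tars present and matching"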
to retry the failures you can just submit the whole lot again, as in the conversion-job submission step above
giant-squid submit-conv --parameters=$params $obslist
the download step could also do with a check that the obsid isn't already downloaded, and could probably be wrapped in a parent slurm script that limits the number of concurrent tasks. something like the following might work, though it may need tweaks
#!/bin/bash
#SBATCH -J "asvo-download"
#SBATCH --nodes=4
#SBATCH --ntasks=4
#SBATCH --cpus-per-task=1
#SBATCH --time=23:55:00
shopt -s expand_aliases     # aliases aren't expanded in batch scripts unless this is set
source $outdir/prelude.sh   # provides the giant-squid and jq aliases
export archive_suffix="_edg${flag_edge_width}_${avg_time_res}s_${avg_freq_res}kHz.${output}"
giant-squid list -j --types conversion --states ready -- $obslist \
| tee /dev/stderr \
| jq -r '.[]|[.obsid,.jobId,.files[0].fileUrl//"",.files[0].fileSize//"",.files[0].fileHash//""]|@tsv' \
| tee ready.tsv
# schedule download jobs; sruns beyond the 4 allocated tasks should queue until a slot frees
while read -r obsid jobid url size hash; do
    [ -f "${outdir}/${obsid}${archive_suffix}.tar" ] && continue # TODO: maybe even check $hash matches?
    echo "$(date -Is) downloading obsid=$obsid jobid=$jobid url=$url size=$size hash=$hash"
    srun -N 1 -c 1 --time=23:55:00 -o ${obsid}.log \
        wget "$url" -O"${outdir}/${obsid}${archive_suffix}.tar" --progress=dot:giga --wait=60 --random-wait \
        </dev/null &   # detach stdin so srun doesn't swallow the rest of ready.tsv
done < ready.tsv
wait
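a sketch of how this might be submitted (download.sbatch is a hypothetical filename; the exports defined earlier must be set in the submitting shell, which sbatch's default --export=ALL then carries into the job, and you may also need your site's --account/--partition flags):
sbatch download.sbatch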