Skip to content

Instantly share code, notes, and snippets.

@elipapa
Last active May 22, 2016 22:49
Show Gist options
  • Save elipapa/78de6b1f86e336382417 to your computer and use it in GitHub Desktop.
simple bash task queue to download, decompress and upload files to a S3 bucket
#!/bin/bash
# Decompression worker: watches queue/ for *.downloaded markers, extracts each
# tarball in the current directory with lbzip2, and drops a *.decompressed
# marker so the upload worker can pick it up. Pauses extraction whenever the
# scratch volume is 60% full or more.

# Prints the used-space percentage of the scratch volume as a bare integer.
used_pct() {
  df /home/ubuntu/decomp --output=pcent | sed '1d' | awk '{print $1}' | sed 's/%//'
}

while true; do
  usedspace=$(used_pct)
  # Only extract while there is disk headroom; re-check after every pass.
  until [[ $usedspace -ge 60 ]]; do
    for file_ready in queue/*.downloaded; do
      [[ -e "$file_ready" ]] || continue      # glob matched nothing
      filepath=${file_ready%.downloaded}      # strip the marker suffix
      file=${filepath##*/}                    # tarball name, e.g. SRS011061.tar.bz2
      srsid=${file%%.*}                       # sample id, e.g. SRS011061
      if [ ! -e "queue/$srsid.decompressed" ]; then
        echo "Decompressing $srsid ..."
        mkdir -p "$srsid"
        if tar xvf "$file" --use-compress-program=lbzip2 -C "$srsid"; then
          echo "$file succesfully decompressed to $srsid. queueing for upload.. "
          touch "queue/$srsid.decompressed"
        else
          echo "error in decompressing $file ... is there enough disk space?"
        fi
      fi
    done
    # BUG FIX: the original only refreshed usedspace outside this loop, so the
    # worker busy-spun here forever once entered, never seeing the disk fill up.
    usedspace=$(used_pct)
  done
  sleep 1
done
#!/bin/bash
# Download worker: reads one tarball name per line from the manifest file given
# as $1, skips samples that were already uploaded / queued / downloaded,
# throttles when the scratch volume is 60% full or more, then copies each
# tarball from S3 and drops a queue/<name>.downloaded marker.

if [[ -z "${1:-}" ]]; then
  echo "usage: ${0##*/} <manifest-file>" >&2
  exit 1
fi

mkdir -p queue

# Prints the used-space percentage of the scratch volume as a bare integer.
used_pct() {
  df /home/ubuntu/decomp --output=pcent | sed '1d' | awk '{print $1}' | sed 's/%//'
}

# `|| [[ -n "$line" ]]` also processes a final line with no trailing newline.
while IFS='' read -r line || [[ -n "$line" ]]; do
  srsid=${line%%.*}   # sample id, e.g. SRS011061 from SRS011061.tar.bz2
  echo "looking if sample $srsid has already been completely processed..."
  # if file exist on destination bucket, just skip to the next loop iteration
  if aws s3 ls "s3://finch-data/HMIWGS/$srsid/"; then
    echo "something for $srsid is in the destination bucket. skipping."
    continue
  fi
  if [[ -e "queue/$srsid.finished" ]]; then
    echo "this $srsid was already uploaded decompressed. skipping."
    continue
  fi
  if [[ -e "queue/$line.downloaded" ]]; then
    echo "it seems I have already downloaded $line. skipping."
    continue
  fi
  # check how much space has been taken up; block until there is headroom
  usedspace=$(used_pct)
  while [[ $usedspace -ge 60 ]]; do
    echo "$(date) -- disk getting full. waiting ... "
    sleep 60
    usedspace=$(used_pct)
  done
  echo "copying $line on to the local drive..."
  if aws s3 cp "s3://finch-data/Healthy_HMP_stool_WGS/$line" .; then
    echo "$line copied succesfully, queueing for decompression"
    touch "queue/$line.downloaded"
  else
    echo "could not copy $line. moving to next sample..."
  fi
  # update to see if the volume is getting too full
done < "$1"
SRS011061.tar.bz2
SRS011084.tar.bz2
SRS011134.tar.bz2
SRS011239.tar.bz2
SRS011271.tar.bz2
SRS011302.tar.bz2
SRS011405.tar.bz2
SRS011452.tar.bz2
SRS011529.tar.bz2
SRS011586.tar.bz2
SRS012273.tar.bz2
SRS012849.tar.bz2
SRS012902.tar.bz2
SRS012969.tar.bz2
SRS013098.tar.bz2
SRS013158.tar.bz2
SRS013215.tar.bz2
SRS013216.tar.bz2
SRS013476.tar.bz2
SRS013521.tar.bz2
SRS013638.tar.bz2
SRS013639.tar.bz2
SRS013687.tar.bz2
SRS013800.tar.bz2
SRS013951.tar.bz2
SRS014235.tar.bz2
SRS014287.tar.bz2
SRS014313.tar.bz2
SRS014459.tar.bz2
SRS014613.tar.bz2
SRS014683.tar.bz2
SRS014923.tar.bz2
SRS014979.tar.bz2
SRS015065.tar.bz2
SRS015133.tar.bz2
SRS015190.tar.bz2
SRS015217.tar.bz2
SRS015264.tar.bz2
SRS015369.tar.bz2
SRS015431.tar.bz2
SRS015578.tar.bz2
SRS015663.tar.bz2
SRS015782.tar.bz2
SRS015794.tar.bz2
SRS015854.tar.bz2
SRS015890.tar.bz2
SRS015960.tar.bz2
SRS016018.tar.bz2
SRS016056.tar.bz2
SRS016095.tar.bz2
SRS016203.tar.bz2
SRS016267.tar.bz2
SRS016335.tar.bz2
SRS016437.tar.bz2
SRS016438.tar.bz2
SRS016495.tar.bz2
SRS016517.tar.bz2
SRS016585.tar.bz2
SRS016753.tar.bz2
SRS016954.tar.bz2
SRS016989.tar.bz2
SRS016990.tar.bz2
SRS017103.tar.bz2
SRS017191.tar.bz2
SRS017247.tar.bz2
SRS017307.tar.bz2
SRS017433.tar.bz2
SRS017521.tar.bz2
SRS017701.tar.bz2
SRS017821.tar.bz2
SRS017916.tar.bz2
SRS018133.tar.bz2
SRS018313.tar.bz2
SRS018351.tar.bz2
SRS018427.tar.bz2
SRS018575.tar.bz2
SRS018656.tar.bz2
SRS018817.tar.bz2
SRS018984.tar.bz2
SRS019030.tar.bz2
SRS019068.tar.bz2
SRS019161.tar.bz2
SRS019267.tar.bz2
SRS019381.tar.bz2
SRS019397.tar.bz2
SRS019582.tar.bz2
SRS019601.tar.bz2
SRS019685.tar.bz2
SRS019787.tar.bz2
SRS019910.tar.bz2
SRS019968.tar.bz2
SRS020233.tar.bz2
SRS020328.tar.bz2
SRS020622.tar.bz2
SRS020869.tar.bz2
SRS021153.tar.bz2
SRS021219.tar.bz2
SRS021484.tar.bz2
SRS021948.tar.bz2
SRS022071.tar.bz2
SRS022093.tar.bz2
SRS022137.tar.bz2
SRS022524.tar.bz2
SRS022609.tar.bz2
SRS022713.tar.bz2
SRS023176.tar.bz2
SRS023346.tar.bz2
SRS023526.tar.bz2
SRS023583.tar.bz2
SRS023829.tar.bz2
SRS023914.tar.bz2
SRS023971.tar.bz2
SRS024009.tar.bz2
SRS024075.tar.bz2
SRS024132.tar.bz2
SRS024265.tar.bz2
SRS024331.tar.bz2
SRS024388.tar.bz2
SRS024435.tar.bz2
SRS024549.tar.bz2
SRS024625.tar.bz2
SRS024663.tar.bz2
SRS042284.tar.bz2
SRS042628.tar.bz2
SRS043001.tar.bz2
SRS043411.tar.bz2
SRS043667.tar.bz2
SRS043701.tar.bz2
SRS045004.tar.bz2
SRS045528.tar.bz2
SRS045645.tar.bz2
SRS045713.tar.bz2
SRS045739.tar.bz2
SRS047014.tar.bz2
SRS047044.tar.bz2
SRS048164.tar.bz2
SRS048870.tar.bz2
SRS049164.tar.bz2
SRS049402.tar.bz2
SRS049712.tar.bz2
SRS049896.tar.bz2
SRS049900.tar.bz2
SRS049959.tar.bz2
SRS049995.tar.bz2
SRS050026.tar.bz2
SRS050299.tar.bz2
SRS050422.tar.bz2
SRS050752.tar.bz2
SRS050925.tar.bz2
SRS051031.tar.bz2
SRS051882.tar.bz2
SRS052027.tar.bz2
SRS052697.tar.bz2
SRS053214.tar.bz2
SRS053335.tar.bz2
SRS053356.tar.bz2
SRS053398.tar.bz2
SRS053573.tar.bz2
SRS053649.tar.bz2
SRS054352.tar.bz2
SRS054590.tar.bz2
SRS054956.tar.bz2
SRS055533.tar.bz2
SRS055982.tar.bz2
SRS056259.tar.bz2
SRS056273.tar.bz2
SRS056519.tar.bz2
SRS057478.tar.bz2
SRS057717.tar.bz2
SRS058070.tar.bz2
SRS058723.tar.bz2
SRS058770.tar.bz2
SRS062427.tar.bz2
SRS063040.tar.bz2
SRS063489.tar.bz2
SRS063518.tar.bz2
SRS063985.tar.bz2
SRS064276.tar.bz2
SRS064557.tar.bz2
SRS064645.tar.bz2
SRS064973.tar.bz2
SRS065504.tar.bz2
SRS074670.tar.bz2
SRS074964.tar.bz2
SRS075078.tar.bz2
SRS075341.tar.bz2
SRS075398.tar.bz2
SRS075963.tar.bz2
SRS076929.tar.bz2
SRS077086.tar.bz2
SRS077194.tar.bz2
SRS077294.tar.bz2
SRS077335.tar.bz2
SRS077502.tar.bz2
SRS077552.tar.bz2
SRS077730.tar.bz2
SRS077753.tar.bz2
SRS077849.tar.bz2
SRS078176.tar.bz2
SRS078242.tar.bz2
SRS078419.tar.bz2
SRS078665.tar.bz2
SRS097889.tar.bz2
SRS098514.tar.bz2
SRS098571.tar.bz2
SRS098644.tar.bz2
SRS098717.tar.bz2
SRS098827.tar.bz2
SRS100021.tar.bz2
SRS101376.tar.bz2
SRS101433.tar.bz2
SRS103987.tar.bz2
SRS104197.tar.bz2
SRS104311.tar.bz2
SRS104400.tar.bz2
SRS104485.tar.bz2
SRS105153.tar.bz2
SRS140492.tar.bz2
SRS140513.tar.bz2
SRS140645.tar.bz2
SRS142503.tar.bz2
SRS142505.tar.bz2
SRS142599.tar.bz2
SRS142712.tar.bz2
SRS142890.tar.bz2
SRS143070.tar.bz2
SRS143181.tar.bz2
SRS143342.tar.bz2
SRS143417.tar.bz2
SRS143598.tar.bz2
SRS143780.tar.bz2
SRS143876.tar.bz2
SRS143991.tar.bz2
SRS144362.tar.bz2
SRS144506.tar.bz2
SRS144537.tar.bz2
SRS145497.tar.bz2
SRS146764.tar.bz2
SRS146812.tar.bz2
SRS146813.tar.bz2
SRS147022.tar.bz2
SRS147088.tar.bz2
SRS147139.tar.bz2
SRS147346.tar.bz2
SRS147445.tar.bz2
SRS147652.tar.bz2
SRS147766.tar.bz2
SRS147919.tar.bz2
SRS148196.tar.bz2
SRS148424.tar.bz2
SRS148721.tar.bz2
#!/usr/bin/env bash
# Upload worker: watches queue/ for *.decompressed markers, uploads every
# .fastq found under the sample directory to S3, then frees the local disk
# (extracted dir + original tarball) and drops a *.finished marker so the
# download worker can skip this sample.
while true; do
  for file_ready in queue/*.decompressed; do
    [[ -e "$file_ready" ]] || continue      # glob matched nothing
    filepath=${file_ready%.decompressed}
    if [[ ! -e "$filepath.finished" ]]; then
      srsid=${filepath##*/}
      echo "queueing $srsid for upload"
      # this find hack was necessary since some tar files do not open in a directory
      # NUL-delimited find|xargs so unusual filenames cannot split into words
      if find "$srsid" -name "*.fastq" -printf '%P\0' \
           | xargs -0 -I {} -n1 aws s3 cp "$srsid/{}" "s3://finch-data/HMIWGS/stool/$srsid/"; then
        echo "upload of $srsid succesful. cleaning disk space"
        # ${srsid:?} aborts rather than expanding to `rm -rf /` if ever empty
        rm -rf -- "${srsid:?}/"
        rm -f -- "$srsid.tar.bz2"
        touch "queue/$srsid.finished"
      else
        echo "leaving $srsid decompressed samples on disk."
      fi
    fi
  done
  sleep 1
done
#!/usr/bin/env bash
# One-time EC2 instance setup for the download/decompress/upload pipeline.
# r3.4xlarge (0.6$ bid)
# BUG FIX: original first line was the garbled shebang `!#!/usr/bin/env bash`.
set -euo pipefail

sudo apt-get install python-pip
sudo pip install awscli

# Format and mount the ephemeral volume as the scratch space the workers use.
sudo mkfs -t ext4 /dev/xvdb
mkdir -p decomp
sudo mount /dev/xvdb decomp/
sudo chown -R ubuntu:ubuntu decomp/

cd /home/ubuntu/decomp || exit 1
# NOTE(review): this URL looks scrape-mangled; the original gist is likely
# https://gist.github.com/78de6b1f86e336382417.git — confirm before running.
sudo -u ubuntu git clone https://gist.github.com/78de6b1f86e336382417.git src
# better to launch a tmux session and run the three processes in parallel
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment