Last active
October 21, 2020 17:19
-
-
Save aaronwolen/17cec4a4862280ca4fcc6131c633b31b to your computer and use it in GitHub Desktop.
Shell script to run ingest/exports with different versions of tiledb-vcf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Run a tiledb-vcf ingest followed by an export using a specific,
# version-tagged build of the tiledbvcf command-line tool.

# variables
#############
# tiledbvcf binary, reached through the dist/ link set up later in the script
tilevcf="dist/bin/tiledbvcf"
# scratch directory; also holds the generated sample list
tmpdir="/mnt/data/tmp"
samplefile="$tmpdir/samples.txt"

output_dir="data"
export_dir="$output_dir/exports"
log_dir="$output_dir/logs"

# Build under test; it selects the dist_<version> directory and tags the
# array path, log names and export name. Enable exactly one.
# version="v3_211"
# version="v4_211"
# version="v4_sample-string-id_211"
version="v3_release_208"

# local or remote
dest=local
# base location of the array when dest=remote
array_bucket="s3://genomic-datasets/vcf-samples-20"
# base location of the array when dest=local
array_filepath="/mnt/data/test-vcf-samples-20-arrays"

# unfiltered regions file and the filtered copy passed to the export
bedfile_raw="libtiledbvcf/test/inputs/E001_15_coreMarks_dense.bed"
bedfile="$output_dir/E001_15_coreMarks_dense_filtered.bed"

# comma-separated sample names passed to the export
export_samples="v2-DjrIAzkP,v2-YMaDHIoW,v2-usVwJUmo,v2-ZVudhauk"
# setup directories
#####################
# -p creates missing parents and succeeds when a directory already exists;
# -- keeps a path that starts with "-" from being read as an option.
mkdir -p -- "$export_dir" "$log_dir" "$tmpdir"
# filter bedfile
######################
# Write to stdout, tab-separated, the rows of the BED file $1 whose 4th
# column is "7_Enh" and whose 1st column compares <= 3, with "chr" prepended
# to the 1st column.
# NOTE(review): awk compares a non-numeric 1st column (e.g. "X") against 3 as
# a string; confirm that suits the contig names present in the input.
filter_bedfile() {
  awk -F'\t' '
    BEGIN { OFS = "\t" }
    $1 <= 3 && $4 == "7_Enh" { $1 = "chr" $1; print }
  ' "$1"
}

filter_bedfile "$bedfile_raw" > "$bedfile"
echo "Bedfile:"
wc -l -- "$bedfile"
# link version specific binary/lib
#####################################
# Make dist a symbolic link to dist_<version>, where <version> is $1.
#   - dist is already a symlink: remove it and re-create it.
#   - dist is a real directory: rename it to dist_<version> first.
# Returns non-zero if dist is a real directory but dist_<version> already
# exists, or if the link cannot be created.
link_versioned_dist() {
  local target="dist_$1"

  if [[ -L "dist" ]]; then
    # Check for a symlink first: -d follows symlinks, so a link left by a
    # previous run would otherwise be treated as a build directory and moved
    # (renamed to, or nested inside, dist_<version>) instead of replaced.
    echo "Removing dist/ symbolic link"
    rm "dist"
  elif [[ -d "dist" ]]; then
    # rename dist to include version
    if [[ -e "$target" ]]; then
      # mv would place dist/ inside the existing directory
      printf 'error: %s already exists; not moving dist/ into it\n' "$target" >&2
      return 1
    fi
    echo "Moving dist/ directory"
    mv -v dist "$target"
  fi

  ln -s "$target" dist
}

link_versioned_dist "$version" || exit 1
ls -al dist

# show the full version banner, then keep the 3rd space-separated field of
# its first line for use in log and export file names
"$tilevcf" version
commit=$("$tilevcf" version | head -n1 | cut -d' ' -f3)
echo "$commit"
# set array destination
#########################
# uri     - where the array for this version is created
# bcf_dir - where the input .bcf files are listed from
# Any dest value other than "remote" selects the local paths.
case "$dest" in
  remote)
    uri="$array_bucket/$version/vcf-samples-20"
    bcf_dir="${array_bucket}/bcfs"
    ;;
  *)
    uri="${array_filepath}/${version}/vcf-samples-20"
    bcf_dir="/mnt/data/genomic-datasets/vcf-samples-20"
    ;;
esac
printf "Array URI is: %s\n" "$uri"
# setup
##########
# Empty the scratch directory, delete any array already at $uri, and write
# the list of input .bcf files (one per line) to $samplefile.

# create samples files
# ${tmpdir:?} is a guard so this can never expand to "/*"
if [[ -d "$tmpdir" ]]; then rm -rf -- "${tmpdir:?}"/*; fi

if [[ "$uri" == "s3://"* ]]; then
  echo "Deleting existing remote array"
  aws s3 rm --recursive "$uri"
  # The trailing slash makes `aws s3 ls` list the objects under the prefix
  # rather than the prefix entry itself. Column 4 of each listing row is the
  # object name; prepend the prefix to rebuild the full URI.
  aws s3 ls "$bcf_dir/" \
    | awk -v prefix="$bcf_dir/" '{ if ($4 ~ /bcf$/) print prefix $4 }' \
      > "$samplefile"
else
  echo "Deleting existing local array"
  mkdir -p -- "$(dirname -- "$uri")"
  if [[ -d "$uri" ]]; then rm -rf -- "$uri"; fi
  ls -- "$bcf_dir"/*.bcf > "$samplefile"
fi

echo "Found the following samples:"
cat -- "$samplefile"
# ingest data
###############
# Create the array, register the samples, then store them. The store output
# (stdout and stderr) is shown and also saved to a per-version/commit log.
"$tilevcf" create -u"$uri" -e2
"$tilevcf" register -u"$uri" -f "$samplefile" -d "$tmpdir" -s 500
"$tilevcf" store -u"$uri" \
  -d "$tmpdir" -s 6000 \
  --verbose \
  -f "$samplefile" 2>&1 | tee "$log_dir/ingest_${version}_${commit}.log"
# The pipeline's own status is tee's, so read the store status explicitly
# and stop rather than export from a failed ingest.
store_status=${PIPESTATUS[0]}
if (( store_status != 0 )); then
  printf 'error: tiledbvcf store failed (exit %d)\n' "$store_status" >&2
  exit "$store_status"
fi
# perform export
#################
# Export the selected samples over the filtered bed regions into
# $export_dir; output (stdout and stderr) is shown and also saved to a
# per-version/commit log.
"$tilevcf" export \
  --uri "$uri" \
  --mem-budget-mb 512 \
  -Ot -tCHR,POS,REF,ALT,S:GT \
  -s "$export_samples" \
  -R "$bedfile" \
  -d "$export_dir" \
  -o "export-${version}_${commit}.tsv" \
  --verbose 2>&1 | tee "$log_dir/export_${version}_${commit}.log"
# The pipeline's own status is tee's, so read the export status explicitly.
export_status=${PIPESTATUS[0]}
if (( export_status != 0 )); then
  printf 'error: tiledbvcf export failed (exit %d)\n' "$export_status" >&2
  exit "$export_status"
fi
# check exports
##################
exportfile="$export_dir/export-${version}_${commit}.tsv"
# show the first records, then the number of records per SAMPLE value
mlr --icsv --ifs tab head "$exportfile"
mlr --icsv --ifs tab count -g SAMPLE "$exportfile"

# compress
##################
# -f replaces a .gz left by an earlier run of the same version/commit, which
# gzip would otherwise refuse to overwrite
gzip -f -- "$exportfile"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment