Skip to content

Instantly share code, notes, and snippets.

@alexpreynolds
Created September 30, 2021 22:27
Show Gist options
  • Save alexpreynolds/d58df3ee1c065c89e8176f45e7a6a903 to your computer and use it in GitHub Desktop.
Save alexpreynolds/d58df3ee1c065c89e8176f45e7a6a903 to your computer and use it in GitHub Desktop.
Draw N uniform random sample intervals of length L from assembly hg38, taking into account chromosome lengths
#!/bin/bash
# set -xe
set -e
declare -r samples_required=${1}
declare -r interval_length=${2}
#
# [hg38]
# fetchChromSizes hg38 | grep -v '.*_.*' | sort -k1,1 | grep -v 'chrM' | cut -f2 | tr '\n' ' '
#
declare -a chroms=(chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY)
declare -a sizes=(248956422 133797422 135086622 133275309 114364328 107043718 101991189 90338345 83257441 80373285 58617616 242193529 64444167 46709983 50818468 198295559 190214555 181538259 170805979 159345973 145138636 138394717 156040895 57227415)
declare -a cumulative_sizes=()
declare -i accumulated_size=0
for size in ${sizes[@]}; do
cumulative_sizes+=(${accumulated_size})
let accumulated_size+=${size}
done
if [[ ${#chroms[@]} -ne ${#sizes[@]} ]] || [[ ${#chroms[@]} -ne ${#cumulative_sizes[@]} ]] || [[ ${#sizes[@]} -ne ${#cumulative_sizes[@]} ]]
then
echo "Error: Chromosome, sizes, and cumulative sizes arrays are of unequal length"
exit -1
fi
#
# ref. https://gist.github.com/lovasoa/55550cb561110300ad97398d2cf93214
#
function dichotomic_search {
min=$1
max=$2
command=$3
while [[ $min < $max ]]
do
current=$(( (min + max + 1 ) / 2 ))
if $command $current
then min=$current
else max=$(( current - 1 ))
fi
done
echo $min
}
function is_smaller {
element=${cumulative_sizes[$2]}
if [[ ${element} -ge ${1} ]]
then false
else true
fi
}
declare -r highest_index=`expr ${#cumulative_sizes[@]} - 1`
declare -i samples_drawn=0
while [[ ${samples_drawn} -lt ${samples_required} ]]
do
declare -i absolute_position=${accumulated_size}
while [[ ${absolute_position} -ge ${accumulated_size} ]] || [[ ${absolute_position} -lt 0 ]]
do
(( absolute_position=(RANDOM<<15|RANDOM)<<15|RANDOM ))
done
query_index=$(dichotomic_search 0 $highest_index "is_smaller $absolute_position")
start=$(( ${absolute_position} - ${cumulative_sizes[$query_index]} ))
stop=$(( ${start} + ${interval_length} ))
if [[ ${stop} -lt ${sizes[query_index]} ]]
then
echo -e "${chroms[$query_index]}\t${start}\t${stop}"
let samples_drawn+=1
fi
done
@alexpreynolds
Copy link
Author

alexpreynolds commented Sep 30, 2021

% ./sample_uniform_hg38_intervals.sh 10 10000 | sort-bed -
chr1	37174603	37184603
chr10	14542026	14552026
chr11	32419542	32429542
chr12	64434681	64444681
chr21	8019910	8029910
chr5	174494714	174504714
chr6	34761384	34771384
chr7	3579514	3589514
chrX	26732001	26742001
chrY	46893746	46903746

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment