Created
September 30, 2021 22:27
-
-
Save alexpreynolds/d58df3ee1c065c89e8176f45e7a6a903 to your computer and use it in GitHub Desktop.
Draw N uniform random sample intervals of length L from assembly hg38, taking into account chromosome lengths
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# set -xe | |
set -e | |
declare -r samples_required=${1} | |
declare -r interval_length=${2} | |
# | |
# [hg38] | |
# fetchChromSizes hg38 | grep -v '.*_.*' | sort -k1,1 | grep -v 'chrM' | cut -f2 | tr '\n' ' ' | |
# | |
declare -a chroms=(chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY) | |
declare -a sizes=(248956422 133797422 135086622 133275309 114364328 107043718 101991189 90338345 83257441 80373285 58617616 242193529 64444167 46709983 50818468 198295559 190214555 181538259 170805979 159345973 145138636 138394717 156040895 57227415) | |
declare -a cumulative_sizes=() | |
declare -i accumulated_size=0 | |
for size in ${sizes[@]}; do | |
cumulative_sizes+=(${accumulated_size}) | |
let accumulated_size+=${size} | |
done | |
if [[ ${#chroms[@]} -ne ${#sizes[@]} ]] || [[ ${#chroms[@]} -ne ${#cumulative_sizes[@]} ]] || [[ ${#sizes[@]} -ne ${#cumulative_sizes[@]} ]] | |
then | |
echo "Error: Chromosome, sizes, and cumulative sizes arrays are of unequal length" | |
exit -1 | |
fi | |
# | |
# ref. https://gist.github.com/lovasoa/55550cb561110300ad97398d2cf93214 | |
# | |
function dichotomic_search { | |
min=$1 | |
max=$2 | |
command=$3 | |
while [[ $min < $max ]] | |
do | |
current=$(( (min + max + 1 ) / 2 )) | |
if $command $current | |
then min=$current | |
else max=$(( current - 1 )) | |
fi | |
done | |
echo $min | |
} | |
function is_smaller { | |
element=${cumulative_sizes[$2]} | |
if [[ ${element} -ge ${1} ]] | |
then false | |
else true | |
fi | |
} | |
declare -r highest_index=`expr ${#cumulative_sizes[@]} - 1` | |
declare -i samples_drawn=0 | |
while [[ ${samples_drawn} -lt ${samples_required} ]] | |
do | |
declare -i absolute_position=${accumulated_size} | |
while [[ ${absolute_position} -ge ${accumulated_size} ]] || [[ ${absolute_position} -lt 0 ]] | |
do | |
(( absolute_position=(RANDOM<<15|RANDOM)<<15|RANDOM )) | |
done | |
query_index=$(dichotomic_search 0 $highest_index "is_smaller $absolute_position") | |
start=$(( ${absolute_position} - ${cumulative_sizes[$query_index]} )) | |
stop=$(( ${start} + ${interval_length} )) | |
if [[ ${stop} -lt ${sizes[query_index]} ]] | |
then | |
echo -e "${chroms[$query_index]}\t${start}\t${stop}" | |
let samples_drawn+=1 | |
fi | |
done |
Author
alexpreynolds
commented
Sep 30, 2021
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment