Last active
January 7, 2016 23:51
-
-
Save mojaveazure/7336b7ff5d8b150490ad to your computer and use it in GitHub Desktop.
Extract regions from a BED file and format it for use with ANGSD and ANGSD-wrapper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
set -e | |
set -o pipefail | |
# Extract regions from a BED file and format it for use with ANGSD and ANGSD-wrapper | |
function Usage() { | |
echo -e "\ | |
Usage: $(basename $0) BED_FILE NUMBER [OUTFILE] [SEED] \n\ | |
Where: BED_FILE is a BED file to extract regions from \n\ | |
NUMBER is the number of regions to extract \n\ | |
OUTFILE is an optional name for the output file \n\ | |
(defaults to $(pwd -P)/RegionsFromBed.txt) \n\ | |
SEED is an optional seed value, otherwise generated randomly \n\ | |
" >&2 | |
exit 1 | |
} | |
# Check to see if we have enough arguments | |
if [[ "$#" -lt 2 ]]; then Usage; fi | |
# Assign our arguments | |
BED_FILE=$1 # What is our BED file? | |
NUMBER=$2 # How many regions are we extracting? | |
OUTFILE=${3:-$(pwd -P)/RegionsFromBed.txt} # What is our output file? | |
SEED=${4:-${RANDOM}} # What is our seed? | |
# Some basic checks | |
if [[ -f "${BED_FILE}" ]]; then LINES=$(wc -l "${BED_FILE}"); else echo "Failed to find ${BED_FILE}, exiting" >&2; exit 1; fi # Make sure the BED file exists. If so, find out how many lines there are. If not, exit with error | |
if [[ "${NUMBER}" > "${LINES}" ]]; then echo "${NUMBER} is greater than ${LINES} in ${BED_FILE}, exiting..." >&2; exit 1; fi # Make sure we're not extracting more regions than listed in the BED_FILE | |
if [[ -f "${OUTFILE}" ]]; then echo "${OUTFILE} exists, will overwrite in five seconds..." >&2; sleep 5; rm -f "${OUTFILE}"; fi # If a file of the same name as our output file, warn of overwrite | |
declare -a COLLECTED=() | |
COUNTER=0 | |
# Start the extraction process | |
for iteration in $(seq "${NUMBER}") # For the current iteration | |
do | |
PREVIOUS=true | |
while PREVIOUS == true | |
do | |
ITER="$((${RANDOM}%${LINES}))" | |
if [[ "${COLLECTED[@]}" =~ "${ITER}" ]] | |
then | |
PREVIOUS=true | |
else | |
COLLECTED["${COUNTER}"]="${ITER}" | |
let "COUNTER += 1" | |
PREVIOUS=false | |
fi | |
done | |
INFO="$(head -"${ITER}" "${BED_FILE}" | tail -1)" | |
CONTIG="$(cut -f 1 -d '\t' ${INFO})" | |
START="$(cut -f 2 -d '\t' ${INFO})" | |
END="$(cut -f 3 -d '\t' ${INFO})" | |
echo "${CONTIG}:${START}-${END}" >> "${OUTFILE}" | |
done | |
echo -e "SEED:\t${SEED}" > $(dirname "${OUTFILE}")/seed |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment