Skip to content

Instantly share code, notes, and snippets.

@mojaveazure
Created September 26, 2015 22:55
Show Gist options
  • Save mojaveazure/87bcffd3a1e98ba7d7cf to your computer and use it in GitHub Desktop.
Save mojaveazure/87bcffd3a1e98ba7d7cf to your computer and use it in GitHub Desktop.
Create an adapter sequence file from the barcodes in sample names
#!/bin/bash
# Stop at the first failing command, and count a pipeline as failed when any stage of it fails
set -eo pipefail
###########################################################################################################################
######################################### Usage Information #########################################
# This is a script to extract adapter sequences that use barcodes embedded in the names of files
# To use, add a list of samples to the SAMPLE_INFO field on line 34
# This should look like:
# SAMPLE_INFO=${HOME}/directory/sample_list.txt
# Set the smallest size barcode in the MIN_BAR field on line 37
# This should look like:
# MIN_BAR=6
# Define an outfile in the OUT field on line 41
# This should look like:
# OUT=${HOME}/directory/out_adapters.txt
# This defaults to a file in the current working directory called extracted_adapters with the date and time attached
# If there's a sequence before the barcode in the adapter sequence, type it into the PRECEDING_SEQ field on line 45
# This should look like:
# PRECEDING_SEQ="ACTG"
# If there's a sequence trailing the barcode in the adapter sequence, type it into the FOLLOWING_SEQ field on line 49
# This should look like:
# FOLLOWING_SEQ="ACTG"
# To set a base name for the adapter sequences, type it into the HEADER field on line 53
# This should look like:
# HEADER=Adapter
# Each adapter sequence found will be called ${HEADER}N, where N counts up from 0 to one less than the number of unique sequences
# To run, type the following command:
# bash barcodeToAdapter.sh
###########################################################################################################################
# Path to the sample list: a text file listing the sample file names to extract barcodes from (required)
SAMPLE_INFO=
# Smallest barcode size: only a run of at least this many A/C/G/T characters in a file name counts as a barcode (required)
MIN_BAR=2
# Output file that the adapter sequences are appended to
# This is optional: when left empty, a timestamped extracted_adapters_*.txt file in the current working directory is used
OUT=
# Set a sequence that comes immediately before the barcode in the adapter sequence
# This is optional: when left empty, nothing is placed before the barcode
PRECEDING_SEQ=
# Set a sequence that comes immediately after the barcode in the adapter sequence
# This is optional: when left empty, nothing is placed after the barcode
FOLLOWING_SEQ=
# Base name for the adapter sequences; each one is named with this text followed by its index
# This is optional: when left empty, "Adapter" is used
HEADER=
###########################################################################################################################
#################### Do the work here ####################
#################### Don't change anything here ####################
#################### Unless you know what you're doing ####################
###########################################################################################################################
#######################################
# Extract the barcode sequence from the name of a single file.
# Only the final path component is searched, so directory names never
# contribute a barcode.
# Arguments:
#   $1 - file name or path of the sample
#   $2 - smallest number of bases that counts as a barcode
# Outputs:
#   Writes the first run of at least $2 characters drawn from A, C, G, T
#   to stdout, followed by a newline. Writes an empty line when the name
#   holds no such run.
# Returns:
#   0 whether or not a barcode was found; non-zero only if basename fails.
#######################################
function getBarcode() {
  local name
  # '--' keeps a file name that starts with '-' from being read as an option
  name=$(basename -- "$1") || return
  local minbar="$2"
  # Kept local so repeated calls never leak or reuse state in the caller
  local regex="([ACGT]{${minbar},})"
  local barcode=""
  # Only read the capture group when the match actually succeeded
  if [[ $name =~ $regex ]]; then
    barcode="${BASH_REMATCH[1]}"
  fi
  printf '%s\n' "$barcode"
}
export -f getBarcode
# Make sure we have our sample list
if [[ -z "${SAMPLE_INFO}" ]]; then
  echo "Please hard-code a sample list!" >&2
  exit 1
fi
# Make sure the sample list can actually be read, so a wrong path fails early with a clear message
if [[ ! -r "${SAMPLE_INFO}" || -d "${SAMPLE_INFO}" ]]; then
  echo "Cannot read sample list: ${SAMPLE_INFO}" >&2
  exit 1
fi
# Make sure we have our smallest barcode size
if [[ -z "${MIN_BAR}" ]]; then
  echo "Please hard-code a minimum barcode size!" >&2
  exit 1
fi
# The size is used as a repetition count in a regular expression, so it has to be a positive whole number
if [[ ! "${MIN_BAR}" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "Minimum barcode size must be a positive whole number, got: ${MIN_BAR}" >&2
  exit 1
fi
# Set an array to hold extracted barcodes
declare -a all_barcodes=()
# Read the sample list one line at a time so that a path containing spaces or
# glob characters is treated as a single sample; the second test picks up a
# final line that has no trailing newline
while IFS= read -r sample_path || [[ -n "$sample_path" ]]; do
  # Ignore blank lines
  [[ "$sample_path" =~ [^[:space:]] ]] || continue
  code=$(getBarcode "$sample_path" "${MIN_BAR}") # Get the barcode sequence
  if [[ -z "$code" ]]; then
    # Nothing to store for this sample; say so instead of dropping it silently
    printf 'Warning: no barcode of at least %s bases found in %s\n' "${MIN_BAR}" "$sample_path" >&2
    continue
  fi
  all_barcodes+=("$code") # Add it to the barcode array
done < "${SAMPLE_INFO}"
# Create an array of unique barcode sequences, in sorted order
declare -a barcode_list=()
# printf with no arguments would still emit one empty line, so only sort when there is something to sort
if (( ${#all_barcodes[@]} > 0 )); then
  while IFS= read -r code; do
    barcode_list+=("$code")
  done < <(printf '%s\n' "${all_barcodes[@]}" | sort -u)
fi
# With no header configured, name the adapters "Adapter"
: "${HEADER:=Adapter}"
# With no output file configured, build a timestamped file name in the current working directory
if [[ "${OUT}" == "" ]]; then
  TIME=$(date +%m-%d-%y-%H.%M.%S) # Month-day-year-hour.minute.second of this run
  OUT="$(pwd)/extracted_adapters_${TIME}.txt"
fi
# With no barcodes there is nothing to write; stop here rather than announce an output file that was never created
if (( ${#barcode_list[@]} == 0 )); then
  echo "No barcodes were found, so no adapter file was written" >&2
  exit 1
fi
# Write each adapter sequence to the output file as a two-line record:
# a ">" line carrying the header plus the barcode's index, then the barcode
# wrapped in the preceding and following sequences. Records are appended, and
# printf '%s' writes the configured strings literally.
for (( count = 0; count < ${#barcode_list[@]}; count++ )); do
  printf '>%s%d\n%s%s%s\n' \
    "${HEADER}" "$count" "${PRECEDING_SEQ}" "${barcode_list[count]}" "${FOLLOWING_SEQ}"
done >> "${OUT}"
# Tell us where the output file is
echo "Adapter file can be found at ${OUT}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment