Created
September 26, 2015 22:55
-
-
Save mojaveazure/87bcffd3a1e98ba7d7cf to your computer and use it in GitHub Desktop.
Create an adapter sequence file from the barcodes in sample names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Abort on the first failing command, and make a pipeline fail when any
# stage of it fails (not only the last one).
set -eo pipefail
########################################################################################################################### | |
######################################### Usage Information ######################################### | |
# This is a script to extract adapter sequences that use barcodes embedded in the names of files | |
# To use, add a list of samples to the SAMPLE_INFO field on line 34 | |
# This should look like: | |
# SAMPLE_INFO=${HOME}/directory/sample_list.txt | |
# Set the smallest size barcode in the MIN_BAR field on line 37 | |
# This should look like:
# MIN_BAR=6 | |
# Define an outfile in the OUT field on line 41 | |
# This should look like: | |
# OUT=${HOME}/directory/out_adapters.txt | |
# This defaults to a file in the current working directory called extracted_adapters with the date and time attached | |
# If there's a sequence before the barcode in the adapter sequence, type it into the PRECEDING_SEQ field on line 45 | |
# This should look like: | |
# PRECEDING_SEQ="ACTG" | |
# If there's a sequence trailing the barcode in the adapter sequence, type it into the FOLLOWING_SEQ field on line 49 | |
# This should look like: | |
# FOLLOWING_SEQ="ACTG" | |
# To set a base name for the adapter sequences, type it into the HEADER field on line 53 | |
# This should look like: | |
# HEADER=Adapter | |
# Each adapter sequence found will be called ${HEADER}(sequential numbers from 0 to the number of sequences) | |
# To run, type the following command: | |
# bash barcodeToAdapter.sh | |
########################################################################################################################### | |
# Path to the sample list: a text file naming the sample files whose
# names carry the barcodes. Required; the script refuses to run while
# this is empty.
SAMPLE_INFO=

# Length of the shortest barcode to look for. Required.
MIN_BAR=2

# Output file for the adapter sequences.
# Optional: when empty, a timestamped file in the current working
# directory is used.
OUT=

# Sequence that comes immediately before the barcode in the adapter sequence.
# Optional.
PRECEDING_SEQ=

# Sequence that comes immediately after the barcode in the adapter sequence.
# Optional.
FOLLOWING_SEQ=

# Base name for each adapter record; a running number is appended to it.
# Optional: when empty, "Adapter" is used.
HEADER=
########################################################################################################################### | |
#################### Do the work here #################### | |
#################### Don't change anything here #################### | |
#################### Unless you know what you're doing #################### | |
########################################################################################################################### | |
#######################################
# Extract the barcode sequence embedded in the name of a single file.
#
# Only the basename of the path is searched, so directory names never
# contribute. The barcode is the leftmost run of at least <minbar>
# consecutive characters from the set A, C, G, T (uppercase only); the
# whole run is returned, not just its first <minbar> characters.
#
# Arguments:
#   $1 - file name or path to inspect
#   $2 - minimum barcode length
# Outputs:
#   The barcode on stdout, or an empty line when the name holds none.
# Returns:
#   0 whether or not a barcode was found, so the function is safe to call
#   directly from a script running under `set -e`.
#######################################
function getBarcode() {
    # Declaration is kept apart from the assignment so that `local` does
    # not hide a failure of basename.
    local name
    name=$(basename -- "$1") || return
    local minbar="$2"
    local regex="([ACGT]{${minbar},})"
    local barcode=''
    # The match is tested inside `if` so that a non-matching name yields
    # an empty result instead of tripping errexit.
    if [[ $name =~ $regex ]]; then
        barcode="${BASH_REMATCH[1]}"
    fi
    printf '%s\n' "$barcode"
}
export -f getBarcode
# Validate the hard-coded settings before doing any work.
# Diagnostics go to stderr so they never mix with regular output.

# Make sure we have our sample list
if [[ -z "${SAMPLE_INFO}" ]]; then
    echo "Please hard-code a sample list!" >&2
    exit 1
fi
# Make sure the sample list can actually be read; without this check a
# wrong path would go unnoticed and the run would produce no adapters.
if [[ ! -r "${SAMPLE_INFO}" || -d "${SAMPLE_INFO}" ]]; then
    echo "Cannot read sample list: ${SAMPLE_INFO}" >&2
    exit 1
fi
# Make sure we have our smallest barcode size
if [[ -z "${MIN_BAR}" ]]; then
    echo "Please hard-code a minimum barcode size!" >&2
    exit 1
fi
# The size is interpolated into a regular-expression repeat count, so it
# has to be a positive whole number.
if [[ ! "${MIN_BAR}" =~ ^[1-9][0-9]*$ ]]; then
    echo "Minimum barcode size must be a positive whole number, got: ${MIN_BAR}" >&2
    exit 1
fi
# Collect the barcode found in every sample name.
declare -a all_barcodes=()
# The list is read one line per sample, without word splitting or glob
# expansion, so a path containing spaces or wildcard characters is handled
# as a single name. The `|| [[ -n ... ]]` part picks up a final line that
# lacks a trailing newline. An unreadable list makes the redirection fail,
# which stops the script under `set -e`.
while IFS= read -r file || [[ -n "$file" ]]; do
    [[ -n "$file" ]] || continue                # Skip blank lines
    code=$(getBarcode "$file" "${MIN_BAR}")     # Get the barcode sequence
    if [[ -n "$code" ]]; then                   # Names without a barcode add nothing
        all_barcodes+=("$code")
    fi
done < "${SAMPLE_INFO}"

# Reduce the collected barcodes to a sorted array of unique sequences.
declare -a barcode_list=()
# Guarded because printf with an empty argument list would still emit one
# empty line, which would become a bogus empty barcode.
if (( ${#all_barcodes[@]} > 0 )); then
    while IFS= read -r code; do
        barcode_list+=("$code")
    done < <(printf '%s\n' "${all_barcodes[@]}" | sort -u)
fi
# Fall back to "Adapter" when no base name for the records was configured.
HEADER="${HEADER:-Adapter}"

# Fall back to a timestamped file in the current working directory when no
# output file was configured.
if [[ -z "${OUT}" ]]; then
    TIME=$(date +%m-%d-%y-%H.%M.%S)                # Month-day-year-hour.minute.second
    OUT="$(pwd)/extracted_adapters_${TIME}.txt"    # Timestamped name for the output file
fi
# Write each adapter sequence to the output file as a two-line record:
#   >${HEADER}<index>
#   ${PRECEDING_SEQ}<barcode>${FOLLOWING_SEQ}
# Records are appended, so an existing output file is extended rather than
# overwritten.
if (( ${#barcode_list[@]} == 0 )); then
    # Nothing to write: say so instead of pointing at a file that was
    # never created.
    echo "No barcodes were found; no adapter file was written" >&2
else
    # A counting loop replaces `seq 0 N-1`, which counts downwards on
    # BSD/macOS when N is 0. printf with %s writes the configured
    # sequences verbatim, without interpreting backslashes in them.
    for (( count = 0; count < ${#barcode_list[@]}; count += 1 )); do
        printf '>%s%s\n%s%s%s\n' \
            "${HEADER}" "${count}" \
            "${PRECEDING_SEQ}" "${barcode_list[$count]}" "${FOLLOWING_SEQ}" \
            >> "${OUT}"
    done
    # Tell us where the output file is
    echo "Adapter file can be found at ${OUT}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment