# Merging and Summarizing FDA Samples
# Gist: mfcovington/ece00080acd1af8b17f47fa5d82b2b41
# Last active: August 13, 2016 18:15
################################
# Auto-merge by sample and end #
################################
# For every sample number (1-144) and read end (R1/R2), concatenate all raw
# gzipped FASTQ files whose names contain "fda_<N>_" or "fda_aj_<N>_" into
# merged/fda_<NNN>.<END>.fq.gz. The files are concatenated as-is (no
# decompression); concatenated gzip members still form a valid gzip stream.
#
# BASE_DIR may be overridden from the environment; the default is the
# original hard-coded location.
BASE_DIR=${BASE_DIR:-/Volumes/seagate/giriget}

# Fail once, clearly, instead of emitting a redirect error per sample.
if [[ ! -d "$BASE_DIR/merged" ]]; then
  printf 'error: output directory not found: %s\n' "$BASE_DIR/merged" >&2
  exit 1
fi

for SAMPLE in {1..144}; do
  for END in R1 R2; do
    # Gather files to process. Both patterns are expanded by the shell and
    # then sorted as a single list, which is the order `ls` used to return.
    FILES=()
    while IFS= read -r FILE; do
      FILES+=("$FILE")
    done < <(
      for CANDIDATE in \
        "$BASE_DIR"/raw/*/*fda_"${SAMPLE}"_*_"${END}"_*q.gz \
        "$BASE_DIR"/raw/*/*fda_aj_"${SAMPLE}"_*_"${END}"_*q.gz; do
        # An unmatched glob stays literal; keep only paths that exist.
        if [[ -e "$CANDIDATE" ]]; then
          printf '%s\n' "$CANDIDATE"
        fi
      done | sort
    )

    # Skip non-existent samples
    if (( ${#FILES[@]} == 0 )); then
      printf '%s\n' "$SAMPLE-$END" >> "$BASE_DIR/skipped"
      continue
    fi

    # Log files to be merged: one header line, then one path per line
    {
      printf '%s\n' "$SAMPLE-$END"
      printf '%s\n' "${FILES[@]}"
    } >> "$BASE_DIR/log"

    # Merge FASTQ files by sample and end (sample number zero-padded to 3)
    printf -v SAMPLE_PADDED '%03d' "$SAMPLE"
    cat -- "${FILES[@]}" > "$BASE_DIR/merged/fda_${SAMPLE_PADDED}.${END}.fq.gz"
  done
done
#####################################################
# Manually merge Sample 78 due to inconsistent name #
#####################################################
# The raw file is labelled "78dil", which the "fda_aj_<N>_" auto-merge
# pattern does not match, so it is copied into place by hand. There is a
# single source file per read end, hence a copy rather than a concatenation.
for END in R1 R2; do
  FILE="$BASE_DIR/raw/mix-06/Mix_6_fda_aj_78dil_cirna_S9_${END}_001.fastq.gz"

  # Log the manual merge: header line, then the source path
  {
    printf '%s\t%s\n' "78dil-$END" "Manual Merge Due to Inconsistent Name"
    printf '%s\n' "$FILE"
  } >> "$BASE_DIR/log"

  cp -- "$FILE" "$BASE_DIR/merged/fda_078.${END}.fq.gz"
done
#########################################
# Manually merge samples named "GTCCGC" #
#########################################
# These files carry only the "GTCCGC" label (no fda sample number), so the
# two lanes (L001, L002) are concatenated by hand for each read end.
for END in R1 R2; do
  FILE1="$BASE_DIR/raw/mix-09/Mix9_GTCCGC_S14_L001_${END}_001.fastq.gz"
  FILE2="$BASE_DIR/raw/mix-09/Mix9_GTCCGC_S14_L002_${END}_001.fastq.gz"

  # Log under the GTCCGC label (the entry was previously mislabelled "78dil",
  # duplicating the Sample 78 log entries).
  {
    printf '%s\t%s\n' "GTCCGC-$END" "Manual Merge Due to Inconsistent Name"
    printf '%s\n' "$FILE1" "$FILE2"
  } >> "$BASE_DIR/log"

  # Overwrite rather than append, so re-running the script does not
  # duplicate reads in the merged file (matches every other merge here).
  cat -- "$FILE1" "$FILE2" > "$BASE_DIR/merged/GTCCGC.${END}.fq.gz"
done
#########################################################
# Manually merge Samples 2 & 6 due to ambiguous pooling #
#########################################################
# Set the auto-merged outputs for samples 2 and 6 aside before rebuilding
# them. -p keeps a re-run from failing when merged.bad already exists.
mkdir -p "$BASE_DIR/merged.bad"
mv -- "$BASE_DIR"/merged/fda_002.* "$BASE_DIR/merged.bad/"
mv -- "$BASE_DIR"/merged/fda_006.* "$BASE_DIR/merged.bad/"

# Gather files to process.
# Each list is a newline-separated string of raw files that name one sample
# but not the other (files naming both samples are excluded from both lists).
FILE_LIST_002=$(
  for CANDIDATE in "$BASE_DIR"/raw/*/*fda_aj_2_*; do
    # An unmatched glob stays literal; keep only paths that exist.
    if [[ -e "$CANDIDATE" ]]; then
      printf '%s\n' "$CANDIDATE"
    fi
  done | grep -v fda_aj_6_
)
FILE_LIST_006=$(
  for CANDIDATE in "$BASE_DIR"/raw/*/*fda_aj_6_*; do
    if [[ -e "$CANDIDATE" ]]; then
      printf '%s\n' "$CANDIDATE"
    fi
  done | grep -v fda_aj_2_
)
#######################################
# Log and concatenate the files named in FILE_LIST into one merged FASTQ.
# Globals:
#   BASE_DIR       (read) directory holding "log" and "merged/"
#   FILE_LIST      (read) whitespace-separated paths of the files to merge
#   SAMPLE         (read) sample label used in the log entry
#   SAMPLE_PADDED  (read) sample label used in the output file name
#   END            (read) read-end label (e.g. R1 / R2)
# Outputs:
#   Appends a header line plus one line per file to $BASE_DIR/log and
#   overwrites $BASE_DIR/merged/fda_<SAMPLE_PADDED>.<END>.fq.gz.
# Returns:
#   1 (nothing logged or written) when FILE_LIST names no files;
#   otherwise the exit status of cat.
#######################################
log_and_merge() {
  local -a files
  # FILE_LIST is a whitespace-separated list by contract, so it is split
  # on purpose here.
  # shellcheck disable=SC2206
  files=($FILE_LIST)

  # With no operands cat would read stdin: it would hang on a terminal or
  # silently write an empty "merged" file. Refuse instead.
  if (( ${#files[@]} == 0 )); then
    printf 'log_and_merge: no files to merge for %s-%s\n' "$SAMPLE" "$END" >&2
    return 1
  fi

  # Log files to be merged
  {
    printf '%s\t%s\n' "$SAMPLE-$END" "Manual Merge Due to Ambiguous Pooling"
    printf '%s\n' "${files[@]}"
  } >> "$BASE_DIR/log"

  # Merge FASTQ files by sample and end
  cat -- "${files[@]}" > "$BASE_DIR/merged/fda_${SAMPLE_PADDED}.${END}.fq.gz"
}
# Samples 2 and 6
# For each sample, narrow its pre-gathered list (FILE_LIST_002 /
# FILE_LIST_006, newline-separated paths) to the lines containing the read
# end, then hand the result to log_and_merge through the globals it reads.
# Processing order is unchanged: sample 2 (R1, R2), then sample 6 (R1, R2).
for SAMPLE in 2 6; do
  printf -v SAMPLE_PADDED '%03d' "$SAMPLE"
  # Name of the variable holding this sample's list, read via ${!name}.
  SOURCE_LIST_VAR="FILE_LIST_${SAMPLE_PADDED}"
  for END in R1 R2; do
    FILE_LIST=$(grep -F -- "$END" <<< "${!SOURCE_LIST_VAR}")
    log_and_merge
  done
done
#####################################################
# Manually merge Sample 72 due to ambiguous pooling #
#####################################################
# Set the auto-merged sample 72 outputs aside before rebuilding them.
mkdir -p "$BASE_DIR/merged.bad"
mv -- "$BASE_DIR"/merged/fda_072.* "$BASE_DIR/merged.bad/"

# Gather files to process.
# All raw files with "_72_" in the name are split into two newline-separated
# lists: names containing "index11" and all remaining names.
FILE_LIST_72_ALL=$(
  for CANDIDATE in "$BASE_DIR"/raw/*/*_72_*; do
    # An unmatched glob stays literal; keep only paths that exist.
    if [[ -e "$CANDIDATE" ]]; then
      printf '%s\n' "$CANDIDATE"
    fi
  done
)
FILE_LIST_72_6=$(grep -v index11 <<< "$FILE_LIST_72_ALL")
FILE_LIST_72_11=$(grep index11 <<< "$FILE_LIST_72_ALL")
#######################################
# Log and concatenate the files named in FILE_LIST into one merged FASTQ
# whose name also carries an index label.
# Globals:
#   BASE_DIR       (read) directory holding "log" and "merged/"
#   FILE_LIST      (read) whitespace-separated paths of the files to merge
#   SAMPLE         (read) sample label used in the log entry
#   SAMPLE_PADDED  (read) sample label used in the output file name
#   INDEX          (read) index label used in the log entry and file name
#   END            (read) read-end label (e.g. R1 / R2)
# Outputs:
#   Appends a header line plus one line per file to $BASE_DIR/log and
#   overwrites
#   $BASE_DIR/merged/fda_<SAMPLE_PADDED>.INDEX_<INDEX>.<END>.fq.gz.
# Returns:
#   1 (nothing logged or written) when FILE_LIST names no files;
#   otherwise the exit status of cat.
#######################################
log_and_merge_with_index() {
  local -a files
  # FILE_LIST is a whitespace-separated list by contract, so it is split
  # on purpose here.
  # shellcheck disable=SC2206
  files=($FILE_LIST)

  # With no operands cat would read stdin: it would hang on a terminal or
  # silently write an empty "merged" file. Refuse instead.
  if (( ${#files[@]} == 0 )); then
    printf 'log_and_merge_with_index: no files to merge for %s-INDEX_%s-%s\n' \
      "$SAMPLE" "$INDEX" "$END" >&2
    return 1
  fi

  # Log files to be merged
  {
    printf '%s\t%s\n' "$SAMPLE-INDEX_$INDEX-$END" \
      "Manual Merge Due to Ambiguous Pooling"
    printf '%s\n' "${files[@]}"
  } >> "$BASE_DIR/log"

  # Merge FASTQ files by sample, index and end
  cat -- "${files[@]}" \
    > "$BASE_DIR/merged/fda_${SAMPLE_PADDED}.INDEX_${INDEX}.${END}.fq.gz"
}
# Sample 72, Index 6 then Index 11
# For each index, narrow its pre-gathered list (FILE_LIST_72_6 /
# FILE_LIST_72_11, newline-separated paths) to the lines containing the read
# end, then hand the result to log_and_merge_with_index through the globals
# it reads. Processing order is unchanged: index 6 (R1, R2), then index 11.
SAMPLE=72
printf -v SAMPLE_PADDED '%03d' "$SAMPLE"
for INDEX in 6 11; do
  # Name of the variable holding this index's list, read via ${!name}.
  SOURCE_LIST_VAR="FILE_LIST_72_${INDEX}"
  for END in R1 R2; do
    FILE_LIST=$(grep -F -- "$END" <<< "${!SOURCE_LIST_VAR}")
    log_and_merge_with_index
  done
done
####################################################
# Log line counts and read counts for each FQ file #
####################################################
# Writes a tab-separated table (FILE, LINES, READS) to $BASE_DIR/counts,
# one row per merged *.fq.gz file; READS is LINES / 4. A row whose line
# count is not a multiple of 4 gets a fourth column flagging truncation.

# Confirm correct number of files
# shellcheck disable=SC2012  # only the entry count is used, names are not parsed
ls "$BASE_DIR/merged" | wc -l
# 242   (count noted in the original script - presumably the expected output)

# Stop here if the directory is missing; otherwise the loop below would
# scan whatever the current directory happens to be.
cd "$BASE_DIR/merged" || {
  printf 'error: cannot cd to %s\n' "$BASE_DIR/merged" >&2
  exit 1
}

printf 'FILE\tLINES\tREADS\n' > "$BASE_DIR/counts"
for FQ in *.fq.gz; do
  # With no matches the glob stays literal; skip it.
  [[ -e "$FQ" ]] || continue

  # LINES is a variable bash itself maintains (terminal height), so the
  # count is kept in LINE_COUNT. The arithmetic expansion also strips the
  # leading padding that some wc implementations print.
  LINE_COUNT=$(( $(gunzip -c -- "$FQ" | wc -l) ))
  READS=$(( LINE_COUNT / 4 ))

  # Warn if file is not a multiple of 4 lines
  if (( LINE_COUNT % 4 != 0 )); then
    printf '%s\t%s\t%s\t%s\n' "$FQ" "$LINE_COUNT" "$READS" \
      "ERROR: Truncated File Detected"
  else
    printf '%s\t%s\t%s\n' "$FQ" "$LINE_COUNT" "$READS"
  fi >> "$BASE_DIR/counts"
done
# End of script. (The GitHub page footer that followed - a sign-up / sign-in prompt - is not part of the script.)