Last active
October 2, 2017 16:05
-
-
Save genomewalker/cb741d00497efab510a5eb7124ff0667 to your computer and use it in GitHub Desktop.
Scripts for analysing original alma data set
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# UCLUST workflow --------------------------------------------------------- | |
library(tidyverse) | |
library(cowplot) | |
library(ggpubr) | |
setwd("~/Downloads/alma_original_test") | |
uclust_df <- read_tsv("16s.insilico_mock.tsv", col_names = T) | |
uclust_16S_summary <- uclust_df%>% dplyr::rename(otu_name = `#OTU ID`) %>% | |
gather(label, count, -otu_name) %>% | |
tbl_df %>% | |
group_by(label) %>% | |
mutate(rel_abun = count/sum(count)) %>% | |
arrange(otu_name, label) %>% | |
filter(count > 0) | |
uclust_16S_prevalence <- uclust_16S_summary %>% | |
dplyr::select(label,otu_name, count) %>% | |
group_by(otu_name) %>% | |
summarise(prev = sum(count >0), total_counts = sum(count)) | |
uclust_16S_abs_singletons <- uclust_16S_prevalence %>% dplyr::filter(total_counts <= 1, prev <= 1) | |
uclust_16S_abun_singletons <- uclust_16S_prevalence %>% dplyr::filter(total_counts > 1, prev <= 1) | |
uclust_16S_abs_singletons_names <- uclust_16S_abs_singletons$otu_name | |
uclust_16S_abun_doubletons <- uclust_16S_prevalence %>% filter(!(otu_name %in% uclust_16S_abs_singletons_names)) %>% dplyr::filter( prev <= 2) | |
# Remove absolute singletons | |
uclust_16S_prevalence <- uclust_16S_prevalence %>% filter(!(otu_name %in% uclust_16S_abs_singletons_names)) | |
uclust_16S_counts <- data.frame(class = c("Total ASVs","Absolute singletons", "Abundant singletons"), | |
counts = c(length(unique(uclust_16S_summary$otu_name)), | |
dim(uclust_16S_abs_singletons)[1], | |
dim(uclust_16S_abun_singletons)[1])) | |
print(uclust_16S_counts) | |
uclust_16S_counts$class <- factor(uclust_16S_counts$class, levels=c("Total ASVs","Absolute singletons", "Abundant singletons")) | |
ggplot(uclust_16S_counts, aes(class, counts)) + | |
geom_bar(stat = "identity") + theme_bw() + | |
xlab("") + ylab("# ASVs") | |
# VS ref ------------------------------------------------------------------ | |
# Get mock communities ---------------------------------------------------- | |
# EVEN ERR2098507 | |
# STAG ERR2098508 | |
uclust_mock <- uclust_16S_summary %>% | |
filter(otu_name %in% uclust_16S_prevalence$otu_name) %>% | |
filter(grepl("Even", label) | grepl("Staggered", label)) | |
uclust_mock_even <- uclust_mock %>% | |
filter(grepl("Even", label)) %>% | |
ungroup %>% | |
select(otu_name, count, label) | |
uclust_mock_stag <- uclust_mock %>% | |
filter(grepl("Staggered", label)) %>% | |
ungroup %>% | |
select(otu_name, count, label) | |
# Compare references | |
# usearch10.0.240_i86osx32 -usearch_global rep_otus.fasta \ | |
# -db references_even_mock_noprimers.fasta | |
# -strand both -id 0.5 -blast6out otu_rep_vs_even.tsv | |
alma_mock <- read_tsv("alma_mock_data.txt", col_names = TRUE) | |
uclust_even_vs_ref <- read_tsv("otu_rep_vs_even.tsv", col_names = F) %>% | |
select(X1, X2, X3) %>% | |
dplyr::rename(otu_name = X1, ref = X2, id = X3) %>% | |
separate(otu_name,into = "otu_name", sep = " ", remove = TRUE, extra = "drop") %>% | |
inner_join(uclust_mock_even) | |
uclust_even_den <- ggplot(uclust_even_vs_ref, aes(id/100, fill = label, color = label)) + | |
geom_density(alpha = 0.6) + | |
theme_light() + | |
xlab("Identity") + | |
ylab("Density") + | |
scale_x_continuous(labels = scales::percent) + | |
ggtitle("uclust") | |
uclust_even_vs_ref_100 <- uclust_even_vs_ref %>% | |
select(-otu_name) %>% | |
group_by(label) %>% | |
mutate(N=sum(count),prop = count/N) %>% | |
filter(id >= 97) %>% | |
group_by(ref, label) %>% | |
summarise(prop = sum(prop)) %>% | |
ungroup() %>% group_by(ref) %>% | |
summarise(prop = mean((prop))) %>% | |
right_join(alma_mock %>% filter(!is.na(exp_even))) %>% | |
mutate(exp_even = exp_even/100, obs_even = obs_even/100) %>% | |
select(-contains("Stag"), -ref) %>% | |
dplyr::rename(emose_dada_obs = prop, alma_exp = exp_even, alma_obs = obs_even) %>% | |
gather(exp,values, c(emose_dada_obs, alma_exp, alma_obs)) %>% | |
ungroup() #%>% filter(!is.na(label)) | |
uclust_even_vs_ref_100$ref_name <- factor(uclust_even_vs_ref_100$ref_name, levels = rev(sort(unique(uclust_even_vs_ref_100$ref_name)))) | |
uclust_even_vs_ref_100_comp <- ggplot(uclust_even_vs_ref_100, aes(ref_name, values, fill = exp)) + | |
geom_bar(position = "dodge", stat = "identity") + | |
coord_flip() + | |
theme_light() + | |
theme(legend.position = "top", | |
legend.title = element_blank()) + | |
scale_fill_manual(values = rev(c( "#1B9E77", "#D95F02" ,"#7570B3"))) + | |
ylab("Proportion (hits >= 97%)") + | |
xlab("") + | |
scale_y_continuous(labels = scales::percent) | |
# Staggered | |
# Compare references | |
# usearch10.0.240_i86osx32 -usearch_global rep_otus.fasta \ | |
# -db references_staggered_mock_noprimers.fasta | |
# -strand both -id 0.5 -blast6out otu_rep_vs_stag.tsv | |
uclust_stag_vs_ref <- read_tsv("otu_rep_vs_stag.tsv", col_names = F) %>% | |
select(X1, X2, X3) %>% | |
dplyr::rename(otu_name = X1, ref = X2, id = X3) %>% | |
separate(otu_name,into = "otu_name", sep = " ", remove = TRUE, extra = "drop") %>% | |
inner_join(uclust_mock_stag) | |
uclust_stag_den <- ggplot(uclust_stag_vs_ref, aes(id/100, fill = label, color = label)) + | |
geom_density(alpha = 0.6) + | |
theme_light() + | |
xlab("Identity") + | |
ylab("Density") + | |
scale_x_continuous(labels = scales::percent) + | |
ggtitle("UCLUST") | |
uclust_stag_vs_ref_100 <- uclust_stag_vs_ref %>% | |
select(-otu_name) %>% | |
group_by(label) %>% | |
mutate(N=sum(count),prop = count/N) %>% | |
filter(id >= 97) %>% | |
group_by(ref, label) %>% | |
summarise(prop = sum(prop)) %>% | |
ungroup() %>% group_by(ref) %>% | |
summarise(prop = mean((prop))) %>% | |
right_join(alma_mock) %>% | |
mutate(exp_stag = exp_stag/100, obs_stag = obs_stag/100) %>% | |
select(-contains("even"), -ref) %>% | |
dplyr::rename(emose_dada_obs = prop, alma_exp = exp_stag, alma_obs = obs_stag) %>% | |
gather(exp,values, c(emose_dada_obs, alma_exp, alma_obs)) %>% | |
ungroup() | |
uclust_stag_vs_ref_100$ref_name <- factor(uclust_stag_vs_ref_100$ref_name, levels = rev(sort(unique(uclust_stag_vs_ref_100$ref_name)))) | |
uclust_stag_vs_ref_100_comp <- ggplot(uclust_stag_vs_ref_100, aes(ref_name, values, fill = exp)) + | |
geom_bar(position = "dodge", stat = "identity") + | |
coord_flip() + | |
theme_light() + | |
theme(legend.position = "top", | |
legend.title = element_blank()) + | |
scale_fill_manual(values = rev(c( "#1B9E77", "#D95F02" ,"#7570B3"))) + | |
ylab("Proportion (hits >= 97%)") + | |
xlab("") + | |
scale_y_continuous(labels = scales::percent) | |
all_combined_stag <- bind_rows(uclust_stag_vs_ref_100 %>% | |
spread(exp, values) %>% | |
select(ref_name, emose_dada_obs, alma_exp) %>% | |
dplyr::rename(X = emose_dada_obs, Y = alma_exp) %>% | |
mutate(exp = "UCLUST"), | |
uclust_stag_vs_ref_100 %>% | |
spread(exp, values) %>% | |
select(ref_name, alma_obs, alma_exp) %>% | |
dplyr::rename(X = alma_obs, Y = alma_exp) %>% | |
mutate(exp = "Parada et al. 2015")) | |
all <- ggpubr::ggscatter(all_combined_stag, x = "X", y = "Y", | |
add = "reg.line", # Add regression line | |
conf.int = TRUE, # Add confidence interval | |
color = "exp", | |
palette = get_palette(c("#00AFBB", "#FC4E07"), 3), # Color by groups "cyl" | |
#shape = "exp", # Change point shape by groups "cyl" | |
fullrange = TRUE, | |
alpha = 0.9, # Extending the regression line | |
facet.by = "exp" | |
#rug = TRUE | |
)+ stat_cor(aes(color = exp)) + | |
scale_y_log10(labels = scales::percent) + | |
scale_x_log10(labels = scales::percent) + | |
ylab("515F-Y/926R Observed abundance") + | |
xlab("Expected abundance") + | |
theme_light() + | |
theme(legend.position = "none") | |
ggdraw() + | |
draw_plot(uclust_even_vs_ref_100_comp, x = 0, y = .5, width = .5, height = .5) + | |
draw_plot(uclust_stag_vs_ref_100_comp, x = .5, y = .5, width = .5, height = .5) + | |
draw_plot(all, x = 0, y = 0, width = 1, height = 0.5) + | |
draw_plot_label(label = c("A", "B", "C"), size = 15, | |
x = c(0, 0.5, 0), y = c(1, 1, 0.5)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#set -x | |
DIR="${1}" | |
EXT1="${2}" | |
EXT2="${3}" | |
# programs | |
TRIMEXE=trimmomatic | |
USEAEXE=~/Downloads/usearch10.0.240_i86osx32 | |
# Folders | |
QCFILES=qc_files | |
MEFILES=merged_files | |
FASTA=16s_fna | |
CHIMFILES=${FASTA}/usearch61_chimera_checking/ | |
NOCHIMFILES=${FASTA}/no_chimera | |
OTUFILES=otu_picking_16s_uclust99/ | |
TMP=tmp | |
# Get all pair-ends | |
FILES=($(find "${DIR}" -name "*${EXT1}*" -exec basename {} \;)) | |
mkdir -p "${QCFILES}" "${TMP}" "${MEFILES}" "${FASTA}" "${OTUFILES}" "${NOCHIMFILES}" | |
for element in $(seq 0 $((${#FILES[@]} - 1))); do | |
FWD="${FILES[$element]}" | |
REV="${FILES[$element]/${EXT1}/${EXT2}}" | |
# quality trim | |
QCF="${FWD/fastq.gz/qc.fastq.gz}" | |
QCR=$"{REV/fastq.gz/qc.fastq.gz}" | |
"${TRIMEXE}" PE -phred33 "${DIR}"/"${FWD}" "${DIR}"/"${REV}" \ | |
"${TMP}"/R1_pe "${TMP}"/R1_se "${TMP}"/R2_pe "${TMP}"/R2_se \ | |
SLIDINGWINDOW:4:20 MINLEN:200 | |
mv "${TMP}"/R1_pe "${QCFILES}"/"${QCF}" | |
mv "${TMP}"/R2_pe "${QCFILES}"/"${QCR}" | |
rm "${TMP}"/R1_se | |
rm "${TMP}"/R2_se | |
# Merge reads | |
ME="${MEFILES}"/"${FILES[$element]/${EXT1}/merged.fastq}" | |
MEFA="${FASTA}"/"${FILES[$element]/${EXT1}/merged.fasta}" | |
SE1="${MEFILES}"/notmerged_"${QCF}" | |
SE2="${MEFILES}"/notmerged_"${QCR}" | |
"${USEAEXE}" -fastq_mergepairs "${QCFILES}"/"${QCF}" -reverse "${QCFILES}"/"${QCR}" \ | |
-fastqout "${ME}" -fastaout "${MEFA}" -fastq_maxdiffs 3 -fastq_minmergelen 300 \ | |
-fastqout_notmerged_fwd "${SE1}" -fastqout_notmerged_rev "${SE2}" | |
# Convert to fasta Skipped because to slow for the test | |
# convert_fastaqual_fastq.py -f "${ME}" -o "${FASTA}" -c fastq_to_fastaqual | |
# identify chimeric sequences | |
identify_chimeric_seqs.py -m usearch61 -i "${MEFA}" -o "${CHIMFILES}" --suppress_usearch61_ref | |
NOCHIM="${MEFA/merged.fasta/nochimera.merged.fasta}" | |
filter_fasta.py -f "${MEFA}" -o "${NOCHIM}" -s "${CHIMFILES}"/chimeras.txt -n | |
# rename reads to be used with qiime (using gawk for mac os X) | |
SAMPLE="${FWD/.${EXT1}/}" | |
gawk -i inplace -vS="${SAMPLE}" 'BEGIN{i=1}{if ($0 ~ />/){print ">"S"_"i; i++}else{print $0}}' "${NOCHIM}" | |
mv "${NOCHIM}" "${NOCHIMFILES}" | |
done | |
# Combine all fasta files | |
cat "${NOCHIMFILES}"/*.fasta > "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna | |
# Pick OTUs | |
pick_otus.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna -o "${OTUFILES}" -s 0.99 | |
# Pick representative | |
pick_rep_set.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed_otus.txt -f "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna \ | |
-m most_abundant -o "${OTUFILES}"/rep_otus.fasta | |
# Make OTU table | |
make_otu_table.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed_otus.txt -o "${OTUFILES}"/16s.insilico_mock.biom | |
# Convert to tsv | |
biom convert -i "${OTUFILES}"/16s.insilico_mock.tsv -o test --to-tsv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment