genomewalker · April 8, 2020 06:35
diff --git a/norm.R b/norm.R
 library(tidyverse)
 library(DESeq2)
 library(phyloseq)
 library(taxonomizr)

 # Read metadata
 metadata <- read_csv("data/metadata_v10.csv", )
 names(metadata) <- c("label", "age_ka_bp", "age_type",  "site_abrev", "region", "dating_lab_id",
                     "C14_age", "OSL_age", "age_errors", "group1", "lat", "lon")

 # Read abundance tables
 data_raw <- read_csv("data/readsNO_genus.csv")
 data_norm <- read_csv("data/readsNO-norm_genus_richness.csv")
 data_prop <- read_csv("data/readsNO-prop_genus_with_metadata.csv")

 # get intersection of samples and taxa
 data_all <- data_raw %>%
  pivot_longer(cols = c(-taxa, -taxa_ID), names_to = "sample", values_to = "counts") %>%
  inner_join(data_norm %>%
               rename(taxa = genus) %>%
               pivot_longer(cols = -taxa, names_to = "sample", values_to = "counts_scaled")) %>%
  inner_join(data_prop %>%
               select(-age, -region, -group1) %>%
               pivot_longer(cols = -sample, names_to = "taxa", values_to = "prop"))

 samples <- data_all$sample %>% unique()
 taxas <- data_all$taxa_ID %>% unique()

 # Get full taxonomic paths
 tax_paths <- taxas %>%
  map_dfr(function(X){getTaxonomy(X) %>% as_tibble(rownames = "taxid")})

 # Create a phyloseq object to keep everything nice and tidy
 physeq_raw <- phyloseq(otu_table(data_raw %>% 
                               select(taxa_ID, all_of(samples)) %>% 
                               column_to_rownames("taxa_ID") %>% as.data.frame(), taxa_are_rows = TRUE),
                   sample_data(metadata %>% column_to_rownames("label") %>% as.data.frame()),
                   tax_table(tax_paths %>% column_to_rownames("taxid") %>% as.matrix()))

 # Convert physeq object to DESeq2 
 physeq_raw_ds <- phyloseq_to_deseq2(physeq_raw, ~1)

 # We need a custom geometric function to deal with 0s
 gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
 }

 geoMeans <- apply(counts(physeq_raw_ds), 1, gm_mean)

 # We want to use poscounts as its designed for metagenomic studies
 diagdds <- estimateSizeFactors(physeq_raw_ds, type="poscounts", locfunc = genefilter::shorth, geoMeans = geoMeans)
 diagdds <- estimateDispersions(diagdds, fitType='local')
 diagvst <- varianceStabilizingTransformation(diagdds, blind = FALSE)
 diagvst <- assay(diagvst)
 # As we will have negative values (probably correspond to “less than one count” after rescaling)
 # we will convert them to 0 to avoid problems with distance/dissimilarity measures that have problems with
 # negative values
 diagvst[diagvst < 0] <- 0
 physeq_raw_vst <- physeq_raw
 physeq_raw_norm <- physeq_raw

 # Those are the variance stabilized counts
 otu_table(physeq_raw_vst) <- otu_table(diagvst, taxa_are_rows = TRUE)
 # This just divides each sample by the size factor.
 otu_table(physeq_raw_norm) <- otu_table(counts(diagdds, normalized = TRUE), taxa_are_rows = TRUE)
	library(tidyverse)
	library(DESeq2)
	library(phyloseq)
	library(taxonomizr)

	# Read metadata
	metadata <- read_csv("data/metadata_v10.csv", )
	names(metadata) <- c("label", "age_ka_bp", "age_type", "site_abrev", "region", "dating_lab_id",
	"C14_age", "OSL_age", "age_errors", "group1", "lat", "lon")

	# Read abundance tables
	data_raw <- read_csv("data/readsNO_genus.csv")
	data_norm <- read_csv("data/readsNO-norm_genus_richness.csv")
	data_prop <- read_csv("data/readsNO-prop_genus_with_metadata.csv")

	# get intersection of samples and taxa
	data_all <- data_raw %>%
	pivot_longer(cols = c(-taxa, -taxa_ID), names_to = "sample", values_to = "counts") %>%
	inner_join(data_norm %>%
	rename(taxa = genus) %>%
	pivot_longer(cols = -taxa, names_to = "sample", values_to = "counts_scaled")) %>%
	inner_join(data_prop %>%
	select(-age, -region, -group1) %>%
	pivot_longer(cols = -sample, names_to = "taxa", values_to = "prop"))

	samples <- data_all$sample %>% unique()
	taxas <- data_all$taxa_ID %>% unique()

	# Get full taxonomic paths
	tax_paths <- taxas %>%
	map_dfr(function(X){getTaxonomy(X) %>% as_tibble(rownames = "taxid")})

	# Create a phyloseq object to keep everything nice and tidy
	physeq_raw <- phyloseq(otu_table(data_raw %>%
	select(taxa_ID, all_of(samples)) %>%
	column_to_rownames("taxa_ID") %>% as.data.frame(), taxa_are_rows = TRUE),
	sample_data(metadata %>% column_to_rownames("label") %>% as.data.frame()),
	tax_table(tax_paths %>% column_to_rownames("taxid") %>% as.matrix()))

	# Convert physeq object to DESeq2
	physeq_raw_ds <- phyloseq_to_deseq2(physeq_raw, ~1)

	# We need a custom geometric function to deal with 0s
	gm_mean = function(x, na.rm=TRUE){
	exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
	}

	geoMeans <- apply(counts(physeq_raw_ds), 1, gm_mean)

	# We want to use poscounts as its designed for metagenomic studies
	diagdds <- estimateSizeFactors(physeq_raw_ds, type="poscounts", locfunc = genefilter::shorth, geoMeans = geoMeans)
	diagdds <- estimateDispersions(diagdds, fitType='local')
	diagvst <- varianceStabilizingTransformation(diagdds, blind = FALSE)
	diagvst <- assay(diagvst)
	# As we will have negative values (probably correspond to “less than one count” after rescaling)
	# we will convert them to 0 to avoid problems with distance/dissimilarity measures that have problems with
	# negative values
	diagvst[diagvst < 0] <- 0
	physeq_raw_vst <- physeq_raw
	physeq_raw_norm <- physeq_raw

	# Those are the variance stabilized counts
	otu_table(physeq_raw_vst) <- otu_table(diagvst, taxa_are_rows = TRUE)
	# This just divides each sample by the size factor.
	otu_table(physeq_raw_norm) <- otu_table(counts(diagdds, normalized = TRUE), taxa_are_rows = TRUE)