Last active
July 5, 2017 18:00
-
-
Save MarcinKosinski/bf8924c2ac1da613af675e66c6a12545 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| extractSurvival <- function(cohorts){ | |
| survivalData <- list() | |
| for(i in cohorts){ | |
| get(paste0(i, ".clinical"), envir = .GlobalEnv) %>% | |
| select(patient.bcr_patient_barcode, | |
| patient.vital_status, | |
| patient.days_to_last_followup, | |
| patient.days_to_death ) %>% | |
| mutate(bcr_patient_barcode = toupper(patient.bcr_patient_barcode), | |
| patient.vital_status = ifelse(patient.vital_status %>% | |
| as.character() =="dead",1,0), | |
| barcode = patient.bcr_patient_barcode %>% | |
| as.character(), | |
| times = ifelse( !is.na(patient.days_to_last_followup), | |
| patient.days_to_last_followup %>% | |
| as.character() %>% | |
| as.numeric(), | |
| patient.days_to_death %>% | |
| as.character() %>% | |
| as.numeric() ) | |
| ) %>% | |
| filter(!is.na(times)) -> survivalData[[i]] | |
| } | |
| do.call(rbind,survivalData) %>% | |
| select(bcr_patient_barcode, patient.vital_status, times) %>% | |
| unique | |
| } | |
| extractMutations <- function(cohorts, prc){ | |
| mutationsData <- list() | |
| for(i in cohorts){ | |
| get(paste0(i, ".mutations"), envir = .GlobalEnv) %>% | |
| select(Hugo_Symbol, bcr_patient_barcode) %>% | |
| filter(nchar(bcr_patient_barcode)==15) %>% | |
| filter(substr(bcr_patient_barcode, 14, 15)=="01") %>% | |
| unique -> mutationsData[[i]] | |
| } | |
| do.call(rbind,mutationsData) %>% unique -> mutationsData | |
| mutationsData %>% | |
| group_by(Hugo_Symbol) %>% | |
| summarise(count = n()) %>% | |
| arrange(desc(count)) %>% | |
| mutate(count_prc = count/length(unique(mutationsData$bcr_patient_barcode))) %>% | |
| filter_(paste0("count_prc > ",prc)) %>% | |
| select(Hugo_Symbol) %>% | |
| unlist -> topGenes | |
| mutationsData %>% | |
| filter(Hugo_Symbol %in% topGenes) -> mutationsData_top | |
| mutationsData_top %>% | |
| dplyr::group_by(bcr_patient_barcode) %>% | |
| dplyr::summarise(count = n()) %>% | |
| group_by(count) %>% | |
| summarise(total = n()) %>% | |
| arrange(desc(count)) | |
| # | |
| # mutationsData_top %>% | |
| # spread(Hugo_Symbol, bcr_patient_barcode) -> mutationsData_top_sp | |
| as.data.table(mutationsData_top) -> mutationsData_top_DT | |
| dcast.data.table(mutationsData_top_DT, bcr_patient_barcode ~ Hugo_Symbol , fill = 0) %>% | |
| as.data.frame -> mutationsData_top_dcasted | |
| mutationsData_top_dcasted[,-1][mutationsData_top_dcasted[,-1] != "0"] <- 1 | |
| mutationsData_top_dcasted -> result | |
| names(result) <- gsub(names(result),pattern = "-", replacement = "") | |
| result | |
| } | |
| extractCohortIntersection <- function(){ | |
| data(package = "RTCGA.mutations")$results[,3] %>% | |
| gsub(".mutations", "", x = .) -> mutations_data | |
| data(package = "RTCGA.clinical")$results[,3] %>% | |
| gsub(".clinical", "", x = .) -> clinical_data | |
| intersect(mutations_data, clinical_data) | |
| } | |
| prepareCoxDataSplit <- function(mutationsData, survivalData, groups, seed = 4561){ | |
| mutationsData %>% | |
| mutate(bcr_patient_barcode = substr(bcr_patient_barcode,1,12)) %>% | |
| left_join(survivalData, | |
| by = "bcr_patient_barcode") -> coxData | |
| coxData <- coxData[, -c(1,2)] | |
| coxData %>% | |
| filter(times > 0) %>% | |
| filter(!is.na(times)) -> coxData | |
| apply(coxData[,-c(1092, 1093)], MARGIN = 2,function(x){ | |
| as.numeric(as.character(x)) | |
| }) -> coxData[,-c(1092, 1093)] | |
| set.seed(seed) | |
| sample(groups, replace = TRUE, size = 6085) -> groups | |
| split(coxData, groups) #coxData_split | |
| } | |
| prepareForumlaSGD <- function(coxData){ | |
| as.formula(paste("Surv(times, patient.vital_status) ~ ", | |
| paste(names(coxData[[1]])[-c(1092, 1093)], | |
| collapse="+"), collapse = "")) | |
| } | |
| full_cox_loglik_matrix <- function(beta, x, censored){ | |
| order(x$times) -> order2 | |
| x[order2, ] -> xORD | |
| censored[order2] -> censORD | |
| sum(censORD*(beta%*%x[, -which(names(x)=='times')] - | |
| log(cumsum(exp(beta1*rev(x1) + beta2*rev(x2)))))) | |
| } | |
| library(dplyr) | |
| library(RTCGA.clinical) | |
| library(RTCGA.mutations) | |
| library(data.table) | |
| library(coxphSGD) | |
| library(archivist) | |
| (extractCohortIntersection() -> cohorts) | |
| head(extractSurvival(cohorts) -> survivalData) | |
| extractMutations(cohorts, 0.02) -> mutationsData | |
| mutationsData[1:8, c(1,4,56,100,207,801)] | |
| set.seed(4561) | |
| prepareCoxDataSplit(mutationsData,survivalData, groups = 100) -> coxData_split | |
| head(coxData_split[[1]][c(1,10), c(210,302,356,898,911,1092:1093)]) | |
| prepareForumlaSGD(coxData_split) -> formulaSGD | |
| coxData_split[99:100] -> testCox | |
| coxData_split[1:98] -> trainCox | |
| coxphSGD(formulaSGD, data = trainCox, max.iter = 490) -> model_1_over_t | |
| coxphSGD(formulaSGD, data = trainCox, max.iter = 490, | |
| learningRates = function(t){1/(50*sqrt(t))}) -> model_1_over_50sqrt_t | |
| coxphSGD(formulaSGD, data = trainCox, max.iter = 490, | |
| learningRates = function(t){1/(100*sqrt(t))}) -> model_1_over_100sqrt_t | |
| saveToRepo(testCox) | |
| saveToRepo(trainCox) | |
| saveToRepo(model_1_over_t) | |
| saveToRepo(model_1_over_50sqrt_t) | |
| saveToRepo(model_1_over_100sqrt_t) | |
| model_1_over_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_1_overt | |
| do.call(rbind, coeffs_1_overt) %>% | |
| as.data.frame %>% | |
| mutate(epoka = 1:5) %>% | |
| tidyr::gather(key=epoka) ->gat | |
| names(gat)[2]<- "gen" | |
| library(ggplot2) | |
| gat %>% | |
| ggplot(aes(value)) + geom_histogram(fill=4, col = 1, binwidth = 0.01) + | |
| facet_grid(epoka~.) + | |
| theme_classic(base_size = 15) + | |
| ylab('liczność') + xlab('wartości współczynników modelu') + | |
| ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.5,0.5)) -> fig1 | |
| model_1_over_50sqrt_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_over_50sqrt_t | |
| do.call(rbind, coeffs_over_50sqrt_t) %>% | |
| as.data.frame %>% | |
| mutate(epoka = 1:5) %>% | |
| tidyr::gather(key=epoka) ->gat2 | |
| names(gat2)[2]<- "gen" | |
| library(ggplot2) | |
| gat2 %>% | |
| ggplot(aes(value)) + geom_histogram(fill=5, col = 1, binwidth = 0.01) + | |
| facet_grid(epoka~.) + | |
| theme_classic(base_size = 15) + | |
| ylab('liczność') + xlab('wartości współczynników modelu') + | |
| ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.25,0.25)) -> fig2 | |
| model_1_over_100sqrt_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_over_100sqrt_t | |
| do.call(rbind, coeffs_over_100sqrt_t) %>% | |
| as.data.frame %>% | |
| mutate(epoka = 1:5) %>% | |
| tidyr::gather(key=epoka) ->gat3 | |
| names(gat3)[2]<- "gen" | |
| gat3 %>% | |
| ggplot(aes(value)) + geom_histogram(fill=6, col = 1, binwidth = 0.01) + | |
| facet_grid(epoka~.) + | |
| theme_classic(base_size = 15) + | |
| ylab('liczność') + xlab('wartości współczynników modelu') + | |
| ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.1,0.1)) -> fig3 | |
| gat %>% | |
| mutate(model = "model1") %>% | |
| bind_rows(gat2 %>% | |
| mutate(model = "model2")) %>% | |
| bind_rows(gat3 %>% | |
| mutate(model = "model3")) %>% | |
| ggplot(aes(value)) + geom_histogram(fill=4, col = 1, binwidth = 0.001) + | |
| facet_grid(epoka~model) + | |
| theme_classic(base_size = 15) + | |
| ylab('liczność') + xlab('wartość współczynnika modelu') + | |
| ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') -> fig4 | |
| do.call(rbind, coeffs_over_100sqrt_t) %>% t %>% as.data.frame -> model3 | |
| do.call(rbind, coeffs_over_50sqrt_t) %>% t %>% as.data.frame -> model2 | |
| do.call(rbind, coeffs_1_overt) %>% t %>% as.data.frame -> model1 | |
| names(model3) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5") | |
| names(model2) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5") | |
| names(model1) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5") | |
| library(GGally) | |
| ggpairs( title = 'Różnice między współczynnikami w kolejnych epokach', | |
| model1, | |
| upper = "blank", | |
| lower = list(continuous = "points") | |
| ) -> fig5 | |
| saveToRepo(fig1) # 3dc2e14a037bbaae9e892dd255150c28 | |
| saveToRepo(fig2) # 6e5065abdbeae888a2836c1e59460424 | |
| saveToRepo(fig3) # ac9a06e9c6428581cdacb2c1e56e5b57 | |
| saveToRepo(fig4) # 60e327d310c600493c6c661f7330b868 | |
| saveToRepo(fig5) # db5b77ff56b7128244e88d905173cf22 | |
| pdf('hist_overt_t.pdf', width = 10, height = 8 ) | |
| fig2 | |
| dev.off() | |
| alink('3dc2e14a037bbaae9e892dd255150c28', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| alink('6e5065abdbeae888a2836c1e59460424', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| alink('ac9a06e9c6428581cdacb2c1e56e5b57', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| alink('60e327d310c600493c6c661f7330b868', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| alink('db5b77ff56b7128244e88d905173cf22', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| names(tail(sort(abs(tail(model_1_over_t$coefficients,1)[[1]])),40)) -> top40 | |
| tail(model_1_over_t$coefficients,1)[[1]][which(names(tail(model_1_over_t$coefficients,1)[[1]]) %in% top40)] -> real_top40 | |
| data.frame(beta = sort(real_top40, decreasing = TRUE), | |
| exp_beta = exp(sort(real_top40, decreasing = TRUE))) %>% xtable::xtable() | |
| do.call(rbind, trainCox) -> trainCox_df | |
| colMeans(trainCox_df[, -c(1092:1093)]) -> srednie_trainCox_df | |
| data.frame( beta =tail(model_1_over_t$coefficients,1)[[1]], | |
| beta_star = srednie_trainCox_df) -> wyniki | |
| data.frame( gen = rownames(wyniki), | |
| beta = round(wyniki[, 1], 3), | |
| exp_beta = round(exp(wyniki[, 1]),2), | |
| srednia = round(wyniki[, 2],3), | |
| beta_star = round(wyniki[, 1]/wyniki[, 2], 2) ) %>% | |
| arrange(desc(beta_star)) %>% | |
| .[1:40,] %>% xtable::xtable() | |
| aoptions('repoDir', getwd()) | |
| loadFromLocalRepo('446ac4dcb7d65bf39057bb341b296f1a') | |
| do.call(rbind, model_1_over_t$coefficients) -> wspolczynniki | |
| qplot(1:491,wspolczynniki[ ,which(colnames(wspolczynniki) == "BRAF")]) + | |
| xlab('Krok algorytmu') + | |
| ylab('Wartość współczynnika') + | |
| theme_minimal(base_size = 20) + | |
| ggtitle('Wartości współczynnika genu BRAF') -> fig8 | |
| qplot(1:491,wspolczynniki[ ,which(colnames(wspolczynniki) == "EGFR")]) + | |
| xlab('Krok algorytmu') + | |
| ylab('Wartość współczynnika') + | |
| theme_minimal(base_size = 20) + | |
| ggtitle('Wartości współczynnika genu EGFR') -> fig9 | |
| saveToRepo(fig8) # 3cd8e1f6da766d3028fc2d8f5fef220f | |
| alink('3cd8e1f6da766d3028fc2d8f5fef220f', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| saveToRepo(fig9) # 0f5e8e306cc2cba11fceb2bb5f34cbeb | |
| alink('0f5e8e306cc2cba11fceb2bb5f34cbeb', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex') | |
| pdf('fig8.pdf', width = 10, height = 8 ) | |
| fig8 | |
| dev.off() | |
| pdf('fig9.pdf', width = 10, height = 8 ) | |
| fig9 | |
| dev.off() | |
| library(archivist) | |
| setLocalRepo(getwd()) | |
| loadFromLocalRepo('1a06bef4a60a237bb65ca3e2f3f23515') | |
| loadFromLocalRepo('3eebc99bd231b16a3ea4dbeec9ab5edb') | |
| do.call(rbind,testCox) -> testCox_binded | |
| do.call(rbind,trainCox) -> trainCox_binded | |
| library(dplyr) | |
| library(survMisc) | |
| trainCox_binded %>% | |
| select(patient.vital_status, times, BRAF, EGFR) %>% | |
| survfit( Surv(times, patient.vital_status) ~ BRAF, data = .) %>% | |
| survMisc:::autoplot.survfit( titleSize=12, type="CI", palette = "Set1", | |
| alpha = 0.7, survLineSize=2, | |
| pX = 0.3, sigP =3, title = "Przeżycie a mutacja genu BRAF" ) -> km_plot | |
| xlims <- c(0,5000) | |
| library(ggplot2) | |
| km_plot[[1]] <- km_plot[[1]] + | |
| coord_cartesian(xlim = c(xlims[1],xlims[2])) + | |
| theme_light(base_size = 22) | |
| km_plot[[2]] <- km_plot[[2]] + | |
| coord_cartesian(xlim = xlims)+ | |
| theme_light(base_size = 22) | |
| pdf("BRAF.pdf", width = 10, height = 8) | |
| survMisc::autoplot(km_plot) | |
| dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment