nickpettican · March 9, 2016 14:57
diff --git a/RP1_R_commands b/RP1_R_commands
 # Export merged_data to a tab-separated file, leaving strings unquoted.
 write.table(
   x = merged_data,
   file = "merged_data.tsv",
   sep = "\t",
   quote = FALSE
 )

 # open multiple datasets
 # list.files() expects a regular expression, not a shell glob, so the
 # extension is escaped and anchored instead of written as "*.tsv".
 temp <- list.files(pattern = "\\.tsv$")

 # Abundance column name for each file: "<sample>_output.tsv" becomes
 # "<sample>_abundance". fixed = TRUE makes the "." a literal dot.
 temp2 <- gsub("_output.tsv", "_abundance", temp, fixed = TRUE)

 # Quick look at the first file on its own (skipped when no file was found).
 if (length(temp) > 0) {
   df <- read.table(
     temp[1],
     header = TRUE,
     col.names = c("ID", "T", "P", temp2[1])
   )
 }

 # Read every file into a data frame named after the file it came from.
 # seq_along() makes the loop a no-op for an empty file list, whereas
 # 1:length(temp) would iterate over c(1, 0).
 for (i in seq_along(temp)) {
   assign(
     temp[i],
     read.table(
       temp[i],
       header = TRUE,
       col.names = c("GeneID", "TCount", "PCount", temp2[i])
     )
   )
 }

 # Full outer join of df1, df2 and df3 on the PaxID/ENSPID key pair, folding
 # the data frames left to right (all = TRUE keeps unmatched rows).
 df_all <- Reduce(
   function(left, right) {
     merge(left, right, by = c("PaxID", "ENSPID"), all = TRUE)
   },
   list(df1, df2, df3)
 )

 # Relabel the three columns of ensembl_export02.
 colnames(ensembl_export02) <- c(
   "GeneID",
   "TransID",
   "ProtIDe"
 )

 # sort the rows by gene ID, so long as the gene ID column is called GeneID
 # order() is given the column itself: a bare GeneID is only found when an
 # object of that name happens to be visible (e.g. after attach()).
 sort.merged_data <- merged_data[order(merged_data$GeneID), ]

 # Draw 100 distinct rows at random from all_int_analysis (sampling without
 # replacement) and keep them as a separate object.
 sampling_all_int02 <- all_int_analysis[
   sample(nrow(all_int_analysis), size = 100, replace = FALSE), ]

 # Read the integrated whole-organism table: the first line is treated as
 # data rather than column names (header = FALSE) and anything following a
 # "#" is skipped as a comment.
 WHOLE_ORGANISM_integrated <- read.delim(
   "C:/MSc/RP1/WHOLE_ORGANISM_integrated.txt",
   comment.char = "#",
   header = FALSE
 )

 # Trim every element of column1 to the characters between positions start
 # and finish; whatever lies outside that window is dropped.
 dataframe$column1 <- substr(dataframe$column1, start = start, stop = finish)

 # Per-row mean over every column except the first (usually the gene IDs),
 # ignoring NA values; the first column is carried along as ID.
 newdataname <- data.frame(
   ID = dataname[, 1],
   Means = rowMeans(dataname[, -1], na.rm = TRUE)
 )

 # removes every object listed by ls() (data frames, vectors, functions, ...),
 # not only data frames; run at top level this empties the workspace
 rm(list = ls())

 # Scatter plot using filled circles (pch 21) with a red fill.
 plot(PCount, PAbundance, bg = "red", pch = 21)

 # Scatter plot with semi-transparent dark-green points: the fourth rgb()
 # value is the alpha channel (50 on the 0-255 scale set by maxColorValue).
 plot(
   PCount, CellLineAbundance,
   col = rgb(red = 0, green = 100, blue = 0, alpha = 50, maxColorValue = 255),
   pch = 16
 )

 # 3D scatter plot of PCount against the two range ratios, using the same
 # semi-transparent points.
 scatter3D(
   temp_calc_morethan19$PCount,
   temp_calc_morethan19$range_div_median,
   temp_calc_morethan19$range_div_range,
   phi = 40,
   pch = 16,
   col = rgb(red = 0, green = 100, blue = 0, alpha = 50, maxColorValue = 255),
   main = "quartRange median PCount \n>=19 is_data",
   xlab = "PCount",
   ylab = "intQuartRange/median",
   zlab = "intQuartRange/maxminRange"
 )

 # make three graphs next to each other
 # par() returns the previous values of the settings it changes; keep them so
 # the layout can be restored afterwards.
 old_par <- par(mfrow = c(1, 3))
 # ... three plot() calls go here, one per panel ...

 # restore the previous layout (resetPar() is not a base R function)
 par(old_par)
 par("mfrow") # query only: prints the current layout to confirm the reset

 # draw regression line
 # abline() adds to the current plot, so the scatter plot must be drawn first
 abline(lm(BPAbundance ~ BPCount), col = "blue")

 # Pearson's product-moment correlation (the default method of cor.test())
 cor.test(PCount, PAbundance, method = "pearson")

 # Spearman's rank correlation
 cor.test(PCount, PAbundance, method = "spearman")

 # calculate max min of each row (gene)
 # element-wise maximum / minimum across two (or more) different columns
 pmax(dataframe$column1, dataframe$column2)
 pmin(dataframe$column1, dataframe$column2)
 # or, over a range of columns: apply() needs an object with rows and columns,
 # so pass a column range (a single dataframe$column vector has no dim and
 # makes apply() fail); drop = FALSE keeps that true for a one-column range
 apply(dataframe[, 4:lastone, drop = FALSE], 1, max, na.rm = TRUE)
 apply(dataframe[, 4:lastone, drop = FALSE], 1, min, na.rm = TRUE)
 # apply is for matrices, the number 1 is for rows and 2 is for columns

 # Box plots suit non-parametric data such as ours.
 boxplot(x = columnname)
 # boxplot() draws one box per matrix column, so transpose to get one box
 # per row of dataframe.
 boxplot(x = t(dataframe))
 # fivenum returns minimum, lower-hinge, median, upper-hinge, maximum
 fivenum(x = columnname)

 quantile(x = rows)
 # Upper quartile (75th percentile) of every row across columns 4 to 22 of
 # all_int_merged_new, ignoring NA values.
 all_int_analysis$upper_quart <- apply(
   X = all_int_merged_new[, 4:22],
   MARGIN = 1,
   FUN = quantile,
   probs = 0.75,
   na.rm = TRUE
 )

 # fit a random forest (randomForest package) and keep the fitted model;
 # `response` and `dataframe` are placeholders for the real column and data
 the_model <- randomForest(response ~ ., data = dataframe)
 varImpPlot(the_model) # will show the importance of each tissue for predicting
	# Export merged_data to a tab-separated file, leaving strings unquoted.
	write.table(
	  x = merged_data,
	  file = "merged_data.tsv",
	  sep = "\t",
	  quote = FALSE
	)

	# open multiple datasets
	# list.files() expects a regular expression, not a shell glob, so the
	# extension is escaped and anchored instead of written as "*.tsv".
	temp <- list.files(pattern = "\\.tsv$")

	# Abundance column name for each file: "<sample>_output.tsv" becomes
	# "<sample>_abundance". fixed = TRUE makes the "." a literal dot.
	temp2 <- gsub("_output.tsv", "_abundance", temp, fixed = TRUE)

	# Quick look at the first file on its own (skipped when no file was found).
	if (length(temp) > 0) {
	  df <- read.table(
	    temp[1],
	    header = TRUE,
	    col.names = c("ID", "T", "P", temp2[1])
	  )
	}

	# Read every file into a data frame named after the file it came from.
	# seq_along() makes the loop a no-op for an empty file list, whereas
	# 1:length(temp) would iterate over c(1, 0).
	for (i in seq_along(temp)) {
	  assign(
	    temp[i],
	    read.table(
	      temp[i],
	      header = TRUE,
	      col.names = c("GeneID", "TCount", "PCount", temp2[i])
	    )
	  )
	}

	# Full outer join of df1, df2 and df3 on the PaxID/ENSPID key pair, folding
	# the data frames left to right (all = TRUE keeps unmatched rows).
	df_all <- Reduce(
	  function(left, right) {
	    merge(left, right, by = c("PaxID", "ENSPID"), all = TRUE)
	  },
	  list(df1, df2, df3)
	)

	# Relabel the three columns of ensembl_export02.
	colnames(ensembl_export02) <- c(
	  "GeneID",
	  "TransID",
	  "ProtIDe"
	)

	# sort the rows by gene ID, so long as the gene ID column is called GeneID
	# order() is given the column itself: a bare GeneID is only found when an
	# object of that name happens to be visible (e.g. after attach()).
	sort.merged_data <- merged_data[order(merged_data$GeneID), ]

	# Draw 100 distinct rows at random from all_int_analysis (sampling without
	# replacement) and keep them as a separate object.
	sampling_all_int02 <- all_int_analysis[
	  sample(nrow(all_int_analysis), size = 100, replace = FALSE), ]

	# Read the integrated whole-organism table: the first line is treated as
	# data rather than column names (header = FALSE) and anything following a
	# "#" is skipped as a comment.
	WHOLE_ORGANISM_integrated <- read.delim(
	  "C:/MSc/RP1/WHOLE_ORGANISM_integrated.txt",
	  comment.char = "#",
	  header = FALSE
	)

	# Trim every element of column1 to the characters between positions start
	# and finish; whatever lies outside that window is dropped.
	dataframe$column1 <- substr(dataframe$column1, start = start, stop = finish)

	# Per-row mean over every column except the first (usually the gene IDs),
	# ignoring NA values; the first column is carried along as ID.
	newdataname <- data.frame(
	  ID = dataname[, 1],
	  Means = rowMeans(dataname[, -1], na.rm = TRUE)
	)

	# removes every object listed by ls() (data frames, vectors, functions, ...),
	# not only data frames; run at top level this empties the workspace
	rm(list = ls())

	# Scatter plot using filled circles (pch 21) with a red fill.
	plot(PCount, PAbundance, bg = "red", pch = 21)

	# Scatter plot with semi-transparent dark-green points: the fourth rgb()
	# value is the alpha channel (50 on the 0-255 scale set by maxColorValue).
	plot(
	  PCount, CellLineAbundance,
	  col = rgb(red = 0, green = 100, blue = 0, alpha = 50, maxColorValue = 255),
	  pch = 16
	)

	# 3D scatter plot of PCount against the two range ratios, using the same
	# semi-transparent points.
	scatter3D(
	  temp_calc_morethan19$PCount,
	  temp_calc_morethan19$range_div_median,
	  temp_calc_morethan19$range_div_range,
	  phi = 40,
	  pch = 16,
	  col = rgb(red = 0, green = 100, blue = 0, alpha = 50, maxColorValue = 255),
	  main = "quartRange median PCount \n>=19 is_data",
	  xlab = "PCount",
	  ylab = "intQuartRange/median",
	  zlab = "intQuartRange/maxminRange"
	)

	# make three graphs next to each other
	# par() returns the previous values of the settings it changes; keep them so
	# the layout can be restored afterwards.
	old_par <- par(mfrow = c(1, 3))
	# ... three plot() calls go here, one per panel ...

	# restore the previous layout (resetPar() is not a base R function)
	par(old_par)
	par("mfrow") # query only: prints the current layout to confirm the reset

	# draw regression line
	# abline() adds to the current plot, so the scatter plot must be drawn first
	abline(lm(BPAbundance ~ BPCount), col = "blue")

	# Pearson's product-moment correlation (the default method of cor.test())
	cor.test(PCount, PAbundance, method = "pearson")

	# Spearman's rank correlation
	cor.test(PCount, PAbundance, method = "spearman")

	# calculate max min of each row (gene)
	# element-wise maximum / minimum across two (or more) different columns
	pmax(dataframe$column1, dataframe$column2)
	pmin(dataframe$column1, dataframe$column2)
	# or, over a range of columns: apply() needs an object with rows and columns,
	# so pass a column range (a single dataframe$column vector has no dim and
	# makes apply() fail); drop = FALSE keeps that true for a one-column range
	apply(dataframe[, 4:lastone, drop = FALSE], 1, max, na.rm = TRUE)
	apply(dataframe[, 4:lastone, drop = FALSE], 1, min, na.rm = TRUE)
	# apply is for matrices, the number 1 is for rows and 2 is for columns

	# Box plots suit non-parametric data such as ours.
	boxplot(x = columnname)
	# boxplot() draws one box per matrix column, so transpose to get one box
	# per row of dataframe.
	boxplot(x = t(dataframe))
	# fivenum returns minimum, lower-hinge, median, upper-hinge, maximum
	fivenum(x = columnname)

	quantile(x = rows)
	# Upper quartile (75th percentile) of every row across columns 4 to 22 of
	# all_int_merged_new, ignoring NA values.
	all_int_analysis$upper_quart <- apply(
	  X = all_int_merged_new[, 4:22],
	  MARGIN = 1,
	  FUN = quantile,
	  probs = 0.75,
	  na.rm = TRUE
	)

	# fit a random forest (randomForest package) and keep the fitted model;
	# `response` and `dataframe` are placeholders for the real column and data
	the_model <- randomForest(response ~ ., data = dataframe)
	varImpPlot(the_model) # will show the importance of each tissue for predicting