seandavi · October 9, 2024 15:47
diff --git a/data_description.md b/data_description.md
diff --git a/IDPT_ml.R b/IDPT_ml.R
 # Install Bioconductor and required packages
 # This command installs the BiocManager package for managing Bioconductor packages
 install.packages("BiocManager")

 # Use BiocManager to install specific packages for data analysis and visualization
 BiocManager::install(c("GEOquery", "SummarizedExperiment", "ggplot2",
                       "party", "ggparty", "partykit", "randomForest"))

 # Load necessary packages for modeling and visualization
 library(party)          # For creating classification trees
 library(ggparty)        # For visualizing classification trees with ggplot2
 library(partykit)       # Provides tools for working with party objects

 # Load the iris dataset, which is a well-known dataset in R for classification
 data(iris)

 # Display the first few rows of the iris dataset
 head(iris)

 # Create a classification tree model using conditional inference trees
 # The model predicts the Species based on all other variables in the dataset
 py <- party::ctree(Species ~ ., data = iris)

 # Visualize the classification tree created by the model
 plot(py)

 # Define a function to create and visualize a classification tree
 plot_random_sample <- function(fraction) {
  # Create a random sample of the iris dataset
  sample_indices <- sample(1:nrow(iris), fraction * nrow(iris))  # Randomly select indices
  iris_sample <- iris[sample_indices, ]  # Create a sample dataset

  # Create a classification tree model from the random sample
  py_sample <- party::ctree(Species ~ ., data = iris_sample)

  # Visualize the classification tree for the sampled data
  plot(py_sample)
 }

 # Call the function to plot a classification tree using 50% of the iris dataset
 plot_random_sample(0.5)
 plot_random_sample(0.5)
 plot_random_sample(0.5)

 # Load the randomForest package for random forest modeling
 library(randomForest)

 # Create a random forest model to classify Species based on the iris dataset
 # 'ntree = 1000' specifies that 1000 trees will be used in the model
 rf = randomForest(Species ~ ., data = iris, ntree = 1000)

 # Display the importance of each feature in the random forest model
 importance(rf)

 ## Perform PCA (Principal Component Analysis)
 library(GEOquery)  # Load GEOquery package for accessing GEO datasets
 gse = getGEO("GSE103512")[[1]]  # Retrieve a specific GEO dataset

 library(SummarizedExperiment)  # Load the package for working with summarized experiment data
 se = as(gse, "SummarizedExperiment")  # Convert the GEO dataset to a SummarizedExperiment object

 # Create a contingency table to show the relationship between cancer types and normal samples
 with(colData(se),table(`cancer.type.ch1`,`normal.ch1`))

 # Calculate the standard deviation for each gene across all samples
 sds = apply(assay(se, 'exprs'), 1, sd)
 # Select the top 500 genes with the highest standard deviation
 dat = assay(se, 'exprs')[order(sds, decreasing = TRUE)[1:500],]

 # Perform PCA on the transposed data
 # 't(dat)' transposes the dataset so that samples are rows and genes are columns
 # 'center = TRUE' centers the data by subtracting the mean
 # 'scale. = TRUE' scales the data to have unit variance
 pca_results = prcomp(t(dat), center = TRUE, scale. = TRUE)

 # Convert PCA results into a data frame
 # 'pca_results$x' contains the principal component scores (new coordinates for our data)
 pca_vals = as.data.frame(pca_results$x)

 # Add additional information to the PCA results for further analysis
 # 'Type' is a factor representing different cancer types
 pca_vals$Type = factor(colData(se)[,'cancer.type.ch1'])
 # 'Normal' is a factor representing whether the sample is normal or not
 pca_vals$Normal = factor(colData(se)[,'normal.ch1'])

 # Display the first few rows of the PCA results
 head(pca_vals)

 # Plot the proportion of variance explained by each principal component
 # This can help in determining how many components to retain for further analysis
 screeplot(pca_results)

 # Create a scatter plot using ggplot2 to visualize the first two principal components
 # 'aes' maps the x and y axes to the first two principal components (PC1 and PC2)
 # 'shape' distinguishes the points based on the Normal variable
 # 'color' distinguishes the points based on the Type variable
 library(ggplot2)  # Load ggplot2 for creating graphics
 ggplot(pca_vals, aes(x=PC1, y=PC2, shape=Normal, color=Type)) +
  geom_point(alpha=0.6) +  # Add points to the plot with some transparency
  theme(text=element_text(size = 18))  # Increase text size for better readability
	# Install Bioconductor and required packages
	# This command installs the BiocManager package for managing Bioconductor packages
	install.packages("BiocManager")

	# Use BiocManager to install specific packages for data analysis and visualization
	BiocManager::install(c("GEOquery", "SummarizedExperiment", "ggplot2",
	"party", "ggparty", "partykit", "randomForest"))

	# Load necessary packages for modeling and visualization
	library(party) # For creating classification trees
	library(ggparty) # For visualizing classification trees with ggplot2
	library(partykit) # Provides tools for working with party objects

	# Load the iris dataset, which is a well-known dataset in R for classification
	data(iris)

	# Display the first few rows of the iris dataset
	head(iris)

	# Create a classification tree model using conditional inference trees
	# The model predicts the Species based on all other variables in the dataset
	py <- party::ctree(Species ~ ., data = iris)

	# Visualize the classification tree created by the model
	plot(py)

	# Define a function to create and visualize a classification tree
	plot_random_sample <- function(fraction) {
	# Create a random sample of the iris dataset
	sample_indices <- sample(1:nrow(iris), fraction * nrow(iris)) # Randomly select indices
	iris_sample <- iris[sample_indices, ] # Create a sample dataset

	# Create a classification tree model from the random sample
	py_sample <- party::ctree(Species ~ ., data = iris_sample)

	# Visualize the classification tree for the sampled data
	plot(py_sample)
	}

	# Call the function to plot a classification tree using 50% of the iris dataset
	plot_random_sample(0.5)
	plot_random_sample(0.5)
	plot_random_sample(0.5)

	# Load the randomForest package for random forest modeling
	library(randomForest)

	# Create a random forest model to classify Species based on the iris dataset
	# 'ntree = 1000' specifies that 1000 trees will be used in the model
	rf = randomForest(Species ~ ., data = iris, ntree = 1000)

	# Display the importance of each feature in the random forest model
	importance(rf)

	## Perform PCA (Principal Component Analysis)
	library(GEOquery) # Load GEOquery package for accessing GEO datasets
	gse = getGEO("GSE103512")[[1]] # Retrieve a specific GEO dataset

	library(SummarizedExperiment) # Load the package for working with summarized experiment data
	se = as(gse, "SummarizedExperiment") # Convert the GEO dataset to a SummarizedExperiment object

	# Create a contingency table to show the relationship between cancer types and normal samples
	with(colData(se),table(`cancer.type.ch1`,`normal.ch1`))

	# Calculate the standard deviation for each gene across all samples
	sds = apply(assay(se, 'exprs'), 1, sd)
	# Select the top 500 genes with the highest standard deviation
	dat = assay(se, 'exprs')[order(sds, decreasing = TRUE)[1:500],]

	# Perform PCA on the transposed data
	# 't(dat)' transposes the dataset so that samples are rows and genes are columns
	# 'center = TRUE' centers the data by subtracting the mean
	# 'scale. = TRUE' scales the data to have unit variance
	pca_results = prcomp(t(dat), center = TRUE, scale. = TRUE)

	# Convert PCA results into a data frame
	# 'pca_results$x' contains the principal component scores (new coordinates for our data)
	pca_vals = as.data.frame(pca_results$x)

	# Add additional information to the PCA results for further analysis
	# 'Type' is a factor representing different cancer types
	pca_vals$Type = factor(colData(se)[,'cancer.type.ch1'])
	# 'Normal' is a factor representing whether the sample is normal or not
	pca_vals$Normal = factor(colData(se)[,'normal.ch1'])

	# Display the first few rows of the PCA results
	head(pca_vals)

	# Plot the proportion of variance explained by each principal component
	# This can help in determining how many components to retain for further analysis
	screeplot(pca_results)

	# Create a scatter plot using ggplot2 to visualize the first two principal components
	# 'aes' maps the x and y axes to the first two principal components (PC1 and PC2)
	# 'shape' distinguishes the points based on the Normal variable
	# 'color' distinguishes the points based on the Type variable
	library(ggplot2) # Load ggplot2 for creating graphics
	ggplot(pca_vals, aes(x=PC1, y=PC2, shape=Normal, color=Type)) +
	geom_point(alpha=0.6) + # Add points to the plot with some transparency
	theme(text=element_text(size = 18)) # Increase text size for better readability