|
# Install Bioconductor and required packages |
|
# This command installs the BiocManager package for managing Bioconductor packages |
|
install.packages("BiocManager") |
|
|
|
# Use BiocManager to install specific packages for data analysis and visualization |
|
BiocManager::install(c("GEOquery", "SummarizedExperiment", "ggplot2", |
|
"party", "ggparty", "partykit", "randomForest")) |
|
|
|
# Load necessary packages for modeling and visualization |
|
library(party) # For creating classification trees |
|
library(ggparty) # For visualizing classification trees with ggplot2 |
|
library(partykit) # Provides tools for working with party objects |
|
|
|
# Load the iris dataset, which is a well-known dataset in R for classification |
|
data(iris) |
|
|
|
# Display the first few rows of the iris dataset |
|
head(iris) |
|
|
|
# Create a classification tree model using conditional inference trees |
|
# The model predicts the Species based on all other variables in the dataset |
|
py <- party::ctree(Species ~ ., data = iris) |
|
|
|
# Visualize the classification tree created by the model |
|
plot(py) |
|
|
|
# Define a function to create and visualize a classification tree |
|
plot_random_sample <- function(fraction) { |
|
# Create a random sample of the iris dataset |
|
sample_indices <- sample(1:nrow(iris), fraction * nrow(iris)) # Randomly select indices |
|
iris_sample <- iris[sample_indices, ] # Create a sample dataset |
|
|
|
# Create a classification tree model from the random sample |
|
py_sample <- party::ctree(Species ~ ., data = iris_sample) |
|
|
|
# Visualize the classification tree for the sampled data |
|
plot(py_sample) |
|
} |
|
|
|
# Call the function to plot a classification tree using 50% of the iris dataset |
|
plot_random_sample(0.5) |
|
plot_random_sample(0.5) |
|
plot_random_sample(0.5) |
|
|
|
# Load the randomForest package for random forest modeling |
|
library(randomForest) |
|
|
|
# Create a random forest model to classify Species based on the iris dataset |
|
# 'ntree = 1000' specifies that 1000 trees will be used in the model |
|
rf = randomForest(Species ~ ., data = iris, ntree = 1000) |
|
|
|
# Display the importance of each feature in the random forest model |
|
importance(rf) |
|
|
|
## Perform PCA (Principal Component Analysis) |
|
library(GEOquery) # Load GEOquery package for accessing GEO datasets |
|
gse = getGEO("GSE103512")[[1]] # Retrieve a specific GEO dataset |
|
|
|
library(SummarizedExperiment) # Load the package for working with summarized experiment data |
|
se = as(gse, "SummarizedExperiment") # Convert the GEO dataset to a SummarizedExperiment object |
|
|
|
# Create a contingency table to show the relationship between cancer types and normal samples |
|
with(colData(se),table(`cancer.type.ch1`,`normal.ch1`)) |
|
|
|
# Calculate the standard deviation for each gene across all samples |
|
sds = apply(assay(se, 'exprs'), 1, sd) |
|
# Select the top 500 genes with the highest standard deviation |
|
dat = assay(se, 'exprs')[order(sds, decreasing = TRUE)[1:500],] |
|
|
|
# Perform PCA on the transposed data |
|
# 't(dat)' transposes the dataset so that samples are rows and genes are columns |
|
# 'center = TRUE' centers the data by subtracting the mean |
|
# 'scale. = TRUE' scales the data to have unit variance |
|
pca_results = prcomp(t(dat), center = TRUE, scale. = TRUE) |
|
|
|
# Convert PCA results into a data frame |
|
# 'pca_results$x' contains the principal component scores (new coordinates for our data) |
|
pca_vals = as.data.frame(pca_results$x) |
|
|
|
# Add additional information to the PCA results for further analysis |
|
# 'Type' is a factor representing different cancer types |
|
pca_vals$Type = factor(colData(se)[,'cancer.type.ch1']) |
|
# 'Normal' is a factor representing whether the sample is normal or not |
|
pca_vals$Normal = factor(colData(se)[,'normal.ch1']) |
|
|
|
# Display the first few rows of the PCA results |
|
head(pca_vals) |
|
|
|
# Plot the proportion of variance explained by each principal component |
|
# This can help in determining how many components to retain for further analysis |
|
screeplot(pca_results) |
|
|
|
# Create a scatter plot using ggplot2 to visualize the first two principal components |
|
# 'aes' maps the x and y axes to the first two principal components (PC1 and PC2) |
|
# 'shape' distinguishes the points based on the Normal variable |
|
# 'color' distinguishes the points based on the Type variable |
|
library(ggplot2) # Load ggplot2 for creating graphics |
|
ggplot(pca_vals, aes(x=PC1, y=PC2, shape=Normal, color=Type)) + |
|
geom_point(alpha=0.6) + # Add points to the plot with some transparency |
|
theme(text=element_text(size = 18)) # Increase text size for better readability |