Skip to content

Instantly share code, notes, and snippets.

@tkuennen
Created December 3, 2025 05:41
Show Gist options
  • Select an option

  • Save tkuennen/9341d3c9ca98c6f39c761043124b844e to your computer and use it in GitHub Desktop.

Select an option

Save tkuennen/9341d3c9ca98c6f39c761043124b844e to your computer and use it in GitHub Desktop.
pip install lifelines
# #############################################################################
# # TCGA-BRCA Multiomics Data Integration and Survival Analysis
# #############################################################################
#
# This notebook demonstrates the integration of Transcriptomics (mRNA), Copy Number Variation (CNV),
# and Clinical data for Breast Invasive Carcinoma (BRCA) from The Cancer Genome Atlas (TCGA).
#
# **ASSUMPTION:** You have downloaded the following files from a source like the
# UCSC Xena Browser and placed them in the same directory as this notebook:
# 1. mRNA Expression: 'TCGA-BRCA.log2.normalized.mRNA.tsv'
# 2. Gene-level CNV: 'TCGA-BRCA.GISTIC.tsv'
# 3. Clinical/Survival Data: 'TCGA-BRCA.clinical.tsv'
# --- 1. SETUP: IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# NOTE: The 'lifelines' library is required for survival analysis.
# If you get a ModuleNotFoundError, run: pip install lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test
# Configuration for plotting
sns.set_theme(style="whitegrid", palette="Set2")
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.size'] = 10
print("Required libraries loaded successfully.")
# --- 2. DATA LOADING (Simulating Real TCGA File Loading) ---
def load_tcga_data(file_path, index_col=0):
"""Loads a TCGA TSV file and performs basic cleanup."""
print(f"Loading data from: {file_path}...")
try:
df = pd.read_csv(file_path, sep='\t', index_col=index_col)
# Transpose Omics data so samples are rows and features (genes) are columns
if 'mRNA' in file_path or 'GISTIC' in file_path:
df = df.T
print(f"Transposed data to {df.shape[0]} samples x {df.shape[1]} features.")
return df
except FileNotFoundError:
print(f"!!! ERROR: File not found at {file_path}. Generating mock data for demonstration.")
# --- MOCK DATA FALLBACK (If files are not present) ---
if 'mRNA' in file_path:
return pd.DataFrame(np.random.rand(500, 1500), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'Gene_{i}' for i in range(1500)])
elif 'GISTIC' in file_path:
return pd.DataFrame(np.random.randint(-2, 3, size=(500, 1000)), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'CNV_{i}' for i in range(1000)])
else: # Clinical data fallback
return pd.DataFrame({
'OS.time': np.random.randint(100, 3000, 500),
'OS': np.random.randint(0, 2, 500),
'ER_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
'PR_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
'HER2_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
}, index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)])
# Load the omics layers
clinical_df = load_tcga_data('TCGA-BRCA.clinical.tsv', index_col='sample')
rna_df = load_tcga_data('TCGA-BRCA.log2.normalized.mRNA.tsv')
cnv_df = load_tcga_data('TCGA-BRCA.GISTIC.tsv')
# --- 3. DATA HARMONIZATION & PREPROCESSING (TCGA ID Cleaning) ---
# TCGA IDs are hierarchical. We need to match the patient (12 digits, e.g., TCGA-A1-A0EX)
# or the sample (16 digits, e.g., TCGA-A1-A0EX-01).
def clean_tcga_ids(index, level='sample'):
"""Cleans TCGA Sample IDs to a common length (e.g., 12-digit patient or 15-digit sample)."""
if level == 'patient':
# Use 12 digits (patient barcode)
return [idx[:12] for idx in index]
elif level == 'sample':
# Use 15 digits (sample barcode)
return [idx[:15] for idx in index]
return index
# 3.1 Clean and standardize Omics sample IDs (15 digits is standard for primary tumor -01)
rna_df.index = clean_tcga_ids(rna_df.index)
cnv_df.index = clean_tcga_ids(cnv_df.index)
# 3.2 Clean Clinical IDs (often 12 digits, but we will align to the 15-digit omics IDs for max overlap)
# For simplicity in this structure, we assume clinical data is indexed by the first 15 characters of the sample ID.
clinical_df.index = clinical_tcga_ids = clean_tcga_ids(clinical_df.index)
# 3.3 Match samples across all dataframes
common_samples = list(set(clinical_df.index) & set(rna_df.index) & set(cnv_df.index))
clinical_df_matched = clinical_df.loc[common_samples]
rna_df_matched = rna_df.loc[common_samples]
cnv_df_matched = cnv_df.loc[common_samples]
print(f"\n--- Data Harmonization Summary ---")
print(f"Total initial samples in Clinical: {clinical_df.shape[0]}")
print(f"Total matched samples for full multiomics analysis: {len(common_samples)}")
# --- 4. CLINICAL DATA STRATIFICATION (Creating Subtypes) ---
# Based on the user's images, define molecular subtypes (Luminal, HER2-enriched, Triple-Negative)
def define_subtype(row):
# Use standard TCGA clinical column names (may need adjustment based on exact file)
er = str(row.get('ER_Status_ihc', 'Unknown')).lower()
pr = str(row.get('PR_Status_ihc', 'Unknown')).lower()
her2 = str(row.get('HER2_Status_ihc', 'Unknown')).lower()
if er == 'positive' and her2 == 'negative':
return 'Luminal A/B'
elif her2 == 'positive':
return 'HER2-Enriched'
elif er == 'negative' and pr == 'negative' and her2 == 'negative':
return 'Triple-Negative'
else:
return 'Other/Unknown'
clinical_df_matched['Subtype'] = clinical_df_matched.apply(define_subtype, axis=1)
clinical_df_final = clinical_df_matched[clinical_df_matched['Subtype'] != 'Other/Unknown']
rna_df_final = rna_df_matched.loc[clinical_df_final.index]
cnv_df_final = cnv_df_matched.loc[clinical_df_final.index]
print(f"Samples remaining after subtyping: {clinical_df_final.shape[0]}")
print(clinical_df_final['Subtype'].value_counts())
# --- 5. MULTI-OMICS INTEGRATION (Early Integration Strategy: PCA) ---
# 5.1 Select highly variable genes (improves PCA clustering)
# In real analysis, you'd filter by median absolute deviation (MAD) or Variance.
# Mock filtering for demonstration: keep top 50% of genes by variance.
rna_var = rna_df_final.var(axis=0).sort_values(ascending=False)
top_rna_features = rna_var.head(int(rna_var.shape[0] * 0.5)).index
rna_scaled = rna_df_final[top_rna_features]
# 5.2 Standardize and combine features
scaler = StandardScaler()
X_rna_scaled = scaler.fit_transform(rna_scaled)
X_cnv = cnv_df_final # CNV data is already categorical/discrete, usually no scaling needed.
# Concatenate (Early Integration)
# Use a smaller subset of CNV features just for visualization purposes if the number is too high
cnv_subset = cnv_df_final.iloc[:, :500]
X_combined = pd.DataFrame(
np.hstack([X_rna_scaled, cnv_subset.values]),
index=rna_scaled.index
)
# 5.3 Apply PCA for dimensionality reduction and visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_combined)
pca_df = pd.DataFrame(data=X_pca, index=X_combined.index, columns=['PC1', 'PC2'])
pca_df['Subtype'] = clinical_df_final['Subtype']
# Visualization: PCA plot
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Subtype', data=pca_df, style='Subtype', s=100, alpha=0.7)
plt.title('PCA of Integrated TCGA-BRCA Multi-Omics Data')
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)')
plt.legend(title='Molecular Subtype')
plt.show()
# --- 6. SURVIVAL ANALYSIS (Kaplan-Meier) ---
print("\n--- 6. Survival Analysis: Subtype Prognosis ---")
# Prepare data for survival analysis (using OS.time and OS)
# NOTE: TCGA time is often in days. We convert to years.
T = clinical_df_final['OS.time'] / 365.25 # Time in years (TCGA-BRCA standard)
E = clinical_df_final['OS'] # Event (1=Dead, 0=Alive)
plt.figure(figsize=(10, 6))
kmf = KaplanMeierFitter()
# Plot Kaplan-Meier curves for each major subtype
subtypes = ['Luminal A/B', 'HER2-Enriched', 'Triple-Negative']
for subtype in subtypes:
subset_mask = clinical_df_final['Subtype'] == subtype
if subset_mask.any():
kmf.fit(T[subset_mask], event_observed=E[subset_mask], label=subtype)
kmf.plot_survival_function(ci_show=False, linewidth=2)
plt.title('Overall Survival (OS) by TCGA-BRCA Molecular Subtype')
plt.xlabel('Time (Years)')
plt.ylabel('Survival Probability (Kaplan-Meier)')
plt.legend(title='Subtype')
plt.ylim(0, 1.05)
plt.grid(True, alpha=0.5, linestyle='--')
plt.show()
# --- 7. BIVARIATE/CLINICAL FEATURE ANALYSIS (Conceptual) ---
# Example: Cox Proportional Hazards Model (Requires lifelines installation)
# This step models how a specific gene expression level (e.g., ERBB2 for HER2+)
# predicts survival, controlling for other factors.
# Mock gene for Bivariate Cox Model
example_gene = rna_df_final.columns[50] # Placeholder gene
rna_df_final['ERBB2_like_Expr'] = rna_df_final[example_gene]
# Create the final dataset for the Cox model
cox_data = clinical_df_final[['OS.time', 'OS', 'Subtype']].copy()
cox_data['T_Years'] = cox_data['OS.time'] / 365.25
cox_data = cox_data.join(rna_df_final['ERBB2_like_Expr'])
# Cox model only accepts numerical features; one-hot encode the 'Subtype'
cox_data = pd.get_dummies(cox_data, columns=['Subtype'], drop_first=True)
cox_data = cox_data.dropna() # Drop any rows with missing data
if cox_data.shape[0] > 100:
cph = CoxPHFitter()
# Fit the model: Predict survival based on time/event, using gene expression and subtypes as covariates
cph.fit(cox_data, duration_col='T_Years', event_col='OS', formula="ERBB2_like_Expr + Subtype_HER2-Enriched + Subtype_Triple-Negative")
print("\n--- 7. Cox Proportional Hazards Model Summary ---")
print("Model estimates show hazard ratio (exp(coef)) for each feature:")
print(cph.summary[['exp(coef)', 'p']].sort_values(by='p').head(5))
print("\nNotebook Complete: The analysis flow demonstrates the standard steps for TCGA multiomics integration, subtyping, and prognostic analysis.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment