Created
December 3, 2025 05:41
-
-
Save tkuennen/9341d3c9ca98c6f39c761043124b844e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| pip install lifelines | |
| # ############################################################################# | |
| # # TCGA-BRCA Multiomics Data Integration and Survival Analysis | |
| # ############################################################################# | |
| # | |
| # This notebook demonstrates the integration of Transcriptomics (mRNA), Copy Number Variation (CNV), | |
| # and Clinical data for Breast Invasive Carcinoma (BRCA) from The Cancer Genome Atlas (TCGA). | |
| # | |
| # **ASSUMPTION:** You have downloaded the following files from a source like the | |
| # UCSC Xena Browser and placed them in the same directory as this notebook: | |
| # 1. mRNA Expression: 'TCGA-BRCA.log2.normalized.mRNA.tsv' | |
| # 2. Gene-level CNV: 'TCGA-BRCA.GISTIC.tsv' | |
| # 3. Clinical/Survival Data: 'TCGA-BRCA.clinical.tsv' | |
| # --- 1. SETUP: IMPORT LIBRARIES --- | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from scipy import stats | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
| # NOTE: The 'lifelines' library is required for survival analysis. | |
| # If you get a ModuleNotFoundError, run: pip install lifelines | |
| from lifelines import KaplanMeierFitter, CoxPHFitter | |
| from lifelines.statistics import logrank_test | |
| # Configuration for plotting | |
| sns.set_theme(style="whitegrid", palette="Set2") | |
| plt.rcParams['font.family'] = 'sans-serif' | |
| plt.rcParams['font.size'] = 10 | |
| print("Required libraries loaded successfully.") | |
| # --- 2. DATA LOADING (Simulating Real TCGA File Loading) --- | |
| def load_tcga_data(file_path, index_col=0): | |
| """Loads a TCGA TSV file and performs basic cleanup.""" | |
| print(f"Loading data from: {file_path}...") | |
| try: | |
| df = pd.read_csv(file_path, sep='\t', index_col=index_col) | |
| # Transpose Omics data so samples are rows and features (genes) are columns | |
| if 'mRNA' in file_path or 'GISTIC' in file_path: | |
| df = df.T | |
| print(f"Transposed data to {df.shape[0]} samples x {df.shape[1]} features.") | |
| return df | |
| except FileNotFoundError: | |
| print(f"!!! ERROR: File not found at {file_path}. Generating mock data for demonstration.") | |
| # --- MOCK DATA FALLBACK (If files are not present) --- | |
| if 'mRNA' in file_path: | |
| return pd.DataFrame(np.random.rand(500, 1500), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'Gene_{i}' for i in range(1500)]) | |
| elif 'GISTIC' in file_path: | |
| return pd.DataFrame(np.random.randint(-2, 3, size=(500, 1000)), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'CNV_{i}' for i in range(1000)]) | |
| else: # Clinical data fallback | |
| return pd.DataFrame({ | |
| 'OS.time': np.random.randint(100, 3000, 500), | |
| 'OS': np.random.randint(0, 2, 500), | |
| 'ER_Status_ihc': np.random.choice(['Positive', 'Negative'], 500), | |
| 'PR_Status_ihc': np.random.choice(['Positive', 'Negative'], 500), | |
| 'HER2_Status_ihc': np.random.choice(['Positive', 'Negative'], 500), | |
| }, index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)]) | |
| # Load the omics layers | |
| clinical_df = load_tcga_data('TCGA-BRCA.clinical.tsv', index_col='sample') | |
| rna_df = load_tcga_data('TCGA-BRCA.log2.normalized.mRNA.tsv') | |
| cnv_df = load_tcga_data('TCGA-BRCA.GISTIC.tsv') | |
| # --- 3. DATA HARMONIZATION & PREPROCESSING (TCGA ID Cleaning) --- | |
| # TCGA IDs are hierarchical. We need to match the patient (12 digits, e.g., TCGA-A1-A0EX) | |
| # or the sample (16 digits, e.g., TCGA-A1-A0EX-01). | |
| def clean_tcga_ids(index, level='sample'): | |
| """Cleans TCGA Sample IDs to a common length (e.g., 12-digit patient or 15-digit sample).""" | |
| if level == 'patient': | |
| # Use 12 digits (patient barcode) | |
| return [idx[:12] for idx in index] | |
| elif level == 'sample': | |
| # Use 15 digits (sample barcode) | |
| return [idx[:15] for idx in index] | |
| return index | |
| # 3.1 Clean and standardize Omics sample IDs (15 digits is standard for primary tumor -01) | |
| rna_df.index = clean_tcga_ids(rna_df.index) | |
| cnv_df.index = clean_tcga_ids(cnv_df.index) | |
| # 3.2 Clean Clinical IDs (often 12 digits, but we will align to the 15-digit omics IDs for max overlap) | |
| # For simplicity in this structure, we assume clinical data is indexed by the first 15 characters of the sample ID. | |
| clinical_df.index = clinical_tcga_ids = clean_tcga_ids(clinical_df.index) | |
| # 3.3 Match samples across all dataframes | |
| common_samples = list(set(clinical_df.index) & set(rna_df.index) & set(cnv_df.index)) | |
| clinical_df_matched = clinical_df.loc[common_samples] | |
| rna_df_matched = rna_df.loc[common_samples] | |
| cnv_df_matched = cnv_df.loc[common_samples] | |
| print(f"\n--- Data Harmonization Summary ---") | |
| print(f"Total initial samples in Clinical: {clinical_df.shape[0]}") | |
| print(f"Total matched samples for full multiomics analysis: {len(common_samples)}") | |
| # --- 4. CLINICAL DATA STRATIFICATION (Creating Subtypes) --- | |
| # Based on the user's images, define molecular subtypes (Luminal, HER2-enriched, Triple-Negative) | |
| def define_subtype(row): | |
| # Use standard TCGA clinical column names (may need adjustment based on exact file) | |
| er = str(row.get('ER_Status_ihc', 'Unknown')).lower() | |
| pr = str(row.get('PR_Status_ihc', 'Unknown')).lower() | |
| her2 = str(row.get('HER2_Status_ihc', 'Unknown')).lower() | |
| if er == 'positive' and her2 == 'negative': | |
| return 'Luminal A/B' | |
| elif her2 == 'positive': | |
| return 'HER2-Enriched' | |
| elif er == 'negative' and pr == 'negative' and her2 == 'negative': | |
| return 'Triple-Negative' | |
| else: | |
| return 'Other/Unknown' | |
| clinical_df_matched['Subtype'] = clinical_df_matched.apply(define_subtype, axis=1) | |
| clinical_df_final = clinical_df_matched[clinical_df_matched['Subtype'] != 'Other/Unknown'] | |
| rna_df_final = rna_df_matched.loc[clinical_df_final.index] | |
| cnv_df_final = cnv_df_matched.loc[clinical_df_final.index] | |
| print(f"Samples remaining after subtyping: {clinical_df_final.shape[0]}") | |
| print(clinical_df_final['Subtype'].value_counts()) | |
| # --- 5. MULTI-OMICS INTEGRATION (Early Integration Strategy: PCA) --- | |
| # 5.1 Select highly variable genes (improves PCA clustering) | |
| # In real analysis, you'd filter by median absolute deviation (MAD) or Variance. | |
| # Mock filtering for demonstration: keep top 50% of genes by variance. | |
| rna_var = rna_df_final.var(axis=0).sort_values(ascending=False) | |
| top_rna_features = rna_var.head(int(rna_var.shape[0] * 0.5)).index | |
| rna_scaled = rna_df_final[top_rna_features] | |
| # 5.2 Standardize and combine features | |
| scaler = StandardScaler() | |
| X_rna_scaled = scaler.fit_transform(rna_scaled) | |
| X_cnv = cnv_df_final # CNV data is already categorical/discrete, usually no scaling needed. | |
| # Concatenate (Early Integration) | |
| # Use a smaller subset of CNV features just for visualization purposes if the number is too high | |
| cnv_subset = cnv_df_final.iloc[:, :500] | |
| X_combined = pd.DataFrame( | |
| np.hstack([X_rna_scaled, cnv_subset.values]), | |
| index=rna_scaled.index | |
| ) | |
| # 5.3 Apply PCA for dimensionality reduction and visualization | |
| pca = PCA(n_components=2) | |
| X_pca = pca.fit_transform(X_combined) | |
| pca_df = pd.DataFrame(data=X_pca, index=X_combined.index, columns=['PC1', 'PC2']) | |
| pca_df['Subtype'] = clinical_df_final['Subtype'] | |
| # Visualization: PCA plot | |
| plt.figure(figsize=(10, 8)) | |
| sns.scatterplot(x='PC1', y='PC2', hue='Subtype', data=pca_df, style='Subtype', s=100, alpha=0.7) | |
| plt.title('PCA of Integrated TCGA-BRCA Multi-Omics Data') | |
| plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)') | |
| plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)') | |
| plt.legend(title='Molecular Subtype') | |
| plt.show() | |
| # --- 6. SURVIVAL ANALYSIS (Kaplan-Meier) --- | |
| print("\n--- 6. Survival Analysis: Subtype Prognosis ---") | |
| # Prepare data for survival analysis (using OS.time and OS) | |
| # NOTE: TCGA time is often in days. We convert to years. | |
| T = clinical_df_final['OS.time'] / 365.25 # Time in years (TCGA-BRCA standard) | |
| E = clinical_df_final['OS'] # Event (1=Dead, 0=Alive) | |
| plt.figure(figsize=(10, 6)) | |
| kmf = KaplanMeierFitter() | |
| # Plot Kaplan-Meier curves for each major subtype | |
| subtypes = ['Luminal A/B', 'HER2-Enriched', 'Triple-Negative'] | |
| for subtype in subtypes: | |
| subset_mask = clinical_df_final['Subtype'] == subtype | |
| if subset_mask.any(): | |
| kmf.fit(T[subset_mask], event_observed=E[subset_mask], label=subtype) | |
| kmf.plot_survival_function(ci_show=False, linewidth=2) | |
| plt.title('Overall Survival (OS) by TCGA-BRCA Molecular Subtype') | |
| plt.xlabel('Time (Years)') | |
| plt.ylabel('Survival Probability (Kaplan-Meier)') | |
| plt.legend(title='Subtype') | |
| plt.ylim(0, 1.05) | |
| plt.grid(True, alpha=0.5, linestyle='--') | |
| plt.show() | |
| # --- 7. BIVARIATE/CLINICAL FEATURE ANALYSIS (Conceptual) --- | |
| # Example: Cox Proportional Hazards Model (Requires lifelines installation) | |
| # This step models how a specific gene expression level (e.g., ERBB2 for HER2+) | |
| # predicts survival, controlling for other factors. | |
| # Mock gene for Bivariate Cox Model | |
| example_gene = rna_df_final.columns[50] # Placeholder gene | |
| rna_df_final['ERBB2_like_Expr'] = rna_df_final[example_gene] | |
| # Create the final dataset for the Cox model | |
| cox_data = clinical_df_final[['OS.time', 'OS', 'Subtype']].copy() | |
| cox_data['T_Years'] = cox_data['OS.time'] / 365.25 | |
| cox_data = cox_data.join(rna_df_final['ERBB2_like_Expr']) | |
| # Cox model only accepts numerical features; one-hot encode the 'Subtype' | |
| cox_data = pd.get_dummies(cox_data, columns=['Subtype'], drop_first=True) | |
| cox_data = cox_data.dropna() # Drop any rows with missing data | |
| if cox_data.shape[0] > 100: | |
| cph = CoxPHFitter() | |
| # Fit the model: Predict survival based on time/event, using gene expression and subtypes as covariates | |
| cph.fit(cox_data, duration_col='T_Years', event_col='OS', formula="ERBB2_like_Expr + Subtype_HER2-Enriched + Subtype_Triple-Negative") | |
| print("\n--- 7. Cox Proportional Hazards Model Summary ---") | |
| print("Model estimates show hazard ratio (exp(coef)) for each feature:") | |
| print(cph.summary[['exp(coef)', 'p']].sort_values(by='p').head(5)) | |
| print("\nNotebook Complete: The analysis flow demonstrates the standard steps for TCGA multiomics integration, subtyping, and prognostic analysis.") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment