tkuennen · December 3, 2025 05:41
diff --git a/TCGA-BRCA-Multiomics-Analysis-Notebook.py b/TCGA-BRCA-Multiomics-Analysis-Notebook.py
 pip install lifelines
 # #############################################################################
 # # TCGA-BRCA Multiomics Data Integration and Survival Analysis
 # #############################################################################
 #
 # This notebook demonstrates the integration of Transcriptomics (mRNA), Copy Number Variation (CNV),
 # and Clinical data for Breast Invasive Carcinoma (BRCA) from The Cancer Genome Atlas (TCGA).
 #
 # **ASSUMPTION:** You have downloaded the following files from a source like the
 # UCSC Xena Browser and placed them in the same directory as this notebook:
 # 1. mRNA Expression: 'TCGA-BRCA.log2.normalized.mRNA.tsv'
 # 2. Gene-level CNV: 'TCGA-BRCA.GISTIC.tsv'
 # 3. Clinical/Survival Data: 'TCGA-BRCA.clinical.tsv'

 # --- 1. SETUP: IMPORT LIBRARIES ---

 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 from scipy import stats
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 # NOTE: The 'lifelines' library is required for survival analysis.
 # If you get a ModuleNotFoundError, run: pip install lifelines
 from lifelines import KaplanMeierFitter, CoxPHFitter
 from lifelines.statistics import logrank_test

 # Configuration for plotting
 sns.set_theme(style="whitegrid", palette="Set2")
 plt.rcParams['font.family'] = 'sans-serif'
 plt.rcParams['font.size'] = 10
 print("Required libraries loaded successfully.")


 # --- 2. DATA LOADING (Simulating Real TCGA File Loading) ---

 def load_tcga_data(file_path, index_col=0):
    """Loads a TCGA TSV file and performs basic cleanup."""
    print(f"Loading data from: {file_path}...")
    try:
        df = pd.read_csv(file_path, sep='\t', index_col=index_col)
        # Transpose Omics data so samples are rows and features (genes) are columns
        if 'mRNA' in file_path or 'GISTIC' in file_path:
            df = df.T
            print(f"Transposed data to {df.shape[0]} samples x {df.shape[1]} features.")
        return df
    except FileNotFoundError:
        print(f"!!! ERROR: File not found at {file_path}. Generating mock data for demonstration.")
        # --- MOCK DATA FALLBACK (If files are not present) ---
        if 'mRNA' in file_path:
            return pd.DataFrame(np.random.rand(500, 1500), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'Gene_{i}' for i in range(1500)])
        elif 'GISTIC' in file_path:
            return pd.DataFrame(np.random.randint(-2, 3, size=(500, 1000)), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'CNV_{i}' for i in range(1000)])
        else: # Clinical data fallback
            return pd.DataFrame({
                'OS.time': np.random.randint(100, 3000, 500),
                'OS': np.random.randint(0, 2, 500),
                'ER_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
                'PR_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
                'HER2_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
            }, index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)])

 # Load the omics layers
 clinical_df = load_tcga_data('TCGA-BRCA.clinical.tsv', index_col='sample')
 rna_df = load_tcga_data('TCGA-BRCA.log2.normalized.mRNA.tsv')
 cnv_df = load_tcga_data('TCGA-BRCA.GISTIC.tsv')


 # --- 3. DATA HARMONIZATION & PREPROCESSING (TCGA ID Cleaning) ---

 # TCGA IDs are hierarchical. We need to match the patient (12 digits, e.g., TCGA-A1-A0EX)
 # or the sample (16 digits, e.g., TCGA-A1-A0EX-01).

 def clean_tcga_ids(index, level='sample'):
    """Cleans TCGA Sample IDs to a common length (e.g., 12-digit patient or 15-digit sample)."""
    if level == 'patient':
        # Use 12 digits (patient barcode)
        return [idx[:12] for idx in index]
    elif level == 'sample':
        # Use 15 digits (sample barcode)
        return [idx[:15] for idx in index]
    return index

 # 3.1 Clean and standardize Omics sample IDs (15 digits is standard for primary tumor -01)
 rna_df.index = clean_tcga_ids(rna_df.index)
 cnv_df.index = clean_tcga_ids(cnv_df.index)

 # 3.2 Clean Clinical IDs (often 12 digits, but we will align to the 15-digit omics IDs for max overlap)
 # For simplicity in this structure, we assume clinical data is indexed by the first 15 characters of the sample ID.
 clinical_df.index = clinical_tcga_ids = clean_tcga_ids(clinical_df.index)

 # 3.3 Match samples across all dataframes
 common_samples = list(set(clinical_df.index) & set(rna_df.index) & set(cnv_df.index))
 clinical_df_matched = clinical_df.loc[common_samples]
 rna_df_matched = rna_df.loc[common_samples]
 cnv_df_matched = cnv_df.loc[common_samples]

 print(f"\n--- Data Harmonization Summary ---")
 print(f"Total initial samples in Clinical: {clinical_df.shape[0]}")
 print(f"Total matched samples for full multiomics analysis: {len(common_samples)}")


 # --- 4. CLINICAL DATA STRATIFICATION (Creating Subtypes) ---

 # Based on the user's images, define molecular subtypes (Luminal, HER2-enriched, Triple-Negative)
 def define_subtype(row):
    # Use standard TCGA clinical column names (may need adjustment based on exact file)
    er = str(row.get('ER_Status_ihc', 'Unknown')).lower()
    pr = str(row.get('PR_Status_ihc', 'Unknown')).lower()
    her2 = str(row.get('HER2_Status_ihc', 'Unknown')).lower()

    if er == 'positive' and her2 == 'negative':
        return 'Luminal A/B'
    elif her2 == 'positive':
        return 'HER2-Enriched'
    elif er == 'negative' and pr == 'negative' and her2 == 'negative':
        return 'Triple-Negative'
    else:
        return 'Other/Unknown'

 clinical_df_matched['Subtype'] = clinical_df_matched.apply(define_subtype, axis=1)
 clinical_df_final = clinical_df_matched[clinical_df_matched['Subtype'] != 'Other/Unknown']
 rna_df_final = rna_df_matched.loc[clinical_df_final.index]
 cnv_df_final = cnv_df_matched.loc[clinical_df_final.index]

 print(f"Samples remaining after subtyping: {clinical_df_final.shape[0]}")
 print(clinical_df_final['Subtype'].value_counts())


 # --- 5. MULTI-OMICS INTEGRATION (Early Integration Strategy: PCA) ---

 # 5.1 Select highly variable genes (improves PCA clustering)
 # In real analysis, you'd filter by median absolute deviation (MAD) or Variance.
 # Mock filtering for demonstration: keep top 50% of genes by variance.
 rna_var = rna_df_final.var(axis=0).sort_values(ascending=False)
 top_rna_features = rna_var.head(int(rna_var.shape[0] * 0.5)).index
 rna_scaled = rna_df_final[top_rna_features]

 # 5.2 Standardize and combine features
 scaler = StandardScaler()
 X_rna_scaled = scaler.fit_transform(rna_scaled)
 X_cnv = cnv_df_final # CNV data is already categorical/discrete, usually no scaling needed.

 # Concatenate (Early Integration)
 # Use a smaller subset of CNV features just for visualization purposes if the number is too high
 cnv_subset = cnv_df_final.iloc[:, :500]
 X_combined = pd.DataFrame(
    np.hstack([X_rna_scaled, cnv_subset.values]),
    index=rna_scaled.index
 )

 # 5.3 Apply PCA for dimensionality reduction and visualization
 pca = PCA(n_components=2)
 X_pca = pca.fit_transform(X_combined)
 pca_df = pd.DataFrame(data=X_pca, index=X_combined.index, columns=['PC1', 'PC2'])
 pca_df['Subtype'] = clinical_df_final['Subtype']

 # Visualization: PCA plot
 plt.figure(figsize=(10, 8))
 sns.scatterplot(x='PC1', y='PC2', hue='Subtype', data=pca_df, style='Subtype', s=100, alpha=0.7)
 plt.title('PCA of Integrated TCGA-BRCA Multi-Omics Data')
 plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)')
 plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)')
 plt.legend(title='Molecular Subtype')
 plt.show() 


 # --- 6. SURVIVAL ANALYSIS (Kaplan-Meier) ---

 print("\n--- 6. Survival Analysis: Subtype Prognosis ---")
 # Prepare data for survival analysis (using OS.time and OS)
 # NOTE: TCGA time is often in days. We convert to years.
 T = clinical_df_final['OS.time'] / 365.25  # Time in years (TCGA-BRCA standard)
 E = clinical_df_final['OS']  # Event (1=Dead, 0=Alive)

 plt.figure(figsize=(10, 6))
 kmf = KaplanMeierFitter()

 # Plot Kaplan-Meier curves for each major subtype
 subtypes = ['Luminal A/B', 'HER2-Enriched', 'Triple-Negative']
 for subtype in subtypes:
    subset_mask = clinical_df_final['Subtype'] == subtype
    if subset_mask.any():
        kmf.fit(T[subset_mask], event_observed=E[subset_mask], label=subtype)
        kmf.plot_survival_function(ci_show=False, linewidth=2)

 plt.title('Overall Survival (OS) by TCGA-BRCA Molecular Subtype')
 plt.xlabel('Time (Years)')
 plt.ylabel('Survival Probability (Kaplan-Meier)')
 plt.legend(title='Subtype')
 plt.ylim(0, 1.05)
 plt.grid(True, alpha=0.5, linestyle='--')
 plt.show() 


 # --- 7. BIVARIATE/CLINICAL FEATURE ANALYSIS (Conceptual) ---

 # Example: Cox Proportional Hazards Model (Requires lifelines installation)
 # This step models how a specific gene expression level (e.g., ERBB2 for HER2+)
 # predicts survival, controlling for other factors.

 # Mock gene for Bivariate Cox Model
 example_gene = rna_df_final.columns[50] # Placeholder gene
 rna_df_final['ERBB2_like_Expr'] = rna_df_final[example_gene]

 # Create the final dataset for the Cox model
 cox_data = clinical_df_final[['OS.time', 'OS', 'Subtype']].copy()
 cox_data['T_Years'] = cox_data['OS.time'] / 365.25
 cox_data = cox_data.join(rna_df_final['ERBB2_like_Expr'])

 # Cox model only accepts numerical features; one-hot encode the 'Subtype'
 cox_data = pd.get_dummies(cox_data, columns=['Subtype'], drop_first=True)
 cox_data = cox_data.dropna() # Drop any rows with missing data

 if cox_data.shape[0] > 100:
    cph = CoxPHFitter()
    # Fit the model: Predict survival based on time/event, using gene expression and subtypes as covariates
    cph.fit(cox_data, duration_col='T_Years', event_col='OS', formula="ERBB2_like_Expr + Subtype_HER2-Enriched + Subtype_Triple-Negative")

    print("\n--- 7. Cox Proportional Hazards Model Summary ---")
    print("Model estimates show hazard ratio (exp(coef)) for each feature:")
    print(cph.summary[['exp(coef)', 'p']].sort_values(by='p').head(5))

 print("\nNotebook Complete: The analysis flow demonstrates the standard steps for TCGA multiomics integration, subtyping, and prognostic analysis.")
	pip install lifelines
	# #############################################################################
	# # TCGA-BRCA Multiomics Data Integration and Survival Analysis
	# #############################################################################
	#
	# This notebook demonstrates the integration of Transcriptomics (mRNA), Copy Number Variation (CNV),
	# and Clinical data for Breast Invasive Carcinoma (BRCA) from The Cancer Genome Atlas (TCGA).
	#
	# ASSUMPTION: You have downloaded the following files from a source like the
	# UCSC Xena Browser and placed them in the same directory as this notebook:
	# 1. mRNA Expression: 'TCGA-BRCA.log2.normalized.mRNA.tsv'
	# 2. Gene-level CNV: 'TCGA-BRCA.GISTIC.tsv'
	# 3. Clinical/Survival Data: 'TCGA-BRCA.clinical.tsv'

	# --- 1. SETUP: IMPORT LIBRARIES ---

	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from scipy import stats
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler
	# NOTE: The 'lifelines' library is required for survival analysis.
	# If you get a ModuleNotFoundError, run: pip install lifelines
	from lifelines import KaplanMeierFitter, CoxPHFitter
	from lifelines.statistics import logrank_test

	# Configuration for plotting
	sns.set_theme(style="whitegrid", palette="Set2")
	plt.rcParams['font.family'] = 'sans-serif'
	plt.rcParams['font.size'] = 10
	print("Required libraries loaded successfully.")


	# --- 2. DATA LOADING (Simulating Real TCGA File Loading) ---

	def load_tcga_data(file_path, index_col=0):
	"""Loads a TCGA TSV file and performs basic cleanup."""
	print(f"Loading data from: {file_path}...")
	try:
	df = pd.read_csv(file_path, sep='\t', index_col=index_col)
	# Transpose Omics data so samples are rows and features (genes) are columns
	if 'mRNA' in file_path or 'GISTIC' in file_path:
	df = df.T
	print(f"Transposed data to {df.shape[0]} samples x {df.shape[1]} features.")
	return df
	except FileNotFoundError:
	print(f"!!! ERROR: File not found at {file_path}. Generating mock data for demonstration.")
	# --- MOCK DATA FALLBACK (If files are not present) ---
	if 'mRNA' in file_path:
	return pd.DataFrame(np.random.rand(500, 1500), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'Gene_{i}' for i in range(1500)])
	elif 'GISTIC' in file_path:
	return pd.DataFrame(np.random.randint(-2, 3, size=(500, 1000)), index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)], columns=[f'CNV_{i}' for i in range(1000)])
	else: # Clinical data fallback
	return pd.DataFrame({
	'OS.time': np.random.randint(100, 3000, 500),
	'OS': np.random.randint(0, 2, 500),
	'ER_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
	'PR_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
	'HER2_Status_ihc': np.random.choice(['Positive', 'Negative'], 500),
	}, index=[f'TCGA-A1-A0EX-01{i}' for i in range(500)])

	# Load the omics layers
	clinical_df = load_tcga_data('TCGA-BRCA.clinical.tsv', index_col='sample')
	rna_df = load_tcga_data('TCGA-BRCA.log2.normalized.mRNA.tsv')
	cnv_df = load_tcga_data('TCGA-BRCA.GISTIC.tsv')


	# --- 3. DATA HARMONIZATION & PREPROCESSING (TCGA ID Cleaning) ---

	# TCGA IDs are hierarchical. We need to match the patient (12 digits, e.g., TCGA-A1-A0EX)
	# or the sample (16 digits, e.g., TCGA-A1-A0EX-01).

	def clean_tcga_ids(index, level='sample'):
	"""Cleans TCGA Sample IDs to a common length (e.g., 12-digit patient or 15-digit sample)."""
	if level == 'patient':
	# Use 12 digits (patient barcode)
	return [idx[:12] for idx in index]
	elif level == 'sample':
	# Use 15 digits (sample barcode)
	return [idx[:15] for idx in index]
	return index

	# 3.1 Clean and standardize Omics sample IDs (15 digits is standard for primary tumor -01)
	rna_df.index = clean_tcga_ids(rna_df.index)
	cnv_df.index = clean_tcga_ids(cnv_df.index)

	# 3.2 Clean Clinical IDs (often 12 digits, but we will align to the 15-digit omics IDs for max overlap)
	# For simplicity in this structure, we assume clinical data is indexed by the first 15 characters of the sample ID.
	clinical_df.index = clinical_tcga_ids = clean_tcga_ids(clinical_df.index)

	# 3.3 Match samples across all dataframes
	common_samples = list(set(clinical_df.index) & set(rna_df.index) & set(cnv_df.index))
	clinical_df_matched = clinical_df.loc[common_samples]
	rna_df_matched = rna_df.loc[common_samples]
	cnv_df_matched = cnv_df.loc[common_samples]

	print(f"\n--- Data Harmonization Summary ---")
	print(f"Total initial samples in Clinical: {clinical_df.shape[0]}")
	print(f"Total matched samples for full multiomics analysis: {len(common_samples)}")


	# --- 4. CLINICAL DATA STRATIFICATION (Creating Subtypes) ---

	# Based on the user's images, define molecular subtypes (Luminal, HER2-enriched, Triple-Negative)
	def define_subtype(row):
	# Use standard TCGA clinical column names (may need adjustment based on exact file)
	er = str(row.get('ER_Status_ihc', 'Unknown')).lower()
	pr = str(row.get('PR_Status_ihc', 'Unknown')).lower()
	her2 = str(row.get('HER2_Status_ihc', 'Unknown')).lower()

	if er == 'positive' and her2 == 'negative':
	return 'Luminal A/B'
	elif her2 == 'positive':
	return 'HER2-Enriched'
	elif er == 'negative' and pr == 'negative' and her2 == 'negative':
	return 'Triple-Negative'
	else:
	return 'Other/Unknown'

	clinical_df_matched['Subtype'] = clinical_df_matched.apply(define_subtype, axis=1)
	clinical_df_final = clinical_df_matched[clinical_df_matched['Subtype'] != 'Other/Unknown']
	rna_df_final = rna_df_matched.loc[clinical_df_final.index]
	cnv_df_final = cnv_df_matched.loc[clinical_df_final.index]

	print(f"Samples remaining after subtyping: {clinical_df_final.shape[0]}")
	print(clinical_df_final['Subtype'].value_counts())


	# --- 5. MULTI-OMICS INTEGRATION (Early Integration Strategy: PCA) ---

	# 5.1 Select highly variable genes (improves PCA clustering)
	# In real analysis, you'd filter by median absolute deviation (MAD) or Variance.
	# Mock filtering for demonstration: keep top 50% of genes by variance.
	rna_var = rna_df_final.var(axis=0).sort_values(ascending=False)
	top_rna_features = rna_var.head(int(rna_var.shape[0] * 0.5)).index
	rna_scaled = rna_df_final[top_rna_features]

	# 5.2 Standardize and combine features
	scaler = StandardScaler()
	X_rna_scaled = scaler.fit_transform(rna_scaled)
	X_cnv = cnv_df_final # CNV data is already categorical/discrete, usually no scaling needed.

	# Concatenate (Early Integration)
	# Use a smaller subset of CNV features just for visualization purposes if the number is too high
	cnv_subset = cnv_df_final.iloc[:, :500]
	X_combined = pd.DataFrame(
	np.hstack([X_rna_scaled, cnv_subset.values]),
	index=rna_scaled.index
	)

	# 5.3 Apply PCA for dimensionality reduction and visualization
	pca = PCA(n_components=2)
	X_pca = pca.fit_transform(X_combined)
	pca_df = pd.DataFrame(data=X_pca, index=X_combined.index, columns=['PC1', 'PC2'])
	pca_df['Subtype'] = clinical_df_final['Subtype']

	# Visualization: PCA plot
	plt.figure(figsize=(10, 8))
	sns.scatterplot(x='PC1', y='PC2', hue='Subtype', data=pca_df, style='Subtype', s=100, alpha=0.7)
	plt.title('PCA of Integrated TCGA-BRCA Multi-Omics Data')
	plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)')
	plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)')
	plt.legend(title='Molecular Subtype')
	plt.show()


	# --- 6. SURVIVAL ANALYSIS (Kaplan-Meier) ---

	print("\n--- 6. Survival Analysis: Subtype Prognosis ---")
	# Prepare data for survival analysis (using OS.time and OS)
	# NOTE: TCGA time is often in days. We convert to years.
	T = clinical_df_final['OS.time'] / 365.25 # Time in years (TCGA-BRCA standard)
	E = clinical_df_final['OS'] # Event (1=Dead, 0=Alive)

	plt.figure(figsize=(10, 6))
	kmf = KaplanMeierFitter()

	# Plot Kaplan-Meier curves for each major subtype
	subtypes = ['Luminal A/B', 'HER2-Enriched', 'Triple-Negative']
	for subtype in subtypes:
	subset_mask = clinical_df_final['Subtype'] == subtype
	if subset_mask.any():
	kmf.fit(T[subset_mask], event_observed=E[subset_mask], label=subtype)
	kmf.plot_survival_function(ci_show=False, linewidth=2)

	plt.title('Overall Survival (OS) by TCGA-BRCA Molecular Subtype')
	plt.xlabel('Time (Years)')
	plt.ylabel('Survival Probability (Kaplan-Meier)')
	plt.legend(title='Subtype')
	plt.ylim(0, 1.05)
	plt.grid(True, alpha=0.5, linestyle='--')
	plt.show()


	# --- 7. BIVARIATE/CLINICAL FEATURE ANALYSIS (Conceptual) ---

	# Example: Cox Proportional Hazards Model (Requires lifelines installation)
	# This step models how a specific gene expression level (e.g., ERBB2 for HER2+)
	# predicts survival, controlling for other factors.

	# Mock gene for Bivariate Cox Model
	example_gene = rna_df_final.columns[50] # Placeholder gene
	rna_df_final['ERBB2_like_Expr'] = rna_df_final[example_gene]

	# Create the final dataset for the Cox model
	cox_data = clinical_df_final[['OS.time', 'OS', 'Subtype']].copy()
	cox_data['T_Years'] = cox_data['OS.time'] / 365.25
	cox_data = cox_data.join(rna_df_final['ERBB2_like_Expr'])

	# Cox model only accepts numerical features; one-hot encode the 'Subtype'
	cox_data = pd.get_dummies(cox_data, columns=['Subtype'], drop_first=True)
	cox_data = cox_data.dropna() # Drop any rows with missing data

	if cox_data.shape[0] > 100:
	cph = CoxPHFitter()
	# Fit the model: Predict survival based on time/event, using gene expression and subtypes as covariates
	cph.fit(cox_data, duration_col='T_Years', event_col='OS', formula="ERBB2_like_Expr + Subtype_HER2-Enriched + Subtype_Triple-Negative")

	print("\n--- 7. Cox Proportional Hazards Model Summary ---")
	print("Model estimates show hazard ratio (exp(coef)) for each feature:")
	print(cph.summary[['exp(coef)', 'p']].sort_values(by='p').head(5))

	print("\nNotebook Complete: The analysis flow demonstrates the standard steps for TCGA multiomics integration, subtyping, and prognostic analysis.")
No results found