natesheehan · March 1, 2024 11:05
diff --git a/themes.py b/themes.py
 import pandas as pd
 import matplotlib.pyplot as plt

 # Build a grouped bar chart showing, for each interviewee role, how often
 # each theme group was coded, normalized by the number of distinct
 # interviews recorded for that role. The chart is written to
 # 'weighted_grouped_theme_plot.png'.

 # Load the CSV file
 file_path = 'Interviews-Aggregated - Sheet1 (2).csv'  # Make sure to update this path
 data = pd.read_csv(file_path)

 # Clean the `Theme` column
 data['Theme'] = data['Theme'].str.strip()  # Trim whitespace
 # Remove rows where `Theme` is missing or blank. `NaN != ''` evaluates to
 # True, so missing values must be excluded explicitly. `.copy()` detaches the
 # filtered frame so the column assignments below do not raise
 # SettingWithCopyWarning on pandas versions without copy-on-write.
 data = data[data['Theme'].notna() & (data['Theme'] != '')].copy()

 # Extract roles from `Interview_ID`: the role is the text before the first
 # underscore. Rows without an ID are grouped under 'Unknown'.
 interview_ids = data['Interview_ID']
 data['Role'] = (
     interview_ids.astype(str).str.split('_').str[0]
     .where(interview_ids.notna(), 'Unknown')
 )

 # Map themes to groups
 theme_grouping = {
     'Barriers in Scientific Collaboration': 'Open Science Practices and Scientific Collaboration',
     'Understandings of Open Science': 'Open Science Practices and Scientific Collaboration',
     'Diversity and Inclusion in Research': 'Open Science Practices and Scientific Collaboration',
     'Data Management and Curation': 'Data Management, Curation, and Governance',
     'Governance and Policy in Data Centric Practices': 'Data Management, Curation, and Governance',
     'Socio-Political Data Infrastructures': 'Socio-Political Dynamics of Open Infrastructures',
     'Funding, Resource Allocation, and Economic Factors': 'Socio-Political Dynamics of Open Infrastructures',
     'Research Environment Dynamics': 'Socio-Political Dynamics of Open Infrastructures',
 }
 data['Theme Group'] = data['Theme'].map(theme_grouping)

 # Remove rows where `Theme Group` is NaN because they don't match any group
 data = data.dropna(subset=['Theme Group'])

 # Calculate the number of unique Interview_IDs per role.
 # NOTE(review): this is computed after unmapped themes were dropped, so an
 # interview whose rows all carried unmapped themes is not counted in the
 # denominator -- confirm that this is the intended normalization.
 interview_counts = data.groupby('Role')['Interview_ID'].nunique()

 # Get a summary of the number of themes per role
 theme_counts = data.groupby(['Role', 'Theme Group']).size().reset_index(name='Counts')

 # Apply weighting: divide the theme count by the number of unique
 # Interview_IDs for normalization. `nunique()` ignores missing IDs, so the
 # 'Unknown' role has a count of 0; masking non-positive counts makes the
 # division produce NaN (later filled with 0) instead of inf, which would
 # otherwise corrupt the plot.
 interviews_per_role = theme_counts['Role'].map(interview_counts.where(interview_counts > 0))
 theme_counts['Weighted Counts'] = theme_counts['Counts'] / interviews_per_role

 # Pivot for plotting: one row per role, one column per theme group
 theme_counts_pivot = theme_counts.pivot(index='Role', columns='Theme Group', values='Weighted Counts').fillna(0)

 # Plotting the weighted theme groups
 theme_counts_pivot.plot(kind='bar', figsize=(14, 8), width=0.8)
 plt.title('Weighted Breakdown of Theme Groups Across Different Roles')
 plt.ylabel('Weighted Number of Themes per Interviewee')
 plt.xlabel('Role')
 plt.xticks(rotation=45)
 # Anchor the legend outside the axes so it does not cover the bars
 plt.legend(title='Theme Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
 plt.tight_layout()

 # Save plot to file; bbox_inches='tight' keeps the outside legend in the image
 plt.savefig('weighted_grouped_theme_plot.png', bbox_inches='tight')
 plt.clf()
	import pandas as pd
	import matplotlib.pyplot as plt

	# Build a grouped bar chart showing, for each interviewee role, how often
	# each theme group was coded, normalized by the number of distinct
	# interviews recorded for that role. The chart is written to
	# 'weighted_grouped_theme_plot.png'.

	# Load the CSV file
	file_path = 'Interviews-Aggregated - Sheet1 (2).csv' # Make sure to update this path
	data = pd.read_csv(file_path)

	# Clean the `Theme` column
	data['Theme'] = data['Theme'].str.strip() # Trim whitespace
	# Remove rows where `Theme` is missing or blank. `NaN != ''` evaluates to
	# True, so missing values must be excluded explicitly. `.copy()` detaches the
	# filtered frame so the column assignments below do not raise
	# SettingWithCopyWarning on pandas versions without copy-on-write.
	data = data[data['Theme'].notna() & (data['Theme'] != '')].copy()

	# Extract roles from `Interview_ID`: the role is the text before the first
	# underscore. Rows without an ID are grouped under 'Unknown'.
	interview_ids = data['Interview_ID']
	data['Role'] = interview_ids.astype(str).str.split('_').str[0].where(interview_ids.notna(), 'Unknown')

	# Map themes to groups
	theme_grouping = {
	'Barriers in Scientific Collaboration': 'Open Science Practices and Scientific Collaboration',
	'Understandings of Open Science': 'Open Science Practices and Scientific Collaboration',
	'Diversity and Inclusion in Research': 'Open Science Practices and Scientific Collaboration',
	'Data Management and Curation': 'Data Management, Curation, and Governance',
	'Governance and Policy in Data Centric Practices': 'Data Management, Curation, and Governance',
	'Socio-Political Data Infrastructures': 'Socio-Political Dynamics of Open Infrastructures',
	'Funding, Resource Allocation, and Economic Factors': 'Socio-Political Dynamics of Open Infrastructures',
	'Research Environment Dynamics': 'Socio-Political Dynamics of Open Infrastructures'
	}
	data['Theme Group'] = data['Theme'].map(theme_grouping)

	# Remove rows where `Theme Group` is NaN because they don't match any group
	data = data.dropna(subset=['Theme Group'])

	# Calculate the number of unique Interview_IDs per role.
	# NOTE(review): this is computed after unmapped themes were dropped, so an
	# interview whose rows all carried unmapped themes is not counted in the
	# denominator -- confirm that this is the intended normalization.
	interview_counts = data.groupby('Role')['Interview_ID'].nunique()

	# Get a summary of the number of themes per role
	theme_counts = data.groupby(['Role', 'Theme Group']).size().reset_index(name='Counts')

	# Apply weighting: divide the theme count by the number of unique
	# Interview_IDs for normalization. `nunique()` ignores missing IDs, so the
	# 'Unknown' role has a count of 0; masking non-positive counts makes the
	# division produce NaN (later filled with 0) instead of inf, which would
	# otherwise corrupt the plot.
	interviews_per_role = theme_counts['Role'].map(interview_counts.where(interview_counts > 0))
	theme_counts['Weighted Counts'] = theme_counts['Counts'] / interviews_per_role

	# Pivot for plotting: one row per role, one column per theme group
	theme_counts_pivot = theme_counts.pivot(index='Role', columns='Theme Group', values='Weighted Counts').fillna(0)

	# Plotting the weighted theme groups
	theme_counts_pivot.plot(kind='bar', figsize=(14, 8), width=0.8)
	plt.title('Weighted Breakdown of Theme Groups Across Different Roles')
	plt.ylabel('Weighted Number of Themes per Interviewee')
	plt.xlabel('Role')
	plt.xticks(rotation=45)
	# Anchor the legend outside the axes so it does not cover the bars
	plt.legend(title='Theme Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
	plt.tight_layout()

	# Save plot to file; bbox_inches='tight' keeps the outside legend in the image
	plt.savefig('weighted_grouped_theme_plot.png', bbox_inches='tight')
	plt.clf()