Created
March 1, 2024 11:05
-
-
Save natesheehan/f59bdd0c072326ad563681ead64324d5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import matplotlib.pyplot as plt

# Plot, for every role, how often each theme group occurs in the interview
# CSV, normalised by the number of distinct Interview_IDs seen for that role.

# Load the CSV file
file_path = 'Interviews-Aggregated - Sheet1 (2).csv'  # Make sure to update this path
data = pd.read_csv(file_path)

# Clean the `Theme` column: trim whitespace, then remove rows where it is blank.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
data['Theme'] = data['Theme'].str.strip()
data = data[data['Theme'] != ''].copy()

# Extract roles from `Interview_ID`: the text before the first underscore,
# or 'Unknown' when the ID is missing.
data['Role'] = [
    str(interview_id).split('_')[0] if pd.notnull(interview_id) else 'Unknown'
    for interview_id in data['Interview_ID']
]

# Map themes to groups
theme_grouping = {
    'Barriers in Scientific Collaboration': 'Open Science Practices and Scientific Collaboration',
    'Understandings of Open Science': 'Open Science Practices and Scientific Collaboration',
    'Diversity and Inclusion in Research': 'Open Science Practices and Scientific Collaboration',
    'Data Management and Curation': 'Data Management, Curation, and Governance',
    'Governance and Policy in Data Centric Practices': 'Data Management, Curation, and Governance',
    'Socio-Political Data Infrastructures': 'Socio-Political Dynamics of Open Infrastructures',
    'Funding, Resource Allocation, and Economic Factors': 'Socio-Political Dynamics of Open Infrastructures',
    'Research Environment Dynamics': 'Socio-Political Dynamics of Open Infrastructures',
}
data['Theme Group'] = data['Theme'].map(theme_grouping)

# Remove rows where `Theme Group` is NaN because they don't match any group
data = data.dropna(subset=['Theme Group'])

# Fail early with a clear message instead of trying to plot an empty table.
if data.empty:
    raise SystemExit(
        f"No rows in {file_path!r} have a Theme listed in theme_grouping; "
        "nothing to plot."
    )

# Calculate the number of unique Interview_IDs per role.
# `nunique()` ignores missing IDs, so a role made up only of ID-less rows
# ('Unknown') would count 0 interviews and cause a division by zero below.
# Clipping to 1 treats those rows as a single unidentified interview and
# leaves every role that has at least one real ID unchanged.
interview_counts = data.groupby('Role')['Interview_ID'].nunique().clip(lower=1)

# Get a summary of the number of themes per role
theme_counts = data.groupby(['Role', 'Theme Group']).size().reset_index(name='Counts')

# Apply weighting: divide the theme count by the number of unique
# Interview_IDs of the row's role (looked up by role name) for normalization.
theme_counts['Weighted Counts'] = (
    theme_counts['Counts'] / theme_counts['Role'].map(interview_counts)
)

# Pivot for plotting: one row per role, one column per theme group.
# Role/group combinations that never occur are shown as 0.
theme_counts_pivot = theme_counts.pivot(
    index='Role', columns='Theme Group', values='Weighted Counts'
).fillna(0)

# Plotting the weighted theme groups (x tick labels rotated 45 degrees)
ax = theme_counts_pivot.plot(kind='bar', figsize=(14, 8), width=0.8, rot=45)
ax.set_title('Weighted Breakdown of Theme Groups Across Different Roles')
ax.set_ylabel('Weighted Number of Themes per Interviewee')
ax.set_xlabel('Role')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Theme Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
fig = ax.get_figure()
fig.tight_layout()

# Save plot to file; bbox_inches='tight' keeps the outside legend in the image.
fig.savefig('weighted_grouped_theme_plot.png', bbox_inches='tight')
# Close (not just clear) the figure so pyplot releases it.
plt.close(fig)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment