Created
July 11, 2023 13:33
-
-
Save PaulKinlan/7f25055ac899e1667a6f80119d7c3b05 to your computer and use it in GitHub Desktop.
Parse BCD data to get browser release data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from datetime import datetime | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Function to convert a string to a datetime object
def convert_string_to_date(date_string: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a naive ``datetime``.

    Args:
        date_string: Calendar date written as year-month-day, e.g.
            ``"2023-07-11"``.

    Returns:
        A ``datetime`` at midnight of the given day (no timezone attached).

    Raises:
        ValueError: If ``date_string`` does not match ``%Y-%m-%d``.
    """
    return datetime.strptime(date_string, "%Y-%m-%d")
# Function to convert version numbers to integers where possible
def convert_version_to_int(version):
    """Reduce a version identifier to its integer major component if possible.

    Strings are split on the first ``.`` and the leading component is parsed
    as an integer (``"15.4"`` -> ``15``, ``"114"`` -> ``114``). Strings whose
    leading component is not an integer (``"preview"``, ``""``) are returned
    unchanged.

    Non-string input is passed through ``int()``; if that is not possible
    (for example ``None``) the value is returned unchanged instead of raising.

    Args:
        version: Version identifier, normally a string.

    Returns:
        The integer major version, or ``version`` itself when no integer can
        be derived from it.
    """
    if not isinstance(version, str):
        try:
            return int(version)
        except (TypeError, ValueError):
            return version

    # A string containing '.' can never parse as an int directly, so parsing
    # only the leading component covers both the plain and the dotted case.
    major_component = version.split('.', 1)[0]
    try:
        return int(major_component)
    except ValueError:
        return version
# Load JSON file and process the data.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('path_to_your_data_file.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract 'browsers' data
browsers_data = data['browsers']

# Maps each browser key to a {version: release datetime} dictionary.
release_dates = {}

# Loop over each browser
for browser, details in browsers_data.items():
    # 'releases' holds the per-version information; treat a missing or null
    # entry as "no releases" rather than failing on .items().
    releases = details.get('releases') or {}

    for version, version_details in releases.items():
        release_date = version_details.get('release_date')
        # Versions without a release date cannot contribute to the timeline.
        if not release_date:
            continue

        # Convert the version to an integer where possible.
        # NOTE(review): distinct version strings such as '3' and '3.1' map to
        # the same integer key, so a later entry overwrites the date stored by
        # an earlier one -- confirm that this collapsing is intended.
        version = convert_version_to_int(version)
        # Convert the release date to a datetime object
        release_date = convert_string_to_date(release_date)

        # The per-browser dictionary is created on first use.
        release_dates.setdefault(browser, {})[version] = release_date
# Convert the {browser: {version: release date}} dictionary to a long-format
# pandas DataFrame with one row per (browser, version).
if not release_dates:
    # pd.concat would raise a generic "No objects to concatenate" ValueError;
    # fail with a message that points at the actual cause instead.
    raise ValueError('No release dates were found in the input data')
df = pd.concat({k: pd.Series(v) for k, v in release_dates.items()}).reset_index()
df.columns = ['Browser', 'Version', 'Release Date']

# Order releases chronologically within each browser so that diff() below
# measures the gap to the previous release regardless of the order in which
# the versions were inserted into the dictionary.
df = df.sort_values(['Browser', 'Release Date']).reset_index(drop=True)

# Calculate the time difference (in days) between consecutive releases of
# each browser. The first release of every browser has no predecessor and
# therefore gets NaN.
df['Time Difference'] = df.groupby('Browser')['Release Date'].diff().dt.days

# Keep only rows whose time difference lies within 3 standard deviations of
# the mean (both computed over all browsers together). NaN rows compare as
# False and are dropped here as well. The explicit copy makes the result an
# independent frame, so adding a column below does not trigger pandas'
# SettingWithCopyWarning.
gap = df['Time Difference']
within_three_sigma = (gap - gap.mean()).abs() <= (3 * gap.std())
df_no_outliers = df[within_three_sigma].copy()

# Extract the year from the release date
df_no_outliers['Year'] = df_no_outliers['Release Date'].dt.year

# Filter out rows where the year is before 2008
df_no_outliers = df_no_outliers[df_no_outliers['Year'] >= 2008]

# Calculate the average time difference for each browser for each year
average_time_difference_yearly = (
    df_no_outliers.groupby(['Browser', 'Year'])['Time Difference'].mean().reset_index()
)

# Pivot the data for plotting: one row per year, one column per browser
pivot_df = average_time_difference_yearly.pivot(index='Year', columns='Browser', values='Time Difference')
# Define a color palette with more distinct colors: the single-letter
# matplotlib base colors first, followed by additional hex colors.
color_palette = [
    'b', 'g', 'r', 'c', 'm', 'y', 'k',
    '#FF5733', '#900C3F', '#DAF7A6', '#581845', '#C70039', '#FFC300', '#273746', '#641E16',
]

# Plot one line per browser (the columns of pivot_df) against the year index.
# DataFrame.plot returns the Axes it drew on; configuring that object directly
# avoids relying on pyplot's implicit "current axes".
ax = pivot_df.plot(kind='line', marker='o', figsize=(12, 8), color=color_palette)
ax.set_ylabel('Average Time Between Releases (Days)')
ax.set_title('Average Time Between Browser Version Releases Over Time (2008 and Later, Outliers Removed)')
ax.grid(True)

# The legend is anchored just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Re-fit the layout so the legend outside the axes is not clipped at the
# figure border.
ax.figure.tight_layout()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment