Skip to content

Instantly share code, notes, and snippets.

@PaulKinlan
Created July 11, 2023 13:33
Show Gist options
  • Save PaulKinlan/7f25055ac899e1667a6f80119d7c3b05 to your computer and use it in GitHub Desktop.
Save PaulKinlan/7f25055ac899e1667a6f80119d7c3b05 to your computer and use it in GitHub Desktop.
Parse MDN browser-compat-data (BCD) to get browser release dates
import json
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def convert_string_to_date(date_string: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a naive ``datetime``.

    Args:
        date_string: Calendar date formatted as ``%Y-%m-%d``.

    Returns:
        The parsed ``datetime``; the time part is midnight because the
        format carries no time fields, and no timezone is attached.

    Raises:
        ValueError: If ``date_string`` does not match ``%Y-%m-%d``.
    """
    return datetime.strptime(date_string, "%Y-%m-%d")
def convert_version_to_int(version):
    """Convert a version label to an integer where possible.

    Two attempts are made, in order:

    1. ``int(version)`` on the whole value (e.g. ``"12"`` -> ``12``).
    2. ``int()`` on the text before the first ``.`` (e.g. ``"3.5"`` -> ``3``).

    Args:
        version: Version label, normally a string such as ``"12"``,
            ``"3.5"`` or ``"preview"``.

    Returns:
        The integer from the first attempt that succeeds, otherwise
        ``version`` itself, unchanged.

    Note:
        Dotted versions collapse onto their major number, so ``"3"`` and
        ``"3.5"`` both yield ``3``; callers using the result as a key must
        handle that collision themselves.
    """
    try:
        return int(version)
    except ValueError:
        # Not a plain integer; fall back to the major component.
        major_part = version.split('.')[0]
        try:
            return int(major_part)
        except ValueError:
            # Non-numeric label: hand it back untouched.
            return version
# Load the JSON data file. The path is a placeholder and must be replaced
# with the real location of the data file. The code below expects a
# top-level 'browsers' mapping whose entries may carry
# 'releases' -> {version: {'release_date': 'YYYY-MM-DD', ...}}.
with open('path_to_your_data_file.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract 'browsers' data
browsers_data = data['browsers']

# browser name -> {version: release datetime}
release_dates = {}

for browser, details in browsers_data.items():
    # 'releases' holds the per-version information; it may be absent
    releases = details.get('releases', {})
    for version, version_details in releases.items():
        release_date = version_details.get('release_date')
        if not release_date:
            # Releases without a date cannot contribute to the timing analysis
            continue
        # Only create the per-browser entry once a dated release is found
        browser_dates = release_dates.setdefault(browser, {})
        key = convert_version_to_int(version)
        if key in browser_dates:
            # A dotted version (e.g. "3.5") collapsed onto an integer that is
            # already recorded (3). Keep the original label so this release
            # does not overwrite the earlier one's date.
            key = version
        browser_dates[key] = convert_string_to_date(release_date)

# Convert the dictionary to a pandas DataFrame (one row per dated release)
df = pd.concat({k: pd.Series(v) for k, v in release_dates.items()}).reset_index()
df.columns = ['Browser', 'Version', 'Release Date']

# diff() subtracts the previous *row*, so the rows must be in chronological
# order within each browser; the JSON key order does not guarantee that.
df = df.sort_values(['Browser', 'Release Date']).reset_index(drop=True)

# Days elapsed since the previous release of the same browser.
# The first release of each browser has no predecessor and gets NaN.
df['Time Difference'] = df.groupby('Browser')['Release Date'].diff().dt.days

# Keep rows whose gap is within 3 standard deviations of the overall mean.
# NaN gaps compare as False and are therefore dropped here as well.
# .copy() makes the result an independent frame so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
gap_deviation = np.abs(df['Time Difference'] - df['Time Difference'].mean())
df_no_outliers = df[gap_deviation <= (3 * df['Time Difference'].std())].copy()

# Extract the year from the release date
df_no_outliers['Year'] = df_no_outliers['Release Date'].dt.year

# Filter out rows where the year is before 2008
df_no_outliers = df_no_outliers[df_no_outliers['Year'] >= 2008]

# Average gap between releases for each browser in each year
average_time_difference_yearly = (
    df_no_outliers.groupby(['Browser', 'Year'])['Time Difference']
    .mean()
    .reset_index()
)

# Pivot to Year rows x Browser columns for plotting
pivot_df = average_time_difference_yearly.pivot(
    index='Year', columns='Browser', values='Time Difference'
)

# Colour palette: matplotlib single-letter colours followed by hex codes
color_palette = [
    'b', 'g', 'r', 'c', 'm', 'y', 'k',
    '#FF5733', '#900C3F', '#DAF7A6', '#581845',
    '#C70039', '#FFC300', '#273746', '#641E16',
]

# Plot one line per browser
pivot_df.plot(kind='line', marker='o', figsize=(12, 8), color=color_palette)
plt.ylabel('Average Time Between Releases (Days)')
plt.title('Average Time Between Browser Version Releases Over Time (2008 and Later, Outliers Removed)')
plt.grid(True)
# Place the legend outside the axes, to the right of the plot area
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment