Skip to content

Instantly share code, notes, and snippets.

@lesteve
Last active February 28, 2025 16:34
Show Gist options
  • Save lesteve/6099307249f3235e16063ad847ff0348 to your computer and use it in GitHub Desktop.
Save lesteve/6099307249f3235e16063ad847ff0348 to your computer and use it in GitHub Desktop.
# %%
from pathlib import Path
import re
from packaging.version import Version
def find_python_version():
    """Return the lowest ``Python :: X.Y`` version declared by the project.

    ``pyproject.toml`` is searched first; when it is missing or declares no
    ``Python :: X.Y`` entry, ``setup.py`` is searched instead.  Both paths are
    relative to the current working directory.

    Returns
    -------
    str
        The smallest version found, formatted as ``"X.Y"``.

    Raises
    ------
    ValueError
        If neither file contains a ``Python :: X.Y`` entry.
    """
    pattern = re.compile(r"Python :: (\d+)\.(\d+)")
    for filename in ("pyproject.toml", "setup.py"):
        try:
            content = Path(filename).read_text()
        except FileNotFoundError:
            # Not every checkout ships both files; try the next candidate.
            continue
        # Compare as integer pairs so that 3.10 sorts after 3.9.
        versions = [
            (int(major), int(minor)) for major, minor in pattern.findall(content)
        ]
        if versions:
            major, minor = min(versions)
            return f"{major}.{minor}"
    raise ValueError(
        "No 'Python :: X.Y' classifier found in pyproject.toml or setup.py"
    )
def find_dependencies_version(dep):
    """Return what ``sklearn/_min_dependencies.py`` prints for ``dep``.

    The script (path relative to the current working directory) is run with
    the interpreter executing this code and receives ``dep`` as its only
    argument.  Its standard output is decoded and stripped of surrounding
    whitespace.  A non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    command = [sys.executable, "sklearn/_min_dependencies.py", dep]
    completed = subprocess.run(command, stdout=subprocess.PIPE, check=True)
    return completed.stdout.decode().strip()
# %%
import sys
import subprocess
import pandas as pd
# Rows for the older releases were filled in by hand.
hand_filled_columns = ("scikit-learn", "python", "numpy", "scipy", "joblib")
hand_filled_rows = (
    ("0.21", "3.5", "1.11.0", "0.17.0", "0.11"),
    ("0.22", "3.5", "1.11.0", "0.17.0", "0.11"),
    ("0.23", "3.6", "1.13.3", "0.19.1", "0.11"),
)
min_version_data = [dict(zip(hand_filled_columns, row)) for row in hand_filled_rows]

version_list = [f"1.{minor}" for minor in range(7)]
dependency_list = ["numpy", "scipy", "pandas", "joblib", "threadpoolctl"]
for version in version_list:
    # Check out the "<version>.X" branch first so that the helpers below read
    # the files of that release.
    subprocess.check_call(["git", "checkout", f"{version}.X"])
    row = {"scikit-learn": version, "python": find_python_version()}
    row.update((dep, find_dependencies_version(dep)) for dep in dependency_list)
    min_version_data.append(row)

# One row per scikit-learn release, one column per package.
min_version_df = pd.DataFrame(min_version_data)
# %%
# get release dates for Python and scikit-learn, see:
# https://devguide.python.org/versions/
# https://pypi.org/project/scikit-learn/1.1.0/#history
import pandas as pd
# Every HTML table found on the page is parsed into its own DataFrame; they
# are stacked and the "Branch" column is forced to str so it can be used as a
# lookup key.
df_list = pd.read_html("https://devguide.python.org/versions/")
df = pd.concat(df_list).astype({"Branch": str})

# Branch -> raw "First release" cell, then parse each cell into a timestamp.
first_release_by_branch = dict(zip(df["Branch"], df["First release"]))
python_version_info = {
    branch: pd.to_datetime(first_release)
    for branch, first_release in first_release_by_branch.items()
}

# Release dates are stored per package name: {package: {version: timestamp}}.
release_dates = {"python": python_version_info}
# %%
import requests
def get_release_time(package, version):
    """Return the PyPI upload time of ``package`` at ``version``.

    The release metadata is fetched from the PyPI JSON API.  The upload time
    of the first source distribution is used; when the release has no sdist,
    the first listed file of any type is used instead.

    Parameters
    ----------
    package : str
        Project name on PyPI.
    version : str
        Release version string.

    Returns
    -------
    pandas.Timestamp
        Parsed ``upload_time`` of the selected file.

    Raises
    ------
    requests.HTTPError
        If PyPI answers with an error status (e.g. unknown version).
    ValueError
        If the release lists no uploaded file at all.
    """
    response = requests.get(
        f"https://pypi.org/pypi/{package}/{version}/json", timeout=30
    )
    # Fail with the HTTP status rather than a confusing KeyError on "urls".
    response.raise_for_status()
    files = response.json()["urls"]
    if not files:
        raise ValueError(f"No files listed on PyPI for {package} {version}")
    # for some reason 0.21 doesn't have a sdist ... so fall back to the first
    # listed file when no sdist is available.
    chosen = next(
        (each for each in files if each["packagetype"] == "sdist"), files[0]
    )
    return pd.to_datetime(chosen["upload_time"])
for dep in ["scikit-learn"] + dependency_list:
    # Several scikit-learn releases share the same minimum version of a
    # dependency, so query PyPI once per distinct version only.  Non-string
    # entries (missing values for the hand-filled rows) are skipped.
    release_dates[dep] = {
        ver: get_release_time(dep, ver)
        for ver in min_version_df[dep].unique()
        if isinstance(ver, str)
    }
# Bare expression: displays the mapping when run as a notebook cell.
release_dates
# %%
def previous_minor_release(ver):
    """Return the ``"major.minor"`` string of the minor release before ``ver``.

    Only the first two dot-separated components of ``ver`` are used, so
    ``"1.7.2"`` gives ``"1.6"``.

    Raises
    ------
    ValueError
        If ``ver`` has fewer than two components, if its minor component is
        not an integer, or if the minor component is 0 (the previous release
        then belongs to another major series and cannot be derived here).
    """
    major, minor, *_ = ver.split(".")
    previous_minor = int(minor) - 1
    if previous_minor < 0:
        raise ValueError(f"{ver!r} has no previous minor release in {major}.x")
    return f"{major}.{previous_minor}"
# %%
versioned_columns = ["scikit-learn", "python"] + dependency_list
for name in versioned_columns:
    # Translate every version string into its release date.
    min_version_df[f"{name}-date"] = min_version_df[name].map(release_dates[name])

for name in ["python"] + dependency_list:
    # Time between the dependency's release and the scikit-learn release,
    # expressed in years of 365 days.
    age = min_version_df["scikit-learn-date"] - min_version_df[f"{name}-date"]
    min_version_df[f"{name}-date-diff"] = age.dt.days / 365
# %%
print(min_version_df[["scikit-learn", "python", "python-date-diff"]])
# One table per dependency, each showing the dependency's own "-date-diff"
# column (the pandas table previously showed the scipy difference).
for dep in dependency_list:
    print(min_version_df[["scikit-learn", "python", dep, f"{dep}-date-diff"]])
# %%
# Plan with minor versions X.Y
# One entry per planned scikit-learn release: the planned release date and
# the version of each dependency for that release.  Python versions are
# written as "X.Y", like every other Python version in this script, so that
# they can be looked up in release_dates["python"].
plan = [
    {
        "scikit-learn": "1.7",
        "scikit-learn-date": pd.to_datetime("2025-06-01"),
        "python": "3.10",
        "numpy": "1.22.0",
        "scipy": "1.8.0",
        "pandas": "1.4.0",
        "joblib": "1.2.0",
        "threadpoolctl": "3.1.0",
    },
    {
        "scikit-learn": "1.8",
        "scikit-learn-date": pd.to_datetime("2025-12-01"),
        "python": "3.11",
        "numpy": "1.24.0",
        "scipy": "1.11.0",
        "pandas": "2.0.0",
        "joblib": "1.3.0",
        "threadpoolctl": "3.2.0",
    },
    {
        "scikit-learn": "1.9",
        "scikit-learn-date": pd.to_datetime("2026-06-01"),
        "python": "3.11",
        "numpy": "1.24.0",
        "scipy": "1.11.0",
        "pandas": "2.0.0",
        "joblib": "1.4.0",
        "threadpoolctl": "3.5.0",
    },
    {
        "scikit-learn": "1.10",
        "scikit-learn-date": pd.to_datetime("2026-12-01"),
        "python": "3.12",
        "numpy": "1.26.0",
        "scipy": "1.12.0",
        "pandas": "2.2.0",
        "joblib": "1.4.0",
        "threadpoolctl": "3.5.0",
    },
]
# Plan with bugfix versions X.Y.Z
# plan = [
# {
# "scikit-learn": "1.7",
# "scikit-learn-date": pd.to_datetime("2025-06-01"),
# "python": "3.10",
# "numpy": "1.21.2",
# "scipy": "1.8.0",
# "pandas": "1.3.4",
# "joblib": "1.2",
# "threadpoolctl": "3.1",
# },
# {
# "scikit-learn": "1.8",
# "scikit-learn-date": pd.to_datetime("2025-12-01"),
# "python": "3.11",
# "numpy": "1.23.3",
# "scipy": "1.10.1",
# "pandas": "1.5.2",
# "joblib": "1.3",
# "threadpoolctl": "3.2",
# },
# {
# "scikit-learn": "1.9",
# "scikit-learn-date": pd.to_datetime("2026-06-01"),
# "python": "3.11",
# "numpy": "1.23.3",
# "scipy": "1.10.1",
# "pandas": "1.5.2",
# "joblib": "1.4",
# "threadpoolctl": "3.5",
# },
# {
# "scikit-learn": "1.10",
# "scikit-learn-date": pd.to_datetime("2026-12-01"),
# "python": "3.12",
# "numpy": "1.26.0",
# "scipy": "1.11.3",
# "pandas": "2.2.0",
# "joblib": "1.4",
# "threadpoolctl": "3.5",
# },
# ]
def _format_age(days):
    """Format a day count as ``"<Y> years <M> months"``; missing -> ``pd.NaT``."""
    if pd.isna(days):
        return pd.NaT
    # The day counts may arrive as floats (e.g. when the column contains
    # missing values); cast so the text reads "2 years", not "2.0 years".
    years, remainder = divmod(int(days), 365)
    months = round(remainder / 30)
    if months == 12:
        # A remainder of 345 days or more rounds up to a full extra year.
        years, months = years + 1, 0
    return f"{years} years {months} months"


plan = pd.DataFrame(plan)
for each in ["numpy", "scipy", "pandas", "joblib", "threadpoolctl"]:
    known_dates = release_dates[each]
    # Several planned releases share a version: fetch each distinct version
    # once, and only when its release date is not already known.
    for ver in plan[each].unique():
        if ver not in known_dates:
            known_dates[ver] = get_release_time(each, ver)
for each in ["python", "numpy", "scipy", "pandas", "joblib", "threadpoolctl"]:
    # Time between the dependency's release and the planned scikit-learn date.
    age_in_days = (
        plan["scikit-learn-date"] - plan[each].map(release_dates[each])
    ).dt.days
    plan[f"{each}-date-diff"] = age_in_days.apply(_format_age)
# Three markdown tables: Python alone, the scientific stack, then the
# remaining runtime dependencies.  Each package contributes its version
# column followed by its "-date-diff" column.
report_sections = (
    ["python"],
    ["numpy", "scipy", "pandas"],
    ["joblib", "threadpoolctl"],
)
for packages in report_sections:
    columns = ["scikit-learn", "scikit-learn-date"]
    for package in packages:
        columns += [package, f"{package}-date-diff"]
    print(plan[columns].to_markdown(index=False))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment