NickCrews · June 17, 2026 20:03 · NickCrews · Jun 17, 2026
diff --git a/gistfile1.txt b/gistfile1.txt
 #!/usr/bin/env python3
 # /// script
 # requires-python = ">=3.9"
 # dependencies = [
 #     "matplotlib>=3.7",
 # ]
 # ///
 """
 Ibis backend popularity proxy. `uv run <this_gist_url>`

 For each Ibis backend (and a couple not yet in Ibis), measure the monthly PyPI
 downloads of its driver package as a proxy for engine popularity, then plot.

 Data source: ClickHouse public "clickpy" PyPI dataset (no auth).
  endpoint: https://sql-clickhouse.clickhouse.com/?user=play

 Caveats (see chart legend):
  * generic/shared drivers (pyodbc, psycopg, psycopg2) overstate the backend
  * standalone engines (duckdb, polars, pyspark) are mostly downloaded for
    direct use, not via Ibis
  * sqlite uses the stdlib -> no driver package to measure (omitted)
  * this measures the engine ecosystem, NOT Ibis usage of that backend
 """

 from __future__ import annotations

 import urllib.request

 import matplotlib

 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from matplotlib.patches import Patch

 # Last COMPLETE calendar month. Bump this when you re-run.
 MONTH = "2026-05-01"

 CLICKHOUSE = "https://sql-clickhouse.clickhouse.com/?user=play"

 # (backend, driver pypi package, category)
 # Driver packages come from ibis pyproject.toml [project.optional-dependencies].
 # category: 0 dedicated driver | 1 generic/shared driver (overstated)
 #           2 standalone engine (overstated) | 3 not yet in Ibis
 # sqlite intentionally excluded: stdlib, no measurable driver package.
 BACKENDS = [
    ("snowflake", "snowflake-connector-python", 0),
    ("databricks", "databricks-sql-connector", 0),
    ("bigquery", "google-cloud-bigquery", 0),
    ("postgres", "psycopg", 0),
    ("materialize", "psycopg", 1),  # shares postgres driver
    ("polars", "polars", 2),
    ("pyspark", "pyspark", 2),
    ("risingwave", "psycopg2", 1),  # generic pg driver
    ("duckdb", "duckdb", 2),
    ("mssql", "pyodbc", 1),  # generic ODBC
    ("oracle", "oracledb", 0),
    ("mysql", "mysqlclient", 0),
    ("clickhouse", "clickhouse-connect", 0),
    ("athena", "pyathena", 0),
    ("trino", "trino", 0),
    ("exasol", "pyexasol", 0),
    ("impala", "impyla", 0),
    ("datafusion", "datafusion", 0),
    ("druid", "pydruid", 0),
    ("singlestoredb", "singlestoredb", 0),
    ("flink", "apache-flink", 0),
    # not yet in Ibis:
    ("starrocks", "starrocks", 3),
    ("gizmosql", "gizmosql", 3),
 ]

 # annotation suffix for the driver label, per category
 SUFFIX = {0: "", 1: "*", 2: "", 3: " [NOT in ibis]"}


 def fetch_downloads(packages, month):
    """Return {package: total_downloads} for the given month from clickpy.

    The monthly table is an aggregating MV with unmerged parts, so we must
    SUM(count) GROUP BY project rather than reading rows directly.
    """
    pkg_list = ",".join("'" + p + "'" for p in sorted(set(packages)))
    sql = (
        "SELECT project, sum(count) AS c "
        "FROM pypi.pypi_downloads_per_month "
        f"WHERE month = '{month}' AND project IN ({pkg_list}) "
        "GROUP BY project FORMAT TSV"
    )
    req = urllib.request.Request(CLICKHOUSE, data=sql.encode())
    with urllib.request.urlopen(req, timeout=60) as r:
        text = r.read().decode()
    out = {}
    for line in text.strip().splitlines():
        proj, cnt = line.split("\t")
        out[proj] = int(cnt)
    return out


 def main():
    dl = fetch_downloads([b[1] for b in BACKENDS], MONTH)

    rows = []
    for backend, pkg, cat in BACKENDS:
        rows.append((backend, pkg, dl.get(pkg, 0), cat))
    rows.sort(key=lambda r: r[2])

    colors = {0: "#2563eb", 1: "#f59e0b", 2: "#8b5cf6", 3: "#16a34a"}
    labels = [f"{b}  ({pkg}{SUFFIX[c]})" for b, pkg, _, c in rows]
    vals = [r[2] for r in rows]
    cols = [colors[r[3]] for r in rows]

    fig, ax = plt.subplots(figsize=(11, 9))
    ax.barh(range(len(rows)), vals, color=cols, edgecolor="white")
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels(labels, fontsize=9)
    ax.set_xscale("log")
    ax.set_xlabel(
        f"PyPI downloads, {MONTH[:7]} (log scale) — source: ClickHouse clickpy dataset"
    )
    ax.set_title(
        "Ibis backend popularity proxy: monthly PyPI downloads of each "
        "backend's driver package",
        fontsize=11,
        pad=12,
    )
    for i, v in enumerate(vals):
        txt = f"{v / 1e6:.1f}M" if v >= 1e6 else f"{v / 1e3:.0f}k"
        ax.text(v * 1.05, i, txt, va="center", fontsize=8)
    ax.set_xlim(2e3, 5e8)

    legend = [
        Patch(color="#2563eb", label="Ibis backend, dedicated driver"),
        Patch(
            color="#f59e0b", label="Ibis backend, GENERIC/SHARED driver (overstated *)"
        ),
        Patch(
            color="#8b5cf6", label="Ibis backend, also a standalone engine (overstated)"
        ),
        Patch(color="#16a34a", label="Not yet in Ibis"),
    ]
    ax.legend(handles=legend, loc="lower right", fontsize=8, framealpha=0.95)
    ax.text(
        2e3,
        -1.8,
        "sqlite: omitted — stdlib, no driver package to measure.",
        fontsize=8,
        style="italic",
        color="#555",
    )

    plt.tight_layout()
    plt.savefig("ibis_backend_popularity.png", dpi=150, bbox_inches="tight")
    print("saved ibis_backend_popularity.png")


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	# /// script
	# requires-python = ">=3.9"
	# dependencies = [
	# "matplotlib>=3.7",
	# ]
	# ///
	"""
	Ibis backend popularity proxy. `uv run <this_gist_url>`

	For each Ibis backend (and a couple not yet in Ibis), measure the monthly PyPI
	downloads of its driver package as a proxy for engine popularity, then plot.

	Data source: ClickHouse public "clickpy" PyPI dataset (no auth).
	endpoint: https://sql-clickhouse.clickhouse.com/?user=play

	Caveats (see chart legend):
	* generic/shared drivers (pyodbc, psycopg, psycopg2) overstate the backend
	* standalone engines (duckdb, polars, pyspark) are mostly downloaded for
	direct use, not via Ibis
	* sqlite uses the stdlib -> no driver package to measure (omitted)
	* this measures the engine ecosystem, NOT Ibis usage of that backend
	"""

	from __future__ import annotations

	import urllib.request

	import matplotlib

	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	from matplotlib.patches import Patch

	# Last COMPLETE calendar month. Bump this when you re-run.
	MONTH = "2026-05-01"

	CLICKHOUSE = "https://sql-clickhouse.clickhouse.com/?user=play"

	# (backend, driver pypi package, category)
	# Driver packages come from ibis pyproject.toml [project.optional-dependencies].
	# category: 0 dedicated driver \| 1 generic/shared driver (overstated)
	# 2 standalone engine (overstated) \| 3 not yet in Ibis
	# sqlite intentionally excluded: stdlib, no measurable driver package.
	BACKENDS = [
	("snowflake", "snowflake-connector-python", 0),
	("databricks", "databricks-sql-connector", 0),
	("bigquery", "google-cloud-bigquery", 0),
	("postgres", "psycopg", 0),
	("materialize", "psycopg", 1), # shares postgres driver
	("polars", "polars", 2),
	("pyspark", "pyspark", 2),
	("risingwave", "psycopg2", 1), # generic pg driver
	("duckdb", "duckdb", 2),
	("mssql", "pyodbc", 1), # generic ODBC
	("oracle", "oracledb", 0),
	("mysql", "mysqlclient", 0),
	("clickhouse", "clickhouse-connect", 0),
	("athena", "pyathena", 0),
	("trino", "trino", 0),
	("exasol", "pyexasol", 0),
	("impala", "impyla", 0),
	("datafusion", "datafusion", 0),
	("druid", "pydruid", 0),
	("singlestoredb", "singlestoredb", 0),
	("flink", "apache-flink", 0),
	# not yet in Ibis:
	("starrocks", "starrocks", 3),
	("gizmosql", "gizmosql", 3),
	]

	# annotation suffix for the driver label, per category
	SUFFIX = {0: "", 1: "*", 2: "", 3: " [NOT in ibis]"}


	def fetch_downloads(packages, month):
	"""Return {package: total_downloads} for the given month from clickpy.

	The monthly table is an aggregating MV with unmerged parts, so we must
	SUM(count) GROUP BY project rather than reading rows directly.
	"""
	pkg_list = ",".join("'" + p + "'" for p in sorted(set(packages)))
	sql = (
	"SELECT project, sum(count) AS c "
	"FROM pypi.pypi_downloads_per_month "
	f"WHERE month = '{month}' AND project IN ({pkg_list}) "
	"GROUP BY project FORMAT TSV"
	)
	req = urllib.request.Request(CLICKHOUSE, data=sql.encode())
	with urllib.request.urlopen(req, timeout=60) as r:
	text = r.read().decode()
	out = {}
	for line in text.strip().splitlines():
	proj, cnt = line.split("\t")
	out[proj] = int(cnt)
	return out


	def main():
	dl = fetch_downloads([b[1] for b in BACKENDS], MONTH)

	rows = []
	for backend, pkg, cat in BACKENDS:
	rows.append((backend, pkg, dl.get(pkg, 0), cat))
	rows.sort(key=lambda r: r[2])

	colors = {0: "#2563eb", 1: "#f59e0b", 2: "#8b5cf6", 3: "#16a34a"}
	labels = [f"{b} ({pkg}{SUFFIX[c]})" for b, pkg, _, c in rows]
	vals = [r[2] for r in rows]
	cols = [colors[r[3]] for r in rows]

	fig, ax = plt.subplots(figsize=(11, 9))
	ax.barh(range(len(rows)), vals, color=cols, edgecolor="white")
	ax.set_yticks(range(len(rows)))
	ax.set_yticklabels(labels, fontsize=9)
	ax.set_xscale("log")
	ax.set_xlabel(
	f"PyPI downloads, {MONTH[:7]} (log scale) — source: ClickHouse clickpy dataset"
	)
	ax.set_title(
	"Ibis backend popularity proxy: monthly PyPI downloads of each "
	"backend's driver package",
	fontsize=11,
	pad=12,
	)
	for i, v in enumerate(vals):
	txt = f"{v / 1e6:.1f}M" if v >= 1e6 else f"{v / 1e3:.0f}k"
	ax.text(v * 1.05, i, txt, va="center", fontsize=8)
	ax.set_xlim(2e3, 5e8)

	legend = [
	Patch(color="#2563eb", label="Ibis backend, dedicated driver"),
	Patch(
	color="#f59e0b", label="Ibis backend, GENERIC/SHARED driver (overstated *)"
	),
	Patch(
	color="#8b5cf6", label="Ibis backend, also a standalone engine (overstated)"
	),
	Patch(color="#16a34a", label="Not yet in Ibis"),
	]
	ax.legend(handles=legend, loc="lower right", fontsize=8, framealpha=0.95)
	ax.text(
	2e3,
	-1.8,
	"sqlite: omitted — stdlib, no driver package to measure.",
	fontsize=8,
	style="italic",
	color="#555",
	)

	plt.tight_layout()
	plt.savefig("ibis_backend_popularity.png", dpi=150, bbox_inches="tight")
	print("saved ibis_backend_popularity.png")


	if __name__ == "__main__":
	main()
No results found