Skip to content

Instantly share code, notes, and snippets.

@NickCrews
Created June 17, 2026 20:03
Show Gist options
  • Select an option

  • Save NickCrews/a10156b48d6031d7dcb33dd68efbdc24 to your computer and use it in GitHub Desktop.

Select an option

Save NickCrews/a10156b48d6031d7dcb33dd68efbdc24 to your computer and use it in GitHub Desktop.
ibis backend popularity comparison
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "matplotlib>=3.7",
# ]
# ///
"""
Ibis backend popularity proxy. `uv run <this_gist_url>`
For each Ibis backend (and a couple not yet in Ibis), measure the monthly PyPI
downloads of its driver package as a proxy for engine popularity, then plot.
Data source: ClickHouse public "clickpy" PyPI dataset (no auth).
endpoint: https://sql-clickhouse.clickhouse.com/?user=play
Caveats (see chart legend):
* generic/shared drivers (pyodbc, psycopg, psycopg2) overstate the backend
* standalone engines (duckdb, polars, pyspark) are mostly downloaded for
direct use, not via Ibis
* sqlite uses the stdlib -> no driver package to measure (omitted)
* this measures the engine ecosystem, NOT Ibis usage of that backend
"""
from __future__ import annotations
import urllib.request
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
# Last COMPLETE calendar month. Bump this when you re-run.
MONTH = "2026-05-01"
CLICKHOUSE = "https://sql-clickhouse.clickhouse.com/?user=play"
# (backend, driver pypi package, category)
# Driver packages come from ibis pyproject.toml [project.optional-dependencies].
# category: 0 dedicated driver | 1 generic/shared driver (overstated)
# 2 standalone engine (overstated) | 3 not yet in Ibis
# sqlite intentionally excluded: stdlib, no measurable driver package.
BACKENDS = [
("snowflake", "snowflake-connector-python", 0),
("databricks", "databricks-sql-connector", 0),
("bigquery", "google-cloud-bigquery", 0),
("postgres", "psycopg", 0),
("materialize", "psycopg", 1), # shares postgres driver
("polars", "polars", 2),
("pyspark", "pyspark", 2),
("risingwave", "psycopg2", 1), # generic pg driver
("duckdb", "duckdb", 2),
("mssql", "pyodbc", 1), # generic ODBC
("oracle", "oracledb", 0),
("mysql", "mysqlclient", 0),
("clickhouse", "clickhouse-connect", 0),
("athena", "pyathena", 0),
("trino", "trino", 0),
("exasol", "pyexasol", 0),
("impala", "impyla", 0),
("datafusion", "datafusion", 0),
("druid", "pydruid", 0),
("singlestoredb", "singlestoredb", 0),
("flink", "apache-flink", 0),
# not yet in Ibis:
("starrocks", "starrocks", 3),
("gizmosql", "gizmosql", 3),
]
# annotation suffix for the driver label, per category
SUFFIX = {0: "", 1: "*", 2: "", 3: " [NOT in ibis]"}
def fetch_downloads(packages, month):
"""Return {package: total_downloads} for the given month from clickpy.
The monthly table is an aggregating MV with unmerged parts, so we must
SUM(count) GROUP BY project rather than reading rows directly.
"""
pkg_list = ",".join("'" + p + "'" for p in sorted(set(packages)))
sql = (
"SELECT project, sum(count) AS c "
"FROM pypi.pypi_downloads_per_month "
f"WHERE month = '{month}' AND project IN ({pkg_list}) "
"GROUP BY project FORMAT TSV"
)
req = urllib.request.Request(CLICKHOUSE, data=sql.encode())
with urllib.request.urlopen(req, timeout=60) as r:
text = r.read().decode()
out = {}
for line in text.strip().splitlines():
proj, cnt = line.split("\t")
out[proj] = int(cnt)
return out
def main():
dl = fetch_downloads([b[1] for b in BACKENDS], MONTH)
rows = []
for backend, pkg, cat in BACKENDS:
rows.append((backend, pkg, dl.get(pkg, 0), cat))
rows.sort(key=lambda r: r[2])
colors = {0: "#2563eb", 1: "#f59e0b", 2: "#8b5cf6", 3: "#16a34a"}
labels = [f"{b} ({pkg}{SUFFIX[c]})" for b, pkg, _, c in rows]
vals = [r[2] for r in rows]
cols = [colors[r[3]] for r in rows]
fig, ax = plt.subplots(figsize=(11, 9))
ax.barh(range(len(rows)), vals, color=cols, edgecolor="white")
ax.set_yticks(range(len(rows)))
ax.set_yticklabels(labels, fontsize=9)
ax.set_xscale("log")
ax.set_xlabel(
f"PyPI downloads, {MONTH[:7]} (log scale) — source: ClickHouse clickpy dataset"
)
ax.set_title(
"Ibis backend popularity proxy: monthly PyPI downloads of each "
"backend's driver package",
fontsize=11,
pad=12,
)
for i, v in enumerate(vals):
txt = f"{v / 1e6:.1f}M" if v >= 1e6 else f"{v / 1e3:.0f}k"
ax.text(v * 1.05, i, txt, va="center", fontsize=8)
ax.set_xlim(2e3, 5e8)
legend = [
Patch(color="#2563eb", label="Ibis backend, dedicated driver"),
Patch(
color="#f59e0b", label="Ibis backend, GENERIC/SHARED driver (overstated *)"
),
Patch(
color="#8b5cf6", label="Ibis backend, also a standalone engine (overstated)"
),
Patch(color="#16a34a", label="Not yet in Ibis"),
]
ax.legend(handles=legend, loc="lower right", fontsize=8, framealpha=0.95)
ax.text(
2e3,
-1.8,
"sqlite: omitted — stdlib, no driver package to measure.",
fontsize=8,
style="italic",
color="#555",
)
plt.tight_layout()
plt.savefig("ibis_backend_popularity.png", dpi=150, bbox_inches="tight")
print("saved ibis_backend_popularity.png")
if __name__ == "__main__":
main()
@NickCrews

Copy link
Copy Markdown
Author

Inspired by ibis-project/ibis#12017. This gives, on Jun 17 2026:
ibis_backend_popularity

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment