Skip to content

Instantly share code, notes, and snippets.

@minrk
Last active November 26, 2024 07:40
Show Gist options
  • Save minrk/7ad21b18f7a5b3908d74f108dc564cd5 to your computer and use it in GitHub Desktop.
Save minrk/7ad21b18f7a5b3908d74f108dc564cd5 to your computer and use it in GitHub Desktop.
"""
Script to patch pvc_name associations in KubeSpawner
for https://github.com/jupyterhub/zero-to-jupyterhub-k8s/issues/3574
Meant to be run via `kubectl exec` in the hub pod.
If a server has multiple matching PVCs, `select_pvc` is used to pick which one to associate.
Default: pick the oldest match. Edit `select_pvc` to change this behavior.
for a dry run (to see what it will do):
cat collect_pvcs.py | kubectl exec -i $(kubectl get pod -l component=hub -o name) -- python3
to apply actual changes:
cat collect_pvcs.py | kubectl exec -i $(kubectl get pod -l component=hub -o name) -- python3 - --apply
License: CC-0 (public domain)
https://gist.github.com/minrk/7ad21b18f7a5b3908d74f108dc564cd5
"""
import asyncio
import os
from argparse import ArgumentParser
from kubernetes_asyncio.config import load_incluster_config
from kubernetes_asyncio.client import CoreV1Api
from kubernetes_asyncio.client.api_client import ApiClient
from sqlalchemy.orm import raiseload, joinedload, load_only
from jupyterhub.app import JupyterHub
from jupyterhub import orm
# Kubernetes namespace whose PVCs are listed (passed to `yield_pvcs`).
# Read from the POD_NAMESPACE environment variable when the script starts;
# a missing variable raises KeyError before any work is done.
namespace = os.environ["POD_NAMESPACE"]
def select_pvc(pvc_list):
    """Pick the PVC to keep when several match the same server.

    Edit this function to change how PVCs are chosen.

    Default: pick the oldest one, because the z2jh 4.0 upgrade bug
    will result in creating new PVCs that it shouldn't have,
    so keep the old one.

    Args:
        pvc_list: iterable of PVC objects exposing
            ``metadata.creation_timestamp``.

    Returns:
        The PVC with the smallest creation timestamp (the first such PVC
        if several share it), or ``None`` when ``pvc_list`` is empty.
    """
    # min() returns the first minimal element, which is the same element a
    # stable sort would put at index 0, and `default` covers the empty case.
    return min(
        pvc_list,
        key=lambda pvc: pvc.metadata.creation_timestamp,
        default=None,
    )
async def yield_pvcs(k8s, namespace):
    """Yield all singleuser-storage PVCs in a namespace, page by page.

    Args:
        k8s: API object providing an awaitable
            ``list_namespaced_persistent_volume_claim`` method.
        namespace: name of the namespace to list.

    Yields:
        Every item of every page returned for the label selector
        ``component=singleuser-storage``.
    """
    continue_token = None  # None requests the first page
    while True:
        pvc_list = await k8s.list_namespaced_persistent_volume_claim(
            namespace,
            _continue=continue_token,
            label_selector="component=singleuser-storage",
        )
        for pvc in pvc_list.items:
            yield pvc
        list_meta = pvc_list.metadata
        continue_token = list_meta._continue if list_meta is not None else None
        # Stop on any falsy token: re-sending an empty string would request
        # the first page again and never terminate.
        if not continue_token:
            break
async def collect_user_pvcs():
    """Collect a dict of (username, servername): [pvc]

    Loads the in-cluster Kubernetes configuration, lists the PVCs yielded by
    `yield_pvcs` for the module-level `namespace`, and groups them by the
    ``hub.jupyter.org/username`` and ``hub.jupyter.org/servername``
    annotations.

    Returns:
        dict mapping ``(username, servername)`` to the list of matching PVC
        objects. ``username`` is ``None`` and ``servername`` is ``""`` when
        the corresponding annotation is absent.
    """
    load_incluster_config()
    user_pvcs = {}
    async with ApiClient() as api_client:
        k8s = CoreV1Api(api_client)
        async for pvc in yield_pvcs(k8s, namespace):
            # annotations may be None when the PVC carries none at all;
            # fall back to an empty mapping so the lookups below don't crash
            annotations = pvc.metadata.annotations or {}
            username = annotations.get("hub.jupyter.org/username")
            servername = annotations.get("hub.jupyter.org/servername", "")
            user_pvcs.setdefault((username, servername), []).append(pvc)
    return user_pvcs
def connect_db():
    """Open a session on the JupyterHub database.

    Instantiates a `JupyterHub` application, loads its config file, prints
    the resulting database URL, and returns a new session created by
    `orm.new_session_factory` with the application's `db_kwargs`.
    """
    app = JupyterHub()
    app.load_config_file(app.config_file)
    print(f"Connecting to {app.db_url}")
    make_session = orm.new_session_factory(app.db_url, **app.db_kwargs)
    return make_session()
def _link_pvc(db, spawner, state, label, pvc_name, dry_run):
    """Announce, and unless `dry_run` persist, `pvc_name` in a spawner's state.

    Args:
        db: database session used to commit the change.
        spawner: `orm.Spawner` row whose state is updated.
        state: the spawner's current state dict (left unmodified).
        label: "username/servername" string used in the printed message.
        pvc_name: name of the PVC to record under the "pvc_name" key.
        dry_run: when true, only print what would be done.
    """
    # Assign a fresh dict rather than mutating `state` in place
    # (presumably so the ORM registers the column as changed -- TODO confirm).
    new_state = dict(state)
    new_state["pvc_name"] = pvc_name
    print(
        f"!!!!! linking server {label} to pvc {pvc_name} {'(dry run)' * dry_run} !!!!"
    )
    if not dry_run:
        # actually persist pvc_name in state
        spawner.state = new_state
        db.commit()


async def main():
    """Report, and with `--apply` repair, spawner -> PVC associations.

    For every spawner in the hub database, look up the PVCs whose
    annotations match its (username, servername):

    - exactly one match: link it if the spawner state has no pvc_name;
      an existing, different pvc_name is reported but left alone.
    - several matches: link the one chosen by `select_pvc` if it differs
      from the recorded pvc_name.
    - no match: only reported.

    PVCs that matched no spawner are listed at the end, followed by a
    summary. Without `--apply` nothing is written to the database.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--apply",
        action="store_true",
        help="persist the pvc_name links in the hub database (default: dry run)",
    )
    args = parser.parse_args()
    dry_run = not args.apply

    user_pvcs = await collect_user_pvcs()
    db = connect_db()
    changes_made = 0
    try:
        # echo the SQL statements issued through this session's engine
        db.get_bind().echo = True
        # load only the columns used below; any other lazy load raises
        spawners = (
            db.query(orm.Spawner)
            .options(load_only(orm.Spawner.state, orm.Spawner.name))
            .options(joinedload(orm.Spawner.user).load_only(orm.User.name))
            .options(raiseload("*"))
        )
        for spawner in spawners:
            state = spawner.state or {}
            username = spawner.user.name
            servername = spawner.name
            label = f"{username}/{servername}"
            # pop, so that what remains in user_pvcs afterwards is exactly
            # the set of PVCs that matched no spawner
            found_pvcs = user_pvcs.pop((username, servername), [])
            state_pvc_name = state.get("pvc_name", None)

            if len(found_pvcs) == 1:
                pvc = found_pvcs[0]
                pvc_name = pvc.metadata.name
                print(f"{label} has pvc {pvc_name}")
                if state_pvc_name == pvc_name:
                    print(f" {label} is linked to only matching pvc {pvc_name} (good)")
                elif state_pvc_name:
                    print(
                        f" {label} is linked to pvc {state_pvc_name}, but labels match {pvc_name}!"
                    )
                    # TODO: try to fix it? Perhaps more likely labeling is wrong
                else:
                    print(f" {label} is not linked to a pvc")
                print(f" {pvc_name}: created at {pvc.metadata.creation_timestamp}")
                if not state_pvc_name:
                    changes_made += 1
                    _link_pvc(db, spawner, state, label, pvc_name, dry_run)
            elif found_pvcs:
                found_pvc_names = [pvc.metadata.name for pvc in found_pvcs]
                print(f"{label} has multiple matching pvcs! {found_pvc_names}")
                if state_pvc_name:
                    print(f" {label} is linked to {state_pvc_name}")
                else:
                    print(f" {label} is not linked to a pvc")
                for pvc in found_pvcs:
                    print(
                        f" {pvc.metadata.name}: created at {pvc.metadata.creation_timestamp}"
                    )
                # selection rules for picking a pvc to link
                selected_pvc = select_pvc(found_pvcs)
                if selected_pvc and selected_pvc.metadata.name != state_pvc_name:
                    changes_made += 1
                    _link_pvc(
                        db, spawner, state, label, selected_pvc.metadata.name, dry_run
                    )
            else:
                print(f"{label} has no matching pvc")
    finally:
        # release the database connection even if something above failed
        db.close()

    # print any PVCs that we didn't find links for
    for key, pvcs in user_pvcs.items():
        pvc_names = [pvc.metadata.name for pvc in pvcs]
        # display every pvc we found, just in case our identification is wrong
        print(f"PVCs not linked to a server ({key}): {pvc_names}")

    if changes_made:
        if dry_run:
            print("This was a dry run, no changes were made.")
            print(f"To actually apply the above {changes_made} changes, re-run with `--apply`")
        else:
            print(f"Made {changes_made} changes")
    else:
        print("Nothing to do!")
if __name__ == "__main__":
    # Script entry point: run the async main() to completion.
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment