Skip to content

Instantly share code, notes, and snippets.

@tdewin
Last active May 6, 2026 13:17
Show Gist options
  • Select an option

  • Save tdewin/8254b00a03ac073623f7359ff2566739 to your computer and use it in GitHub Desktop.

Select an option

Save tdewin/8254b00a03ac073623f7359ff2566739 to your computer and use it in GitHub Desktop.
gather-insights ( MIT LICENSE, VAGUELY TESTED, BE CAREFUL, NO WARRANTY, FOR DEMO ENV ONLY)
# MIT LICENSE: sample code only, review and only for demo env
# BE CAREFUL AI GENERATED BASED ON gather.py PROMPT with kubernetes api
# BE CAREFUL AI GENERATED BASED ON gather.py PROMPT with kubernetes api
# BE CAREFUL AI GENERATED BASED ON gather.py PROMPT with kubernetes api
# BE CAREFUL AI GENERATED BASED ON gather.py PROMPT with kubernetes api
# BE CAREFUL AI GENERATED BASED ON gather.py PROMPT with kubernetes api
# Makefile
'''
run:
venv/bin/python3 collect.py
py:
venv/bin/python3
terminal:
sh -c "source venv/bin/activate;bash"
env:
python -m venv venv
dep:
venv/bin/pip install kubernetes
pip:
sudo dnf install python3-pip
'''
# Import urllib3 first so InsecureRequestWarning can be silenced before the kubernetes client pulls it in
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import socket
import time
import base64
import tempfile
import tarfile
import yaml
from pathlib import Path
from kubernetes import client, config, stream
from kubernetes.client.exceptions import ApiException
# Target namespace and job naming for the ad-hoc gather run.
NS = "openshift-insights"
JOB_NAME = "insights-operator-job-manual"
LOCAL_DIR = Path("./insights-data")

# The Job template embedded directly as a format string.
# Placeholders: {job_name} and {image_url}; literal YAML braces are doubled ({{}}).
# NOTE: the init container runs the actual `gather`; the 'sleepy' sidecar keeps
# the pod alive so the collected data can be copied out of the shared emptyDir.
GATHER_JOB_YAML = """
apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  annotations:
    config.openshift.io/inject-proxy: insights-operator
spec:
  backoffLimit: 6
  ttlSecondsAfterFinished: 600
  template:
    spec:
      restartPolicy: OnFailure
      serviceAccountName: operator
      # nodeSelector:
      #   beta.kubernetes.io/os: linux
      #   node-role.kubernetes.io/master: ""
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Exists
      - effect: NoExecute
        key: node.kubernetes.io/unreachable
        operator: Exists
        tolerationSeconds: 900
      - effect: NoExecute
        key: node.kubernetes.io/not-ready
        operator: Exists
        tolerationSeconds: 900
      volumes:
      - name: snapshots
        emptyDir: {{}}
      - name: service-ca-bundle
        configMap:
          name: service-ca-bundle
          optional: true
      initContainers:
      - name: insights-operator
        image: {image_url}
        terminationMessagePolicy: FallbackToLogsOnError
        volumeMounts:
        - name: snapshots
          mountPath: /var/lib/insights-operator
        - name: service-ca-bundle
          mountPath: /var/run/configmaps/service-ca-bundle
          readOnly: true
        ports:
        - containerPort: 8443
          name: https
        resources:
          requests:
            cpu: 10m
            memory: 70Mi
        args:
        - gather
        - -v=4
        - --config=/etc/insights-operator/server.yaml
      containers:
      - name: sleepy
        image: quay.io/openshift/origin-base:latest
        args:
        - /bin/sh
        - -c
        - sleep 10m
        volumeMounts: [{{name: snapshots, mountPath: /var/lib/insights-operator}}]
"""
def main():
    """Run an ad-hoc insights-operator gather Job and download its archive.

    Flow: wait for any previous job with the same name to disappear, read the
    operator image from the live deployment, submit the gather Job, wait for
    the init container to finish, stream the collected data out of the
    'sleepy' sidecar via exec, unpack it into LOCAL_DIR, delete the remote
    Job, and report the most recent bundle.
    """
    # Load kubernetes configuration (e.g., ~/.kube/config)
    config.load_kube_config()
    v1 = client.CoreV1Api()
    apps_v1 = client.AppsV1Api()
    batch_v1 = client.BatchV1Api()

    # 1. Check for existing job and wait up to 10 times
    print(f"Checking for existing job: {JOB_NAME}...")
    job_clear = False
    for i in range(10):
        try:
            batch_v1.read_namespaced_job(name=JOB_NAME, namespace=NS)
            print(f"Job is busy. Waiting 10 seconds... ({i+1}/10)")
            time.sleep(10)
        except ApiException as e:
            if e.status == 404:
                # 404 means the name is free to reuse.
                job_clear = True
                break
            raise
    if not job_clear:
        print(f"Error: Job {JOB_NAME} still exists after waiting. Exiting.")
        return

    # 2. Extract Image URL from the live deployment
    print("Fetching current insights-operator image...")
    try:
        deployment = apps_v1.read_namespaced_deployment(name="insights-operator", namespace=NS)
        img_url = deployment.spec.template.spec.containers[0].image
    except ApiException as e:
        print(f"Error: Could not find insights-operator deployment. Details: {e}")
        return

    # 3. Apply the YAML with injected Image
    print("Deploying Gather Job...")
    formatted_yaml = GATHER_JOB_YAML.format(job_name=JOB_NAME, image_url=img_url)
    job_dict = yaml.safe_load(formatted_yaml)
    batch_v1.create_namespaced_job(namespace=NS, body=job_dict)

    # 4. Identify the specific Pod Name
    pod_name = ""
    print("Waiting for Pod to be created...")
    while not pod_name:
        pods = v1.list_namespaced_pod(namespace=NS, label_selector=f"job-name={JOB_NAME}")
        if pods.items:
            pod_name = pods.items[0].metadata.name
        else:
            time.sleep(1)
    print(f"Tracking Pod: {pod_name}")

    # 5. Wait for the Init Container to terminate with exit code 0
    print("Waiting for init container 'insights-operator' to complete...")
    init_completed = False
    while not init_completed:
        pod = v1.read_namespaced_pod(name=pod_name, namespace=NS)
        if pod.status.init_container_statuses:
            init_state = pod.status.init_container_statuses[0].state
            if init_state.terminated:
                if init_state.terminated.exit_code == 0:
                    print("✅ Data collection complete.")
                    init_completed = True
                else:
                    # The Job's ttlSecondsAfterFinished will eventually clean up.
                    print(f"⚠️ Init container failed with exit code: {init_state.terminated.exit_code}")
                    return
        if pod.status.phase == "Failed":
            print("⚠️ Pod is in error state (Failed). Check logs.")
            return
        if not init_completed:
            time.sleep(5)

    # 5.5 Wait for the main container ('sleepy') to actually be running.
    # This prevents the 'container not found' 500 Error when exec-ing too quickly.
    print("Waiting for main container 'sleepy' to start...")
    while True:
        pod = v1.read_namespaced_pod(name=pod_name, namespace=NS)
        if pod.status.phase == "Running":
            if pod.status.container_statuses:
                sleepy_state = pod.status.container_statuses[0].state
                if sleepy_state.running:
                    break
        time.sleep(2)

    # 6. Transfer Data locally (Native replacement for 'oc cp')
    LOCAL_DIR.mkdir(exist_ok=True)
    print(f"Downloading data natively to {LOCAL_DIR}...")
    # We use tar + base64 inside the pod to safely stream binary data over the K8s WebSocket
    exec_cmd = ['/bin/sh', '-c', 'tar czf - -C /var/lib/insights-operator . 2>/dev/null | base64 -w 0']
    resp = stream.stream(
        v1.connect_get_namespaced_pod_exec,
        pod_name,
        NS,
        container="sleepy",
        command=exec_cmd,
        stderr=True, stdin=False,
        stdout=True, tty=False
    )
    clean_resp = resp.strip()
    if not clean_resp:
        print("❌ Error: Received empty data stream from the pod.")
        return
    try:
        tar_data = base64.b64decode(clean_resp)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".tar.gz") as tmp:
            tmp.write(tar_data)
            tmp_name = tmp.name
        with tarfile.open(tmp_name, 'r:gz') as tar:
            # SECURITY: reject path-traversal members where supported (3.12+).
            try:
                tar.extractall(path=LOCAL_DIR, filter="data")
            except TypeError:  # Python < 3.12 has no 'filter' argument
                tar.extractall(path=LOCAL_DIR)
    except Exception as e:
        print(f"❌ Error decoding or extracting archive: {e}")
        return
    finally:
        if 'tmp_name' in locals() and Path(tmp_name).exists():
            Path(tmp_name).unlink()  # Clean up temp file

    # 7. Cleanup the remote Job
    print("Cleaning up remote Job...")
    batch_v1.delete_namespaced_job(
        name=JOB_NAME,
        namespace=NS,
        body=client.V1DeleteOptions(propagation_policy='Background')
    )

    # 8. Find the most recent .gz file (last 5 mins)
    now = time.time()
    gz_files = [f for f in LOCAL_DIR.rglob("*.gz") if (now - f.stat().st_mtime) < 300]
    if gz_files:
        # BUGFIX: pick the newest bundle by mtime; rglob order is arbitrary.
        latest_bundle = max(gz_files, key=lambda f: f.stat().st_mtime)
        hostname = socket.gethostname()
        print("\n" + "="*40)
        print(f"BUNDLE READY: {latest_bundle.name}")
        print("SUGGESTED TRANSFER:")
        print(f"scp {latest_bundle} user@host:~/collect/{hostname}")
        print("="*40)
    else:
        print("\n❌ No recent .gz bundle found in local directory.")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# BE CAREFUL AI GENERATED BASED ON BASH SCRIPT ABOVE
# BE CAREFUL AI GENERATED BASED ON BASH SCRIPT ABOVE
# BE CAREFUL AI GENERATED BASED ON BASH SCRIPT ABOVE
# BE CAREFUL AI GENERATED BASED ON BASH SCRIPT ABOVE
# MIT LICENSE: sample code only, review and only for demo env
#
import socket
import json
import subprocess
import time
from pathlib import Path
# --- Configuration ---
NS = "openshift-insights"
JOB_NAME = "insights-operator-job"
LOCAL_DIR = Path("./insights-data")

# The Job template embedded directly as a format string.
# Placeholders: {job_name} and {image_url}; literal YAML braces are doubled ({{}}).
GATHER_JOB_YAML = """
apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  annotations:
    config.openshift.io/inject-proxy: insights-operator
spec:
  backoffLimit: 6
  ttlSecondsAfterFinished: 600
  template:
    spec:
      restartPolicy: OnFailure
      serviceAccountName: operator
      # nodeSelector:
      #   beta.kubernetes.io/os: linux
      #   node-role.kubernetes.io/master: ""
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Exists
      - effect: NoExecute
        key: node.kubernetes.io/unreachable
        operator: Exists
        tolerationSeconds: 900
      - effect: NoExecute
        key: node.kubernetes.io/not-ready
        operator: Exists
        tolerationSeconds: 900
      volumes:
      - name: snapshots
        emptyDir: {{}}
      - name: service-ca-bundle
        configMap:
          name: service-ca-bundle
          optional: true
      initContainers:
      - name: insights-operator
        image: {image_url}
        terminationMessagePolicy: FallbackToLogsOnError
        volumeMounts:
        - name: snapshots
          mountPath: /var/lib/insights-operator
        - name: service-ca-bundle
          mountPath: /var/run/configmaps/service-ca-bundle
          readOnly: true
        ports:
        - containerPort: 8443
          name: https
        resources:
          requests:
            cpu: 10m
            memory: 70Mi
        args:
        - gather
        - -v=4
        - --config=/etc/insights-operator/server.yaml
      containers:
      - name: sleepy
        image: quay.io/openshift/origin-base:latest
        args:
        - /bin/sh
        - -c
        - sleep 10m
        volumeMounts: [{{name: snapshots, mountPath: /var/lib/insights-operator}}]
"""
def run_shell(cmd, input_data=None):
    """Run *cmd* through the shell; return stdout with surrounding whitespace removed.

    `input_data`, when given, is fed to the process on stdin as text.
    Errors are not raised: a failing command simply yields its (stripped) stdout.
    """
    completed = subprocess.run(cmd, shell=True, input=input_data,
                               capture_output=True, text=True)
    return completed.stdout.strip()
def run_oc(cmd, input_data=None):
    """Invoke the ``oc`` CLI with *cmd* appended; return stripped stdout.

    `input_data`, when given, is piped to ``oc`` on stdin (used for
    ``oc apply -f -``). Failures are silent: stdout is simply empty.
    """
    full_cmd = "oc " + cmd
    proc = subprocess.run(full_cmd, shell=True, input=input_data,
                          capture_output=True, text=True)
    return proc.stdout.strip()
def main():
    """Drive the insights gather Job via the ``oc`` CLI and copy results locally.

    CLI-based sibling of the kubernetes-api version: waits for any previous
    job pod to vanish, injects the live operator image into the template,
    applies it, waits for the gather init container, then uses ``oc cp``.
    """
    # 1. Cleanup: Ensure no old job is hanging around
    print(f"Checking for existing job: {JOB_NAME}...")
    while run_oc(f"get pod -n {NS} -l job-name={JOB_NAME} -o name"):
        # Deliberately loops forever; the operator must delete the job by hand.
        print(f"Job is busy. Please run: oc delete job -n {NS} {JOB_NAME}")
        time.sleep(3)

    # 2. Extract Image URL from the live deployment
    print("Fetching current insights-operator image...")
    deploy_raw = run_oc(f"get -n {NS} deployment insights-operator -o json")
    if not deploy_raw:
        print("Error: Could not find insights-operator deployment.")
        return
    img_url = json.loads(deploy_raw)['spec']['template']['spec']['containers'][0]['image']

    # 3. Apply the YAML with injected Image
    print("Deploying Gather Job...")
    formatted_yaml = GATHER_JOB_YAML.format(job_name=JOB_NAME, image_url=img_url)
    run_oc(f"apply -n {NS} -f -", input_data=formatted_yaml)

    # 4. Identify the specific Pod Name
    pod_name = ""
    while not pod_name:
        pod_name = run_oc(f"get pod -n {NS} -l job-name={JOB_NAME} -o custom-columns=:metadata.name --no-headers")
        time.sleep(1)
    print(f"Tracking Pod: {pod_name}")

    # 5. Wait for the Init Container to terminate with exit code 0
    print("Waiting for init container 'insights-operator' to complete...")
    # Single quotes keep the jsonpath braces away from the shell.
    exit_path = "'{.status.initContainerStatuses[0].state.terminated.exitCode}'"
    while True:
        exit_code = run_oc(f"get -n {NS} pod {pod_name} -o jsonpath={exit_path}")
        if exit_code == "0":
            print("✅ Data collection complete.")
            break
        # Check for error state; keep polling because OnFailure may restart it.
        if "error" in run_oc(f"get -n {NS} pod {pod_name}").lower():
            print("⚠️ Pod is in error state. Check logs.")
        time.sleep(5)

    # 6. Transfer Data locally
    LOCAL_DIR.mkdir(exist_ok=True)
    print(f"Downloading data to {LOCAL_DIR}...")
    run_oc(f"cp {NS}/{pod_name}:/var/lib/insights-operator {LOCAL_DIR}")

    # 7. Cleanup the remote Job
    run_oc(f"delete job -n {NS} {JOB_NAME}")

    # 8. Find the most recent .gz file (last 5 mins)
    now = time.time()
    gz_files = [f for f in LOCAL_DIR.rglob("*.gz") if (now - f.stat().st_mtime) < 300]
    if gz_files:
        # BUGFIX: pick the newest bundle by mtime; rglob order is arbitrary.
        latest_bundle = max(gz_files, key=lambda f: f.stat().st_mtime)
        hostname = socket.gethostname()
        print("\n" + "="*40)
        print(f"BUNDLE READY: {latest_bundle.name}")
        print("SUGGESTED TRANSFER:")
        print(f"scp {latest_bundle} user@host:~/collect/{hostname}")
        print("="*40)
    else:
        print("\n❌ No recent .gz bundle found in local directory.")


if __name__ == "__main__":
    main()
# Manual insights-operator gather via plain oc/jq/sed (bash original of the
# Python scripts above).
NS=openshift-insights
JOB=insights-operator-job
# Block until any previous run's pod is gone; the old job must be deleted by hand.
while [ -n "$(oc get pod -n $NS -l job-name=$JOB -o name)" ]; do
echo 'still busy..'
echo "consider: oc delete job -n $NS $JOB"
sleep 3
done
# Reuse the image of the live insights-operator deployment.
# NOTE(review): jq without -r keeps the surrounding double quotes; the quoted
# scalar is still valid YAML after the sed substitution below.
IMG=$(oc get -n $NS deployment insights-operator -o json | jq .spec.template.spec.containers[0].image)
# Inject the image into the template ({{INSIGHTS_IMG}} placeholder) and apply.
# Comma is used as the sed delimiter because image URLs contain slashes.
cat gather-job.yaml | sed "s,{{INSIGHTS_IMG}},$IMG," | oc apply -n $NS -f -
POD_NAME=$(oc get pod -n $NS -l job-name=$JOB -o custom-columns=:metadata.name --no-headers)
echo $NS $JOB $POD_NAME
# Poll until the gather init container terminates with exit code 0.
until oc get -n $NS pod "$POD_NAME" -o jsonpath='{.status.initContainerStatuses[0].state.terminated.exitCode}' | grep -q "^0$"; do
echo "waiting.."
oc get -n $NS pod "$POD_NAME" --no-headers
sleep 5
done
# Copy the collected snapshots out of the sleeping sidecar, then clean up.
oc cp $NS/$POD_NAME:/var/lib/insights-operator ./insights-data
oc delete job -n $NS $JOB
# find recent GZ bundle
GZ=$(find insights-data -iname '*.gz' -mmin -5)
echo scp $GZ user@host:~/collect/$(hostname)
# gather-job.yaml — Job template consumed by the bash driver above.
# {{INSIGHTS_IMG}} is substituted with the live operator image via sed.
# The init container performs the gather; the 'sleepy' sidecar keeps the pod
# alive so the shared emptyDir can be copied out with `oc cp`.
apiVersion: batch/v1
kind: Job
metadata:
  name: insights-operator-job
  annotations:
    config.openshift.io/inject-proxy: insights-operator
spec:
  backoffLimit: 6
  ttlSecondsAfterFinished: 600
  template:
    spec:
      restartPolicy: OnFailure
      serviceAccountName: operator
      # nodeSelector:
      #   beta.kubernetes.io/os: linux
      #   node-role.kubernetes.io/master: ""
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Exists
      - effect: NoExecute
        key: node.kubernetes.io/unreachable
        operator: Exists
        tolerationSeconds: 900
      - effect: NoExecute
        key: node.kubernetes.io/not-ready
        operator: Exists
        tolerationSeconds: 900
      volumes:
      - name: snapshots
        emptyDir: {}
      - name: service-ca-bundle
        configMap:
          name: service-ca-bundle
          optional: true
      initContainers:
      - name: insights-operator
        image: {{INSIGHTS_IMG}}
        terminationMessagePolicy: FallbackToLogsOnError
        volumeMounts:
        - name: snapshots
          mountPath: /var/lib/insights-operator
        - name: service-ca-bundle
          mountPath: /var/run/configmaps/service-ca-bundle
          readOnly: true
        ports:
        - containerPort: 8443
          name: https
        resources:
          requests:
            cpu: 10m
            memory: 70Mi
        args:
        - gather
        - -v=4
        - --config=/etc/insights-operator/server.yaml
      containers:
      - name: sleepy
        image: quay.io/openshift/origin-base:latest
        args:
        - /bin/sh
        - -c
        - sleep 10m
        volumeMounts: [{name: snapshots, mountPath: /var/lib/insights-operator}]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment