python --version
python3 --version
| module github.com/pfnet-research/nvidia-create-symlinks | |
| go 1.19 | |
| require ( | |
| github.com/NVIDIA/nvidia-container-toolkit v1.12.0-rc.2.0.20230127101129-9fc2c5912242 // indirect | |
| github.com/cpuguy83/go-md2man/v2 v2.0.1 // indirect | |
| github.com/fsnotify/fsnotify v1.5.4 // indirect | |
| github.com/russross/blackfriday/v2 v2.1.0 // indirect | |
| github.com/sirupsen/logrus v1.9.0 // indirect |
NOTE: This seems to have fixed our cluster. However, I do see some people still reporting the same issue with cgroup v2, for example here. So YMMV.
DISCLAIMER: This seems to work in our environment, but it may not work in others. I'm still not sure what the real root cause(s) are. I'm not even 100% sure it fully fixes the problem in our environment, although it has been good for 2 weeks. If it reappears (for example, under certain use cases such as high load), I'll be doomed.
Switching to cgroup v2 seems to have fixed the issue of NVML suddenly going away in pods.
| # Inspired by https://github.com/vpenso/ganglia-sensors/blob/master/lib/python_modules/infiniband.py#/ | |
| import logging | |
| import re | |
| import sys | |
| import json | |
| import time | |
| import subprocess |
| package main | |
| import ( | |
| "bytes" | |
| "fmt" | |
| "io" | |
| "log" | |
| "net" | |
| "regexp" | |
| "strings" |
| #!/bin/bash | |
| set -o errexit | |
| set -o xtrace | |
| main() { | |
| local namespaces=$(list_namespaces) | |
| for namespace in $namespaces; do | |
| local tasks=$(list_tasks $namespace) |
| // C++ includes used for precompiling -*- C++ -*- | |
| // Copyright (C) 2003-2015 Free Software Foundation, Inc. | |
| // | |
| // This file is part of the GNU ISO C++ Library. This library is free | |
| // software; you can redistribute it and/or modify it under the | |
| // terms of the GNU General Public License as published by the | |
| // Free Software Foundation; either version 3, or (at your option) | |
| // any later version. |
| # -*- coding: utf-8 -*- | |
| """Example how to measure average time taken per batch. | |
| """ | |
| import time | |
| import numpy as np | |
| import pandas as pd | |
| from tensorflow.keras.models import Model | |
| from tensorflow.keras.layers import Input, Dense |
| #!/usr/bin/python | |
| bpf_text = """ | |
| #include <linux/ptrace.h> | |
| #include <linux/sched.h> /* For TASK_COMM_LEN */ | |
| #include <linux/icmp.h> | |
| #include <linux/netdevice.h> | |
| struct probe_icmp_data_t | |
| { |