Last active
January 6, 2025 15:12
-
-
Save DaisukeMiyamoto/7f73f618b5885a2113eb52b0af62f19b to your computer and use it in GitHub Desktop.
Set up a CloudWatch GPU monitor for AWS ParallelCluster
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Set up a CloudWatch GPU monitor on an AWS ParallelCluster node.
#
# On a node where `nvidia-smi` succeeds, this script:
#   1. installs the Python packages boto3 and pynvml,
#   2. downloads gpumon.py from the AWS big-data blog S3 bucket,
#   3. rewrites its region, namespace and instance identifier with sed,
#   4. installs the result as /usr/local/bin/gpumon.py, and
#   5. writes, starts and enables a `gpumon` systemd service.
# A node where `nvidia-smi` fails is left untouched and the script exits 0.
#
# Inputs:  stack_name and cfn_region, expected to be defined by
#          /etc/parallelcluster/cfnconfig.
# Needs:   permission to write /usr/local/bin and /etc/systemd/system.
# Exit:    0 on success or when no GPU is detected; 1 on any failed step.

# Trace every command. Set here rather than on the shebang so tracing is kept
# when the script is started as `bash script.sh`.
set -x

readonly CFN_CONFIG=/etc/parallelcluster/cfnconfig
readonly GPUMON_URL=https://s3.amazonaws.com/aws-bigdata-blog/artifacts/GPUMonitoring/gpumon.py
readonly GPUMON_DEST=/usr/local/bin/gpumon.py
readonly GPUMON_UNIT=/etc/systemd/system/gpumon.service

# Scratch directory created by main(); removed by cleanup() on exit.
WORK_DIR=''

# Print an error to stderr and abort the script with status 1.
die() {
  printf 'gpumon setup: %s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory, if one was created.
cleanup() {
  if [[ -n "${WORK_DIR}" ]]; then
    rm -rf -- "${WORK_DIR}"
  fi
}

# Print $1 with every occurrence of "parallelcluster-" removed.
cluster_name_from_stack() {
  printf '%s\n' "${1//parallelcluster-/}"
}

# Print $1 escaped for use on the replacement side of a sed `s/…/…/` command
# (backslash, the `/` delimiter and `&` are prefixed with a backslash).
escape_sed_replacement() {
  printf '%s\n' "$1" | sed -e 's/[\/&]/\\&/g'
}

# Write the customised gpumon.py to stdout.
# Arguments:
#   $1 - path to the downloaded gpumon.py
#   $2 - AWS region, already escaped for sed
#   $3 - instance name to report, already escaped for sed
render_gpumon() {
  local src=$1
  local region=$2
  local instance_name=$3

  # The first expression replaces the text with itself: it is a no-op kept as
  # the place to edit if a different sleep interval is wanted.
  sed \
    -e "s/sleep_interval = 10/sleep_interval = 10/g" \
    -e "s/EC2_REGION = 'us-east-1'/EC2_REGION = '${region}'/g" \
    -e "s/my_NameSpace = 'DeepLearningTrain'/my_NameSpace = 'ParallelCluster-GPU'/g" \
    -e "s/INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()/INSTANCE_ID = '${instance_name}'/g" \
    -e "s/ 'Name': 'InstanceId',/ 'Name': 'InstanceName',/g" \
    -- "${src}"
}

# Write the gpumon systemd unit to the path given as $1.
write_gpumon_unit() {
  cat <<'EOS' > "$1"
[Unit]
Description = gpumon
[Service]
ExecStart =/bin/python /usr/local/bin/gpumon.py
Restart = always
Type = simple
[Install]
WantedBy = multi-user.target
EOS
}

# Entry point: install and start the monitor when a GPU is present.
main() {
  # nvidia-smi failing (or missing) means there is nothing to monitor.
  if ! nvidia-smi; then
    return 0
  fi

  # The cluster settings are only needed on GPU nodes, so load them here.
  # shellcheck source=/dev/null
  . "${CFN_CONFIG}" || die "cannot read ${CFN_CONFIG}"
  : "${stack_name:?stack_name is not set by ${CFN_CONFIG}}"
  : "${cfn_region:?cfn_region is not set by ${CFN_CONFIG}}"

  local cluster_name instance_name region
  cluster_name=$(cluster_name_from_stack "${stack_name}")
  instance_name=$(escape_sed_replacement "${cluster_name}-$(hostname)") \
    || die "cannot build instance name"
  region=$(escape_sed_replacement "${cfn_region}") || die "cannot read region"

  sudo pip install boto3 pynvml || die "pip install of boto3/pynvml failed"

  # Download into a private scratch directory. `wget -P /tmp` would keep an
  # existing /tmp/gpumon.py and save the new copy as gpumon.py.1, so a re-run
  # would silently customise the stale file.
  WORK_DIR=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT
  local downloaded="${WORK_DIR}/gpumon.py"
  local rendered="${WORK_DIR}/gpumon.rendered.py"

  wget -O "${downloaded}" "${GPUMON_URL}" || die "download of ${GPUMON_URL} failed"

  # Render to scratch first so a failure never truncates the installed copy.
  render_gpumon "${downloaded}" "${region}" "${instance_name}" > "${rendered}" \
    || die "customising gpumon.py failed"
  [[ -s "${rendered}" ]] || die "customised gpumon.py is empty"
  install -m 0644 -- "${rendered}" "${GPUMON_DEST}" \
    || die "cannot install ${GPUMON_DEST}"

  write_gpumon_unit "${GPUMON_UNIT}" || die "cannot write ${GPUMON_UNIT}"

  sudo systemctl daemon-reload || die "systemctl daemon-reload failed"
  sudo systemctl start gpumon || die "cannot start gpumon"
  # Informational only: shows the service state in the trace output.
  sudo systemctl status gpumon
  sudo systemctl enable gpumon || die "cannot enable gpumon"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment