- Start with a GPU EC2 instance that has a service role attached whose policy allows putting metrics to Amazon CloudWatch (https://aws.amazon.com/blogs/machine-learning/monitoring-gpu-utilization-with-amazon-cloudwatch/).
- Put a bunch of 299x299 JPEG images in test-images/, e.g.:
  for run in {1..10}; do wget via.placeholder.com/299x299.jpg; done
- pip3 install nvidia-ml-py3
- Edit run.sh to run gpumon.py alongside another process (which is presumably using the GPU).
- Open CloudWatch in the AWS Console to view the graphs (a programmatic check with boto3 is sketched just below this list).
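If you prefer to verify from code rather than the console, here is a minimal, untested sketch using boto3 (the same client gpumon.py uses). It assumes the default namespace 'DeepLearningTrain' from gpumon.py below and that credentials and a default region are available, e.g. via the instance role and AWS_DEFAULT_REGION:

import boto3
from datetime import datetime, timedelta

# Assumes the instance role provides credentials and a default region is configured.
cloudwatch = boto3.client('cloudwatch')

# List the 'GPU Usage' metrics that gpumon.py has published, then pull the
# average over the last 15 minutes for each dimension combination (one per GPU).
metrics = cloudwatch.list_metrics(Namespace='DeepLearningTrain', MetricName='GPU Usage')['Metrics']
for metric in metrics:
    stats = cloudwatch.get_metric_statistics(
        Namespace='DeepLearningTrain',
        MetricName='GPU Usage',
        Dimensions=metric['Dimensions'],
        StartTime=datetime.utcnow() - timedelta(minutes=15),
        EndTime=datetime.utcnow(),
        Period=60,
        Statistics=['Average'],
    )
    for point in sorted(stats['Datapoints'], key=lambda p: p['Timestamp']):
        print(metric['Dimensions'], point['Timestamp'], round(point['Average'], 1))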
gpumon.py:
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import urllib.request

import boto3
from pynvml import *
from datetime import datetime
from time import sleep

### CHOOSE REGION ####
EC2_REGION = 'us-east-1'

### CHOOSE NAMESPACE PARAMETERS HERE ###
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ####
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
store_reso = 60

# Instance information (from the EC2 instance metadata service)
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib.request.urlopen(BASE_URL + 'instance-id').read().decode('utf-8')
IMAGE_ID = urllib.request.urlopen(BASE_URL + 'ami-id').read().decode('utf-8')
INSTANCE_TYPE = urllib.request.urlopen(BASE_URL + 'instance-type').read().decode('utf-8')
INSTANCE_AZ = urllib.request.urlopen(BASE_URL + 'placement/availability-zone').read().decode('utf-8')
EC2_REGION = INSTANCE_AZ[:-1]

TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP

print(EC2_REGION)

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch
PUSH_TO_CW = True
def handleError(err):
    # Format an NVML error for logging: report unsupported queries as N/A,
    # otherwise return the error text.
    if err.value == NVML_ERROR_NOT_SUPPORTED:
        return 'N/A'
    return str(err)


def getPowerDraw(handle):
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0  # milliwatts -> watts
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr


def getTemp(handle):
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp


def getUtilization(handle):
    global PUSH_TO_CW
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        util = None
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util
def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    try:
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            writeString = str(i) + ',' + gpu_util + ',' + mem_util + ',' + powDrawStr + ',' + temp + '\n'
            gpu_logs.write(writeString)
    except OSError:
        print('Error writing to file', TMP_FILE_SAVED)

    if PUSH_TO_CW:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            },
            {
                'Name': 'ImageId',
                'Value': IMAGE_ID
            },
            {
                'Name': 'InstanceType',
                'Value': INSTANCE_TYPE
            },
            {
                'Name': 'GPUNumber',
                'Value': str(i)
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
                {
                    'MetricName': 'Power Usage (Watts)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': float(powDrawStr)
                },
                {
                    'MetricName': 'Temperature (C)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': int(temp)
                },
            ],
            Namespace=my_NameSpace
        )
nvmlInit()
deviceCount = nvmlDeviceGetCount()


def main():
    global PUSH_TO_CW
    try:
        while True:
            PUSH_TO_CW = True
            # Find the metrics for each GPU on the instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)
            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    main()
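Besides pushing to CloudWatch, logResults() appends a CSV line per GPU per interval to /tmp/GPU_TEMP<timestamp> in the order gpu_index, gpu_util, mem_util, power_watts, temp_c. A quick, hypothetical sketch for summarizing those local log files (not part of the gist):

import csv
import glob
from collections import defaultdict

# Summarize average GPU utilization per GPU from the local gpumon log files.
# Field order matches logResults(): gpu_index, gpu_util, mem_util, power_watts, temp_c.
per_gpu = defaultdict(list)
for path in glob.glob('/tmp/GPU_TEMP*'):
    with open(path) as f:
        for row in csv.reader(f):
            try:
                per_gpu[int(row[0])].append(float(row[1]))
            except (ValueError, IndexError):
                continue  # Skip 'N/A' or malformed lines

for gpu_index, utils in sorted(per_gpu.items()):
    print('GPU %d: mean util %.1f%% over %d samples' % (gpu_index, sum(utils) / len(utils), len(utils)))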
how-to-use-all-gpus-in-tensorflow.py:
import glob

import tensorflow as tf

# Get matching filenames
pattern = '../../test-images/*.jpg.*'  # A bunch of 299x299x3 jpegs
matching_filenames = glob.iglob(pattern)

# Pipeline settings
NUM_GPUS = 2
BATCH_SIZE = 32
INNER_LOOP_ITERS = 3
FEED_BATCH_SIZE = NUM_GPUS * BATCH_SIZE * INNER_LOOP_ITERS  # ONE batch will be split into NUM_GPUS batches, so make the size appropriate.

# Generate batches of images (jpeg encoded)
batches, batch = [], []
id_batches, id_batch = [], []
for i, filename in enumerate(matching_filenames):
    data = open(filename, 'rb').read()
    batch.append(data)
    id_batch.append(i)
    if len(batch) % FEED_BATCH_SIZE == 0:
        batches.append(batch)
        id_batches.append(id_batch)
        batch = []
        id_batch = []

###########################################
# Tensorflow pipeline for decoding images #
###########################################

# Create a dataset returning slices of `image_strings`
image_strings = tf.placeholder(tf.string, shape=[None])
id_ints = tf.placeholder(tf.int32, shape=[None])
dataset = tf.data.Dataset.from_tensor_slices((id_ints, image_strings))

# Parse every image in the dataset using `map`
def _parse_function(image_id, image_string):
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image = tf.cast(image_decoded, tf.float32)
    return image_id, image

dataset = dataset.map(_parse_function)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()

# Create iterator and final tensor
# a = tf.ones(shape=[9, 5, 5, 3])
# b = tf.random_uniform(shape=[9, 5, 3, 1])
# c = tf.matmul(a, b)  # shape = [9, 5, 5, 1]
iterator = dataset.make_initializable_iterator()
results = []
for i in range(NUM_GPUS):
    with tf.device('/gpu:{}'.format(i)):
        image_ids, images = iterator.get_next()
        W = tf.Variable(tf.random_normal(shape=(BATCH_SIZE, 299, 3, 1)))
        result = tf.matmul(images, W)
        results.append((image_ids, result))

init_op = tf.global_variables_initializer()

# Feed image data to the dataset pipeline
config = tf.ConfigProto(device_count={'GPU': NUM_GPUS}, allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run(init_op)
    while True:
        for id_batch, batch in zip(id_batches, batches):
            # Initialize dataset iterator with new inputs (batch and id_batch both have `FEED_BATCH_SIZE` elements)
            sess.run(iterator.initializer, {image_strings: batch, id_ints: id_batch})
            for _ in range(INNER_LOOP_ITERS):
                all_results = sess.run(results)
                for result in all_results:
                    ids, vectors = result
                    print('Got ids:', ids)
run.sh:
#!/bin/bash

if pgrep -f "gpumon.py" &>/dev/null; then
    echo "gpumon already running"
else
    echo "gpumon not already running"
    python3 gpumon.py &
fi

python3 how-to-use-all-gpus-in-tensorflow.py
One caveat from the discussion: nvmlDeviceGetUtilizationRates only reports the instantaneous utilization rate (0-100) at the moment it is sampled, so it doesn't give a realistic picture when the GPU load isn't constant. nvmlDeviceGetSamples returns an array of recent samples, from which you can compute an actual average rate.
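A rough sketch of that approach, assuming the nvidia-ml-py3 binding exposes nvmlDeviceGetSamples and the NVML value-type constants as described in the NVML docs (not part of the gist):

from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetSamples, NVML_GPU_UTILIZATION_SAMPLES,
                    NVML_VALUE_TYPE_DOUBLE)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    # A last-seen timestamp of 0 asks for every sample still in the driver's buffer.
    val_type, samples = nvmlDeviceGetSamples(handle, NVML_GPU_UTILIZATION_SAMPLES, 0)
    values = []
    for s in samples:
        # GPU utilization samples are documented as unsigned ints; fall back to
        # the double field if the driver reports a different value type.
        if val_type == NVML_VALUE_TYPE_DOUBLE:
            values.append(s.sampleValue.dVal)
        else:
            values.append(s.sampleValue.uiVal)
    if values:
        print('Average GPU utilization over %d samples: %.1f%%' % (len(values), sum(values) / len(values)))
finally:
    nvmlShutdown()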