- Start with a GPU EC2 instance that has a service role attached whose policy allows putting metrics to Amazon CloudWatch (https://aws.amazon.com/blogs/machine-learning/monitoring-gpu-utilization-with-amazon-cloudwatch/).
- Put a bunch of 299x299 JPEG images in test-images/, e.g.:
  for run in {1..10}; do wget via.placeholder.com/299x299.jpg; done
- pip3 install nvidia-ml-py3
- Edit run.sh to run gpumon.py alongside another process (which is presumably using the GPU).
- Open CloudWatch in the AWS Console to view the graphs (a programmatic check with boto3 is sketched just below this list).
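If you prefer to verify from code rather than the console, here is a minimal, untested sketch using boto3 (the same client gpumon.py uses). It assumes the default namespace 'DeepLearningTrain' from gpumon.py below and that credentials and a default region are available, e.g. via the instance role and AWS_DEFAULT_REGION:

import boto3
from datetime import datetime, timedelta

# Assumes the instance role provides credentials and a default region is configured.
cloudwatch = boto3.client('cloudwatch')

# List the 'GPU Usage' metrics that gpumon.py has published, then pull the
# average over the last 15 minutes for each dimension combination (one per GPU).
metrics = cloudwatch.list_metrics(Namespace='DeepLearningTrain', MetricName='GPU Usage')['Metrics']
for metric in metrics:
    stats = cloudwatch.get_metric_statistics(
        Namespace='DeepLearningTrain',
        MetricName='GPU Usage',
        Dimensions=metric['Dimensions'],
        StartTime=datetime.utcnow() - timedelta(minutes=15),
        EndTime=datetime.utcnow(),
        Period=60,
        Statistics=['Average'],
    )
    for point in sorted(stats['Datapoints'], key=lambda p: p['Timestamp']):
        print(metric['Dimensions'], point['Timestamp'], round(point['Average'], 1))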
gpumon.py:
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import urllib.request

import boto3
from pynvml import *
from datetime import datetime
from time import sleep

### CHOOSE REGION ####
EC2_REGION = 'us-east-1'

### CHOOSE NAMESPACE PARAMETERS HERE ###
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ####
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
store_reso = 60

# Instance information (from the EC2 instance metadata service)
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib.request.urlopen(BASE_URL + 'instance-id').read().decode('utf-8')
IMAGE_ID = urllib.request.urlopen(BASE_URL + 'ami-id').read().decode('utf-8')
INSTANCE_TYPE = urllib.request.urlopen(BASE_URL + 'instance-type').read().decode('utf-8')
INSTANCE_AZ = urllib.request.urlopen(BASE_URL + 'placement/availability-zone').read().decode('utf-8')
EC2_REGION = INSTANCE_AZ[:-1]

TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
TMP_FILE = '/tmp/GPU_TEMP'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP

print(EC2_REGION)

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch
PUSH_TO_CW = True
def handleError(err):
    # Format an NVML error for logging: report unsupported queries as N/A,
    # otherwise return the error text.
    if err.value == NVML_ERROR_NOT_SUPPORTED:
        return 'N/A'
    return str(err)


def getPowerDraw(handle):
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0  # milliwatts -> watts
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr


def getTemp(handle):
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp


def getUtilization(handle):
    global PUSH_TO_CW
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        util = None
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util
def logResults(i, util, gpu_util, mem_util, powDrawStr, temp):
    try:
        with open(TMP_FILE_SAVED, 'a+') as gpu_logs:
            writeString = str(i) + ',' + gpu_util + ',' + mem_util + ',' + powDrawStr + ',' + temp + '\n'
            gpu_logs.write(writeString)
    except OSError:
        print('Error writing to file', TMP_FILE_SAVED)

    if PUSH_TO_CW:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            },
            {
                'Name': 'ImageId',
                'Value': IMAGE_ID
            },
            {
                'Name': 'InstanceType',
                'Value': INSTANCE_TYPE
            },
            {
                'Name': 'GPUNumber',
                'Value': str(i)
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
                {
                    'MetricName': 'Power Usage (Watts)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': float(powDrawStr)
                },
                {
                    'MetricName': 'Temperature (C)',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'None',
                    'StorageResolution': store_reso,
                    'Value': int(temp)
                },
            ],
            Namespace=my_NameSpace
        )
nvmlInit()
deviceCount = nvmlDeviceGetCount()


def main():
    global PUSH_TO_CW
    try:
        while True:
            PUSH_TO_CW = True
            # Find the metrics for each GPU on the instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                logResults(i, util, gpu_util, mem_util, powDrawStr, temp)
            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    main()
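Besides pushing to CloudWatch, logResults() appends a CSV line per GPU per interval to /tmp/GPU_TEMP<timestamp> in the order gpu_index, gpu_util, mem_util, power_watts, temp_c. A quick, hypothetical sketch for summarizing those local log files (not part of the gist):

import csv
import glob
from collections import defaultdict

# Summarize average GPU utilization per GPU from the local gpumon log files.
# Field order matches logResults(): gpu_index, gpu_util, mem_util, power_watts, temp_c.
per_gpu = defaultdict(list)
for path in glob.glob('/tmp/GPU_TEMP*'):
    with open(path) as f:
        for row in csv.reader(f):
            try:
                per_gpu[int(row[0])].append(float(row[1]))
            except (ValueError, IndexError):
                continue  # Skip 'N/A' or malformed lines

for gpu_index, utils in sorted(per_gpu.items()):
    print('GPU %d: mean util %.1f%% over %d samples' % (gpu_index, sum(utils) / len(utils), len(utils)))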
how-to-use-all-gpus-in-tensorflow.py:
import glob

import tensorflow as tf

# Get matching filenames
pattern = '../../test-images/*.jpg.*'  # A bunch of 299x299x3 jpegs
matching_filenames = glob.iglob(pattern)

# Pipeline settings
NUM_GPUS = 2
BATCH_SIZE = 32
INNER_LOOP_ITERS = 3
FEED_BATCH_SIZE = NUM_GPUS * BATCH_SIZE * INNER_LOOP_ITERS  # ONE batch will be split into NUM_GPUS batches, so make the size appropriate.

# Generate batches of images (jpeg encoded)
batches, batch = [], []
id_batches, id_batch = [], []
for i, filename in enumerate(matching_filenames):
    data = open(filename, 'rb').read()
    batch.append(data)
    id_batch.append(i)
    if len(batch) % FEED_BATCH_SIZE == 0:
        batches.append(batch)
        id_batches.append(id_batch)
        batch = []
        id_batch = []

###########################################
# Tensorflow pipeline for decoding images #
###########################################

# Create a dataset returning slices of `image_strings`
image_strings = tf.placeholder(tf.string, shape=[None])
id_ints = tf.placeholder(tf.int32, shape=[None])
dataset = tf.data.Dataset.from_tensor_slices((id_ints, image_strings))

# Parse every image in the dataset using `map`
def _parse_function(image_id, image_string):
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image = tf.cast(image_decoded, tf.float32)
    return image_id, image

dataset = dataset.map(_parse_function)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()

# Create iterator and final tensor
# a = tf.ones(shape=[9, 5, 5, 3])
# b = tf.random_uniform(shape=[9, 5, 3, 1])
# c = tf.matmul(a, b)  # shape = [9, 5, 5, 1]
iterator = dataset.make_initializable_iterator()
results = []
for i in range(NUM_GPUS):
    with tf.device('/gpu:{}'.format(i)):
        image_ids, images = iterator.get_next()
        W = tf.Variable(tf.random_normal(shape=(BATCH_SIZE, 299, 3, 1)))
        result = tf.matmul(images, W)
        results.append((image_ids, result))

init_op = tf.global_variables_initializer()

# Feed image data to the dataset pipeline
config = tf.ConfigProto(device_count={'GPU': NUM_GPUS}, allow_soft_placement=True, log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run(init_op)
    while True:
        for id_batch, batch in zip(id_batches, batches):
            # Initialize dataset iterator with new inputs (batch and id_batch both have `FEED_BATCH_SIZE` elements)
            sess.run(iterator.initializer, {image_strings: batch, id_ints: id_batch})
            for _ in range(INNER_LOOP_ITERS):
                all_results = sess.run(results)
                for result in all_results:
                    ids, vectors = result
                    print('Got ids:', ids)
run.sh:
#!/bin/bash

if pgrep -f "gpumon.py" &>/dev/null; then
    echo "gpumon already running"
else
    echo "gpumon not already running"
    python3 gpumon.py &
fi

python3 how-to-use-all-gpus-in-tensorflow.py
One caveat from the discussion: nvmlDeviceGetUtilizationRates only reports the instantaneous utilization rate (0-100) at the moment it is sampled, so it doesn't give a realistic picture when the GPU load isn't constant. nvmlDeviceGetSamples returns an array of recent samples, from which you can compute an actual average rate.
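A rough sketch of that approach, assuming the nvidia-ml-py3 binding exposes nvmlDeviceGetSamples and the NVML value-type constants as described in the NVML docs (not part of the gist):

from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetSamples, NVML_GPU_UTILIZATION_SAMPLES,
                    NVML_VALUE_TYPE_DOUBLE)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    # A last-seen timestamp of 0 asks for every sample still in the driver's buffer.
    val_type, samples = nvmlDeviceGetSamples(handle, NVML_GPU_UTILIZATION_SAMPLES, 0)
    values = []
    for s in samples:
        # GPU utilization samples are documented as unsigned ints; fall back to
        # the double field if the driver reports a different value type.
        if val_type == NVML_VALUE_TYPE_DOUBLE:
            values.append(s.sampleValue.dVal)
        else:
            values.append(s.sampleValue.uiVal)
    if values:
        print('Average GPU utilization over %d samples: %.1f%%' % (len(values), sum(values) / len(values)))
finally:
    nvmlShutdown()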