PyTorch Hacks
import torch
import torchsummary
torch.set_default_tensor_type(torch.FloatTensor)
from pynvml import *
try: nvmlInit()
except:pass
class VoxelNetFeatOrig(torch.nn.Module):

    def __init__(self, verbose=False):
        super(VoxelNetFeatOrig, self).__init__()
        self.verbose = verbose

        self.backbone_block1 = torch.nn.Sequential()
        self.backbone_block1.add_module("BBone1_Conv3D", torch.nn.Conv3d(in_channels=1, out_channels=32, kernel_size=(7,7,7), padding=(3,3,3), bias=False))
        self.backbone_block1.add_module("BBone1_MPool3D", torch.nn.MaxPool3d(kernel_size=(2, 2, 2)))
        self.backbone_block1.add_module("BBone1_BN3D", torch.nn.BatchNorm3d(num_features=32))
        self.backbone_block1.add_module("BBone1_LRelu", torch.nn.ReLU(inplace=True))

        self.backbone_block2 = torch.nn.Sequential()
        self.backbone_block2.add_module("BBone2_Conv3D", torch.nn.Conv3d(in_channels=32, out_channels=64, kernel_size=(5,5,5), padding=(2,2,2), bias=False))
        self.backbone_block2.add_module("BBone2_MPool3D", torch.nn.MaxPool3d(kernel_size=(2, 2, 2)))
        self.backbone_block2.add_module("BBone2_BN3D", torch.nn.BatchNorm3d(num_features=64))
        self.backbone_block2.add_module("BBone2_LRelu", torch.nn.ReLU(inplace=True))

        self.backbone_block3 = torch.nn.Sequential()
        self.backbone_block3.add_module("BBone3_Conv3D", torch.nn.Conv3d(in_channels=64, out_channels=128, kernel_size=(3,3,3), padding=(1,1,1), bias=False))
        self.backbone_block3.add_module("BBone3_BN3D", torch.nn.BatchNorm3d(num_features=128))
        self.backbone_block3.add_module("BBone3_LRelu", torch.nn.ReLU(inplace=True))

    def forward(self, x):
        if self.verbose: print (' ---- [VoxelNetFeatOrig] x: ', x.shape)
        x = self.backbone_block1(x)
        if self.verbose: print (' ---- [VoxelNetFeatOrig] backbone_block1(x): ', x.shape)
        x = self.backbone_block2(x)
        if self.verbose: print (' ---- [VoxelNetFeatOrig] backbone_block2(x): ', x.shape)
        x_backbone = self.backbone_block3(x)
        if self.verbose: print (' ---- [VoxelNetFeatOrig] backbone_block3(x): ', x_backbone.shape)
        return x_backbone
class VoxelNetJointOrig(torch.nn.Module):

    def __init__(self, count_joints, in_channels, inter_channels, tag):  # tag='Joints1'
        super(VoxelNetJointOrig, self).__init__()
        self.count_joints = count_joints

        self.task_joints = torch.nn.Sequential()
        self.task_joints.add_module(tag + "_Conv3D_1", torch.nn.Conv3d(in_channels=in_channels, out_channels=inter_channels, kernel_size=(3,3,3), padding=(1,1,1)))
        self.task_joints.add_module(tag + "_LRelu_1", torch.nn.ReLU(inplace=True))
        self.task_joints.add_module(tag + "_Conv3D_2", torch.nn.Conv3d(in_channels=inter_channels, out_channels=inter_channels, kernel_size=(3,3,3), padding=(1,1,1)))
        self.task_joints.add_module(tag + "_LRelu_2", torch.nn.ReLU(inplace=True))
        self.task_joints.add_module(tag + "_Conv3D_3", torch.nn.Conv3d(in_channels=inter_channels, out_channels=inter_channels, kernel_size=(3,3,3), padding=(1,1,1)))
        self.task_joints.add_module(tag + "_LRelu_3", torch.nn.ReLU(inplace=True))
        self.task_joints.add_module(tag + "_Conv3D_4", torch.nn.Conv3d(in_channels=inter_channels, out_channels=inter_channels, kernel_size=(1,1,1), padding=(0,0,0)))
        self.task_joints.add_module(tag + "_LRelu_4", torch.nn.ReLU(inplace=True))
        self.task_joints.add_module(tag + "_Conv3D_5", torch.nn.Conv3d(in_channels=inter_channels, out_channels=self.count_joints, kernel_size=(1,1,1), padding=(0,0,0)))

    def forward(self, x):
        return self.task_joints(x)
class VoxelNetBPartOrig(torch.nn.Module):

    def __init__(self, count_bparts, in_channels, inter_channels, tag):  # tag='BParts1'
        super(VoxelNetBPartOrig, self).__init__()
        self.count_bparts = count_bparts

        self.task_bparts = torch.nn.Sequential()
        self.task_bparts.add_module(tag + "_Conv3D_1", torch.nn.Conv3d(in_channels=in_channels, out_channels=inter_channels, kernel_size=(3,3,3), padding=(1,1,1)))
        self.task_bparts.add_module(tag + "_LRelu_1", torch.nn.ReLU(inplace=True))
        self.task_bparts.add_module(tag + "_Conv3D_2", torch.nn.Conv3d(in_channels=inter_channels, out_channels=inter_channels, kernel_size=(3,3,3), padding=(1,1,1)))
        self.task_bparts.add_module(tag + "_LRelu_2", torch.nn.ReLU(inplace=True))
        self.task_bparts.add_module(tag + "_Conv3D_3", torch.nn.Conv3d(in_channels=inter_channels, out_channels=inter_channels, kernel_size=(3,3,3), padding=(1,1,1)))
        self.task_bparts.add_module(tag + "_LRelu_3", torch.nn.ReLU(inplace=True))
        self.task_bparts.add_module(tag + "_Conv3D_4", torch.nn.Conv3d(in_channels=inter_channels, out_channels=inter_channels, kernel_size=(1,1,1), padding=(0,0,0)))
        self.task_bparts.add_module(tag + "_LRelu_4", torch.nn.ReLU(inplace=True))
        self.task_bparts.add_module(tag + "_Conv3D_5", torch.nn.Conv3d(in_channels=inter_channels, out_channels=self.count_bparts, kernel_size=(1,1,1), padding=(0,0,0)))

    def forward(self, x):
        return self.task_bparts(x)
def get_interpolated(x, align_corners_interpolate=False):
    return torch.nn.functional.interpolate(x, scale_factor=4, mode='trilinear', align_corners=align_corners_interpolate)

def print_gpustats():
    import os
    info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(int(os.environ['CUDA_VISIBLE_DEVICES'])))
    str_gpu = '%.4f' % (info.used/1024/1024/1000) + '/' + '%.4f' % (info.total/1024/1024/1000) + ' GB'
    print (' - GPU: ' + str_gpu)
class VoxelNetOrig(torch.nn.Module):

    def __init__(self, count_joints, count_bparts, iters=1
                    , output='raw', nonlinearity_order='first', batch_norm=True
                    , net_interpolate=False, inference_only=False, align_corners_interpolate=False):
        super(VoxelNetOrig, self).__init__()
        self.count_joints = count_joints
        self.count_bparts = count_bparts
        self.net_interpolate = net_interpolate
        self.align_corners_interpolate = align_corners_interpolate
        self.iters = iters

        self.x_backbone_net = VoxelNetFeatOrig().cuda()
        self.x_joints1_net = VoxelNetJointOrig(self.count_joints, in_channels=128, inter_channels=128, tag="Joints1").cuda()
        self.x_bparts1_net = VoxelNetBPartOrig(self.count_bparts, in_channels=128, inter_channels=128, tag="BParts1").cuda()
        if self.iters == 3:
            self.x_joints2_net = VoxelNetJointOrig(self.count_joints, in_channels=128 + self.count_joints + self.count_bparts, inter_channels=128, tag="Joints2").cuda()
            self.x_bparts2_net = VoxelNetBPartOrig(self.count_bparts, in_channels=128 + self.count_joints + self.count_bparts, inter_channels=128, tag="BParts2").cuda()
            self.x_joints3_net = VoxelNetJointOrig(self.count_joints, in_channels=128 + self.count_joints + self.count_bparts, inter_channels=128, tag="Joints3").cuda()
            self.x_bparts3_net = VoxelNetBPartOrig(self.count_bparts, in_channels=128 + self.count_joints + self.count_bparts, inter_channels=128, tag="BParts3").cuda()

    def forward(self, x):
        x_backbone = self.x_backbone_net(x)
        x_joints1 = self.x_joints1_net(x_backbone)
        x_bparts1 = self.x_bparts1_net(x_backbone)

        if self.iters == 1:
            if self.net_interpolate is False:
                return [], [x_joints1], [x_bparts1]
            else:
                x_joints1_interpolate = get_interpolated(x_joints1)
                x_bparts1_interpolate = get_interpolated(x_bparts1)
                return x_backbone, [x_joints1, x_joints1_interpolate], [x_bparts1, x_bparts1_interpolate]

        if self.iters == 3:
            x_iter1_op = torch.cat([x_backbone, x_joints1, x_bparts1], dim=1)
            x_joints2 = self.x_joints2_net(x_iter1_op)
            x_bparts2 = self.x_bparts2_net(x_iter1_op)
            x_iter2_op = torch.cat([x_backbone, x_joints2, x_bparts2], dim=1)
            x_joints3 = self.x_joints3_net(x_iter2_op)
            x_bparts3 = self.x_bparts3_net(x_iter2_op)

            if self.net_interpolate is False:
                return [], [x_joints1, x_joints2, x_joints3], [x_bparts1, x_bparts2, x_bparts3]
            else:
                x_joints1_interpolate = get_interpolated(x_joints1)
                x_bparts1_interpolate = get_interpolated(x_bparts1)
                return [], [x_joints1, x_joints1_interpolate, x_joints2, [], x_joints3, []], [x_bparts1, x_bparts1_interpolate, x_bparts2, [], x_bparts3, []]
if __name__ == "__main__":
    x = torch.rand(1,1,256,256,160).float()
    if 1:
        net = VoxelNetOrig(count_joints=8, count_bparts=7).cuda()
        if 0:
            x_backbone, x_joints, x_bparts = net(x.cuda())
            print (' - [VoxelNetOrig] x_joints1: ', x_joints[0].shape, ' || x_bparts: ', x_bparts[0].shape)
            print_gpustats()
        else:
            from torchsummary import summary
            summary(net.cuda(), input_size=(1,256,256,160))
import sys
import torch
import torchvision
torch.set_default_tensor_type(torch.FloatTensor)

if not torch.cuda.is_available():
    print (' - [ERROR] CUDA is not available!')
    USE_CUDA = False
    sys.exit(1)
else:
    USE_CUDA = True
class DNN(torch.nn.Module):

    def __init__(self, dim_in, dim_out):
        super(DNN, self).__init__()
        self.fc1 = torch.nn.Linear(dim_in, 256)
        self.fc2 = torch.nn.Linear(256, 256)
        self.fc3 = torch.nn.Linear(256, dim_out)

    def forward(self, x):
        x = torch.nn.functional.relu(self.fc1(x))
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.fc3(x)
        return x
class DQN(torch.nn.Module):

    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = torch.nn.BatchNorm2d(16)
        self.conv2 = torch.nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = torch.nn.BatchNorm2d(32)
        self.conv3 = torch.nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = torch.nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = torch.nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = torch.nn.functional.relu(self.bn1(self.conv1(x)))
        x = torch.nn.functional.relu(self.bn2(self.conv2(x)))
        x = torch.nn.functional.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))
# Running a python script as background process
# CUDA_VISIBLE_DEVICES=1 nohup python3 -u <yourfile>.py > <outputfile>.log &

Package Environment

To create an environment config file, one may use conda.

Steps

1. Ensure you have conda installed
   - `conda --version`
   - `conda init bash` OR `conda init powershell`
     - Restart the terminal
2. Create a conda env, install packages and export the env-config file (a sample of the exported file is sketched after this list)
   - `conda create --name <env_name> python=3.7` (works on both Unix and Windows)
   - `conda activate <env_name>`
   - `conda install torch pynvml open3d==0.9.0.0`
   - `conda env export --no-builds > conda_env_config.yml`
     - Make sure to remove the platform-specific packages
       - Windows: [vc, vs2015_runtime, wincertstore]
   - `conda deactivate`
3. Share the env-config file and recreate the env on a new machine
   - `conda env create --file conda_env_config.yml --name <env_name>`
     - To export a config file without the machine-specific `prefix:` line
       - Windows: `conda env export --no-builds | findstr -v "prefix" > environment.yml`
         - `Get-Content .\environment.yml | Set-Content -Encoding utf8 .\environment1.yml` (re-encode as UTF-8)
       - Unix: `conda env export --no-builds | grep -v "prefix" > environment.yml`
   - `conda activate <env_name>`
4. Delete a conda env
   - `conda env list`
   - `conda env remove -n <env_name>`
5. Other conda commands
   - `conda update conda`
     - As of Aug 2020: conda version 4.8.3, conda-build version 3.18.11
   - `conda env export > environment.yml`
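A hypothetical excerpt of what the exported `conda_env_config.yml` might look like; the package names, versions and path below are illustrative placeholders, not an actual export:

```yaml
# Illustrative excerpt of a `conda env export --no-builds` output.
name: <env_name>
channels:
  - defaults
  - conda-forge
dependencies:
  - python=3.7
  - pynvml=8.0.4          # version shown only as an example
  - pip:
      - open3d==0.9.0.0
# The machine-specific line that the findstr/grep commands above strip out:
prefix: /home/<user>/miniconda3/envs/<env_name>
```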
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = torch.cuda.is_available()
if __name__ == "__main__":
    print(' - cuda', torch.cuda.current_device(), torch.cuda.device_count(), torch.cuda.get_device_name(0))
##################### memory footprint support libraries/code #####################
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

printm()
#####################
# ! pip install jupyter_contrib_nbextensions
# ! jupyter contrib nbextension install --user
# !jupyter nbextension enable codefolding/main
!jupyter nbextension enable hinterland/hinterland
# Pytorch intricacies
1. [Accumulating gradients to get a large batch size on a small GPU](https://discuss.pytorch.org/t/how-to-implement-accumulated-gradient-in-pytorch-i-e-iter-size-in-caffe-prototxt/2522/4) (a minimal sketch is given below)
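A minimal sketch of the accumulated-gradient idea, using a hypothetical toy model and random data only so the snippet runs: the loss is divided by `accum_steps`, gradients pile up in `param.grad` over several `backward()` calls, and `optimizer.step()` fires once per `accum_steps` mini-batches, emulating a batch size of `accum_steps * per-step batch size`.

```python
import torch

# Hypothetical toy setup, only to make the sketch self-contained.
model     = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()
data      = [(torch.rand(8, 10), torch.randint(0, 2, (8,))) for _ in range(8)]

accum_steps = 4  # effective batch size = 4 * 8 = 32

optimizer.zero_grad()
for i, (x, y) in enumerate(data):
    loss = criterion(model(x), y) / accum_steps  # scale so accumulated grads match one large batch
    loss.backward()                              # grads accumulate in param.grad until zero_grad()
    if (i + 1) % accum_steps == 0:
        optimizer.step()                         # one update per accum_steps mini-batches
        optimizer.zero_grad()
```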
# Installation
1. Check CUDA version
- `nvcc --version`
2. Install the matching [pytorch](https://pytorch.org/resources) build for that CUDA version
3. Open python console
- `import torch`
- `torch.cuda.get_device_name(0)`
- `torch.cuda.is_available()`
- `watch -n 0.1 nvidia-smi` (run in a separate terminal to watch GPU usage live)
- `torch.rand(3,3).cuda()` # a python process should now appear in the nvidia-smi process list (a combined check is sketched below)
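The same console checks rolled into one small script; a minimal sketch, assuming pytorch was installed for the CUDA version reported by `nvcc --version`:

```python
import torch

if __name__ == "__main__":
    print('CUDA available :', torch.cuda.is_available())
    if torch.cuda.is_available():
        print('Device name    :', torch.cuda.get_device_name(0))
        x = torch.rand(3, 3).cuda()   # the allocation shows up under `watch -n 0.1 nvidia-smi`
        print('Tensor device  :', x.device)
```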
from torchsummary import summary
if __name__ == "__main__":
    # your_model: an instantiated torch.nn.Module
    summary(your_model.to("cpu"), input_size=(3, 448, 448))
    summary(your_model.cuda(), input_size=(3, 448, 448))
"""
CUDA_VISIBLE_DEVICES="" python model_investigate.py
"""
import os
import torch
import hiddenlayer as hl  # pip install hiddenlayer

if __name__ == "__main__":
    transforms = [
        # Fold Conv, BN, RELU layers into one
        hl.transforms.Fold("Conv > BatchNorm > LeakyRelu", "ConvBnRelu"),
        hl.transforms.Fold("ConvBnRelu > MaxPool", "ConvBnReluMax"),
        hl.transforms.Fold("Constant > Reshape > Transpose", "ConstantReshape")
    ]
    # model: an instantiated torch.nn.Module (here a YOLOv2-style net)
    g = hl.build_graph(model, torch.zeros([1, 3, 416, 416]).cuda(), transforms=transforms)
    g.save(os.path.join("pytorch_yolov2.pdf"))
    # hl.build_graph(model, torch.zeros([1, 3, 416, 416]).cuda(), transforms=transforms)
import torch
import numpy as np
"""
1 1
2 2
- the above is one slice of the 2x2x2 cube
"""
tmp = np.expand_dims(np.expand_dims(np.array( [[[1,1],[2,2]],[[1,1],[2,2]]]),axis=0),axis=0)
tmp_torch = torch.tensor(tmp, dtype=torch.float32)
tmp_torch_interp = torch.nn.functional.interpolate(tmp_torch, scale_factor=2, mode='trilinear', align_corners=True)
print (' - original shape : ', tmp_torch.shape)
print (' - interp shape : ', tmp_torch_interp.shape)
print (tmp_torch)
print (tmp_torch_interp)
tmp = np.expand_dims(np.expand_dims(np.random.random((2,2,2)),axis=0),axis=0)
tmp_torch = torch.tensor(tmp, dtype=torch.float32)
tmp_torch_interp = torch.nn.functional.interpolate(tmp_torch, scale_factor=2, mode='trilinear')
print (tmp_torch)
print (tmp_torch_interp)
import torch
import numpy as np
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if __name__ == "__main__":
    data = np.random.random((3))
    torch.from_numpy(data).unsqueeze(0).to(DEVICE)