Reproduce CuPy runtime error with PyTorch
import torch
import cupy as cp
import numpy as np


def fp16_clamp(x, min=None, max=None):
    if not x.is_cuda and x.dtype == torch.float16:
        # clamp for cpu float16, tensor fp16 has no clamp implementation
        return x.float().clamp(min, max).half()
    return x.clamp(min, max)
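

# Minimal sanity check for fp16_clamp (a sketch; the values below are
# assumptions for illustration, not part of the original repro). On CPU,
# float16 had no native clamp kernel in the PyTorch versions this gist
# targets, so the helper casts to float32, clamps, and casts back to half.
_clamp_demo = fp16_clamp(
    torch.tensor([-1.0, 0.5, 2.0], dtype=torch.float16), min=0, max=1)
assert _clamp_demo.dtype == torch.float16
assert float(_clamp_demo.min()) >= 0.0 and float(_clamp_demo.max()) <= 1.0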


def torch_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.

    FP16 contributed by https://github.com/open-mmlab/mmdetection/pull/4889

    Note:
        Assume bboxes1 is M x 4 and bboxes2 is N x 4. When mode is 'iou',
        some new variables are generated while calculating IoU with
        bbox_overlaps (a worked check of the memory figures below follows
        after this function):

        1) is_aligned is False
            area1: M x 1
            area2: N x 1
            lt: M x N x 2
            rb: M x N x 2
            wh: M x N x 2
            overlap: M x N x 1
            union: M x N x 1
            ious: M x N x 1

            Total memory:
                S = (9 x N x M + N + M) * 4 Byte
            When using FP16, we can reduce:
                R = (9 x N x M + N + M) * 4 / 2 Byte
            R is larger than (N + M) * 4 * 2 whenever N and M >= 1:
            N + M <= N * M < 3 * N * M when N >= 2 and M >= 2, and
            N + 1 < 3 * N when N or M is 1.
            Given M = 40 (ground truths) and N = 400000 (three anchor boxes
            per grid, FPN, R-CNNs),
                R = 275 MB (per image)
            A special case (dense detection), M = 512 (ground truths),
                R = 3516 MB = 3.43 GB
            When the batch size is B, the reduction is B x R.
            Therefore, CUDA memory runs out frequently.

            Experiments on GeForce RTX 2080Ti (11019 MiB):

            | dtype | M   | N      | Use      | Real     | Ideal    |
            |:-----:|:---:|:------:|:--------:|:--------:|:--------:|
            | FP32  | 512 | 400000 | 8020 MiB | --       | --       |
            | FP16  | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB |
            | FP32  | 40  | 400000 | 1540 MiB | --       | --       |
            | FP16  | 40  | 400000 | 1264 MiB | 276 MiB  | 275 MiB  |

        2) is_aligned is True
            area1: N x 1
            area2: N x 1
            lt: N x 2
            rb: N x 2
            wh: N x 2
            overlap: N x 1
            union: N x 1
            ious: N x 1

            Total memory:
                S = 11 x N * 4 Byte
            When using FP16, we can reduce:
                R = 11 x N * 4 / 2 Byte

        The same holds for 'giou' (which uses even more memory than 'iou').

        Time-wise, FP16 is generally faster than FP32.
        When gpu_assign_thr is not -1, it takes more time on CPU
        but does not reduce memory.
        Therefore, we can halve the memory while keeping the speed.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> overlaps = torch_bbox_overlaps(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = torch_bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 4)
        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
        >>> assert tuple(torch_bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(torch_bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(torch_bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return bboxes1.new(batch_shape + (rows, ))
        else:
            return bboxes1.new(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = torch.max(bboxes1[..., :, None, :2],
                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = torch.min(bboxes1[..., :, None, 2:],
                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
                                    bboxes2[..., None, :, :2])
            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
                                    bboxes2[..., None, :, 2:])

    eps = union.new_tensor([eps])
    union = torch.max(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = torch.max(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
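

# Worked check of the memory figures quoted in the docstring above (a sketch;
# the helper name is introduced here for illustration and only accounts for
# the intermediate tensors listed there). With M ground truths and N
# predicted boxes, the FP16 saving is R = (9*N*M + N + M) * 4 / 2 bytes.
def _fp16_saving_mib(m, n):
    return (9 * n * m + n + m) * 4 / 2 / 1024 ** 2


assert round(_fp16_saving_mib(40, 400000)) == 275     # ~275 MiB, as documented
assert round(_fp16_saving_mib(512, 400000)) == 3516   # ~3516 MiB ~= 3.43 GiB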


def cupy_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes (CuPy version).

    Args:
        bboxes1 (ndarray): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (ndarray): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        ndarray: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2]
    cols = bboxes2.shape[-2]
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return cp.empty(batch_shape + (rows, ))
        else:
            return cp.empty(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = cp.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = cp.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        # cupy ndarrays have neither .is_cuda nor .clamp, so clip directly
        # instead of going through fp16_clamp (which expects a torch.Tensor)
        wh = cp.maximum(rb - lt, 0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = cp.minimum(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = cp.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = cp.maximum(bboxes1[..., :, None, :2],
                        bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = cp.minimum(bboxes1[..., :, None, 2:],
                        bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = cp.maximum(rb - lt, 0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = cp.minimum(bboxes1[..., :, None, :2],
                                     bboxes2[..., None, :, :2])
            enclosed_rb = cp.maximum(bboxes1[..., :, None, 2:],
                                     bboxes2[..., None, :, 2:])

    eps = cp.array([eps], dtype=cp.float32)
    union = cp.maximum(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = cp.maximum(enclosed_rb - enclosed_lt, 0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = cp.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious


def cupy_fast_nms(multi_bboxes,
                  multi_scores,
                  score_thr,
                  iou_thr,
                  top_k,
                  max_num=-1):
    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_ (CuPy version).

    Fast NMS allows already-removed detections to suppress other detections so
    that every instance can be decided to be kept or discarded in parallel,
    which is not possible in traditional NMS. This relaxation allows us to
    implement Fast NMS entirely in standard GPU-accelerated matrix operations.
    (A small NumPy sketch of this trick follows after this function.)

    Args:
        multi_bboxes (ndarray): shape (n, #class*4) or (n, 4)
        multi_scores (ndarray): shape (n, #class+1), where the last column
            contains scores of the background class, but this will be ignored.
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        iou_thr (float): IoU threshold to be considered as conflicted.
        top_k (int): if there are more than top_k bboxes before NMS,
            only the top top_k will be kept.
        max_num (int): if there are more than max_num bboxes after NMS,
            only the top max_num will be kept. If -1, keep all the bboxes.
            Default: -1.

    Returns:
        tuple: (dets, labels), arrays of shape (k, 5) and (k,).
            Dets are boxes with scores. Labels are 0-based.
    """
    scores = cp.ascontiguousarray(
        cp.transpose(multi_scores[:, :-1], (1, 0)))  # [#class, n]
    idx = cp.argsort(-scores, axis=1)
    scores = cp.take_along_axis(scores, idx, axis=1)

    idx = cp.ascontiguousarray(idx[:, :top_k])
    scores = scores[:, :top_k]  # [#class, topk]
    num_classes, num_dets = idx.shape
    boxes = multi_bboxes[idx.reshape(-1), :].reshape(num_classes, num_dets, 4)

    iou = cupy_bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
    iou = cp.triu(iou, k=1)
    iou_max = cp.amax(iou, axis=1)

    # Now just filter out the ones higher than the threshold
    keep = iou_max <= iou_thr

    # Second thresholding introduces 0.2 mAP gain at negligible time cost
    keep *= scores > score_thr

    # Assign each kept detection to its corresponding class
    classes = cp.broadcast_to(cp.arange(num_classes)[:, None], keep.shape)
    classes = classes[keep]

    boxes = boxes[keep]
    scores = scores[keep]

    # Only keep the top max_num highest scores across all classes
    # scores, idx = scores.sort(0, descending=True)
    idx = cp.argsort(-scores, axis=0)
    scores = scores[idx]
    if max_num > 0:
        idx = idx[:max_num]
        scores = scores[:max_num]

    classes = classes[idx]
    boxes = boxes[idx]

    cls_dets = cp.concatenate([boxes, scores[:, None]], axis=1)
    return cls_dets, classes
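

# Sketch of the Fast NMS trick used above and in torch_fast_nms below
# (the IoU values are assumptions chosen for illustration): boxes are sorted
# by score per class, the pairwise IoU matrix is made upper-triangular
# (k=1), and a box is dropped when any higher-scoring box overlaps it by
# more than iou_thr. For a single 2D per-class slice the reduction over
# rows is a column-wise max, so every keep/discard decision is a plain
# matrix op and runs in parallel.
_iou = np.array([[0.0, 0.8, 0.1],
                 [0.0, 0.0, 0.2],
                 [0.0, 0.0, 0.0]])            # already upper-triangular
_keep = _iou.max(axis=0) <= 0.5               # max IoU with any better-scored box
assert _keep.tolist() == [True, False, True]  # box 1 is suppressed by box 0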


def torch_fast_nms(multi_bboxes,
                   multi_scores,
                   multi_coeffs,
                   score_thr,
                   iou_thr,
                   top_k,
                   max_num=-1):
    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_.

    Fast NMS allows already-removed detections to suppress other detections so
    that every instance can be decided to be kept or discarded in parallel,
    which is not possible in traditional NMS. This relaxation allows us to
    implement Fast NMS entirely in standard GPU-accelerated matrix operations.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class+1), where the last column
            contains scores of the background class, but this will be ignored.
        multi_coeffs (Tensor): shape (n, #class*coeffs_dim).
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        iou_thr (float): IoU threshold to be considered as conflicted.
        top_k (int): if there are more than top_k bboxes before NMS,
            only the top top_k will be kept.
        max_num (int): if there are more than max_num bboxes after NMS,
            only the top max_num will be kept. If -1, keep all the bboxes.
            Default: -1.

    Returns:
        tuple: (dets, labels, coefficients), tensors of shape (k, 5), (k,),
            and (k, coeffs_dim). Dets are boxes with scores.
            Labels are 0-based.
    """
    scores = multi_scores[:, :-1].t()  # [#class, n]
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]  # [#class, topk]
    num_classes, num_dets = idx.size()
    boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = torch_bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
    iou.triu_(diagonal=1)
    iou_max, _ = iou.max(dim=1)

    # Now just filter out the ones higher than the threshold
    keep = iou_max <= iou_thr

    # Second thresholding introduces 0.2 mAP gain at negligible time cost
    keep *= scores > score_thr

    # Assign each kept detection to its corresponding class
    classes = torch.arange(
        num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    coeffs = coeffs[keep]
    scores = scores[keep]

    # Only keep the top max_num highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    if max_num > 0:
        idx = idx[:max_num]
        scores = scores[:max_num]

    classes = classes[idx]
    boxes = boxes[idx]
    coeffs = coeffs[idx]

    cls_dets = torch.cat([boxes, scores[:, None]], dim=1)
    return cls_dets, classes, coeffs


num_class = 5
n = 1000
score_thr = 0.01
iou_thr = 0.01
top_k = 100

multi_bboxes = np.random.randn(n, 4).astype(np.float32)
multi_scores = np.random.randn(n, num_class + 1).astype(np.float32)
torch_multi_bboxes = torch.from_numpy(multi_bboxes).cuda()
torch_multi_scores = torch.from_numpy(multi_scores).cuda()

multi_coeffs = np.random.randn(n, num_class).astype(np.float32)
torch_multi_coeffs = torch.from_numpy(multi_coeffs).cuda()

# FIXME: when running torch with cuda first, the subsequent cupy calls throw cudaErrorIllegalAddress
torch_cls_dets, torch_labels, _ = torch_fast_nms(
    torch_multi_bboxes, torch_multi_scores, torch_multi_coeffs,
    score_thr, iou_thr, top_k)

cp_multi_bboxes = cp.asarray(multi_bboxes)
cp_multi_scores = cp.asarray(multi_scores)
# FIXME: when running cupy fast nms first and then mmdet's fast nms, pytest gets stuck
cls_dets, labels = cupy_fast_nms(cp_multi_bboxes, cp_multi_scores,
                                 score_thr, iou_thr, top_k)

assert cls_dets.ndim == 2
assert cls_dets.shape[1] == 5
assert labels.shape[0] == cls_dets.shape[0]
# CUDA tensors must be moved to CPU before converting to numpy
assert np.allclose(cp.asnumpy(cls_dets), torch_cls_dets.cpu().numpy())
assert np.allclose(cp.asnumpy(labels), torch_labels.cpu().numpy())