Reproduce CuPy runtime error with PyTorch
import torch
import cupy as cp
import numpy as np


def fp16_clamp(x, min=None, max=None):
    if not x.is_cuda and x.dtype == torch.float16:
        # clamp for cpu float16, tensor fp16 has no clamp implementation
        return x.float().clamp(min, max).half()
    return x.clamp(min, max)
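

# Minimal sanity check for fp16_clamp (a sketch; the values below are
# assumptions for illustration, not part of the original repro). On CPU,
# float16 had no native clamp kernel in the PyTorch versions this gist
# targets, so the helper casts to float32, clamps, and casts back to half.
_clamp_demo = fp16_clamp(
    torch.tensor([-1.0, 0.5, 2.0], dtype=torch.float16), min=0, max=1)
assert _clamp_demo.dtype == torch.float16
assert float(_clamp_demo.min()) >= 0.0 and float(_clamp_demo.max()) <= 1.0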


def torch_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.

    FP16 contributed by https://github.com/open-mmlab/mmdetection/pull/4889

    Note:
        Assume bboxes1 is M x 4 and bboxes2 is N x 4. When mode is 'iou',
        some new variables are generated while calculating IoU with
        bbox_overlaps (a worked check of the memory figures below follows
        after this function):

        1) is_aligned is False
            area1: M x 1
            area2: N x 1
            lt: M x N x 2
            rb: M x N x 2
            wh: M x N x 2
            overlap: M x N x 1
            union: M x N x 1
            ious: M x N x 1

            Total memory:
                S = (9 x N x M + N + M) * 4 Byte
            When using FP16, we can reduce:
                R = (9 x N x M + N + M) * 4 / 2 Byte
            R is larger than (N + M) * 4 * 2 whenever N and M >= 1:
            N + M <= N * M < 3 * N * M when N >= 2 and M >= 2, and
            N + 1 < 3 * N when N or M is 1.
            Given M = 40 (ground truths) and N = 400000 (three anchor boxes
            per grid, FPN, R-CNNs),
                R = 275 MB (per image)
            A special case (dense detection), M = 512 (ground truths),
                R = 3516 MB = 3.43 GB
            When the batch size is B, the reduction is B x R.
            Therefore, CUDA memory runs out frequently.

            Experiments on GeForce RTX 2080Ti (11019 MiB):

            | dtype | M   | N      | Use      | Real     | Ideal    |
            |:-----:|:---:|:------:|:--------:|:--------:|:--------:|
            | FP32  | 512 | 400000 | 8020 MiB | --       | --       |
            | FP16  | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB |
            | FP32  | 40  | 400000 | 1540 MiB | --       | --       |
            | FP16  | 40  | 400000 | 1264 MiB | 276 MiB  | 275 MiB  |

        2) is_aligned is True
            area1: N x 1
            area2: N x 1
            lt: N x 2
            rb: N x 2
            wh: N x 2
            overlap: N x 1
            union: N x 1
            ious: N x 1

            Total memory:
                S = 11 x N * 4 Byte
            When using FP16, we can reduce:
                R = 11 x N * 4 / 2 Byte

        The same holds for 'giou' (which uses even more memory than 'iou').

        Time-wise, FP16 is generally faster than FP32.
        When gpu_assign_thr is not -1, it takes more time on CPU
        but does not reduce memory.
        Therefore, we can halve the memory while keeping the speed.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> overlaps = torch_bbox_overlaps(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = torch_bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 4)
        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
        >>> assert tuple(torch_bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(torch_bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(torch_bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return bboxes1.new(batch_shape + (rows, ))
        else:
            return bboxes1.new(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = torch.max(bboxes1[..., :, None, :2],
                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = torch.min(bboxes1[..., :, None, 2:],
                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
                                    bboxes2[..., None, :, :2])
            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
                                    bboxes2[..., None, :, 2:])

    eps = union.new_tensor([eps])
    union = torch.max(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = torch.max(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
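

# Worked check of the memory figures quoted in the docstring above (a sketch;
# the helper name is introduced here for illustration and only accounts for
# the intermediate tensors listed there). With M ground truths and N
# predicted boxes, the FP16 saving is R = (9*N*M + N + M) * 4 / 2 bytes.
def _fp16_saving_mib(m, n):
    return (9 * n * m + n + m) * 4 / 2 / 1024 ** 2


assert round(_fp16_saving_mib(40, 400000)) == 275     # ~275 MiB, as documented
assert round(_fp16_saving_mib(512, 400000)) == 3516   # ~3516 MiB ~= 3.43 GiB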


def cupy_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes (CuPy version).

    Args:
        bboxes1 (ndarray): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (ndarray): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        ndarray: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2]
    cols = bboxes2.shape[-2]
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        if is_aligned:
            return cp.empty(batch_shape + (rows, ))
        else:
            return cp.empty(batch_shape + (rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = cp.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = cp.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        # cupy ndarrays have neither .is_cuda nor .clamp, so clip directly
        # instead of going through fp16_clamp (which expects a torch.Tensor)
        wh = cp.maximum(rb - lt, 0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = cp.minimum(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = cp.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = cp.maximum(bboxes1[..., :, None, :2],
                        bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = cp.minimum(bboxes1[..., :, None, 2:],
                        bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = cp.maximum(rb - lt, 0)
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = cp.minimum(bboxes1[..., :, None, :2],
                                     bboxes2[..., None, :, :2])
            enclosed_rb = cp.maximum(bboxes1[..., :, None, 2:],
                                     bboxes2[..., None, :, 2:])

    eps = cp.array([eps], dtype=cp.float32)
    union = cp.maximum(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = cp.maximum(enclosed_rb - enclosed_lt, 0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = cp.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious


def cupy_fast_nms(multi_bboxes,
                  multi_scores,
                  score_thr,
                  iou_thr,
                  top_k,
                  max_num=-1):
    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_ (CuPy version).

    Fast NMS allows already-removed detections to suppress other detections so
    that every instance can be decided to be kept or discarded in parallel,
    which is not possible in traditional NMS. This relaxation allows us to
    implement Fast NMS entirely in standard GPU-accelerated matrix operations.
    (A small NumPy sketch of this trick follows after this function.)

    Args:
        multi_bboxes (ndarray): shape (n, #class*4) or (n, 4)
        multi_scores (ndarray): shape (n, #class+1), where the last column
            contains scores of the background class, but this will be ignored.
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        iou_thr (float): IoU threshold to be considered as conflicted.
        top_k (int): if there are more than top_k bboxes before NMS,
            only the top top_k will be kept.
        max_num (int): if there are more than max_num bboxes after NMS,
            only the top max_num will be kept. If -1, keep all the bboxes.
            Default: -1.

    Returns:
        tuple: (dets, labels), arrays of shape (k, 5) and (k,).
            Dets are boxes with scores. Labels are 0-based.
    """
    scores = cp.ascontiguousarray(
        cp.transpose(multi_scores[:, :-1], (1, 0)))  # [#class, n]
    idx = cp.argsort(-scores, axis=1)
    scores = cp.take_along_axis(scores, idx, axis=1)

    idx = cp.ascontiguousarray(idx[:, :top_k])
    scores = scores[:, :top_k]  # [#class, topk]
    num_classes, num_dets = idx.shape
    boxes = multi_bboxes[idx.reshape(-1), :].reshape(num_classes, num_dets, 4)

    iou = cupy_bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
    iou = cp.triu(iou, k=1)
    iou_max = cp.amax(iou, axis=1)

    # Now just filter out the ones higher than the threshold
    keep = iou_max <= iou_thr

    # Second thresholding introduces 0.2 mAP gain at negligible time cost
    keep *= scores > score_thr

    # Assign each kept detection to its corresponding class
    classes = cp.broadcast_to(cp.arange(num_classes)[:, None], keep.shape)
    classes = classes[keep]

    boxes = boxes[keep]
    scores = scores[keep]

    # Only keep the top max_num highest scores across all classes
    # scores, idx = scores.sort(0, descending=True)
    idx = cp.argsort(-scores, axis=0)
    scores = scores[idx]
    if max_num > 0:
        idx = idx[:max_num]
        scores = scores[:max_num]

    classes = classes[idx]
    boxes = boxes[idx]

    cls_dets = cp.concatenate([boxes, scores[:, None]], axis=1)
    return cls_dets, classes
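

# Sketch of the Fast NMS trick used above and in torch_fast_nms below
# (the IoU values are assumptions chosen for illustration): boxes are sorted
# by score per class, the pairwise IoU matrix is made upper-triangular
# (k=1), and a box is dropped when any higher-scoring box overlaps it by
# more than iou_thr. For a single 2D per-class slice the reduction over
# rows is a column-wise max, so every keep/discard decision is a plain
# matrix op and runs in parallel.
_iou = np.array([[0.0, 0.8, 0.1],
                 [0.0, 0.0, 0.2],
                 [0.0, 0.0, 0.0]])            # already upper-triangular
_keep = _iou.max(axis=0) <= 0.5               # max IoU with any better-scored box
assert _keep.tolist() == [True, False, True]  # box 1 is suppressed by box 0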


def torch_fast_nms(multi_bboxes,
                   multi_scores,
                   multi_coeffs,
                   score_thr,
                   iou_thr,
                   top_k,
                   max_num=-1):
    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_.

    Fast NMS allows already-removed detections to suppress other detections so
    that every instance can be decided to be kept or discarded in parallel,
    which is not possible in traditional NMS. This relaxation allows us to
    implement Fast NMS entirely in standard GPU-accelerated matrix operations.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class+1), where the last column
            contains scores of the background class, but this will be ignored.
        multi_coeffs (Tensor): shape (n, #class*coeffs_dim).
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        iou_thr (float): IoU threshold to be considered as conflicted.
        top_k (int): if there are more than top_k bboxes before NMS,
            only the top top_k will be kept.
        max_num (int): if there are more than max_num bboxes after NMS,
            only the top max_num will be kept. If -1, keep all the bboxes.
            Default: -1.

    Returns:
        tuple: (dets, labels, coefficients), tensors of shape (k, 5), (k,),
            and (k, coeffs_dim). Dets are boxes with scores.
            Labels are 0-based.
    """
    scores = multi_scores[:, :-1].t()  # [#class, n]
    scores, idx = scores.sort(1, descending=True)

    idx = idx[:, :top_k].contiguous()
    scores = scores[:, :top_k]  # [#class, topk]
    num_classes, num_dets = idx.size()
    boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4)
    coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1)

    iou = torch_bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
    iou.triu_(diagonal=1)
    iou_max, _ = iou.max(dim=1)

    # Now just filter out the ones higher than the threshold
    keep = iou_max <= iou_thr

    # Second thresholding introduces 0.2 mAP gain at negligible time cost
    keep *= scores > score_thr

    # Assign each kept detection to its corresponding class
    classes = torch.arange(
        num_classes, device=boxes.device)[:, None].expand_as(keep)
    classes = classes[keep]

    boxes = boxes[keep]
    coeffs = coeffs[keep]
    scores = scores[keep]

    # Only keep the top max_num highest scores across all classes
    scores, idx = scores.sort(0, descending=True)
    if max_num > 0:
        idx = idx[:max_num]
        scores = scores[:max_num]

    classes = classes[idx]
    boxes = boxes[idx]
    coeffs = coeffs[idx]

    cls_dets = torch.cat([boxes, scores[:, None]], dim=1)
    return cls_dets, classes, coeffs


num_class = 5
n = 1000
score_thr = 0.01
iou_thr = 0.01
top_k = 100

multi_bboxes = np.random.randn(n, 4).astype(np.float32)
multi_scores = np.random.randn(n, num_class + 1).astype(np.float32)
torch_multi_bboxes = torch.from_numpy(multi_bboxes).cuda()
torch_multi_scores = torch.from_numpy(multi_scores).cuda()

multi_coeffs = np.random.randn(n, num_class).astype(np.float32)
torch_multi_coeffs = torch.from_numpy(multi_coeffs).cuda()

# FIXME: when running torch with cuda first, the subsequent cupy calls throw cudaErrorIllegalAddress
torch_cls_dets, torch_labels, _ = torch_fast_nms(
    torch_multi_bboxes, torch_multi_scores, torch_multi_coeffs,
    score_thr, iou_thr, top_k)

cp_multi_bboxes = cp.asarray(multi_bboxes)
cp_multi_scores = cp.asarray(multi_scores)
# FIXME: when running cupy fast nms first and then mmdet's fast nms, pytest gets stuck
cls_dets, labels = cupy_fast_nms(cp_multi_bboxes, cp_multi_scores,
                                 score_thr, iou_thr, top_k)

assert cls_dets.ndim == 2
assert cls_dets.shape[1] == 5
assert labels.shape[0] == cls_dets.shape[0]
# CUDA tensors must be moved to CPU before converting to numpy
assert np.allclose(cp.asnumpy(cls_dets), torch_cls_dets.cpu().numpy())
assert np.allclose(cp.asnumpy(labels), torch_labels.cpu().numpy())