from typing import Callable, cast, Dict, List, Optional, Sequence, Tuple, Union
import torch
from typing_extensions import Literal
from ignite.metrics import MetricGroup
from ignite.metrics.mean_average_precision import _BaseAveragePrecision, _cat_and_agg_tensors
from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce
def coco_tensor_list_to_dict_list(
output: Tuple[
Union[List[torch.Tensor], List[Dict[str, torch.Tensor]]],
Union[List[torch.Tensor], List[Dict[str, torch.Tensor]]],
]
) -> Tuple[List[Dict[str, torch.Tensor]], List[Dict[str, torch.Tensor]]]:
"""Convert either of output's `y_pred` or `y` from list of `(N, 6)` tensors to list of str-to-tensor dictionaries,
or keep them unchanged if they're already in the deisred format.
Input format is a `(N, 6)` or (`N, 5)` tensor which `N` is the number of predicted/target bounding boxes for the
image and the second dimension contains `(x1, y1, x2, y2, confidence, class)`/`(x1, y1, x2, y2, class[, iscrowd])`.
Output format is a str-to-tensor dictionary containing 'bbox' and `class` keys, plus `confidence` key for `y_pred`
and possibly `iscrowd` for `y`.
Args:
        output: `(y_pred, y)` tuple whose members are either lists of tensors or lists of dicts.
    Returns:
        `(y_pred, y)` tuple whose members are lists of str-to-tensor dictionaries.
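    Examples:
        A minimal sketch with hand-made tensors (values are illustrative only):

        .. code-block:: python

            y_pred = [torch.tensor([[10.0, 10.0, 50.0, 50.0, 0.9, 3.0]])]  # (N, 6): box, confidence, class
            y = [torch.tensor([[12.0, 12.0, 48.0, 48.0, 3.0]])]  # (N, 5): box, class
            y_pred_dicts, y_dicts = coco_tensor_list_to_dict_list((y_pred, y))
            # y_pred_dicts[0] now has 'bbox', 'confidence' and 'class' keys;
            # y_dicts[0] now has 'bbox' and 'class' keys.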
"""
y_pred, y = output
if len(y_pred) > 0 and isinstance(y_pred[0], torch.Tensor):
y_pred = [{"bbox": t[:, :4], "confidence": t[:, 4], "class": t[:, 5]} for t in cast(List[torch.Tensor], y_pred)]
if len(y) > 0 and isinstance(y[0], torch.Tensor):
if y[0].size(1) == 5:
y = [{"bbox": t[:, :4], "class": t[:, 4]} for t in cast(List[torch.Tensor], y)]
else:
y = [{"bbox": t[:, :4], "class": t[:, 4], "iscrowd": t[:, 5]} for t in cast(List[torch.Tensor], y)]
return cast(Tuple[List[Dict[str, torch.Tensor]], List[Dict[str, torch.Tensor]]], (y_pred, y))
class ObjectDetectionAvgPrecisionRecall(Metric, _BaseAveragePrecision):
_tps: List[torch.Tensor]
_fps: List[torch.Tensor]
_scores: List[torch.Tensor]
_y_pred_labels: List[torch.Tensor]
_y_true_count: torch.Tensor
_num_classes: int
def __init__(
self,
iou_thresholds: Optional[Union[Sequence[float], torch.Tensor]] = None,
rec_thresholds: Optional[Union[Sequence[float], torch.Tensor]] = None,
num_classes: int = 80,
max_detections_per_image_per_class: int = 100,
area_range: Literal["small", "medium", "large", "all"] = "all",
output_transform: Callable = lambda x: x,
device: Union[str, torch.device] = torch.device("cpu"),
skip_unrolling: bool = False,
) -> None:
r"""Calculate mean average precision & recall for evaluating an object detector in the COCO way.
Average precision is computed by averaging precision over increasing levels of recall thresholds.
        In COCO, the summation operand for each recall threshold is the maximum precision attained at any recall
        level greater than or equal to that threshold; in other words, the precision peak over all higher (or equal)
        recall levels is used for each recall threshold:
.. math::
            \text{Average Precision} = \sum_{k=1}^{\#rec\_thresholds} (r_k - r_{k-1}) \max(P_{k:})
Average recall is the detector's maximum recall, considering all matched detections as TP,
averaged over classes.
Args:
            iou_thresholds: sequence of IoU thresholds to be considered for computing mean average precision & recall.
                Values should be between 0 and 1. If not given, COCO's default (.5, .55, ..., .95) is used.
            rec_thresholds: sequence of recall thresholds to be considered for computing mean average precision.
                Values should be between 0 and 1. If not given, COCO's default (.0, .01, .02, ..., 1.) is used.
num_classes: number of categories. Default is 80, that of the COCO dataset.
            max_detections_per_image_per_class: maximum number of detections per class in each image to consider
                for evaluation. The most confident ones are selected.
            area_range: area range such that only objects within it are considered in evaluation. By default, 'all'.
output_transform: a callable that is used to transform the :class:`~ignite.engine.engine.Engine`'s
``process_function``'s output into the form expected by the metric. An already provided example is
:func:`~ignite.metrics.vision.object_detection_average_precision_recall.coco_tensor_list_to_dict_list`
which accepts `y_pred` and `y` as lists of tensors and transforms them to the expected format.
Default is the identity function.
device: specifies which device updates are accumulated on. Setting the
metric's device to be the same as your ``update`` arguments ensures the ``update`` method is
non-blocking. By default, CPU.
            skip_unrolling: specifies whether the output should be unrolled before being fed to the update method.
                Should be true for multi-output models, for example, if ``y_pred`` and ``y`` contain multi-output as
                ``(y_pred_a, y_pred_b)`` and ``(y_a, y_b)``, in which case the update method is called for
                ``(y_pred_a, y_a)`` and ``(y_pred_b, y_b)``. Alternatively, ``output_transform`` can be used to
                handle this.
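        Examples:
            A minimal standalone usage sketch with a single hand-made sample (boxes, scores and labels below are
            illustrative only):

            .. code-block:: python

                metric = ObjectDetectionAvgPrecisionRecall(num_classes=2)
                y_pred = [{
                    "bbox": torch.tensor([[10.0, 10.0, 50.0, 50.0]]),
                    "scores": torch.tensor([0.8]),
                    "labels": torch.tensor([1]),
                }]
                y = [{
                    "bbox": torch.tensor([[12.0, 12.0, 48.0, 48.0]]),
                    "labels": torch.tensor([1]),
                }]
                metric.update((y_pred, y))
                average_precision, average_recall = metric.compute()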
.. versionadded:: 0.5.2
"""
try:
from torchvision.ops.boxes import _box_inter_union, box_area
def box_iou(pred_boxes: torch.Tensor, gt_boxes: torch.Tensor, iscrowd: torch.BoolTensor) -> torch.Tensor:
inter, union = _box_inter_union(pred_boxes, gt_boxes)
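                # For crowd ground truths, pycocotools measures overlap as intersection over the
                # prediction's own area, so the union is replaced by the prediction area in those columns.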
union[:, iscrowd] = box_area(pred_boxes).reshape(-1, 1)
iou = inter / union
iou[iou.isnan()] = 0
return iou
self.box_iou = box_iou
except ImportError:
raise ModuleNotFoundError("This metric requires torchvision to be installed.")
if iou_thresholds is None:
iou_thresholds = torch.linspace(0.5, 0.95, 10, dtype=torch.double)
self._iou_thresholds = self._setup_thresholds(iou_thresholds, "iou_thresholds")
if rec_thresholds is None:
rec_thresholds = torch.linspace(0, 1, 101, dtype=torch.double)
self._num_classes = num_classes
self._area_range = area_range
self._max_detections_per_image_per_class = max_detections_per_image_per_class
super(ObjectDetectionAvgPrecisionRecall, self).__init__(
output_transform=output_transform,
device=device,
skip_unrolling=skip_unrolling,
)
super(Metric, self).__init__(
rec_thresholds=rec_thresholds,
class_mean=None,
)
precision = torch.double if torch.device(device).type != "mps" else torch.float32
self.rec_thresholds = cast(torch.Tensor, self.rec_thresholds).to(device=device, dtype=precision)
    @reinit__is_reduced
def reset(self) -> None:
self._tps = []
self._fps = []
self._scores = []
self._y_pred_labels = []
self._y_true_count = torch.zeros((self._num_classes,), device=self._device)
def _match_area_range(self, bboxes: torch.Tensor) -> torch.Tensor:
from torchvision.ops.boxes import box_area
areas = box_area(bboxes)
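        # Area thresholds follow the COCO convention: "small" is below 32**2 = 1024 px^2, "medium" lies
        # between 32**2 and 96**2 = 9216 px^2, and "large" goes up to 1e10 px^2, an effectively
        # unbounded upper limit.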
if self._area_range == "all":
min_area = 0
max_area = 1e10
elif self._area_range == "small":
min_area = 0
max_area = 1024
elif self._area_range == "medium":
min_area = 1024
max_area = 9216
elif self._area_range == "large":
min_area = 9216
max_area = 1e10
return torch.logical_and(areas >= min_area, areas <= max_area)
def _check_matching_input(
self, output: Tuple[List[Dict[str, torch.Tensor]], List[Dict[str, torch.Tensor]]]
) -> None:
y_pred, y = output
if len(y_pred) != len(y):
raise ValueError(f"y_pred and y should have the same number of samples, given {len(y_pred)} and {len(y)}.")
if len(y_pred) == 0:
raise ValueError("y_pred and y should contain at least one sample.")
y_pred_keys = {"bbox", "scores", "labels"}
if (y_pred[0].keys() & y_pred_keys) != y_pred_keys:
raise ValueError(
"y_pred sample dictionaries should have 'bbox', 'scores'"
f" and 'labels' keys, given keys: {y_pred[0].keys()}"
)
y_keys = {"bbox", "labels"}
if (y[0].keys() & y_keys) != y_keys:
raise ValueError(
"y sample dictionaries should have 'bbox', 'labels'"
f" and optionally 'iscrowd' keys, given keys: {y[0].keys()}"
)
def _compute_recall_and_precision(
self, TP: torch.Tensor, FP: torch.Tensor, scores: torch.Tensor, y_true_count: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""Measuring recall & precision
        This method differs from that of MeanAveragePrecision since, in the pycocotools reference implementation,
        predictions with equal scores are treated as if they were associated with different thresholds when
        measuring recall values, although this is not logically correct, as such predictions really belong to a
        single threshold and should therefore yield a single recall value.
        Shapes of the function inputs and return values follow the table below. N\ :sub:`pred` is the number of
        detections or predictions. ``...`` stands for possible additional dimensions. Since every prediction counts
        as its own threshold here (even when scores are tied), ``recall`` and ``precision`` keep the N\ :sub:`pred`
        dimension.
        ============== ======================
        **Object**     **Shape**
        ============== ======================
        TP and FP      (..., N\ :sub:`pred`)
        scores         (N\ :sub:`pred`,)
        y_true_count   () (a single float,
                       greater than zero)
        recall         (..., N\ :sub:`pred`)
        precision      (..., N\ :sub:`pred`)
        ============== ======================
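        For example, with ``y_true_count = 2``, ``scores = (0.9, 0.8, 0.7)``, ``TP = (1, 0, 1)`` and
        ``FP = (0, 1, 0)`` (illustrative values, a single IoU threshold and no extra dimensions), the cumulative
        sums are ``(1, 1, 2)`` and ``(0, 1, 1)``, giving ``recall = (0.5, 0.5, 1.0)`` and
        ``precision = (1.0, 0.5, 0.667)``.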
Returns:
`(recall, precision)`
"""
indices = torch.argsort(scores, dim=-1, stable=True, descending=True)
tp = TP[..., indices]
tp_summation = tp.cumsum(dim=-1)
if tp_summation.device.type != "mps":
tp_summation = tp_summation.double()
fp = FP[..., indices]
fp_summation = fp.cumsum(dim=-1)
if fp_summation.device.type != "mps":
fp_summation = fp_summation.double()
recall = tp_summation / y_true_count
predicted_positive = tp_summation + fp_summation
precision = tp_summation / torch.where(predicted_positive == 0, 1, predicted_positive)
return recall, precision
def _compute_average_precision(self, recall: torch.Tensor, precision: torch.Tensor) -> torch.Tensor:
"""Measuring average precision.
        This method is overridden since :math:`1/#recall_thresholds` is used instead of :math:`r_k - r_{k-1}`
        as the recall differential in COCO's reference implementation, i.e., pycocotools.
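        For example (illustrative values), with ``rec_thresholds = (0.0, 0.5, 1.0)``, ``recall = (0.5, 1.0)`` and
        ``precision = (1.0, 0.6)``, the right-to-left running maximum of precision is ``(1.0, 0.6)``, the precisions
        sampled at the recall thresholds are ``(1.0, 1.0, 0.6)``, and the average precision is
        ``(1.0 + 1.0 + 0.6) / 3``, roughly ``0.867``.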
Args:
recall: n-dimensional tensor whose last dimension is the dimension of the samples. Should be ordered in
ascending order in its last dimension.
precision: like ``recall`` in the shape.
Returns:
            average_precision: (n-1)-dimensional tensor containing the average precision, with the samples dimension reduced.
"""
if precision.device.type == "mps":
# Manual fallback to CPU if precision is on MPS due to the error:
# NotImplementedError: The operator 'aten::_cummax_helper' is not currently implemented for the MPS device
device = precision.device
precision_integrand = precision.flip(-1).cpu()
precision_integrand = precision_integrand.cummax(dim=-1).values
precision_integrand = precision_integrand.to(device=device).flip(-1)
else:
precision_integrand = precision.flip(-1).cummax(dim=-1).values.flip(-1)
rec_thresholds = cast(torch.Tensor, self.rec_thresholds).repeat((*recall.shape[:-1], 1))
rec_thresh_indices = (
torch.searchsorted(recall, rec_thresholds)
if recall.size(-1) != 0
else torch.LongTensor([], device=self._device)
)
precision_integrand = precision_integrand.take_along_dim(
rec_thresh_indices.where(rec_thresh_indices != recall.size(-1), 0), dim=-1
).where(rec_thresh_indices != recall.size(-1), 0)
return torch.sum(precision_integrand, dim=-1) / len(cast(torch.Tensor, self.rec_thresholds))
    @reinit__is_reduced
def update(self, output: Tuple[List[Dict[str, torch.Tensor]], List[Dict[str, torch.Tensor]]]) -> None:
r"""Metric update method using prediction and target.
Args:
output: a tuple, (y_pred, y), of two same-length lists, each one containing
                str-to-tensor dictionaries whose items are as follows. N\ :sub:`det` and
N\ :sub:`gt` are number of detections and ground truths for a sample
respectively.
======== ================== =================================================
**y_pred items**
-----------------------------------------------------------------------------
Key Value shape Description
======== ================== =================================================
'bbox' (N\ :sub:`det`, 4) Bounding boxes of form (x1, y1, x2, y2)
containing top left and bottom right coordinates.
'scores' (N\ :sub:`det`,) Confidence score of detections.
'labels' (N\ :sub:`det`,) Predicted category number of detections in
`torch.long` dtype.
======== ================== =================================================
========= ================= =================================================
**y items**
-----------------------------------------------------------------------------
Key Value shape Description
========= ================= =================================================
'bbox' (N\ :sub:`gt`, 4) Bounding boxes of form (x1, y1, x2, y2)
containing top left and bottom right coordinates.
'labels' (N\ :sub:`gt`,) Category number of ground truths in `torch.long`
dtype.
'iscrowd' (N\ :sub:`gt`,) Whether ground truth boxes are crowd ones or not.
This key is optional.
========= ================= =================================================
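        For example, a single-sample ``output`` could look like the following (values are illustrative only;
        ``iscrowd`` may be omitted):

        .. code-block:: python

            y_pred = [{
                "bbox": torch.tensor([[10.0, 10.0, 50.0, 50.0]]),
                "scores": torch.tensor([0.8]),
                "labels": torch.tensor([2]),
            }]
            y = [{
                "bbox": torch.tensor([[12.0, 12.0, 48.0, 48.0]]),
                "labels": torch.tensor([2]),
                "iscrowd": torch.tensor([0]),
            }]
            metric.update((y_pred, y))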
"""
self._check_matching_input(output)
for pred, target in zip(*output):
labels = target["labels"]
gt_boxes = target["bbox"]
gt_is_crowd = (
target["iscrowd"].bool() if "iscrowd" in target else torch.zeros_like(labels, dtype=torch.bool)
)
gt_ignore = ~self._match_area_range(gt_boxes) | gt_is_crowd
self._y_true_count += torch.bincount(labels[~gt_ignore], minlength=self._num_classes).to(
device=self._device
)
# Matching logic of object detection mAP, according to COCO reference implementation.
if len(pred["labels"]):
best_detections_index = torch.argsort(pred["scores"], stable=True, descending=True)
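                # For each class, keep only the top `max_detections_per_image_per_class` detections
                # by confidence score.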
max_best_detections_index = torch.cat(
[
best_detections_index[pred["labels"][best_detections_index] == c][
: self._max_detections_per_image_per_class
]
for c in range(self._num_classes)
]
)
pred_boxes = pred["bbox"][max_best_detections_index]
pred_labels = pred["labels"][max_best_detections_index]
if not len(labels):
tp = torch.zeros(
(len(self._iou_thresholds), len(max_best_detections_index)),
dtype=torch.uint8,
device=self._device,
)
self._tps.append(tp)
self._fps.append(~tp & self._match_area_range(pred_boxes).to(self._device))
else:
ious = self.box_iou(pred_boxes, gt_boxes, cast(torch.BoolTensor, gt_is_crowd))
category_no_match = labels.expand(len(pred_labels), -1) != pred_labels.view(-1, 1)
NO_MATCH = -3
ious[category_no_match] = NO_MATCH
ious = ious.unsqueeze(-1).repeat((1, 1, len(self._iou_thresholds)))
ious[ious < self._iou_thresholds] = NO_MATCH
IGNORANCE = -2
ious[:, gt_ignore] += IGNORANCE
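                    # After the adjustments above, an entry of `ious` is non-negative only for an
                    # above-threshold, same-class match with a non-ignored ground truth; matches with
                    # ignored ground truths fall strictly between NO_MATCH and 0, and entries with no
                    # valid match are at or below NO_MATCH. Hence `max_ious >= 0` marks TPs and
                    # `max_ious <= NO_MATCH` marks FPs below.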
for i in range(len(pred_labels)):
# Flip is done to give priority to the last item with maximal value,
# as torch.max selects the first one.
match_gts = ious[i].flip(0).max(0)
match_gts_indices = ious.size(1) - 1 - match_gts.indices
for t in range(len(self._iou_thresholds)):
if match_gts.values[t] > NO_MATCH and not gt_is_crowd[match_gts_indices[t]]:
ious[:, match_gts_indices[t], t] = NO_MATCH
ious[i, match_gts_indices[t], t] = match_gts.values[t]
max_ious = ious.max(1).values
self._tps.append((max_ious >= 0).T.to(dtype=torch.uint8, device=self._device))
self._fps.append(
((max_ious <= NO_MATCH).T & self._match_area_range(pred_boxes)).to(
dtype=torch.uint8, device=self._device
)
)
scores = pred["scores"][max_best_detections_index]
if self._device.type == "mps" and scores.dtype == torch.double:
scores = scores.to(dtype=torch.float32)
self._scores.append(scores.to(self._device))
self._y_pred_labels.append(pred_labels.to(dtype=torch.int, device=self._device))
@sync_all_reduce("_y_true_count")
def _compute(self) -> torch.Tensor:
pred_labels = _cat_and_agg_tensors(self._y_pred_labels, cast(Tuple[int], ()), torch.int, self._device)
TP = _cat_and_agg_tensors(self._tps, (len(self._iou_thresholds),), torch.uint8, self._device)
FP = _cat_and_agg_tensors(self._fps, (len(self._iou_thresholds),), torch.uint8, self._device)
fp_precision = torch.double if self._device.type != "mps" else torch.float32
scores = _cat_and_agg_tensors(self._scores, cast(Tuple[int], ()), fp_precision, self._device)
average_precisions_recalls = -torch.ones(
(2, self._num_classes, len(self._iou_thresholds)),
device=self._device,
dtype=fp_precision,
)
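        # -1 marks (class, IoU threshold) entries with no ground truth for that class; they are
        # filtered out (kept as -1) before averaging in `compute`.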
for cls in range(self._num_classes):
if self._y_true_count[cls] == 0:
continue
cls_labels = pred_labels == cls
if sum(cls_labels) == 0:
average_precisions_recalls[:, cls] = 0.0
continue
recall, precision = self._compute_recall_and_precision(
TP[..., cls_labels], FP[..., cls_labels], scores[cls_labels], self._y_true_count[cls]
)
average_precision_for_cls_per_iou_threshold = self._compute_average_precision(recall, precision)
average_precisions_recalls[0, cls] = average_precision_for_cls_per_iou_threshold
average_precisions_recalls[1, cls] = recall[..., -1]
return average_precisions_recalls
    def compute(self) -> Tuple[float, float]:
average_precisions_recalls = self._compute()
if (average_precisions_recalls == -1).all():
return -1.0, -1.0
ap = average_precisions_recalls[0][average_precisions_recalls[0] > -1].mean().item()
ar = average_precisions_recalls[1][average_precisions_recalls[1] > -1].mean().item()
return ap, ar
class CommonObjectDetectionMetrics(MetricGroup):
"""
Common Object Detection metrics. Included metrics are as follows:
    =============== ==========================================
    **Metric name** **Description**
    =============== ==========================================
    AP@50..95       Average precision averaged over
                    .50 to .95 IoU thresholds
    AR-100          Average recall with maximum 100 detections
    AP@50           Average precision with IoU threshold=.50
    AP@75           Average precision with IoU threshold=.75
    AP-S            Average precision over small objects
                    (< 32px * 32px)
    AR-S            Average recall over small objects
    AP-M            Average precision over medium objects
                    (32px * 32px < . < 96px * 96px)
    AR-M            Average recall over medium objects
    AP-L            Average precision over large objects
                    (96px * 96px < . < 1e5px * 1e5px)
    AR-L            Average recall over large objects
    AR-1            Average recall with maximum 1 detection
    AR-10           Average recall with maximum 10 detections
    =============== ==========================================
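    A minimal attachment sketch (the ``evaluator`` engine is assumed to already exist and to emit ``(y_pred, y)``
    in the format expected by :meth:`ObjectDetectionAvgPrecisionRecall.update`):

    .. code-block:: python

        metrics = CommonObjectDetectionMetrics(num_classes=80, device="cpu")
        metrics.attach(evaluator, "coco_metrics")
        # After the evaluator runs, evaluator.state.metrics["coco_metrics"] is a dict with keys
        # 'AP@50..95', 'AR-100', 'AP@50', 'AP@75', 'AP-S', 'AR-S', 'AP-M', 'AR-M', 'AP-L', 'AR-L',
        # 'AR-1' and 'AR-10'.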
.. versionadded:: 0.5.2
"""
_state_dict_all_req_keys = ("metrics", "ap_50_95")
ap_50_95: ObjectDetectionAvgPrecisionRecall
def __init__(
self,
num_classes: int = 80,
output_transform: Callable = lambda x: x,
device: Union[str, torch.device] = torch.device("cpu"),
skip_unrolling: bool = True,
):
self.ap_50_95 = ObjectDetectionAvgPrecisionRecall(num_classes=num_classes, device=device)
super().__init__(
{
"S": ObjectDetectionAvgPrecisionRecall(num_classes=num_classes, device=device, area_range="small"),
"M": ObjectDetectionAvgPrecisionRecall(num_classes=num_classes, device=device, area_range="medium"),
"L": ObjectDetectionAvgPrecisionRecall(num_classes=num_classes, device=device, area_range="large"),
"1": ObjectDetectionAvgPrecisionRecall(
num_classes=num_classes, device=device, max_detections_per_image_per_class=1
),
"10": ObjectDetectionAvgPrecisionRecall(
num_classes=num_classes, device=device, max_detections_per_image_per_class=10
),
},
output_transform,
skip_unrolling=skip_unrolling,
)
    def reset(self) -> None:
super().reset()
self.ap_50_95.reset()
    def update(self, output: Sequence[torch.Tensor]) -> None:
super().update(output)
self.ap_50_95.update(output)
    def compute(self) -> Dict[str, float]:
average_precisions_recalls = self.ap_50_95._compute()
average_precisions_50 = average_precisions_recalls[0, :, 0]
average_precisions_75 = average_precisions_recalls[0, :, 5]
if (average_precisions_50 == -1).all():
AP_50 = AP_75 = AP_50_95 = AR_100 = -1.0
else:
AP_50 = average_precisions_50[average_precisions_50 > -1].mean().item()
AP_75 = average_precisions_75[average_precisions_75 > -1].mean().item()
AP_50_95 = average_precisions_recalls[0][average_precisions_recalls[0] > -1].mean().item()
AR_100 = average_precisions_recalls[1][average_precisions_recalls[1] > -1].mean().item()
result = super().compute()
return {
"AP@50..95": AP_50_95,
"AR-100": AR_100,
"AP@50": AP_50,
"AP@75": AP_75,
"AP-S": result["S"][0],
"AR-S": result["S"][1],
"AP-M": result["M"][0],
"AR-M": result["M"][1],
"AP-L": result["L"][0],
"AR-L": result["L"][1],
"AR-1": result["1"][1],
"AR-10": result["10"][1],
}