Merge branch 'develop' into fix-resize-crop

omkar-334 · web-flow · commit 4277360f2172 · 2026-05-19T17:40:29.000+05:30
diff --git a/src/rfdetr/export/_tflite/converter.py b/src/rfdetr/export/_tflite/converter.py
@@ -47,13 +47,10 @@
     this normalization at inference time.
 
 Note:
-    **Segmentation model export is not validated.**  The same ``convert_kwargs``
-    are applied to segmentation models as to detection models, but the
-    segmentation output path introduces additional ops (``ScatterND``,
-    ``Resize``, extra ``GridSample`` calls in the mask resampling path) that
-    have not been exercised end-to-end through TFLite.  Treat segmentation
-    TFLite export as experimental and verify outputs against the ONNX baseline
-    before deployment.
+    Segmentation models additionally emit a ``masks`` output.  FP32, FP16,
+    and dynamic-range INT8 all match the PyTorch baseline closely (INT8 mask
+    fidelity is marginally lower).  Verified on the non-plus segmentation
+    variants: Nano, Small, Medium, Large, and Preview.
 """
 
 from __future__ import annotations
@@ -479,9 +476,10 @@ def export_tflite(
         substituted with TFLite-native pseudo-operators to avoid a missing
         TensorFlow Flex delegate at inference time.
 
-        Segmentation export (``pred_masks`` output) is **not validated** in
-        the current implementation; additional operators may need to be
-        added to ``replace_to_pseudo_operators`` for segmentation models.
+        Segmentation models additionally emit a ``masks`` output, decoded by
+        :func:`rfdetr.export._tflite.inference._run_inference`.  Verified on
+        the non-plus segmentation variants (Nano, Small, Medium, Large,
+        Preview).
     """
     onnx_path = Path(onnx_path)
     output_dir = Path(output_dir)
diff --git a/src/rfdetr/export/_tflite/inference.py b/src/rfdetr/export/_tflite/inference.py
@@ -7,9 +7,9 @@
 """TFLite inference helpers for RF-DETR exported models.
 
 These functions handle interpreter creation, image preprocessing, and
-detection decoding without requiring PyTorch or the RF-DETR training stack —
-only ``tflite-runtime`` (or ``tensorflow``), ``numpy``, ``supervision``, and
-``Pillow`` are needed at inference time.
+decoding of detection and segmentation-mask outputs without requiring PyTorch
+or the RF-DETR training stack: only ``tflite-runtime`` (or ``tensorflow``),
+``numpy``, ``supervision``, and ``Pillow`` are needed at inference time.
 """
 
 from __future__ import annotations
@@ -19,12 +19,16 @@
 
 import numpy as np
 import supervision as sv
+from numpy.typing import NDArray
 from PIL import Image as PILImage
 
 from rfdetr.utilities.logger import get_logger
 
 logger = get_logger()
 
+# PILImage.Resampling was introduced in Pillow 9.1; fall back to the legacy constant.
+_PIL_BILINEAR = getattr(PILImage, "Resampling", PILImage).BILINEAR
+
 
 def _create_interpreter(model_path: str | Path) -> Any:
     """Load a TFLite model, allocate tensors, and log I/O shapes.
@@ -64,6 +68,38 @@ def _create_interpreter(model_path: str | Path) -> Any:
     return interp
 
 
+def _decode_masks(mask_logits: NDArray[Any], out_size: tuple[int, int]) -> NDArray[np.bool_]:
+    """Upsample raw mask logits to image size and threshold at zero.
+
+    Approximates ``PostProcess.forward``: bilinear resize followed by ``> 0``.
+    Uses Pillow's bilinear resampling rather than ``F.interpolate`` (no PyTorch
+    dependency at inference time); border pixels may differ slightly due to
+    distinct half-pixel conventions.
+
+    Args:
+        mask_logits: Raw mask logits of shape ``(K, Hm, Wm)``; cast to float32.
+        out_size: Target ``(width, height)`` in pixels (Pillow's size order).
+
+    Returns:
+        Boolean mask array of shape ``(K, height, width)``; empty when ``K == 0``.
+
+    Raises:
+        ValueError: If *mask_logits* is not rank-3.
+    """
+    if mask_logits.ndim != 3:
+        raise ValueError(
+            f"_decode_masks expects rank-3 (K, Hm, Wm); got shape {mask_logits.shape}. "
+            "This usually means the rank-4 mask-output heuristic in _run_inference matched the wrong tensor."
+        )
+    width, height = out_size
+    out = np.empty((mask_logits.shape[0], height, width), dtype=np.bool_)
+    for i, logit_map in enumerate(mask_logits):
+        # A 2-D float32 array maps to mode "F" automatically; passing mode= is deprecated in Pillow 11.3.
+        resized = PILImage.fromarray(logit_map.astype(np.float32)).resize((width, height), _PIL_BILINEAR)
+        out[i] = np.asarray(resized) > 0.0
+    return out
+
+
 def _run_inference(
     interp: Any,
     image_path: str | Path,
@@ -75,6 +111,8 @@ def _run_inference(
     normalises the image with ImageNet statistics, invokes the model, then
     decodes the ``dets`` / ``labels`` output tensors into a
     :class:`supervision.Detections` object with pixel-space ``xyxy`` boxes.
+    For segmentation exports the ``masks`` output is also decoded into
+    ``Detections.mask``; it stays ``None`` when no detection passes *threshold*.
 
     Args:
         interp: Allocated TFLite interpreter returned by ``_create_interpreter``.
@@ -83,8 +121,8 @@ def _run_inference(
 
     Returns:
         A tuple of ``(detections, pil_img)`` where ``detections`` contains
-        pixel-space ``xyxy`` boxes and ``pil_img`` is the original PIL image
-        at its original resolution.
+        pixel-space ``xyxy`` boxes (and ``mask`` for segmentation models) and
+        ``pil_img`` is the original PIL image at its original resolution.
     """
     inp_det = interp.get_input_details()
     out_det = interp.get_output_details()
@@ -119,11 +157,11 @@ def _run_inference(
     boxes_idx = next((i for i, od in enumerate(out_det) if "dets" in str(od.get("name", ""))), None)
     logits_idx = next((i for i, od in enumerate(out_det) if "labels" in str(od.get("name", ""))), None)
     if boxes_idx is None or logits_idx is None:
-        # onnx2tf sometimes renames outputs to generic "Identity", "Identity_N" instead
-        # of preserving the original ONNX node names. Fall back to shape-based
-        # matching for the detection outputs only: boxes (*, 4) and logits
-        # (*, num_classes+1). Segmentation exports may include additional outputs
-        # such as masks; unnamed extra outputs are not resolved by this fallback.
+        # onnx2tf sometimes renames outputs to generic "Identity", "Identity_N"
+        # instead of preserving the original ONNX node names. Fall back to
+        # shape-based matching: boxes are the rank-3 tensor with last dim 4,
+        # logits the rank-3 tensor with last dim != 4. A rank-4 mask output,
+        # if present, is matched separately below.
         logger.debug(
             "Name-based output matching failed (available: %s). Falling back to shape-based matching.",
             available_output_names,
@@ -177,4 +215,22 @@ def _run_inference(
     xyxy = np.stack([cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2], axis=1)
     xyxy *= np.array([ow, oh, ow, oh], dtype=np.float32)
 
-    return sv.Detections(xyxy=xyxy, confidence=scores[keep], class_id=cls[keep].astype(int)), pil_img
+    # Segmentation exports add a rank-4 mask output; decode it when present.
+    mask_idx = next((i for i, od in enumerate(out_det) if "masks" in str(od.get("name", ""))), None)
+    if mask_idx is None:
+        rank4_candidates = [i for i, od in enumerate(out_det) if len(od["shape"]) == 4]
+        if len(rank4_candidates) == 1:
+            mask_idx = rank4_candidates[0]
+        elif len(rank4_candidates) >= 2:
+            logger.warning(
+                "Ambiguous rank-4 outputs (%d candidates); skipping mask decode. "
+                "Name your mask output to contain 'masks' to disambiguate.",
+                len(rank4_candidates),
+            )
+    masks = None
+    if mask_idx is not None and keep.any():
+        raw_masks = interp.get_tensor(out_det[mask_idx]["index"])[0]  # (Q, Hm, Wm)
+        masks = _decode_masks(raw_masks[keep], (ow, oh))
+
+    detections = sv.Detections(xyxy=xyxy, confidence=scores[keep], class_id=cls[keep].astype(int), mask=masks)
+    return detections, pil_img
diff --git a/tests/export/test_tflite_inference.py b/tests/export/test_tflite_inference.py
@@ -9,6 +9,7 @@
 Covers:
 * ``_create_interpreter()`` — interpreter loading with tflite_runtime / tensorflow fallback
 * ``_run_inference()`` — image preprocessing, invocation, and detection decoding
+* ``_decode_masks()`` — segmentation mask upsampling and thresholding
 """
 
 from __future__ import annotations
@@ -22,7 +23,7 @@
 import supervision as sv
 from PIL import Image as PILImage
 
-from rfdetr.export._tflite.inference import _create_interpreter, _run_inference
+from rfdetr.export._tflite.inference import _create_interpreter, _decode_masks, _run_inference
 
 # ---------------------------------------------------------------------------
 # Shared helpers / factories
@@ -439,3 +440,147 @@ def _get_tensor(index: int) -> np.ndarray:
         dets, _ = _run_inference(interp, rgb_image, threshold=0.3)
         assert isinstance(dets, sv.Detections)
         assert len(dets) >= 1
+
+
+# ---------------------------------------------------------------------------
+# TestMaskDecoding
+# ---------------------------------------------------------------------------
+
+
+class TestMaskDecoding:
+    """Tests for ``_decode_masks()`` and mask decoding in ``_run_inference()``."""
+
+    @pytest.fixture()
+    def rgb_image(self, tmp_path: Path) -> Path:
+        """Write a small RGB JPEG to a temp file and return its path."""
+        p = tmp_path / "image.jpg"
+        _save_rgb_image(p)
+        return p
+
+    def test_decode_masks_shape_and_dtype(self) -> None:
+        """Output shape is (K, height, width) from out_size=(width, height); dtype is bool."""
+        out = _decode_masks(np.zeros((3, 10, 10), dtype=np.float32), (40, 20))
+        assert out.shape == (3, 20, 40)
+        assert out.dtype == bool
+
+    def test_decode_masks_thresholds_at_zero(self) -> None:
+        """Positive logits decode to True, negative logits to False."""
+        logits = np.stack(
+            [
+                np.full((8, 8), 5.0, dtype=np.float32),
+                np.full((8, 8), -5.0, dtype=np.float32),
+            ]
+        )
+        out = _decode_masks(logits, (16, 16))
+        assert out[0].all()
+        assert not out[1].any()
+
+    def test_decode_masks_empty_input(self) -> None:
+        """Zero masks in yields a (0, height, width) array, not an error."""
+        out = _decode_masks(np.zeros((0, 10, 10), dtype=np.float32), (32, 32))
+        assert out.shape == (0, 32, 32)
+
+    def test_run_inference_decodes_masks_for_seg_model(self, rgb_image: Path) -> None:
+        """Generic "Identity_N" names: the lone rank-4 output is taken as masks and decoded at image size."""
+        boxes = _make_boxes()
+        logits = _make_logits(high_conf_idx=0)
+        masks = np.full((1, 10, 28, 28), -10.0, dtype=np.float32)
+        masks[0, 0] = 10.0  # query 0 (the kept detection) gets an all-positive mask
+
+        def _get_tensor(index: int) -> np.ndarray:
+            return {1: boxes, 2: logits, 3: masks}[index]
+
+        interp = mock.MagicMock()
+        interp.get_input_details.return_value = [{"shape": _INPUT_SHAPE, "index": 0, "dtype": np.float32}]
+        interp.get_output_details.return_value = [
+            {"shape": [1, 10, 4], "name": "Identity_0", "index": 1},
+            {"shape": [1, 10, 82], "name": "Identity_1", "index": 2},
+            {"shape": [1, 10, 28, 28], "name": "Identity_2", "index": 3},
+        ]
+        interp.get_tensor.side_effect = _get_tensor
+
+        dets, img = _run_inference(interp, rgb_image, threshold=0.3)
+        assert dets.mask is not None
+        assert dets.mask.shape == (len(dets), img.height, img.width)
+        assert dets.mask.dtype == bool
+        assert dets.mask[0].all()  # query 0's all-positive logits decode to a full mask
+
+    def test_run_inference_no_mask_for_detection_model(self, rgb_image: Path) -> None:
+        """A 2-output detection export leaves Detections.mask as None."""
+        interp = _make_interp(logits=_make_logits(high_conf_idx=0))
+        dets, _ = _run_inference(interp, rgb_image, threshold=0.3)
+        assert dets.mask is None
+
+    def test_run_inference_name_based_mask_detection(self, rgb_image: Path) -> None:
+        """An output whose name contains 'masks' is picked by the name-based path and sets Detections.mask."""
+        boxes = _make_boxes()
+        logits = _make_logits(high_conf_idx=0)
+        masks = np.full((1, 10, 28, 28), 10.0, dtype=np.float32)
+
+        def _get_tensor(index: int) -> np.ndarray:
+            return {1: boxes, 2: logits, 3: masks}[index]
+
+        interp = mock.MagicMock()
+        interp.get_input_details.return_value = [{"shape": _INPUT_SHAPE, "index": 0, "dtype": np.float32}]
+        interp.get_output_details.return_value = [
+            {"shape": [1, 10, 4], "name": "serving_default_dets:0", "index": 1},
+            {"shape": [1, 10, 82], "name": "serving_default_labels:0", "index": 2},
+            {"shape": [1, 10, 28, 28], "name": "serving_default_masks:0", "index": 3},
+        ]
+        interp.get_tensor.side_effect = _get_tensor
+
+        dets, _ = _run_inference(interp, rgb_image, threshold=0.3)
+        assert dets.mask is not None
+
+    def test_run_inference_seg_model_no_detections_returns_none_mask(self, rgb_image: Path) -> None:
+        """Seg model with all scores below threshold returns mask=None (keep.any() is False)."""
+        boxes = _make_boxes()
+        logits = _make_logits(high_conf_idx=None)  # all scores near zero, below threshold
+        masks = np.full((1, 10, 28, 28), 10.0, dtype=np.float32)
+
+        def _get_tensor(index: int) -> np.ndarray:
+            return {1: boxes, 2: logits, 3: masks}[index]
+
+        interp = mock.MagicMock()
+        interp.get_input_details.return_value = [{"shape": _INPUT_SHAPE, "index": 0, "dtype": np.float32}]
+        interp.get_output_details.return_value = [
+            {"shape": [1, 10, 4], "name": "Identity_0", "index": 1},
+            {"shape": [1, 10, 82], "name": "Identity_1", "index": 2},
+            {"shape": [1, 10, 28, 28], "name": "Identity_2", "index": 3},
+        ]
+        interp.get_tensor.side_effect = _get_tensor
+
+        dets, _ = _run_inference(interp, rgb_image, threshold=0.3)
+        assert len(dets) == 0
+        assert dets.mask is None
+
+    def test_decode_masks_raises_on_wrong_rank(self) -> None:
+        """_decode_masks raises ValueError when input is not rank-3."""
+        with pytest.raises(ValueError, match="rank-3"):
+            _decode_masks(np.zeros((10, 28, 28, 1), dtype=np.float32), (56, 56))
+
+    def test_decode_masks_exact_zero_logit_decodes_to_false(self) -> None:
+        """Logit exactly 0.0 is not > 0.0 and decodes to False (strict threshold)."""
+        zero_logits = np.zeros((1, 8, 8), dtype=np.float32)
+        out = _decode_masks(zero_logits, (16, 16))
+        assert not out.any()
+
+    def test_decode_masks_non_square_logit_input(self) -> None:
+        """Non-square logit map (K, Hm, Wm) with Hm != Wm resizes to the correct output shape."""
+        logits = np.full((3, 7, 14), 5.0, dtype=np.float32)
+        out = _decode_masks(logits, (56, 28))  # out_size=(width=56, height=28)
+        assert out.shape == (3, 28, 56)
+        assert out.all()  # all-positive logits → all True
+
+    def test_decode_masks_parity_positive_negative_regions(self) -> None:
+        """Positive/negative logit regions map correctly after bilinear upsample + threshold.
+
+        Uses high-magnitude logits (±10) so no ambiguity near the boundary; verifies
+        the core _decode_masks contract matches the >0 PostProcess.forward equivalent.
+        """
+        logits = np.full((1, 14, 14), -10.0, dtype=np.float32)
+        logits[0, :7, :] = 10.0  # top half strongly positive, bottom half strongly negative
+        out = _decode_masks(logits, (28, 28))
+        # Rows 6-14 straddle the sign change, where bilinear blending mixes both halves, so they are not asserted.
+        assert out[0, 1:6, :].all()  # top rows → all True
+        assert not out[0, 15:27, :].any()  # bottom rows → all False