Fix patched_sdpa_mask: accept q_length/q_offset in place of cache_position

xadupre · xadupre · commit 5d39ae1b83bc · 2026-04-03T14:06:50.000+02:00
diff --git a/_unittests/ut_export/test_api.py b/_unittests/ut_export/test_api.py
@@ -46,7 +46,7 @@ def forward(self, x, y):
 
     @hide_stdout()
     @ignore_warnings(FutureWarning)
-    @requires_transformers("4.50")
+    @requires_transformers("4.57")
     def test_tiny_llm_to_onnx(self):
         import onnxruntime
 
diff --git a/_unittests/ut_torch_export_patches/test_patch_transformers.py b/_unittests/ut_torch_export_patches/test_patch_transformers.py
@@ -74,7 +74,7 @@ def test_sdpa_mask_patched(self):
         patched_sdpa_mask = patch_transformers.patched_sdpa_mask
         kwargs = {
             "batch_size": 1,
-            "cache_position": torch.tensor([3], dtype=torch.int64),
+            "q_length": torch.tensor([3], dtype=torch.int64),
             "kv_length": 4,
             "kv_offset": 0,
             "mask_function": transformers.masking_utils.causal_mask_function,
@@ -89,7 +89,7 @@ def test_sdpa_mask_patched(self):
 
         kwargs = {
             "batch_size": 1,
-            "cache_position": torch.tensor([3], dtype=torch.int64),
+            "q_length": torch.tensor([3], dtype=torch.int64),
             "kv_length": 4,
             "kv_offset": 0,
             "mask_function": transformers.masking_utils.causal_mask_function,
diff --git a/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py b/onnx_diagnostic/torch_export_patches/patches/_patch_transformers_masking_utils.py
@@ -248,9 +248,10 @@ def patched_sdpa_mask_recent_torch(
         return attention_mask
 
     def patched_sdpa_mask(
-        batch_size: int,
-        cache_position: torch.Tensor,
-        kv_length: int,
+        batch_size: int = 0,
+        q_length: int = 0,
+        kv_length: int = 0,
+        q_offset: int = 0,
         kv_offset: int = 0,
         mask_function: Callable = causal_mask_function,
         attention_mask: torch.Tensor | None = None,
@@ -262,7 +263,79 @@ def patched_sdpa_mask(
         **kwargs,
     ) -> torch.Tensor | None:
         """manual patch for function ``transformers.masking_utils.sdpa_mask``."""
-        q_length = cache_position.shape[0]
+        if isinstance(q_length, torch.Tensor):
+            # Backward compatibility: a tensor passed as `q_length` is treated as the
+            # deprecated `cache_position` argument (to be removed in Transformers v5.6).
+            # Prefer `q_length` and `q_offset`, similarly to `kv_length` and `kv_offset`.
+            cache_position = q_length
+            device = q_length.device
+            q_length = q_length.shape[0]
+
+            # Potentially pad the 2D mask
+            padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+
+            # Under specific conditions, we can avoid materializing the mask
+            #   1. Causal masks can rely on the `is_causal` argument
+            #   2. Bidirectional do not need any further processing (no bias)
+            if allow_is_causal_skip and _ignore_causal_mask_sdpa(
+                padding_mask, q_length, kv_length, kv_offset, local_size
+            ):
+                return None
+            if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(
+                padding_mask, kv_length, local_size
+            ):
+                return None
+
+            # Potentially add the padding 2D mask
+            if padding_mask is not None:
+                mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+
+            batch_arange = torch.arange(batch_size, device=device)
+            head_arange = torch.arange(1, device=device)
+            # Similar to `kv_arange = torch.arange(start=kv_offset,
+            # end=kv_offset + kv_length, device=cache_position.device)`
+            # but without data-dependent slicing (i.e. torch.compile friendly)
+            kv_arange = torch.arange(kv_length, device=device) + kv_offset
+
+            # Actual mask creation
+            # Option 1: Fast non-vmap mask creation (default)
+            # PATCHED
+            use_vmap = False
+            if not use_vmap:
+                # Apply mask function element-wise through broadcasting
+                attention_mask = mask_function(
+                    *_non_vmap_expansion_sdpa(
+                        batch_arange, head_arange, cache_position, kv_arange
+                    )
+                )
+                # Expand the mask to match batch size
+                # and query length if they weren't used in the mask function
+                attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
+
+            # Option 2: Vmap mask creation (torch>=2.6 and custom patterns)
+            # elif _is_torch_greater_or_equal_than_2_6:
+            # This creates the 4D mask easily.
+            # Note that we need this context manager as vmap
+            # cannot handle slicing a tensor from a
+            # scalar tensor (it internally calls `.item()`, which vmap does not allow,
+            # but this context works around it).
+            # We don't need to add an offset to the mask_function either,
+            # as we vmap directly the correct indices for q and kv indices
+            #    with TransformGetItemToIndex():
+            #        attention_mask = _vmap_expansion_sdpa(mask_function)(
+            #            batch_arange, head_arange, cache_position, kv_arange
+            #        )
+
+            # Option 3: Error out since it indicates that the user did something custom,
+            # which they shouldn't have (torch<2.6)
+            else:
+                raise ValueError(
+                    "The vmap functionality for mask creation "
+                    "is only supported from torch>=2.6. "
+                    "Please update your torch version or use "
+                    "`use_vmap=False` with index-based masks."
+                )
+            return attention_mask
 
         # Potentially pad the 2D mask
         padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
@@ -283,46 +356,17 @@ def patched_sdpa_mask(
         if padding_mask is not None:
             mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
 
-        batch_arange = torch.arange(batch_size, device=cache_position.device)
-        head_arange = torch.arange(1, device=cache_position.device)
-        # Similar to `kv_arange = torch.arange(start=kv_offset,
-        # end=kv_offset + kv_length, device=cache_position.device)`
-        # but without data-dependent slicing (i.e. torch.compile friendly)
-        kv_arange = torch.arange(kv_length, device=cache_position.device) + kv_offset
+        batch_arange = torch.arange(batch_size, device=device)
+        head_arange = torch.arange(1, device=device)
+        q_arange = torch.arange(q_length, device=device) + q_offset
+        kv_arange = torch.arange(kv_length, device=device) + kv_offset
+
+        # Apply mask function element-wise through broadcasting
+        attention_mask = mask_function(
+            *_non_vmap_expansion_sdpa(batch_arange, head_arange, q_arange, kv_arange)
+        )
+        # Expand the mask to match batch size and query
+        # length if they weren't used in the mask function
+        attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
 
-        # Actual mask creation
-        # Option 1: Fast non-vmap mask creation (default)
-        # PATCHED
-        use_vmap = False
-        if not use_vmap:
-            # Apply mask function element-wise through broadcasting
-            attention_mask = mask_function(
-                *_non_vmap_expansion_sdpa(batch_arange, head_arange, cache_position, kv_arange)
-            )
-            # Expand the mask to match batch size
-            # and query length if they weren't used in the mask function
-            attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
-
-        # Option 2: Vmap mask creation (torch>=2.6 and custom patterns)
-        # elif _is_torch_greater_or_equal_than_2_6:
-        # This creates the 4D mask easily.
-        # Note that we need this context manager as vmap cannot handle slicing a tensor from
-        # scalar tensor (it internally calls `.item()` which vmap does not allow,
-        # but this context works around it
-        # We don't need to add an offset to the mask_function either,
-        # as we vmap directly the correct indices for k and kv indices
-        #    with TransformGetItemToIndex():
-        #        attention_mask = _vmap_expansion_sdpa(mask_function)(
-        #            batch_arange, head_arange, cache_position, kv_arange
-        #        )
-
-        # Option 3: Error out since it indicates that the user did something custom,
-        # which they shouldn't have (torch<2.6)
-        else:
-            raise ValueError(
-                "The vmap functionality for mask creation "
-                "is only supported from torch>=2.6. "
-                "Please update your torch version or use "
-                "`use_vmap=False` with index-based masks."
-            )
         return attention_mask