From e825de57c080ae53eecb51b80f1a4a7ad47b5afb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 24 Feb 2026 10:03:14 +0100
Subject: [PATCH 01/35] Bump github/codeql-action from 4.32.3 to 4.32.4 (#2780)

Bumps [github/codeql-action](https://github.com/github/codeql-action)
from 4.32.3 to 4.32.4.
---
 .github/workflows/openssf-scorecard.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index 8bf5e86d03ed..6ca1f6682784 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -72,6 +72,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@9e907b5e64f6b83e7804b09294d44122997950d6 # v4.32.3
+        uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4
         with:
           sarif_file: results.sarif

From d545555e133d2ca233aa8f01d895f8992be02723 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:38:47 +0100
Subject: [PATCH 02/35] Weekly pre-commit autoupdate (#2779)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ace139f8d179..008adb4589d0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -69,7 +69,7 @@ repos:
     -   id: black
         exclude: "dpnp/_version.py"
 -   repo: https://github.com/pycqa/isort
-    rev: 7.0.0
+    rev: 8.0.0
     hooks:
     -   id: isort
         name: isort (python)
@@ -123,11 +123,11 @@ repos:
     -   id: pretty-format-toml
         args: [--autofix]
 -   repo: https://github.com/rhysd/actionlint
-    rev: v1.7.10
+    rev: v1.7.11
     hooks:
     -   id: actionlint
 -   repo: https://github.com/BlankSpruce/gersemi
-    rev: 0.25.4
+    rev: 0.26.0
     hooks:
     -   id: gersemi
         exclude: "dpnp/backend/cmake/Modules/"

From 4ca0ffa1fffcd997609583ee67c9f7b56566c3ec Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 2 Mar 2026 17:56:24 +0100
Subject: [PATCH 03/35] Weekly pre-commit autoupdate (#2788)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 008adb4589d0..66245039ce3c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
 -   repo: https://github.com/PyCQA/bandit
-    rev: '1.9.3'
+    rev: '1.9.4'
     hooks:
     -   id: bandit
         pass_filenames: false

From 783f29813629cda5b3834ae72e1c90464d8095a0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 2 Mar 2026 19:01:51 +0100
Subject: [PATCH 04/35] Bump actions/download-artifact from 7.0.0 to 8.0.0
 (#2790)

Bumps
[actions/download-artifact](https://github.com/actions/download-artifact)
from 7.0.0 to 8.0.0.
---
 .github/workflows/check-onemath.yaml |  4 ++--
 .github/workflows/conda-package.yml  | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index 7b18cdfaba64..9a6e92bfc36b 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -87,7 +87,7 @@ jobs:
           fetch-depth: 0
 
       - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.environment-file-name }}
           path: ${{ env.environment-file-loc }}
@@ -181,7 +181,7 @@ jobs:
           fetch-depth: 0
 
       - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.environment-file-name }}
           path: ${{ env.environment-file-loc }}
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index d2ac90621aaa..2ef8e42ca705 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -151,7 +151,7 @@ jobs:
           path: ${{ env.dpnp-repo-path }}
 
       - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.pkg-path-in-channel }}
@@ -280,7 +280,7 @@ jobs:
           path: ${{ env.dpnp-repo-path }}
 
       - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.pkg-path-in-channel }}
@@ -439,12 +439,12 @@ jobs:
           fetch-depth: ${{ env.fetch-depth }}
 
       - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
 
       - name: Download wheels artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
 
@@ -528,7 +528,7 @@ jobs:
           path: ${{ env.dpnp-repo-path }}
 
       - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ env.python-ver }}
           path: ${{ env.pkg-path-in-channel }}

From fd64f3cb7d0d9cbaab62b2a162a962e3880750c2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 2 Mar 2026 20:27:59 +0100
Subject: [PATCH 05/35] Bump actions/upload-artifact from 6.0.0 to 7.0.0
 (#2789)

Bumps
[actions/upload-artifact](https://github.com/actions/upload-artifact)
from 6.0.0 to 7.0.0.
---
 .github/workflows/check-onemath.yaml    | 2 +-
 .github/workflows/conda-package.yml     | 4 ++--
 .github/workflows/openssf-scorecard.yml | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index 9a6e92bfc36b..409117c692b9 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -57,7 +57,7 @@ jobs:
           cat ${{ env.environment-file }}
 
       - name: Upload artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: ${{ env.environment-file-name }}
           path: ${{ env.environment-file }}
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 2ef8e42ca705..a12486300aa0 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -102,13 +102,13 @@ jobs:
           MAX_BUILD_CMPL_MKL_VERSION: '2026.0a0'
 
       - name: Upload artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.CONDA_BLD }}${{ env.package-name }}-*.conda
 
       - name: Upload wheels artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
           path: ${{ env.WHEELS_OUTPUT_FOLDER }}${{ env.package-name }}-*.whl
diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index 6ca1f6682784..8b4cc3b93f64 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -64,7 +64,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: SARIF file
           path: results.sarif

From 7803d3a783e0ea9a00ed6e2aeb31eeab1132e5b2 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Tue, 3 Mar 2026 13:21:00 +0100
Subject: [PATCH 06/35] Mute expected runtime warnings raised in tests (#2792)

The PR marks tests that emit an expected `RuntimeWarning` so that the
warning is suppressed.
The PR also updates the DLPack tests to avoid raising the warning in
SYCL queue related tests.
---
 dpnp/tests/test_mathematical.py | 1 +
 dpnp/tests/test_special.py      | 3 +++
 dpnp/tests/test_sycl_queue.py   | 4 ++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index 760c1a0ceb2e..e1f32bbd7931 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -1733,6 +1733,7 @@ def test_nan_infs_complex(self):
 
 
 class TestSpacing:
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
     @pytest.mark.parametrize("sign", [1, -1])
     @pytest.mark.parametrize("dt", get_float_dtypes())
     def test_basic(self, sign, dt):
diff --git a/dpnp/tests/test_special.py b/dpnp/tests/test_special.py
index 44b2d38e6919..1ebb64d8da7f 100644
--- a/dpnp/tests/test_special.py
+++ b/dpnp/tests/test_special.py
@@ -94,6 +94,9 @@ def _check_variant_func(
             # calling numpy testing func, because it's more verbose
             assert_allclose(x.asnumpy(), y.asnumpy(), rtol=rtol, atol=atol)
 
+    @pytest.mark.usefixtures(
+        "suppress_overflow_encountered_in_cast_numpy_warnings"
+    )
     def test_erfc(self, inverse):
         self._check_variant_func(
             inverse,
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index d1853579036a..699cd81c96f6 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1086,7 +1086,7 @@ def test_array_creation_from_dpctl(copy, device):
 @pytest.mark.parametrize("arr_dtype", get_all_dtypes(no_float16=True))
 @pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)])
 def test_from_dlpack(arr_dtype, shape, device):
-    X = dpnp.empty(shape=shape, dtype=arr_dtype, device=device)
+    X = dpnp.ones(shape=shape, dtype=arr_dtype, device=device)
     Y = dpnp.from_dlpack(X)
     assert_array_equal(X, Y)
     assert X.__dlpack_device__() == Y.__dlpack_device__()
@@ -1101,7 +1101,7 @@ def test_from_dlpack(arr_dtype, shape, device):
 @pytest.mark.parametrize("device", valid_dev, ids=dev_ids)
 @pytest.mark.parametrize("arr_dtype", get_all_dtypes(no_float16=True))
 def test_from_dlpack_with_dpt(arr_dtype, device):
-    X = dpctl.tensor.empty((64,), dtype=arr_dtype, device=device)
+    X = dpt.ones((64,), dtype=arr_dtype, device=device)
     Y = dpnp.from_dlpack(X)
     assert_array_equal(X, Y)
     assert isinstance(Y, dpnp.dpnp_array.dpnp_array)

From 2c864972d7ce576c82b8c8a2393d38c1c0b384f3 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Wed, 4 Mar 2026 13:20:24 -0600
Subject: [PATCH 07/35] Add `scipy.linalg.lu()` decomposition support (#2787)

This PR adds `dpnp.scipy.linalg.lu()` with support for all three output
modes: default `(P, L, U)`, `permute_l=True (PL, U)`, and
`p_indices=True` `(p, L, U)`, including batched inputs.

Fixes: https://github.com/IntelPython/dpnp/issues/2786
---
 CHANGELOG.md                                  |   1 +
 dpnp/scipy/linalg/__init__.py                 |   3 +-
 dpnp/scipy/linalg/_decomp_lu.py               | 153 +++++-
 dpnp/scipy/linalg/_utils.py                   | 255 +++++++++-
 dpnp/tests/test_linalg.py                     | 450 ++++++++++++++++++
 dpnp/tests/test_sycl_queue.py                 |  12 +
 dpnp/tests/test_usm_type.py                   |  12 +
 .../linalg_tests/test_decomp_lu.py            |  15 +-
 8 files changed, 886 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f177be311f84..61cde1ddfefc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.ndarray.__bytes__` method [#2671](https://github.com/IntelPython/dpnp/pull/2671)
 * Added implementation of `dpnp.divmod` [#2674](https://github.com/IntelPython/dpnp/pull/2674)
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
+* Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
 
 ### Changed
 
diff --git a/dpnp/scipy/linalg/__init__.py b/dpnp/scipy/linalg/__init__.py
index 3afc08a6fdb9..81eadd890fa9 100644
--- a/dpnp/scipy/linalg/__init__.py
+++ b/dpnp/scipy/linalg/__init__.py
@@ -35,9 +35,10 @@
 
 """
 
-from ._decomp_lu import lu_factor, lu_solve
+from ._decomp_lu import lu, lu_factor, lu_solve
 
 __all__ = [
+    "lu",
     "lu_factor",
     "lu_solve",
 ]
diff --git a/dpnp/scipy/linalg/_decomp_lu.py b/dpnp/scipy/linalg/_decomp_lu.py
index 292d7fffe4b4..823b2fccc230 100644
--- a/dpnp/scipy/linalg/_decomp_lu.py
+++ b/dpnp/scipy/linalg/_decomp_lu.py
@@ -46,11 +46,154 @@
 )
 
 from ._utils import (
+    dpnp_lu,
     dpnp_lu_factor,
     dpnp_lu_solve,
 )
 
 
+def lu(
+    a, permute_l=False, overwrite_a=False, check_finite=True, p_indices=False
+):
+    """
+    Compute LU decomposition of a matrix with partial pivoting.
+
+    The decomposition satisfies::
+
+        A = P @ L @ U
+
+    where `P` is a permutation matrix, `L` is lower triangular with unit
+    diagonal elements, and `U` is upper triangular. If `permute_l` is set to
+    ``True`` then `L` is returned already permuted and hence satisfying
+    ``A = L @ U``.
+
+    For full documentation refer to :obj:`scipy.linalg.lu`.
+
+    Parameters
+    ----------
+    a : (..., M, N) {dpnp.ndarray, usm_ndarray}
+        Input array to decompose.
+    permute_l : bool, optional
+        Perform the multiplication ``P @ L`` (Default: do not permute).
+
+        Default: ``False``.
+    overwrite_a : {None, bool}, optional
+        Whether to overwrite data in `a` (may increase performance).
+
+        Default: ``False``.
+    check_finite : {None, bool}, optional
+        Whether to check that the input matrix contains only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+
+        Default: ``True``.
+    p_indices : bool, optional
+        If ``True`` the permutation information is returned as row indices
+        instead of a permutation matrix.
+
+        Default: ``False``.
+
+    Returns
+    -------
+    **(If ``permute_l`` is ``False``)**
+
+    p : (..., M, M) dpnp.ndarray or (..., M) dpnp.ndarray
+        If `p_indices` is ``False`` (default), the permutation matrix.
+        The permutation matrix always has a real dtype (``float32`` or
+        ``float64``) even when `a` is complex, since it only contains
+        0s and 1s.
+        If `p_indices` is ``True``, a 1-D (or batched) array of row
+        permutation indices such that ``A = L[p] @ U``.
+    l : (..., M, K) dpnp.ndarray
+        Lower triangular or trapezoidal matrix with unit diagonal.
+        ``K = min(M, N)``.
+    u : (..., K, N) dpnp.ndarray
+        Upper triangular or trapezoidal matrix.
+
+    **(If ``permute_l`` is ``True``)**
+
+    pl : (..., M, K) dpnp.ndarray
+        Permuted ``L`` matrix: ``pl = P @ L``.
+        ``K = min(M, N)``.
+    u : (..., K, N) dpnp.ndarray
+        Upper triangular or trapezoidal matrix.
+
+    Notes
+    -----
+    Permutation matrices are costly since they merely reorder the rows of
+    ``L``; if the permutation is required, using indices instead is strongly
+    recommended. The relation in the 2D case then becomes simply
+    ``A = L[P, :] @ U``. In higher dimensions, it is better to use `permute_l`
+    to avoid complicated indexing tricks.
+
+    In the 2D case, if the indices are available but the permutation matrix
+    is still needed for some reason, it can be constructed with
+    ``dpnp.eye(M)[P, :]``.
+
+    Warning
+    -------
+    This function synchronizes in order to validate array elements
+    when ``check_finite=True``, and also synchronizes to compute the
+    permutation from LAPACK pivot indices.
+
+    See Also
+    --------
+    :obj:`dpnp.scipy.linalg.lu_factor` : LU factorize a matrix
+                                         (compact representation).
+    :obj:`dpnp.scipy.linalg.lu_solve` : Solve an equation system using
+                                        the LU factorization of a matrix.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> A = np.array([[2, 5, 8, 7], [5, 2, 2, 8],
+    ...               [7, 5, 6, 6], [5, 4, 4, 8]])
+    >>> p, l, u = np.scipy.linalg.lu(A)
+    >>> np.allclose(A, p @ l @ u)
+    array(True)
+
+    Retrieve the permutation as row indices with ``p_indices=True``:
+
+    >>> p, l, u = np.scipy.linalg.lu(A, p_indices=True)
+    >>> p
+    array([1, 3, 0, 2])
+    >>> np.allclose(A, l[p] @ u)
+    array(True)
+
+    Return the permuted ``L`` directly with ``permute_l=True``:
+
+    >>> pl, u = np.scipy.linalg.lu(A, permute_l=True)
+    >>> np.allclose(A, pl @ u)
+    array(True)
+
+    Non-square matrices are supported:
+
+    >>> B = np.array([[1, 2, 3], [4, 5, 6]])
+    >>> p, l, u = np.scipy.linalg.lu(B)
+    >>> np.allclose(B, p @ l @ u)
+    array(True)
+
+    Batched input:
+
+    >>> C = np.random.randn(3, 2, 4, 4)
+    >>> p, l, u = np.scipy.linalg.lu(C)
+    >>> np.allclose(C, p @ l @ u)
+    array(True)
+
+    """
+
+    dpnp.check_supported_arrays_type(a)
+    assert_stacked_2d(a)
+
+    return dpnp_lu(
+        a,
+        overwrite_a=overwrite_a,
+        check_finite=check_finite,
+        p_indices=p_indices,
+        permute_l=permute_l,
+    )
+
+
 def lu_factor(a, overwrite_a=False, check_finite=True):
     """
     Compute the pivoted LU decomposition of `a` matrix.
@@ -180,13 +323,13 @@ def lu_solve(lu_and_piv, b, trans=0, overwrite_b=False, check_finite=True):
 
     """
 
-    lu, piv = lu_and_piv
-    dpnp.check_supported_arrays_type(lu, piv, b)
-    assert_stacked_2d(lu)
-    assert_stacked_square(lu)
+    lu_matrix, piv = lu_and_piv
+    dpnp.check_supported_arrays_type(lu_matrix, piv, b)
+    assert_stacked_2d(lu_matrix)
+    assert_stacked_square(lu_matrix)
 
     return dpnp_lu_solve(
-        lu,
+        lu_matrix,
         piv,
         b,
         trans=trans,
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index f00db6fdfb92..d083f1c2c0a2 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -49,7 +49,7 @@
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
 from dpnp.dpnp_utils import get_usm_allocations
-from dpnp.linalg.dpnp_utils_linalg import _common_type
+from dpnp.linalg.dpnp_utils_linalg import _common_type, _real_type
 
 
 def _align_lu_solve_broadcast(lu, b):
@@ -83,6 +83,48 @@ def _align_lu_solve_broadcast(lu, b):
     return lu, b
 
 
+def _apply_permutation_to_rows(mat, perm_indices):
+    """
+    Apply a permutation to the rows (axis=-2) of a matrix.
+
+    Returns ``out`` such that
+    ``out[..., i, :] = mat[..., perm_indices[..., i], :]``.
+
+    For 2-D inputs this is equivalent to ``mat[perm_indices]`` (a single
+    device gather).  For batched inputs :func:`dpnp.take_along_axis` is
+    used so the operation stays entirely on the device.
+
+    Parameters
+    ----------
+    mat : dpnp.ndarray, shape (..., M, N)
+        Matrix whose rows are to be permuted.
+    perm_indices : dpnp.ndarray, shape (..., M)
+        Permutation indices (dtype int64).
+
+    Returns
+    -------
+    out : dpnp.ndarray, shape (..., M, N)
+        Row-permuted matrix.
+    """
+
+    if perm_indices.ndim == 1:
+        # 2-D case: simple fancy indexing, single kernel launch.
+        return mat[perm_indices]
+
+    # Batched case: ensure *mat* has the same batch dimensions as
+    # *perm_indices*. This is needed, for example, when permuting
+    # a shared identity matrix across a batch.
+    target_shape = perm_indices.shape[:-1] + mat.shape[-2:]
+    if mat.shape != target_shape:
+        mat = dpnp.broadcast_to(mat, target_shape)
+
+    # Expand (..., M) → (..., M, 1), then broadcast to the full shape
+    # of *mat* so take_along_axis can gather along axis -2.
+    idx = dpnp.expand_dims(perm_indices, axis=-1)
+    idx = dpnp.broadcast_to(idx, target_shape).copy()
+    return dpnp.take_along_axis(mat, idx, axis=-2)
+
+
 def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
     """SciPy-compatible LU factorization for batched inputs."""
 
@@ -338,6 +380,71 @@ def _map_trans_to_mkl(trans):
     raise ValueError("`trans` must be 0 (N), 1 (T), or 2 (C)")
 
 
+def _pivots_to_permutation(piv, m):
+    """
+    Convert 0-based LAPACK pivot indices (sequential row swaps)
+    to a permutation array.
+
+    The returned permutation ``perm`` satisfies ``A[perm] = L @ U``
+    (i.e. the forward row-permutation produced by LAPACK).
+
+    The computation is performed entirely on the device.  A host-side
+    Python loop of ``K = min(M, N)`` iterations drives the sequential
+    swap logic, but each iteration only launches device kernels
+    (:func:`dpnp.take_along_axis` for gather,
+    :func:`dpnp.put_along_axis` for scatter); **no data is transferred
+    between host and device**.
+
+    .. note::
+
+        A future custom SYCL kernel could fuse all ``K`` swap steps
+        into a single launch to eliminate per-step kernel overhead.
+
+    Parameters
+    ----------
+    piv : dpnp.ndarray, shape (..., K)
+        0-based pivot indices as returned by :obj:`dpnp_lu_factor`.
+    m : int
+        Number of rows of the original matrix.
+
+    Returns
+    -------
+    perm : dpnp.ndarray, shape (..., M), dtype int64
+        Permutation indices.
+    """
+
+    batch_shape = piv.shape[:-1]
+    k = piv.shape[-1]
+
+    # Initialise the identity permutation on the device.
+    perm = dpnp.broadcast_to(
+        dpnp.arange(
+            m,
+            dtype=dpnp.int64,
+            usm_type=piv.usm_type,
+            sycl_queue=piv.sycl_queue,
+        ),
+        (*batch_shape, m),
+    ).copy()
+
+    # Apply sequential row swaps entirely on the device.
+    # Each iteration launches a small number of device kernels (gather +
+    # slice-assign + scatter) but never transfers data to the host.
+    for i in range(k):
+        # Pivot target for step *i*: shape (..., 1)
+        j = piv[..., i : i + 1]
+
+        # Gather the two values to be swapped.
+        val_i = perm[..., i : i + 1].copy()  # slice (free)
+        val_j = dpnp.take_along_axis(perm, j, axis=-1)  # gather
+
+        # Perform the swap.
+        perm[..., i : i + 1] = val_j  # slice assign
+        dpnp.put_along_axis(perm, j, val_i, axis=-1)  # scatter
+
+    return perm
+
+
 def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     """
     dpnp_lu_factor(a, overwrite_a=False, check_finite=True)
@@ -432,6 +539,152 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     return (a_h, ipiv_h)
 
 
+def _assemble_lu_output(
+    low,
+    up,
+    inv_perm,
+    permute_l,
+    p_indices,
+    m,
+    real_type,
+    a_usm_type,
+    a_sycl_queue,
+):
+    """Select and build the correct dpnp_lu return value."""
+    if permute_l:
+        return _apply_permutation_to_rows(low, inv_perm), up
+    if p_indices:
+        return inv_perm, low, up
+    eye_m = dpnp.eye(
+        m, dtype=real_type, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+    )
+    return (
+        _apply_permutation_to_rows(eye_m, inv_perm),
+        low,
+        up,
+    )  # perm_matrix, L, U
+
+
+def dpnp_lu(
+    a,
+    overwrite_a=False,
+    check_finite=True,
+    p_indices=False,
+    permute_l=False,
+):
+    """
+    dpnp_lu(a, overwrite_a=False, check_finite=True, p_indices=False,
+            permute_l=False)
+
+    Compute pivoted LU decomposition and return separate P, L, U matrices
+    (SciPy-compatible behavior).
+
+    This function mimics the behavior of `scipy.linalg.lu` including
+    support for `permute_l`, `p_indices`, `overwrite_a`, and `check_finite`.
+
+    """
+
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+    m, n = a.shape[-2:]
+    k = min(m, n)
+    batch_shape = a.shape[:-2]
+
+    res_type = _common_type(a)
+
+    # The permutation matrix P uses a real dtype (SciPy convention):
+    # P only contains 0s and 1s, so complex storage would be wasteful.
+    real_type = _real_type(res_type)
+
+    # ---- Fast path: scalar (1x1) matrices ----
+    # For 1x1 input, P = I, L = I, U = A.  This avoids invoking LAPACK
+    # entirely (matches SciPy's scalar fast path).
+    if m == 1 and n == 1:
+        if check_finite:
+            if not dpnp.isfinite(a).all():
+                raise ValueError("array must not contain infs or NaNs")
+
+        low = dpnp.ones_like(a, dtype=res_type)
+        up = dpnp.astype(a, res_type, copy=not overwrite_a)
+        inv_perm = dpnp.zeros_like(a, shape=(*batch_shape, 1), dtype=dpnp.int64)
+
+        return _assemble_lu_output(
+            low,
+            up,
+            inv_perm,
+            permute_l,
+            p_indices,
+            m,
+            real_type,
+            a_usm_type,
+            a_sycl_queue,
+        )
+
+    # ---- Fast path: empty arrays ----
+    if a.size == 0:
+        low = dpnp.empty_like(a, shape=(*batch_shape, m, k), dtype=res_type)
+        up = dpnp.empty_like(a, shape=(*batch_shape, k, n), dtype=res_type)
+        inv_perm = dpnp.empty_like(a, shape=(*batch_shape, m), dtype=dpnp.int64)
+        return _assemble_lu_output(
+            low,
+            up,
+            inv_perm,
+            permute_l,
+            p_indices,
+            m,
+            real_type,
+            a_usm_type,
+            a_sycl_queue,
+        )
+
+    # ---- General case: LAPACK factorization ----
+    lu_compact, piv = dpnp_lu_factor(
+        a, overwrite_a=overwrite_a, check_finite=check_finite
+    )
+
+    # ---- Extract L: lower-triangular with unit diagonal ----
+    # L has shape (..., M, K).
+    low = dpnp.tril(lu_compact[..., :, :k], k=-1)
+    low += dpnp.eye(
+        m,
+        k,
+        dtype=lu_compact.dtype,
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+
+    # ---- Extract U: upper-triangular ----
+    # U has shape (..., K, N).
+    up = dpnp.triu(lu_compact[..., :k, :])
+
+    # ---- Convert pivot indices → row permutation ----
+    # ``perm`` (forward): A[perm] = L @ U.
+    # LAPACK pivots encode sequential row swaps, so the conversion is
+    # driven by a host-side loop of min(M, N) steps; each step only
+    # launches device kernels and the pivot array itself is never
+    # copied to the host (see ``_pivots_to_permutation``).
+    perm = _pivots_to_permutation(piv, m)
+
+    # ``inv_perm`` (inverse): A = L[inv_perm] @ U.
+    # This is SciPy's ``p_indices`` convention.
+    # ``dpnp.argsort`` inverts the permutation with an on-device
+    # O(M log M) sort, keeping this step on the device as well.
+    inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
+
+    # ---- Assemble output (SciPy convention) ----
+    return _assemble_lu_output(
+        low,
+        up,
+        inv_perm,
+        permute_l,
+        p_indices,
+        m,
+        real_type,
+        a_usm_type,
+        a_sycl_queue,
+    )
+
+
 def dpnp_lu_solve(lu, piv, b, trans=0, overwrite_b=False, check_finite=True):
     """
     dpnp_lu_solve(lu, piv, b, trans=0, overwrite_b=False, check_finite=True)
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 31d99d71ce49..7d8018fa83a2 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2605,6 +2605,456 @@ def test_invalid_shapes(self, a_shape, b_shape):
             dpnp.scipy.linalg.lu_solve((lu, piv), b, check_finite=False)
 
 
+class TestLu:
+    @staticmethod
+    def _make_nonsingular_np(shape, dtype, order):
+        # Overwrite A[d, d] with (sum of |other entries in row d|) + 1.
+        A = generate_random_numpy_array(shape, dtype, order)
+        n_rows, n_cols = shape
+        for d in range(min(n_rows, n_cols)):
+            others = numpy.abs(A[d, :n_cols]).sum() - numpy.abs(A[d, d])
+            A[d, d] = A.dtype.type(others + 1.0)
+        return A
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_lu_default(self, shape, order, dtype):
+        a_np = self._make_nonsingular_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        # Default mode: lu() returns the triple (P, L, U).
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        # Expected shapes with K = min(M, N): P (M, M), L (M, K), U (K, N).
+        m, n = shape
+        k = min(m, n)
+        assert P.shape == (m, m)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+        # Reconstruct and compare against the input cast to L's dtype.
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_lu_permute_l(self, shape, order, dtype):
+        a_np = self._make_nonsingular_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        # permute_l=True: lu() returns the pair (PL, U) instead of a triple.
+        PL, U = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        # Expected shapes with K = min(M, N): PL (M, K), U (K, N).
+        m, n = shape
+        k = min(m, n)
+        assert PL.shape == (m, k)
+        assert U.shape == (k, n)
+        # Reconstruct and compare against the input cast to PL's dtype.
+        A_cast = a_dp.astype(PL.dtype, copy=False)
+        A_rec = PL @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_lu_p_indices(self, shape, order, dtype):
+        a_np = self._make_nonsingular_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        # p_indices=True: the first result is an index vector, not a matrix.
+        p, L, U = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        # Expected shapes with K = min(M, N): p (M,), L (M, K), U (K, N).
+        m, n = shape
+        k = min(m, n)
+        assert p.shape == (m,)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+        assert dpnp.issubdtype(p.dtype, dpnp.integer)
+        # p reorders the rows of L: the check below is A == L[p] @ U.
+        A_rec = L[p] @ U
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("in_dtype", get_float_complex_dtypes())
+    def test_p_matrix_dtype(self, in_dtype):
+        expected_p_dtype = numpy.dtype(in_dtype).char.lower()
+
+        a_np = self._make_nonsingular_np((4, 4), in_dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+
+        assert P.dtype == expected_p_dtype
+        assert dpnp.issubdtype(P.dtype, dpnp.floating)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_indices_dtype(self, dtype):
+        a_np = self._make_nonsingular_np((4, 4), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        p, _, _ = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        assert dpnp.issubdtype(p.dtype, dpnp.integer)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_l_structure(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        _, L, _ = dpnp.scipy.linalg.lu(a_dp)
+        L_np = dpnp.asnumpy(L)
+        # Unit diagonal: compare the entries themselves, not their moduli,
+        # so that a diagonal of -1 or 1j cannot pass as "unit".
+        diag = numpy.diag(L_np)
+        assert_allclose(diag, numpy.ones(5, dtype=diag.dtype))
+        # Strictly upper part must vanish, i.e. L is lower triangular.
+        assert_allclose(numpy.triu(L_np, 1), numpy.zeros_like(L_np))
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_u_upper_triangular(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        _, _, U = dpnp.scipy.linalg.lu(a_dp)
+        U_np = dpnp.asnumpy(U)
+        assert_allclose(numpy.tril(U_np, -1), numpy.zeros_like(U_np))
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_is_permutation(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, _, _ = dpnp.scipy.linalg.lu(a_dp)
+        P_np = dpnp.asnumpy(P)
+        # Every column and row of P must sum to 1, and P.T @ P must be I.
+        assert_allclose(P_np.sum(axis=0), numpy.ones(5, dtype=P_np.dtype))
+        assert_allclose(P_np.sum(axis=1), numpy.ones(5, dtype=P_np.dtype))
+        assert_allclose(
+            P_np.T @ P_np, numpy.eye(5, dtype=P_np.dtype), atol=1e-15
+        )
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_modes_consistency(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        # Factor the same matrix in all three output modes.
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        PL, U2 = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        p, L3, U3 = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        # Rebuild A from each result; the p_indices case is done in NumPy.
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A1 = P @ L @ U
+        A2 = PL @ U2
+        p_np = dpnp.asnumpy(p)
+        A3_np = dpnp.asnumpy(L3)[p_np] @ dpnp.asnumpy(U3)
+        # All three reconstructions must match the input (in L's dtype).
+        assert dpnp.allclose(A1, A_cast, rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A2, A_cast, rtol=1e-6, atol=1e-6)
+        assert_allclose(A3_np, dpnp.asnumpy(A_cast), rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_times_l_equals_pl(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, L, _ = dpnp.scipy.linalg.lu(a_dp)
+        PL, _ = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        assert dpnp.allclose(P @ L, PL, rtol=1e-12, atol=1e-12)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_indices_to_matrix(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, _, _ = dpnp.scipy.linalg.lu(a_dp)
+        p, _, _ = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        P_from_idx = dpnp.eye(5, dtype=P.dtype)[p]
+        assert dpnp.allclose(P_from_idx, P, rtol=1e-15, atol=1e-15)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_overwrite_a_false(self, dtype):
+        a_dp = dpnp.array([[4, 3], [6, 3]], dtype=dtype, order="F")
+        a_dp_orig = a_dp.copy()
+        dpnp.scipy.linalg.lu(a_dp, overwrite_a=False)
+        assert dpnp.allclose(a_dp, a_dp_orig)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_inputs(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        m, n = shape
+        k = min(m, n)
+        assert P.shape == (m, m)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_permute_l(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        PL, U = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        m, n = shape
+        k = min(m, n)
+        assert PL.shape == (m, k)
+        assert U.shape == (k, n)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_p_indices(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        p, L, U = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        m, n = shape
+        k = min(m, n)
+        assert p.shape == (m,)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+
+    @pytest.mark.parametrize(
+        "sl",
+        [
+            (slice(None, None, 2), slice(None, None, 2)),
+            (slice(None, None, -1), slice(None, None, -1)),
+        ],
+    )
+    def test_strided(self, sl):
+        base = self._make_nonsingular_np((7, 7), dpnp.default_float_type(), "F")
+        a_np = base[sl]
+        a_dp = dpnp.array(a_np)
+        # NOTE(review): confirm dpnp.array() keeps a_np's strides on device.
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, a_dp, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_singular_matrix(self):
+        a_np = numpy.array([[1.0, 2.0], [2.0, 4.0]])
+        a_dp = dpnp.array(a_np)
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        A_rec = dpnp.asnumpy(P @ L @ U)
+        assert_allclose(A_rec, a_np, atol=1e-12)
+
+    def test_identity_matrix(self):
+        n = 4
+        I_dp = dpnp.eye(n, dtype=dpnp.default_float_type())
+        P, L, U = dpnp.scipy.linalg.lu(I_dp)
+        I_np = numpy.eye(n)
+        assert_allclose(dpnp.asnumpy(P), I_np, atol=1e-15)
+        assert_allclose(dpnp.asnumpy(L), I_np, atol=1e-15)
+        assert_allclose(dpnp.asnumpy(U), I_np, atol=1e-15)
+
+    def test_1d_input_raises(self):
+        a_dp = dpnp.array([1.0, 2.0, 3.0])
+        with pytest.raises(ValueError):
+            dpnp.scipy.linalg.lu(a_dp)
+
+    @pytest.mark.parametrize("bad", [numpy.inf, -numpy.inf, numpy.nan])
+    def test_check_finite_raises(self, bad):
+        a_dp = dpnp.array([[1.0, 2.0], [3.0, bad]], order="F")
+        assert_raises(ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True)
+
+    @pytest.mark.parametrize("bad", [numpy.inf, -numpy.inf, numpy.nan])
+    def test_check_finite_raises_scalar(self, bad):
+        # Covers the 1x1 scalar fast path in dpnp_lu
+        a_dp = dpnp.array([[bad]])
+        assert_raises(ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True)
+
+    def test_check_finite_disabled(self):
+        a_dp = dpnp.array([[1.0, numpy.nan], [3.0, 4.0]])
+        result = dpnp.scipy.linalg.lu(a_dp, check_finite=False)
+        assert len(result) == 3
+
+
+class TestLuBatched:
+    @staticmethod
+    def _make_nonsingular_nd_np(shape, dtype, order):
+        # Batched variant: treat the input as a stack of (m, n) matrices
+        # and set B[i, i] = (sum of |other entries in row i|) + 1 in each.
+        A = generate_random_numpy_array(shape, dtype, order)
+        m, n = shape[-2], shape[-1]
+        A3 = A.reshape((-1, m, n))
+        for B in A3:
+            for i in range(min(m, n)):
+                off = numpy.sum(numpy.abs(B[i, :n])) - numpy.abs(B[i, i])
+                B[i, i] = A.dtype.type(off + 1.0)
+        # Rebuild from A3 (reshape may have copied) in the requested order.
+        return numpy.array(A3.reshape(shape), order=order)
+
+    @staticmethod
+    def _reconstruct_p_indices(p, L, U):
+        """Reconstruct A from (p, L, U) for batched p_indices mode."""
+        idx = dpnp.expand_dims(p, axis=-1)
+        idx = dpnp.broadcast_to(idx, L.shape).copy()
+        PL = dpnp.take_along_axis(L, idx, axis=-2)
+        return PL @ U
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2, 2), (3, 4, 4), (2, 3, 5, 2), (4, 1, 3)],
+        ids=["(2,2,2)", "(3,4,4)", "(2,3,5,2)", "(4,1,3)"],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_default_batched(self, shape, order, dtype):
+        a_np = self._make_nonsingular_nd_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        # Default mode on a stack of matrices: lu() returns (P, L, U).
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        # Leading (batch) dimensions of the input carry over to each factor.
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        assert P.shape == (*shape[:-2], m, m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+        # Reconstruct the whole stack and compare in L's dtype.
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2, 2), (3, 4, 4), (2, 3, 5, 2), (4, 1, 3)],
+        ids=["(2,2,2)", "(3,4,4)", "(2,3,5,2)", "(4,1,3)"],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_permute_l_batched(self, shape, order, dtype):
+        a_np = self._make_nonsingular_nd_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        # permute_l=True on a stack of matrices: lu() returns (PL, U).
+        PL, U = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        # Leading (batch) dimensions of the input carry over to each factor.
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        assert PL.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+        # Reconstruct the whole stack and compare in PL's dtype.
+        A_cast = a_dp.astype(PL.dtype, copy=False)
+        A_rec = PL @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2, 2), (3, 4, 4), (2, 3, 5, 2), (4, 1, 3)],
+        ids=["(2,2,2)", "(3,4,4)", "(2,3,5,2)", "(4,1,3)"],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_p_indices_batched(self, shape, order, dtype):
+        a_np = self._make_nonsingular_nd_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        # p_indices=True on a stack: one index vector of length M per matrix.
+        p, L, U = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        # Leading (batch) dimensions of the input carry over to each result.
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        assert p.shape == (*shape[:-2], m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+        assert dpnp.issubdtype(p.dtype, dpnp.integer)
+        # The helper gathers rows of L by p along axis -2, then applies U.
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A_rec = self._reconstruct_p_indices(p, L, U)
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    @pytest.mark.parametrize("order", ["C", "F"])
+    def test_overwrite_a(self, dtype, order):
+        a_np = self._make_nonsingular_nd_np((3, 2, 2), dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        a_dp_orig = a_dp.copy()
+        # With overwrite_a=False the input must still match its earlier copy.
+        dpnp.scipy.linalg.lu(a_dp, overwrite_a=False)
+        assert dpnp.allclose(a_dp, a_dp_orig)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_modes_consistency_batched(self, dtype):
+        a_np = self._make_nonsingular_nd_np((3, 4, 4), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        # Factor the same stack in all three output modes.
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        PL, U2 = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        p, L3, U3 = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        # Rebuild A from each result.
+        A1 = P @ L @ U
+        A2 = PL @ U2
+        A3 = self._reconstruct_p_indices(p, L3, U3)
+        # All three reconstructions must match the input (in L's dtype).
+        A_cast2 = a_dp.astype(L.dtype, copy=False)
+        assert dpnp.allclose(A1, A_cast2, rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A2, A_cast2, rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A3, A_cast2, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape", [(0, 2, 2), (2, 0, 2), (2, 2, 0), (0, 0, 0)]
+    )
+    def test_empty_inputs(self, shape):
+        a = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+
+        P, L, U = dpnp.scipy.linalg.lu(a)
+        m, n = shape[-2:]
+        k = min(m, n)
+        assert P.shape == (*shape[:-2], m, m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+    @pytest.mark.parametrize(
+        "shape", [(0, 2, 2), (2, 0, 2), (2, 2, 0), (0, 0, 0)]
+    )
+    def test_empty_permute_l(self, shape):
+        a = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+
+        PL, U = dpnp.scipy.linalg.lu(a, permute_l=True)
+        m, n = shape[-2:]
+        k = min(m, n)
+        assert PL.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+    @pytest.mark.parametrize(
+        "shape", [(0, 2, 2), (2, 0, 2), (2, 2, 0), (0, 0, 0)]
+    )
+    def test_empty_p_indices(self, shape):
+        a = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+
+        p, L, U = dpnp.scipy.linalg.lu(a, p_indices=True)
+        m, n = shape[-2:]
+        k = min(m, n)
+        assert p.shape == (*shape[:-2], m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+    def test_strided(self):
+        a_np = self._make_nonsingular_nd_np(
+            (5, 3, 3), dpnp.default_float_type(), "F"
+        )
+        a_dp = dpnp.array(a_np, order="F")
+        a_stride = a_dp[::2]
+        # a_stride keeps every other matrix of the batch (indices 0, 2, 4).
+        P, L, U = dpnp.scipy.linalg.lu(a_stride)
+        for i in range(a_stride.shape[0]):
+            A_rec = dpnp.asnumpy(P[i] @ L[i] @ U[i])
+            A_orig = dpnp.asnumpy(a_stride[i].astype(L.dtype, copy=False))
+            assert_allclose(A_rec, A_orig, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_singular_matrix(self):
+        a = dpnp.zeros((3, 2, 2), dtype=dpnp.default_float_type())
+        a[0] = dpnp.array([[1.0, 2.0], [2.0, 4.0]])
+        a[1] = dpnp.eye(2)
+        a[2] = dpnp.array([[1.0, 1.0], [1.0, 1.0]])
+        # Slices 0 and 2 are rank-1 (singular); slice 1 is the identity.
+        P, L, U = dpnp.scipy.linalg.lu(a)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, a, rtol=1e-6, atol=1e-6)
+
+    def test_check_finite_raises(self):
+        a = dpnp.ones((2, 3, 3), dtype=dpnp.default_float_type(), order="F")
+        a[1, 0, 0] = dpnp.nan
+        assert_raises(ValueError, dpnp.scipy.linalg.lu, a, check_finite=True)
+
+
 class TestMatrixPower:
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     @pytest.mark.parametrize(
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 699cd81c96f6..560f235bb56f 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1666,6 +1666,18 @@ def test_lu_factor(self, data, device):
             param_queue = param.sycl_queue
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
 
+    @pytest.mark.parametrize(
+        "data",
+        [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
+    )
+    def test_lu(self, data, device):
+        a = dpnp.array(data, device=device)
+        result = dpnp.scipy.linalg.lu(a)
+        # Each array returned by lu() must report the input's SYCL queue.
+        for param in result:
+            param_queue = param.sycl_queue
+            assert_sycl_queue_equal(param_queue, a.sycl_queue)
+
     @pytest.mark.parametrize(
         "a_data, b_data",
         [
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 4fc0f2b958fa..b73eb67d51ee 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -1527,6 +1527,18 @@ def test_lstsq(self, m, n, nrhs, usm_type, usm_type_other):
                 [usm_type, usm_type_other]
             )
 
+    @pytest.mark.parametrize(
+        "data",
+        [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
+    )
+    def test_lu(self, data, usm_type):
+        a = dpnp.array(data, usm_type=usm_type)
+        result = dpnp.scipy.linalg.lu(a)
+        # The input and each array returned by lu() must have this USM type.
+        assert a.usm_type == usm_type
+        for param in result:
+            assert param.usm_type == a.usm_type
+
     @pytest.mark.parametrize(
         "data",
         [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
diff --git a/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py b/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py
index fb51c3e39244..440521419652 100644
--- a/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py
+++ b/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py
@@ -124,7 +124,6 @@ def test_lu_factor_reconstruction_singular(self, dtype):
 )
 @testing.fix_random()
 @testing.with_requires("scipy")
-@pytest.mark.skip("lu() is not supported yet")
 class TestLU(unittest.TestCase):
 
     @testing.for_dtypes("fdFD")
@@ -132,7 +131,7 @@ def test_lu(self, dtype):
         a_cpu = testing.shaped_random(self.shape, numpy, dtype=dtype)
         a_gpu = cupy.asarray(a_cpu)
         result_cpu = scipy.linalg.lu(a_cpu, permute_l=self.permute_l)
-        result_gpu = cupy.linalg.lu(a_gpu, permute_l=self.permute_l)
+        result_gpu = cupy.scipy.linalg.lu(a_gpu, permute_l=self.permute_l)
         assert len(result_cpu) == len(result_gpu)
         if not self.permute_l:
             # check permutation matrix
@@ -140,22 +139,22 @@ def test_lu(self, dtype):
             result_gpu = list(result_gpu)
             P_cpu = result_cpu.pop(0)
             P_gpu = result_gpu.pop(0)
-            cupy.testing.assert_array_equal(P_gpu, P_cpu)
-        cupy.testing.assert_allclose(result_gpu[0], result_cpu[0], atol=1e-5)
-        cupy.testing.assert_allclose(result_gpu[1], result_cpu[1], atol=1e-5)
+            testing.assert_array_equal(P_gpu, P_cpu)
+        testing.assert_allclose(result_gpu[0], result_cpu[0], atol=1e-5)
+        testing.assert_allclose(result_gpu[1], result_cpu[1], atol=1e-5)
 
     @testing.for_dtypes("fdFD")
     def test_lu_reconstruction(self, dtype):
         m, n = self.shape
         A = testing.shaped_random(self.shape, cupy, dtype=dtype)
         if self.permute_l:
-            PL, U = cupy.linalg.lu(A, permute_l=self.permute_l)
+            PL, U = cupy.scipy.linalg.lu(A, permute_l=self.permute_l)
             PLU = PL @ U
         else:
-            P, L, U = cupy.linalg.lu(A, permute_l=self.permute_l)
+            P, L, U = cupy.scipy.linalg.lu(A, permute_l=self.permute_l)
             PLU = P @ L @ U
         # check that reconstruction is close to original
-        cupy.testing.assert_allclose(PLU, A, atol=1e-5)
+        testing.assert_allclose(PLU, A, atol=1e-5)
 
 
 @testing.parameterize(

From a2825a1f6297da6c0cce05b3fa4bd867fa5ac133 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Thu, 5 Mar 2026 13:08:22 +0100
Subject: [PATCH 08/35] Import exceptions directly from `dpnp.exceptions`
 module (#2798)

This PR updates the code to consistently import exceptions from the
`dpnp.exceptions` module, making it the single source wherever applicable.
---
 dpnp/dpnp_algo/dpnp_fill.py                                | 3 ++-
 dpnp/dpnp_array.py                                         | 2 +-
 dpnp/dpnp_iface_indexing.py                                | 5 +++--
 dpnp/dpnp_iface_logic.py                                   | 3 ++-
 dpnp/dpnp_iface_manipulation.py                            | 2 +-
 dpnp/dpnp_iface_mathematical.py                            | 3 ++-
 dpnp/dpnp_utils/dpnp_utils_einsum.py                       | 2 +-
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py                | 3 +--
 dpnp/dpnp_utils/dpnp_utils_statistics.py                   | 2 +-
 dpnp/fft/dpnp_utils_fft.py                                 | 2 +-
 dpnp/tests/test_array_api_info.py                          | 3 ++-
 dpnp/tests/test_arraymanipulation.py                       | 2 +-
 dpnp/tests/test_counting.py                                | 2 +-
 dpnp/tests/test_fft.py                                     | 2 +-
 dpnp/tests/test_fill.py                                    | 2 +-
 dpnp/tests/test_flipping.py                                | 2 +-
 dpnp/tests/test_indexing.py                                | 3 +--
 dpnp/tests/test_linalg.py                                  | 3 +--
 dpnp/tests/test_logic.py                                   | 2 +-
 dpnp/tests/test_manipulation.py                            | 2 +-
 dpnp/tests/test_mathematical.py                            | 7 ++-----
 dpnp/tests/test_nanfunctions.py                            | 2 +-
 dpnp/tests/test_product.py                                 | 3 +--
 dpnp/tests/test_sort.py                                    | 2 +-
 dpnp/tests/test_sycl_queue.py                              | 2 +-
 dpnp/tests/third_party/cupy/core_tests/test_ndarray.py     | 4 +---
 dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py   | 2 +-
 .../tests/third_party/cupy/manipulation_tests/test_dims.py | 2 +-
 .../tests/third_party/cupy/manipulation_tests/test_join.py | 5 +++--
 .../third_party/cupy/manipulation_tests/test_transpose.py  | 2 +-
 dpnp/tests/third_party/cupy/math_tests/test_sumprod.py     | 2 +-
 dpnp/tests/third_party/cupy/sorting_tests/test_sort.py     | 2 +-
 .../third_party/cupy/statistics_tests/test_meanvar.py      | 2 +-
 dpnp/tests/third_party/cupy/testing/_loops.py              | 2 +-
 34 files changed, 43 insertions(+), 46 deletions(-)

diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py
index 112ea3af0fdb..c3bfa8fa2e80 100644
--- a/dpnp/dpnp_algo/dpnp_fill.py
+++ b/dpnp/dpnp_algo/dpnp_fill.py
@@ -38,6 +38,7 @@
 )
 
 import dpnp
+from dpnp.exceptions import ExecutionPlacementError
 
 
 def dpnp_fill(arr, val):
@@ -50,7 +51,7 @@ def dpnp_fill(arr, val):
         if val.shape != ():
             raise ValueError("`val` must be a scalar or 0D-array")
         if dpu.get_execution_queue((exec_q, val.sycl_queue)) is None:
-            raise dpu.ExecutionPlacementError(
+            raise ExecutionPlacementError(
                 "Input arrays have incompatible queues."
             )
         a_val = dpt.astype(val, arr.dtype)
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index bb864d4444a9..dad67fc1b584 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -39,11 +39,11 @@
 
 import dpctl.tensor as dpt
 import dpctl.tensor._type_utils as dtu
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp
 
 from . import memory as dpm
+from .exceptions import AxisError
 
 
 def _get_unwrapped_index_key(key):
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index 7718412701e8..db70f1fd2384 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -63,6 +63,7 @@
 )
 from .dpnp_array import dpnp_array
 from .dpnp_utils import call_origin, get_usm_allocations
+from .exceptions import ExecutionPlacementError
 
 
 def _ravel_multi_index_checks(multi_index, dims, order):
@@ -129,7 +130,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0):
             )
 
         if dpu.get_execution_queue((q, out.sycl_queue)) is None:
-            raise dpu.ExecutionPlacementError(
+            raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
@@ -291,7 +292,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0):
             )
 
         if dpu.get_execution_queue((q, out.sycl_queue)) is None:
-            raise dpu.ExecutionPlacementError(
+            raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
 
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index 1834f25a0485..6464bd49af1b 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -55,6 +55,7 @@
 
 from .dpnp_array import dpnp_array
 from .dpnp_utils import get_usm_allocations
+from .exceptions import ExecutionPlacementError
 
 
 def _isclose_scalar_tol(a, b, rtol, atol, equal_nan):
@@ -1267,7 +1268,7 @@ def isin(
             )
             is None
         ):
-            raise dpu.ExecutionPlacementError(
+            raise ExecutionPlacementError(
                 "Input arrays have incompatible allocation queues"
             )
         usm_element = dpnp.get_usm_ndarray(element)
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index dd872485a602..ff7ac85666a1 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -48,7 +48,6 @@
 import dpctl.tensor as dpt
 import numpy
 from dpctl.tensor._numpy_helper import (
-    AxisError,
     normalize_axis_index,
     normalize_axis_tuple,
 )
@@ -60,6 +59,7 @@
 # pylint: disable=no-name-in-module
 from .dpnp_utils import get_usm_allocations
 from .dpnp_utils.dpnp_utils_pad import dpnp_pad
+from .exceptions import AxisError
 
 
 class InsertDeleteParams(NamedTuple):
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index e339c24d384c..366a3363404a 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -84,6 +84,7 @@
 from .dpnp_utils import get_usm_allocations
 from .dpnp_utils.dpnp_utils_linearalgebra import dpnp_cross
 from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call
+from .exceptions import ExecutionPlacementError
 
 
 def _get_max_min(dtype):
@@ -273,7 +274,7 @@ def _process_ediff1d_args(arg, arg_name, ary_dtype, ary_sycl_queue, usm_type):
         usm_type = dpu.get_coerced_usm_type([usm_type, arg.usm_type])
         # check that arrays have the same allocation queue
         if dpu.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None:
-            raise dpu.ExecutionPlacementError(
+            raise ExecutionPlacementError(
                 f"ary and {arg_name} must be allocated on the same SYCL queue"
             )
 
diff --git a/dpnp/dpnp_utils/dpnp_utils_einsum.py b/dpnp/dpnp_utils/dpnp_utils_einsum.py
index 284268e2868b..4a1a58635989 100644
--- a/dpnp/dpnp_utils/dpnp_utils_einsum.py
+++ b/dpnp/dpnp_utils/dpnp_utils_einsum.py
@@ -33,11 +33,11 @@
 
 import dpctl
 import numpy
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
+from dpnp.exceptions import ExecutionPlacementError
 
 _einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 191b8aa65d13..d2a1cdfbac46 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -32,16 +32,15 @@
 import dpctl.utils as dpu
 import numpy
 from dpctl.tensor._numpy_helper import (
-    AxisError,
     normalize_axis_index,
     normalize_axis_tuple,
 )
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
+from dpnp.exceptions import AxisError, ExecutionPlacementError
 
 __all__ = [
     "dpnp_cross",
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index 3a3bc04a31af..c8414b661851 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -31,10 +31,10 @@
 import dpctl
 import dpctl.tensor as dpt
 from dpctl.tensor._numpy_helper import normalize_axis_tuple
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 from dpnp.dpnp_array import dpnp_array
+from dpnp.exceptions import ExecutionPlacementError
 
 __all__ = ["dpnp_cov", "dpnp_median"]
 
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 709494e6255e..28032b9d3be2 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -49,10 +49,10 @@
     normalize_axis_index,
     normalize_axis_tuple,
 )
-from dpctl.utils import ExecutionPlacementError
 
 import dpnp
 import dpnp.backend.extensions.fft._fft_impl as fi
+from dpnp.exceptions import ExecutionPlacementError
 
 from ..dpnp_array import dpnp_array
 from ..dpnp_utils import map_dtype_to_device
diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py
index b310192ffc59..0e2fe7dc5a04 100644
--- a/dpnp/tests/test_array_api_info.py
+++ b/dpnp/tests/test_array_api_info.py
@@ -1,9 +1,10 @@
 import numpy
 import pytest
-from dpctl import SyclDeviceCreationError, get_devices, select_default_device
+from dpctl import get_devices, select_default_device
 from dpctl.tensor._tensor_impl import default_device_complex_type
 
 import dpnp
+from dpnp.exceptions import SyclDeviceCreationError
 from dpnp.tests.helper import (
     has_support_aspect64,
     is_win_platform,
diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py
index ba83ee94d8b0..fe74368a8c81 100644
--- a/dpnp/tests/test_arraymanipulation.py
+++ b/dpnp/tests/test_arraymanipulation.py
@@ -3,10 +3,10 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
 import dpnp
+from dpnp.exceptions import AxisError
 
 from .helper import get_all_dtypes, get_float_complex_dtypes
 from .third_party.cupy import testing
diff --git a/dpnp/tests/test_counting.py b/dpnp/tests/test_counting.py
index 762abd58b687..821471068fd1 100644
--- a/dpnp/tests/test_counting.py
+++ b/dpnp/tests/test_counting.py
@@ -1,6 +1,5 @@
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import (
     assert_allclose,
     assert_equal,
@@ -8,6 +7,7 @@
 )
 
 import dpnp
+from dpnp.exceptions import AxisError
 
 from .helper import (
     get_all_dtypes,
diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index 226420057748..b10bf1b46016 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -2,11 +2,11 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_raises
 
 import dpnp
 from dpnp.dpnp_utils import map_dtype_to_device
+from dpnp.exceptions import ExecutionPlacementError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_fill.py b/dpnp/tests/test_fill.py
index 3102de395d93..db53ab976cba 100644
--- a/dpnp/tests/test_fill.py
+++ b/dpnp/tests/test_fill.py
@@ -1,9 +1,9 @@
 import dpctl
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal
 
 import dpnp
+from dpnp.exceptions import ExecutionPlacementError
 
 
 @pytest.mark.parametrize(
diff --git a/dpnp/tests/test_flipping.py b/dpnp/tests/test_flipping.py
index cc84242f4557..f48db162f002 100644
--- a/dpnp/tests/test_flipping.py
+++ b/dpnp/tests/test_flipping.py
@@ -2,12 +2,12 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import (
     assert_equal,
 )
 
 import dpnp
+from dpnp.exceptions import AxisError
 
 from .helper import (
     get_all_dtypes,
diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py
index 9a55efe138b7..b6cae0733d40 100644
--- a/dpnp/tests/test_indexing.py
+++ b/dpnp/tests/test_indexing.py
@@ -4,9 +4,7 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from dpctl.tensor._type_utils import _to_device_supported_dtype
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_,
     assert_array_equal,
@@ -17,6 +15,7 @@
 
 import dpnp
 from dpnp.dpnp_array import dpnp_array
+from dpnp.exceptions import AxisError, ExecutionPlacementError
 
 from .helper import (
     get_abs_array,
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 7d8018fa83a2..170a2a7b5a13 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -4,8 +4,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_array_equal,
@@ -15,6 +13,7 @@
 )
 
 import dpnp
+from dpnp.exceptions import AxisError, ExecutionPlacementError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_logic.py b/dpnp/tests/test_logic.py
index cae51e6777ef..e68ba8162442 100644
--- a/dpnp/tests/test_logic.py
+++ b/dpnp/tests/test_logic.py
@@ -1,7 +1,6 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_array_equal,
@@ -10,6 +9,7 @@
 )
 
 import dpnp
+from dpnp.exceptions import ExecutionPlacementError
 
 from .helper import (
     generate_random_numpy_array,
diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py
index 8ddba08dbb92..c35050afaa86 100644
--- a/dpnp/tests/test_manipulation.py
+++ b/dpnp/tests/test_manipulation.py
@@ -3,7 +3,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import (
     assert_array_equal,
     assert_equal,
@@ -11,6 +10,7 @@
 )
 
 import dpnp
+from dpnp.exceptions import AxisError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index e1f32bbd7931..4bac0e0cc314 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -2,11 +2,7 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import (
-    AxisError,
-    normalize_axis_index,
-)
-from dpctl.utils import ExecutionPlacementError
+from dpctl.tensor._numpy_helper import normalize_axis_index
 from numpy.testing import (
     assert_allclose,
     assert_array_equal,
@@ -18,6 +14,7 @@
 import dpnp
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import map_dtype_to_device
+from dpnp.exceptions import AxisError, ExecutionPlacementError
 
 from .helper import (
     LTS_VERSION,
diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py
index d92cee045a72..48520015d354 100644
--- a/dpnp/tests/test_nanfunctions.py
+++ b/dpnp/tests/test_nanfunctions.py
@@ -2,7 +2,6 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_almost_equal,
@@ -13,6 +12,7 @@
 )
 
 import dpnp
+from dpnp.exceptions import ExecutionPlacementError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index afe767a5e5d9..3ac324b055e8 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -1,12 +1,11 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_allclose, assert_array_equal, assert_raises
 
 import dpnp
 from dpnp.dpnp_utils import map_dtype_to_device
+from dpnp.exceptions import AxisError, ExecutionPlacementError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_sort.py b/dpnp/tests/test_sort.py
index 5e883c575f85..27a2afe79b6a 100644
--- a/dpnp/tests/test_sort.py
+++ b/dpnp/tests/test_sort.py
@@ -1,9 +1,9 @@
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 from numpy.testing import assert_array_equal, assert_equal, assert_raises
 
 import dpnp
+from dpnp.exceptions import AxisError
 
 from .helper import (
     assert_dtype_allclose,
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 560f235bb56f..b0f746720af8 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -5,13 +5,13 @@
 import dpctl.tensor as dpt
 import numpy
 import pytest
-from dpctl.utils import ExecutionPlacementError
 from numpy.testing import assert_array_equal, assert_raises
 
 import dpnp
 import dpnp.linalg
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
+from dpnp.exceptions import ExecutionPlacementError
 
 from .helper import (
     generate_random_numpy_array,
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
index 95d753c90473..ac6073a3098e 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
@@ -6,12 +6,12 @@
 import dpctl
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 # from cupy_backends.cuda.api import driver
 # from cupy_backends.cuda.api import runtime
 # from cupy_backends.cuda import stream as stream_module
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 
 # from cupy import _util
 # from cupy import _core
@@ -19,8 +19,6 @@
 # from cupy import get_array_module
 from dpnp.tests.third_party.cupy import testing
 
-# from cupy.exceptions import AxisError
-
 
 def get_array_module(*args):
     for arg in args:
diff --git a/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py b/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
index c241824fa81d..d6a163906ce9 100644
--- a/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
+++ b/dpnp/tests/third_party/cupy/lib_tests/test_shape_base.py
@@ -2,9 +2,9 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
index 7355d07e1d9b..ae0f6ce18b47 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_dims.py
@@ -2,9 +2,9 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py
index 0695de034e0c..838bb3646c1e 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_join.py
@@ -2,11 +2,12 @@
 import pytest
 
 if numpy.lib.NumpyVersion(numpy.__version__) >= "2.0.0b1":
-    from numpy.exceptions import AxisError, ComplexWarning
+    from numpy.exceptions import ComplexWarning
 else:
-    from numpy import AxisError, ComplexWarning
+    from numpy import ComplexWarning
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py b/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
index 7e7a62dce52a..0a3555fe7798 100644
--- a/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
+++ b/dpnp/tests/third_party/cupy/manipulation_tests/test_transpose.py
@@ -2,9 +2,9 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.third_party.cupy import testing
 
 
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
index b8f98456a13a..b1c1e569ae2f 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_sumprod.py
@@ -2,9 +2,9 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.helper import (
     has_support_aspect16,
     has_support_aspect64,
diff --git a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
index 7e0eade13254..ba64ef949cb0 100644
--- a/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
+++ b/dpnp/tests/third_party/cupy/sorting_tests/test_sort.py
@@ -4,9 +4,9 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
index bf5d37df2fba..2eda8849e819 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
@@ -2,9 +2,9 @@
 
 import numpy
 import pytest
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests.helper import has_support_aspect16, has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
 
diff --git a/dpnp/tests/third_party/cupy/testing/_loops.py b/dpnp/tests/third_party/cupy/testing/_loops.py
index 63cd09147c4b..026c451e71e3 100644
--- a/dpnp/tests/third_party/cupy/testing/_loops.py
+++ b/dpnp/tests/third_party/cupy/testing/_loops.py
@@ -10,9 +10,9 @@
 import numpy
 import pytest
 from dpctl import select_default_device
-from dpctl.tensor._numpy_helper import AxisError
 
 import dpnp as cupy
+from dpnp.exceptions import AxisError
 from dpnp.tests import config
 from dpnp.tests.third_party.cupy.testing import _array, _parameterized
 from dpnp.tests.third_party.cupy.testing._pytest_impl import is_available

From 4b644661997e8df576027c02cd2f62cfb1e6f4d1 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Fri, 6 Mar 2026 15:54:40 +0100
Subject: [PATCH 09/35] Update docstrings for `dpnp.scipy.linalg` LU functions
 (#2802)

This PR implements a few improvements for the LU functions in the
`dpnp.scipy.linalg` namespace:
* resolved an issue with the `Returns` section formatting in the `lu` function
* changed to the proper `:func:` role for function cross-references in the
`See Also` sections
* used the `Warnings` section name instead of `Warning`
* changed `{None, bool}` to `bool` for the `overwrite_a`, `check_finite`, and
`overwrite_b` parameters
---
 dpnp/dpnp_iface_histograms.py   | 20 ++++++------
 dpnp/dpnp_iface_manipulation.py |  8 ++---
 dpnp/dpnp_iface_nanfunctions.py | 16 +++++-----
 dpnp/scipy/linalg/_decomp_lu.py | 56 +++++++++++++++------------------
 4 files changed, 48 insertions(+), 52 deletions(-)

diff --git a/dpnp/dpnp_iface_histograms.py b/dpnp/dpnp_iface_histograms.py
index 8f3363e79fe0..0a2f18fe3644 100644
--- a/dpnp/dpnp_iface_histograms.py
+++ b/dpnp/dpnp_iface_histograms.py
@@ -306,8 +306,8 @@ def bincount(x, weights=None, minlength=0):
 
     For full documentation refer to :obj:`numpy.bincount`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function synchronizes in order to calculate binning edges.
     This may harm performance in some applications.
 
@@ -504,8 +504,8 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
 
     For full documentation refer to :obj:`numpy.histogram`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function may synchronize in order to check a monotonically increasing
     array of bin edges. This may harm performance in some applications.
 
@@ -675,8 +675,8 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
 
     For full documentation refer to :obj:`numpy.histogram_bin_edges`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function may synchronize in order to check a monotonically increasing
     array of bin edges. This may harm performance in some applications.
 
@@ -767,8 +767,8 @@ def histogram2d(x, y, bins=10, range=None, density=None, weights=None):
 
     For full documentation refer to :obj:`numpy.histogram2d`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function may synchronize in order to check a monotonically increasing
     array of bin edges. This may harm performance in some applications.
 
@@ -1100,8 +1100,8 @@ def histogramdd(sample, bins=10, range=None, density=None, weights=None):
 
     For full documentation refer to :obj:`numpy.histogramdd`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function may synchronize in order to check a monotonically increasing
     array of bin edges. This may harm performance in some applications.
 
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index ff7ac85666a1..0594a406ac5a 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -829,8 +829,8 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None):
     out : dpnp.ndarray
         The input `a` as a float ndarray.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function is deprecated in favor of :obj:`dpnp.asarray` and
     will be removed in a future release.
 
@@ -3099,8 +3099,8 @@ def resize(a, new_shape):
     be used. In most other cases either indexing (to reduce the size) or
     padding (to increase the size) may be a more appropriate solution.
 
-    Warning
-    -------
+    Warnings
+    --------
     This functionality does **not** consider axes separately, i.e. it does not
     apply interpolation/extrapolation.
     It fills the return array with the required number of elements, iterating
diff --git a/dpnp/dpnp_iface_nanfunctions.py b/dpnp/dpnp_iface_nanfunctions.py
index b8abad2a2088..a5fb750cf586 100644
--- a/dpnp/dpnp_iface_nanfunctions.py
+++ b/dpnp/dpnp_iface_nanfunctions.py
@@ -122,15 +122,15 @@ def nanargmax(a, axis=None, out=None, *, keepdims=False):
 
     For full documentation refer to :obj:`numpy.nanargmax`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function synchronizes in order to test for all-NaN slices in the array.
     This may harm performance in some applications. To avoid synchronization,
     the user is recommended to filter NaNs themselves and use `dpnp.argmax`
     on the filtered array.
 
-    Warning
-    -------
+    Warnings
+    --------
     The results cannot be trusted if a slice contains only NaNs
     and -Infs.
 
@@ -206,15 +206,15 @@ def nanargmin(a, axis=None, out=None, *, keepdims=False):
 
     For full documentation refer to :obj:`numpy.nanargmin`.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function synchronizes in order to test for all-NaN slices in the array.
     This may harm performance in some applications. To avoid synchronization,
     the user is recommended to filter NaNs themselves and use `dpnp.argmax`
     on the filtered array.
 
-    Warning
-    -------
+    Warnings
+    --------
     The results cannot be trusted if a slice contains only NaNs
     and -Infs.
 
diff --git a/dpnp/scipy/linalg/_decomp_lu.py b/dpnp/scipy/linalg/_decomp_lu.py
index 823b2fccc230..f96d56b0e423 100644
--- a/dpnp/scipy/linalg/_decomp_lu.py
+++ b/dpnp/scipy/linalg/_decomp_lu.py
@@ -77,11 +77,11 @@ def lu(
         Perform the multiplication ``P @ L`` (Default: do not permute).
 
         Default: ``False``.
-    overwrite_a : {None, bool}, optional
+    overwrite_a : bool, optional
         Whether to overwrite data in `a` (may increase performance).
 
         Default: ``False``.
-    check_finite : {None, bool}, optional
+    check_finite : bool, optional
         Whether to check that the input matrix contains only finite numbers.
         Disabling may give a performance gain, but may result in problems
         (crashes, non-termination) if the inputs do contain infinities or NaNs.
@@ -95,23 +95,19 @@ def lu(
 
     Returns
     -------
-    **(If ``permute_l`` is ``False``)**
+    The tuple ``(p, l, u)`` is returned if ``permute_l`` is ``False``
+    (default), else the tuple ``(pl, u)`` is returned, where:
 
     p : (..., M, M) dpnp.ndarray or (..., M) dpnp.ndarray
-        If `p_indices` is ``False`` (default), the permutation matrix.
-        The permutation matrix always has a real dtype (``float32`` or
-        ``float64``) even when `a` is complex, since it only contains
-        0s and 1s.
+        Permutation matrix or permutation indices.
+        If `p_indices` is ``False`` (default), a permutation matrix.
+        The permutation matrix always has a real-valued floating-point dtype
+        even when `a` is complex, since it only contains 0s and 1s.
         If `p_indices` is ``True``, a 1-D (or batched) array of row
         permutation indices such that ``A = L[p] @ U``.
     l : (..., M, K) dpnp.ndarray
         Lower triangular or trapezoidal matrix with unit diagonal.
         ``K = min(M, N)``.
-    u : (..., K, N) dpnp.ndarray
-        Upper triangular or trapezoidal matrix.
-
-    **(If ``permute_l`` is ``True``)**
-
     pl : (..., M, K) dpnp.ndarray
         Permuted ``L`` matrix: ``pl = P @ L``.
         ``K = min(M, N)``.
@@ -130,18 +126,18 @@ def lu(
     permutation matrix is still needed then it can be constructed by
     ``dpnp.eye(M)[P, :]``.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function synchronizes in order to validate array elements
     when ``check_finite=True``, and also synchronizes to compute the
     permutation from LAPACK pivot indices.
 
     See Also
     --------
-    :obj:`dpnp.scipy.linalg.lu_factor` : LU factorize a matrix
-                                         (compact representation).
-    :obj:`dpnp.scipy.linalg.lu_solve` : Solve an equation system using
-                                        the LU factorization of a matrix.
+    :func:`dpnp.scipy.linalg.lu_factor` : LU factorize a matrix
+                                          (compact representation).
+    :func:`dpnp.scipy.linalg.lu_solve` : Solve an equation system using
+                                         the LU factorization of a matrix.
 
     Examples
     --------
@@ -211,11 +207,11 @@ def lu_factor(a, overwrite_a=False, check_finite=True):
     ----------
     a : (..., M, N) {dpnp.ndarray, usm_ndarray}
         Input array to decompose.
-    overwrite_a : {None, bool}, optional
+    overwrite_a : bool, optional
         Whether to overwrite data in `a` (may increase performance).
 
         Default: ``False``.
-    check_finite : {None, bool}, optional
+    check_finite : bool, optional
         Whether to check that the input matrix contains only finite numbers.
         Disabling may give a performance gain, but may result in problems
         (crashes, non-termination) if the inputs do contain infinities or NaNs.
@@ -233,15 +229,15 @@ def lu_factor(a, overwrite_a=False, check_finite=True):
         row i of matrix was interchanged with row piv[i].
         Where ``K = min(M, N)``.
 
-    Warning
-    -------
+    Warnings
+    --------
     This function synchronizes in order to validate array elements
     when ``check_finite=True``.
 
     See Also
     --------
-    :obj:`dpnp.scipy.linalg.lu_solve` : Solve an equation system using
-                                        the LU factorization of `a` matrix.
+    :func:`dpnp.scipy.linalg.lu_solve` : Solve an equation system using
+                                         the LU factorization of `a` matrix.
 
     Examples
     --------
@@ -273,7 +269,7 @@ def lu_solve(lu_and_piv, b, trans=0, overwrite_b=False, check_finite=True):
     lu, piv : {tuple of dpnp.ndarrays or usm_ndarrays}
         LU factorization of matrix `a` (..., M, M) together with pivot indices.
     b : {(M,), (..., M, K)} {dpnp.ndarray, usm_ndarray}
-        Right-hand side
+        Right-hand side.
     trans : {0, 1, 2} , optional
         Type of system to solve:
 
@@ -286,11 +282,11 @@ def lu_solve(lu_and_piv, b, trans=0, overwrite_b=False, check_finite=True):
         =====  =================
 
         Default: ``0``.
-    overwrite_b : {None, bool}, optional
+    overwrite_b : bool, optional
         Whether to overwrite data in `b` (may increase performance).
 
         Default: ``False``.
-    check_finite : {None, bool}, optional
+    check_finite : bool, optional
         Whether to check that the input matrix contains only finite numbers.
         Disabling may give a performance gain, but may result in problems
         (crashes, non-termination) if the inputs do contain infinities or NaNs.
@@ -302,14 +298,14 @@ def lu_solve(lu_and_piv, b, trans=0, overwrite_b=False, check_finite=True):
     x : {(M,), (..., M, K)} dpnp.ndarray
         Solution to the system
 
-    Warning
-    -------
+    Warnings
+    --------
     This function synchronizes in order to validate array elements
     when ``check_finite=True``.
 
     See Also
     --------
-    :obj:`dpnp.scipy.linalg.lu_factor` : LU factorize a matrix.
+    :func:`dpnp.scipy.linalg.lu_factor` : LU factorize a matrix.
 
     Examples
     --------

From 7fae3a6ea29c4fe231d01cf78ed09b3a8c956004 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 13 Mar 2026 17:31:49 +0100
Subject: [PATCH 10/35] Weekly pre-commit autoupdate (#2808)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 66245039ce3c..d08ddc36c1ab 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -57,19 +57,19 @@ repos:
     hooks:
     -   id: pyupgrade
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.4.1
+    rev: v2.4.2
     hooks:
     -   id: codespell
         args: ["-L", "abd"]  # ignore "abd" used in einsum tests
         additional_dependencies:
             - tomli
 -   repo: https://github.com/psf/black
-    rev: 26.1.0
+    rev: 26.3.0
     hooks:
     -   id: black
         exclude: "dpnp/_version.py"
 -   repo: https://github.com/pycqa/isort
-    rev: 8.0.0
+    rev: 8.0.1
     hooks:
     -   id: isort
         name: isort (python)

From c5f212a2a88232c865481ee688ef4929f107d922 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 18 Mar 2026 12:00:27 +0100
Subject: [PATCH 11/35] Mute expected runtime warning raised in the test
 (#2822)

This PR follows up on #2792 and marks one more `RuntimeWarning` as
expected due to a possible overflow.
---
 dpnp/tests/test_special.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dpnp/tests/test_special.py b/dpnp/tests/test_special.py
index 1ebb64d8da7f..075bef5aeca3 100644
--- a/dpnp/tests/test_special.py
+++ b/dpnp/tests/test_special.py
@@ -106,6 +106,9 @@ def test_erfc(self, inverse):
             atol=self.tol,
         )
 
+    @pytest.mark.usefixtures(
+        "suppress_overflow_encountered_in_cast_numpy_warnings"
+    )
     def test_erfcx(self, inverse):
         self._check_variant_func(
             inverse,

From 7d9765b772599f123ca247f12b9d3bcd386ea33c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 18 Mar 2026 13:11:11 +0100
Subject: [PATCH 12/35] Bump styfle/cancel-workflow-action from 0.13.0 to
 0.13.1 (#2820)

Bumps
[styfle/cancel-workflow-action](https://github.com/styfle/cancel-workflow-action)
from 0.13.0 to 0.13.1.
---
 .github/workflows/build-sphinx.yml       | 2 +-
 .github/workflows/check-onemath.yaml     | 2 +-
 .github/workflows/conda-package.yml      | 2 +-
 .github/workflows/cron-run-tests.yaml    | 2 +-
 .github/workflows/generate_coverage.yaml | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 0745ca1ca9dc..60530556efcc 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -47,7 +47,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0
+        uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1
         with:
           access_token: ${{ github.token }}
 
diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index 409117c692b9..9296ba2cc903 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -34,7 +34,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0
+        uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1
         with:
           access_token: ${{ github.token }}
 
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index a12486300aa0..17ee76b6567c 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -49,7 +49,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0
+        uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1
         with:
           access_token: ${{ github.token }}
 
diff --git a/.github/workflows/cron-run-tests.yaml b/.github/workflows/cron-run-tests.yaml
index f8e8394c6713..ea4fd4f14fc3 100644
--- a/.github/workflows/cron-run-tests.yaml
+++ b/.github/workflows/cron-run-tests.yaml
@@ -43,7 +43,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0
+        uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1
         with:
           access_token: ${{ github.token }}
 
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index 2cbe97ab0242..bfc3c7357a3e 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -33,7 +33,7 @@ jobs:
 
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0
+        uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1
         with:
           access_token: ${{ github.token }}
 

From 19efa58b4797e28aacaa5715be1b5ef93bb02e2e Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 18 Mar 2026 14:32:04 +0100
Subject: [PATCH 13/35] Weekly pre-commit autoupdate (#2818)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d08ddc36c1ab..92b81fe95852 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -64,7 +64,7 @@ repos:
         additional_dependencies:
             - tomli
 -   repo: https://github.com/psf/black
-    rev: 26.3.0
+    rev: 26.3.1
     hooks:
     -   id: black
         exclude: "dpnp/_version.py"
@@ -94,7 +94,7 @@ repos:
     -   id: clang-format
         args: ["-i"]
 -   repo: https://github.com/gitleaks/gitleaks
-    rev: v8.30.0
+    rev: v8.30.1
     hooks:
     -   id: gitleaks
 -   repo: https://github.com/jumanjihouse/pre-commit-hooks
@@ -127,7 +127,7 @@ repos:
     hooks:
     -   id: actionlint
 -   repo: https://github.com/BlankSpruce/gersemi
-    rev: 0.26.0
+    rev: 0.26.1
     hooks:
     -   id: gersemi
         exclude: "dpnp/backend/cmake/Modules/"

From 3a6af45d0130b0b8c7eb7dbd41a3d7e3fd18a4d5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 18 Mar 2026 16:25:24 +0100
Subject: [PATCH 14/35] Bump actions/download-artifact from 8.0.0 to 8.0.1
 (#2819)

Bumps
[actions/download-artifact](https://github.com/actions/download-artifact)
from 8.0.0 to 8.0.1.
---
 .github/workflows/check-onemath.yaml |  4 ++--
 .github/workflows/conda-package.yml  | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index 9296ba2cc903..acbfcac96890 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -87,7 +87,7 @@ jobs:
           fetch-depth: 0
 
       - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.environment-file-name }}
           path: ${{ env.environment-file-loc }}
@@ -181,7 +181,7 @@ jobs:
           fetch-depth: 0
 
       - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.environment-file-name }}
           path: ${{ env.environment-file-loc }}
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 17ee76b6567c..f10a372edf23 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -151,7 +151,7 @@ jobs:
           path: ${{ env.dpnp-repo-path }}
 
       - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.pkg-path-in-channel }}
@@ -280,7 +280,7 @@ jobs:
           path: ${{ env.dpnp-repo-path }}
 
       - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
           path: ${{ env.pkg-path-in-channel }}
@@ -439,12 +439,12 @@ jobs:
           fetch-depth: ${{ env.fetch-depth }}
 
       - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }}
 
       - name: Download wheels artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
 
@@ -528,7 +528,7 @@ jobs:
           path: ${{ env.dpnp-repo-path }}
 
       - name: Download artifact
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: ${{ env.package-name }} ${{ runner.os }} Python ${{ env.python-ver }}
           path: ${{ env.pkg-path-in-channel }}

From 506a7756bc69839d3e7d0d9b9c914cb4d9684121 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 18 Mar 2026 17:30:54 +0100
Subject: [PATCH 15/35] Bump github/codeql-action from 4.32.4 to 4.32.6 (#2809)

Bumps [github/codeql-action](https://github.com/github/codeql-action)
from 4.32.4 to 4.32.6.
---
 .github/workflows/openssf-scorecard.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index 8b4cc3b93f64..5d7e0677281e 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -72,6 +72,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4
+        uses: github/codeql-action/upload-sarif@0d579ffd059c29b07949a3cce3983f0780820c98 # v4.32.6
         with:
           sarif_file: results.sarif

From cba3d51374714064122210bf6aba9631dfd06e53 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 18 Mar 2026 19:07:48 +0100
Subject: [PATCH 16/35] Bump mshick/add-pr-comment from 2.8.2 to 3.9.0 (#2821)

Bumps [mshick/add-pr-comment](https://github.com/mshick/add-pr-comment)
from 2.8.2 to 3.9.0.
---
 .github/workflows/build-sphinx.yml  | 4 ++--
 .github/workflows/conda-package.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 60530556efcc..87a7311b95e4 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -224,7 +224,7 @@ jobs:
         if: env.GH_EVENT_OPEN_PR_UPSTREAM == 'true'
         env:
           PR_NUM: ${{ github.event.number }}
-        uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
+        uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0
         with:
           message-id: url_to_docs
           message: |
@@ -268,7 +268,7 @@ jobs:
           git push tokened_docs gh-pages
 
       - name: Modify the comment with URL to official documentation
-        uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
+        uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0
         with:
           message-id: url_to_docs
           find: |
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index f10a372edf23..c894c530a20e 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -654,7 +654,7 @@ jobs:
 
       - name: Post result to PR
         if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }}
-        uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
+        uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0
         with:
           message-id: array_api_results
           message: |

From 21d5e1ace58640d8ab4b2831c832d28e6ccfc448 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 18 Mar 2026 20:34:25 +0100
Subject: [PATCH 17/35] Upgrade `clang-format` to v22 and improve `pylint`
 pre-commit configuration (#2813)

## Summary

- Migrate clang-format from `pocc/pre-commit-hooks` to
`pre-commit/mirrors-clang-format` v22.1.0
- Update `.clang-format` config to maintain consistent code style across
versions
- Configure pylint to properly handle C extension modules (numpy, dpctl)
- Remove manual clang-format-12 installation from GitHub workflow
- Reformat all C++ files with clang-format v22
- Add reformatting commit to `.git-blame-ignore-revs`

## Changes

### Pre-commit configuration
- Switch to `pre-commit/mirrors-clang-format` (v22.1.0) for better
version pinning and consistency across environments
- Add `--disable=c-extension-no-member` to pylint to avoid false
positives on C extension objects

### Code style
- Update `.clang-format`: Set `AfterControlStatement: Never` to preserve
existing brace placement style
- Reformat all C++ source files with clang-format v22 (net -159 lines
due to template parameter formatting improvements)

### Pylint configuration
- Add `extension-pkg-allow-list = ["numpy"]` to help pylint understand
numpy better
- Add `generated-members` patterns to handle dynamically created
attributes on numpy types

### CI/CD
- Remove obsolete `apt-get install clang-format-12` step from pre-commit
workflow
---
 .clang-format                                 |   2 +-
 .git-blame-ignore-revs                        |   3 +
 .github/workflows/pre-commit.yml              |   7 --
 .pre-commit-config.yaml                       |   7 +-
 benchmarks/asv.conf.json                      |  12 +-
 dpnp/backend/extensions/blas/dot_common.hpp   |   7 +-
 dpnp/backend/extensions/blas/gemm.cpp         |   3 +-
 dpnp/backend/extensions/blas/gemm_batch.cpp   |   3 +-
 dpnp/backend/extensions/blas/gemv.cpp         |   3 +-
 dpnp/backend/extensions/blas/syrk.cpp         |   3 +-
 dpnp/backend/extensions/common/ext/common.hpp |   6 +-
 .../extensions/common/ext/dispatch_table.hpp  |   9 +-
 .../elementwise_functions/common.hpp          | 107 +++++++++---------
 .../elementwise_functions.hpp                 |  49 +++-----
 .../simplify_iteration_space.cpp              |   3 +-
 dpnp/backend/extensions/fft/common.hpp        |   8 +-
 dpnp/backend/extensions/fft/out_of_place.tpp  |   5 +-
 .../extensions/lapack/evd_batch_common.hpp    |   3 +-
 .../backend/extensions/lapack/geqrf_batch.cpp |  14 +--
 dpnp/backend/extensions/lapack/gesv.cpp       |  19 ++--
 dpnp/backend/extensions/lapack/gesv_batch.cpp |  11 +-
 .../extensions/lapack/gesv_common_utils.hpp   |   6 +-
 dpnp/backend/extensions/lapack/gesvd.cpp      |   7 +-
 .../backend/extensions/lapack/gesvd_batch.cpp |  10 +-
 .../extensions/lapack/gesvd_common_utils.hpp  |   6 +-
 dpnp/backend/extensions/lapack/getrf.cpp      |  16 +--
 dpnp/backend/extensions/lapack/getrs.cpp      |   3 +-
 .../backend/extensions/lapack/getrs_batch.cpp |   3 +-
 dpnp/backend/extensions/lapack/heevd.cpp      |   4 +-
 .../backend/extensions/lapack/heevd_batch.cpp |   4 +-
 .../extensions/lapack/linalg_exceptions.hpp   |   5 +-
 .../backend/extensions/lapack/orgqr_batch.cpp |  18 +--
 dpnp/backend/extensions/lapack/syevd.cpp      |   4 +-
 .../backend/extensions/lapack/syevd_batch.cpp |   4 +-
 .../backend/extensions/lapack/ungqr_batch.cpp |  18 +--
 .../extensions/statistics/bincount.cpp        |   5 +-
 .../statistics/histogram_common.hpp           |  55 ++-------
 .../extensions/statistics/histogramdd.cpp     |   5 +-
 .../statistics/sliding_window1d.hpp           |  68 +++--------
 .../ufunc/elementwise_functions/erf_funcs.cpp |   3 +-
 .../elementwise_functions/interpolate.cpp     |   4 +-
 .../ufunc/elementwise_functions/populate.hpp  |  22 ++--
 dpnp/backend/extensions/ufunc/ufunc_py.cpp    |   5 +-
 dpnp/backend/extensions/vm/common.hpp         |  10 +-
 dpnp/backend/kernels/dpnp_krnl_random.cpp     |   6 +-
 .../kernels/elementwise_functions/degrees.hpp |   5 +-
 .../kernels/elementwise_functions/divmod.hpp  |   5 +-
 .../kernels/elementwise_functions/fabs.hpp    |   5 +-
 .../kernels/elementwise_functions/fmax.hpp    |   6 +-
 .../kernels/elementwise_functions/fmin.hpp    |   6 +-
 .../kernels/elementwise_functions/isclose.hpp |   6 +-
 .../elementwise_functions/nan_to_num.hpp      |   3 +-
 .../kernels/elementwise_functions/radians.hpp |   5 +-
 dpnp/backend/src/dpnp_fptr.hpp                |   3 +-
 dpnp/backend/src/queue_sycl.cpp               |   3 +-
 dpnp/backend/src/queue_sycl.hpp               |  10 +-
 dpnp/backend/tests/test_random.cpp            |   5 +-
 pyproject.toml                                |   4 +
 58 files changed, 241 insertions(+), 400 deletions(-)

diff --git a/.clang-format b/.clang-format
index 622a5bf67634..4304e0ed40d4 100644
--- a/.clang-format
+++ b/.clang-format
@@ -16,7 +16,7 @@ BinPackParameters: false
 BraceWrapping:
   AfterCaseLabel:  true
   AfterClass:      true
-  AfterControlStatement: MultiLine
+  AfterControlStatement: Never
   AfterEnum:       true
   AfterFunction:   true
   AfterNamespace:  true
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 841f009ace89..e4fe0bcc4b2f 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -17,3 +17,6 @@ c106d91b866f4acd30226b68519b12a73a881490
 
 # Add pygrep-hooks to pre-commit config
 e62718415aa3660da5f607e352c991a063a54219
+
+# Bump clang-format from 12.0.1 to 22.1.0 version
+c2d65bd451a7d8e5b6319147da95e9dabf7a382b
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index d5f9f22146fc..d8f59405ce89 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -15,13 +15,6 @@ jobs:
     timeout-minutes: 10
 
     steps:
-      - name: Set up clang-format
-        run: |
-          sudo apt-get install -y clang-format-12
-          sudo unlink /usr/bin/clang-format
-          sudo ln -s /usr/bin/clang-format-12 /usr/bin/clang-format
-          clang-format --version
-
       - name: Set up pip packages
         uses: BSFishy/pip-action@8f2d471d809dc20b6ada98c91910b6ae6243f318 # v1
         with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 92b81fe95852..57ec9e2a2a8e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -88,8 +88,8 @@ repos:
         additional_dependencies:
             - flake8-docstrings==1.7.0
             - flake8-bugbear==24.12.12
--   repo: https://github.com/pocc/pre-commit-hooks
-    rev: v1.3.5
+-   repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v22.1.0
     hooks:
     -   id: clang-format
         args: ["-i"]
@@ -114,7 +114,8 @@ repos:
             "-sn", # Don't display the score
             "--disable=import-error",
             "--disable=redefined-builtin",
-            "--disable=unused-wildcard-import"
+            "--disable=unused-wildcard-import",
+            "--disable=c-extension-no-member"
             ]
         files: '^dpnp/(dpnp_iface.*|fft|linalg|scipy|dpnp_array)'
 -   repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index c5e5663e21fb..3d0e7f88d55f 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -15,7 +15,9 @@
 
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "tip" (for mercurial).
-    "branches": ["HEAD"],
+    "branches": [
+        "HEAD"
+    ],
 
     // The DVCS being used.  If not set, it will be automatically
     // determined from "repo" by looking at the protocol in the URL
@@ -35,7 +37,9 @@
 
     // The Pythons you'd like to test against.  If not provided, defaults
     // to the current version of Python used to run `asv`.
-    "pythons": ["3.7"],
+    "pythons": [
+        "3.7"
+    ],
 
     // The matrix of dependencies to test.  Each key is the name of a
     // package (in PyPI) and the values are version numbers.  An empty
@@ -53,7 +57,6 @@
     // environments in.  If not provided, defaults to "env"
     "env_dir": "env",
 
-
     // The directory (relative to the current directory) that raw benchmark
     // results are stored in.  If not provided, defaults to "results".
     "results_dir": "results",
@@ -79,7 +82,8 @@
     // skipped for the matching benchmark.
     //
     // "regressions_first_commits": {
-    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this
+    //    commit
     //    "another_benchmark": null,   // Skip regression detection altogether
     // }
 }
diff --git a/dpnp/backend/extensions/blas/dot_common.hpp b/dpnp/backend/extensions/blas/dot_common.hpp
index 1672e7217cba..383804ff1718 100644
--- a/dpnp/backend/extensions/blas/dot_common.hpp
+++ b/dpnp/backend/extensions/blas/dot_common.hpp
@@ -97,8 +97,7 @@ std::pair<sycl::event, sycl::event>
 
     if (!dpctl::utils::queues_are_compatible(
             exec_q,
-            {vectorX.get_queue(), vectorY.get_queue(), result.get_queue()}))
-    {
+            {vectorX.get_queue(), vectorY.get_queue(), result.get_queue()})) {
         throw py::value_error(
             "USM allocations are not compatible with the execution queue.");
     }
@@ -120,8 +119,8 @@ std::pair<sycl::event, sycl::event>
     const int vectorY_typenum = vectorY.get_typenum();
     const int result_typenum = result.get_typenum();
 
-    if (result_typenum != vectorX_typenum || result_typenum != vectorY_typenum)
-    {
+    if (result_typenum != vectorX_typenum ||
+        result_typenum != vectorY_typenum) {
         throw py::value_error("Given arrays must be of the same type.");
     }
 
diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp
index 48c1ae98ead4..86f751baf2e0 100644
--- a/dpnp/backend/extensions/blas/gemm.cpp
+++ b/dpnp/backend/extensions/blas/gemm.cpp
@@ -181,8 +181,7 @@ std::tuple<sycl::event, sycl::event, bool>
 
     if (!dpctl::utils::queues_are_compatible(
             exec_q,
-            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()}))
-    {
+            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()})) {
         throw py::value_error(
             "USM allocations are not compatible with the execution queue.");
     }
diff --git a/dpnp/backend/extensions/blas/gemm_batch.cpp b/dpnp/backend/extensions/blas/gemm_batch.cpp
index a6cd7ac4e130..d02b035922c0 100644
--- a/dpnp/backend/extensions/blas/gemm_batch.cpp
+++ b/dpnp/backend/extensions/blas/gemm_batch.cpp
@@ -237,8 +237,7 @@ std::tuple<sycl::event, sycl::event, bool>
 
     if (!dpctl::utils::queues_are_compatible(
             exec_q,
-            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()}))
-    {
+            {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()})) {
         throw py::value_error(
             "USM allocations are not compatible with the execution queue.");
     }
diff --git a/dpnp/backend/extensions/blas/gemv.cpp b/dpnp/backend/extensions/blas/gemv.cpp
index a9c5414ef8c7..0b6ae78bc76e 100644
--- a/dpnp/backend/extensions/blas/gemv.cpp
+++ b/dpnp/backend/extensions/blas/gemv.cpp
@@ -169,8 +169,7 @@ std::pair<sycl::event, sycl::event>
 
     if (!dpctl::utils::queues_are_compatible(
             exec_q,
-            {matrixA.get_queue(), vectorX.get_queue(), vectorY.get_queue()}))
-    {
+            {matrixA.get_queue(), vectorX.get_queue(), vectorY.get_queue()})) {
         throw py::value_error(
             "USM allocations are not compatible with the execution queue.");
     }
diff --git a/dpnp/backend/extensions/blas/syrk.cpp b/dpnp/backend/extensions/blas/syrk.cpp
index 8b0ebce3d888..9668e72b57f6 100644
--- a/dpnp/backend/extensions/blas/syrk.cpp
+++ b/dpnp/backend/extensions/blas/syrk.cpp
@@ -248,8 +248,7 @@ std::pair<sycl::event, sycl::event>
     }
 
     if (!dpctl::utils::queues_are_compatible(
-            exec_q, {matrixA.get_queue(), resultC.get_queue()}))
-    {
+            exec_q, {matrixA.get_queue(), resultC.get_queue()})) {
         throw py::value_error(
             "USM allocations are not compatible with the execution queue.");
     }
diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp
index d626b56ea00c..f0ce1722bfb1 100644
--- a/dpnp/backend/extensions/common/ext/common.hpp
+++ b/dpnp/backend/extensions/common/ext/common.hpp
@@ -213,8 +213,7 @@ sycl::nd_range<1>
 pybind11::dtype dtype_from_typenum(int dst_typenum);
 
 template <typename dispatchT,
-          template <typename fnT, typename T>
-          typename factoryT,
+          template <typename fnT, typename T> typename factoryT,
           int _num_types = type_dispatch::num_types>
 inline void init_dispatch_vector(dispatchT dispatch_vector[])
 {
@@ -223,8 +222,7 @@ inline void init_dispatch_vector(dispatchT dispatch_vector[])
 }
 
 template <typename dispatchT,
-          template <typename fnT, typename D, typename S>
-          typename factoryT,
+          template <typename fnT, typename D, typename S> typename factoryT,
           int _num_types = type_dispatch::num_types>
 inline void init_dispatch_table(dispatchT dispatch_table[][_num_types])
 {
diff --git a/dpnp/backend/extensions/common/ext/dispatch_table.hpp b/dpnp/backend/extensions/common/ext/dispatch_table.hpp
index 4cfe1bd57250..6655f054f355 100644
--- a/dpnp/backend/extensions/common/ext/dispatch_table.hpp
+++ b/dpnp/backend/extensions/common/ext/dispatch_table.hpp
@@ -99,8 +99,7 @@ using SupportedDTypeList2 = std::vector<DTypePair>;
 
 template <typename FnT,
           typename SupportedTypes,
-          template <typename>
-          typename Func>
+          template <typename> typename Func>
 struct TableBuilder
 {
     template <typename _FnT, typename T>
@@ -125,8 +124,7 @@ struct TableBuilder
 
 template <typename FnT,
           typename SupportedTypes,
-          template <typename, typename>
-          typename Func>
+          template <typename, typename> typename Func>
 struct TableBuilder2
 {
     template <typename _FnT, typename T1, typename T2>
@@ -232,8 +230,7 @@ class DispatchTable2
     }
 
     template <typename SupportedTypes,
-              template <typename, typename>
-              typename Func>
+              template <typename, typename> typename Func>
     void populate_dispatch_table()
     {
         using TBulder = typename TableBuilder2<FnT, SupportedTypes, Func>::type;
diff --git a/dpnp/backend/extensions/elementwise_functions/common.hpp b/dpnp/backend/extensions/elementwise_functions/common.hpp
index df2b3afe53b9..f3b15c8d6774 100644
--- a/dpnp/backend/extensions/elementwise_functions/common.hpp
+++ b/dpnp/backend/extensions/elementwise_functions/common.hpp
@@ -131,8 +131,7 @@ struct UnaryTwoOutputsContigFunctor
         else if constexpr (enable_sg_loadstore &&
                            UnaryTwoOutputsOpT::supports_sg_loadstore::value &&
                            UnaryTwoOutputsOpT::supports_vec::value &&
-                           (vec_sz > 1))
-        {
+                           (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -171,8 +170,7 @@ struct UnaryTwoOutputsContigFunctor
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryTwoOutputsOpT::supports_sg_loadstore::value &&
-                           std::is_same_v<resT1, argT>)
-        {
+                           std::is_same_v<resT1, argT>) {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -214,8 +212,7 @@ struct UnaryTwoOutputsContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           UnaryTwoOutputsOpT::supports_sg_loadstore::value)
-        {
+                           UnaryTwoOutputsOpT::supports_sg_loadstore::value) {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -359,8 +356,7 @@ struct BinaryTwoOutputsContigFunctor
 
         if constexpr (enable_sg_loadstore &&
                       BinaryOperatorT::supports_sg_loadstore::value &&
-                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
+                      BinaryOperatorT::supports_vec::value && (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -405,8 +401,7 @@ struct BinaryTwoOutputsContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           BinaryOperatorT::supports_sg_loadstore::value)
-        {
+                           BinaryOperatorT::supports_sg_loadstore::value) {
             auto sg = ndit.get_sub_group();
             const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -528,21 +523,18 @@ struct BinaryTwoOutputsStridedFunctor
  * dpctl::tensor::kernels::elementwise_common namespace.
  */
 template <typename argTy,
-          template <typename T>
-          class UnaryTwoOutputsType,
+          template <typename T> class UnaryTwoOutputsType,
           template <typename A,
                     typename R1,
                     typename R2,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable>
-          class UnaryTwoOutputsContigFunctorT,
+                    bool enable> class UnaryTwoOutputsContigFunctorT,
           template <typename A,
                     typename R1,
                     typename R2,
                     std::uint8_t vs,
-                    std::uint8_t nv>
-          class kernel_name,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event
@@ -576,8 +568,7 @@ sycl::event
 
         if (is_aligned<required_alignment>(arg_p) &&
             is_aligned<required_alignment>(res1_p) &&
-            is_aligned<required_alignment>(res2_p))
-        {
+            is_aligned<required_alignment>(res2_p)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
             using Impl =
@@ -613,12 +604,15 @@ sycl::event
  * dpctl::tensor::kernels::elementwise_common namespace.
  */
 template <typename argTy,
-          template <typename T>
-          class UnaryTwoOutputsType,
-          template <typename A, typename R1, typename R2, typename I>
-          class UnaryTwoOutputsStridedFunctorT,
-          template <typename A, typename R1, typename R2, typename I>
-          class kernel_name>
+          template <typename T> class UnaryTwoOutputsType,
+          template <typename A,
+                    typename R1,
+                    typename R2,
+                    typename I> class UnaryTwoOutputsStridedFunctorT,
+          template <typename A,
+                    typename R1,
+                    typename R2,
+                    typename I> class kernel_name>
 sycl::event unary_two_outputs_strided_impl(
     sycl::queue &exec_q,
     std::size_t nelems,
@@ -665,27 +659,25 @@ sycl::event unary_two_outputs_strided_impl(
  * @note It extends binary_contig_impl from
  * dpctl::tensor::kernels::elementwise_common namespace.
  */
-template <typename argTy1,
-          typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryTwoOutputsType,
-          template <typename T1,
-                    typename T2,
-                    typename T3,
-                    typename T4,
-                    std::uint8_t vs,
-                    std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryTwoOutputsContigFunctorT,
-          template <typename T1,
-                    typename T2,
-                    typename T3,
-                    typename T4,
-                    std::uint8_t vs,
-                    std::uint8_t nv>
-          class kernel_name,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u>
+template <
+    typename argTy1,
+    typename argTy2,
+    template <typename T1, typename T2> class BinaryTwoOutputsType,
+    template <typename T1,
+              typename T2,
+              typename T3,
+              typename T4,
+              std::uint8_t vs,
+              std::uint8_t nv,
+              bool enable_sg_loadstore> class BinaryTwoOutputsContigFunctorT,
+    template <typename T1,
+              typename T2,
+              typename T3,
+              typename T4,
+              std::uint8_t vs,
+              std::uint8_t nv> class kernel_name,
+    std::uint8_t vec_sz = 4u,
+    std::uint8_t n_vecs = 2u>
 sycl::event
     binary_two_outputs_contig_impl(sycl::queue &exec_q,
                                    std::size_t nelems,
@@ -726,8 +718,7 @@ sycl::event
         if (is_aligned<required_alignment>(arg1_tp) &&
             is_aligned<required_alignment>(arg2_tp) &&
             is_aligned<required_alignment>(res1_tp) &&
-            is_aligned<required_alignment>(res2_tp))
-        {
+            is_aligned<required_alignment>(res2_tp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
             using Impl = BinaryTwoOutputsContigFunctorT<argTy1, argTy2, resTy1,
@@ -761,15 +752,19 @@ sycl::event
  * @note It extends binary_strided_impl from
  * dpctl::tensor::kernels::elementwise_common namespace.
  */
-template <
-    typename argTy1,
-    typename argTy2,
-    template <typename T1, typename T2>
-    class BinaryTwoOutputsType,
-    template <typename T1, typename T2, typename T3, typename T4, typename IndT>
-    class BinaryTwoOutputsStridedFunctorT,
-    template <typename T1, typename T2, typename T3, typename T4, typename IndT>
-    class kernel_name>
+template <typename argTy1,
+          typename argTy2,
+          template <typename T1, typename T2> class BinaryTwoOutputsType,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename IndT> class BinaryTwoOutputsStridedFunctorT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename IndT> class kernel_name>
 sycl::event binary_two_outputs_strided_impl(
     sycl::queue &exec_q,
     std::size_t nelems,
diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
index c996ac07df02..6a29c9a33c5a 100644
--- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp
@@ -309,8 +309,7 @@ std::pair<sycl::event, sycl::event>
 
     // check that types are supported
     if (dst1_typeid != func_output_typeids.first ||
-        dst2_typeid != func_output_typeids.second)
-    {
+        dst2_typeid != func_output_typeids.second) {
         throw py::value_error(
             "One of destination arrays has unexpected elemental data type.");
     }
@@ -362,8 +361,7 @@ std::pair<sycl::event, sycl::event>
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(src, dst1) && !same_logical_tensors(src, dst1)) ||
         (overlap(src, dst2) && !same_logical_tensors(src, dst2)) ||
-        (overlap(dst1, dst2) && !same_logical_tensors(dst1, dst2)))
-    {
+        (overlap(dst1, dst2) && !same_logical_tensors(dst1, dst2))) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
 
@@ -430,8 +428,7 @@ std::pair<sycl::event, sycl::event>
         simplified_dst2_strides, src_offset, dst1_offset, dst2_offset);
 
     if (nd == 1 && simplified_src_strides[0] == 1 &&
-        simplified_dst1_strides[0] == 1 && simplified_dst2_strides[0] == 1)
-    {
+        simplified_dst1_strides[0] == 1 && simplified_dst2_strides[0] == 1) {
         // Special case of contiguous data
         auto contig_fn = contig_dispatch_vector[src_typeid];
 
@@ -625,8 +622,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
     auto const &same_logical_tensors =
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
-        (overlap(src2, dst) && !same_logical_tensors(src2, dst)))
-    {
+        (overlap(src2, dst) && !same_logical_tensors(src2, dst))) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
     // check memory overlap
@@ -693,8 +689,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
 
         if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
             isEqual(simplified_src2_strides, unit_stride) &&
-            isEqual(simplified_dst_strides, unit_stride))
-        {
+            isEqual(simplified_dst_strides, unit_stride)) {
             auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
 
             if (contig_fn != nullptr) {
@@ -716,8 +711,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_src2_strides, zero_one_strides) &&
                 isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
-                isEqual(simplified_dst_strides, {simplified_shape[1], one}))
-            {
+                isEqual(simplified_dst_strides, {simplified_shape[1], one})) {
                 auto matrix_row_broadcast_fn =
                     contig_matrix_row_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
@@ -731,8 +725,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                         is_aligned<required_alignment>(
                             src2_data + src2_offset * src2_itemsize) &&
                         is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
+                            dst_data + dst_offset * dst_itemsize)) {
                         std::size_t n0 = simplified_shape[0];
                         std::size_t n1 = simplified_shape[1];
                         sycl::event comp_ev = matrix_row_broadcast_fn(
@@ -749,8 +742,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
             }
             if (isEqual(simplified_src1_strides, one_zero_strides) &&
                 isEqual(simplified_src2_strides, {one, simplified_shape[0]}) &&
-                isEqual(simplified_dst_strides, {one, simplified_shape[0]}))
-            {
+                isEqual(simplified_dst_strides, {one, simplified_shape[0]})) {
                 auto row_matrix_broadcast_fn =
                     contig_row_matrix_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
@@ -765,8 +757,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                         is_aligned<required_alignment>(
                             src2_data + src2_offset * src2_itemsize) &&
                         is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
+                            dst_data + dst_offset * dst_itemsize)) {
                         std::size_t n0 = simplified_shape[1];
                         std::size_t n1 = simplified_shape[0];
                         sycl::event comp_ev = row_matrix_broadcast_fn(
@@ -839,8 +830,7 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
     }
 
     if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
+        src2_typeid >= td_ns::num_types) {
         throw std::runtime_error("binary output type lookup failed");
     }
     int dst_typeid = output_types_table[src1_typeid][src2_typeid];
@@ -898,8 +888,8 @@ std::pair<sycl::event, sycl::event>
     }
 
     // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst1, dst2}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {src1, src2, dst1, dst2})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -955,8 +945,7 @@ std::pair<sycl::event, sycl::event>
         (overlap(src1, dst2) && !same_logical_tensors(src1, dst2)) ||
         (overlap(src2, dst1) && !same_logical_tensors(src2, dst1)) ||
         (overlap(src2, dst2) && !same_logical_tensors(src2, dst2)) ||
-        (overlap(dst1, dst2)))
-    {
+        (overlap(dst1, dst2))) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
 
@@ -1031,8 +1020,7 @@ std::pair<sycl::event, sycl::event>
     if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
         isEqual(simplified_src2_strides, unit_stride) &&
         isEqual(simplified_dst1_strides, unit_stride) &&
-        isEqual(simplified_dst2_strides, unit_stride))
-    {
+        isEqual(simplified_dst2_strides, unit_stride)) {
         auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
 
         if (contig_fn != nullptr) {
@@ -1107,8 +1095,7 @@ std::pair<py::object, py::object> py_binary_two_outputs_ufunc_result_type(
     }
 
     if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
+        src2_typeid >= td_ns::num_types) {
         throw std::runtime_error("binary output type lookup failed");
     }
     std::pair<int, int> dst_typeids =
@@ -1263,8 +1250,7 @@ std::pair<sycl::event, sycl::event>
             std::initializer_list<py::ssize_t>{1};
 
         if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) &&
-            isEqual(simplified_lhs_strides, unit_stride))
-        {
+            isEqual(simplified_lhs_strides, unit_stride)) {
             auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
 
             if (contig_fn != nullptr) {
@@ -1283,8 +1269,7 @@ std::pair<sycl::event, sycl::event>
             static constexpr py::ssize_t one{1};
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_rhs_strides, one_zero_strides) &&
-                isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
-            {
+                isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) {
                 auto row_matrix_broadcast_fn =
                     contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
                                                               [lhs_typeid];
diff --git a/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp b/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp
index e34cb74fcb0a..c60602ccb01d 100644
--- a/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp
+++ b/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp
@@ -292,8 +292,7 @@ void simplify_iteration_space_4(
         simplified_dst_strides.reserve(nd);
 
         if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
-            (src3_strides[0] < 0) && (dst_strides[0] < 0))
-        {
+            (src3_strides[0] < 0) && (dst_strides[0] < 0)) {
             simplified_src1_strides.push_back(-src1_strides[0]);
             simplified_src2_strides.push_back(-src2_strides[0]);
             simplified_src3_strides.push_back(-src3_strides[0]);
diff --git a/dpnp/backend/extensions/fft/common.hpp b/dpnp/backend/extensions/fft/common.hpp
index f76da9721316..44f0b43f8597 100644
--- a/dpnp/backend/extensions/fft/common.hpp
+++ b/dpnp/backend/extensions/fft/common.hpp
@@ -56,8 +56,7 @@ class DescriptorWrapper
     {
         mkl_dft::precision fft_prec = get_precision();
         if (fft_prec == mkl_dft::precision::DOUBLE &&
-            !q.get_device().has(sycl::aspect::fp64))
-        {
+            !q.get_device().has(sycl::aspect::fp64)) {
             throw py::value_error("Descriptor is double precision but the "
                                   "device does not support double precision.");
         }
@@ -66,10 +65,7 @@ class DescriptorWrapper
         queue_ptr_ = std::make_unique<sycl::queue>(q);
     }
 
-    descr_type &get_descriptor()
-    {
-        return descr_;
-    }
+    descr_type &get_descriptor() { return descr_; }
 
     const sycl::queue &get_queue() const
     {
diff --git a/dpnp/backend/extensions/fft/out_of_place.tpp b/dpnp/backend/extensions/fft/out_of_place.tpp
index 290408dc60bc..ed5cd37df7f1 100644
--- a/dpnp/backend/extensions/fft/out_of_place.tpp
+++ b/dpnp/backend/extensions/fft/out_of_place.tpp
@@ -82,9 +82,8 @@ std::pair<sycl::event, sycl::event>
     }
 
     sycl::queue exec_q = descr.get_queue();
-    if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {in.get_queue(), out.get_queue()}))
-    {
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q, {in.get_queue(), out.get_queue()})) {
         throw py::value_error("USM allocations are not compatible with the "
                               "execution queue of the descriptor.");
     }
diff --git a/dpnp/backend/extensions/lapack/evd_batch_common.hpp b/dpnp/backend/extensions/lapack/evd_batch_common.hpp
index e1debdc35934..d2edffcf520a 100644
--- a/dpnp/backend/extensions/lapack/evd_batch_common.hpp
+++ b/dpnp/backend/extensions/lapack/evd_batch_common.hpp
@@ -75,8 +75,7 @@ std::pair<sycl::event, sycl::event>
                       expected_eig_vecs_nd, expected_eig_vals_nd);
 
     if (eig_vecs_shape[2] != eig_vals_shape[0] ||
-        eig_vecs_shape[0] != eig_vals_shape[1])
-    {
+        eig_vecs_shape[0] != eig_vals_shape[1]) {
         throw py::value_error(
             "The shape of 'eig_vals' must be (batch_size, n), "
             "where batch_size = " +
diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp
index e0821e23e440..033c3db01b10 100644
--- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp
+++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp
@@ -98,13 +98,13 @@ static sycl::event geqrf_batch_impl(sycl::queue &exec_q,
 
         geqrf_batch_event = mkl_lapack::geqrf_batch(
             exec_q,
-            m, // The number of rows in each matrix in the batch; (0 ≤ m).
-               // It must be a non-negative integer.
-            n, // The number of columns in each matrix in the batch; (0 ≤ n).
-               // It must be a non-negative integer.
-            a, // Pointer to the batch of matrices, each of size (m x n).
-            lda,      // The leading dimension of each matrix in the batch.
-                      // For row major layout, lda ≥ max(1, m).
+            m,   // The number of rows in each matrix in the batch; (0 ≤ m).
+                 // It must be a non-negative integer.
+            n,   // The number of columns in each matrix in the batch; (0 ≤ n).
+                 // It must be a non-negative integer.
+            a,   // Pointer to the batch of matrices, each of size (m x n).
+            lda, // The leading dimension of each matrix in the batch.
+                 // For row major layout, lda ≥ max(1, m).
             stride_a, // Stride between consecutive matrices in the batch.
             tau, // Pointer to the array of scalar factors of the elementary
                  // reflectors for each matrix in the batch.
diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp
index 0569fab2c350..bec24db585a6 100644
--- a/dpnp/backend/extensions/lapack/gesv.cpp
+++ b/dpnp/backend/extensions/lapack/gesv.cpp
@@ -114,14 +114,14 @@ static sycl::event gesv_impl(sycl::queue &exec_q,
     try {
         getrf_event = mkl_lapack::getrf(
             exec_q,
-            n,    // The order of the square matrix A (0 ≤ n).
-                  // It must be a non-negative integer.
-            n,    // The number of columns in the square matrix A (0 ≤ n).
-                  // It must be a non-negative integer.
-            a,    // Pointer to the square matrix A (n x n).
-            lda,  // The leading dimension of matrix A.
-                  // It must be at least max(1, n).
-            ipiv, // Pointer to the output array of pivot indices.
+            n,          // The order of the square matrix A (0 ≤ n).
+                        // It must be a non-negative integer.
+            n,          // The number of columns in the square matrix A (0 ≤ n).
+                        // It must be a non-negative integer.
+            a,          // Pointer to the square matrix A (n x n).
+            lda,        // The leading dimension of matrix A.
+                        // It must be at least max(1, n).
+            ipiv,       // Pointer to the output array of pivot indices.
             scratchpad, // Pointer to scratchpad memory to be used by MKL
                         // routine for storing intermediate results.
             scratchpad_size, depends);
@@ -242,8 +242,7 @@ std::pair<sycl::event, sycl::event>
     // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty
     // events
     if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) ||
-        helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape))
-    {
+        helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) {
         // nothing to do
         return std::make_pair(sycl::event(), sycl::event());
     }
diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp
index ce02f8517eb5..893279245344 100644
--- a/dpnp/backend/extensions/lapack/gesv_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp
@@ -258,10 +258,10 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q,
         try {
             gesv_event = mkl_lapack::gesv(
                 exec_q,
-                n,    // The order of the square matrix A
-                      // and the number of rows in matrix B (0 ≤ n).
-                nrhs, // The number of right-hand sides,
-                      // i.e., the number of columns in matrix B (0 ≤ nrhs).
+                n,       // The order of the square matrix A
+                         // and the number of rows in matrix B (0 ≤ n).
+                nrhs,    // The number of right-hand sides,
+                         // i.e., the number of columns in matrix B (0 ≤ nrhs).
                 a_batch, // Pointer to the square coefficient matrix A (n x n).
                 lda, // The leading dimension of a, must be at least max(1, n).
                 current_ipiv, // The pivot indices that define the permutation
@@ -341,8 +341,7 @@ std::pair<sycl::event, sycl::event>
     // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty
     // events
     if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) ||
-        helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape))
-    {
+        helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) {
         // nothing to do
         return std::make_pair(sycl::event(), sycl::event());
     }
diff --git a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
index d86d7e29413e..62f1e9589a0b 100644
--- a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
+++ b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp
@@ -64,8 +64,7 @@ inline void common_gesv_checks(sycl::queue &exec_q,
     }
 
     if (dependent_vals_nd < min_dependent_vals_ndim ||
-        dependent_vals_nd > max_dependent_vals_ndim)
-    {
+        dependent_vals_nd > max_dependent_vals_ndim) {
         throw py::value_error("The dependent values array has ndim=" +
                               std::to_string(dependent_vals_nd) + ", but a " +
                               std::to_string(min_dependent_vals_ndim) +
@@ -95,8 +94,7 @@ inline void common_gesv_checks(sycl::queue &exec_q,
 
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {coeff_matrix, dependent_vals}))
-    {
+                                             {coeff_matrix, dependent_vals})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues.");
     }
diff --git a/dpnp/backend/extensions/lapack/gesvd.cpp b/dpnp/backend/extensions/lapack/gesvd.cpp
index d46179ac3b9a..e347837e3cfe 100644
--- a/dpnp/backend/extensions/lapack/gesvd.cpp
+++ b/dpnp/backend/extensions/lapack/gesvd.cpp
@@ -171,8 +171,7 @@ std::pair<sycl::event, sycl::event>
     // Ensure `m` and 'n' are non-zero, otherwise return empty
     // events
     if (gesvd_utils::check_zeros_shape_gesvd(a_array, out_s, out_u, out_vt,
-                                             jobu_val, jobvt_val))
-    {
+                                             jobu_val, jobvt_val)) {
         // nothing to do
         return std::make_pair(sycl::event(), sycl::event());
     }
@@ -223,8 +222,8 @@ struct GesvdContigFactory
 {
     fnT get()
     {
-        if constexpr (types::GesvdTypePairSupportFactory<T, RealT>::is_defined)
-        {
+        if constexpr (types::GesvdTypePairSupportFactory<T,
+                                                         RealT>::is_defined) {
             return gesvd_impl<T, RealT>;
         }
         else {
diff --git a/dpnp/backend/extensions/lapack/gesvd_batch.cpp b/dpnp/backend/extensions/lapack/gesvd_batch.cpp
index eb9903ba6e1e..868facc200e2 100644
--- a/dpnp/backend/extensions/lapack/gesvd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/gesvd_batch.cpp
@@ -102,8 +102,7 @@ static sycl::event gesvd_batch_impl(sycl::queue &exec_q,
     std::int64_t vt_size = 0;
 
     if (jobu == oneapi::mkl::jobsvd::somevec ||
-        jobu == oneapi::mkl::jobsvd::vectorsina)
-    {
+        jobu == oneapi::mkl::jobsvd::vectorsina) {
         u_size = m * k;
         vt_size = k * n;
     }
@@ -238,8 +237,7 @@ std::pair<sycl::event, sycl::event>
     // Ensure `batch_size`, `m` and 'n' are non-zero, otherwise return empty
     // events
     if (gesvd_utils::check_zeros_shape_gesvd(a_array, out_s, out_u, out_vt,
-                                             jobu_val, jobvt_val))
-    {
+                                             jobu_val, jobvt_val)) {
         // nothing to do
         return std::make_pair(sycl::event(), sycl::event());
     }
@@ -293,8 +291,8 @@ struct GesvdBatchContigFactory
 {
     fnT get()
     {
-        if constexpr (types::GesvdTypePairSupportFactory<T, RealT>::is_defined)
-        {
+        if constexpr (types::GesvdTypePairSupportFactory<T,
+                                                         RealT>::is_defined) {
             return gesvd_batch_impl<T, RealT>;
         }
         else {
diff --git a/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp b/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp
index ce2d9c1eb474..1cd2c8ac4997 100644
--- a/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp
+++ b/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp
@@ -122,8 +122,7 @@ inline void common_gesvd_checks(sycl::queue &exec_q,
 
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {a_array, out_s, out_u, out_vt}))
-    {
+                                             {a_array, out_s, out_u, out_vt})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues.");
     }
@@ -131,8 +130,7 @@ inline void common_gesvd_checks(sycl::queue &exec_q,
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
     if (overlap(a_array, out_s) || overlap(a_array, out_u) ||
         overlap(a_array, out_vt) || overlap(out_s, out_u) ||
-        overlap(out_s, out_vt) || overlap(out_u, out_vt))
-    {
+        overlap(out_s, out_vt) || overlap(out_u, out_vt)) {
         throw py::value_error("Arrays have overlapping segments of memory");
     }
 
diff --git a/dpnp/backend/extensions/lapack/getrf.cpp b/dpnp/backend/extensions/lapack/getrf.cpp
index abf20aff643a..870ccc8e811a 100644
--- a/dpnp/backend/extensions/lapack/getrf.cpp
+++ b/dpnp/backend/extensions/lapack/getrf.cpp
@@ -91,14 +91,14 @@ static sycl::event getrf_impl(sycl::queue &exec_q,
 
         getrf_event = mkl_lapack::getrf(
             exec_q,
-            m,    // The number of rows in the input matrix A (0 ≤ m).
-                  // It must be a non-negative integer.
-            n,    // The number of columns in the input matrix A (0 ≤ n).
-                  // It must be a non-negative integer.
-            a,    // Pointer to the input matrix A (m x n).
-            lda,  // The leading dimension of matrix A.
-                  // It must be at least max(1, m).
-            ipiv, // Pointer to the output array of pivot indices.
+            m,          // The number of rows in the input matrix A (0 ≤ m).
+                        // It must be a non-negative integer.
+            n,          // The number of columns in the input matrix A (0 ≤ n).
+                        // It must be a non-negative integer.
+            a,          // Pointer to the input matrix A (m x n).
+            lda,        // The leading dimension of matrix A.
+                        // It must be at least max(1, m).
+            ipiv,       // Pointer to the output array of pivot indices.
             scratchpad, // Pointer to scratchpad memory to be used by MKL
                         // routine for storing intermediate results.
             scratchpad_size, depends);
diff --git a/dpnp/backend/extensions/lapack/getrs.cpp b/dpnp/backend/extensions/lapack/getrs.cpp
index 8108afd97003..94e1a1027898 100644
--- a/dpnp/backend/extensions/lapack/getrs.cpp
+++ b/dpnp/backend/extensions/lapack/getrs.cpp
@@ -208,8 +208,7 @@ std::pair<sycl::event, sycl::event>
 
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {a_array, b_array, ipiv_array}))
-    {
+                                             {a_array, b_array, ipiv_array})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
diff --git a/dpnp/backend/extensions/lapack/getrs_batch.cpp b/dpnp/backend/extensions/lapack/getrs_batch.cpp
index 9fc6ce1a5dfc..f4fb446c328d 100644
--- a/dpnp/backend/extensions/lapack/getrs_batch.cpp
+++ b/dpnp/backend/extensions/lapack/getrs_batch.cpp
@@ -253,8 +253,7 @@ std::pair<sycl::event, sycl::event>
 
     // check compatibility of execution queue and allocation queue
     if (!dpctl::utils::queues_are_compatible(exec_q,
-                                             {a_array, b_array, ipiv_array}))
-    {
+                                             {a_array, b_array, ipiv_array})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp
index 5990e5344a17..96d6a03e9b8e 100644
--- a/dpnp/backend/extensions/lapack/heevd.cpp
+++ b/dpnp/backend/extensions/lapack/heevd.cpp
@@ -124,8 +124,8 @@ struct HeevdContigFactory
 {
     fnT get()
     {
-        if constexpr (types::HeevdTypePairSupportFactory<T, RealT>::is_defined)
-        {
+        if constexpr (types::HeevdTypePairSupportFactory<T,
+                                                         RealT>::is_defined) {
             return heevd_impl<T, RealT>;
         }
         else {
diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp
index e1c1a96bc320..e8614498bd41 100644
--- a/dpnp/backend/extensions/lapack/heevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp
@@ -161,8 +161,8 @@ struct HeevdBatchContigFactory
 {
     fnT get()
     {
-        if constexpr (types::HeevdTypePairSupportFactory<T, RealT>::is_defined)
-        {
+        if constexpr (types::HeevdTypePairSupportFactory<T,
+                                                         RealT>::is_defined) {
             return heevd_batch_impl<T, RealT>;
         }
         else {
diff --git a/dpnp/backend/extensions/lapack/linalg_exceptions.hpp b/dpnp/backend/extensions/lapack/linalg_exceptions.hpp
index d087adfbd2b6..c823d1995a4e 100644
--- a/dpnp/backend/extensions/lapack/linalg_exceptions.hpp
+++ b/dpnp/backend/extensions/lapack/linalg_exceptions.hpp
@@ -37,10 +37,7 @@ class LinAlgError : public std::exception
 public:
     explicit LinAlgError(const char *message) : msg_(message) {}
 
-    const char *what() const noexcept override
-    {
-        return msg_.c_str();
-    }
+    const char *what() const noexcept override { return msg_.c_str(); }
 
 private:
     std::string msg_;
diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp
index ef1c85b91f4a..a29fe9b342fc 100644
--- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp
+++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp
@@ -100,15 +100,15 @@ static sycl::event orgqr_batch_impl(sycl::queue &exec_q,
 
         orgqr_batch_event = mkl_lapack::orgqr_batch(
             exec_q,
-            m, // The number of rows in each matrix in the batch; (0 ≤ m).
-               // It must be a non-negative integer.
-            n, // The number of columns in each matrix in the batch; (0 ≤ n).
-               // It must be a non-negative integer.
-            k, // The number of elementary reflectors
-               // whose product defines the matrices Qi; (0 ≤ k ≤ n).
-            a, // Pointer to the batch of matrices, each of size (m x n).
-            lda,      // The leading dimension of each matrix in the batch.
-                      // For row major layout, lda ≥ max(1, m).
+            m,   // The number of rows in each matrix in the batch; (0 ≤ m).
+                 // It must be a non-negative integer.
+            n,   // The number of columns in each matrix in the batch; (0 ≤ n).
+                 // It must be a non-negative integer.
+            k,   // The number of elementary reflectors
+                 // whose product defines the matrices Qi; (0 ≤ k ≤ n).
+            a,   // Pointer to the batch of matrices, each of size (m x n).
+            lda, // The leading dimension of each matrix in the batch.
+                 // For row major layout, lda ≥ max(1, m).
             stride_a, // Stride between consecutive matrices in the batch.
             tau, // Pointer to the array of scalar factors of the elementary
                  // reflectors for each matrix in the batch.
diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp
index af69cf9e6b7e..3ecd386299ac 100644
--- a/dpnp/backend/extensions/lapack/syevd.cpp
+++ b/dpnp/backend/extensions/lapack/syevd.cpp
@@ -124,8 +124,8 @@ struct SyevdContigFactory
 {
     fnT get()
     {
-        if constexpr (types::SyevdTypePairSupportFactory<T, RealT>::is_defined)
-        {
+        if constexpr (types::SyevdTypePairSupportFactory<T,
+                                                         RealT>::is_defined) {
             return syevd_impl<T, RealT>;
         }
         else {
diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp
index 0c326e5d79bb..13237d27a35c 100644
--- a/dpnp/backend/extensions/lapack/syevd_batch.cpp
+++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp
@@ -161,8 +161,8 @@ struct SyevdBatchContigFactory
 {
     fnT get()
     {
-        if constexpr (types::SyevdTypePairSupportFactory<T, RealT>::is_defined)
-        {
+        if constexpr (types::SyevdTypePairSupportFactory<T,
+                                                         RealT>::is_defined) {
             return syevd_batch_impl<T, RealT>;
         }
         else {
diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp b/dpnp/backend/extensions/lapack/ungqr_batch.cpp
index 7c890d968b0a..04de27cb257c 100644
--- a/dpnp/backend/extensions/lapack/ungqr_batch.cpp
+++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp
@@ -100,15 +100,15 @@ static sycl::event ungqr_batch_impl(sycl::queue &exec_q,
 
         ungqr_batch_event = mkl_lapack::ungqr_batch(
             exec_q,
-            m, // The number of rows in each matrix in the batch; (0 ≤ m).
-               // It must be a non-negative integer.
-            n, // The number of columns in each matrix in the batch; (0 ≤ n).
-               // It must be a non-negative integer.
-            k, // The number of elementary reflectors
-               // whose product defines the matrices Qi; (0 ≤ k ≤ n).
-            a, // Pointer to the batch of matrices, each of size (m x n).
-            lda,      // The leading dimension of each matrix in the batch.
-                      // For row major layout, lda ≥ max(1, m).
+            m,   // The number of rows in each matrix in the batch; (0 ≤ m).
+                 // It must be a non-negative integer.
+            n,   // The number of columns in each matrix in the batch; (0 ≤ n).
+                 // It must be a non-negative integer.
+            k,   // The number of elementary reflectors
+                 // whose product defines the matrices Qi; (0 ≤ k ≤ n).
+            a,   // Pointer to the batch of matrices, each of size (m x n).
+            lda, // The leading dimension of each matrix in the batch.
+                 // For row major layout, lda ≥ max(1, m).
             stride_a, // Stride between consecutive matrices in the batch.
             tau, // Pointer to the array of scalar factors of the elementary
                  // reflectors for each matrix in the batch.
diff --git a/dpnp/backend/extensions/statistics/bincount.cpp b/dpnp/backend/extensions/statistics/bincount.cpp
index ba258cd55447..9bfe5c2a2449 100644
--- a/dpnp/backend/extensions/statistics/bincount.cpp
+++ b/dpnp/backend/extensions/statistics/bincount.cpp
@@ -59,10 +59,7 @@ struct BincountEdges
     {
     }
 
-    boundsT get_bounds() const
-    {
-        return {min, max};
-    }
+    boundsT get_bounds() const { return {min, max}; }
 
     template <int _Dims, typename dT>
     size_t get_bin(const sycl::nd_item<_Dims> &,
diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp
index 539b42475fbf..02fc66f26610 100644
--- a/dpnp/backend/extensions/statistics/histogram_common.hpp
+++ b/dpnp/backend/extensions/statistics/histogram_common.hpp
@@ -64,10 +64,7 @@ struct CachedData
         local_data = LocalData(shape, cgh);
     }
 
-    T *get_ptr() const
-    {
-        return &local_data[0];
-    }
+    T *get_ptr() const { return &local_data[0]; }
 
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &item) const
@@ -83,15 +80,9 @@ struct CachedData
         }
     }
 
-    size_t size() const
-    {
-        return local_data.size();
-    }
+    size_t size() const { return local_data.size(); }
 
-    T &operator[](const sycl::id<Dims> &id) const
-    {
-        return local_data[id];
-    }
+    T &operator[](const sycl::id<Dims> &id) const { return local_data[id]; }
 
     template <typename = std::enable_if_t<Dims == 1>>
     T &operator[](const size_t id) const
@@ -119,25 +110,16 @@ struct UncachedData
         _shape = shape;
     }
 
-    T *get_ptr() const
-    {
-        return global_data;
-    }
+    T *get_ptr() const { return global_data; }
 
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &) const
     {
     }
 
-    size_t size() const
-    {
-        return _shape.size();
-    }
+    size_t size() const { return _shape.size(); }
 
-    T &operator[](const sycl::id<Dims> &id) const
-    {
-        return global_data[id];
-    }
+    T &operator[](const sycl::id<Dims> &id) const { return global_data[id]; }
 
     template <typename = std::enable_if_t<Dims == 1>>
     T &operator[](const size_t id) const
@@ -235,10 +217,7 @@ struct HistWithLocalCopies
         }
     }
 
-    uint32_t size() const
-    {
-        return local_hist.size();
-    }
+    uint32_t size() const { return local_hist.size(); }
 
 private:
     LocalHist local_hist;
@@ -251,10 +230,7 @@ struct HistGlobalMemory
     static constexpr bool const sync_after_init = false;
     static constexpr bool const sync_before_finalize = false;
 
-    HistGlobalMemory(T *global_data)
-    {
-        global_hist = global_data;
-    }
+    HistGlobalMemory(T *global_data) { global_hist = global_data; }
 
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &) const
@@ -280,24 +256,15 @@ struct HistGlobalMemory
 template <typename T = uint32_t>
 struct NoWeights
 {
-    constexpr T get(size_t) const
-    {
-        return 1;
-    }
+    constexpr T get(size_t) const { return 1; }
 };
 
 template <typename T>
 struct Weights
 {
-    Weights(T *weights)
-    {
-        data = weights;
-    }
+    Weights(T *weights) { data = weights; }
 
-    T get(size_t id) const
-    {
-        return data[id];
-    }
+    T get(size_t id) const { return data[id]; }
 
 private:
     T *data = nullptr;
diff --git a/dpnp/backend/extensions/statistics/histogramdd.cpp b/dpnp/backend/extensions/statistics/histogramdd.cpp
index a5ed4a8c7d1c..bd2177073333 100644
--- a/dpnp/backend/extensions/statistics/histogramdd.cpp
+++ b/dpnp/backend/extensions/statistics/histogramdd.cpp
@@ -90,10 +90,7 @@ struct EdgesDd
         }
     }
 
-    boundsT get_bounds() const
-    {
-        return {&min[0], &max[0]};
-    }
+    boundsT get_bounds() const { return {&min[0], &max[0]}; }
 
     auto get_bin_for_dim(const EdgesT &val,
                          const EdgesT *edges_data,
diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
index c5a5bac111dd..f33a23609666 100644
--- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp
+++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
@@ -129,30 +129,15 @@ class _RegistryDataStorage
         return sycl::shift_group_right(sbgroup, data[y], x);
     }
 
-    constexpr SizeT size_y() const
-    {
-        return _size;
-    }
+    constexpr SizeT size_y() const { return _size; }
 
-    SizeT size_x() const
-    {
-        return sbgroup.get_max_local_range()[0];
-    }
+    SizeT size_x() const { return sbgroup.get_max_local_range()[0]; }
 
-    SizeT total_size() const
-    {
-        return size_x() * size_y();
-    }
+    SizeT total_size() const { return size_x() * size_y(); }
 
-    ncT *ptr()
-    {
-        return data;
-    }
+    ncT *ptr() { return data; }
 
-    SizeT x() const
-    {
-        return sbgroup.get_local_linear_id();
-    }
+    SizeT x() const { return sbgroup.get_local_linear_id(); }
 
 protected:
     const sycl::sub_group sbgroup;
@@ -277,8 +262,7 @@ struct RegistryData : public _RegistryDataStorage<T, Size>
 
     T *load(const T *const data, const bool &mask, const T &default_v)
     {
-        return load(
-            data, [mask](auto &&) { return mask; }, default_v);
+        return load(data, [mask](auto &&) { return mask; }, default_v);
     }
 
     T *load(const T *const data)
@@ -349,10 +333,7 @@ struct RegistryData : public _RegistryDataStorage<T, Size>
         return store(data, [mask](auto &&) { return mask; });
     }
 
-    T *store(T *const data)
-    {
-        return store(data, true);
-    }
+    T *store(T *const data) { return store(data, true); }
 };
 
 template <typename T, uint32_t Size>
@@ -379,10 +360,7 @@ struct RegistryWindow : public RegistryData<T, Size>
         }
     }
 
-    void advance_left(const T &fill_value)
-    {
-        advance_left(1, fill_value);
-    }
+    void advance_left(const T &fill_value) { advance_left(1, fill_value); }
 
     void advance_left()
     {
@@ -400,25 +378,13 @@ class Span
 
     Span(T *const data, const SizeT size) : data_(data), size_(size) {}
 
-    T *begin() const
-    {
-        return data();
-    }
+    T *begin() const { return data(); }
 
-    T *end() const
-    {
-        return data() + size();
-    }
+    T *end() const { return data() + size(); }
 
-    SizeT size() const
-    {
-        return size_;
-    }
+    SizeT size() const { return size_; }
 
-    T *data() const
-    {
-        return data_;
-    }
+    T *data() const { return data_; }
 
 protected:
     T *const data_;
@@ -443,15 +409,9 @@ class PaddedSpan : public Span<T, SizeT>
     {
     }
 
-    T *padded_begin() const
-    {
-        return this->begin() - pad();
-    }
+    T *padded_begin() const { return this->begin() - pad(); }
 
-    SizeT pad() const
-    {
-        return pad_;
-    }
+    SizeT pad() const { return pad_; }
 
 protected:
     const SizeT pad_;
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp
index 5254e50d3faf..6f10e651fe25 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp
@@ -184,8 +184,7 @@ using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
     };
 
 template <template <typename fnT, typename T> typename contigFactoryT,
-          template <typename fnT, typename T>
-          typename stridedFactoryT>
+          template <typename fnT, typename T> typename stridedFactoryT>
 static void populate(py::module_ m,
                      const char *name,
                      const char *docstring,
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
index 33c7ab19b9ab..fca8c43f816e 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
@@ -237,8 +237,8 @@ struct InterpolateFactory
     fnT get()
     {
         if constexpr (std::is_same_v<
-                          typename InterpolateOutputType<T>::value_type, void>)
-        {
+                          typename InterpolateOutputType<T>::value_type,
+                          void>) {
             return nullptr;
         }
         else {
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/populate.hpp b/dpnp/backend/extensions/ufunc/elementwise_functions/populate.hpp
index f0c630562aae..2971c3eb4aca 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/populate.hpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/populate.hpp
@@ -158,8 +158,7 @@ namespace ext_ns = ext::common;
             if constexpr (std::is_same_v<typename OutputType<T>::value_type1,  \
                                          void> ||                              \
                           std::is_same_v<typename OutputType<T>::value_type2,  \
-                                         void>)                                \
-            {                                                                  \
+                                         void>) {                              \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
             }                                                                  \
@@ -210,8 +209,7 @@ namespace ext_ns = ext::common;
             if constexpr (std::is_same_v<typename OutputType<T>::value_type1,  \
                                          void> ||                              \
                           std::is_same_v<typename OutputType<T>::value_type2,  \
-                                         void>)                                \
-            {                                                                  \
+                                         void>) {                              \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
             }                                                                  \
@@ -263,8 +261,8 @@ namespace ext_ns = ext::common;
         fnT get()                                                              \
         {                                                                      \
             if constexpr (std::is_same_v<                                      \
-                              typename OutputType<T1, T2>::value_type, void>)  \
-            {                                                                  \
+                              typename OutputType<T1, T2>::value_type,         \
+                              void>) {                                         \
                                                                                \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
@@ -312,8 +310,8 @@ namespace ext_ns = ext::common;
         fnT get()                                                              \
         {                                                                      \
             if constexpr (std::is_same_v<                                      \
-                              typename OutputType<T1, T2>::value_type, void>)  \
-            {                                                                  \
+                              typename OutputType<T1, T2>::value_type,         \
+                              void>) {                                         \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
             }                                                                  \
@@ -368,8 +366,8 @@ namespace ext_ns = ext::common;
                               typename OutputType<T1, T2>::value_type1,        \
                               void> ||                                         \
                           std::is_same_v<                                      \
-                              typename OutputType<T1, T2>::value_type2, void>) \
-            {                                                                  \
+                              typename OutputType<T1, T2>::value_type2,        \
+                              void>) {                                         \
                                                                                \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
@@ -425,8 +423,8 @@ namespace ext_ns = ext::common;
                               typename OutputType<T1, T2>::value_type1,        \
                               void> ||                                         \
                           std::is_same_v<                                      \
-                              typename OutputType<T1, T2>::value_type2, void>) \
-            {                                                                  \
+                              typename OutputType<T1, T2>::value_type2,        \
+                              void>) {                                         \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
             }                                                                  \
diff --git a/dpnp/backend/extensions/ufunc/ufunc_py.cpp b/dpnp/backend/extensions/ufunc/ufunc_py.cpp
index 516d7187d479..7180a4c44be4 100644
--- a/dpnp/backend/extensions/ufunc/ufunc_py.cpp
+++ b/dpnp/backend/extensions/ufunc/ufunc_py.cpp
@@ -32,7 +32,4 @@
 
 namespace ufunc_ns = dpnp::extensions::ufunc;
 
-PYBIND11_MODULE(_ufunc_impl, m)
-{
-    ufunc_ns::init_elementwise_functions(m);
-}
+PYBIND11_MODULE(_ufunc_impl, m) { ufunc_ns::init_elementwise_functions(m); }
diff --git a/dpnp/backend/extensions/vm/common.hpp b/dpnp/backend/extensions/vm/common.hpp
index 6ee73504ce96..325aba7fafd2 100644
--- a/dpnp/backend/extensions/vm/common.hpp
+++ b/dpnp/backend/extensions/vm/common.hpp
@@ -181,8 +181,7 @@ bool need_to_call_unary_two_outputs_ufunc(
 
     // check that types are supported
     if (dst1_typeid != func_output_typeids.first ||
-        dst2_typeid != func_output_typeids.second)
-    {
+        dst2_typeid != func_output_typeids.second) {
         return false;
     }
 
@@ -425,8 +424,7 @@ bool need_to_call_binary_ufunc(sycl::queue &exec_q,
             if constexpr (std::is_same_v<typename OutputType<T>::value_type1,  \
                                          void> ||                              \
                           std::is_same_v<typename OutputType<T>::value_type2,  \
-                                         void>)                                \
-            {                                                                  \
+                                         void>) {                              \
                 fnT fn = nullptr;                                              \
                 return fn;                                                     \
             }                                                                  \
@@ -471,8 +469,8 @@ bool need_to_call_binary_ufunc(sycl::queue &exec_q,
         fnT get()                                                              \
         {                                                                      \
             if constexpr (std::is_same_v<                                      \
-                              typename OutputType<T1, T2>::value_type, void>)  \
-            {                                                                  \
+                              typename OutputType<T1, T2>::value_type,         \
+                              void>) {                                         \
                 return nullptr;                                                \
             }                                                                  \
             else {                                                             \
diff --git a/dpnp/backend/kernels/dpnp_krnl_random.cpp b/dpnp/backend/kernels/dpnp_krnl_random.cpp
index be78704c9ccf..faef700a0407 100644
--- a/dpnp/backend/kernels/dpnp_krnl_random.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_random.cpp
@@ -1077,8 +1077,7 @@ DPCTLSyclEventRef
         // input parameters which follow the condition
         if (is_cpu_queue ||
             (!is_cpu_queue && (p_size >= ((size_t)ntrial * 16)) &&
-             (ntrial <= 16)))
-        {
+             (ntrial <= 16))) {
             DPNPC_ptr_adapter<_DataType> result_ptr(q_ref, result, size, true,
                                                     true);
             _DataType *result1 = result_ptr.get_ptr();
@@ -1399,8 +1398,7 @@ DPCTLSyclEventRef
                 size_t j;
                 int cv = pvec[idx[i]];
                 // TODO vectorize
-                for (j = i + 1; (j < size) && (pvec[idx[j]] == cv); j++) {
-                }
+                for (j = i + 1; (j < size) && (pvec[idx[j]] == cv); j++) {}
 
                 if (j <= i) {
                     throw std::runtime_error(
diff --git a/dpnp/backend/kernels/elementwise_functions/degrees.hpp b/dpnp/backend/kernels/elementwise_functions/degrees.hpp
index 73d2566e9546..8493a1821a6a 100644
--- a/dpnp/backend/kernels/elementwise_functions/degrees.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/degrees.hpp
@@ -44,10 +44,7 @@ struct DegreesFunctor
     // do both argT and resT support subgroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &x) const
-    {
-        return sycl::degrees(x);
-    }
+    resT operator()(const argT &x) const { return sycl::degrees(x); }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &x) const
diff --git a/dpnp/backend/kernels/elementwise_functions/divmod.hpp b/dpnp/backend/kernels/elementwise_functions/divmod.hpp
index 35ea4a9fccc5..37a79cffb7f2 100644
--- a/dpnp/backend/kernels/elementwise_functions/divmod.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/divmod.hpp
@@ -115,9 +115,6 @@ struct DivmodFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 } // namespace dpnp::kernels::divmod
diff --git a/dpnp/backend/kernels/elementwise_functions/fabs.hpp b/dpnp/backend/kernels/elementwise_functions/fabs.hpp
index 7c5ed96f226e..13c5e81898b7 100644
--- a/dpnp/backend/kernels/elementwise_functions/fabs.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/fabs.hpp
@@ -44,9 +44,6 @@ struct FabsFunctor
     // do both argT and resT support subgroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &x) const
-    {
-        return sycl::fabs(x);
-    }
+    resT operator()(const argT &x) const { return sycl::fabs(x); }
 };
 } // namespace dpnp::kernels::fabs
diff --git a/dpnp/backend/kernels/elementwise_functions/fmax.hpp b/dpnp/backend/kernels/elementwise_functions/fmax.hpp
index d28e7ba17b8d..ac5f81dbc698 100644
--- a/dpnp/backend/kernels/elementwise_functions/fmax.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/fmax.hpp
@@ -55,8 +55,7 @@ struct FmaxFunctor
             return in1 >= in2 ? in1 : in2;
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
 
             using realT = typename argT1::value_type;
@@ -64,8 +63,7 @@ struct FmaxFunctor
             const realT in2i = std::imag(in2);
 
             if (sycl::isnan(in2r) || sycl::isnan(in2i) ||
-                mu_ns::greater_equal_complex<argT1>(in1, in2))
-            {
+                mu_ns::greater_equal_complex<argT1>(in1, in2)) {
                 return in1;
             }
             return in2;
diff --git a/dpnp/backend/kernels/elementwise_functions/fmin.hpp b/dpnp/backend/kernels/elementwise_functions/fmin.hpp
index 340dfc8ec545..0cbc0385ce69 100644
--- a/dpnp/backend/kernels/elementwise_functions/fmin.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/fmin.hpp
@@ -55,8 +55,7 @@ struct FminFunctor
             return in1 <= in2 ? in1 : in2;
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
 
             using realT = typename argT1::value_type;
@@ -64,8 +63,7 @@ struct FminFunctor
             const realT in2i = std::imag(in2);
 
             if (sycl::isnan(in2r) || sycl::isnan(in2i) ||
-                mu_ns::less_equal_complex<argT1>(in1, in2))
-            {
+                mu_ns::less_equal_complex<argT1>(in1, in2)) {
                 return in1;
             }
             return in2;
diff --git a/dpnp/backend/kernels/elementwise_functions/isclose.hpp b/dpnp/backend/kernels/elementwise_functions/isclose.hpp
index 5086797435b1..179ad1ad8d2a 100644
--- a/dpnp/backend/kernels/elementwise_functions/isclose.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/isclose.hpp
@@ -81,8 +81,7 @@ inline bool isclose(const std::complex<T> a,
     }
 
     if (sycl::isnan(a.real()) && sycl::isnan(a.imag()) &&
-        sycl::isnan(b.real()) && sycl::isnan(b.imag()))
-    {
+        sycl::isnan(b.real()) && sycl::isnan(b.imag())) {
         return equal_nan;
     }
 
@@ -311,8 +310,7 @@ sycl::event
         using dpctl::tensor::kernels::alignment_utils::required_alignment;
         if (is_aligned<required_alignment>(a_tp) &&
             is_aligned<required_alignment>(b_tp) &&
-            is_aligned<required_alignment>(out_tp))
-        {
+            is_aligned<required_alignment>(out_tp)) {
             constexpr bool enable_sg_loadstore = true;
             using IsCloseFunc =
                 IsCloseContigScalarFunctor<T, scT, resTy, vec_sz, n_vecs,
diff --git a/dpnp/backend/kernels/elementwise_functions/nan_to_num.hpp b/dpnp/backend/kernels/elementwise_functions/nan_to_num.hpp
index e33ede58ac41..07c55feaf944 100644
--- a/dpnp/backend/kernels/elementwise_functions/nan_to_num.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/nan_to_num.hpp
@@ -261,8 +261,7 @@ sycl::event nan_to_num_contig_impl(sycl::queue &exec_q,
         using dpctl::tensor::kernels::alignment_utils::is_aligned;
         using dpctl::tensor::kernels::alignment_utils::required_alignment;
         if (is_aligned<required_alignment>(in_tp) &&
-            is_aligned<required_alignment>(out_tp))
-        {
+            is_aligned<required_alignment>(out_tp)) {
             constexpr bool enable_sg_loadstore = true;
             using NanToNumFunc = NanToNumContigFunctor<T, scT, vec_sz, n_vecs,
                                                        enable_sg_loadstore>;
diff --git a/dpnp/backend/kernels/elementwise_functions/radians.hpp b/dpnp/backend/kernels/elementwise_functions/radians.hpp
index ae598f3089d1..cb676249a6ac 100644
--- a/dpnp/backend/kernels/elementwise_functions/radians.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/radians.hpp
@@ -44,10 +44,7 @@ struct RadiansFunctor
     // do both argT and resT support subgroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &x) const
-    {
-        return sycl::radians(x);
-    }
+    resT operator()(const argT &x) const { return sycl::radians(x); }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &x) const
diff --git a/dpnp/backend/src/dpnp_fptr.hpp b/dpnp/backend/src/dpnp_fptr.hpp
index 9fd2dedb4a2d..15d6b7ab20ee 100644
--- a/dpnp/backend/src/dpnp_fptr.hpp
+++ b/dpnp/backend/src/dpnp_fptr.hpp
@@ -128,8 +128,7 @@ class dpnp_less_comp
     {
         if constexpr (both_types_are_same<
                           dpnp_remove_cvref_t<_Xp>, dpnp_remove_cvref_t<_Yp>,
-                          std::complex<float>, std::complex<double>>)
-        {
+                          std::complex<float>, std::complex<double>>) {
             bool ret = false;
             _Xp a = std::forward<_Xp>(__x);
             _Yp b = std::forward<_Yp>(__y);
diff --git a/dpnp/backend/src/queue_sycl.cpp b/dpnp/backend/src/queue_sycl.cpp
index b768ae21fafe..715193959cf1 100644
--- a/dpnp/backend/src/queue_sycl.cpp
+++ b/dpnp/backend/src/queue_sycl.cpp
@@ -87,8 +87,7 @@
 
     std::cout << "Available SYCL devices:" << std::endl;
     for (std::vector<sycl::device>::const_iterator it = devices.cbegin();
-         it != devices.cend(); ++it)
-    {
+         it != devices.cend(); ++it) {
         std::cout
             // not yet implemented error << " " <<
             // it->has(sycl::aspect::usm_shared_allocations)  << " "
diff --git a/dpnp/backend/src/queue_sycl.hpp b/dpnp/backend/src/queue_sycl.hpp
index 7aae5d00374e..6100a03c872a 100644
--- a/dpnp/backend/src/queue_sycl.hpp
+++ b/dpnp/backend/src/queue_sycl.hpp
@@ -30,10 +30,10 @@
 #ifndef QUEUE_SYCL_H // Cython compatibility
 #define QUEUE_SYCL_H
 
-//#pragma clang diagnostic push
-//#pragma clang diagnostic ignored "-Wpass-failed"
+// #pragma clang diagnostic push
+// #pragma clang diagnostic ignored "-Wpass-failed"
 #include <sycl/sycl.hpp>
-//#pragma clang diagnostic pop
+// #pragma clang diagnostic pop
 
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunused-parameter"
@@ -115,8 +115,8 @@ class backend_sycl
                      ? sycl::property_list{sycl::property::queue::
                                                enable_profiling()}
                      : sycl::property_list{}},
-          rng_mt19937_engine_{queue_, default_seed}, rng_mcg59_engine_{
-                                                         queue_, default_seed}
+          rng_mt19937_engine_{queue_, default_seed},
+          rng_mcg59_engine_{queue_, default_seed}
     {
     }
 
diff --git a/dpnp/backend/tests/test_random.cpp b/dpnp/backend/tests/test_random.cpp
index 5b7999724174..bda2658e1de4 100644
--- a/dpnp/backend/tests/test_random.cpp
+++ b/dpnp/backend/tests/test_random.cpp
@@ -43,10 +43,7 @@
 class RandomTestCase : public ::testing::Test
 {
 public:
-    static void SetUpTestCase()
-    {
-        _get_device_mem();
-    }
+    static void SetUpTestCase() { _get_device_mem(); }
 
     static void TearDownTestCase()
     {
diff --git a/pyproject.toml b/pyproject.toml
index d659428877fc..78ebe9d9aa66 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -163,6 +163,10 @@ allow-wildcard-with-all = true
 [tool.pylint.miscellaneous]
 notes = ["FIXME", "XXX"]
 
+[tool.pylint.typecheck]
+extension-pkg-allow-list = ["numpy"]
+generated-members = ["numpy.*", "finfo.*", "iinfo.*"]
+
 [tool.versioneer]
 VCS = "git"
 parentdir_prefix = "dpnp-"

From 54d2109bb474483708d5a7be7a3955cd42377e10 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 18 Mar 2026 23:45:12 +0100
Subject: [PATCH 18/35] Resolve an issue with `.data.ptr` ignoring USM offset
 on array views (#2812)

The `create_data()` function had an early return when `usm_data` was
already a dpnp memory class instance, which skipped setting the `ptr`
attribute. This caused all array views to report the same base pointer,
making `arr[0].data.ptr == arr[1].data.ptr` even though they point to
different memory locations.

This PR fixes that by ensuring `ptr` is always set to `x._pointer`,
which points to the start of the array's data (including any offset for
views), rather than to `usm_data._pointer`, which points to the base buffer.

Additionally, a new memory wrapper instance is now always created to avoid
shared state when the same `usm_data` is used for multiple views.

This PR closes #2781.

- [x] Have you provided a meaningful PR description?
- [x] Have you added a test, reproducer or referred to an issue with a
reproducer?
- [x] Have you tested your changes locally for CPU and GPU devices?
- [x] Have you made sure that new changes do not introduce compiler
warnings?
- [ ] Have you checked performance impact of proposed changes?
- [ ] Have you added documentation for your changes, if necessary?
- [x] Have you added your changes to the changelog?
---
 CHANGELOG.md              |  1 +
 dpnp/memory/_memory.py    | 25 ++++++-----
 dpnp/tests/test_memory.py | 94 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61cde1ddfefc..b6f419f215c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -76,6 +76,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Resolved an issue causing `dpnp.linspace` to return an incorrect output shape when inputs were passed as arrays [#2712](https://github.com/IntelPython/dpnp/pull/2712)
 * Resolved an issue where `dpnp` always returns the base allocation pointer, when the view start is expected [#2651](https://github.com/IntelPython/dpnp/pull/2651)
 * Fixed an issue causing an exception in `dpnp.geomspace` and `dpnp.logspace` when called with explicit `device` keyword but any input array is allocated on another device [#2723](https://github.com/IntelPython/dpnp/pull/2723)
+* Fixed `.data.ptr` property on array views to correctly return the pointer to the view's data location instead of the base allocation pointer [#2812](https://github.com/IntelPython/dpnp/pull/2812)
 
 ### Security
 
diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py
index f978c5e50db2..70d93c04d6a5 100644
--- a/dpnp/memory/_memory.py
+++ b/dpnp/memory/_memory.py
@@ -98,13 +98,18 @@ def create_data(x):
     usm_data = x.usm_data
 
     if isinstance(usm_data, tuple(dispatch.values())):
-        return usm_data
-
-    cls = dispatch.get(type(usm_data), None)
-    if cls:
-        data = cls(usm_data)
-        # `ptr` is expecting to point at the start of the array's data,
-        # while `usm_data._pointer` is a pointer at the start of memory buffer
-        data.ptr = x._pointer
-        return data
-    raise TypeError(f"Expected USM memory, but got {type(usm_data)}")
+        # usm_data is already an instance of MemoryUSM<type> class
+        cls = usm_data.__class__
+    elif (cls := dispatch.get(type(usm_data))) is not None:
+        pass  # cls is set
+    else:
+        raise TypeError(f"Expected USM memory, but got {type(usm_data)}")
+
+    # create a new instance each time since usm_data might be a view
+    # of another array
+    data = cls(usm_data)
+
+    # `ptr` is expecting to point at the start of the array's data,
+    # while `usm_data._pointer` is a pointer at the start of memory buffer
+    data.ptr = x._pointer
+    return data
diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py
index 1bc0da8c1535..6a3d6ac5afae 100644
--- a/dpnp/tests/test_memory.py
+++ b/dpnp/tests/test_memory.py
@@ -27,6 +27,41 @@ def test_wrong_usm_data(self):
         with pytest.raises(TypeError):
             dpm.create_data(d)
 
+    def test_dpctl_view(self):
+        a = dpt.arange(10)
+        view = a[3:]
+
+        data = dpm.create_data(view)
+        assert data.ptr == view._pointer
+
+    def test_dpctl_different_views(self):
+        a = dpt.reshape(dpt.arange(12), (3, 4))
+
+        data0 = dpm.create_data(a[0])
+        data1 = dpm.create_data(a[1])
+
+        # Verify independent wrapper objects
+        assert data0 is not data1
+
+        # Verify correct pointers
+        assert data0.ptr == a[0]._pointer
+        assert data1.ptr == a[1]._pointer
+        assert data0.ptr != data1.ptr
+
+    def test_repeated_calls(self):
+        a = dpt.arange(20)
+        view = a[5:15]
+
+        # Multiple calls should return independent objects with same ptr
+        data1 = dpm.create_data(view)
+        data2 = dpm.create_data(view)
+
+        assert data1 is not data2, "Should create independent wrapper objects"
+        assert data1.ptr == data2.ptr, "Both should point to same location"
+        assert data1.ptr == view._pointer
+
+
+class TestNdarray:
     def test_ndarray_from_data(self):
         a = dpnp.empty(5)
         b = dpnp.ndarray(a.shape, buffer=a.data)
@@ -42,3 +77,62 @@ def test_view_non_zero_offset(self):
         pl = dpnp.ndarray((n, m), dtype=a.dtype, buffer=sl)
         assert pl.data.ptr == sl.data.ptr
         assert a.data.ptr != sl.data.ptr
+
+    def test_slices_2d(self):
+        # Create 2D array and verify slices have different pointers
+        a = dpnp.arange(12, dtype=dpnp.float32).reshape(3, 4)
+
+        # Each row should have a different pointer
+        row0_ptr = a[0].data.ptr
+        row1_ptr = a[1].data.ptr
+        row2_ptr = a[2].data.ptr
+
+        assert (
+            row0_ptr != row1_ptr
+        ), "a[0] and a[1] should have different pointers"
+        assert (
+            row1_ptr != row2_ptr
+        ), "a[1] and a[2] should have different pointers"
+
+        # Check byte offsets match expected stride
+        stride = a.strides[0]  # stride between rows in bytes
+        assert row1_ptr - row0_ptr == stride
+        assert row2_ptr - row1_ptr == stride
+
+    def test_slices_multidimensional(self):
+        # 3D array
+        a = dpnp.zeros((5, 10, 20), dtype=dpnp.int32)
+
+        # Different slices along first axis should have different pointers
+        slice0_ptr = a[0].data.ptr
+        slice1_ptr = a[1].data.ptr
+
+        assert slice0_ptr != slice1_ptr
+        assert slice1_ptr - slice0_ptr == a.strides[0]
+
+    def test_repeated_access(self):
+        a = dpnp.arange(20).reshape(4, 5)
+
+        # Multiple accesses to same slice should give same ptr value
+        ptr1 = a[2].data.ptr
+        ptr2 = a[2].data.ptr
+
+        assert ptr1 == ptr2, "Same slice should have consistent ptr value"
+
+        # But different slices should have different ptrs
+        assert a[0].data.ptr != a[2].data.ptr
+
+    def test_array_on_view_with_slicing(self):
+        # Original array
+        a = dpnp.arange(24, dtype=dpnp.float32).reshape(6, 4)
+
+        # Create view using slicing
+        view = a[2:5]
+
+        # Construct new array from view
+        new_arr = dpnp.ndarray(view.shape, dtype=view.dtype, buffer=view)
+
+        # Pointers should match
+        assert new_arr.data.ptr == view.data.ptr
+        # And should be different from base array
+        assert new_arr.data.ptr != a.data.ptr

From db486b9be5994030f99e8af5d541790a1252c902 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Thu, 19 Mar 2026 10:17:37 +0100
Subject: [PATCH 19/35] Fix `diagonal()` strides calculation for empty results
 (#2814)

Previously `dpnp.diagonal()` returned incorrect strides when the
diagonal was empty (e.g., when offset >= array width). The stride was
set to itemsize instead of the correct `st_n + st_m`.

This PR:
* fixes stride calculation to consistently use (st_n + st_m)
* simplifies the implementation by using a unified formula to calculate the
resulting shape, strides, and offset
* extends test coverage to include scenarios with empty diagonals,
views, and non-contiguous arrays

The PR closes #2761.
---
 CHANGELOG.md                |  1 +
 dpnp/dpnp_iface_indexing.py | 28 ++++++---------
 dpnp/tests/test_indexing.py | 69 ++++++++++++++++++++++++++++++++++---
 3 files changed, 77 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6f419f215c2..ad71d5692e2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -77,6 +77,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Resolved an issue where `dpnp` always returns the base allocation pointer, when the view start is expected [#2651](https://github.com/IntelPython/dpnp/pull/2651)
 * Fixed an issue causing an exception in `dpnp.geomspace` and `dpnp.logspace` when called with explicit `device` keyword but any input array is allocated on another device [#2723](https://github.com/IntelPython/dpnp/pull/2723)
 * Fixed `.data.ptr` property on array views to correctly return the pointer to the view's data location instead of the base allocation pointer [#2812](https://github.com/IntelPython/dpnp/pull/2812)
+* Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814)
 
 ### Security
 
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index db70f1fd2384..2a90f6cff637 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -719,24 +719,18 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
         offset = -offset
 
     a_shape = a.shape
-    a_straides = a.strides
+    a_strides = a.strides
     n, m = a_shape[-2:]
-    st_n, st_m = a_straides[-2:]
-
-    # Compute shape, strides and offset of the resulting diagonal array
-    # based on the input offset
-    if offset == 0:
-        out_shape = a_shape[:-2] + (min(n, m),)
-        out_strides = a_straides[:-2] + (st_n + st_m,)
-        out_offset = 0
-    elif 0 < offset < m:
-        out_shape = a_shape[:-2] + (min(n, m - offset),)
-        out_strides = a_straides[:-2] + (st_n + st_m,)
-        out_offset = st_m // a.itemsize * offset
-    else:
-        out_shape = a_shape[:-2] + (0,)
-        out_strides = a_straides[:-2] + (a.itemsize,)
-        out_offset = 0
+    st_n, st_m = a_strides[-2:]
+
+    # Compute the diagonal array as a view:
+    # - stride: sum of row and column strides (diag advances in both dimensions)
+    # - shape: determined by diagonal size using max(0, min(n, m - offset))
+    # - offset: starting position in buffer for non-zero offsets
+    diag_size = max(0, min(n, m - offset))
+    out_shape = a_shape[:-2] + (diag_size,)
+    out_strides = a_strides[:-2] + (st_n + st_m,)
+    out_offset = st_m // a.itemsize * offset
 
     return dpnp_array(
         out_shape, buffer=a, strides=out_strides, offset=out_offset
diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py
index b6cae0733d40..27f34f6288b3 100644
--- a/dpnp/tests/test_indexing.py
+++ b/dpnp/tests/test_indexing.py
@@ -18,6 +18,7 @@
 from dpnp.exceptions import AxisError, ExecutionPlacementError
 
 from .helper import (
+    generate_random_numpy_array,
     get_abs_array,
     get_all_dtypes,
     get_array,
@@ -44,7 +45,9 @@ def wrapped(a, axis, **kwargs):
 
 
 class TestDiagonal:
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
     @pytest.mark.parametrize("offset", [-3, -1, 0, 1, 3])
     @pytest.mark.parametrize(
         "shape",
@@ -58,7 +61,7 @@ class TestDiagonal:
             "(2, 2, 2, 3)",
         ],
     )
-    def test_diagonal_offset(self, shape, dtype, offset):
+    def test_offset(self, shape, dtype, offset):
         a = numpy.arange(numpy.prod(shape), dtype=dtype).reshape(shape)
         a_dp = dpnp.array(a)
         expected = numpy.diagonal(a, offset)
@@ -74,7 +77,7 @@ def test_diagonal_offset(self, shape, dtype, offset):
             ((4, 3, 5, 2), [(0, 1), (1, 2), (2, 3), (0, 3)]),
         ],
     )
-    def test_diagonal_axes(self, shape, axis_pairs, dtype):
+    def test_axes(self, shape, axis_pairs, dtype):
         a = numpy.arange(numpy.prod(shape), dtype=dtype).reshape(shape)
         a_dp = dpnp.array(a)
         for axis1, axis2 in axis_pairs:
@@ -91,7 +94,7 @@ def test_linalg_diagonal(self, offset):
         result = dpnp.linalg.diagonal(a_dp, offset=offset)
         assert_array_equal(expected, result)
 
-    def test_diagonal_errors(self):
+    def test_errors(self):
         a = dpnp.arange(12).reshape(3, 4)
 
         # unsupported type
@@ -115,6 +118,64 @@ def test_diagonal_errors(self):
         assert_raises(ValueError, a.diagonal, axis1=1, axis2=1)
         assert_raises(ValueError, a.diagonal, axis1=1, axis2=-1)
 
+    @pytest.mark.parametrize("dt", get_all_dtypes(no_none=True))
+    @pytest.mark.parametrize(
+        "shape, offset",
+        [
+            ((2, 5), 5),  # offset >= m
+            ((2, 5), 10),  # offset >> m
+            ((4, 5), 6),  # offset >= m
+            ((2, 5), -5),  # negative offset >= n
+            ((3, 3, 4), 5),  # 3D array, offset >= m
+        ],
+    )
+    def test_empty_strides(self, dt, shape, offset):
+        a = generate_random_numpy_array(shape=shape, dtype=dt)
+        ia = dpnp.array(a)
+
+        expected = numpy.diagonal(a, offset)
+        result = dpnp.diagonal(ia, offset)
+
+        # Check both shape and strides match NumPy
+        assert expected.shape == result.shape
+        assert expected.strides == result.strides
+        assert_array_equal(expected, result)
+
+    @pytest.mark.parametrize("dt", get_all_dtypes(no_none=True))
+    def test_view(self, dt):
+        a = generate_random_numpy_array(shape=(3, 4), dtype=dt)
+        a = dpnp.array(a)
+        ia = a.copy()
+
+        diag = dpnp.diagonal(a)
+        diag[1] = 17  # modify a diagonal element
+        ia[1, 1] = 17  # do the same in original copy of the array
+
+        assert (a == ia).all()
+
+    @pytest.mark.parametrize("dt", get_all_dtypes(no_none=True))
+    @pytest.mark.parametrize(
+        "slice_spec, offset",
+        [
+            ((slice(None), slice(None, None, 2)), 0),  # skip columns
+            ((slice(None, None, 2), slice(None)), 1),  # skip rows
+            ((slice(None, None, 2), slice(None, None, 2)), 0),  # skip both
+        ],
+    )
+    def test_noncontiguous(self, dt, slice_spec, offset):
+        a = generate_random_numpy_array(shape=(4, 6), dtype=dt)
+        a_sliced = a[slice_spec]
+        ia = dpnp.array(a)
+        ia_sliced = ia[slice_spec]
+
+        expected = numpy.diagonal(a_sliced, offset=offset)
+        result = dpnp.diagonal(ia_sliced, offset=offset)
+
+        # Check strides match for non-contiguous arrays
+        assert expected.shape == result.shape
+        assert expected.strides == result.strides
+        assert_array_equal(expected, result)
+
 
 class TestExtins:
     @pytest.mark.parametrize("dt", get_all_dtypes(no_none=True))

From 95ab6a08c6b135388d7ee9f769dd64b8e0c513e9 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 19 Mar 2026 13:46:54 +0100
Subject: [PATCH 20/35] Update QR tests to avoid element-wise comparisons
 (#2785)

This PR proposes updating QR tests to avoid direct element-wise
comparisons, which became unstable with oneMKL 2026.0 due to sign and
phase differences in otherwise valid QR results.

Since QR factorization is not unique, different MKL and NumPy versions
may return results that differ by sign or complex phase while still
representing a correct decomposition.

To make the tests more stable, this PR proposes using invariant-based
validation for `mode="raw"` and `mode="r"`, based on the unitarity of the
Q factor (Q^H @ Q = I) and the resulting QR identity
R^H @ R = A^H @ A.
---
 CHANGELOG.md                                  |   1 +
 dpnp/tests/qr_helper.py                       |  72 ++++++++++++
 dpnp/tests/test_linalg.py                     | 103 +++---------------
 .../cupy/linalg_tests/test_decomposition.py   |  41 ++++---
 4 files changed, 107 insertions(+), 110 deletions(-)
 create mode 100644 dpnp/tests/qr_helper.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad71d5692e2a..a77cd9840e99 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -53,6 +53,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Changed `dpnp.partition` implementation to reuse `dpnp.sort` where it brings the performance benefit [#2766](https://github.com/IntelPython/dpnp/pull/2766)
 * `dpnp` uses pybind11 3.0.2 [#27734](https://github.com/IntelPython/dpnp/pull/2773)
 * Modified CMake files for the extension to explicitly mark DPC++ compiler and dpctl headers as system ones and so to suppress the build warning generated inside them [#2770](https://github.com/IntelPython/dpnp/pull/2770)
+* Updated QR tests to avoid element-wise comparisons for `raw` and `r` modes [#2785](https://github.com/IntelPython/dpnp/pull/2785)
 
 ### Deprecated
 
diff --git a/dpnp/tests/qr_helper.py b/dpnp/tests/qr_helper.py
new file mode 100644
index 000000000000..ead959807dc6
--- /dev/null
+++ b/dpnp/tests/qr_helper.py
@@ -0,0 +1,72 @@
+import numpy
+
+from .helper import factor_to_tol, has_support_aspect64
+
+
+def gram(x, xp):
+    # Return Gram matrix: X^H @ X
+    return xp.conjugate(x).swapaxes(-1, -2) @ x
+
+
+def get_R_from_raw(h, m, n, xp):
+    # Get reduced R from NumPy-style raw QR:
+    # R = triu((tril(h))^T), shape (..., k, n)
+    k = min(m, n)
+    rt = xp.tril(h)
+    r = xp.swapaxes(rt, -1, -2)
+    r = xp.triu(r[..., :m, :n])
+    return r[..., :k, :]
+
+
+def check_qr(a_np, a_xp, mode, xp):
+    # QR is not unique:
+    # element-wise comparison with NumPy may differ by sign/phase.
+    # To verify correctness use mode-dependent functional checks:
+    # complete/reduced: check decomposition Q @ R = A
+    # raw/r: check invariant R^H @ R = A^H @ A
+    if mode in ("complete", "reduced"):
+        res = xp.linalg.qr(a_xp, mode)
+        assert xp.allclose(res.Q @ res.R, a_xp, atol=1e-5)
+
+    # Since QR satisfies A = Q @ R with orthonormal Q (Q^H @ Q = I),
+    # validate correctness via the invariant R^H @ R == A^H @ A
+    # for raw/r modes
+    elif mode == "raw":
+        _, tau_np = numpy.linalg.qr(a_np, mode=mode)
+        h_xp, tau_xp = xp.linalg.qr(a_xp, mode=mode)
+
+        m, n = a_np.shape[-2], a_np.shape[-1]
+        Rraw_xp = get_R_from_raw(h_xp, m, n, xp)
+
+        rtol = atol = factor_to_tol(Rraw_xp.dtype, 100)
+
+        # Use reduced QR as a reference:
+        # reduced is validated via Q @ R == A
+        exp_r = xp.linalg.qr(a_xp, mode="reduced").R
+        assert xp.allclose(Rraw_xp, exp_r, atol=atol, rtol=rtol)
+
+        exp_xp = gram(a_xp, xp)
+
+        # Compare R^H @ R == A^H @ A
+        assert xp.allclose(gram(Rraw_xp, xp), exp_xp, atol=atol, rtol=rtol)
+
+        assert tau_xp.shape == tau_np.shape
+        if not has_support_aspect64(tau_xp.sycl_device):
+            assert tau_xp.dtype.kind == tau_np.dtype.kind
+        else:
+            assert tau_xp.dtype == tau_np.dtype
+
+    else:  # mode == "r"
+        r_xp = xp.linalg.qr(a_xp, mode="r")
+
+        # Use reduced QR as a reference:
+        # reduced is validated via Q @ R == A
+        exp_r = xp.linalg.qr(a_xp, mode="reduced").R
+        rtol = atol = factor_to_tol(exp_r.dtype, 100)
+
+        assert xp.allclose(r_xp, exp_r, atol=atol, rtol=rtol)
+
+        exp_xp = gram(a_xp, xp)
+
+        # Compare R^H @ R == A^H @ A
+        assert xp.allclose(gram(r_xp, xp), exp_xp, atol=atol, rtol=rtol)
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 170a2a7b5a13..20d974b32f0c 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -24,6 +24,7 @@
     has_support_aspect64,
     numpy_version,
 )
+from .qr_helper import check_qr
 from .third_party.cupy import testing
 
 
@@ -3584,7 +3585,7 @@ def test_error(self):
 
 
 class TestQr:
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
     @pytest.mark.parametrize(
         "shape",
         [
@@ -3610,60 +3611,27 @@ class TestQr:
             "(2, 2, 4)",
         ],
     )
-    @pytest.mark.parametrize("mode", ["r", "raw", "complete", "reduced"])
+    @pytest.mark.parametrize("mode", ["complete", "reduced", "r", "raw"])
     def test_qr(self, dtype, shape, mode):
         a = generate_random_numpy_array(shape, dtype, seed_value=81)
-        ia = dpnp.array(a)
+        ia = dpnp.array(a, dtype=dtype)
 
-        if mode == "r":
-            np_r = numpy.linalg.qr(a, mode)
-            dpnp_r = dpnp.linalg.qr(ia, mode)
-        else:
-            np_q, np_r = numpy.linalg.qr(a, mode)
-
-            # check decomposition
-            if mode in ("complete", "reduced"):
-                result = dpnp.linalg.qr(ia, mode)
-                dpnp_q, dpnp_r = result.Q, result.R
-                assert dpnp.allclose(
-                    dpnp.matmul(dpnp_q, dpnp_r), ia, atol=1e-05
-                )
-            else:  # mode=="raw"
-                dpnp_q, dpnp_r = dpnp.linalg.qr(ia, mode)
-                assert_dtype_allclose(dpnp_q, np_q, factor=24)
-
-        if mode in ("raw", "r"):
-            assert_dtype_allclose(dpnp_r, np_r, factor=24)
+        check_qr(a, ia, mode, dpnp)
 
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
     @pytest.mark.parametrize(
         "shape",
         [(32, 32), (8, 16, 16)],
         ids=["(32, 32)", "(8, 16, 16)"],
     )
-    @pytest.mark.parametrize("mode", ["r", "raw", "complete", "reduced"])
+    @pytest.mark.parametrize("mode", ["complete", "reduced", "r", "raw"])
     def test_qr_large(self, dtype, shape, mode):
         a = generate_random_numpy_array(shape, dtype, seed_value=81)
         ia = dpnp.array(a)
 
-        if mode == "r":
-            np_r = numpy.linalg.qr(a, mode)
-            dpnp_r = dpnp.linalg.qr(ia, mode)
-        else:
-            np_q, np_r = numpy.linalg.qr(a, mode)
-
-            # check decomposition
-            if mode in ("complete", "reduced"):
-                result = dpnp.linalg.qr(ia, mode)
-                dpnp_q, dpnp_r = result.Q, result.R
-                assert dpnp.allclose(dpnp.matmul(dpnp_q, dpnp_r), ia, atol=1e-5)
-            else:  # mode=="raw"
-                dpnp_q, dpnp_r = dpnp.linalg.qr(ia, mode)
-                assert_allclose(dpnp_q, np_q, atol=1e-4)
-        if mode in ("raw", "r"):
-            assert_allclose(dpnp_r, np_r, atol=1e-4)
+        check_qr(a, ia, mode, dpnp)
 
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
     @pytest.mark.parametrize(
         "shape",
         [(0, 0), (0, 2), (2, 0), (2, 0, 3), (2, 3, 0), (0, 2, 3)],
@@ -3676,65 +3644,22 @@ def test_qr_large(self, dtype, shape, mode):
             "(0, 2, 3)",
         ],
     )
-    @pytest.mark.parametrize("mode", ["r", "raw", "complete", "reduced"])
+    @pytest.mark.parametrize("mode", ["complete", "reduced", "r", "raw"])
     def test_qr_empty(self, dtype, shape, mode):
         a = numpy.empty(shape, dtype=dtype)
         ia = dpnp.array(a)
 
-        if mode == "r":
-            np_r = numpy.linalg.qr(a, mode)
-            dpnp_r = dpnp.linalg.qr(ia, mode)
-        else:
-            np_q, np_r = numpy.linalg.qr(a, mode)
-
-            if mode in ("complete", "reduced"):
-                result = dpnp.linalg.qr(ia, mode)
-                dpnp_q, dpnp_r = result.Q, result.R
-            else:
-                dpnp_q, dpnp_r = dpnp.linalg.qr(ia, mode)
+        check_qr(a, ia, mode, dpnp)
 
-            assert_dtype_allclose(dpnp_q, np_q)
-
-        assert_dtype_allclose(dpnp_r, np_r)
-
-    @pytest.mark.parametrize("mode", ["r", "raw", "complete", "reduced"])
+    @pytest.mark.parametrize("mode", ["complete", "reduced", "r", "raw"])
     def test_qr_strides(self, mode):
         a = generate_random_numpy_array((5, 5))
         ia = dpnp.array(a)
 
         # positive strides
-        if mode == "r":
-            np_r = numpy.linalg.qr(a[::2, ::2], mode)
-            dpnp_r = dpnp.linalg.qr(ia[::2, ::2], mode)
-        else:
-            np_q, np_r = numpy.linalg.qr(a[::2, ::2], mode)
-
-            if mode in ("complete", "reduced"):
-                result = dpnp.linalg.qr(ia[::2, ::2], mode)
-                dpnp_q, dpnp_r = result.Q, result.R
-            else:
-                dpnp_q, dpnp_r = dpnp.linalg.qr(ia[::2, ::2], mode)
-
-            assert_dtype_allclose(dpnp_q, np_q)
-
-        assert_dtype_allclose(dpnp_r, np_r)
-
+        check_qr(a[::2, ::2], ia[::2, ::2], mode, dpnp)
         # negative strides
-        if mode == "r":
-            np_r = numpy.linalg.qr(a[::-2, ::-2], mode)
-            dpnp_r = dpnp.linalg.qr(ia[::-2, ::-2], mode)
-        else:
-            np_q, np_r = numpy.linalg.qr(a[::-2, ::-2], mode)
-
-            if mode in ("complete", "reduced"):
-                result = dpnp.linalg.qr(ia[::-2, ::-2], mode)
-                dpnp_q, dpnp_r = result.Q, result.R
-            else:
-                dpnp_q, dpnp_r = dpnp.linalg.qr(ia[::-2, ::-2], mode)
-
-            assert_dtype_allclose(dpnp_q, np_q)
-
-        assert_dtype_allclose(dpnp_r, np_r)
+        check_qr(a[::-2, ::-2], ia[::-2, ::-2], mode, dpnp)
 
     def test_qr_errors(self):
         a_dp = dpnp.array([[1, 2], [3, 5]], dtype="float32")
diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py b/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
index c7ff275cac0c..697e4ee7988d 100644
--- a/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
+++ b/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
@@ -12,10 +12,9 @@
 # from cupy.cuda import runtime
 # from cupy.linalg import _util
 from dpnp.tests.helper import (
-    LTS_VERSION,
     has_support_aspect64,
-    is_lts_driver,
 )
+from dpnp.tests.qr_helper import check_qr
 from dpnp.tests.third_party.cupy import testing
 from dpnp.tests.third_party.cupy.testing import _condition
 
@@ -169,7 +168,6 @@ def test_decomposition(self, dtype):
     )
 )
 class TestQRDecomposition(unittest.TestCase):
-
     @testing.for_dtypes("fdFD")
     def check_mode(self, array, mode, dtype):
         # if runtime.is_hip and driver.get_build_version() < 307:
@@ -178,22 +176,29 @@ def check_mode(self, array, mode, dtype):
 
         a_cpu = numpy.asarray(array, dtype=dtype)
         a_gpu = cupy.asarray(array, dtype=dtype)
-        result_gpu = cupy.linalg.qr(a_gpu, mode=mode)
+        # QR is not unique:
+        # element-wise comparison with NumPy may differ by sign/phase.
+        # To verify correctness use mode-dependent functional checks:
+        # complete/reduced: check decomposition Q @ R = A
+        # raw/r: check invariant R^H @ R = A^H @ A
+
+        # result_gpu = cupy.linalg.qr(a_gpu, mode=mode)
         if (
             mode != "raw"
             or numpy.lib.NumpyVersion(numpy.__version__) >= "1.22.0rc1"
         ):
-            result_cpu = numpy.linalg.qr(a_cpu, mode=mode)
-            self._check_result(result_cpu, result_gpu)
-
-    def _check_result(self, result_cpu, result_gpu):
-        if isinstance(result_cpu, tuple):
-            for b_cpu, b_gpu in zip(result_cpu, result_gpu):
-                assert b_cpu.dtype == b_gpu.dtype
-                testing.assert_allclose(b_cpu, b_gpu, atol=1e-4)
-        else:
-            assert result_cpu.dtype == result_gpu.dtype
-            testing.assert_allclose(result_cpu, result_gpu, atol=1e-4)
+            # result_cpu = numpy.linalg.qr(a_cpu, mode=mode)
+            # self._check_result(result_cpu, result_gpu, a_gpu, mode)
+            check_qr(a_cpu, a_gpu, mode, cupy)
+
+    # def _check_result(self, result_cpu, result_gpu):
+    #     if isinstance(result_cpu, tuple):
+    #         for b_cpu, b_gpu in zip(result_cpu, result_gpu):
+    #             assert b_cpu.dtype == b_gpu.dtype
+    #             testing.assert_allclose(b_cpu, b_gpu, atol=1e-4)
+    #     else:
+    #         assert result_cpu.dtype == result_gpu.dtype
+    #         testing.assert_allclose(result_cpu, result_gpu, atol=1e-4)
 
     @testing.fix_random()
     @_condition.repeat(3, 10)
@@ -202,9 +207,6 @@ def test_mode(self):
         self.check_mode(numpy.random.randn(3, 3), mode=self.mode)
         self.check_mode(numpy.random.randn(5, 4), mode=self.mode)
 
-    @pytest.mark.skipif(
-        is_lts_driver(version=LTS_VERSION.V1_6), reason="SAT-8375"
-    )
     @testing.with_requires("numpy>=1.22")
     @testing.fix_random()
     def test_mode_rank3(self):
@@ -212,9 +214,6 @@ def test_mode_rank3(self):
         self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode)
         self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode)
 
-    @pytest.mark.skipif(
-        is_lts_driver(version=LTS_VERSION.V1_6), reason="SAT-8375"
-    )
     @testing.with_requires("numpy>=1.22")
     @testing.fix_random()
     def test_mode_rank4(self):

From c8c0f88b4f03a5d5d1aa7f4cd6f5257811346646 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Thu, 19 Mar 2026 18:05:23 +0100
Subject: [PATCH 21/35] Consolidate SYCL kernels under a single folder (#2816)

This PR moves all SYCL kernel functors from extensions/ to kernels/
directory, improving the project structure.

Moved kernels:
- statistics: histogram, sliding_window1d functors
- indexing: choose functor
- window: bartlett, blackman, hamming, hanning, kaiser functors

The PR also improves the code coverage report, since the SYCL kernels
can now be easily excluded from it.
---
 CHANGELOG.md                                  |   1 +
 .../extensions/indexing/CMakeLists.txt        |   2 +-
 dpnp/backend/extensions/indexing/choose.cpp   | 129 ++++++---
 .../extensions/indexing/choose_kernel.hpp     | 191 ------------
 .../extensions/statistics/CMakeLists.txt      |   2 +-
 .../statistics/histogram_common.hpp           | 138 ++++-----
 .../statistics/sliding_window1d.hpp           | 222 ++------------
 .../elementwise_functions/interpolate.cpp     | 151 +++++-----
 dpnp/backend/extensions/window/common.hpp     |  31 +-
 dpnp/backend/extensions/window/kaiser.cpp     |  51 +---
 dpnp/backend/extensions/window/kaiser.hpp     |   7 +-
 dpnp/backend/extensions/window/window_py.cpp  |  37 ++-
 .../elementwise_functions/interpolate.hpp     | 108 ++++---
 dpnp/backend/kernels/indexing/choose.hpp      | 128 ++++++++
 dpnp/backend/kernels/statistics/histogram.hpp |  99 +++++++
 .../kernels/statistics/sliding_window1d.hpp   | 274 ++++++++++++++++++
 .../window/bartlett.hpp                       |  27 +-
 .../window/blackman.hpp                       |  25 +-
 .../window/hamming.hpp                        |  25 +-
 .../window/hanning.hpp                        |  25 +-
 dpnp/backend/kernels/window/kaiser.hpp        |  64 ++++
 scripts/gen_coverage.py                       |   3 +
 22 files changed, 979 insertions(+), 761 deletions(-)
 delete mode 100644 dpnp/backend/extensions/indexing/choose_kernel.hpp
 create mode 100644 dpnp/backend/kernels/indexing/choose.hpp
 create mode 100644 dpnp/backend/kernels/statistics/histogram.hpp
 create mode 100644 dpnp/backend/kernels/statistics/sliding_window1d.hpp
 rename dpnp/backend/{extensions => kernels}/window/bartlett.hpp (80%)
 rename dpnp/backend/{extensions => kernels}/window/blackman.hpp (83%)
 rename dpnp/backend/{extensions => kernels}/window/hamming.hpp (83%)
 rename dpnp/backend/{extensions => kernels}/window/hanning.hpp (83%)
 create mode 100644 dpnp/backend/kernels/window/kaiser.hpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a77cd9840e99..096eabef6720 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -54,6 +54,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * `dpnp` uses pybind11 3.0.2 [#27734](https://github.com/IntelPython/dpnp/pull/2773)
 * Modified CMake files for the extension to explicitly mark DPC++ compiler and dpctl headers as system ones and so to suppress the build warning generated inside them [#2770](https://github.com/IntelPython/dpnp/pull/2770)
 * Updated QR tests to avoid element-wise comparisons for `raw` and `r` modes [#2785](https://github.com/IntelPython/dpnp/pull/2785)
+* Moved all SYCL kernel functors from `backend/extensions/` to a unified `backend/kernels/` directory hierarchy [#2816](https://github.com/IntelPython/dpnp/pull/2816)
 
 ### Deprecated
 
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 370d59f95585..e1bc34c9ae8b 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -62,7 +62,7 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp
index 99d91744366f..3b2df73f46ef 100644
--- a/dpnp/backend/extensions/indexing/choose.cpp
+++ b/dpnp/backend/extensions/indexing/choose.cpp
@@ -30,41 +30,123 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
+#include <stdexcept>
+#include <string>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <vector>
 
-#include "choose_kernel.hpp"
+#include <sycl/sycl.hpp>
+
 #include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
-// utils extension header
 #include "ext/common.hpp"
+#include "kernels/indexing/choose.hpp"
 
 // dpctl tensor headers
 #include "utils/indexing_utils.hpp"
 #include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
 #include "utils/output_validation.hpp"
 #include "utils/sycl_alloc_utils.hpp"
 #include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
 
 namespace dpnp::extensions::indexing
 {
+namespace py = pybind11;
 
+namespace impl
+{
 namespace td_ns = dpctl::tensor::type_dispatch;
 
-static kernels::choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types]
-                                                          [td_ns::num_types];
-static kernels::choose_fn_ptr_t choose_wrap_dispatch_table[td_ns::num_types]
-                                                          [td_ns::num_types];
+using dpctl::tensor::ssize_t;
+
+typedef sycl::event (*choose_fn_ptr_t)(sycl::queue &,
+                                       size_t,
+                                       ssize_t,
+                                       int,
+                                       const ssize_t *,
+                                       const char *,
+                                       char *,
+                                       char **,
+                                       ssize_t,
+                                       ssize_t,
+                                       const ssize_t *,
+                                       const std::vector<sycl::event> &);
+
+static choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static choose_fn_ptr_t choose_wrap_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename ProjectorT, typename indTy, typename Ty>
+sycl::event choose_impl(sycl::queue &q,
+                        size_t nelems,
+                        ssize_t n_chcs,
+                        int nd,
+                        const ssize_t *shape_and_strides,
+                        const char *ind_cp,
+                        char *dst_cp,
+                        char **chcs_cp,
+                        ssize_t ind_offset,
+                        ssize_t dst_offset,
+                        const ssize_t *chc_offsets,
+                        const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
 
-namespace py = pybind11;
+    const indTy *ind_tp = reinterpret_cast<const indTy *>(ind_cp);
+    Ty *dst_tp = reinterpret_cast<Ty *>(dst_cp);
 
-namespace detail
+    sycl::event choose_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using InOutIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        const InOutIndexerT ind_out_indexer{nd, ind_offset, dst_offset,
+                                            shape_and_strides};
+
+        using NthChoiceIndexerT =
+            dpnp::kernels::choose::strides::NthStrideOffsetUnpacked;
+        const NthChoiceIndexerT choices_indexer{
+            nd, chc_offsets, shape_and_strides, shape_and_strides + 3 * nd};
+
+        using ChooseFunc =
+            dpnp::kernels::choose::ChooseFunctor<ProjectorT, InOutIndexerT,
+                                                 NthChoiceIndexerT, indTy, Ty>;
+
+        cgh.parallel_for<ChooseFunc>(sycl::range<1>(nelems),
+                                     ChooseFunc(ind_tp, dst_tp, chcs_cp, n_chcs,
+                                                ind_out_indexer,
+                                                choices_indexer));
+    });
+
+    return choose_ev;
+}
+
+template <typename fnT, typename IndT, typename T, typename Index>
+struct ChooseFactory
 {
+    fnT get()
+    {
+        if constexpr (std::is_integral<IndT>::value &&
+                      !std::is_same<IndT, bool>::value) {
+            fnT fn = choose_impl<Index, IndT, T>;
+            return fn;
+        }
+        else {
+            fnT fn = nullptr;
+            return fn;
+        }
+    }
+};
 
+namespace detail
+{
 using host_ptrs_allocator_t =
     dpctl::tensor::alloc_utils::usm_host_allocator<char *>;
 using ptrs_t = std::vector<char *, host_ptrs_allocator_t>;
@@ -191,7 +273,6 @@ std::vector<dpctl::tensor::usm_ndarray> parse_py_chcs(const sycl::queue &q,
 
     return res;
 }
-
 } // namespace detail
 
 std::pair<sycl::event, sycl::event>
@@ -412,23 +493,6 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(arg_cleanup_ev, choose_generic_ev);
 }
 
-template <typename fnT, typename IndT, typename T, typename Index>
-struct ChooseFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral<IndT>::value &&
-                      !std::is_same<IndT, bool>::value) {
-            fnT fn = kernels::choose_impl<Index, IndT, T>;
-            return fn;
-        }
-        else {
-            fnT fn = nullptr;
-            return fn;
-        }
-    }
-};
-
 using dpctl::tensor::indexing_utils::ClipIndex;
 using dpctl::tensor::indexing_utils::WrapIndex;
 
@@ -441,23 +505,22 @@ using ChooseClipFactory = ChooseFactory<fnT, IndT, T, ClipIndex<IndT>>;
 void init_choose_dispatch_tables(void)
 {
     using ext::common::init_dispatch_table;
-    using kernels::choose_fn_ptr_t;
 
     init_dispatch_table<choose_fn_ptr_t, ChooseClipFactory>(
         choose_clip_dispatch_table);
     init_dispatch_table<choose_fn_ptr_t, ChooseWrapFactory>(
         choose_wrap_dispatch_table);
 }
+} // namespace impl
 
 void init_choose(py::module_ m)
 {
-    dpnp::extensions::indexing::init_choose_dispatch_tables();
+    impl::init_choose_dispatch_tables();
 
-    m.def("_choose", &py_choose, "", py::arg("src"), py::arg("chcs"),
+    m.def("_choose", &impl::py_choose, "", py::arg("src"), py::arg("chcs"),
           py::arg("dst"), py::arg("mode"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
     return;
 }
-
 } // namespace dpnp::extensions::indexing
diff --git a/dpnp/backend/extensions/indexing/choose_kernel.hpp b/dpnp/backend/extensions/indexing/choose_kernel.hpp
deleted file mode 100644
index 6b1ac8005054..000000000000
--- a/dpnp/backend/extensions/indexing/choose_kernel.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-//*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// - Neither the name of the copyright holder nor the names of its contributors
-//   may be used to endorse or promote products derived from this software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//*****************************************************************************
-
-#pragma once
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-
-#include <sycl/sycl.hpp>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "utils/indexing_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/strided_iters.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpnp::extensions::indexing::strides_detail
-{
-
-struct NthStrideOffsetUnpacked
-{
-    NthStrideOffsetUnpacked(int common_nd,
-                            dpctl::tensor::ssize_t const *_offsets,
-                            dpctl::tensor::ssize_t const *_shape,
-                            dpctl::tensor::ssize_t const *_strides)
-        : _ind(common_nd), nd(common_nd), offsets(_offsets), shape(_shape),
-          strides(_strides)
-    {
-    }
-
-    template <typename nT>
-    size_t operator()(dpctl::tensor::ssize_t gid, nT n) const
-    {
-        dpctl::tensor::ssize_t relative_offset(0);
-        _ind.get_displacement<const dpctl::tensor::ssize_t *,
-                              const dpctl::tensor::ssize_t *>(
-            gid, shape, strides + (n * nd), relative_offset);
-
-        return relative_offset + offsets[n];
-    }
-
-private:
-    dpctl::tensor::strides::CIndexer_vector<dpctl::tensor::ssize_t> _ind;
-
-    int nd;
-    dpctl::tensor::ssize_t const *offsets;
-    dpctl::tensor::ssize_t const *shape;
-    dpctl::tensor::ssize_t const *strides;
-};
-
-static_assert(sycl::is_device_copyable_v<NthStrideOffsetUnpacked>);
-
-} // namespace dpnp::extensions::indexing::strides_detail
-
-namespace dpnp::extensions::indexing::kernels
-{
-
-template <typename ProjectorT,
-          typename IndOutIndexerT,
-          typename ChoicesIndexerT,
-          typename IndT,
-          typename T>
-class ChooseFunctor
-{
-private:
-    const IndT *ind = nullptr;
-    T *dst = nullptr;
-    char **chcs = nullptr;
-    dpctl::tensor::ssize_t n_chcs;
-    const IndOutIndexerT ind_out_indexer;
-    const ChoicesIndexerT chcs_indexer;
-
-public:
-    ChooseFunctor(const IndT *ind_,
-                  T *dst_,
-                  char **chcs_,
-                  dpctl::tensor::ssize_t n_chcs_,
-                  const IndOutIndexerT &ind_out_indexer_,
-                  const ChoicesIndexerT &chcs_indexer_)
-        : ind(ind_), dst(dst_), chcs(chcs_), n_chcs(n_chcs_),
-          ind_out_indexer(ind_out_indexer_), chcs_indexer(chcs_indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        const ProjectorT proj{};
-
-        dpctl::tensor::ssize_t i = id[0];
-
-        auto ind_dst_offsets = ind_out_indexer(i);
-        dpctl::tensor::ssize_t ind_offset = ind_dst_offsets.get_first_offset();
-        dpctl::tensor::ssize_t dst_offset = ind_dst_offsets.get_second_offset();
-
-        IndT chc_idx = ind[ind_offset];
-        // proj produces an index in the range of n_chcs
-        dpctl::tensor::ssize_t projected_idx = proj(n_chcs, chc_idx);
-
-        dpctl::tensor::ssize_t chc_offset = chcs_indexer(i, projected_idx);
-
-        T *chc = reinterpret_cast<T *>(chcs[projected_idx]);
-
-        dst[dst_offset] = chc[chc_offset];
-    }
-};
-
-typedef sycl::event (*choose_fn_ptr_t)(sycl::queue &,
-                                       size_t,
-                                       dpctl::tensor::ssize_t,
-                                       int,
-                                       const dpctl::tensor::ssize_t *,
-                                       const char *,
-                                       char *,
-                                       char **,
-                                       dpctl::tensor::ssize_t,
-                                       dpctl::tensor::ssize_t,
-                                       const dpctl::tensor::ssize_t *,
-                                       const std::vector<sycl::event> &);
-
-template <typename ProjectorT, typename indTy, typename Ty>
-sycl::event choose_impl(sycl::queue &q,
-                        size_t nelems,
-                        dpctl::tensor::ssize_t n_chcs,
-                        int nd,
-                        const dpctl::tensor::ssize_t *shape_and_strides,
-                        const char *ind_cp,
-                        char *dst_cp,
-                        char **chcs_cp,
-                        dpctl::tensor::ssize_t ind_offset,
-                        dpctl::tensor::ssize_t dst_offset,
-                        const dpctl::tensor::ssize_t *chc_offsets,
-                        const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    const indTy *ind_tp = reinterpret_cast<const indTy *>(ind_cp);
-    Ty *dst_tp = reinterpret_cast<Ty *>(dst_cp);
-
-    sycl::event choose_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using InOutIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        const InOutIndexerT ind_out_indexer{nd, ind_offset, dst_offset,
-                                            shape_and_strides};
-
-        using NthChoiceIndexerT = strides_detail::NthStrideOffsetUnpacked;
-        const NthChoiceIndexerT choices_indexer{
-            nd, chc_offsets, shape_and_strides, shape_and_strides + 3 * nd};
-
-        using ChooseFunc = ChooseFunctor<ProjectorT, InOutIndexerT,
-                                         NthChoiceIndexerT, indTy, Ty>;
-
-        cgh.parallel_for<ChooseFunc>(sycl::range<1>(nelems),
-                                     ChooseFunc(ind_tp, dst_tp, chcs_cp, n_chcs,
-                                                ind_out_indexer,
-                                                choices_indexer));
-    });
-
-    return choose_ev;
-}
-
-} // namespace dpnp::extensions::indexing::kernels
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 7ccb05238ae4..36786c8cbaf3 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -67,7 +67,7 @@ set_target_properties(
 
 target_include_directories(
     ${python_module_name}
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common
 )
 
 # treat below headers as system to suppress the warnings there during the build
diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp
index 02fc66f26610..8091e8874d17 100644
--- a/dpnp/backend/extensions/statistics/histogram_common.hpp
+++ b/dpnp/backend/extensions/statistics/histogram_common.hpp
@@ -28,24 +28,26 @@
 
 #pragma once
 
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+
 #include <sycl/sycl.hpp>
 
+#include "dpctl4pybind11.hpp"
+
 #include "ext/common.hpp"
+#include "kernels/statistics/histogram.hpp"
 
-namespace dpctl::tensor
+namespace statistics::histogram
 {
-class usm_ndarray;
-}
-
 using dpctl::tensor::usm_ndarray;
 
 using ext::common::AtomicOp;
 using ext::common::IsNan;
 using ext::common::Less;
 
-namespace statistics::histogram
-{
-
 template <typename T, int Dims>
 struct CachedData
 {
@@ -69,23 +71,23 @@ struct CachedData
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &item) const
     {
-        uint32_t llid = item.get_local_linear_id();
+        std::uint32_t llid = item.get_local_linear_id();
         auto local_ptr = &local_data[0];
-        uint32_t size = local_data.size();
+        std::uint32_t size = local_data.size();
         auto group = item.get_group();
-        uint32_t local_size = group.get_local_linear_range();
+        std::uint32_t local_size = group.get_local_linear_range();
 
-        for (uint32_t i = llid; i < size; i += local_size) {
+        for (std::uint32_t i = llid; i < size; i += local_size) {
             local_ptr[i] = global_data[i];
         }
     }
 
-    size_t size() const { return local_data.size(); }
+    std::size_t size() const { return local_data.size(); }
 
     T &operator[](const sycl::id<Dims> &id) const { return local_data[id]; }
 
     template <typename = std::enable_if_t<Dims == 1>>
-    T &operator[](const size_t id) const
+    T &operator[](const std::size_t id) const
     {
         return local_data[id];
     }
@@ -117,12 +119,12 @@ struct UncachedData
     {
     }
 
-    size_t size() const { return _shape.size(); }
+    std::size_t size() const { return _shape.size(); }
 
     T &operator[](const sycl::id<Dims> &id) const { return global_data[id]; }
 
     template <typename = std::enable_if_t<Dims == 1>>
-    T &operator[](const size_t id) const
+    T &operator[](const std::size_t id) const
     {
         return global_data[id];
     }
@@ -139,15 +141,15 @@ struct HistLocalType
 };
 
 template <>
-struct HistLocalType<uint64_t>
+struct HistLocalType<std::uint64_t>
 {
-    using type = uint32_t;
+    using type = std::uint32_t;
 };
 
 template <>
-struct HistLocalType<int64_t>
+struct HistLocalType<std::int64_t>
 {
-    using type = int32_t;
+    using type = std::int32_t;
 };
 
 template <typename T, typename localT = typename HistLocalType<T>::type>
@@ -159,8 +161,8 @@ struct HistWithLocalCopies
     using LocalHist = sycl::local_accessor<localT, 2>;
 
     HistWithLocalCopies(T *global_data,
-                        size_t bins_count,
-                        int32_t copies_count,
+                        std::size_t bins_count,
+                        std::int32_t copies_count,
                         sycl::handler &cgh)
     {
         local_hist = LocalHist(sycl::range<2>(copies_count, bins_count), cgh);
@@ -170,23 +172,25 @@ struct HistWithLocalCopies
     template <int _Dims>
     void init(const sycl::nd_item<_Dims> &item, localT val = 0) const
     {
-        uint32_t llid = item.get_local_linear_id();
+        std::uint32_t llid = item.get_local_linear_id();
         auto *local_ptr = &local_hist[0][0];
-        uint32_t size = local_hist.size();
+        std::uint32_t size = local_hist.size();
         auto group = item.get_group();
-        uint32_t local_size = group.get_local_linear_range();
+        std::uint32_t local_size = group.get_local_linear_range();
 
-        for (uint32_t i = llid; i < size; i += local_size) {
+        for (std::uint32_t i = llid; i < size; i += local_size) {
             local_ptr[i] = val;
         }
     }
 
     template <int _Dims>
-    void add(const sycl::nd_item<_Dims> &item, int32_t bin, localT value) const
+    void add(const sycl::nd_item<_Dims> &item,
+             std::int32_t bin,
+             localT value) const
     {
-        int32_t llid = item.get_local_linear_id();
-        int32_t local_hist_count = local_hist.get_range().get(0);
-        int32_t local_copy_id =
+        std::int32_t llid = item.get_local_linear_id();
+        std::int32_t local_hist_count = local_hist.get_range().get(0);
+        std::int32_t local_copy_id =
             local_hist_count == 1 ? 0 : llid % local_hist_count;
 
         AtomicOp<localT, sycl::memory_order::relaxed,
@@ -198,15 +202,15 @@ struct HistWithLocalCopies
     template <int _Dims>
     void finalize(const sycl::nd_item<_Dims> &item) const
     {
-        uint32_t llid = item.get_local_linear_id();
-        uint32_t bins_count = local_hist.get_range().get(1);
-        uint32_t local_hist_count = local_hist.get_range().get(0);
+        std::uint32_t llid = item.get_local_linear_id();
+        std::uint32_t bins_count = local_hist.get_range().get(1);
+        std::uint32_t local_hist_count = local_hist.get_range().get(0);
         auto group = item.get_group();
-        uint32_t local_size = group.get_local_linear_range();
+        std::uint32_t local_size = group.get_local_linear_range();
 
-        for (uint32_t i = llid; i < bins_count; i += local_size) {
+        for (std::uint32_t i = llid; i < bins_count; i += local_size) {
             auto value = local_hist[0][i];
-            for (uint32_t lhc = 1; lhc < local_hist_count; ++lhc) {
+            for (std::uint32_t lhc = 1; lhc < local_hist_count; ++lhc) {
                 value += local_hist[lhc][i];
             }
             if (value != T(0)) {
@@ -217,7 +221,7 @@ struct HistWithLocalCopies
         }
     }
 
-    uint32_t size() const { return local_hist.size(); }
+    std::uint32_t size() const { return local_hist.size(); }
 
 private:
     LocalHist local_hist;
@@ -238,7 +242,7 @@ struct HistGlobalMemory
     }
 
     template <int _Dims>
-    void add(const sycl::nd_item<_Dims> &, int32_t bin, T value) const
+    void add(const sycl::nd_item<_Dims> &, std::int32_t bin, T value) const
     {
         AtomicOp<T, sycl::memory_order::relaxed,
                  sycl::memory_scope::device>::add(global_hist[bin], value);
@@ -253,10 +257,10 @@ struct HistGlobalMemory
     T *global_hist = nullptr;
 };
 
-template <typename T = uint32_t>
+template <typename T = std::uint32_t>
 struct NoWeights
 {
-    constexpr T get(size_t) const { return 1; }
+    constexpr T get(std::size_t) const { return 1; }
 };
 
 template <typename T>
@@ -264,7 +268,7 @@ struct Weights
 {
     Weights(T *weights) { data = weights; }
 
-    T get(size_t id) const { return data[id]; }
+    T get(std::size_t id) const { return data[id]; }
 
 private:
     T *data = nullptr;
@@ -277,55 +281,23 @@ bool check_in_bounds(const dT &val, const dT &min, const dT &max)
     return !_less(val, min) && !_less(max, val) && !IsNan<dT>::isnan(val);
 }
 
-template <typename T, typename HistImpl, typename Edges, typename Weights>
-class histogram_kernel;
-
 template <typename T, typename HistImpl, typename Edges, typename Weights>
 void submit_histogram(const T *in,
-                      const size_t size,
-                      const size_t dims,
-                      const uint32_t WorkPI,
+                      const std::size_t size,
+                      const std::size_t dims,
+                      const std::uint32_t WorkPI,
                       const HistImpl &hist,
                       const Edges &edges,
                       const Weights &weights,
                       sycl::nd_range<1> nd_range,
                       sycl::handler &cgh)
 {
-    cgh.parallel_for<histogram_kernel<T, HistImpl, Edges, Weights>>(
-        nd_range, [=](sycl::nd_item<1> item) {
-            auto id = item.get_group_linear_id();
-            auto lid = item.get_local_linear_id();
-            auto group = item.get_group();
-            auto local_size = item.get_local_range(0);
-
-            hist.init(item);
-            edges.init(item);
-
-            if constexpr (HistImpl::sync_after_init || Edges::sync_after_init) {
-                sycl::group_barrier(group, sycl::memory_scope::work_group);
-            }
-
-            auto bounds = edges.get_bounds();
-
-            for (uint32_t i = 0; i < WorkPI; ++i) {
-                auto data_idx = id * WorkPI * local_size + i * local_size + lid;
-                if (data_idx < size) {
-                    auto *d = &in[data_idx * dims];
-
-                    if (edges.in_bounds(d, bounds)) {
-                        auto bin = edges.get_bin(item, d, bounds);
-                        auto weight = weights.get(data_idx);
-                        hist.add(item, bin, weight);
-                    }
-                }
-            }
-
-            if constexpr (HistImpl::sync_before_finalize) {
-                sycl::group_barrier(group, sycl::memory_scope::work_group);
-            }
+    using HistogramKernel =
+        dpnp::kernels::histogram::HistogramFunctor<T, HistImpl, Edges, Weights>;
 
-            hist.finalize(item);
-        });
+    cgh.parallel_for<HistogramKernel>(
+        nd_range,
+        HistogramKernel(in, size, dims, WorkPI, hist, edges, weights));
 }
 
 void validate(const usm_ndarray &sample,
@@ -333,8 +305,8 @@ void validate(const usm_ndarray &sample,
               const std::optional<const dpctl::tensor::usm_ndarray> &weights,
               const usm_ndarray &histogram);
 
-uint32_t get_local_hist_copies_count(uint32_t loc_mem_size_in_items,
-                                     uint32_t local_size,
-                                     uint32_t hist_size_in_items);
+std::uint32_t get_local_hist_copies_count(std::uint32_t loc_mem_size_in_items,
+                                          std::uint32_t local_size,
+                                          std::uint32_t hist_size_in_items);
 
 } // namespace statistics::histogram
diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
index f33a23609666..329c96dfc1c6 100644
--- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp
+++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp
@@ -28,25 +28,21 @@
 
 #pragma once
 
-#include <algorithm>
-
-#include "utils/math_utils.hpp"
-#include <sycl/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
 #include <type_traits>
 
-#include <stdio.h>
-
-#include "ext/common.hpp"
+#include <sycl/sycl.hpp>
 
-using dpctl::tensor::usm_ndarray;
+#include "dpctl4pybind11.hpp"
 
-using ext::common::Align;
-using ext::common::CeilDiv;
+#include "kernels/statistics/sliding_window1d.hpp"
 
 namespace statistics::sliding_window1d
 {
+using dpctl::tensor::usm_ndarray;
 
-template <typename T, uint32_t Size>
+template <typename T, std::uint32_t Size>
 class _RegistryDataStorage
 {
 public:
@@ -144,7 +140,7 @@ class _RegistryDataStorage
     ncT data[Size];
 };
 
-template <typename T, uint32_t Size = 1>
+template <typename T, std::uint32_t Size = 1>
 struct RegistryData : public _RegistryDataStorage<T, Size>
 {
     using SizeT = typename _RegistryDataStorage<T, Size>::SizeT;
@@ -336,7 +332,7 @@ struct RegistryData : public _RegistryDataStorage<T, Size>
     T *store(T *const data) { return store(data, true); }
 };
 
-template <typename T, uint32_t Size>
+template <typename T, std::uint32_t Size>
 struct RegistryWindow : public RegistryData<T, Size>
 {
     using SizeT = typename RegistryData<T, Size>::SizeT;
@@ -349,7 +345,7 @@ struct RegistryWindow : public RegistryData<T, Size>
         static_assert(std::is_integral_v<shT>,
                       "shift must be of an integral type");
 
-        uint32_t shift_r = this->size_x() - shift;
+        std::uint32_t shift_r = this->size_x() - shift;
         for (SizeT i = 0; i < Size; ++i) {
             this->data[i] = this->shift_left(i, shift);
             auto border =
@@ -369,7 +365,7 @@ struct RegistryWindow : public RegistryData<T, Size>
     }
 };
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 class Span
 {
 public:
@@ -391,13 +387,13 @@ class Span
     const SizeT size_;
 };
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 Span<T, SizeT> make_span(T *const data, const SizeT size)
 {
     return Span<T, SizeT>(data, size);
 }
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 class PaddedSpan : public Span<T, SizeT>
 {
 public:
@@ -417,68 +413,14 @@ class PaddedSpan : public Span<T, SizeT>
     const SizeT pad_;
 };
 
-template <typename T, typename SizeT = size_t>
+template <typename T, typename SizeT = std::size_t>
 PaddedSpan<T, SizeT>
     make_padded_span(T *const data, const SizeT size, const SizeT offset)
 {
     return PaddedSpan<T, SizeT>(data, size, offset);
 }
 
-template <typename Results,
-          typename AData,
-          typename VData,
-          typename Op,
-          typename Red>
-void process_block(Results &results,
-                   uint32_t r_size,
-                   AData &a_data,
-                   VData &v_data,
-                   uint32_t block_size,
-                   Op op,
-                   Red red)
-{
-    for (uint32_t i = 0; i < block_size; ++i) {
-        auto v_val = v_data.broadcast(i);
-        for (uint32_t r = 0; r < r_size; ++r) {
-            results[r] = red(results[r], op(a_data[r], v_val));
-        }
-        a_data.advance_left();
-    }
-}
-
-template <typename SizeT>
-SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item)
-{
-    auto sbgroup = item.get_sub_group();
-    const auto sg_loc_id = sbgroup.get_local_linear_id();
-
-    const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id);
-    const SizeT id = sg_base_id + sg_loc_id;
-
-    return id;
-}
-
-template <typename SizeT>
-uint32_t get_results_num(const uint32_t wpi,
-                         const SizeT size,
-                         const SizeT global_id,
-                         const sycl::nd_item<1> &item)
-{
-    auto sbgroup = item.get_sub_group();
-
-    const auto sbg_size = sbgroup.get_max_local_range()[0];
-    const auto size_ = sycl::sub_sat(size, global_id);
-    return std::min(SizeT(wpi), CeilDiv(size_, sbg_size));
-}
-
-template <uint32_t WorkPI,
-          typename T,
-          typename SizeT,
-          typename Op,
-          typename Red>
-class sliding_window1d_kernel;
-
-template <uint32_t WorkPI,
+template <std::uint32_t WorkPI,
           typename T,
           typename SizeT,
           typename Op,
@@ -491,77 +433,16 @@ void submit_sliding_window1d(const PaddedSpan<const T, SizeT> &a,
                              sycl::nd_range<1> nd_range,
                              sycl::handler &cgh)
 {
-    cgh.parallel_for<sliding_window1d_kernel<WorkPI, T, SizeT, Op, Red>>(
-        nd_range, [=](sycl::nd_item<1> item) {
-            auto glid = get_global_linear_id<SizeT>(WorkPI, item);
-
-            auto results = RegistryData<T, WorkPI>(item);
-            results.fill(0);
-
-            auto results_num = get_results_num(WorkPI, out.size(), glid, item);
-
-            const auto *a_begin = a.begin();
-            const auto *a_end = a.end();
+    using SlidingWindow1dKernel =
+        dpnp::kernels::sliding_window1d::SlidingWindow1dFunctor<
+            WorkPI, PaddedSpan<const T, SizeT>, Span<const T, SizeT>, Op, Red,
+            Span<T, SizeT>, RegistryData, RegistryWindow>;
 
-            auto sbgroup = item.get_sub_group();
-
-            const auto chunks_count =
-                CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
-
-            const auto *a_ptr = &a.padded_begin()[glid];
-
-            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
-                return ptr >= a_begin && ptr < a_end;
-            };
-
-            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
-            a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
-
-            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
-            auto v_size = v.size();
-
-            for (uint32_t b = 0; b < chunks_count; ++b) {
-                auto v_data = RegistryData<const T>(item);
-                v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
-
-                uint32_t chunk_size_ =
-                    std::min(v_size, SizeT(v_data.total_size()));
-                process_block(results, results_num, a_data, v_data, chunk_size_,
-                              op, red);
-
-                if (b != chunks_count - 1) {
-                    a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
-                                             _a_load_cond, 0);
-                    v_size -= v_data.total_size();
-                }
-            }
-
-            auto *const out_ptr = out.begin();
-            // auto *const out_end = out.end();
-
-            auto y_start = glid;
-            auto y_stop =
-                std::min(y_start + WorkPI * results.size_x(), out.size());
-            uint32_t i = 0;
-            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
-                out_ptr[y] = results[i++];
-            }
-            // while the code itself seems to be valid, inside correlate
-            // kernel it results in memory corruption. Further investigation
-            // is needed. SAT-7693
-            // corruption results.store(&out_ptr[glid],
-            //               [out_end](auto &&ptr) { return ptr < out_end; });
-        });
+    cgh.parallel_for<SlidingWindow1dKernel>(
+        nd_range, SlidingWindow1dKernel(a, v, op, red, out));
 }
 
-template <uint32_t WorkPI,
-          typename T,
-          typename SizeT,
-          typename Op,
-          typename Red>
-class sliding_window1d_small_kernel;
-
-template <uint32_t WorkPI,
+template <std::uint32_t WorkPI,
           typename T,
           typename SizeT,
           typename Op,
@@ -574,61 +455,18 @@ void submit_sliding_window1d_small_kernel(const PaddedSpan<const T, SizeT> &a,
                                           sycl::nd_range<1> nd_range,
                                           sycl::handler &cgh)
 {
-    cgh.parallel_for<sliding_window1d_small_kernel<WorkPI, T, SizeT, Op, Red>>(
-        nd_range, [=](sycl::nd_item<1> item) {
-            auto glid = get_global_linear_id<SizeT>(WorkPI, item);
-
-            auto results = RegistryData<T, WorkPI>(item);
-            results.fill(0);
-
-            auto sbgroup = item.get_sub_group();
-            auto sg_size = sbgroup.get_max_local_range()[0];
-
-            const uint32_t to_read = WorkPI * sg_size + v.size();
-            const auto *a_begin = a.begin();
-
-            const auto *a_ptr = &a.padded_begin()[glid];
-            const auto *a_end = std::min(a_ptr + to_read, a.end());
-
-            auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
-                return ptr >= a_begin && ptr < a_end;
-            };
+    using SlidingWindow1dSmallKernel =
+        dpnp::kernels::sliding_window1d::SlidingWindow1dSmallFunctor<
+            WorkPI, PaddedSpan<const T, SizeT>, Span<const T, SizeT>, Op, Red,
+            Span<T, SizeT>, RegistryData, RegistryWindow>;
 
-            auto a_data = RegistryWindow<const T, WorkPI + 1>(item);
-            a_data.load(a_ptr, _a_load_cond, 0);
-
-            const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
-            auto v_size = v.size();
-
-            auto v_data = RegistryData<const T>(item);
-            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
-
-            auto results_num = get_results_num(WorkPI, out.size(), glid, item);
-
-            process_block(results, results_num, a_data, v_data, v_size, op,
-                          red);
-
-            auto *const out_ptr = out.begin();
-            // auto *const out_end = out.end();
-
-            auto y_start = glid;
-            auto y_stop =
-                std::min(y_start + WorkPI * results.size_x(), out.size());
-            uint32_t i = 0;
-            for (uint32_t y = y_start; y < y_stop; y += results.size_x()) {
-                out_ptr[y] = results[i++];
-            }
-            // while the code itself seems to be valid, inside correlate
-            // kernel it results in memory corruption. Further investigation
-            // is needed. SAT-7693
-            // corruption results.store(&out_ptr[glid],
-            //               [out_end](auto &&ptr) { return ptr < out_end; });
-        });
+    cgh.parallel_for<SlidingWindow1dSmallKernel>(
+        nd_range, SlidingWindow1dSmallKernel(a, v, op, red, out));
 }
 
 void validate(const usm_ndarray &a,
               const usm_ndarray &v,
               const usm_ndarray &out,
-              const size_t l_pad,
-              const size_t r_pad);
+              const std::size_t l_pad,
+              const std::size_t r_pad);
 } // namespace statistics::sliding_window1d
diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
index fca8c43f816e..8830569ce9cf 100644
--- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
+++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp
@@ -41,40 +41,29 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "kernels/elementwise_functions/interpolate.hpp"
+
 // dpctl tensor headers
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
 
-#include "kernels/elementwise_functions/interpolate.hpp"
-
 // utils extension headers
 #include "ext/common.hpp"
 #include "ext/validation_utils.hpp"
 
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace type_utils = dpctl::tensor::type_utils;
-
-using ext::common::value_type_of;
-using ext::validation::array_names;
-using ext::validation::array_ptr;
-
-using ext::common::dtype_from_typenum;
-using ext::validation::check_has_dtype;
-using ext::validation::check_num_dims;
-using ext::validation::check_same_dtype;
-using ext::validation::check_same_size;
-using ext::validation::common_checks;
-
 namespace dpnp::extensions::ufunc
 {
+namespace py = pybind11;
 
 namespace impl
 {
-using ext::common::init_dispatch_vector;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace type_utils = dpctl::tensor::type_utils;
 
 template <typename T>
-using value_type_of_t = typename value_type_of<T>::type;
+using value_type_of_t = typename ext::common::value_type_of<T>::type;
+
+using ext::common::dtype_from_typenum;
 
 typedef sycl::event (*interpolate_fn_ptr_t)(sycl::queue &,
                                             const void *,      // x
@@ -88,8 +77,10 @@ typedef sycl::event (*interpolate_fn_ptr_t)(sycl::queue &,
                                             const std::size_t, // xp_size
                                             const std::vector<sycl::event> &);
 
+interpolate_fn_ptr_t interpolate_dispatch_vector[td_ns::num_types];
+
 template <typename T, typename TIdx = std::int64_t>
-sycl::event interpolate_call(sycl::queue &exec_q,
+sycl::event interpolate_impl(sycl::queue &q,
                              const void *vx,
                              const void *vidx,
                              const void *vxp,
@@ -101,6 +92,8 @@ sycl::event interpolate_call(sycl::queue &exec_q,
                              const std::size_t xp_size,
                              const std::vector<sycl::event> &depends)
 {
+    dpctl::tensor::type_utils::validate_type_for_device<T>(q);
+
     using type_utils::is_complex_v;
     using TCoord = std::conditional_t<is_complex_v<T>, value_type_of_t<T>, T>;
 
@@ -112,23 +105,69 @@ sycl::event interpolate_call(sycl::queue &exec_q,
     const T *right = static_cast<const T *>(vright);
     T *out = static_cast<T *>(vout);
 
-    using dpnp::kernels::interpolate::interpolate_impl;
-    sycl::event interpolate_ev = interpolate_impl<TCoord, T>(
-        exec_q, x, idx, xp, fp, left, right, out, n, xp_size, depends);
+    sycl::event interpolate_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using InterpolateFunc =
+            dpnp::kernels::interpolate::InterpolateFunctor<TCoord, T>;
+
+        cgh.parallel_for<InterpolateFunc>(
+            sycl::range<1>(n),
+            InterpolateFunc(x, idx, xp, fp, left, right, out, xp_size));
+    });
 
     return interpolate_ev;
 }
 
-interpolate_fn_ptr_t interpolate_dispatch_vector[td_ns::num_types];
+/**
+ * @brief A type map listing the data types for which the interpolate
+ * function is available; other types fall through to the `void` default.
+ *
+ * @tparam T Data type of the `fp` values and of the output array `out`.
+ */
+template <typename T>
+struct InterpolateOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
 
-void common_interpolate_checks(
-    const dpctl::tensor::usm_ndarray &x,
-    const dpctl::tensor::usm_ndarray &idx,
-    const dpctl::tensor::usm_ndarray &xp,
-    const dpctl::tensor::usm_ndarray &fp,
-    const dpctl::tensor::usm_ndarray &out,
-    const std::optional<const dpctl::tensor::usm_ndarray> &left,
-    const std::optional<const dpctl::tensor::usm_ndarray> &right)
+template <typename fnT, typename T>
+struct InterpolateFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename InterpolateOutputType<T>::value_type,
+                          void>) {
+            return nullptr; // no interpolate implementation for this T
+        }
+        else {
+            return interpolate_impl<T>; // TIdx keeps its std::int64_t default
+        }
+    }
+};
+
+namespace detail
+{
+using ext::validation::array_names;
+using ext::validation::check_has_dtype;
+using ext::validation::check_num_dims;
+using ext::validation::check_same_dtype;
+using ext::validation::check_same_size;
+using ext::validation::common_checks;
+
+void validate(const dpctl::tensor::usm_ndarray &x,
+              const dpctl::tensor::usm_ndarray &idx,
+              const dpctl::tensor::usm_ndarray &xp,
+              const dpctl::tensor::usm_ndarray &fp,
+              const dpctl::tensor::usm_ndarray &out,
+              const std::optional<const dpctl::tensor::usm_ndarray> &left,
+              const std::optional<const dpctl::tensor::usm_ndarray> &right)
 {
     array_names names = {{&x, "x"}, {&xp, "xp"}, {&fp, "fp"}, {&out, "out"}};
 
@@ -158,6 +197,7 @@ void common_interpolate_checks(
         throw py::value_error("array of sample points is empty");
     }
 }
+} // namespace detail
 
 std::pair<sycl::event, sycl::event>
     py_interpolate(const dpctl::tensor::usm_ndarray &x,
@@ -170,7 +210,7 @@ std::pair<sycl::event, sycl::event>
                    sycl::queue &exec_q,
                    const std::vector<sycl::event> &depends)
 {
-    common_interpolate_checks(x, idx, xp, fp, out, left, right);
+    detail::validate(x, idx, xp, fp, out, left, right);
 
     int out_typenum = out.get_typenum();
 
@@ -214,56 +254,21 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(args_ev, ev);
 }
 
-/**
- * @brief A factory to define pairs of supported types for which
- * interpolate function is available.
- *
- * @tparam T Type of input vector `a` and of result vector `y`.
- */
-template <typename T>
-struct InterpolateOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-};
-
-template <typename fnT, typename T>
-struct InterpolateFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_same_v<
-                          typename InterpolateOutputType<T>::value_type,
-                          void>) {
-            return nullptr;
-        }
-        else {
-            return interpolate_call<T>;
-        }
-    }
-};
-
 static void init_interpolate_dispatch_vectors()
 {
-    init_dispatch_vector<interpolate_fn_ptr_t, InterpolateFactory>(
+    using ext::common::init_dispatch_vector;
+    init_dispatch_vector<interpolate_fn_ptr_t, impl::InterpolateFactory>(
         interpolate_dispatch_vector);
 }
-
 } // namespace impl
 
 void init_interpolate(py::module_ m)
 {
     impl::init_interpolate_dispatch_vectors();
 
-    using impl::py_interpolate;
-    m.def("_interpolate", &py_interpolate, "", py::arg("x"), py::arg("idx"),
-          py::arg("xp"), py::arg("fp"), py::arg("left"), py::arg("right"),
-          py::arg("out"), py::arg("sycl_queue"),
+    m.def("_interpolate", &impl::py_interpolate, "", py::arg("x"),
+          py::arg("idx"), py::arg("xp"), py::arg("fp"), py::arg("left"),
+          py::arg("right"), py::arg("out"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 }
-
 } // namespace dpnp::extensions::ufunc
diff --git a/dpnp/backend/extensions/window/common.hpp b/dpnp/backend/extensions/window/common.hpp
index cb084e972d78..9e7b1192e3a2 100644
--- a/dpnp/backend/extensions/window/common.hpp
+++ b/dpnp/backend/extensions/window/common.hpp
@@ -28,11 +28,18 @@
 
 #pragma once
 
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
+#include <cstddef>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
 #include <sycl/sycl.hpp>
 
 #include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
 // dpctl tensor headers
 #include "utils/output_validation.hpp"
@@ -41,10 +48,8 @@
 
 namespace dpnp::extensions::window
 {
-
-namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
-
 namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
 
 typedef sycl::event (*window_fn_ptr_t)(sycl::queue &,
                                        char *,
@@ -72,6 +77,20 @@ sycl::event window_impl(sycl::queue &exec_q,
     return window_ev;
 }
 
+template <typename fnT, typename T, template <typename> typename FunctorT>
+struct Factory
+{
+    fnT get()
+    {
+        if constexpr (std::is_floating_point_v<T>) {
+            return window_impl<T, FunctorT>; // floating-point types only
+        }
+        else {
+            return nullptr; // no window kernel for non-floating-point T
+        }
+    }
+};
+
 template <typename funcPtrT>
 std::tuple<size_t, char *, funcPtrT>
     window_fn(sycl::queue &exec_q,
@@ -101,7 +120,7 @@ std::tuple<size_t, char *, funcPtrT>
     }
 
     const int result_typenum = result.get_typenum();
-    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    auto array_types = td_ns::usm_ndarray_types();
     const int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
     funcPtrT fn = window_dispatch_vector[result_type_id];
 
diff --git a/dpnp/backend/extensions/window/kaiser.cpp b/dpnp/backend/extensions/window/kaiser.cpp
index b83f88f69a9b..e5c1aa837a64 100644
--- a/dpnp/backend/extensions/window/kaiser.cpp
+++ b/dpnp/backend/extensions/window/kaiser.cpp
@@ -26,26 +26,24 @@
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
-#include "kaiser.hpp"
+#include <sycl/sycl.hpp>
+
 #include "common.hpp"
+#include "kaiser.hpp"
+
+#include "kernels/window/kaiser.hpp"
 
 // utils extension header
 #include "ext/common.hpp"
 
 // dpctl tensor headers
-#include "utils/output_validation.hpp"
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
 
-#include <sycl/sycl.hpp>
-
-#include "kernels/elementwise_functions/i0.hpp"
-
 namespace dpnp::extensions::window
 {
-namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
-
-using ext::common::init_dispatch_vector;
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
 
 typedef sycl::event (*kaiser_fn_ptr_t)(sycl::queue &,
                                        char *,
@@ -53,34 +51,10 @@ typedef sycl::event (*kaiser_fn_ptr_t)(sycl::queue &,
                                        const py::object &,
                                        const std::vector<sycl::event> &);
 
-static kaiser_fn_ptr_t kaiser_dispatch_vector[dpctl_td_ns::num_types];
+static kaiser_fn_ptr_t kaiser_dispatch_vector[td_ns::num_types];
 
-template <typename T>
-class KaiserFunctor
+namespace impl
 {
-private:
-    T *res = nullptr;
-    const std::size_t N;
-    const T beta;
-
-public:
-    KaiserFunctor(T *res, const std::size_t N, const T beta)
-        : res(res), N(N), beta(beta)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        using dpnp::kernels::i0::cyl_bessel_i0;
-
-        const auto i = id.get(0);
-        const T alpha = (N - 1) / T(2);
-        const T tmp = (i - alpha) / alpha;
-        res[i] = cyl_bessel_i0(beta * sycl::sqrt(1 - tmp * tmp)) /
-                 cyl_bessel_i0(beta);
-    }
-};
-
 template <typename T>
 sycl::event kaiser_impl(sycl::queue &exec_q,
                         char *result,
@@ -96,7 +70,7 @@ sycl::event kaiser_impl(sycl::queue &exec_q,
     sycl::event kaiser_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
 
-        using KaiserKernel = KaiserFunctor<T>;
+        using KaiserKernel = dpnp::kernels::kaiser::KaiserFunctor<T>;
         cgh.parallel_for<KaiserKernel>(sycl::range<1>(nelems),
                                        KaiserKernel(res, nelems, beta));
     });
@@ -117,6 +91,7 @@ struct KaiserFactory
         }
     }
 };
+} // namespace impl
 
 std::pair<sycl::event, sycl::event>
     py_kaiser(sycl::queue &exec_q,
@@ -141,8 +116,8 @@ std::pair<sycl::event, sycl::event>
 
 void init_kaiser_dispatch_vectors()
 {
-    init_dispatch_vector<kaiser_fn_ptr_t, KaiserFactory>(
+    using ext::common::init_dispatch_vector;
+    init_dispatch_vector<kaiser_fn_ptr_t, impl::KaiserFactory>(
         kaiser_dispatch_vector);
 }
-
 } // namespace dpnp::extensions::window
diff --git a/dpnp/backend/extensions/window/kaiser.hpp b/dpnp/backend/extensions/window/kaiser.hpp
index 0a4712cc594e..4ba506620db2 100644
--- a/dpnp/backend/extensions/window/kaiser.hpp
+++ b/dpnp/backend/extensions/window/kaiser.hpp
@@ -28,11 +28,15 @@
 
 #pragma once
 
-#include <dpctl4pybind11.hpp>
 #include <sycl/sycl.hpp>
 
+#include <dpctl4pybind11.hpp>
+#include <pybind11/pybind11.h>
+
 namespace dpnp::extensions::window
 {
+namespace py = pybind11;
+
 extern std::pair<sycl::event, sycl::event>
     py_kaiser(sycl::queue &exec_q,
               const py::object &beta,
@@ -40,5 +44,4 @@ extern std::pair<sycl::event, sycl::event>
               const std::vector<sycl::event> &depends);
 
 extern void init_kaiser_dispatch_vectors(void);
-
 } // namespace dpnp::extensions::window
diff --git a/dpnp/backend/extensions/window/window_py.cpp b/dpnp/backend/extensions/window/window_py.cpp
index 2b8090c40cca..5ae80f4027b5 100644
--- a/dpnp/backend/extensions/window/window_py.cpp
+++ b/dpnp/backend/extensions/window/window_py.cpp
@@ -33,11 +33,12 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
-#include "bartlett.hpp"
-#include "blackman.hpp"
+#include "kernels/window/bartlett.hpp"
+#include "kernels/window/blackman.hpp"
+#include "kernels/window/hamming.hpp"
+#include "kernels/window/hanning.hpp"
+
 #include "common.hpp"
-#include "hamming.hpp"
-#include "hanning.hpp"
 #include "kaiser.hpp"
 
 // utils extension header
@@ -51,6 +52,22 @@ using window_ns::window_fn_ptr_t;
 
 namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 
+template <typename fnT, typename T>
+using BartlettFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::bartlett::BartlettFunctor>;
+
+template <typename fnT, typename T>
+using BlackmanFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::blackman::BlackmanFunctor>;
+
+template <typename fnT, typename T>
+using HammingFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::hamming::HammingFunctor>;
+
+template <typename fnT, typename T>
+using HanningFactory =
+    window_ns::Factory<fnT, T, dpnp::kernels::hanning::HanningFunctor>;
+
 static window_fn_ptr_t bartlett_dispatch_vector[dpctl_td_ns::num_types];
 static window_fn_ptr_t blackman_dispatch_vector[dpctl_td_ns::num_types];
 static window_fn_ptr_t hamming_dispatch_vector[dpctl_td_ns::num_types];
@@ -62,8 +79,7 @@ PYBIND11_MODULE(_window_impl, m)
     using event_vecT = std::vector<sycl::event>;
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::BartlettFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, BartlettFactory>(
             bartlett_dispatch_vector);
 
         auto bartlett_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
@@ -78,8 +94,7 @@ PYBIND11_MODULE(_window_impl, m)
     }
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::BlackmanFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, BlackmanFactory>(
             blackman_dispatch_vector);
 
         auto blackman_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
@@ -94,8 +109,7 @@ PYBIND11_MODULE(_window_impl, m)
     }
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::HammingFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, HammingFactory>(
             hamming_dispatch_vector);
 
         auto hamming_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
@@ -110,8 +124,7 @@ PYBIND11_MODULE(_window_impl, m)
     }
 
     {
-        init_dispatch_vector<window_ns::window_fn_ptr_t,
-                             window_ns::kernels::HanningFactory>(
+        init_dispatch_vector<window_ns::window_fn_ptr_t, HanningFactory>(
             hanning_dispatch_vector);
 
         auto hanning_pyapi = [&](sycl::queue &exec_q, const arrayT &result,
diff --git a/dpnp/backend/kernels/elementwise_functions/interpolate.hpp b/dpnp/backend/kernels/elementwise_functions/interpolate.hpp
index ef38157b00e9..c85dafea24b0 100644
--- a/dpnp/backend/kernels/elementwise_functions/interpolate.hpp
+++ b/dpnp/backend/kernels/elementwise_functions/interpolate.hpp
@@ -28,67 +28,79 @@
 
 #pragma once
 
+#include <cstddef>
+#include <cstdint>
+
 #include <sycl/sycl.hpp>
-#include <vector>
 
 #include "ext/common.hpp"
 
-using ext::common::IsNan;
-
 namespace dpnp::kernels::interpolate
 {
+using ext::common::IsNan;
+
 template <typename TCoord, typename TValue, typename TIdx = std::int64_t>
-sycl::event interpolate_impl(sycl::queue &q,
-                             const TCoord *x,
-                             const TIdx *idx,
-                             const TCoord *xp,
-                             const TValue *fp,
-                             const TValue *left,
-                             const TValue *right,
-                             TValue *out,
-                             const std::size_t n,
-                             const std::size_t xp_size,
-                             const std::vector<sycl::event> &depends)
+class InterpolateFunctor
 {
+private:
+    const TCoord *x = nullptr;
+    const TIdx *idx = nullptr;
+    const TCoord *xp = nullptr;
+    const TValue *fp = nullptr;
+    const TValue *left = nullptr;
+    const TValue *right = nullptr;
+    TValue *out = nullptr;
+    const std::size_t xp_size;
+
+public:
+    InterpolateFunctor(const TCoord *x_,
+                       const TIdx *idx_,
+                       const TCoord *xp_,
+                       const TValue *fp_,
+                       const TValue *left_,
+                       const TValue *right_,
+                       TValue *out_,
+                       const std::size_t xp_size_)
+        : x(x_), idx(idx_), xp(xp_), fp(fp_), left(left_), right(right_),
+          out(out_), xp_size(xp_size_)
+    {
+    }
+
     // Selected over the work-group version
     // due to simpler execution and slightly better performance.
-    return q.submit([&](sycl::handler &h) {
-        h.depends_on(depends);
-        h.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
-            TValue left_val = left ? *left : fp[0];
-            TValue right_val = right ? *right : fp[xp_size - 1];
+    void operator()(sycl::id<1> id) const
+    {
+        TValue left_val = left ? *left : fp[0];
+        TValue right_val = right ? *right : fp[xp_size - 1];
 
-            TCoord x_val = x[i];
-            TIdx x_idx = idx[i] - 1;
+        TCoord x_val = x[id];
+        TIdx x_idx = idx[id] - 1;
 
-            if (IsNan<TCoord>::isnan(x_val)) {
-                out[i] = x_val;
-            }
-            else if (x_idx < 0) {
-                out[i] = left_val;
-            }
-            else if (x_val == xp[xp_size - 1]) {
-                out[i] = fp[xp_size - 1];
-            }
-            else if (x_idx >= static_cast<TIdx>(xp_size - 1)) {
-                out[i] = right_val;
-            }
-            else {
-                TValue slope =
-                    (fp[x_idx + 1] - fp[x_idx]) / (xp[x_idx + 1] - xp[x_idx]);
-                TValue res = slope * (x_val - xp[x_idx]) + fp[x_idx];
+        if (IsNan<TCoord>::isnan(x_val)) {
+            out[id] = x_val;
+        }
+        else if (x_idx < 0) {
+            out[id] = left_val;
+        }
+        else if (x_val == xp[xp_size - 1]) {
+            out[id] = fp[xp_size - 1];
+        }
+        else if (x_idx >= static_cast<TIdx>(xp_size - 1)) {
+            out[id] = right_val;
+        }
+        else {
+            TValue slope =
+                (fp[x_idx + 1] - fp[x_idx]) / (xp[x_idx + 1] - xp[x_idx]);
+            TValue res = slope * (x_val - xp[x_idx]) + fp[x_idx];
 
-                if (IsNan<TValue>::isnan(res)) {
-                    res = slope * (x_val - xp[x_idx + 1]) + fp[x_idx + 1];
-                    if (IsNan<TValue>::isnan(res) &&
-                        (fp[x_idx] == fp[x_idx + 1])) {
-                        res = fp[x_idx];
-                    }
+            if (IsNan<TValue>::isnan(res)) {
+                res = slope * (x_val - xp[x_idx + 1]) + fp[x_idx + 1];
+                if (IsNan<TValue>::isnan(res) && (fp[x_idx] == fp[x_idx + 1])) {
+                    res = fp[x_idx];
                 }
-                out[i] = res;
             }
-        });
-    });
-}
-
+            out[id] = res;
+        }
+    }
+};
 } // namespace dpnp::kernels::interpolate
diff --git a/dpnp/backend/kernels/indexing/choose.hpp b/dpnp/backend/kernels/indexing/choose.hpp
new file mode 100644
index 000000000000..49b71d05c96b
--- /dev/null
+++ b/dpnp/backend/kernels/indexing/choose.hpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "utils/strided_iters.hpp"
+
+namespace dpnp::kernels::choose
+{
+using dpctl::tensor::ssize_t;
+
+template <typename ProjectorT,
+          typename IndOutIndexerT,
+          typename ChoicesIndexerT,
+          typename IndT,
+          typename T>
+class ChooseFunctor
+{
+private:
+    const IndT *ind = nullptr; // choice indices, read at ind_offset
+    T *dst = nullptr; // output, written at dst_offset
+    char **chcs = nullptr; // chcs[k] is reinterpreted as T* for choice k
+    ssize_t n_chcs; // bound handed to the projector
+    const IndOutIndexerT ind_out_indexer; // id -> (ind, dst) offsets
+    const ChoicesIndexerT chcs_indexer; // (id, choice) -> choice offset
+
+public:
+    ChooseFunctor(const IndT *ind_,
+                  T *dst_,
+                  char **chcs_,
+                  ssize_t n_chcs_,
+                  const IndOutIndexerT &ind_out_indexer_,
+                  const ChoicesIndexerT &chcs_indexer_)
+        : ind(ind_), dst(dst_), chcs(chcs_), n_chcs(n_chcs_),
+          ind_out_indexer(ind_out_indexer_), chcs_indexer(chcs_indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        const ProjectorT proj{}; // default-constructed in every work-item
+
+        ssize_t i = id[0];
+
+        auto ind_dst_offsets = ind_out_indexer(i);
+        ssize_t ind_offset = ind_dst_offsets.get_first_offset();
+        ssize_t dst_offset = ind_dst_offsets.get_second_offset();
+
+        IndT chc_idx = ind[ind_offset];
+        // proj produces an index in the range of n_chcs
+        ssize_t projected_idx = proj(n_chcs, chc_idx);
+
+        ssize_t chc_offset = chcs_indexer(i, projected_idx);
+
+        T *chc = reinterpret_cast<T *>(chcs[projected_idx]);
+
+        dst[dst_offset] = chc[chc_offset]; // copy the selected element
+    }
+};
+
+namespace strides
+{
+using dpctl::tensor::strides::CIndexer_vector;
+
+struct NthStrideOffsetUnpacked
+{
+    NthStrideOffsetUnpacked(int common_nd,
+                            ssize_t const *_offsets,
+                            ssize_t const *_shape,
+                            ssize_t const *_strides)
+        : _ind(common_nd), nd(common_nd), offsets(_offsets), shape(_shape),
+          strides(_strides)
+    {
+    }
+
+    template <typename nT>
+    size_t operator()(ssize_t gid, nT n) const
+    {
+        ssize_t relative_offset(0);
+        _ind.get_displacement<const ssize_t *, const ssize_t *>(
+            gid, shape, strides + (n * nd), relative_offset);
+
+        // NOTE(review): ssize_t sum is returned as size_t - confirm intent
+        return relative_offset + offsets[n];
+    }
+
+private:
+    CIndexer_vector<ssize_t> _ind;
+
+    int nd;
+    ssize_t const *offsets; // indexed by n
+    ssize_t const *shape; // same shape is used for every n
+    ssize_t const *strides; // nd entries per n, stored back to back
+};
+
+static_assert(sycl::is_device_copyable_v<NthStrideOffsetUnpacked>);
+
+} // namespace strides
+} // namespace dpnp::kernels::choose
diff --git a/dpnp/backend/kernels/statistics/histogram.hpp b/dpnp/backend/kernels/statistics/histogram.hpp
new file mode 100644
index 000000000000..6d0fedbe0bc3
--- /dev/null
+++ b/dpnp/backend/kernels/statistics/histogram.hpp
@@ -0,0 +1,99 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <sycl/sycl.hpp>
+
+namespace dpnp::kernels::histogram
+{
+template <typename T, typename HistImpl, typename Edges, typename Weights>
+class HistogramFunctor
+{
+private:
+    const T *in = nullptr; // samples; sample k starts at in[k * dims]
+    const std::size_t size; // upper bound checked against data_idx
+    const std::size_t dims; // stride between consecutive samples in `in`
+    const std::uint32_t WorkPI; // loop iterations done by each work-item
+    const HistImpl hist;
+    const Edges edges;
+    const Weights weights;
+
+public:
+    HistogramFunctor(const T *in_,
+                     const std::size_t size_,
+                     const std::size_t dims_,
+                     const std::uint32_t WorkPI_,
+                     const HistImpl &hist_,
+                     const Edges &edges_,
+                     const Weights &weights_)
+        : in(in_), size(size_), dims(dims_), WorkPI(WorkPI_), hist(hist_),
+          edges(edges_), weights(weights_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> item) const
+    {
+        auto id = item.get_group_linear_id();
+        auto lid = item.get_local_linear_id();
+        auto group = item.get_group();
+        auto local_size = item.get_local_range(0);
+
+        hist.init(item);
+        edges.init(item);
+
+        if constexpr (HistImpl::sync_after_init || Edges::sync_after_init) {
+            sycl::group_barrier(group, sycl::memory_scope::work_group);
+        }
+
+        auto bounds = edges.get_bounds();
+
+        for (std::uint32_t i = 0; i < WorkPI; ++i) {
+            auto data_idx = id * WorkPI * local_size + i * local_size + lid;
+            if (data_idx < size) { // skip indices at or beyond `size`
+                auto *d = &in[data_idx * dims];
+
+                if (edges.in_bounds(d, bounds)) { // others are not counted
+                    auto bin = edges.get_bin(item, d, bounds);
+                    auto weight = weights.get(data_idx);
+                    hist.add(item, bin, weight);
+                }
+            }
+        }
+
+        if constexpr (HistImpl::sync_before_finalize) {
+            sycl::group_barrier(group, sycl::memory_scope::work_group);
+        }
+
+        hist.finalize(item);
+    }
+};
+} // namespace dpnp::kernels::histogram
diff --git a/dpnp/backend/kernels/statistics/sliding_window1d.hpp b/dpnp/backend/kernels/statistics/sliding_window1d.hpp
new file mode 100644
index 000000000000..5b3c5535afd4
--- /dev/null
+++ b/dpnp/backend/kernels/statistics/sliding_window1d.hpp
@@ -0,0 +1,274 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include <sycl/sycl.hpp>
+
+#include "ext/common.hpp"
+
+namespace dpnp::kernels::sliding_window1d
+{
+using ext::common::CeilDiv;
+
+namespace detail
+{
+template <typename SizeT>
+SizeT get_global_linear_id(const std::uint32_t wpi,
+                           const sycl::nd_item<1> &item)
+{
+    // Position of this work-item inside its sub-group.
+    const auto lane = item.get_sub_group().get_local_linear_id();
+
+    // Global linear id with the lane offset removed, scaled by `wpi`.
+    const SizeT scaled_base = wpi * (item.get_global_linear_id() - lane);
+
+    return scaled_base + lane;
+}
+
+template <typename SizeT>
+std::uint32_t get_results_num(const std::uint32_t wpi,
+                              const SizeT size,
+                              const SizeT global_id,
+                              const sycl::nd_item<1> &item)
+{
+    // Saturating `size - global_id`: what is left starting at `global_id`.
+    const auto remaining = sycl::sub_sat(size, global_id);
+    const auto stride = item.get_sub_group().get_max_local_range()[0];
+    // One result per `stride` remaining elements, capped at `wpi`.
+    return std::min(SizeT(wpi), CeilDiv(remaining, stride));
+}
+
+template <typename Results,
+          typename AData,
+          typename VData,
+          typename Op,
+          typename Red>
+void process_block(Results &results,
+                   std::uint32_t r_size,
+                   AData &a_data,
+                   VData &v_data,
+                   std::uint32_t block_size,
+                   Op op,
+                   Red red)
+{
+    for (std::uint32_t step = 0; step != block_size; ++step) {
+        auto v_elem = v_data.broadcast(step); // v_data value for this step
+        for (std::uint32_t slot = 0; slot != r_size; ++slot) {
+            results[slot] = red(results[slot], op(a_data[slot], v_elem));
+        }
+        a_data.advance_left(); // advance a_data before the next step
+    }
+}
+} // namespace detail
+
+template <std::uint32_t WorkPI,
+          typename SpanT,
+          typename KernelT,
+          typename OpT,
+          typename RedT,
+          typename ResultT,
+          template <typename, std::uint32_t> class RegistryDataT,
+          template <typename, std::uint32_t> class RegistryWindowT>
+class SlidingWindow1dFunctor
+{
+private:
+    const SpanT a;   // input; read via padded_begin()/begin()/end()
+    const KernelT v; // window values, consumed chunk by chunk in operator()
+    const OpT op;    // combines one input element with one window value
+    const RedT red;  // folds op() results into the running result
+    ResultT out;     // destination of the reduced values
+
+    static constexpr std::uint32_t default_reg_data_size = 1;
+    using SizeT = typename SpanT::size_type;
+
+public:
+    SlidingWindow1dFunctor(const SpanT &a_,
+                           const KernelT &v_,
+                           const OpT &op_,
+                           const RedT &red_,
+                           ResultT &out_)
+        : a(a_), v(v_), op(op_), red(red_), out(out_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> item) const
+    {
+        auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
+
+        auto results =
+            RegistryDataT<typename ResultT::value_type, WorkPI>(item);
+        results.fill(0);
+
+        auto results_num =
+            detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
+
+        const auto *a_begin = a.begin();
+        const auto *a_end = a.end();
+
+        auto sbgroup = item.get_sub_group();
+
+        const auto chunks_count =
+            CeilDiv(v.size(), sbgroup.get_max_local_range()[0]);
+
+        const auto *a_ptr = &a.padded_begin()[glid];
+
+        auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
+            return ptr >= a_begin && ptr < a_end; // only [begin, end) is read
+        };
+
+        auto a_data =
+            RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
+        a_ptr = a_data.load(a_ptr, _a_load_cond, 0);
+
+        const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
+        auto v_size = v.size();
+
+        for (std::uint32_t b = 0; b < chunks_count; ++b) {
+            auto v_data = RegistryDataT<typename KernelT::value_type,
+                                        default_reg_data_size>(item);
+            v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
+
+            std::uint32_t chunk_size_ =
+                std::min(v_size, SizeT(v_data.total_size()));
+            detail::process_block(results, results_num, a_data, v_data,
+                                  chunk_size_, op, red);
+
+            if (b != chunks_count - 1) { // refill only if a chunk follows
+                a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr,
+                                         _a_load_cond, 0);
+                v_size -= v_data.total_size();
+            }
+        }
+
+        auto *const out_ptr = out.begin();
+        // auto *const out_end = out.end();
+
+        auto y_start = glid; // SizeT: the output index must not be narrowed
+        auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
+        std::uint32_t i = 0;
+        for (auto y = y_start; y < y_stop; y += results.size_x()) {
+            out_ptr[y] = results[i++];
+        }
+        // NOTE: the element-wise loop above replaces the store() call
+        // below. The call looks valid, but inside the correlate kernel
+        // it results in memory corruption; see SAT-7693.
+        // results.store(&out_ptr[glid],
+        //               [out_end](auto &&ptr) { return ptr < out_end; });
+    }
+};
+
+template <std::uint32_t WorkPI,
+          typename SpanT,
+          typename KernelT,
+          typename OpT,
+          typename RedT,
+          typename ResultT,
+          template <typename, std::uint32_t> class RegistryDataT,
+          template <typename, std::uint32_t> class RegistryWindowT>
+class SlidingWindow1dSmallFunctor
+{
+private:
+    const SpanT a;   // input; read via padded_begin()/begin()/end()
+    const KernelT v; // window values, loaded once into v_data
+    const OpT op;    // combines one input element with one window value
+    const RedT red;  // folds op() results into the running result
+    ResultT out;     // destination of the reduced values
+
+    static constexpr std::uint32_t default_reg_data_size = 1;
+    using SizeT = typename SpanT::size_type;
+
+public:
+    SlidingWindow1dSmallFunctor(const SpanT &a_,
+                                const KernelT &v_,
+                                const OpT &op_,
+                                const RedT &red_,
+                                ResultT &out_)
+        : a(a_), v(v_), op(op_), red(red_), out(out_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> item) const
+    {
+        auto glid = detail::get_global_linear_id<SizeT>(WorkPI, item);
+
+        auto results =
+            RegistryDataT<typename ResultT::value_type, WorkPI>(item);
+        results.fill(0);
+
+        auto sbgroup = item.get_sub_group();
+        auto sg_size = sbgroup.get_max_local_range()[0];
+
+        const std::uint32_t to_read = WorkPI * sg_size + v.size();
+        const auto *a_begin = a.begin();
+
+        const auto *a_ptr = &a.padded_begin()[glid];
+        const auto *a_end = std::min(a_ptr + to_read, a.end());
+
+        auto _a_load_cond = [a_begin, a_end](auto &&ptr) {
+            return ptr >= a_begin && ptr < a_end; // only [begin, end) is read
+        };
+
+        auto a_data =
+            RegistryWindowT<typename SpanT::value_type, WorkPI + 1>(item);
+        a_data.load(a_ptr, _a_load_cond, 0);
+
+        const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()];
+        auto v_size = v.size();
+
+        auto v_data =
+            RegistryDataT<typename KernelT::value_type, default_reg_data_size>(
+                item);
+        v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0);
+
+        auto results_num =
+            detail::get_results_num<SizeT>(WorkPI, out.size(), glid, item);
+
+        detail::process_block(results, results_num, a_data, v_data, v_size, op,
+                              red);
+
+        auto *const out_ptr = out.begin();
+        // auto *const out_end = out.end();
+
+        auto y_start = glid; // SizeT: the output index must not be narrowed
+        auto y_stop = std::min(y_start + WorkPI * results.size_x(), out.size());
+        std::uint32_t i = 0;
+        for (auto y = y_start; y < y_stop; y += results.size_x()) {
+            out_ptr[y] = results[i++];
+        }
+        // NOTE: the element-wise loop above replaces the store() call
+        // below. The call looks valid, but inside the correlate kernel
+        // it results in memory corruption; see SAT-7693.
+        // results.store(&out_ptr[glid],
+        //               [out_end](auto &&ptr) { return ptr < out_end; });
+    }
+};
+} // namespace dpnp::kernels::sliding_window1d
diff --git a/dpnp/backend/extensions/window/bartlett.hpp b/dpnp/backend/kernels/window/bartlett.hpp
similarity index 80%
rename from dpnp/backend/extensions/window/bartlett.hpp
rename to dpnp/backend/kernels/window/bartlett.hpp
index 69d3be627c84..20d410150dcb 100644
--- a/dpnp/backend/extensions/window/bartlett.hpp
+++ b/dpnp/backend/kernels/window/bartlett.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -19,7 +19,7 @@
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, RES, OR PROFITS; OR BUSINESS
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::bartlett
 {
-
 template <typename T>
 class BartlettFunctor
 {
@@ -52,19 +52,4 @@ class BartlettFunctor
         res[i] = T(1) - sycl::fabs(i - alpha) / alpha;
     }
 };
-
-template <typename fnT, typename T>
-struct BartlettFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, BartlettFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::bartlett
diff --git a/dpnp/backend/extensions/window/blackman.hpp b/dpnp/backend/kernels/window/blackman.hpp
similarity index 83%
rename from dpnp/backend/extensions/window/blackman.hpp
rename to dpnp/backend/kernels/window/blackman.hpp
index 7a75d226792f..9df7cb8728e2 100644
--- a/dpnp/backend/extensions/window/blackman.hpp
+++ b/dpnp/backend/kernels/window/blackman.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::blackman
 {
-
 template <typename T>
 class BlackmanFunctor
 {
@@ -53,19 +53,4 @@ class BlackmanFunctor
                  T(0.08) * sycl::cospi(T(2) * alpha);
     }
 };
-
-template <typename fnT, typename T>
-struct BlackmanFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, BlackmanFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::blackman
diff --git a/dpnp/backend/extensions/window/hamming.hpp b/dpnp/backend/kernels/window/hamming.hpp
similarity index 83%
rename from dpnp/backend/extensions/window/hamming.hpp
rename to dpnp/backend/kernels/window/hamming.hpp
index 521ebc10c281..895ecb0e588c 100644
--- a/dpnp/backend/extensions/window/hamming.hpp
+++ b/dpnp/backend/kernels/window/hamming.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::hamming
 {
-
 template <typename T>
 class HammingFunctor
 {
@@ -51,19 +51,4 @@ class HammingFunctor
         res[i] = T(0.54) - T(0.46) * sycl::cospi(T(2) * i / (N - 1));
     }
 };
-
-template <typename fnT, typename T>
-struct HammingFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, HammingFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::hamming
diff --git a/dpnp/backend/extensions/window/hanning.hpp b/dpnp/backend/kernels/window/hanning.hpp
similarity index 83%
rename from dpnp/backend/extensions/window/hanning.hpp
rename to dpnp/backend/kernels/window/hanning.hpp
index 612036d6b05a..35b441f921f8 100644
--- a/dpnp/backend/extensions/window/hanning.hpp
+++ b/dpnp/backend/kernels/window/hanning.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
+// Copyright (c) 2026, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
 
 #pragma once
 
-#include "common.hpp"
+#include <cstddef>
+
 #include <sycl/sycl.hpp>
 
-namespace dpnp::extensions::window::kernels
+namespace dpnp::kernels::hanning
 {
-
 template <typename T>
 class HanningFunctor
 {
@@ -51,19 +51,4 @@ class HanningFunctor
         res[i] = T(0.5) - T(0.5) * sycl::cospi(T(2) * i / (N - 1));
     }
 };
-
-template <typename fnT, typename T>
-struct HanningFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_floating_point_v<T>) {
-            return window_impl<T, HanningFunctor>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace dpnp::extensions::window::kernels
+} // namespace dpnp::kernels::hanning
diff --git a/dpnp/backend/kernels/window/kaiser.hpp b/dpnp/backend/kernels/window/kaiser.hpp
new file mode 100644
index 000000000000..ce8c8e52fd18
--- /dev/null
+++ b/dpnp/backend/kernels/window/kaiser.hpp
@@ -0,0 +1,64 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/elementwise_functions/i0.hpp"
+
+namespace dpnp::kernels::kaiser
+{
+template <typename T>
+class KaiserFunctor
+{
+private:
+    T *res = nullptr;    // output buffer, one value written per work-item
+    const std::size_t N; // number of window points
+    const T beta;        // argument scale of the Bessel terms
+
+public:
+    KaiserFunctor(T *res_, const std::size_t N_, const T beta_)
+        : res(res_), N(N_), beta(beta_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        using dpnp::kernels::i0::cyl_bessel_i0;
+
+        const auto idx = id.get(0);
+        const T half_span = (N - 1) / T(2);
+        const T rel = (idx - half_span) / half_span;
+        res[idx] = cyl_bessel_i0(beta * sycl::sqrt(1 - rel * rel)) /
+                   cyl_bessel_i0(beta);
+    }
+};
+} // namespace dpnp::kernels::kaiser
diff --git a/scripts/gen_coverage.py b/scripts/gen_coverage.py
index 588345d91b2e..545fd888c1ba 100644
--- a/scripts/gen_coverage.py
+++ b/scripts/gen_coverage.py
@@ -259,6 +259,9 @@ def find_objects():
                     "-format=lcov",
                     "-ignore-filename-regex=/tmp/icpx*",
                     r"-ignore-filename-regex=.*/backend/kernels/elementwise_functions/.*\.hpp$",
+                    r"-ignore-filename-regex=.*/backend/kernels/indexing/.*\.hpp$",
+                    r"-ignore-filename-regex=.*/backend/kernels/statistics/.*\.hpp$",
+                    r"-ignore-filename-regex=.*/backend/kernels/window/.*\.hpp$",
                     "-instr-profile=" + instr_profile_fn,
                 ]
                 + objects

From d74afafe7545ad392228c8ed0148f7b893524efb Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Fri, 20 Mar 2026 15:34:41 +0100
Subject: [PATCH 22/35] Add ndarray subclassing support via `ndarray.view()`
 method (#2815)

The PR adds ndarray subclassing support via `ndarray.view()` method.

It implements the `type` parameter in `dpnp.ndarray.view()` to enable
custom subclasses, matching NumPy/CuPy behavior. Also includes proper
`__array_finalize__` hook invocation for metadata propagation.

The implementation is done through `_view_impl()` helper using
'array_class' parameter to avoid shadowing builtin 'type'.
The test scope is extended with 7 new tests for verifying subclassing
support and enabling `TestSubclassArrayView` class from third party
tests.

This PR closes #2764.
---
 CHANGELOG.md                                  |   1 +
 dpnp/dpnp_array.py                            | 223 ++++++++++++------
 dpnp/tests/test_ndarray.py                    |  85 ++++++-
 .../core_tests/test_ndarray_copy_and_view.py  |   1 -
 4 files changed, 232 insertions(+), 78 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 096eabef6720..a742a2f4b532 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.divmod` [#2674](https://github.com/IntelPython/dpnp/pull/2674)
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
 * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
+* Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/pull/2815)
 
 ### Changed
 
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index dad67fc1b584..951f782c3007 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -644,6 +644,136 @@ def _create_from_usm_ndarray(usm_ary: dpt.usm_ndarray):
         res._array_obj._set_namespace(dpnp)
         return res
 
+    def _create_view(self, array_class, shape, dtype, strides):
+        """
+        Create a view of an array with the specified class.
+
+        The method handles subclass instantiation by creating a usm_ndarray
+        view and then wrapping it in the appropriate class.
+
+        Parameters
+        ----------
+        array_class : type
+            The class to instantiate (dpnp_array or a subclass).
+        shape : tuple
+            Shape of the view.
+        dtype : dtype
+            Data type of the view (can be None to keep source's dtype).
+        strides : tuple
+            Strides of the view, in bytes.
+
+        Returns
+        -------
+        view : array_class instance
+            A view of the array as the specified class.
+
+        """
+
+        if dtype is None:
+            dtype = self.dtype
+
+        # `strides` is in bytes; convert to the element strides of the view
+        itemsize = dpnp.dtype(dtype).itemsize
+        usm_view = dpt.usm_ndarray(
+            shape,
+            dtype=dtype,
+            buffer=self._array_obj,
+            strides=tuple(s // itemsize for s in strides),
+        )
+
+        # wrap the view into the appropriate class
+        if array_class is dpnp_array:
+            res = dpnp_array._create_from_usm_ndarray(usm_view)
+        else:
+            # for subclasses, create using __new__ and set up manually
+            res = array_class.__new__(array_class)
+            res._array_obj = usm_view
+            res._array_obj._set_namespace(dpnp)
+            if hasattr(res, "__array_finalize__"):
+                res.__array_finalize__(self)
+
+        return res
+
+    def _view_impl(self, dtype=None, array_class=None):
+        """
+        Internal implementation of the ``view`` method; it takes `array_class`
+        instead of `type` so that the builtin ``type`` is not shadowed here.
+
+        """
+
+        # `dtype` may carry an ndarray subclass, e.g. `x.view(MyArray)`
+        if dtype is not None:
+            if isinstance(dtype, type) and issubclass(dtype, dpnp_array):
+                if array_class is not None:
+                    raise ValueError("Cannot specify output type twice")
+                array_class = dtype
+                dtype = None
+
+        # validate array_class parameter
+        if not (
+            array_class is None
+            or isinstance(array_class, type)
+            and issubclass(array_class, dpnp_array)
+        ):
+            raise ValueError("Type must be a sub-type of ndarray type")
+
+        if array_class is None:
+            # it's a view on dpnp.ndarray
+            array_class = self.__class__
+
+        old_sh = self.shape
+        old_strides = self.strides
+
+        if dtype is None:
+            return self._create_view(array_class, old_sh, None, old_strides)
+
+        new_dt = dpnp.dtype(dtype)
+        new_dt = dtu._to_device_supported_dtype(new_dt, self.sycl_device)
+
+        new_itemsz = new_dt.itemsize
+        old_itemsz = self.dtype.itemsize
+        if new_itemsz == old_itemsz:
+            return self._create_view(array_class, old_sh, new_dt, old_strides)
+
+        ndim = self.ndim
+        if ndim == 0:
+            raise ValueError(
+                "Changing the dtype of a 0d array is only supported "
+                "if the itemsize is unchanged"
+            )
+
+        # resize on last axis only
+        axis = ndim - 1
+        if (
+            old_sh[axis] != 1
+            and self.size != 0
+            and old_strides[axis] != old_itemsz
+        ):
+            raise ValueError(
+                "To change to a dtype of a different size, "
+                "the last axis must be contiguous"
+            )
+
+        # normalize strides whenever itemsize changes
+        new_strides = tuple(
+            old_strides[i] if i != axis else new_itemsz for i in range(ndim)
+        )
+
+        new_dim = old_sh[axis] * old_itemsz
+        if new_dim % new_itemsz != 0:
+            raise ValueError(
+                "When changing to a larger dtype, its size must be a divisor "
+                "of the total size in bytes of the last axis of the array"
+            )
+
+        # normalize shape whenever itemsize changes
+        new_sh = tuple(
+            old_sh[i] if i != axis else new_dim // new_itemsz
+            for i in range(ndim)
+        )
+
+        return self._create_view(array_class, new_sh, new_dt, new_strides)
+
     def all(self, axis=None, *, out=None, keepdims=False, where=True):
         """
         Return ``True`` if all elements evaluate to ``True``.
@@ -2322,10 +2452,18 @@ def view(self, /, dtype=None, *, type=None):
 
         Parameters
         ----------
-        dtype : {None, str, dtype object}, optional
+        dtype : {None, str, dtype object, type}, optional
             The desired data type of the returned view, e.g. :obj:`dpnp.float32`
-            or :obj:`dpnp.int16`. By default, it results in the view having the
-            same data type.
+            or :obj:`dpnp.int16`. Omitting it results in the view having the
+            same data type. Can also be a subclass of :class:`dpnp.ndarray` to
+            create a view of that type (this is equivalent to setting the `type`
+            parameter).
+
+            Default: ``None``.
+        type : {None, type}, optional
+            Type of the returned view, e.g. a subclass of :class:`dpnp.ndarray`.
+            If specified, the returned array will be an instance of `type`.
+            Omitting it results in type preservation.
 
             Default: ``None``.
 
@@ -2340,11 +2478,6 @@ def view(self, /, dtype=None, *, type=None):
 
         Only the last axis has to be contiguous.
 
-        Limitations
-        -----------
-        Parameter `type` is supported only with default value ``None``.
-        Otherwise, the function raises ``NotImplementedError`` exception.
-
         Examples
         --------
         >>> import dpnp as np
@@ -2368,73 +2501,17 @@ def view(self, /, dtype=None, *, type=None):
             [[2312, 2826],
                 [5396, 5910]]], dtype=int16)
 
-        """
-
-        if type is not None:
-            raise NotImplementedError(
-                "Keyword argument `type` is supported only with "
-                f"default value ``None``, but got {type}."
-            )
-
-        old_sh = self.shape
-        old_strides = self.strides
-
-        if dtype is None:
-            return dpnp_array(old_sh, buffer=self, strides=old_strides)
-
-        new_dt = dpnp.dtype(dtype)
-        new_dt = dtu._to_device_supported_dtype(new_dt, self.sycl_device)
-
-        new_itemsz = new_dt.itemsize
-        old_itemsz = self.dtype.itemsize
-        if new_itemsz == old_itemsz:
-            return dpnp_array(
-                old_sh, dtype=new_dt, buffer=self, strides=old_strides
-            )
-
-        ndim = self.ndim
-        if ndim == 0:
-            raise ValueError(
-                "Changing the dtype of a 0d array is only supported "
-                "if the itemsize is unchanged"
-            )
-
-        # resize on last axis only
-        axis = ndim - 1
-        if (
-            old_sh[axis] != 1
-            and self.size != 0
-            and old_strides[axis] != old_itemsz
-        ):
-            raise ValueError(
-                "To change to a dtype of a different size, "
-                "the last axis must be contiguous"
-            )
+        Creating a view with a custom ndarray subclass:
 
-        # normalize strides whenever itemsize changes
-        new_strides = tuple(
-            old_strides[i] if i != axis else new_itemsz for i in range(ndim)
-        )
-
-        new_dim = old_sh[axis] * old_itemsz
-        if new_dim % new_itemsz != 0:
-            raise ValueError(
-                "When changing to a larger dtype, its size must be a divisor "
-                "of the total size in bytes of the last axis of the array"
-            )
-
-        # normalize shape whenever itemsize changes
-        new_sh = tuple(
-            old_sh[i] if i != axis else new_dim // new_itemsz
-            for i in range(ndim)
-        )
+        >>> class MyArray(np.ndarray):
+        ...     pass
+        >>> x = np.array([1, 2, 3])
+        >>> y = x.view(MyArray)
+        >>> type(y)
+        <class 'MyArray'>
 
-        return dpnp_array(
-            new_sh,
-            dtype=new_dt,
-            buffer=self,
-            strides=new_strides,
-        )
+        """
+        return self._view_impl(dtype=dtype, array_class=type)
 
     @property
     def usm_type(self):
diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py
index 4e4e42bbc85e..6ce8645a11d4 100644
--- a/dpnp/tests/test_ndarray.py
+++ b/dpnp/tests/test_ndarray.py
@@ -228,10 +228,87 @@ def test_python_types(self, dt):
         expected = a.view(dt)
         assert_allclose(result, expected)
 
-    def test_type_error(self):
-        x = dpnp.ones(4, dtype="i4")
-        with pytest.raises(NotImplementedError):
-            x.view("i2", type=dpnp.ndarray)
+    def test_subclass_basic(self):
+        class MyArray(dpnp.ndarray):
+            pass
+
+        x = dpnp.array([1, 2, 3])
+        view = x.view(type=MyArray)
+
+        assert isinstance(view, MyArray)
+        assert type(view) is MyArray
+        assert (view == x).all()
+
+    def test_dtype_type_subclass(self):
+        class MyArray(dpnp.ndarray):
+            pass
+
+        x = dpnp.array([1, 2, 3])
+
+        # All three syntaxes should work identically
+        view1 = x.view(type=MyArray)
+        view2 = x.view(MyArray)
+        view3 = x.view(dtype=MyArray)
+
+        assert type(view1) is MyArray
+        assert type(view2) is MyArray
+        assert type(view3) is MyArray
+
+    def test_subclass_array_finalize(self):
+        class ArrayWithInfo(dpnp.ndarray):
+            def __array_finalize__(self, obj):
+                self.info = getattr(obj, "info", "default")
+
+        x = dpnp.array([1, 2, 3]).view(type=ArrayWithInfo)
+        x.info = "metadata"
+
+        # Create a view - __array_finalize__ should be called
+        view = x.view()
+        assert hasattr(view, "info")
+        assert view.info == "metadata"
+        assert type(view) is ArrayWithInfo
+
+    def test_subclass_self_class_preservation(self):
+        class MyArray(dpnp.ndarray):
+            pass
+
+        x = dpnp.array([1, 2, 3]).view(type=MyArray)
+
+        # View without type parameter should preserve MyArray
+        view = x.view()
+        assert type(view) is MyArray
+
+    def test_subclass_with_dtype_change(self):
+        class MyArray(dpnp.ndarray):
+            pass
+
+        x = dpnp.array([1.0, 2.0], dtype=dpnp.float32)
+        view = x.view(dtype=dpnp.int32, type=MyArray)
+
+        assert type(view) is MyArray
+        assert view.dtype == dpnp.int32
+
+    @pytest.mark.parametrize("xp", [dpnp, numpy])
+    def test_subclass_invalid_type(self, xp):
+        x = xp.array([1, 2, 3])
+        with pytest.raises(
+            ValueError, match="Type must be a sub-type of ndarray type"
+        ):
+            x.view(type=list)
+
+    @pytest.mark.parametrize("xp", [dpnp, numpy])
+    def test_subclass_double_type_specification(self, xp):
+        class MyArray(xp.ndarray):
+            pass
+
+        class OtherArray(xp.ndarray):
+            pass
+
+        x = xp.array([1, 2, 3])
+        with pytest.raises(
+            ValueError, match="Cannot specify output type twice"
+        ):
+            x.view(dtype=MyArray, type=OtherArray)
 
 
 @pytest.mark.parametrize(
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_copy_and_view.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_copy_and_view.py
index 7b503f1997a5..5df4322ba0b1 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_copy_and_view.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_copy_and_view.py
@@ -466,7 +466,6 @@ def __array_finalize__(self, obj):
         self.info = getattr(obj, "info", None)
 
 
-@pytest.mark.skip("subclass array is not supported")
 class TestSubclassArrayView:
 
     def test_view_casting(self):

From 7abbee9860b9c20b4347fbbcfc91cf31f34b0f04 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 22 Mar 2026 15:28:46 +0100
Subject: [PATCH 23/35] Bump github/codeql-action from 4.32.6 to 4.34.1 (#2827)

Bumps [github/codeql-action](https://github.com/github/codeql-action)
from 4.32.6 to 4.34.1.
---
 .github/workflows/openssf-scorecard.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index 5d7e0677281e..ffe61f3fb739 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -72,6 +72,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@0d579ffd059c29b07949a3cce3983f0780820c98 # v4.32.6
+        uses: github/codeql-action/upload-sarif@38697555549f1db7851b81482ff19f1fa5c4fedc # v4.34.1
         with:
           sarif_file: results.sarif

From 10d121289cea5d1ed2db652572e7c8d815bce9d0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 23 Mar 2026 10:46:20 +0100
Subject: [PATCH 24/35] Bump nick-fields/retry from 3.0.2 to 4.0.0 (#2826)

Bumps [nick-fields/retry](https://github.com/nick-fields/retry) from
3.0.2 to 4.0.0.
---
 .github/workflows/check-onemath.yaml     | 4 ++--
 .github/workflows/conda-package.yml      | 4 ++--
 .github/workflows/cron-run-tests.yaml    | 4 ++--
 .github/workflows/generate_coverage.yaml | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml
index acbfcac96890..27218e480d60 100644
--- a/.github/workflows/check-onemath.yaml
+++ b/.github/workflows/check-onemath.yaml
@@ -139,7 +139,7 @@ jobs:
       - name: ReRun tests on Linux
         if: env.rerun-tests-on-failure == 'true'
         id: run_tests
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           timeout_minutes: ${{ env.rerun-tests-timeout }}
           max_attempts: ${{ env.rerun-tests-max-attempts }}
@@ -245,7 +245,7 @@ jobs:
       - name: ReRun tests on Linux
         if: env.rerun-tests-on-failure == 'true'
         id: run_tests
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           timeout_minutes: ${{ env.rerun-tests-timeout }}
           max_attempts: ${{ env.rerun-tests-max-attempts }}
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index c894c530a20e..b24ffec1334a 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -230,7 +230,7 @@ jobs:
       - name: Run tests
         if: env.rerun-tests-on-failure == 'true'
         id: run_tests_linux
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           timeout_minutes: ${{ env.rerun-tests-timeout }}
           max_attempts: ${{ env.rerun-tests-max-attempts }}
@@ -392,7 +392,7 @@ jobs:
       - name: Run tests
         if: env.rerun-tests-on-failure == 'true'
         id: run_tests_win
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           timeout_minutes: ${{ env.rerun-tests-timeout }}
           max_attempts: ${{ env.rerun-tests-max-attempts }}
diff --git a/.github/workflows/cron-run-tests.yaml b/.github/workflows/cron-run-tests.yaml
index ea4fd4f14fc3..5b3a6452401f 100644
--- a/.github/workflows/cron-run-tests.yaml
+++ b/.github/workflows/cron-run-tests.yaml
@@ -128,7 +128,7 @@ jobs:
       - name: ReRun tests on Linux
         if: steps.run_tests.outcome == 'failure' && matrix.runner != 'windows-2022'
         id: run_tests_linux
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           timeout_minutes: ${{ env.rerun-tests-timeout }}
           max_attempts: ${{ env.rerun-tests-max-attempts }}
@@ -145,7 +145,7 @@ jobs:
       - name: ReRun tests on Windows
         if: steps.run_tests.outcome == 'failure' && matrix.runner == 'windows-2022'
         id: run_tests_win
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           timeout_minutes: ${{ env.rerun-tests-timeout }}
           max_attempts: ${{ env.rerun-tests-max-attempts }}
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index bfc3c7357a3e..5fd211e55a81 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -119,7 +119,7 @@ jobs:
 
       - name: Build dpnp with coverage
         id: build_coverage
-        uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0
         with:
           shell: bash
           timeout_minutes: 60

From 5e5dc24056c6c194ab6f7870f98dd194be2eb277 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 23 Mar 2026 13:01:32 +0100
Subject: [PATCH 25/35] Weekly pre-commit autoupdate (#2825)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 57ec9e2a2a8e..2d14b00c86ad 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -89,12 +89,12 @@ repos:
             - flake8-docstrings==1.7.0
             - flake8-bugbear==24.12.12
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v22.1.0
+    rev: v22.1.1
     hooks:
     -   id: clang-format
         args: ["-i"]
 -   repo: https://github.com/gitleaks/gitleaks
-    rev: v8.30.0
+    rev: v8.30.1
     hooks:
     -   id: gitleaks
 -   repo: https://github.com/jumanjihouse/pre-commit-hooks

From 744e57ff4418c60970e378d2601d90324a84926a Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Fri, 27 Mar 2026 14:41:26 +0100
Subject: [PATCH 26/35] Fix test tolerances for float16 precision in math tests
 (#2828)

This PR fixes test failures when testing with
`DPNP_TEST_ALL_INT_TYPES=1` against conda-forge's NumPy, where float16
precision is used in various scenarios requiring relaxed tolerances.
---
 CHANGELOG.md                                  |  1 +
 .../cupy/math_tests/test_explog.py            | 11 +--
 .../cupy/math_tests/test_hyperbolic.py        | 10 ++-
 .../third_party/cupy/math_tests/test_misc.py  | 66 ++++++++++++++---
 dpnp/tests/third_party/cupy/testing/_loops.py | 72 ++++++++++++-------
 5 files changed, 117 insertions(+), 43 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a742a2f4b532..8e2f5703486d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -81,6 +81,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Fixed an issue causing an exception in `dpnp.geomspace` and `dpnp.logspace` when called with explicit `device` keyword but any input array is allocated on another device [#2723](https://github.com/IntelPython/dpnp/pull/2723)
 * Fixed `.data.ptr` property on array views to correctly return the pointer to the view's data location instead of the base allocation pointer [#2812](https://github.com/IntelPython/dpnp/pull/2812)
 * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814)
+* Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828)
 
 ### Security
 
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_explog.py b/dpnp/tests/third_party/cupy/math_tests/test_explog.py
index 2d4b539d1fb4..ecd5f6a20ea2 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_explog.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_explog.py
@@ -6,9 +6,14 @@
 
 
 class TestExplog:
+    # rtol=1e-3 is used to pass the test when dtype is int8/uint8
+    # for such a case, output dtype is float16
+    _rtol_dict = {numpy.float16: 1e-3, "default": 1e-7}
 
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        rtol=_rtol_dict, atol=1e-5, type_check=has_support_aspect64()
+    )
     def check_unary(self, name, xp, dtype, no_complex=False):
         if no_complex:
             if numpy.dtype(dtype).kind == "c":
@@ -16,11 +21,9 @@ def check_unary(self, name, xp, dtype, no_complex=False):
         a = testing.shaped_arange((2, 3), xp, dtype)
         return getattr(xp, name)(a)
 
-    # rtol=1e-3 is added for dpnp to pass the test when dtype is int8/unint8
-    # for such a case, output dtype is float16
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-3, atol=1e-5, type_check=has_support_aspect64()
+        rtol=_rtol_dict, atol=1e-5, type_check=has_support_aspect64()
     )
     def check_binary(self, name, xp, dtype, no_complex=False):
         if no_complex:
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_hyperbolic.py b/dpnp/tests/third_party/cupy/math_tests/test_hyperbolic.py
index 5613cee41589..8d87feb0d1a7 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_hyperbolic.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_hyperbolic.py
@@ -7,10 +7,14 @@
 
 
 class TestHyperbolic(unittest.TestCase):
+    # rtol=1e-2 is used to pass the test when dtype is int8/uint8
+    # for such a case, output dtype is float16
+    _rtol_dict = {numpy.float16: 1e-2, "default": 1e-7}
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        atol={numpy.float16: 1e-3, "default": 1e-5},
+        rtol=_rtol_dict,
+        atol=1e-5,
         type_check=has_support_aspect64(),
     )
     def check_unary(self, name, xp, dtype):
@@ -18,7 +22,7 @@ def check_unary(self, name, xp, dtype):
         return getattr(xp, name)(a)
 
     @testing.for_dtypes(["e", "f", "d"])
-    @testing.numpy_cupy_allclose(atol={numpy.float16: 1e-3, "default": 1e-5})
+    @testing.numpy_cupy_allclose(rtol=_rtol_dict, atol=1e-5)
     def check_unary_unit(self, name, xp, dtype):
         a = xp.array([0.2, 0.4, 0.6, 0.8], dtype=dtype)
         return getattr(xp, name)(a)
@@ -36,7 +40,7 @@ def test_arcsinh(self):
         self.check_unary("arcsinh")
 
     @testing.for_dtypes(["e", "f", "d"])
-    @testing.numpy_cupy_allclose(atol={numpy.float16: 1e-3, "default": 1e-5})
+    @testing.numpy_cupy_allclose(rtol=_rtol_dict, atol=1e-5)
     def test_arccosh(self, xp, dtype):
         a = xp.array([1, 2, 3], dtype=dtype)
         return xp.arccosh(a)
diff --git a/dpnp/tests/third_party/cupy/math_tests/test_misc.py b/dpnp/tests/third_party/cupy/math_tests/test_misc.py
index e2f12ae373a6..dcc3c4017c6b 100644
--- a/dpnp/tests/third_party/cupy/math_tests/test_misc.py
+++ b/dpnp/tests/third_party/cupy/math_tests/test_misc.py
@@ -9,6 +9,27 @@
 
 
 class TestMisc:
+    @staticmethod
+    def _interp_atol(_result_dtype, dtype_x=None, **_kwargs):
+        """Compute absolute tolerance based on intermediate computation dtype.
+
+        Args:
+            _result_dtype: Output dtype (unused - we check input dtype instead)
+            dtype_x: Input dtype for fx coordinates
+            _kwargs: Additional test parameters (unused)
+
+        When dtype_x is int8/uint8/float16, xp.sin(fx) uses float16 precision,
+        so we need relaxed tolerance even if the final result is upcasted to float64.
+        Float16 has ~3 decimal digits of precision, hence atol=1e-3.
+        """
+        if dtype_x is not None:
+            if numpy.dtype(dtype_x).type in (
+                numpy.int8,
+                numpy.uint8,
+                numpy.float16,
+            ):
+                return 1e-3
+        return 1e-5
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
@@ -401,9 +422,12 @@ def test_real_if_close_with_float_tol_false(self, xp, dtype):
 
     @testing.for_all_dtypes(name="dtype_x", no_bool=True, no_complex=True)
     @testing.for_all_dtypes(name="dtype_y", no_bool=True)
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
+        # tolerance is automatically adjusted based on dtype_x via resolver
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
         fx = xp.asarray([1, 3, 5, 7, 9], dtype=dtype_x)
         fy = xp.sin(fx).astype(dtype_y)
@@ -411,7 +435,9 @@ def test_interp(self, xp, dtype_y, dtype_x):
 
     @testing.for_all_dtypes(name="dtype_x", no_bool=True, no_complex=True)
     @testing.for_all_dtypes(name="dtype_y", no_bool=True)
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_period(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -421,7 +447,9 @@ def test_interp_period(self, xp, dtype_y, dtype_x):
 
     @testing.for_all_dtypes(name="dtype_x", no_bool=True, no_complex=True)
     @testing.for_all_dtypes(name="dtype_y", no_bool=True)
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_left_right(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -434,7 +462,9 @@ def test_interp_left_right(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_all_dtypes(name="dtype_x", no_bool=True, no_complex=True)
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_nan_fy(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -446,7 +476,9 @@ def test_interp_nan_fy(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_float_dtypes(name="dtype_x")
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_nan_fx(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -458,7 +490,9 @@ def test_interp_nan_fx(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_float_dtypes(name="dtype_x")
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_nan_x(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -470,7 +504,9 @@ def test_interp_nan_x(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_all_dtypes(name="dtype_x", no_bool=True, no_complex=True)
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_inf_fy(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -482,7 +518,9 @@ def test_interp_inf_fy(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_float_dtypes(name="dtype_x")
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_inf_fx(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -494,7 +532,9 @@ def test_interp_inf_fx(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_float_dtypes(name="dtype_x")
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_inf_x(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -505,7 +545,9 @@ def test_interp_inf_x(self, xp, dtype_y, dtype_x):
 
     @testing.for_all_dtypes(name="dtype_x", no_bool=True, no_complex=True)
     @testing.for_all_dtypes(name="dtype_y", no_bool=True)
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_size1(self, xp, dtype_y, dtype_x):
         # interpolate at points on and outside the boundaries
         x = xp.asarray([0, 1, 2, 4, 6, 8, 9, 10], dtype=dtype_x)
@@ -518,7 +560,9 @@ def test_interp_size1(self, xp, dtype_y, dtype_x):
     @testing.with_requires("numpy>=1.17.0")
     @testing.for_float_dtypes(name="dtype_x")
     @testing.for_dtypes("efdFD", name="dtype_y")
-    @testing.numpy_cupy_allclose(atol=1e-5, type_check=has_support_aspect64())
+    @testing.numpy_cupy_allclose(
+        atol=_interp_atol, type_check=has_support_aspect64()
+    )
     def test_interp_inf_to_nan(self, xp, dtype_y, dtype_x):
         # from NumPy's test_non_finite_inf
         x = xp.asarray([0.5], dtype=dtype_x)
diff --git a/dpnp/tests/third_party/cupy/testing/_loops.py b/dpnp/tests/third_party/cupy/testing/_loops.py
index 026c451e71e3..03232642b221 100644
--- a/dpnp/tests/third_party/cupy/testing/_loops.py
+++ b/dpnp/tests/third_party/cupy/testing/_loops.py
@@ -410,7 +410,7 @@ def test_func(*args, **kw):
                         numpy_r = numpy_r[mask]
 
                 if not skip:
-                    check_func(cupy_r, numpy_r)
+                    check_func(cupy_r, numpy_r, **kw)
 
         return test_func
 
@@ -469,6 +469,9 @@ def _convert_output_to_ndarray(c_out, n_out, sp_name, check_sparse_format):
 
 def _check_tolerance_keys(rtol, atol):
     def _check(tol):
+        if callable(tol):
+            # Callable tolerance is allowed
+            return
         if isinstance(tol, dict):
             for k in tol.keys():
                 if type(k) is type:
@@ -486,9 +489,13 @@ def _check(tol):
     _check(atol)
 
 
-def _resolve_tolerance(type_check, result, rtol, atol):
+def _resolve_tolerance(type_check, result, rtol, atol, **test_kwargs):
     def _resolve(dtype, tol):
-        if isinstance(tol, dict):
+        if callable(tol):
+            # Support callable tolerance that can inspect test kwargs
+            return tol(dtype, **test_kwargs)
+        elif isinstance(tol, dict):
+            # Original dict lookup logic
             tol1 = tol.get(dtype.type)
             if tol1 is None:
                 tol1 = tol.get("default")
@@ -523,13 +530,15 @@ def numpy_cupy_allclose(
     """Decorator that checks NumPy results and CuPy ones are close.
 
     Args:
-         rtol(float or dict): Relative tolerance. Besides a float value, a
-             dictionary that maps a dtypes to a float value can be supplied to
-             adjust tolerance per dtype. If the dictionary has ``'default'``
-             string as its key, its value is used as the default tolerance in
-             case any dtype keys do not match.
-         atol(float or dict): Absolute tolerance. Besides a float value, a
-             dictionary can be supplied as ``rtol``.
+         rtol(float, dict, or callable): Relative tolerance. Can be:
+             - A float value
+             - A dictionary that maps dtypes to float values. If the dictionary
+               has ``'default'`` string as its key, its value is used as the
+               default tolerance in case any dtype keys do not match.
+             - A callable with signature ``(dtype, **test_kwargs)`` that returns
+               a float. This allows dynamic tolerance based on test parameters
+               like input dtypes.
+         atol(float, dict, or callable): Absolute tolerance. Same options as ``rtol``.
          err_msg(str): The error message to be printed in case of failure.
          verbose(bool): If ``True``, the conflicting values are
              appended to the error message.
@@ -583,10 +592,17 @@ def numpy_cupy_allclose(
     #             "must be supplied as float."
     #         )
 
-    def check_func(c, n):
-        rtol1, atol1 = _resolve_tolerance(type_check, c, rtol, atol)
+    def check_func(cupy_result, numpy_result, **test_kwargs):
+        rtol1, atol1 = _resolve_tolerance(
+            type_check, cupy_result, rtol, atol, **test_kwargs
+        )
         _array.assert_allclose(
-            c, n, rtol1, atol1, err_msg=err_msg, verbose=verbose
+            cupy_result,
+            numpy_result,
+            rtol1,
+            atol1,
+            err_msg=err_msg,
+            verbose=verbose,
         )
 
     return _make_decorator(
@@ -641,8 +657,10 @@ def numpy_cupy_array_almost_equal(
     .. seealso:: :func:`cupy.testing.assert_array_almost_equal`
     """
 
-    def check_func(x, y):
-        _array.assert_array_almost_equal(x, y, decimal, err_msg, verbose)
+    def check_func(cupy_result, numpy_result, **test_kwargs):
+        _array.assert_array_almost_equal(
+            cupy_result, numpy_result, decimal, err_msg, verbose
+        )
 
     return _make_decorator(
         check_func, name, type_check, False, accept_error, sp_name, scipy_name
@@ -684,8 +702,8 @@ def numpy_cupy_array_almost_equal_nulp(
     .. seealso:: :func:`cupy.testing.assert_array_almost_equal_nulp`
     """
 
-    def check_func(x, y):
-        _array.assert_array_almost_equal_nulp(x, y, nulp)
+    def check_func(cupy_result, numpy_result, **test_kwargs):
+        _array.assert_array_almost_equal_nulp(cupy_result, numpy_result, nulp)
 
     return _make_decorator(
         check_func,
@@ -738,8 +756,8 @@ def numpy_cupy_array_max_ulp(
 
     """
 
-    def check_func(x, y):
-        _array.assert_array_max_ulp(x, y, maxulp, dtype)
+    def check_func(cupy_result, numpy_result, **test_kwargs):
+        _array.assert_array_max_ulp(cupy_result, numpy_result, maxulp, dtype)
 
     return _make_decorator(
         check_func, name, type_check, False, accept_error, sp_name, scipy_name
@@ -787,9 +805,13 @@ def numpy_cupy_array_equal(
     .. seealso:: :func:`cupy.testing.assert_array_equal`
     """
 
-    def check_func(x, y):
+    def check_func(cupy_result, numpy_result, **test_kwargs):
         _array.assert_array_equal(
-            x, y, err_msg, verbose, strides_check=strides_check
+            cupy_result,
+            numpy_result,
+            err_msg,
+            verbose,
+            strides_check=strides_check,
         )
 
     return _make_decorator(
@@ -826,8 +848,8 @@ def numpy_cupy_array_list_equal(
         DeprecationWarning,
     )
 
-    def check_func(x, y):
-        _array.assert_array_equal(x, y, err_msg, verbose)
+    def check_func(cupy_result, numpy_result, **test_kwargs):
+        _array.assert_array_equal(cupy_result, numpy_result, err_msg, verbose)
 
     return _make_decorator(
         check_func, name, False, False, False, sp_name, scipy_name
@@ -871,8 +893,8 @@ def numpy_cupy_array_less(
     .. seealso:: :func:`cupy.testing.assert_array_less`
     """
 
-    def check_func(x, y):
-        _array.assert_array_less(x, y, err_msg, verbose)
+    def check_func(cupy_result, numpy_result, **test_kwargs):
+        _array.assert_array_less(cupy_result, numpy_result, err_msg, verbose)
 
     return _make_decorator(
         check_func, name, type_check, False, accept_error, sp_name, scipy_name

From 3e13c962647f205723a008e94168f979b1222731 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 28 Mar 2026 23:40:01 +0100
Subject: [PATCH 27/35] Bump github/codeql-action from 4.34.1 to 4.35.1 (#2833)

Bumps [github/codeql-action](https://github.com/github/codeql-action)
from 4.34.1 to 4.35.1.
---
 .github/workflows/openssf-scorecard.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml
index ffe61f3fb739..e5cc7dec86ea 100644
--- a/.github/workflows/openssf-scorecard.yml
+++ b/.github/workflows/openssf-scorecard.yml
@@ -72,6 +72,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@38697555549f1db7851b81482ff19f1fa5c4fedc # v4.34.1
+        uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1
         with:
           sarif_file: results.sarif

From d838c4b04e1a1630aa95af735e44080407f07b6a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Sun, 29 Mar 2026 01:37:03 +0100
Subject: [PATCH 28/35] Weekly pre-commit autoupdate (#2832)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2d14b00c86ad..91b9f34d8818 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -89,7 +89,7 @@ repos:
             - flake8-docstrings==1.7.0
             - flake8-bugbear==24.12.12
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v22.1.1
+    rev: v22.1.2
     hooks:
     -   id: clang-format
         args: ["-i"]

From e36835d1668b8c0b4852a39b8caf5710c8e55f22 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Wed, 1 Apr 2026 14:03:14 +0200
Subject: [PATCH 29/35] Use Pybind11 `3.0.3` to build dpnp (#2834)

This PR updates CMakeLists.txt to bump pybind11 from version `3.0.2` to
`3.0.3`.
---
 CHANGELOG.md   | 5 +++--
 CMakeLists.txt | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8e2f5703486d..30d60fc98988 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.scipy.special.erfinv` and `dpnp.scipy.special.erfcinv` [#2624](https://github.com/IntelPython/dpnp/pull/2624)
 * Added implementation of `dpnp.ndarray.tolist` method [#2652](https://github.com/IntelPython/dpnp/pull/2652)
 * Added implementation of `dpnp.frexp` [#2635](https://github.com/IntelPython/dpnp/pull/2635)
-* Added implementation of `dpnp.ndarray.tofile` method [#2635](https://github.com/IntelPython/dpnp/pull/2635)
+* Added implementation of `dpnp.ndarray.tofile` method [#2653](https://github.com/IntelPython/dpnp/pull/2653)
 * Extended `pre-commit` configuration with `pyupgrade`, `actionlint`, and `gersemi` hooks [#2658](https://github.com/IntelPython/dpnp/pull/2658)
 * Added implementation of `dpnp.ndarray.tobytes` method [#2656](https://github.com/IntelPython/dpnp/pull/2656)
 * Added implementation of `dpnp.ndarray.__format__` method [#2662](https://github.com/IntelPython/dpnp/pull/2662)
@@ -52,10 +52,11 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Aligned `strides` property of `dpnp.ndarray` with NumPy and CuPy implementations [#2747](https://github.com/IntelPython/dpnp/pull/2747)
 * Extended `dpnp.nan_to_num` to support broadcasting of `nan`, `posinf`, and `neginf` keywords [#2754](https://github.com/IntelPython/dpnp/pull/2754)
 * Changed `dpnp.partition` implementation to reuse `dpnp.sort` where it brings the performance benefit [#2766](https://github.com/IntelPython/dpnp/pull/2766)
-* `dpnp` uses pybind11 3.0.2 [#27734](https://github.com/IntelPython/dpnp/pull/2773)
+* `dpnp` uses pybind11 3.0.2 [#2773](https://github.com/IntelPython/dpnp/pull/2773)
 * Modified CMake files for the extension to explicitly mark DPC++ compiler and dpctl headers as system ones and so to suppress the build warning generated inside them [#2770](https://github.com/IntelPython/dpnp/pull/2770)
 * Updated QR tests to avoid element-wise comparisons for `raw` and `r` modes [#2785](https://github.com/IntelPython/dpnp/pull/2785)
 * Moved all SYCL kernel functors from `backend/extensions/` to a unified `backend/kernels/` directory hierarchy [#2816](https://github.com/IntelPython/dpnp/pull/2816)
+* `dpnp` uses pybind11 3.0.3 [#2834](https://github.com/IntelPython/dpnp/pull/2834)
 
 ### Deprecated
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58ba34082be2..129bf1d87c25 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,8 +95,8 @@ find_package(Python 3.10...<3.15 REQUIRED COMPONENTS Development.Module NumPy)
 include(FetchContent)
 FetchContent_Declare(
     pybind11
-    URL https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.tar.gz
-    URL_HASH SHA256=2f20a0af0b921815e0e169ea7fec63909869323581b89d7de1553468553f6a2d
+    URL https://github.com/pybind/pybind11/archive/refs/tags/v3.0.3.tar.gz
+    URL_HASH SHA256=787459e1e186ee82001759508fefa408373eae8a076ffe0078b126c6f8f0ec5e
     FIND_PACKAGE_ARGS NAMES pybind11
 )
 FetchContent_MakeAvailable(pybind11)

From d5230619e2adc0815d953f061bba789b952b8aa6 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Tue, 7 Apr 2026 15:09:36 +0200
Subject: [PATCH 30/35] Fix device-aware dtype handling in identity, gradient
 functions (#2835)

This PR ensures that default dtype selection respects device-specific
capabilities across multiple functions.

The PR includes the following changes:
- `dpnp.identity`: Remove redundant default dtype handling. The function
now delegates dtype resolution to dpnp.eye(), which already handles
device-aware default types correctly.
- `dpnp.gradient`: Pass sycl_queue parameter to default_float_type()
calls to ensure the selected float type is compatible with the device
where the array resides. This prevents issues when converting integer
arrays on devices with different dtype support.

The PR also updates SYCL queue tests to fix the parametrization to
generate device-dtype pairs using a new get_all_dev_dtypes() helper.
Each device is now tested only with dtypes it actually supports (e.g.,
devices without fp64 support won't test fp64), preventing false failures
and unnecessary test combinations.
---
 CHANGELOG.md                     |  1 +
 dpnp/dpnp_iface_arraycreation.py |  3 +--
 dpnp/dpnp_iface_mathematical.py  | 10 ++++++----
 dpnp/tests/test_sycl_queue.py    | 31 +++++++++++++++++++++++--------
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30d60fc98988..f8aaae542ec5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -83,6 +83,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Fixed `.data.ptr` property on array views to correctly return the pointer to the view's data location instead of the base allocation pointer [#2812](https://github.com/IntelPython/dpnp/pull/2812)
 * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814)
 * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828)
+* Ensured device-aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835)
 
 ### Security
 
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index e7b902647186..5bcf5ea19b82 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -2664,10 +2664,9 @@ def identity(
 
     dpnp.check_limitations(like=like)
 
-    _dtype = dpnp.default_float_type() if dtype is None else dtype
     return dpnp.eye(
         n,
-        dtype=_dtype,
+        dtype=dtype,
         device=device,
         usm_type=usm_type,
         sycl_queue=sycl_queue,
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 366a3363404a..e06904a57bda 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -141,7 +141,9 @@ def _gradient_build_dx(f, axes, *varargs):
             if dpnp.issubdtype(distances.dtype, dpnp.integer):
                 # Convert integer types to default float type to avoid modular
                 # arithmetic in dpnp.diff(distances).
-                distances = distances.astype(dpnp.default_float_type())
+                distances = distances.astype(
+                    dpnp.default_float_type(sycl_queue=f.sycl_queue)
+                )
             diffx = dpnp.diff(distances)
 
             # if distances are constant reduce to the scalar case
@@ -2707,9 +2709,9 @@ def gradient(f, *varargs, axis=None, edge_order=1):
         # All other types convert to floating point.
         # First check if f is a dpnp integer type; if so, convert f to default
         # float type to avoid modular arithmetic when computing changes in f.
-        if dpnp.issubdtype(otype, dpnp.integer):
-            f = f.astype(dpnp.default_float_type())
-        otype = dpnp.default_float_type()
+        otype = dpnp.default_float_type(sycl_queue=f.sycl_queue)
+        if dpnp.issubdtype(f.dtype, dpnp.integer):
+            f = f.astype(otype)
 
     for axis_, ax_dx in zip(axes, dx):
         if f.shape[axis_] < edge_order + 1:
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index b0f746720af8..e4b9403df8a4 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -54,6 +54,23 @@ def assert_sycl_queue_equal(result, expected):
     assert exec_queue is not None
 
 
+def get_all_dev_dtypes(no_float16=True, no_none=True):
+    """
+    Build a list of (device, dtype) combinations for each device's
+    supported dtype.
+
+    """
+
+    device_dtype_pairs = []
+    for device in valid_dev:
+        dtypes = get_all_dtypes(
+            no_float16=no_float16, no_none=no_none, device=device
+        )
+        for dtype in dtypes:
+            device_dtype_pairs.append((device, dtype))
+    return device_dtype_pairs
+
+
 @pytest.mark.parametrize(
     "func, arg, kwargs",
     [
@@ -1082,11 +1099,10 @@ def test_array_creation_from_dpctl(copy, device):
     assert isinstance(result, dpnp_array)
 
 
-@pytest.mark.parametrize("device", valid_dev, ids=dev_ids)
-@pytest.mark.parametrize("arr_dtype", get_all_dtypes(no_float16=True))
+@pytest.mark.parametrize("device, dt", get_all_dev_dtypes())
 @pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)])
-def test_from_dlpack(arr_dtype, shape, device):
-    X = dpnp.ones(shape=shape, dtype=arr_dtype, device=device)
+def test_from_dlpack(shape, device, dt):
+    X = dpnp.ones(shape=shape, dtype=dt, device=device)
     Y = dpnp.from_dlpack(X)
     assert_array_equal(X, Y)
     assert X.__dlpack_device__() == Y.__dlpack_device__()
@@ -1098,10 +1114,9 @@ def test_from_dlpack(arr_dtype, shape, device):
         assert V.strides == W.strides
 
 
-@pytest.mark.parametrize("device", valid_dev, ids=dev_ids)
-@pytest.mark.parametrize("arr_dtype", get_all_dtypes(no_float16=True))
-def test_from_dlpack_with_dpt(arr_dtype, device):
-    X = dpt.ones((64,), dtype=arr_dtype, device=device)
+@pytest.mark.parametrize("device, dt", get_all_dev_dtypes())
+def test_from_dlpack_with_dpt(device, dt):
+    X = dpt.ones((64,), dtype=dt, device=device)
     Y = dpnp.from_dlpack(X)
     assert_array_equal(X, Y)
     assert isinstance(Y, dpnp.dpnp_array.dpnp_array)

From ba96d2c8cbeba1fdd761ccdaf7cc206e5c45ade4 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 17:10:57 +0200
Subject: [PATCH 31/35] Bump mshick/add-pr-comment from 3.9.0 to 3.10.0 (#2838)

Bumps [mshick/add-pr-comment](https://github.com/mshick/add-pr-comment)
from 3.9.0 to 3.10.0.
---
 .github/workflows/build-sphinx.yml  | 4 ++--
 .github/workflows/conda-package.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml
index 87a7311b95e4..31423357e3f3 100644
--- a/.github/workflows/build-sphinx.yml
+++ b/.github/workflows/build-sphinx.yml
@@ -224,7 +224,7 @@ jobs:
         if: env.GH_EVENT_OPEN_PR_UPSTREAM == 'true'
         env:
           PR_NUM: ${{ github.event.number }}
-        uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0.8.3.9.0
+        uses: mshick/add-pr-comment@64b8e914979889d746c99dea15a76e77ef64580a # v3.10.0
         with:
           message-id: url_to_docs
           message: |
@@ -268,7 +268,7 @@ jobs:
           git push tokened_docs gh-pages
 
       - name: Modify the comment with URL to official documentation
-        uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0.8.3.9.0
+        uses: mshick/add-pr-comment@64b8e914979889d746c99dea15a76e77ef64580a # v3.10.0
         with:
           message-id: url_to_docs
           find: |
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index b24ffec1334a..610b166e8aee 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -654,7 +654,7 @@ jobs:
 
       - name: Post result to PR
         if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }}
-        uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0.8.3.9.0
+        uses: mshick/add-pr-comment@64b8e914979889d746c99dea15a76e77ef64580a # v3.10.0
         with:
           message-id: array_api_results
           message: |

From 2a78c06283e4a20140b7ae83c0d295033c177e26 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 19:59:51 +0200
Subject: [PATCH 32/35] Weekly pre-commit autoupdate (#2837)

This PR updates the `.pre-commit-config.yaml` using `pre-commit
autoupdate`.
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 91b9f34d8818..13197415b15e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -124,7 +124,7 @@ repos:
     -   id: pretty-format-toml
         args: [--autofix]
 -   repo: https://github.com/rhysd/actionlint
-    rev: v1.7.11
+    rev: v1.7.12
     hooks:
     -   id: actionlint
 -   repo: https://github.com/BlankSpruce/gersemi

From 1c3097d95d9877cde32af44944f256f33d5be771 Mon Sep 17 00:00:00 2001
From: Anton <100830759+antonwolfy@users.noreply.github.com>
Date: Thu, 9 Apr 2026 20:39:00 +0200
Subject: [PATCH 33/35] Bump `conda-index` version (#2839)

The PR updates `conda-index` version to `0.10.0`.
---
 environments/create_conda_channel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/create_conda_channel.yml b/environments/create_conda_channel.yml
index aa136bc2dec8..ea571ef0d072 100644
--- a/environments/create_conda_channel.yml
+++ b/environments/create_conda_channel.yml
@@ -3,4 +3,4 @@ channels:
   - conda-forge
 dependencies:
   - python=3.13 # no python 3.14 support by conda-index
-  - conda-index=0.7.0
+  - conda-index=0.10.0

From d835f965f25e0caa143e8fad464161188d76c1f0 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 10 Apr 2026 09:13:10 -0700
Subject: [PATCH 34/35] Use ExecutionPlacementError from dpnp.exceptions in
 dpnp tests

---
 dpnp/tests/test_fft.py        | 2 +-
 dpnp/tests/test_sycl_queue.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index 6c481e4b7917..f8cc95a7a3ca 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -277,7 +277,7 @@ def test_validate_out(self):
         # Inconsistent sycl_queue
         a = dpnp.ones((10,), dtype=dpnp.complex64, sycl_queue=dpctl.SyclQueue())
         out = dpnp.empty((10,), sycl_queue=dpctl.SyclQueue())
-        assert_raises(dpt.ExecutionPlacementError, dpnp.fft.fft, a, out=out)
+        assert_raises(ExecutionPlacementError, dpnp.fft.fft, a, out=out)
 
         # Invalid shape
         a = dpnp.ones((10,), dtype=dpnp.complex64)
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index d8ef27de50a7..5420285d5940 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -674,7 +674,7 @@ def test_2in_broadcasting(func, data1, data2, device):
 def test_2in_1out_diff_queue_but_equal_context(func, device):
     x1 = dpnp.arange(10)
     x2 = dpnp.arange(10, sycl_queue=dpctl.SyclQueue(device))[::-1]
-    with assert_raises((ValueError, dpt.ExecutionPlacementError)):
+    with assert_raises((ValueError, ExecutionPlacementError)):
         getattr(dpnp, func)(x1, x2)
 
 

From 56a0af4a6441d2b0809f2ddeb8838f196fb7f09f Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 10 Apr 2026 09:35:15 -0700
Subject: [PATCH 35/35] Apply clang-format

---
 dpnp/backend/include/dpnp4pybind11.hpp        |  20 +---
 .../include/kernels/accumulators.hpp          |  87 ++++++--------
 .../tensor/libtensor/include/kernels/clip.hpp |   3 +-
 .../include/kernels/copy_and_cast.hpp         |  13 +--
 .../include/kernels/copy_as_contiguous.hpp    |  18 +--
 .../kernels/elementwise_functions/abs.hpp     |   6 +-
 .../kernels/elementwise_functions/add.hpp     |  23 ++--
 .../kernels/elementwise_functions/atanh.hpp   |   4 +-
 .../elementwise_functions/bitwise_and.hpp     |   4 +-
 .../bitwise_left_shift.hpp                    |  10 +-
 .../bitwise_right_shift.hpp                   |  20 ++--
 .../elementwise_functions/bitwise_xor.hpp     |   4 +-
 .../kernels/elementwise_functions/cbrt.hpp    |   5 +-
 .../kernels/elementwise_functions/common.hpp  | 101 ++++++++---------
 .../elementwise_functions/common_detail.hpp   |   7 +-
 .../elementwise_functions/common_inplace.hpp  |  34 +++---
 .../kernels/elementwise_functions/equal.hpp   |   6 +-
 .../elementwise_functions/floor_divide.hpp    |  14 +--
 .../kernels/elementwise_functions/greater.hpp |   6 +-
 .../elementwise_functions/greater_equal.hpp   |   6 +-
 .../elementwise_functions/isfinite.hpp        |   3 +-
 .../kernels/elementwise_functions/isinf.hpp   |   3 +-
 .../kernels/elementwise_functions/isnan.hpp   |   3 +-
 .../kernels/elementwise_functions/less.hpp    |   6 +-
 .../elementwise_functions/less_equal.hpp      |   6 +-
 .../kernels/elementwise_functions/maximum.hpp |   9 +-
 .../kernels/elementwise_functions/minimum.hpp |   9 +-
 .../elementwise_functions/multiply.hpp        |  17 +--
 .../elementwise_functions/negative.hpp        |   5 +-
 .../elementwise_functions/not_equal.hpp       |   3 +-
 .../elementwise_functions/positive.hpp        |   5 +-
 .../kernels/elementwise_functions/pow.hpp     |  11 +-
 .../elementwise_functions/remainder.hpp       |  10 +-
 .../kernels/elementwise_functions/rsqrt.hpp   |   5 +-
 .../kernels/elementwise_functions/signbit.hpp |   5 +-
 .../elementwise_functions/subtract.hpp        |  14 +--
 .../elementwise_functions/true_divide.hpp     |  22 ++--
 .../kernels/linalg_functions/dot_product.hpp  |  36 +++---
 .../include/kernels/linalg_functions/gemm.hpp |  54 ++++-----
 .../libtensor/include/kernels/reductions.hpp  | 106 ++++++++----------
 .../include/kernels/sorting/merge_sort.hpp    |  32 ++----
 .../include/kernels/sorting/radix_sort.hpp    |  40 ++-----
 .../include/kernels/sorting/topk.hpp          |  12 +-
 .../libtensor/include/kernels/where.hpp       |   6 +-
 .../libtensor/include/utils/offset_utils.hpp  |  64 +++--------
 .../include/utils/rich_comparisons.hpp        |   6 +-
 .../libtensor/include/utils/strided_iters.hpp |  24 +---
 .../libtensor/include/utils/sycl_utils.hpp    |   5 +-
 .../libtensor/include/utils/type_dispatch.hpp |   4 +-
 .../include/utils/type_dispatch_building.hpp  |  16 +--
 .../libtensor/include/utils/type_utils.hpp    |   3 +-
 .../accumulators/accumulate_over_axis.hpp     |   3 +-
 .../accumulators/cumulative_logsumexp.cpp     |  12 +-
 .../source/accumulators/cumulative_prod.cpp   |  20 ++--
 .../source/accumulators/cumulative_sum.cpp    |  20 ++--
 .../source/boolean_advanced_indexing.cpp      |   9 +-
 dpnp/tensor/libtensor/source/clip.cpp         |   6 +-
 .../source/copy_and_cast_usm_to_usm.cpp       |   3 +-
 .../libtensor/source/copy_as_contig.cpp       |   6 +-
 .../source/device_support_queries.cpp         |  10 +-
 .../elementwise_functions.hpp                 |  27 ++---
 dpnp/tensor/libtensor/source/full_ctor.cpp    |   5 +-
 .../libtensor/source/linalg_functions/dot.cpp |  15 +--
 .../libtensor/source/reductions/argmax.cpp    |  15 +--
 .../libtensor/source/reductions/argmin.cpp    |  15 +--
 .../libtensor/source/reductions/logsumexp.cpp |   9 +-
 .../libtensor/source/reductions/max.cpp       |   9 +-
 .../libtensor/source/reductions/min.cpp       |   9 +-
 .../libtensor/source/reductions/prod.cpp      |  18 +--
 .../source/reductions/reduce_hypot.cpp        |   9 +-
 .../reductions/reduction_atomic_support.hpp   |   8 +-
 .../source/reductions/reduction_over_axis.hpp |  33 ++----
 .../libtensor/source/reductions/sum.cpp       |   9 +-
 dpnp/tensor/libtensor/source/repeat.cpp       |  11 +-
 .../source/simplify_iteration_space.cpp       |   3 +-
 .../source/sorting/merge_argsort.cpp          |   6 +-
 .../source/sorting/py_argsort_common.hpp      |   3 +-
 .../source/sorting/radix_argsort.cpp          |   6 +-
 .../libtensor/source/sorting/searchsorted.cpp |  19 ++--
 dpnp/tensor/libtensor/source/where.cpp        |   7 +-
 80 files changed, 445 insertions(+), 815 deletions(-)

diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index ada7b7e380fb..896ff20873a5 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -195,22 +195,10 @@ class dpctl_capi
         return api;
     }
 
-    py::object default_sycl_queue_pyobj()
-    {
-        return *default_sycl_queue_;
-    }
-    py::object default_usm_memory_pyobj()
-    {
-        return *default_usm_memory_;
-    }
-    py::object default_usm_ndarray_pyobj()
-    {
-        return *default_usm_ndarray_;
-    }
-    py::object as_usm_memory_pyobj()
-    {
-        return *as_usm_memory_;
-    }
+    py::object default_sycl_queue_pyobj() { return *default_sycl_queue_; }
+    py::object default_usm_memory_pyobj() { return *default_usm_memory_; }
+    py::object default_usm_ndarray_pyobj() { return *default_usm_ndarray_; }
+    py::object as_usm_memory_pyobj() { return *as_usm_memory_; }
 
 private:
     struct Deleter
diff --git a/dpnp/tensor/libtensor/include/kernels/accumulators.hpp b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
index 60382e210d8b..9449c030ac67 100644
--- a/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
@@ -85,10 +85,7 @@ struct NoOpTransformer
 {
     constexpr NoOpTransformer() {}
 
-    T operator()(const T &val) const
-    {
-        return val;
-    }
+    T operator()(const T &val) const { return val; }
 };
 
 template <typename srcTy, typename dstTy>
@@ -134,22 +131,13 @@ class stack_t
         : src_(src), size_(sz), local_scans_(local_scans)
     {
     }
-    ~stack_t(){};
+    ~stack_t() {};
 
-    T *get_src_ptr() const
-    {
-        return src_;
-    }
+    T *get_src_ptr() const { return src_; }
 
-    std::size_t get_size() const
-    {
-        return size_;
-    }
+    std::size_t get_size() const { return size_; }
 
-    T *get_local_scans_ptr() const
-    {
-        return local_scans_;
-    }
+    T *get_local_scans_ptr() const { return local_scans_; }
 };
 
 template <typename T>
@@ -170,27 +158,15 @@ class stack_strided_t
           local_stride_(local_stride)
     {
     }
-    ~stack_strided_t(){};
+    ~stack_strided_t() {};
 
-    T *get_src_ptr() const
-    {
-        return src_;
-    }
+    T *get_src_ptr() const { return src_; }
 
-    std::size_t get_size() const
-    {
-        return size_;
-    }
+    std::size_t get_size() const { return size_; }
 
-    T *get_local_scans_ptr() const
-    {
-        return local_scans_;
-    }
+    T *get_local_scans_ptr() const { return local_scans_; }
 
-    std::size_t get_local_stride() const
-    {
-        return local_stride_;
-    }
+    std::size_t get_local_stride() const { return local_stride_; }
 };
 
 } // end of namespace detail
@@ -515,32 +491,35 @@ sycl::event inclusive_scan_base_step_striped(
             it.barrier(sycl::access::fence_space::local_space);
 
             // convert back to blocked layout
-            {{const std::uint32_t local_offset0 = lid * n_wi;
+            {
+                {
+                    const std::uint32_t local_offset0 = lid * n_wi;
 #pragma unroll
-            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi];
-            }
+                    for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                        slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi];
+                    }
 
-            it.barrier(sycl::access::fence_space::local_space);
+                    it.barrier(sycl::access::fence_space::local_space);
                 }
             }
 
             {
-        const std::uint32_t block_offset = sgroup_id * sgSize * n_wi + lane_id;
+                const std::uint32_t block_offset =
+                    sgroup_id * sgSize * n_wi + lane_id;
 #pragma unroll
-        for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-            const std::uint32_t m_wi_scaled = m_wi * sgSize;
-            const std::size_t out_id = inp_id0 + m_wi_scaled;
-            if (out_id < acc_nelems) {
-                output[out_iter_offset + out_indexer(out_id)] =
-                    slm_iscan_tmp[block_offset + m_wi_scaled];
-            }
-        }
+                for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
+                    const std::uint32_t m_wi_scaled = m_wi * sgSize;
+                    const std::size_t out_id = inp_id0 + m_wi_scaled;
+                    if (out_id < acc_nelems) {
+                        output[out_iter_offset + out_indexer(out_id)] =
+                            slm_iscan_tmp[block_offset + m_wi_scaled];
+                    }
+                }
             }
-});
-});
+        });
+    });
 
-return inc_scan_phase1_ev;
+    return inc_scan_phase1_ev;
 }
 
 template <typename inputT,
@@ -746,8 +725,7 @@ sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
         }
 
         for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
-             ++reverse_stack_id)
-        {
+             ++reverse_stack_id) {
             const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
             const auto &stack_elem = stack[stack_id];
@@ -1082,8 +1060,7 @@ sycl::event inclusive_scan_iter(sycl::queue &exec_q,
         }
 
         for (std::size_t reverse_stack_id = 0;
-             reverse_stack_id < stack.size() - 1; ++reverse_stack_id)
-        {
+             reverse_stack_id < stack.size() - 1; ++reverse_stack_id) {
             const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
 
             const auto &stack_elem = stack[stack_id];
diff --git a/dpnp/tensor/libtensor/include/kernels/clip.hpp b/dpnp/tensor/libtensor/include/kernels/clip.hpp
index 58a86a8f82d6..900fcf3df100 100644
--- a/dpnp/tensor/libtensor/include/kernels/clip.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/clip.hpp
@@ -219,8 +219,7 @@ sycl::event clip_contig_impl(sycl::queue &q,
         if (is_aligned<required_alignment>(x_cp) &&
             is_aligned<required_alignment>(min_cp) &&
             is_aligned<required_alignment>(max_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
+            is_aligned<required_alignment>(dst_cp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
             using Impl =
diff --git a/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
index d6001a11e471..2c4146d467e6 100644
--- a/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -352,8 +352,7 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
         const auto lws_range = sycl::range<1>(lws);
 
         if (is_aligned<required_alignment>(src_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
+            is_aligned<required_alignment>(dst_cp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName =
                 copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
@@ -920,10 +919,7 @@ struct CompositionIndexer
 {
     CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {}
 
-    auto operator()(std::size_t gid) const
-    {
-        return f_(t_(gid));
-    }
+    auto operator()(std::size_t gid) const { return f_(t_(gid)); }
 
 private:
     IndexerT f_;
@@ -944,10 +940,7 @@ struct RolledNDIndexer
     {
     }
 
-    ssize_t operator()(std::size_t gid) const
-    {
-        return compute_offset(gid);
-    }
+    ssize_t operator()(std::size_t gid) const { return compute_offset(gid); }
 
 private:
     int nd_ = -1;
diff --git a/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
index 37126a22dc64..a723f6334e7e 100644
--- a/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -261,10 +261,7 @@ typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)(
 template <typename fnT, typename T>
 struct AsCContigFactory
 {
-    fnT get()
-    {
-        return as_c_contiguous_array_generic_impl<T>;
-    }
+    fnT get() { return as_c_contiguous_array_generic_impl<T>; }
 };
 
 template <typename T,
@@ -496,8 +493,7 @@ sycl::event as_c_contiguous_batch_of_square_matrices_impl(
             else {
                 // map local_linear_id into (local_dim0, local_dim1)
                 for (std::uint16_t el_id = lid_lin;
-                     el_id < local_dim0 * local_dim1; el_id += lws0 * lws1)
-                {
+                     el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) {
 
                     // 0 <= local_i0 < local_dim0
                     const std::uint16_t loc_i0 = el_id / local_dim1;
@@ -577,10 +573,7 @@ typedef sycl::event (
 template <typename fnT, typename T>
 struct AsCContig1DBatchOfSquareMatricesFactory
 {
-    fnT get()
-    {
-        return as_c_contiguous_1d_batch_of_square_matrices_impl<T>;
-    }
+    fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl<T>; }
 };
 
 template <typename T>
@@ -638,9 +631,6 @@ typedef sycl::event (
 template <typename fnT, typename T>
 struct AsCContigNDBatchOfSquareMatricesFactory
 {
-    fnT get()
-    {
-        return as_c_contiguous_nd_batch_of_square_matrices_impl<T>;
-    }
+    fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl<T>; }
 };
 } // namespace dpctl::tensor::kernels::copy_as_contig
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index 1f0b3df33e4e..250ba1d70455 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -73,8 +73,7 @@ struct AbsFunctor
 
         if constexpr (std::is_same_v<argT, bool> ||
                       (std::is_integral<argT>::value &&
-                       std::is_unsigned<argT>::value))
-        {
+                       std::is_unsigned<argT>::value)) {
             static_assert(std::is_same_v<resT, argT>);
             return x;
         }
@@ -83,8 +82,7 @@ struct AbsFunctor
                 return detail::cabs(x);
             }
             else if constexpr (std::is_same_v<argT, sycl::half> ||
-                               std::is_floating_point_v<argT>)
-            {
+                               std::is_floating_point_v<argT>) {
                 return (sycl::signbit(x) ? -x : x);
             }
             else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 1b7440304f0e..c7386f99236a 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -71,23 +71,20 @@ struct AddFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using rT1 = typename argT1::value_type;
             using rT2 = typename argT2::value_type;
 
             return exprm_ns::complex<rT1>(in1) + exprm_ns::complex<rT2>(in2);
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           !tu_ns::is_complex<argT2>::value)
-        {
+                           !tu_ns::is_complex<argT2>::value) {
             using rT1 = typename argT1::value_type;
 
             return exprm_ns::complex<rT1>(in1) + in2;
         }
         else if constexpr (!tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             using rT2 = typename argT2::value_type;
 
             return in1 + exprm_ns::complex<rT2>(in2);
@@ -402,8 +399,7 @@ struct AddContigMatrixContigRowBroadcastFactory
             using resT = typename AddOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -449,8 +445,7 @@ struct AddContigRowContigMatrixBroadcastFactory
             using resT = typename AddOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -472,10 +467,7 @@ struct AddInplaceFunctor
     using supports_vec = std::negation<
         std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
 
-    void operator()(resT &res, const argT &in)
-    {
-        res += in;
-    }
+    void operator()(resT &res, const argT &in) { res += in; }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -672,8 +664,7 @@ struct AddInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index f72380ae3de9..32f5384f4ad8 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -123,8 +123,8 @@ struct AtanhFunctor
              */
             const realT RECIP_EPSILON =
                 realT(1) / std::numeric_limits<realT>::epsilon();
-            if (sycl::fabs(x) > RECIP_EPSILON || sycl::fabs(y) > RECIP_EPSILON)
-            {
+            if (sycl::fabs(x) > RECIP_EPSILON ||
+                sycl::fabs(y) > RECIP_EPSILON) {
                 const realT pi_half = sycl::atan(realT(1)) * 2;
 
                 const realT res_re = realT(0);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index d0b644c2f6bb..dae2e62a76b2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -372,8 +372,8 @@ struct BitwiseAndInplaceTypeMapFactory
     /*! @brief get typeid for output type of x &= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseAndInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (BitwiseAndInplaceTypePairSupport<argT,
+                                                       resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index 549a220fbabc..59279a803ed8 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -307,10 +307,7 @@ struct BitwiseLeftShiftInplaceFunctor
     using supports_sg_loadstore = typename std::true_type;
     using supports_vec = typename std::true_type;
 
-    void operator()(resT &res, const argT &in) const
-    {
-        impl(res, in);
-    }
+    void operator()(resT &res, const argT &in) const { impl(res, in); }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -392,9 +389,8 @@ struct BitwiseLeftShiftInplaceTypeMapFactory
     /*! @brief get typeid for output type of x <<= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseLeftShiftInplaceTypePairSupport<argT,
-                                                             resT>::is_defined)
-        {
+        if constexpr (BitwiseLeftShiftInplaceTypePairSupport<
+                          argT, resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index 49e05ac43f9a..241852b6a06e 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -309,10 +309,7 @@ struct BitwiseRightShiftInplaceFunctor
     using supports_sg_loadstore = typename std::true_type;
     using supports_vec = typename std::true_type;
 
-    void operator()(resT &res, const argT &in) const
-    {
-        impl(res, in);
-    }
+    void operator()(resT &res, const argT &in) const { impl(res, in); }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -396,9 +393,8 @@ struct BitwiseRightShiftInplaceTypeMapFactory
     /*! @brief get typeid for output type of x >>= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseRightShiftInplaceTypePairSupport<argT,
-                                                              resT>::is_defined)
-        {
+        if constexpr (BitwiseRightShiftInplaceTypePairSupport<
+                          argT, resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
@@ -436,9 +432,8 @@ struct BitwiseRightShiftInplaceContigFactory
 {
     fnT get()
     {
-        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
-                                                               T2>::is_defined)
-        {
+        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<
+                          T1, T2>::is_defined) {
             fnT fn = nullptr;
             return fn;
         }
@@ -477,9 +472,8 @@ struct BitwiseRightShiftInplaceStridedFactory
 {
     fnT get()
     {
-        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
-                                                               T2>::is_defined)
-        {
+        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<
+                          T1, T2>::is_defined) {
             fnT fn = nullptr;
             return fn;
         }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index 2238492d50d3..292cf3f76df6 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -375,8 +375,8 @@ struct BitwiseXorInplaceTypeMapFactory
     /*! @brief get typeid for output type of x ^= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (BitwiseXorInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (BitwiseXorInplaceTypePairSupport<argT,
+                                                       resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 57bbb09523a4..20fb0ea7bcda 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -67,10 +67,7 @@ struct CbrtFunctor
     // do both argTy and resTy support sugroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &in) const
-    {
-        return sycl::cbrt(in);
-    }
+    resT operator()(const argT &in) const { return sycl::cbrt(in); }
 };
 
 template <typename argTy,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index 1c072dc58fdc..cfe3f4898491 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -88,8 +88,8 @@ struct UnaryContigFunctor
         /* Each work-item processes vec_sz elements, contiguous in memory */
         /* NOTE: work-group size must be divisible by sub-group size */
 
-        if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
-        {
+        if constexpr (enable_sg_loadstore &&
+                      UnaryOperatorT::is_constant::value) {
             // value of operator is known to be a known constant
             constexpr resT const_val = UnaryOperatorT::constant_value;
 
@@ -120,8 +120,8 @@ struct UnaryContigFunctor
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryOperatorT::supports_sg_loadstore::value &&
-                           UnaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
+                           UnaryOperatorT::supports_vec::value &&
+                           (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -155,8 +155,7 @@ struct UnaryContigFunctor
         }
         else if constexpr (enable_sg_loadstore &&
                            UnaryOperatorT::supports_sg_loadstore::value &&
-                           std::is_same_v<resT, argT>)
-        {
+                           std::is_same_v<resT, argT>) {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -193,8 +192,7 @@ struct UnaryContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           UnaryOperatorT::supports_sg_loadstore::value)
-        {
+                           UnaryOperatorT::supports_sg_loadstore::value) {
             // default: use scalar-value function
 
             auto sg = ndit.get_sub_group();
@@ -290,16 +288,16 @@ SizeT select_lws(const sycl::device &, SizeT n_work_items_needed)
 }
 
 template <typename argTy,
-          template <typename T>
-          class UnaryOutputType,
+          template <typename T> class UnaryOutputType,
           template <typename A,
                     typename R,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable>
-          class ContigFunctorT,
-          template <typename A, typename R, std::uint8_t vs, std::uint8_t nv>
-          class kernel_name,
+                    bool enable> class ContigFunctorT,
+          template <typename A,
+                    typename R,
+                    std::uint8_t vs,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event unary_contig_impl(sycl::queue &exec_q,
@@ -328,8 +326,7 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
         cgh.depends_on(depends);
 
         if (is_aligned<required_alignment>(arg_p) &&
-            is_aligned<required_alignment>(res_p))
-        {
+            is_aligned<required_alignment>(res_p)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
             using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
@@ -356,12 +353,9 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
 }
 
 template <typename argTy,
-          template <typename T>
-          class UnaryOutputType,
-          template <typename A, typename R, typename I>
-          class StridedFunctorT,
-          template <typename A, typename R, typename I>
-          class kernel_name>
+          template <typename T> class UnaryOutputType,
+          template <typename A, typename R, typename I> class StridedFunctorT,
+          template <typename A, typename R, typename I> class kernel_name>
 sycl::event
     unary_strided_impl(sycl::queue &exec_q,
                        std::size_t nelems,
@@ -428,8 +422,7 @@ struct BinaryContigFunctor
 
         if constexpr (enable_sg_loadstore &&
                       BinaryOperatorT::supports_sg_loadstore::value &&
-                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
+                      BinaryOperatorT::supports_vec::value && (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -469,8 +462,7 @@ struct BinaryContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           BinaryOperatorT::supports_sg_loadstore::value)
-        {
+                           BinaryOperatorT::supports_sg_loadstore::value) {
             auto sg = ndit.get_sub_group();
             const std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -771,21 +763,18 @@ typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)(
 
 template <typename argTy1,
           typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryOutputType,
+          template <typename T1, typename T2> class BinaryOutputType,
           template <typename T1,
                     typename T2,
                     typename T3,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryContigFunctorT,
+                    bool enable_sg_loadstore> class BinaryContigFunctorT,
           template <typename T1,
                     typename T2,
                     typename T3,
                     std::uint8_t vs,
-                    std::uint8_t nv>
-          class kernel_name,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event binary_contig_impl(sycl::queue &exec_q,
@@ -821,8 +810,7 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
 
         if (is_aligned<required_alignment>(arg1_tp) &&
             is_aligned<required_alignment>(arg2_tp) &&
-            is_aligned<required_alignment>(res_tp))
-        {
+            is_aligned<required_alignment>(res_tp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = BaseKernelName;
             using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
@@ -849,12 +837,15 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
 
 template <typename argTy1,
           typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryOutputType,
-          template <typename T1, typename T2, typename T3, typename IndT>
-          class BinaryStridedFunctorT,
-          template <typename T1, typename T2, typename T3, typename IndT>
-          class kernel_name>
+          template <typename T1, typename T2> class BinaryOutputType,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename IndT> class BinaryStridedFunctorT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename IndT> class kernel_name>
 sycl::event
     binary_strided_impl(sycl::queue &exec_q,
                         std::size_t nelems,
@@ -893,13 +884,14 @@ sycl::event
     return comp_ev;
 }
 
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          template <typename T1, typename T2, typename T3>
-          class BinaryContigMatrixContigRowBroadcastFunctorT,
-          template <typename T1, typename T2, typename T3>
-          class kernel_name>
+template <
+    typename argT1,
+    typename argT2,
+    typename resT,
+    template <typename T1,
+              typename T2,
+              typename T3> class BinaryContigMatrixContigRowBroadcastFunctorT,
+    template <typename T1, typename T2, typename T3> class kernel_name>
 sycl::event binary_contig_matrix_contig_row_broadcast_impl(
     sycl::queue &exec_q,
     std::vector<sycl::event> &host_tasks,
@@ -967,13 +959,14 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
     return comp_ev;
 }
 
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          template <typename T1, typename T2, typename T3>
-          class BinaryContigRowContigMatrixBroadcastFunctorT,
-          template <typename T1, typename T2, typename T3>
-          class kernel_name>
+template <
+    typename argT1,
+    typename argT2,
+    typename resT,
+    template <typename T1,
+              typename T2,
+              typename T3> class BinaryContigRowContigMatrixBroadcastFunctorT,
+    template <typename T1, typename T2, typename T3> class kernel_name>
 sycl::event binary_contig_row_contig_matrix_broadcast_impl(
     sycl::queue &exec_q,
     std::vector<sycl::event> &host_tasks,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
index b304b5ac3a39..68d025ec6307 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
@@ -58,10 +58,9 @@ sycl::event
         sycl::range<1> gRange{padded_vec_sz};
 
         cgh.parallel_for<class populate_padded_vec_krn<T>>(
-            gRange, [=](sycl::id<1> id)
-        {
-            std::size_t i = id[0];
-            padded_vec[i] = vec[i % vec_sz];
+            gRange, [=](sycl::id<1> id) {
+                std::size_t i = id[0];
+                padded_vec[i] = vec[i % vec_sz];
             });
     });
 
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 2c028bc30155..61902fce888a 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -92,8 +92,7 @@ struct BinaryInplaceContigFunctor
         if constexpr (enable_sg_loadstore &&
                       BinaryInplaceOperatorT::supports_sg_loadstore::value &&
                       BinaryInplaceOperatorT::supports_vec::value &&
-                      (vec_sz > 1))
-        {
+                      (vec_sz > 1)) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -130,8 +129,8 @@ struct BinaryInplaceContigFunctor
             }
         }
         else if constexpr (enable_sg_loadstore &&
-                           BinaryInplaceOperatorT::supports_sg_loadstore::value)
-        {
+                           BinaryInplaceOperatorT::supports_sg_loadstore::
+                               value) {
             auto sg = ndit.get_sub_group();
             std::uint16_t sgSize = sg.get_max_local_range()[0];
 
@@ -312,10 +311,11 @@ template <typename argTy,
                     typename T2,
                     std::uint8_t vs,
                     std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryInplaceContigFunctorT,
-          template <typename T1, typename T2, std::uint8_t vs, std::uint8_t nv>
-          class kernel_name,
+                    bool enable_sg_loadstore> class BinaryInplaceContigFunctorT,
+          template <typename T1,
+                    typename T2,
+                    std::uint8_t vs,
+                    std::uint8_t nv> class kernel_name,
           std::uint8_t vec_sz = 4u,
           std::uint8_t n_vecs = 2u>
 sycl::event
@@ -341,8 +341,7 @@ sycl::event
         resTy *res_tp = reinterpret_cast<resTy *>(lhs_p) + lhs_offset;
 
         if (is_aligned<required_alignment>(arg_tp) &&
-            is_aligned<required_alignment>(res_tp))
-        {
+            is_aligned<required_alignment>(res_tp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
             using Impl =
@@ -372,10 +371,10 @@ sycl::event
 
 template <typename argTy,
           typename resTy,
-          template <typename T1, typename T2, typename IndT>
-          class BinaryInplaceStridedFunctorT,
-          template <typename T1, typename T2, typename IndT>
-          class kernel_name>
+          template <typename T1,
+                    typename T2,
+                    typename IndT> class BinaryInplaceStridedFunctorT,
+          template <typename T1, typename T2, typename IndT> class kernel_name>
 sycl::event binary_inplace_strided_impl(
     sycl::queue &exec_q,
     std::size_t nelems,
@@ -410,10 +409,9 @@ sycl::event binary_inplace_strided_impl(
 
 template <typename argT,
           typename resT,
-          template <typename T1, typename T3>
-          class BinaryInplaceRowMatrixBroadcastFunctorT,
-          template <typename T1, typename T3>
-          class kernel_name>
+          template <typename T1,
+                    typename T3> class BinaryInplaceRowMatrixBroadcastFunctorT,
+          template <typename T1, typename T3> class kernel_name>
 sycl::event binary_inplace_row_matrix_broadcast_impl(
     sycl::queue &exec_q,
     std::vector<sycl::event> &host_tasks,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index 3a838e919369..07b3566c5cef 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -73,8 +73,7 @@ struct EqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -84,8 +83,7 @@ struct EqualFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? false : (static_cast<argT2>(in1) == in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 19ee9d268770..e669a97c04ea 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -128,10 +128,7 @@ struct FloorDivideFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT1,
@@ -401,10 +398,7 @@ struct FloorDivideInplaceFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT,
@@ -462,8 +456,8 @@ struct FloorDivideInplaceTypeMapFactory
     /*! @brief get typeid for output type of x //= y */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (FloorDivideInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (FloorDivideInplaceTypePairSupport<argT,
+                                                        resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index 3e38b5f4deca..9b3659faa161 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -73,8 +73,7 @@ struct GreaterFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::greater_complex;
             return greater_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct GreaterFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? false : (static_cast<argT2>(in1) > in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 029741b02600..25c56d4d40a4 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -73,8 +73,7 @@ struct GreaterEqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::greater_equal_complex;
             return greater_equal_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct GreaterEqualFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? false : (static_cast<argT2>(in1) >= in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index 5b8ee877981f..8eb435c089d8 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -84,8 +84,7 @@ struct IsFiniteFunctor
             return (real_isfinite && imag_isfinite);
         }
         else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
+                           std::is_integral<argT>::value) {
             return constant_value;
         }
         else if constexpr (std::is_same_v<argT, sycl::half>) {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index 89ba83df9268..b7d85e21a1f2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -82,8 +82,7 @@ struct IsInfFunctor
             return (real_isinf || imag_isinf);
         }
         else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
+                           std::is_integral<argT>::value) {
             return constant_value;
         }
         else if constexpr (std::is_same_v<argT, sycl::half>) {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index f78b724bf2d3..cad2d2239de0 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -83,8 +83,7 @@ struct IsNanFunctor
             return (real_isnan || imag_isnan);
         }
         else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
+                           std::is_integral<argT>::value) {
             return constant_value;
         }
         else {
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index 7f1c68c5c65c..19077936372e 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -73,8 +73,7 @@ struct LessFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::less_complex;
             return less_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct LessFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? true : (static_cast<argT2>(in1) < in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index a8c58ee31277..a0b23693e70d 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -73,8 +73,7 @@ struct LessEqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::less_equal_complex;
             return less_equal_complex<argT1>(in1, in2);
@@ -82,8 +81,7 @@ struct LessEqualFunctor
         else {
             if constexpr (std::is_integral_v<argT1> &&
                           std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
+                          std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
                 if constexpr (std::is_signed_v<argT1> &&
                               !std::is_signed_v<argT2>) {
                     return (in1 < 0) ? true : (static_cast<argT2>(in1) <= in2);
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index af6f95863e65..52494cceba93 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -73,15 +73,13 @@ struct MaximumFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::max_complex;
             return max_complex<argT1>(in1, in2);
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
-                           std::is_same_v<argT1, sycl::half>)
-        {
+                           std::is_same_v<argT1, sycl::half>) {
             const bool choose_first = (sycl::isnan(in1) || (in1 > in2));
             return (choose_first) ? in1 : in2;
         }
@@ -101,8 +99,7 @@ struct MaximumFunctor
             const auto &v1 = in1[i];
             const auto &v2 = in2[i];
             if constexpr (std::is_floating_point_v<argT1> ||
-                          std::is_same_v<argT1, sycl::half>)
-            {
+                          std::is_same_v<argT1, sycl::half>) {
                 const bool choose_first = (sycl::isnan(v1) || (v1 > v2));
                 res[i] = (choose_first) ? v1 : v2;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 0a95987449a1..c11961f8c5c0 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -73,15 +73,13 @@ struct MinimumFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             static_assert(std::is_same_v<argT1, argT2>);
             using dpctl::tensor::math_utils::min_complex;
             return min_complex<argT1>(in1, in2);
         }
         else if constexpr (std::is_floating_point_v<argT1> ||
-                           std::is_same_v<argT1, sycl::half>)
-        {
+                           std::is_same_v<argT1, sycl::half>) {
             const bool choose_first = sycl::isnan(in1) || (in1 < in2);
             return (choose_first) ? in1 : in2;
         }
@@ -101,8 +99,7 @@ struct MinimumFunctor
             const auto &v1 = in1[i];
             const auto &v2 = in2[i];
             if constexpr (std::is_floating_point_v<argT1> ||
-                          std::is_same_v<argT1, sycl::half>)
-            {
+                          std::is_same_v<argT1, sycl::half>) {
                 const bool choose_first = sycl::isnan(v1) || (v1 < v2);
                 res[i] = (choose_first) ? v1 : v2;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index 587a05106ead..58ff88b3afeb 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -72,8 +72,7 @@ struct MultiplyFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -358,8 +357,7 @@ struct MultiplyContigMatrixContigRowBroadcastFactory
             using resT = typename MultiplyOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -406,8 +404,7 @@ struct MultiplyContigRowContigMatrixBroadcastFactory
             using resT = typename MultiplyOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -430,10 +427,7 @@ struct MultiplyInplaceFunctor
     using supports_vec = std::negation<
         std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
 
-    void operator()(resT &res, const argT &in)
-    {
-        res *= in;
-    }
+    void operator()(resT &res, const argT &in) { res *= in; }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -632,8 +626,7 @@ struct MultiplyInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index f90786013557..e0ac856a3818 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -68,10 +68,7 @@ struct NegativeFunctor
     using supports_sg_loadstore = typename std::negation<
         std::disjunction<is_complex<resT>, is_complex<argT>>>;
 
-    resT operator()(const argT &x) const
-    {
-        return -x;
-    }
+    resT operator()(const argT &x) const { return -x; }
 };
 
 template <typename argT,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index 224e3fbe5b77..007f374b6386 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -73,8 +73,7 @@ struct NotEqualFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (std::is_integral_v<argT1> && std::is_integral_v<argT2> &&
-                      std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-        {
+                      std::is_signed_v<argT1> != std::is_signed_v<argT2>) {
             if constexpr (std::is_signed_v<argT1> && !std::is_signed_v<argT2>) {
                 return (in1 < 0) ? true : (static_cast<argT2>(in1) != in2);
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index c1ef29c709ab..fb351b6e50d2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -70,10 +70,7 @@ struct PositiveFunctor
     using supports_sg_loadstore = typename std::negation<
         std::disjunction<is_complex<resT>, is_complex<argT>>>;
 
-    resT operator()(const argT &x) const
-    {
-        return x;
-    }
+    resT operator()(const argT &x) const { return x; }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index 46489f45985e..1c669ec894d2 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -94,8 +94,7 @@ struct PowFunctor
             return res;
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -143,9 +142,8 @@ struct PowFunctor
         }
         else {
             auto res = sycl::pow(in1, in2);
-            if constexpr (std::is_same_v<resT,
-                                         typename decltype(res)::element_type>)
-            {
+            if constexpr (std::is_same_v<
+                              resT, typename decltype(res)::element_type>) {
                 return res;
             }
             else {
@@ -400,8 +398,7 @@ struct PowInplaceFunctor
             res = res_tmp;
         }
         else if constexpr (tu_ns::is_complex<argT>::value &&
-                           tu_ns::is_complex<resT>::value)
-        {
+                           tu_ns::is_complex<resT>::value) {
             using r_resT = typename resT::value_type;
             using r_argT = typename argT::value_type;
 
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index ca87d0f41605..65cd97dbe56d 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -146,10 +146,7 @@ struct RemainderFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT1,
@@ -429,10 +426,7 @@ struct RemainderInplaceFunctor
     }
 
 private:
-    bool l_xor(bool b1, bool b2) const
-    {
-        return (b1 != b2);
-    }
+    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
 };
 
 template <typename argT,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index 0228aecdca67..aa4f1113d839 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -67,10 +67,7 @@ struct RsqrtFunctor
     // do both argTy and resTy support sugroup store/load operation
     using supports_sg_loadstore = typename std::true_type;
 
-    resT operator()(const argT &in) const
-    {
-        return sycl::rsqrt(in);
-    }
+    resT operator()(const argT &in) const { return sycl::rsqrt(in); }
 };
 
 template <typename argTy,
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index d67120633efd..65e9e5a202a9 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -69,10 +69,7 @@ struct SignbitFunctor
     using supports_vec = std::true_type;
     using supports_sg_loadstore = std::true_type;
 
-    resT operator()(const argT &in) const
-    {
-        return std::signbit(in);
-    }
+    resT operator()(const argT &in) const { return std::signbit(in); }
 
     template <int vec_sz>
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index dfd9ac72b860..431596594ad3 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -355,8 +355,7 @@ struct SubtractContigMatrixContigRowBroadcastFactory
             using resT = typename SubtractOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -405,8 +404,7 @@ struct SubtractContigRowContigMatrixBroadcastFactory
             using resT = typename SubtractOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -429,10 +427,7 @@ struct SubtractInplaceFunctor
     using supports_vec = std::negation<
         std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
 
-    void operator()(resT &res, const argT &in)
-    {
-        res -= in;
-    }
+    void operator()(resT &res, const argT &in) { res -= in; }
 
     template <int vec_sz>
     void operator()(sycl::vec<resT, vec_sz> &res,
@@ -630,8 +625,7 @@ struct SubtractInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 1372663b96c5..caa1cd2029c4 100644
--- a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -71,8 +71,7 @@ struct TrueDivideFunctor
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
         if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
+                      tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
             using realT2 = typename argT2::value_type;
 
@@ -80,15 +79,13 @@ struct TrueDivideFunctor
                    exprm_ns::complex<realT2>(in2);
         }
         else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           !tu_ns::is_complex<argT2>::value)
-        {
+                           !tu_ns::is_complex<argT2>::value) {
             using realT1 = typename argT1::value_type;
 
             return exprm_ns::complex<realT1>(in1) / in2;
         }
         else if constexpr (!tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
+                           tu_ns::is_complex<argT2>::value) {
             using realT2 = typename argT2::value_type;
 
             return in1 / exprm_ns::complex<realT2>(in2);
@@ -362,8 +359,7 @@ struct TrueDivideContigMatrixContigRowBroadcastFactory
             using resT = typename TrueDivideOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -412,8 +408,7 @@ struct TrueDivideContigRowContigMatrixBroadcastFactory
             using resT = typename TrueDivideOutputType<T1, T2>::value_type;
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
                           dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<resT>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
@@ -498,8 +493,8 @@ struct TrueDivideInplaceTypeMapFactory
     /*! @brief get typeid for output type of divide(T1 x, T2 y) */
     std::enable_if_t<std::is_same<fnT, int>::value, int> get()
     {
-        if constexpr (TrueDivideInplaceTypePairSupport<argT, resT>::is_defined)
-        {
+        if constexpr (TrueDivideInplaceTypePairSupport<argT,
+                                                       resT>::is_defined) {
             return td_ns::GetTypeid<resT>{}.get();
         }
         else {
@@ -652,8 +647,7 @@ struct TrueDivideInplaceRowMatrixBroadcastFactory
         }
         else {
             if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
+                          dpctl::tensor::type_utils::is_complex<T2>::value) {
                 fnT fn = nullptr;
                 return fn;
             }
diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
index b7f996bfa797..b987ff2988be 100644
--- a/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
@@ -172,8 +172,7 @@ struct DotProductFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -270,8 +269,7 @@ struct DotProductCustomFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -301,14 +299,16 @@ struct DotProductCustomFunctor
     }
 };
 
-template <
-    typename lhsTy,
-    typename rhsTy,
-    typename resTy,
-    typename BatchIndexerT,
-    typename RedIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename lhsTy,
+          typename rhsTy,
+          typename resTy,
+          typename BatchIndexerT,
+          typename RedIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event sequential_dot_product(sycl::queue &exec_q,
                                    const lhsTy *lhs,
                                    const rhsTy *rhs,
@@ -345,8 +345,7 @@ template <typename lhsTy,
                     typename T3,
                     typename T4,
                     typename T5,
-                    typename T6>
-          class kernel_name_token>
+                    typename T6> class kernel_name_token>
 sycl::event submit_atomic_dot_product(sycl::queue &exec_q,
                                       const lhsTy *lhs,
                                       const rhsTy *rhs,
@@ -716,8 +715,7 @@ struct DotProductNoAtomicFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -817,8 +815,7 @@ struct DotProductNoAtomicCustomFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
             const auto &lhs_reduction_offset =
                 reduction_offsets_.get_first_offset();
@@ -858,8 +855,7 @@ template <typename lhsTy,
                     typename T3,
                     typename T4,
                     typename T5,
-                    typename T6>
-          class kernel_name_token>
+                    typename T6> class kernel_name_token>
 sycl::event
     submit_no_atomic_dot_product(sycl::queue &exec_q,
                                  const lhsTy *lhs,
diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
index 8f84d950c0cd..5644ea172a1d 100644
--- a/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
@@ -72,8 +72,7 @@ void scale_gemm_k_parameters(const std::size_t &local_mem_size,
     static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups;
 
     while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >=
-           local_mem_size)
-    {
+           local_mem_size) {
         n_wi = n_wi / 2;
         delta_n = delta_n / 2;
         if (delta_n == 0)
@@ -95,8 +94,7 @@ void scale_gemm_nm_parameters(const std::size_t &local_mem_size,
     while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) +
                (wi_delta_k * wg_delta_m * slm_B_elem_size) +
                reserved_slm_size >=
-           local_mem_size)
-    {
+           local_mem_size) {
         wg_delta_n /= 2;
         wg_delta_m /= 2;
         wi_delta_k /= 2;
@@ -641,8 +639,8 @@ class GemmBatchFunctorThreadK
                 else {
                     accV_t local_B_vec;
 #pragma unroll
-                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
-                    {
+                    for (std::size_t vec_idx = 0; vec_idx < m_groups;
+                         ++vec_idx) {
                         local_B_vec[vec_idx] =
                             (sq < k && j + vec_idx < m)
                                 ? static_cast<resT>(
@@ -1006,8 +1004,7 @@ class GemmBatchFunctorThreadNM_vecm
             // populate local_lhs_block<resT> ( wg_delta_n * wi_delta_n,
             // wi_delta_k)
             for (std::uint32_t vid = lid; vid < local_lhs_block.size();
-                 vid += it.get_local_range()[0])
-            {
+                 vid += it.get_local_range()[0]) {
                 // 0 <= v_i < wg_delta_n * wi_delta_n
                 const std::uint32_t v_i = vid / wi_delta_k;
                 // 0 <= v_s < wi_delta_k
@@ -1029,8 +1026,7 @@ class GemmBatchFunctorThreadNM_vecm
             // populate local_rhs_block<vec<resT, m_vec_size>> ( wg_delta_m *
             // wi_delta_m_vecs, wi_delta_k )
             for (std::uint32_t vid = lid; vid < local_rhs_block.size();
-                 vid += it.get_local_range()[0])
-            {
+                 vid += it.get_local_range()[0]) {
                 // 0 <= v_j < wg_delta_m * wi_delta_m_vecs
                 const std::uint32_t v_j = vid / wi_delta_k;
                 // 0 <= v_s < wi_delta_k
@@ -1091,8 +1087,8 @@ class GemmBatchFunctorThreadNM_vecm
 #pragma unroll
                 for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
 #pragma unroll
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs;
+                         ++pr_j) {
                         private_C[pr_i * wi_delta_m_vecs + pr_j] +=
                             pr_lhs[pr_i] * pr_rhs[pr_j];
                     }
@@ -1108,8 +1104,8 @@ class GemmBatchFunctorThreadNM_vecm
                 std::size_t out_i = i + local_i + pr_i * wg_delta_n;
                 if (out_i < n) {
 #pragma unroll
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs;
+                         ++pr_j) {
                         const std::size_t out_j =
                             j + (local_j + pr_j * wg_delta_m) * m_vec_size;
                         const std::size_t out_flat_id =
@@ -1128,8 +1124,8 @@ class GemmBatchFunctorThreadNM_vecm
                 std::size_t out_i = i + local_i + pr_i * wg_delta_n;
                 if (out_i < n) {
                     // could be unrolled
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
+                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs;
+                         ++pr_j) {
                         std::size_t out_j =
                             j + (local_j + pr_j * wg_delta_m) * m_vec_size;
 #pragma unroll
@@ -1168,18 +1164,12 @@ struct GemmBatchFunctorThreadNM_vecm_HyperParameters
     {
     }
 
-    constexpr std::uint32_t get_wi_delta_n() const
-    {
-        return wi_delta_n;
-    }
+    constexpr std::uint32_t get_wi_delta_n() const { return wi_delta_n; }
     constexpr std::uint32_t get_wi_delta_m_vecs() const
     {
         return wi_delta_m_vecs;
     }
-    constexpr std::uint32_t get_m_vec_size() const
-    {
-        return m_vec_size;
-    }
+    constexpr std::uint32_t get_m_vec_size() const { return m_vec_size; }
 };
 
 template <typename resT>
@@ -1937,8 +1927,8 @@ class GemmBatchNoAtomicFunctorThreadNM
             else {
                 slmB_t vec{};
 #pragma unroll
-                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
-                {
+                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m;
+                     ++lane_id) {
                     std::size_t g_j1 = g_j + lane_id;
                     vec[lane_id] =
                         (g_j1 < m && g_s < k)
@@ -1966,8 +1956,8 @@ class GemmBatchNoAtomicFunctorThreadNM
             const std::size_t a_pr_offset = private_i * wi_delta_k;
 
             slmB_t local_sum(identity_);
-            for (std::size_t private_s = 0; private_s < wi_delta_k; ++private_s)
-            {
+            for (std::size_t private_s = 0; private_s < wi_delta_k;
+                 ++private_s) {
                 local_sum = local_sum +
                             (local_A_block[a_offset + a_pr_offset + private_s] *
                              local_B_block[b_offset + private_s]);
@@ -1984,8 +1974,8 @@ class GemmBatchNoAtomicFunctorThreadNM
             }
             else {
 #pragma unroll
-                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
-                {
+                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m;
+                     ++lane_id) {
                     const std::size_t gl_j = j + lane_id;
 
                     if (gl_i < n && gl_j < m) {
@@ -2111,8 +2101,8 @@ class GemmBatchNoAtomicFunctorThreadK
                 else {
                     accV_t local_B_vec;
 #pragma unroll
-                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
-                    {
+                    for (std::size_t vec_idx = 0; vec_idx < m_groups;
+                         ++vec_idx) {
                         local_B_vec[vec_idx] =
                             (sq < k && j + vec_idx < m)
                                 ? static_cast<resT>(
diff --git a/dpnp/tensor/libtensor/include/kernels/reductions.hpp b/dpnp/tensor/libtensor/include/kernels/reductions.hpp
index ee6431dec637..75df2c201968 100644
--- a/dpnp/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/reductions.hpp
@@ -138,8 +138,7 @@ struct SequentialReduction
             using dpctl::tensor::type_utils::convert_impl;
             outT val;
             if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                 val = convert_impl<bool, argT>(inp_[inp_offset]);
             }
             else {
@@ -221,8 +220,7 @@ struct ReductionOverGroupWithAtomicFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto inp_reduction_offset =
                 inp_reduced_dims_indexer_(arg_reduce_gid);
             auto inp_offset = inp_iter_offset + inp_reduction_offset;
@@ -230,8 +228,7 @@ struct ReductionOverGroupWithAtomicFunctor
             using dpctl::tensor::type_utils::convert_impl;
             outT val;
             if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                 // handle nans
                 val = convert_impl<bool, argT>(inp_[inp_offset]);
             }
@@ -356,8 +353,7 @@ struct CustomReductionOverGroupWithAtomicFunctor
             reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
 
         for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) {
             auto inp_reduction_offset =
                 inp_reduced_dims_indexer_(arg_reduce_gid);
             auto inp_offset = inp_iter_offset + inp_reduction_offset;
@@ -365,8 +361,7 @@ struct CustomReductionOverGroupWithAtomicFunctor
             using dpctl::tensor::type_utils::convert_impl;
             outT val;
             if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
+                          su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                 // handle nans
                 val = convert_impl<bool, argT>(inp_[inp_offset]);
             }
@@ -401,8 +396,8 @@ struct CustomReductionOverGroupWithAtomicFunctor
                                                        ReductionOp>::value) {
                 res_ref.fetch_and(red_val_over_wg);
             }
-            else if constexpr (su_ns::IsSyclLogicalOr<outT, ReductionOp>::value)
-            {
+            else if constexpr (su_ns::IsSyclLogicalOr<outT,
+                                                      ReductionOp>::value) {
                 res_ref.fetch_or(red_val_over_wg);
             }
             else {
@@ -487,8 +482,7 @@ struct ReductionOverGroupNoAtomicFunctor
                 using dpctl::tensor::type_utils::convert_impl;
                 outT val;
                 if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                              su_ns::IsLogicalOr<outT, ReductionOp>::value)
-                {
+                              su_ns::IsLogicalOr<outT, ReductionOp>::value) {
                     // handle nans
                     val = convert_impl<bool, argT>(inp_[inp_offset]);
                 }
@@ -600,8 +594,7 @@ struct CustomReductionOverGroupNoAtomicFunctor
                 if constexpr (std::is_same_v<ReductionOp,
                                              sycl::logical_and<outT>> ||
                               std::is_same_v<ReductionOp,
-                                             sycl::logical_or<outT>>)
-                {
+                                             sycl::logical_or<outT>>) {
                     // handle nans
                     val = convert_impl<bool, argT>(inp_[inp_offset]);
                 }
@@ -626,14 +619,16 @@ struct CustomReductionOverGroupNoAtomicFunctor
     }
 };
 
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event
     sequential_reduction(sycl::queue &exec_q,
                          const argTy *arg,
@@ -666,14 +661,16 @@ sycl::event
 template <typename BasedKernelName>
 class custom_reduction_wrapper;
 
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event
     submit_atomic_reduction(sycl::queue &exec_q,
                             const argTy *arg,
@@ -1051,14 +1048,16 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
 
 /* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */
 
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename InputOutputIterIndexerT,
+          typename ReductionIndexerT,
+          template <typename T1,
+                    typename T2,
+                    typename T3,
+                    typename T4,
+                    typename T5> class kernel_name_token>
 sycl::event submit_no_atomic_reduction(
     sycl::queue &exec_q,
     const argTy *arg,
@@ -1928,15 +1927,13 @@ struct SequentialSearchReduction
                         // less_complex always returns false for NaNs, so check
                         if (less_complex<argT>(val, red_val) ||
                             std::isnan(std::real(val)) ||
-                            std::isnan(std::imag(val)))
-                        {
+                            std::isnan(std::imag(val))) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
                         }
                     }
                     else if constexpr (std::is_floating_point_v<argT> ||
-                                       std::is_same_v<argT, sycl::half>)
-                    {
+                                       std::is_same_v<argT, sycl::half>) {
                         if (val < red_val || std::isnan(val)) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
@@ -1955,15 +1952,13 @@ struct SequentialSearchReduction
                         using dpctl::tensor::math_utils::greater_complex;
                         if (greater_complex<argT>(val, red_val) ||
                             std::isnan(std::real(val)) ||
-                            std::isnan(std::imag(val)))
-                        {
+                            std::isnan(std::imag(val))) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
                         }
                     }
                     else if constexpr (std::is_floating_point_v<argT> ||
-                                       std::is_same_v<argT, sycl::half>)
-                    {
+                                       std::is_same_v<argT, sycl::half>) {
                         if (val > red_val || std::isnan(val)) {
                             red_val = val;
                             idx_val = static_cast<outT>(m);
@@ -2243,8 +2238,7 @@ struct CustomSearchReduction
                             // check
                             if (less_complex<argT>(val, local_red_val) ||
                                 std::isnan(std::real(val)) ||
-                                std::isnan(std::imag(val)))
-                            {
+                                std::isnan(std::imag(val))) {
                                 local_red_val = val;
                                 if constexpr (!First) {
                                     local_idx = inds_[inp_offset];
@@ -2256,8 +2250,7 @@ struct CustomSearchReduction
                             }
                         }
                         else if constexpr (std::is_floating_point_v<argT> ||
-                                           std::is_same_v<argT, sycl::half>)
-                        {
+                                           std::is_same_v<argT, sycl::half>) {
                             if (val < local_red_val || std::isnan(val)) {
                                 local_red_val = val;
                                 if constexpr (!First) {
@@ -2289,8 +2282,7 @@ struct CustomSearchReduction
                             using dpctl::tensor::math_utils::greater_complex;
                             if (greater_complex<argT>(val, local_red_val) ||
                                 std::isnan(std::real(val)) ||
-                                std::isnan(std::imag(val)))
-                            {
+                                std::isnan(std::imag(val))) {
                                 local_red_val = val;
                                 if constexpr (!First) {
                                     local_idx = inds_[inp_offset];
@@ -2302,8 +2294,7 @@ struct CustomSearchReduction
                             }
                         }
                         else if constexpr (std::is_floating_point_v<argT> ||
-                                           std::is_same_v<argT, sycl::half>)
-                        {
+                                           std::is_same_v<argT, sycl::half>) {
                             if (val > local_red_val || std::isnan(val)) {
                                 local_red_val = val;
                                 if constexpr (!First) {
@@ -2347,8 +2338,7 @@ struct CustomSearchReduction
                             : idx_identity_;
         }
         else if constexpr (std::is_floating_point_v<argT> ||
-                           std::is_same_v<argT, sycl::half>)
-        {
+                           std::is_same_v<argT, sycl::half>) {
             // equality does not hold for NaNs, so check here
             local_idx =
                 (red_val_over_wg == local_red_val || std::isnan(local_red_val))
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
index a047c172f7bc..75d3dc5f01a0 100644
--- a/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
@@ -190,8 +190,8 @@ void merge_impl(const std::size_t offset,
             // Handle intermediate items
             if (l_search_bound_1 == r_search_bound_1) {
                 const std::size_t shift_1 = l_search_bound_1 - start_1;
-                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
-                {
+                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1;
+                     ++idx) {
                     const auto intermediate_item_2 = in_acc[idx];
                     const std::size_t shift_2 = idx - start_2;
                     out_acc[start_out + shift_1 + shift_2] =
@@ -199,8 +199,8 @@ void merge_impl(const std::size_t offset,
                 }
             }
             else {
-                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
-                {
+                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1;
+                     ++idx) {
                     const auto intermediate_item_2 = in_acc[idx];
                     // we shouldn't seek in whole 1st sequence. Just for the
                     // part where the 2nd sequence should be
@@ -282,10 +282,7 @@ struct GetValueType<sycl::buffer<ElementType, Dim, AllocatorT>>
 template <typename Iter>
 struct GetReadOnlyAccess
 {
-    Iter operator()(const Iter &it, sycl::handler &)
-    {
-        return it;
-    }
+    Iter operator()(const Iter &it, sycl::handler &) { return it; }
 };
 
 template <typename ElementType, int Dim, typename AllocatorT>
@@ -302,10 +299,7 @@ struct GetReadOnlyAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
 template <typename Iter>
 struct GetWriteDiscardAccess
 {
-    Iter operator()(Iter it, sycl::handler &)
-    {
-        return it;
-    }
+    Iter operator()(Iter it, sycl::handler &) { return it; }
 };
 
 template <typename ElementType, int Dim, typename AllocatorT>
@@ -322,10 +316,7 @@ struct GetWriteDiscardAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
 template <typename Iter>
 struct GetReadWriteAccess
 {
-    Iter operator()(Iter &it, sycl::handler &)
-    {
-        return it;
-    }
+    Iter operator()(Iter &it, sycl::handler &) { return it; }
 };
 
 template <typename ElementType, int Dim, typename AllocatorT>
@@ -479,8 +470,7 @@ sycl::event sort_over_work_group_contig_impl(
 
             // load input into SLM
             for (std::size_t array_id = segment_start_idx + lid;
-                 array_id < segment_end_idx; array_id += lws)
-            {
+                 array_id < segment_end_idx; array_id += lws) {
                 T v = (array_id < sort_nelems)
                           ? input_acc[iter_id * sort_nelems + array_id]
                           : T{};
@@ -505,8 +495,7 @@ sycl::event sort_over_work_group_contig_impl(
             const std::size_t max_chunks_merged =
                 1 + ((wg_chunk_size - 1) / chunk);
             for (; n_chunks_merged < max_chunks_merged;
-                 data_in_temp = !data_in_temp, n_chunks_merged *= 2)
-            {
+                 data_in_temp = !data_in_temp, n_chunks_merged *= 2) {
                 const std::size_t nelems_sorted_so_far =
                     n_chunks_merged * chunk;
                 const std::size_t q = (lid / n_chunks_merged);
@@ -531,8 +520,7 @@ sycl::event sort_over_work_group_contig_impl(
 
             const auto &out_src = (data_in_temp) ? scratch_space : work_space;
             for (std::size_t array_id = segment_start_idx + lid;
-                 array_id < segment_end_idx; array_id += lws)
-            {
+                 array_id < segment_end_idx; array_id += lws) {
                 if (array_id < sort_nelems) {
                     output_acc[iter_id * sort_nelems + array_id] =
                         out_src[array_id - segment_start_idx];
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
index 940c6d802a9a..5baa98e237df 100644
--- a/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
@@ -374,8 +374,7 @@ sycl::event
             // each work-item in the order of their local ids
             const std::uint32_t count_start_id = radix_states * lid;
             for (std::uint32_t radix_state_id = 0;
-                 radix_state_id < radix_states; ++radix_state_id)
-            {
+                 radix_state_id < radix_states; ++radix_state_id) {
                 counts_lacc[count_start_id + radix_state_id] =
                     counts_arr[radix_state_id];
             }
@@ -396,8 +395,7 @@ sycl::event
             // count per work-group: reduce until count_lacc[] size >
             // radix_states (n_witems /= 2 per iteration)
             for (std::uint32_t n_witems = (wg_size >> 1);
-                 n_witems >= radix_states; n_witems >>= 1)
-            {
+                 n_witems >= radix_states; n_witems >>= 1) {
                 if (lid < n_witems)
                     counts_lacc[lid] += counts_lacc[n_witems + lid];
 
@@ -478,8 +476,8 @@ sycl::event radix_sort_scan_submit(sycl::queue &exec_q,
 
             // NB: No race condition here, because the condition may ever be
             // true for only on one WG, one WI.
-            if ((lid == wg_size - 1) && (begin_ptr[scan_size - 1] == n_values))
-            {
+            if ((lid == wg_size - 1) &&
+                (begin_ptr[scan_size - 1] == n_values)) {
                 // set flag, since all the values got into one
                 // this is optimization, may happy often for
                 // higher radix offsets (all zeros)
@@ -794,8 +792,7 @@ sycl::event
             offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id];
 
             for (std::uint32_t radix_state_id = 1;
-                 radix_state_id < radix_states; ++radix_state_id)
-            {
+                 radix_state_id < radix_states; ++radix_state_id) {
                 const std::uint32_t local_offset_id =
                     segment_id + scan_size * radix_state_id;
 
@@ -835,8 +832,7 @@ sycl::event
 
                     OffsetT new_offset_id = 0;
                     for (std::uint32_t radix_state_id = 0;
-                         radix_state_id < radix_states; ++radix_state_id)
-                    {
+                         radix_state_id < radix_states; ++radix_state_id) {
                         bool is_current_bucket = (bucket_id == radix_state_id);
                         std::uint32_t sg_total_offset =
                             peer_prefix_hlp.peer_contribution(
@@ -864,8 +860,7 @@ sycl::event
 
                     OffsetT new_offset_id = 0;
                     for (std::uint32_t radix_state_id = 0;
-                         radix_state_id < radix_states; ++radix_state_id)
-                    {
+                         radix_state_id < radix_states; ++radix_state_id) {
                         bool is_current_bucket = (bucket_id == radix_state_id);
                         std::uint32_t sg_total_offset =
                             peer_prefix_hlp.peer_contribution(
@@ -899,8 +894,7 @@ sycl::event
 
                 OffsetT new_offset_id = 0;
                 for (std::uint32_t radix_state_id = 0;
-                     radix_state_id < radix_states; ++radix_state_id)
-                {
+                     radix_state_id < radix_states; ++radix_state_id) {
                     bool is_current_bucket = (bucket_id == radix_state_id);
                     std::uint32_t sg_total_offset =
                         peer_prefix_hlp.peer_contribution(
@@ -1038,8 +1032,7 @@ struct parallel_radix_sort_iteration_step
         static constexpr std::size_t sg16_v = 16u;
         static constexpr std::size_t sg08_v = 8u;
         if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size ||
-            sg08_v == reorder_sg_size)
-        {
+            sg08_v == reorder_sg_size) {
             static constexpr auto peer_algorithm =
                 peer_prefix_algo::subgroup_ballot;
 
@@ -1164,10 +1157,7 @@ struct subgroup_radix_sort
             return sycl::local_accessor<KeyT>(buf_size, cgh);
         }
 
-        std::size_t get_iter_stride() const
-        {
-            return std::size_t{0};
-        }
+        std::size_t get_iter_stride() const { return std::size_t{0}; }
     };
 
     template <typename KeyT>
@@ -1185,10 +1175,7 @@ struct subgroup_radix_sort
         {
             return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init);
         }
-        std::size_t get_iter_stride() const
-        {
-            return iter_stride;
-        }
+        std::size_t get_iter_stride() const { return iter_stride; }
     };
 
     static_assert(wg_size <= 1024);
@@ -1798,10 +1785,7 @@ struct IndexedProj
     {
     }
 
-    auto operator()(IndexT i) const
-    {
-        return value_projector(ptr[i]);
-    }
+    auto operator()(IndexT i) const { return value_projector(ptr[i]); }
 
 private:
     const ValueT *ptr;
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
index d9a103a02e99..1bbaa9e8345a 100644
--- a/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp
@@ -299,8 +299,7 @@ sycl::event topk_merge_impl(
         // if allocation would be sufficiently large or k is larger than
         // elements processed, use full sort
         if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size ||
-            alloc_len >= axis_nelems / 2)
-        {
+            alloc_len >= axis_nelems / 2) {
             return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems,
                                              k, arg_tp, vals_tp, inds_tp,
                                              index_comp, depends);
@@ -346,8 +345,7 @@ sycl::event topk_merge_impl(
 
                     // load input into SLM
                     for (std::size_t array_id = segment_start_idx + lid;
-                         array_id < segment_end_idx; array_id += lws)
-                    {
+                         array_id < segment_end_idx; array_id += lws) {
                         IndexTy v = (array_id < axis_nelems)
                                         ? iter_id * axis_nelems + array_id
                                         : IndexTy{};
@@ -374,8 +372,7 @@ sycl::event topk_merge_impl(
                     const std::size_t max_chunks_merged =
                         1 + ((wg_chunk_size - 1) / chunk);
                     for (; n_chunks_merged < max_chunks_merged;
-                         data_in_temp = !data_in_temp, n_chunks_merged *= 2)
-                    {
+                         data_in_temp = !data_in_temp, n_chunks_merged *= 2) {
                         const std::size_t nelems_sorted_so_far =
                             n_chunks_merged * chunk;
                         const std::size_t q = (lid / n_chunks_merged);
@@ -410,8 +407,7 @@ sycl::event topk_merge_impl(
                     const auto &out_src =
                         (data_in_temp) ? scratch_space : work_space;
                     for (std::size_t array_id = k_segment_start_idx + lid;
-                         array_id < k_segment_end_idx; array_id += lws)
-                    {
+                         array_id < k_segment_end_idx; array_id += lws) {
                         if (lid < k_rounded) {
                             index_data[iter_id * alloc_len + array_id] =
                                 out_src[array_id - k_segment_start_idx];
diff --git a/dpnp/tensor/libtensor/include/kernels/where.hpp b/dpnp/tensor/libtensor/include/kernels/where.hpp
index 454e1e61fa0d..5527cccec8d2 100644
--- a/dpnp/tensor/libtensor/include/kernels/where.hpp
+++ b/dpnp/tensor/libtensor/include/kernels/where.hpp
@@ -96,8 +96,7 @@ class WhereContigFunctor
 
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<condT>::value ||
-                      is_complex<T>::value)
-        {
+                      is_complex<T>::value) {
             const std::uint16_t sgSize =
                 ndit.get_sub_group().get_local_range()[0];
             const std::size_t gid = ndit.get_global_linear_id();
@@ -199,8 +198,7 @@ sycl::event where_contig_impl(sycl::queue &q,
         if (is_aligned<required_alignment>(cond_cp) &&
             is_aligned<required_alignment>(x1_cp) &&
             is_aligned<required_alignment>(x2_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
+            is_aligned<required_alignment>(dst_cp)) {
             static constexpr bool enable_sg_loadstore = true;
             using KernelName = where_contig_kernel<T, condT, vec_sz, n_vecs>;
 
diff --git a/dpnp/tensor/libtensor/include/utils/offset_utils.hpp b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
index 19664c3d4e12..3a6ac75dfc3a 100644
--- a/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
@@ -53,9 +53,9 @@ namespace detail
 {
 struct sink_t
 {
-    sink_t(){};
+    sink_t() {};
     template <class T>
-    sink_t(T &&){};
+    sink_t(T &&) {};
 };
 
 template <class V>
@@ -137,10 +137,7 @@ std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
 struct NoOpIndexer
 {
     constexpr NoOpIndexer() {}
-    constexpr std::size_t operator()(std::size_t gid) const
-    {
-        return gid;
-    }
+    constexpr std::size_t operator()(std::size_t gid) const { return gid; }
 };
 
 using dpctl::tensor::ssize_t;
@@ -156,10 +153,7 @@ struct StridedIndexer
     {
     }
 
-    ssize_t operator()(ssize_t gid) const
-    {
-        return compute_offset(gid);
-    }
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
 
     ssize_t operator()(std::size_t gid) const
     {
@@ -200,10 +194,7 @@ struct UnpackedStridedIndexer
     {
     }
 
-    ssize_t operator()(ssize_t gid) const
-    {
-        return compute_offset(gid);
-    }
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
 
     ssize_t operator()(std::size_t gid) const
     {
@@ -310,14 +301,8 @@ struct TwoOffsets
     {
     }
 
-    constexpr displacementT get_first_offset() const
-    {
-        return first_offset;
-    }
-    constexpr displacementT get_second_offset() const
-    {
-        return second_offset;
-    }
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
 
 private:
     displacementT first_offset = 0;
@@ -418,18 +403,9 @@ struct ThreeOffsets
     {
     }
 
-    constexpr displacementT get_first_offset() const
-    {
-        return first_offset;
-    }
-    constexpr displacementT get_second_offset() const
-    {
-        return second_offset;
-    }
-    constexpr displacementT get_third_offset() const
-    {
-        return third_offset;
-    }
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
 
 private:
     displacementT first_offset = 0;
@@ -552,22 +528,10 @@ struct FourOffsets
     {
     }
 
-    constexpr displacementT get_first_offset() const
-    {
-        return first_offset;
-    }
-    constexpr displacementT get_second_offset() const
-    {
-        return second_offset;
-    }
-    constexpr displacementT get_third_offset() const
-    {
-        return third_offset;
-    }
-    constexpr displacementT get_fourth_offset() const
-    {
-        return fourth_offset;
-    }
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
+    constexpr displacementT get_fourth_offset() const { return fourth_offset; }
 
 private:
     displacementT first_offset = 0;
diff --git a/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
index 87cdfbfbd54f..5d03294392d8 100644
--- a/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
+++ b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
@@ -112,9 +112,9 @@ struct ExtendedComplexFPGreater
 };
 
 template <typename T>
-inline constexpr bool is_fp_v = (std::is_same_v<T, sycl::half> ||
-                                 std::is_same_v<T, float> ||
-                                 std::is_same_v<T, double>);
+inline constexpr bool is_fp_v =
+    (std::is_same_v<T, sycl::half> || std::is_same_v<T, float> ||
+     std::is_same_v<T, double>);
 
 } // namespace detail
 
diff --git a/dpnp/tensor/libtensor/include/utils/strided_iters.hpp b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
index 0bed181802ae..65250b755b56 100644
--- a/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
+++ b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
@@ -312,14 +312,8 @@ class CIndexer_array
         elem_count = s;
     }
 
-    indT size() const
-    {
-        return elem_count;
-    }
-    indT rank() const
-    {
-        return ndim;
-    }
+    indT size() const { return elem_count; }
+    indT rank() const { return ndim; }
 
     void set(const indT i)
     {
@@ -339,10 +333,7 @@ class CIndexer_array
         multi_index[0] = i_;
     }
 
-    const index_t &get() const
-    {
-        return multi_index;
-    }
+    const index_t &get() const { return multi_index; }
 };
 
 /*
@@ -658,8 +649,7 @@ int simplify_iteration_three_strides(const int nd,
         auto str3_p = strides3[p];
         shape_w.push_back(sh_p);
         if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 &&
-            std::min({str1_p, str2_p, str3_p}) < 0)
-        {
+            std::min({str1_p, str2_p, str3_p}) < 0) {
             disp1 += str1_p * (sh_p - 1);
             str1_p = -str1_p;
             disp2 += str2_p * (sh_p - 1);
@@ -832,8 +822,7 @@ int simplify_iteration_four_strides(const int nd,
         auto str4_p = strides4[p];
         shape_w.push_back(sh_p);
         if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 &&
-            std::min({str1_p, str2_p, str3_p, str4_p}) < 0)
-        {
+            std::min({str1_p, str2_p, str3_p, str4_p}) < 0) {
             disp1 += str1_p * (sh_p - 1);
             str1_p = -str1_p;
             disp2 += str2_p * (sh_p - 1);
@@ -919,8 +908,7 @@ std::tuple<vecT, vecT, T, vecT, T, vecT, T, vecT, T>
 {
     const std::size_t dim = shape.size();
     if (dim != strides1.size() || dim != strides2.size() ||
-        dim != strides3.size() || dim != strides4.size())
-    {
+        dim != strides3.size() || dim != strides4.size()) {
         throw Error("Shape and strides must be of equal size.");
     }
     vecT out_shape = shape;
diff --git a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
index f45918e3c800..9ae41e5ade6e 100644
--- a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -501,10 +501,7 @@ struct GetIdentity<Op, T, std::enable_if_t<IsLogSumExp<T, Op>::value>>
 template <typename T>
 struct Hypot
 {
-    T operator()(const T &x, const T &y) const
-    {
-        return sycl::hypot(x, y);
-    }
+    T operator()(const T &x, const T &y) const { return sycl::hypot(x, y); }
 };
 
 template <typename T, class Op>
diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
index d08187aeaacc..bead0da5093e 100644
--- a/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
+++ b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp
@@ -106,8 +106,8 @@ struct usm_ndarray_types
                 throw_unrecognized_typenum_error(typenum);
             }
         }
-        else if (typenum == api.UAR_LONGLONG_ || typenum == api.UAR_ULONGLONG_)
-        {
+        else if (typenum == api.UAR_LONGLONG_ ||
+                 typenum == api.UAR_ULONGLONG_) {
             switch (sizeof(long long)) {
             case sizeof(std::int64_t):
                 return ((typenum == api.UAR_LONGLONG_)
diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
index 431e020fbdbe..7170624b5bbe 100644
--- a/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
+++ b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp
@@ -62,8 +62,7 @@ enum class typenum_t : int
 inline constexpr int num_types = 14; // number of elements in typenum_t
 
 template <typename funcPtrT,
-          template <typename fnT, typename D, typename S>
-          typename factory,
+          template <typename fnT, typename D, typename S> typename factory,
           int _num_types>
 class DispatchTableBuilder
 {
@@ -124,8 +123,7 @@ class DispatchTableBuilder
 };
 
 template <typename funcPtrT,
-          template <typename fnT, typename T>
-          typename factory,
+          template <typename fnT, typename T> typename factory,
           int _num_types>
 class DispatchVectorBuilder
 {
@@ -260,10 +258,7 @@ struct NullPtrVector
 
     NullPtrVector() : val(nullptr) {}
 
-    const_reference operator[](int) const
-    {
-        return val;
-    }
+    const_reference operator[](int) const { return val; }
 
 private:
     value_type val;
@@ -278,10 +273,7 @@ struct NullPtrTable
 
     NullPtrTable() : val() {}
 
-    const_reference operator[](int) const
-    {
-        return val;
-    }
+    const_reference operator[](int) const { return val; }
 
 private:
     value_type val;
diff --git a/dpnp/tensor/libtensor/include/utils/type_utils.hpp b/dpnp/tensor/libtensor/include/utils/type_utils.hpp
index e5855081c727..47b1a5554815 100644
--- a/dpnp/tensor/libtensor/include/utils/type_utils.hpp
+++ b/dpnp/tensor/libtensor/include/utils/type_utils.hpp
@@ -98,8 +98,7 @@ dstTy convert_impl(const srcTy &v)
     }
     else if constexpr (!std::is_integral_v<srcTy> &&
                        !std::is_same_v<dstTy, bool> &&
-                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
-    {
+                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>) {
         // first cast to signed variant, the cast to unsigned one
         using signedT = typename std::make_signed_t<dstTy>;
         return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
index 4dd00620a260..bce47c45f9b1 100644
--- a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -445,8 +445,7 @@ bool py_accumulate_dtype_supported(const py::dtype &input_dtype,
     }
 
     if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
+        out_typeid >= td_ns::num_types) {
         throw std::runtime_error("Reduction type support check: lookup failed");
     }
 
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
index e24cf56ddd62..d4961c9edbf1 100644
--- a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -140,8 +140,7 @@ struct CumLogSumExp1DContigFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -173,8 +172,7 @@ struct CumLogSumExp1DIncludeInitialContigFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -206,8 +204,7 @@ struct CumLogSumExpStridedFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -239,8 +236,7 @@ struct CumLogSumExpIncludeInitialStridedFactory
     fnT get()
     {
         if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = su_ns::LogSumExp<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
index 65f3c311eda1..319709b30a76 100644
--- a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -151,9 +151,8 @@ struct CumProd1DContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -184,9 +183,8 @@ struct CumProd1DIncludeInitialContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -217,9 +215,8 @@ struct CumProdStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -250,9 +247,8 @@ struct CumProdIncludeInitialStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForProdAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumProdScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
index 60b46946acc9..f700883af2a1 100644
--- a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -150,9 +150,8 @@ struct CumSum1DContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -183,9 +182,8 @@ struct CumSum1DIncludeInitialContigFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -216,9 +214,8 @@ struct CumSumStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = false;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
@@ -249,9 +246,8 @@ struct CumSumIncludeInitialStridedFactory
 {
     fnT get()
     {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportDataForSumAccumulation<
+                          srcTy, dstTy>::is_defined) {
             using ScanOpT = CumSumScanOpT<dstTy>;
             static constexpr bool include_initial = true;
             if constexpr (std::is_same_v<srcTy, dstTy>) {
diff --git a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
index e44abbd48303..146be45e4858 100644
--- a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -213,8 +213,7 @@ std::pair<sycl::event, sycl::event>
     // masked_dst_nelems is number of set elements in the mask, or last element
     // in cumsum
     if (!same_ortho_dims ||
-        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz)))
-    {
+        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz))) {
         throw py::value_error("Inconsistent array dimensions");
     }
 
@@ -539,8 +538,7 @@ std::pair<sycl::event, sycl::event>
     }
 
     if (!same_ortho_dims ||
-        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz)))
-    {
+        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz))) {
         throw py::value_error("Inconsistent array dimensions");
     }
 
@@ -791,8 +789,7 @@ std::pair<sycl::event, sycl::event>
 
     // cumsum must be int32_t or int64_t only
     if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) ||
-        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid))
-    {
+        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid)) {
         throw py::value_error("Cumulative sum array and index array must have "
                               "int32 or int64 data-type");
     }
diff --git a/dpnp/tensor/libtensor/source/clip.cpp b/dpnp/tensor/libtensor/source/clip.cpp
index 3e1c5e8cd262..4a0e5b9357de 100644
--- a/dpnp/tensor/libtensor/source/clip.cpp
+++ b/dpnp/tensor/libtensor/source/clip.cpp
@@ -142,8 +142,7 @@ std::pair<sycl::event, sycl::event>
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
         (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
-        (overlap(dst, max) && !same_logical_tensors(dst, max)))
-    {
+        (overlap(dst, max) && !same_logical_tensors(dst, max))) {
         throw py::value_error("Destination array overlaps with input.");
     }
 
@@ -159,8 +158,7 @@ std::pair<sycl::event, sycl::event>
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
 
     if (src_typeid != dst_typeid || src_typeid != min_typeid ||
-        src_typeid != max_typeid)
-    {
+        src_typeid != max_typeid) {
         throw py::value_error("Input, min, max, and destination arrays must "
                               "have the same data type");
     }
diff --git a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
index 43a6fbf4a0dd..7c2db989b0c2 100644
--- a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -204,8 +204,7 @@ std::pair<sycl::event, sycl::event> copy_usm_ndarray_into_usm_ndarray(
 
             sycl::event copy_and_cast_1d_event;
             if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) &&
-                (src_offset == 0) && (dst_offset == 0))
-            {
+                (src_offset == 0) && (dst_offset == 0)) {
                 auto contig_fn =
                     copy_and_cast_contig_dispatch_table[dst_type_id]
                                                        [src_type_id];
diff --git a/dpnp/tensor/libtensor/source/copy_as_contig.cpp b/dpnp/tensor/libtensor/source/copy_as_contig.cpp
index 5d78862651fc..c1c4b740dfba 100644
--- a/dpnp/tensor/libtensor/source/copy_as_contig.cpp
+++ b/dpnp/tensor/libtensor/source/copy_as_contig.cpp
@@ -535,8 +535,7 @@ std::pair<sycl::event, sycl::event>
     if (1 == nd) {
         const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
         if ((simplified_shape.front() != expected_dim) ||
-            (simplified_dst_strides.front() != dst_batch_step))
-        {
+            (simplified_dst_strides.front() != dst_batch_step)) {
             throw std::runtime_error(
                 "Unexpected result of simplifying iteration space, 2");
         }
@@ -727,8 +726,7 @@ std::pair<sycl::event, sycl::event>
     if (1 == nd) {
         const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
         if ((simplified_shape.front() != expected_dim) ||
-            (simplified_dst_strides.front() != dst_batch_step))
-        {
+            (simplified_dst_strides.front() != dst_batch_step)) {
             throw std::runtime_error(
                 "Unexpected result of simplifying iteration space, 2");
         }
diff --git a/dpnp/tensor/libtensor/source/device_support_queries.cpp b/dpnp/tensor/libtensor/source/device_support_queries.cpp
index 3cc0952c2080..6026520f3daa 100644
--- a/dpnp/tensor/libtensor/source/device_support_queries.cpp
+++ b/dpnp/tensor/libtensor/source/device_support_queries.cpp
@@ -110,15 +110,9 @@ std::string _default_device_complex_type(const sycl::device &d)
     }
 }
 
-std::string _default_device_bool_type(const sycl::device &)
-{
-    return "b1";
-}
+std::string _default_device_bool_type(const sycl::device &) { return "b1"; }
 
-std::string _default_device_index_type(const sycl::device &)
-{
-    return "i8";
-}
+std::string _default_device_index_type(const sycl::device &) { return "i8"; }
 
 sycl::device _extract_device(const py::object &arg)
 {
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
index b8450f8e7296..3a8dc6bfb56f 100644
--- a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
@@ -377,8 +377,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
     auto const &same_logical_tensors =
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
-        (overlap(src2, dst) && !same_logical_tensors(src2, dst)))
-    {
+        (overlap(src2, dst) && !same_logical_tensors(src2, dst))) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
     // check memory overlap
@@ -445,8 +444,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
 
         if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
             isEqual(simplified_src2_strides, unit_stride) &&
-            isEqual(simplified_dst_strides, unit_stride))
-        {
+            isEqual(simplified_dst_strides, unit_stride)) {
             auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
 
             if (contig_fn != nullptr) {
@@ -468,8 +466,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_src2_strides, zero_one_strides) &&
                 isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
-                isEqual(simplified_dst_strides, {simplified_shape[1], one}))
-            {
+                isEqual(simplified_dst_strides, {simplified_shape[1], one})) {
                 auto matrix_row_broadcast_fn =
                     contig_matrix_row_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
@@ -483,8 +480,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                         is_aligned<required_alignment>(
                             src2_data + src2_offset * src2_itemsize) &&
                         is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
+                            dst_data + dst_offset * dst_itemsize)) {
                         std::size_t n0 = simplified_shape[0];
                         std::size_t n1 = simplified_shape[1];
                         sycl::event comp_ev = matrix_row_broadcast_fn(
@@ -501,8 +497,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
             }
             if (isEqual(simplified_src1_strides, one_zero_strides) &&
                 isEqual(simplified_src2_strides, {one, simplified_shape[0]}) &&
-                isEqual(simplified_dst_strides, {one, simplified_shape[0]}))
-            {
+                isEqual(simplified_dst_strides, {one, simplified_shape[0]})) {
                 auto row_matrix_broadcast_fn =
                     contig_row_matrix_broadcast_dispatch_table[src1_typeid]
                                                               [src2_typeid];
@@ -517,8 +512,7 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
                         is_aligned<required_alignment>(
                             src2_data + src2_offset * src2_itemsize) &&
                         is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
+                            dst_data + dst_offset * dst_itemsize)) {
                         std::size_t n0 = simplified_shape[1];
                         std::size_t n1 = simplified_shape[0];
                         sycl::event comp_ev = row_matrix_broadcast_fn(
@@ -590,8 +584,7 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
     }
 
     if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
+        src2_typeid >= td_ns::num_types) {
         throw std::runtime_error("binary output type lookup failed");
     }
     int dst_typeid = output_types_table[src1_typeid][src2_typeid];
@@ -739,8 +732,7 @@ std::pair<sycl::event, sycl::event>
             std::initializer_list<py::ssize_t>{1};
 
         if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) &&
-            isEqual(simplified_lhs_strides, unit_stride))
-        {
+            isEqual(simplified_lhs_strides, unit_stride)) {
             auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
 
             if (contig_fn != nullptr) {
@@ -759,8 +751,7 @@ std::pair<sycl::event, sycl::event>
             static constexpr py::ssize_t one{1};
             // special case of C-contiguous matrix and a row
             if (isEqual(simplified_rhs_strides, one_zero_strides) &&
-                isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
-            {
+                isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) {
                 auto row_matrix_broadcast_fn =
                     contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
                                                               [lhs_typeid];
diff --git a/dpnp/tensor/libtensor/source/full_ctor.cpp b/dpnp/tensor/libtensor/source/full_ctor.cpp
index dfe1d25b769c..8d7fcd22b914 100644
--- a/dpnp/tensor/libtensor/source/full_ctor.cpp
+++ b/dpnp/tensor/libtensor/source/full_ctor.cpp
@@ -127,10 +127,7 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
                 constexpr UInt128() : v1{}, v2{} {}
                 UInt128(const UInt128 &) = default;
 
-                operator bool() const
-                {
-                    return bool(!v1) && bool(!v2);
-                }
+                operator bool() const { return bool(!v1) && bool(!v2); }
 
                 std::uint64_t v1;
                 std::uint64_t v2;
diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
index 05ee37594e12..9621ebc3277f 100644
--- a/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
+++ b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp
@@ -217,8 +217,7 @@ std::pair<sycl::event, sycl::event>
     int x1_nd = x1.get_ndim();
     int x2_nd = x2.get_ndim();
     if (x1_nd != (batch_dims + x1_outer_dims + inner_dims) ||
-        x2_nd != (batch_dims + x2_outer_dims + inner_dims))
-    {
+        x2_nd != (batch_dims + x2_outer_dims + inner_dims)) {
         throw py::value_error("Input arrays do not have dimensions consistent "
                               "with input dimensions");
     }
@@ -332,8 +331,7 @@ std::pair<sycl::event, sycl::event>
     sycl::event dot_ev;
     if (call_vecdot) {
         if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig) ||
-            ((is_x1_f_contig && is_x2_f_contig) && !call_batched))
-        {
+            ((is_x1_f_contig && is_x2_f_contig) && !call_batched)) {
             dot_product_contig_impl_fn_ptr_t fn = nullptr;
             if (supports_atomics) {
                 fn = dot_product_contig_dispatch_table[x1_typeid][x2_typeid];
@@ -660,8 +658,7 @@ std::pair<sycl::event, sycl::event>
                 x1_batch_offset, x2_batch_offset, dst_batch_offset);
 
             if (batch_dims == 1 && x1_outer_dims == 1 && x2_outer_dims == 1 &&
-                inner_dims == 1)
-            {
+                inner_dims == 1) {
                 bool gemm_batch_c_contig = false;
 
                 if ((static_cast<std::size_t>(outer_inner_x1_strides[0]) ==
@@ -672,8 +669,7 @@ std::pair<sycl::event, sycl::event>
                      outer_inner_x2_strides[1] == 1) &&
                     (static_cast<std::size_t>(outer_inner_dst_strides[0]) ==
                          x2_outer_nelems &&
-                     outer_inner_dst_strides[1] == 1))
-                {
+                     outer_inner_dst_strides[1] == 1)) {
                     gemm_batch_c_contig =
                         (static_cast<std::size_t>(
                              simplified_batch_x1_strides[0]) ==
@@ -801,8 +797,7 @@ py::object py_dot_result_type(const py::dtype &input1_dtype,
     }
 
     if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
+        src2_typeid >= td_ns::num_types) {
         throw std::runtime_error("binary output type lookup failed");
     }
     int dst_typeid = output_types_table[src1_typeid][src2_typeid];
diff --git a/dpnp/tensor/libtensor/source/reductions/argmax.cpp b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
index 10fc49759168..af602371dfc5 100644
--- a/dpnp/tensor/libtensor/source/reductions/argmax.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
@@ -131,9 +131,8 @@ struct ArgmaxOverAxisTempsStridedFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -165,9 +164,8 @@ struct ArgmaxOverAxis1TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -199,9 +197,8 @@ struct ArgmaxOverAxis0TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
diff --git a/dpnp/tensor/libtensor/source/reductions/argmin.cpp b/dpnp/tensor/libtensor/source/reductions/argmin.cpp
index ec4637b62d49..4869b75eacf9 100644
--- a/dpnp/tensor/libtensor/source/reductions/argmin.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/argmin.cpp
@@ -131,9 +131,8 @@ struct ArgminOverAxisTempsStridedFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -165,9 +164,8 @@ struct ArgminOverAxis1TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
@@ -199,9 +197,8 @@ struct ArgminOverAxis0TempsContigFactory
 {
     fnT get() const
     {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_integral_v<srcTy> &&
                           !std::is_same_v<srcTy, bool>) {
                 // op for values
diff --git a/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
index 75e4010bfd5b..351eab82ee6b 100644
--- a/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp
@@ -138,8 +138,7 @@ struct LogSumExpOverAxisTempsStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::LogSumExp<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_temps_strided_impl<srcTy, dstTy,
@@ -157,8 +156,7 @@ struct LogSumExpOverAxis1TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::LogSumExp<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
@@ -176,8 +174,7 @@ struct LogSumExpOverAxis0TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::LogSumExp<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
diff --git a/dpnp/tensor/libtensor/source/reductions/max.cpp b/dpnp/tensor/libtensor/source/reductions/max.cpp
index d19ed226d3b4..628f7cfe8606 100644
--- a/dpnp/tensor/libtensor/source/reductions/max.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/max.cpp
@@ -163,8 +163,7 @@ struct MaxOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Maximum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -217,8 +216,7 @@ struct MaxOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Maximum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -244,8 +242,7 @@ struct MaxOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Maximum<dstTy>;
                 return dpctl::tensor::kernels::
diff --git a/dpnp/tensor/libtensor/source/reductions/min.cpp b/dpnp/tensor/libtensor/source/reductions/min.cpp
index 97d3432b13ed..68bfdb583b0b 100644
--- a/dpnp/tensor/libtensor/source/reductions/min.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/min.cpp
@@ -163,8 +163,7 @@ struct MinOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Minimum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -217,8 +216,7 @@ struct MinOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Minimum<dstTy>;
                 return dpctl::tensor::kernels::
@@ -244,8 +242,7 @@ struct MinOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             if constexpr (std::is_floating_point<dstTy>::value) {
                 using ReductionOpT = su_ns::Minimum<dstTy>;
                 return dpctl::tensor::kernels::
diff --git a/dpnp/tensor/libtensor/source/reductions/prod.cpp b/dpnp/tensor/libtensor/source/reductions/prod.cpp
index 6cbb21dfe02c..9ecd403159b0 100644
--- a/dpnp/tensor/libtensor/source/reductions/prod.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/prod.cpp
@@ -246,8 +246,7 @@ struct ProductOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::multiplies<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
@@ -265,8 +264,7 @@ struct ProductOverAxisTempsStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
                                                     sycl::logical_and<dstTy>,
                                                     sycl::multiplies<dstTy>>;
@@ -286,8 +284,7 @@ struct ProductOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::multiplies<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_with_atomics_contig_impl<
@@ -305,8 +302,7 @@ struct ProductOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::multiplies<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_with_atomics_contig_impl<
@@ -324,8 +320,7 @@ struct ProductOverAxis1TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
                                                     sycl::logical_and<dstTy>,
                                                     sycl::multiplies<dstTy>>;
@@ -345,8 +340,7 @@ struct ProductOverAxis0TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
                                                     sycl::logical_and<dstTy>,
                                                     sycl::multiplies<dstTy>>;
diff --git a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
index 5279b4f6c276..b8a042e9a55b 100644
--- a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp
@@ -138,8 +138,7 @@ struct HypotOverAxisTempsStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::Hypot<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_temps_strided_impl<srcTy, dstTy,
@@ -157,8 +156,7 @@ struct HypotOverAxis1TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::Hypot<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
@@ -176,8 +174,7 @@ struct HypotOverAxis0TempsContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = su_ns::Hypot<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
index 5f9cc32f1203..af6c3f0d513a 100644
--- a/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -97,8 +97,7 @@ struct ArithmeticAtomicSupportFactory
     {
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (std::is_floating_point_v<T> ||
-                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
-        {
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value) {
             // for real- and complex- floating point types, tree reduction has
             // better round-off accumulation properties (round-off error is
             // proportional to the log2(reduction_size), while naive elementwise
@@ -117,10 +116,7 @@ struct ArithmeticAtomicSupportFactory
 template <typename fnT, typename T>
 struct MinMaxAtomicSupportFactory
 {
-    fnT get()
-    {
-        return check_atomic_support<T>;
-    }
+    fnT get() { return check_atomic_support<T>; }
 };
 
 template <typename fnT, typename T>
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
index 936c8dbe9b56..8224163ccb19 100644
--- a/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -96,8 +96,7 @@ bool py_reduction_dtype_supported(
     }
 
     if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
+        out_typeid >= td_ns::num_types) {
         throw std::runtime_error("Reduction type support check: lookup failed");
     }
 
@@ -158,8 +157,7 @@ bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype,
     }
 
     if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
+        out_typeid >= td_ns::num_types) {
         throw std::runtime_error("Reduction type support check: lookup failed");
     }
 
@@ -259,8 +257,7 @@ std::pair<sycl::event, sycl::event> py_reduction_over_axis(
     bool is_src_f_contig = src.is_f_contiguous();
 
     if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 1))
-    {
+        (is_src_f_contig && dst_nelems == 1)) {
         // remove_all_extents gets underlying type of table
         using contig_fn_ptr_T =
             typename std::remove_all_extents<contig_fnT>::type;
@@ -292,8 +289,7 @@ std::pair<sycl::event, sycl::event> py_reduction_over_axis(
         }
     }
     else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
         // remove_all_extents gets underlying type of table
         using contig_fn_ptr_T =
             typename std::remove_all_extents<contig_fnT>::type;
@@ -391,8 +387,7 @@ std::pair<sycl::event, sycl::event> py_reduction_over_axis(
                      simplified_iteration_src_strides[0]) == reduction_nelems);
         }
         else if (static_cast<std::size_t>(
-                     simplified_reduction_src_strides[0]) == iter_nelems)
-        {
+                     simplified_reduction_src_strides[0]) == iter_nelems) {
             mat_reduce_over_axis0 =
                 (simplified_iteration_dst_strides[0] == 1) &&
                 (simplified_iteration_src_strides[0] == 1);
@@ -586,8 +581,7 @@ std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
     bool is_src_f_contig = src.is_f_contiguous();
 
     if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 1))
-    {
+        (is_src_f_contig && dst_nelems == 1)) {
         auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;
@@ -610,8 +604,7 @@ std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
         }
     }
     else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
         auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
         if (fn != nullptr) {
             std::size_t iter_nelems = dst_nelems;
@@ -699,8 +692,7 @@ std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
                      simplified_iteration_src_strides[0]) == reduction_nelems);
         }
         else if (static_cast<std::size_t>(
-                     simplified_reduction_src_strides[0]) == iter_nelems)
-        {
+                     simplified_reduction_src_strides[0]) == iter_nelems) {
             mat_reduce_over_axis0 =
                 (simplified_iteration_dst_strides[0] == 1) &&
                 (simplified_iteration_src_strides[0] == 1);
@@ -969,8 +961,7 @@ std::pair<sycl::event, sycl::event> py_search_over_axis(
                      simplified_iteration_src_strides[0]) == reduction_nelems);
         }
         else if (static_cast<std::size_t>(compact_reduction_src_strides[0]) ==
-                 iter_nelems)
-        {
+                 iter_nelems) {
             mat_reduce_over_axis0 =
                 (simplified_iteration_dst_strides[0] == 1) &&
                 (simplified_iteration_src_strides[0] == 1);
@@ -1153,8 +1144,7 @@ std::pair<sycl::event, sycl::event>
 
     // TODO: should be dst_nelems == 0?
     if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 0))
-    {
+        (is_src_f_contig && dst_nelems == 0)) {
         auto fn = axis1_contig_dispatch_vector[src_typeid];
         static constexpr py::ssize_t zero_offset = 0;
 
@@ -1168,8 +1158,7 @@ std::pair<sycl::event, sycl::event>
         return std::make_pair(keep_args_event, red_ev);
     }
     else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
         auto fn = axis0_contig_dispatch_vector[src_typeid];
         static constexpr py::ssize_t zero_offset = 0;
 
diff --git a/dpnp/tensor/libtensor/source/reductions/sum.cpp b/dpnp/tensor/libtensor/source/reductions/sum.cpp
index d7142477750a..9a0d212ed8da 100644
--- a/dpnp/tensor/libtensor/source/reductions/sum.cpp
+++ b/dpnp/tensor/libtensor/source/reductions/sum.cpp
@@ -246,8 +246,7 @@ struct SumOverAxisAtomicStridedFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::plus<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
@@ -285,8 +284,7 @@ struct SumOverAxis1AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::plus<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis1_over_group_with_atomics_contig_impl<
@@ -304,8 +302,7 @@ struct SumOverAxis0AtomicContigFactory
     fnT get() const
     {
         if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
+                          srcTy, dstTy>::is_defined) {
             using ReductionOpT = sycl::plus<dstTy>;
             return dpctl::tensor::kernels::
                 reduction_axis0_over_group_with_atomics_contig_impl<
diff --git a/dpnp/tensor/libtensor/source/repeat.cpp b/dpnp/tensor/libtensor/source/repeat.cpp
index 919f51f9a4d1..b809160e257b 100644
--- a/dpnp/tensor/libtensor/source/repeat.cpp
+++ b/dpnp/tensor/libtensor/source/repeat.cpp
@@ -136,8 +136,8 @@ std::pair<sycl::event, sycl::event>
         throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
     }
 
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {src, reps, cumsum, dst})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -170,8 +170,7 @@ std::pair<sycl::event, sycl::event>
 
     // shape at repeated axis must be equal to the sum of reps
     if (!same_orthog_dims || src_axis_nelems != reps_sz ||
-        src_axis_nelems != cumsum_sz)
-    {
+        src_axis_nelems != cumsum_sz) {
         throw py::value_error("Inconsistent array dimensions");
     }
 
@@ -386,8 +385,8 @@ std::pair<sycl::event, sycl::event>
         throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
     }
 
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {src, reps, cumsum, dst})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
diff --git a/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
index 5e42938a22f2..573aaeb0a60b 100644
--- a/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
+++ b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -350,8 +350,7 @@ void simplify_iteration_space_4(
         simplified_dst_strides.reserve(nd);
 
         if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
-            (src3_strides[0] < 0) && (dst_strides[0] < 0))
-        {
+            (src3_strides[0] < 0) && (dst_strides[0] < 0)) {
             simplified_src1_strides.push_back(-src1_strides[0]);
             simplified_src2_strides.push_back(-src2_strides[0]);
             simplified_src3_strides.push_back(-src3_strides[0]);
diff --git a/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
index 2b6dcc8bf447..11df5cd2ef47 100644
--- a/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
+++ b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
@@ -72,8 +72,7 @@ struct AscendingArgSortContigFactory
     fnT get()
     {
         if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
-                      std::is_same_v<IndexTy, std::int32_t>)
-        {
+                      std::is_same_v<IndexTy, std::int32_t>) {
             using dpctl::tensor::rich_comparisons::AscendingSorter;
             using Comp = typename AscendingSorter<argTy>::type;
 
@@ -92,8 +91,7 @@ struct DescendingArgSortContigFactory
     fnT get()
     {
         if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
-                      std::is_same_v<IndexTy, std::int32_t>)
-        {
+                      std::is_same_v<IndexTy, std::int32_t>) {
             using dpctl::tensor::rich_comparisons::DescendingSorter;
             using Comp = typename DescendingSorter<argTy>::type;
 
diff --git a/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
index 6328b3339376..018f3166a0ad 100644
--- a/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
+++ b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp
@@ -128,8 +128,7 @@ std::pair<sycl::event, sycl::event>
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
 
     if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
-        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
-    {
+        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32))) {
         throw py::value_error(
             "Output index array must have data type int32 or int64");
     }
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
index e54b8f739a4b..0eec8fba9ded 100644
--- a/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
+++ b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
@@ -104,8 +104,7 @@ struct AscendingRadixArgSortContigFactory
     {
         if constexpr (RadixSortSupportVector<argTy>::is_defined &&
                       (std::is_same_v<IndexTy, std::int64_t> ||
-                       std::is_same_v<IndexTy, std::int32_t>))
-        {
+                       std::is_same_v<IndexTy, std::int32_t>)) {
             return argsort_axis1_contig_caller<
                 /*ascending*/ true, argTy, IndexTy>;
         }
@@ -122,8 +121,7 @@ struct DescendingRadixArgSortContigFactory
     {
         if constexpr (RadixSortSupportVector<argTy>::is_defined &&
                       (std::is_same_v<IndexTy, std::int64_t> ||
-                       std::is_same_v<IndexTy, std::int32_t>))
-        {
+                       std::is_same_v<IndexTy, std::int32_t>)) {
             return argsort_axis1_contig_caller<
                 /*ascending*/ false, argTy, IndexTy>;
         }
diff --git a/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
index 8b1ce04a97d6..6c50b0cbc08c 100644
--- a/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
+++ b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -82,8 +82,7 @@ struct LeftSideSearchSortedContigFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_contig_impl;
             using dpctl::tensor::rich_comparisons::AscendingSorter;
@@ -107,8 +106,7 @@ struct RightSideSearchSortedContigFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool right_side_search(false);
 
             using dpctl::tensor::kernels::searchsorted_contig_impl;
@@ -141,8 +139,7 @@ struct LeftSideSearchSortedStridedFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool left_side_search(true);
             using dpctl::tensor::kernels::searchsorted_strided_impl;
             using dpctl::tensor::rich_comparisons::AscendingSorter;
@@ -166,8 +163,7 @@ struct RightSideSearchSortedStridedFactory
     fnT get() const
     {
         if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
+                      std::is_same_v<indTy, std::int64_t>) {
             static constexpr bool right_side_search(false);
             using dpctl::tensor::kernels::searchsorted_strided_impl;
             using dpctl::tensor::rich_comparisons::AscendingSorter;
@@ -263,8 +259,8 @@ std::pair<sycl::event, sycl::event>
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions);
 
     // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, positions}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {hay, needles, positions})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -295,8 +291,7 @@ std::pair<sycl::event, sycl::event>
     const auto positions_typenum_t_v =
         static_cast<td_ns::typenum_t>(positions_typeid);
     if (positions_typenum_t_v != td_ns::typenum_t::INT32 &&
-        positions_typenum_t_v != td_ns::typenum_t::INT64)
-    {
+        positions_typenum_t_v != td_ns::typenum_t::INT64) {
         throw py::value_error(
             "Positions array must have data-type int32, or int64");
     }
diff --git a/dpnp/tensor/libtensor/source/where.cpp b/dpnp/tensor/libtensor/source/where.cpp
index 46c52cf83b34..1d535a712917 100644
--- a/dpnp/tensor/libtensor/source/where.cpp
+++ b/dpnp/tensor/libtensor/source/where.cpp
@@ -79,8 +79,8 @@ std::pair<sycl::event, sycl::event>
              const std::vector<sycl::event> &depends)
 {
 
-    if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, condition, dst}))
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {x1, x2, condition, dst})) {
         throw py::value_error(
             "Execution queue is not compatible with allocation queues");
     }
@@ -129,8 +129,7 @@ std::pair<sycl::event, sycl::event>
         dpctl::tensor::overlap::SameLogicalTensors();
     if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) ||
         (overlap(dst, x1) && !same_logical_tensors(dst, x1)) ||
-        (overlap(dst, x2) && !same_logical_tensors(dst, x2)))
-    {
+        (overlap(dst, x2) && !same_logical_tensors(dst, x2))) {
         throw py::value_error("Destination array overlaps with input.");
     }