diff --git a/.clang-format b/.clang-format index 622a5bf67634..4304e0ed40d4 100644 --- a/.clang-format +++ b/.clang-format @@ -16,7 +16,7 @@ BinPackParameters: false BraceWrapping: AfterCaseLabel: true AfterClass: true - AfterControlStatement: MultiLine + AfterControlStatement: Never AfterEnum: true AfterFunction: true AfterNamespace: true diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 841f009ace89..e4fe0bcc4b2f 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -17,3 +17,6 @@ c106d91b866f4acd30226b68519b12a73a881490 # Add pygrep-hooks to pre-commit config e62718415aa3660da5f607e352c991a063a54219 + +# Bump clang-format from 12.0.1 to 22.1.0 version +c2d65bd451a7d8e5b6319147da95e9dabf7a382b diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml index 0745ca1ca9dc..87a7311b95e4 100644 --- a/.github/workflows/build-sphinx.yml +++ b/.github/workflows/build-sphinx.yml @@ -47,7 +47,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0 + uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1 with: access_token: ${{ github.token }} @@ -224,7 +224,7 @@ jobs: if: env.GH_EVENT_OPEN_PR_UPSTREAM == 'true' env: PR_NUM: ${{ github.event.number }} - uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2 + uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0 with: message-id: url_to_docs message: | @@ -268,7 +268,7 @@ jobs: git push tokened_docs gh-pages - name: Modify the comment with URL to official documentation - uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2 + uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0 with: message-id: url_to_docs find: | diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml index 409117c692b9..acbfcac96890 --- 
a/.github/workflows/check-onemath.yaml +++ b/.github/workflows/check-onemath.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0 + uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1 with: access_token: ${{ github.token }} @@ -87,7 +87,7 @@ jobs: fetch-depth: 0 - name: Download artifact - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.environment-file-name }} path: ${{ env.environment-file-loc }} @@ -181,7 +181,7 @@ jobs: fetch-depth: 0 - name: Download artifact - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.environment-file-name }} path: ${{ env.environment-file-loc }} diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index a12486300aa0..c894c530a20e 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -49,7 +49,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0 + uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1 with: access_token: ${{ github.token }} @@ -151,7 +151,7 @@ jobs: path: ${{ env.dpnp-repo-path }} - name: Download artifact - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }} path: ${{ env.pkg-path-in-channel }} @@ -280,7 +280,7 @@ jobs: path: ${{ env.dpnp-repo-path }} - name: Download artifact - uses: 
actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }} path: ${{ env.pkg-path-in-channel }} @@ -439,12 +439,12 @@ jobs: fetch-depth: ${{ env.fetch-depth }} - name: Download artifact - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.package-name }} ${{ runner.os }} Python ${{ matrix.python }} - name: Download wheels artifact - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.package-name }} ${{ runner.os }} Wheels Python ${{ matrix.python }} @@ -528,7 +528,7 @@ jobs: path: ${{ env.dpnp-repo-path }} - name: Download artifact - uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.package-name }} ${{ runner.os }} Python ${{ env.python-ver }} path: ${{ env.pkg-path-in-channel }} @@ -654,7 +654,7 @@ jobs: - name: Post result to PR if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }} - uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2 + uses: mshick/add-pr-comment@ffd016c7e151d97d69d21a843022fd4cd5b96fe5 # v3.9.0 with: message-id: array_api_results message: | diff --git a/.github/workflows/cron-run-tests.yaml b/.github/workflows/cron-run-tests.yaml index f8e8394c6713..ea4fd4f14fc3 100644 --- a/.github/workflows/cron-run-tests.yaml +++ b/.github/workflows/cron-run-tests.yaml @@ -43,7 +43,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0 + 
uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1 with: access_token: ${{ github.token }} diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml index 2cbe97ab0242..bfc3c7357a3e 100644 --- a/.github/workflows/generate_coverage.yaml +++ b/.github/workflows/generate_coverage.yaml @@ -33,7 +33,7 @@ jobs: steps: - name: Cancel Previous Runs - uses: styfle/cancel-workflow-action@3155a141048f8f89c06b4cdae32e7853e97536bc # 0.13.0 + uses: styfle/cancel-workflow-action@d07a454dad7609a92316b57b23c9ccfd4f59af66 # 0.13.1 with: access_token: ${{ github.token }} diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml index 8b4cc3b93f64..5d7e0677281e 100644 --- a/.github/workflows/openssf-scorecard.yml +++ b/.github/workflows/openssf-scorecard.yml @@ -72,6 +72,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4 + uses: github/codeql-action/upload-sarif@0d579ffd059c29b07949a3cce3983f0780820c98 # v4.32.6 with: sarif_file: results.sarif diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index d5f9f22146fc..d8f59405ce89 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -15,13 +15,6 @@ jobs: timeout-minutes: 10 steps: - - name: Set up clang-format - run: | - sudo apt-get install -y clang-format-12 - sudo unlink /usr/bin/clang-format - sudo ln -s /usr/bin/clang-format-12 /usr/bin/clang-format - clang-format --version - - name: Set up pip packages uses: BSFishy/pip-action@8f2d471d809dc20b6ada98c91910b6ae6243f318 # v1 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66245039ce3c..57ec9e2a2a8e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,19 +57,19 @@ repos: hooks: - id: pyupgrade - repo: 
https://github.com/codespell-project/codespell - rev: v2.4.1 + rev: v2.4.2 hooks: - id: codespell args: ["-L", "abd"] # ignore "abd" used in einsum tests additional_dependencies: - tomli - repo: https://github.com/psf/black - rev: 26.1.0 + rev: 26.3.1 hooks: - id: black exclude: "dpnp/_version.py" - repo: https://github.com/pycqa/isort - rev: 8.0.0 + rev: 8.0.1 hooks: - id: isort name: isort (python) @@ -88,13 +88,13 @@ repos: additional_dependencies: - flake8-docstrings==1.7.0 - flake8-bugbear==24.12.12 -- repo: https://github.com/pocc/pre-commit-hooks - rev: v1.3.5 +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v22.1.0 hooks: - id: clang-format args: ["-i"] - repo: https://github.com/gitleaks/gitleaks - rev: v8.30.0 + rev: v8.30.1 hooks: - id: gitleaks - repo: https://github.com/jumanjihouse/pre-commit-hooks @@ -114,7 +114,8 @@ repos: "-sn", # Don't display the score "--disable=import-error", "--disable=redefined-builtin", - "--disable=unused-wildcard-import" + "--disable=unused-wildcard-import", + "--disable=c-extension-no-member" ] files: '^dpnp/(dpnp_iface.*|fft|linalg|scipy|dpnp_array)' - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks @@ -127,7 +128,7 @@ repos: hooks: - id: actionlint - repo: https://github.com/BlankSpruce/gersemi - rev: 0.26.0 + rev: 0.26.1 hooks: - id: gersemi exclude: "dpnp/backend/cmake/Modules/" diff --git a/CHANGELOG.md b/CHANGELOG.md index 61cde1ddfefc..a742a2f4b532 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Added implementation of `dpnp.divmod` [#2674](https://github.com/IntelPython/dpnp/pull/2674) * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595) * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787) +* Added support for ndarray subclassing via `dpnp.ndarray.view` 
method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815) ### Changed @@ -53,6 +54,8 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Changed `dpnp.partition` implementation to reuse `dpnp.sort` where it brings the performance benefit [#2766](https://github.com/IntelPython/dpnp/pull/2766) * `dpnp` uses pybind11 3.0.2 [#27734](https://github.com/IntelPython/dpnp/pull/2773) * Modified CMake files for the extension to explicitly mark DPC++ compiler and dpctl headers as system ones and so to suppress the build warning generated inside them [#2770](https://github.com/IntelPython/dpnp/pull/2770) +* Updated QR tests to avoid element-wise comparisons for `raw` and `r` modes [#2785](https://github.com/IntelPython/dpnp/pull/2785) +* Moved all SYCL kernel functors from `backend/extensions/` to a unified `backend/kernels/` directory hierarchy [#2816](https://github.com/IntelPython/dpnp/pull/2816) ### Deprecated @@ -76,6 +79,8 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Resolved an issue causing `dpnp.linspace` to return an incorrect output shape when inputs were passed as arrays [#2712](https://github.com/IntelPython/dpnp/pull/2712) * Resolved an issue where `dpnp` always returns the base allocation pointer, when the view start is expected [#2651](https://github.com/IntelPython/dpnp/pull/2651) * Fixed an issue causing an exception in `dpnp.geomspace` and `dpnp.logspace` when called with explicit `device` keyword but any input array is allocated on another device [#2723](https://github.com/IntelPython/dpnp/pull/2723) +* Fixed `.data.ptr` property on array views to correctly return the pointer to the view's data location instead of the base allocation pointer [#2812](https://github.com/IntelPython/dpnp/pull/2812) +* Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814) 
### Security diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index c5e5663e21fb..3d0e7f88d55f 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -15,7 +15,9 @@ // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "tip" (for mercurial). - "branches": ["HEAD"], + "branches": [ + "HEAD" + ], // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL @@ -35,7 +37,9 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.7"], + "pythons": [ + "3.7" + ], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty @@ -53,7 +57,6 @@ // environments in. If not provided, defaults to "env" "env_dir": "env", - // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": "results", @@ -79,7 +82,8 @@ // skipped for the matching benchmark. 
// // "regressions_first_commits": { - // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "some_benchmark": "352cdf", // Consider regressions only after this + // commit // "another_benchmark": null, // Skip regression detection altogether // } } diff --git a/dpnp/backend/extensions/blas/dot_common.hpp b/dpnp/backend/extensions/blas/dot_common.hpp index 1672e7217cba..383804ff1718 100644 --- a/dpnp/backend/extensions/blas/dot_common.hpp +++ b/dpnp/backend/extensions/blas/dot_common.hpp @@ -97,8 +97,7 @@ std::pair if (!dpctl::utils::queues_are_compatible( exec_q, - {vectorX.get_queue(), vectorY.get_queue(), result.get_queue()})) - { + {vectorX.get_queue(), vectorY.get_queue(), result.get_queue()})) { throw py::value_error( "USM allocations are not compatible with the execution queue."); } @@ -120,8 +119,8 @@ std::pair const int vectorY_typenum = vectorY.get_typenum(); const int result_typenum = result.get_typenum(); - if (result_typenum != vectorX_typenum || result_typenum != vectorY_typenum) - { + if (result_typenum != vectorX_typenum || + result_typenum != vectorY_typenum) { throw py::value_error("Given arrays must be of the same type."); } diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp index 48c1ae98ead4..86f751baf2e0 100644 --- a/dpnp/backend/extensions/blas/gemm.cpp +++ b/dpnp/backend/extensions/blas/gemm.cpp @@ -181,8 +181,7 @@ std::tuple if (!dpctl::utils::queues_are_compatible( exec_q, - {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()})) - { + {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()})) { throw py::value_error( "USM allocations are not compatible with the execution queue."); } diff --git a/dpnp/backend/extensions/blas/gemm_batch.cpp b/dpnp/backend/extensions/blas/gemm_batch.cpp index a6cd7ac4e130..d02b035922c0 100644 --- a/dpnp/backend/extensions/blas/gemm_batch.cpp +++ b/dpnp/backend/extensions/blas/gemm_batch.cpp @@ -237,8 +237,7 @@ std::tuple if 
(!dpctl::utils::queues_are_compatible( exec_q, - {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()})) - { + {matrixA.get_queue(), matrixB.get_queue(), resultC.get_queue()})) { throw py::value_error( "USM allocations are not compatible with the execution queue."); } diff --git a/dpnp/backend/extensions/blas/gemv.cpp b/dpnp/backend/extensions/blas/gemv.cpp index a9c5414ef8c7..0b6ae78bc76e 100644 --- a/dpnp/backend/extensions/blas/gemv.cpp +++ b/dpnp/backend/extensions/blas/gemv.cpp @@ -169,8 +169,7 @@ std::pair if (!dpctl::utils::queues_are_compatible( exec_q, - {matrixA.get_queue(), vectorX.get_queue(), vectorY.get_queue()})) - { + {matrixA.get_queue(), vectorX.get_queue(), vectorY.get_queue()})) { throw py::value_error( "USM allocations are not compatible with the execution queue."); } diff --git a/dpnp/backend/extensions/blas/syrk.cpp b/dpnp/backend/extensions/blas/syrk.cpp index 8b0ebce3d888..9668e72b57f6 100644 --- a/dpnp/backend/extensions/blas/syrk.cpp +++ b/dpnp/backend/extensions/blas/syrk.cpp @@ -248,8 +248,7 @@ std::pair } if (!dpctl::utils::queues_are_compatible( - exec_q, {matrixA.get_queue(), resultC.get_queue()})) - { + exec_q, {matrixA.get_queue(), resultC.get_queue()})) { throw py::value_error( "USM allocations are not compatible with the execution queue."); } diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp index d626b56ea00c..f0ce1722bfb1 100644 --- a/dpnp/backend/extensions/common/ext/common.hpp +++ b/dpnp/backend/extensions/common/ext/common.hpp @@ -213,8 +213,7 @@ sycl::nd_range<1> pybind11::dtype dtype_from_typenum(int dst_typenum); template - typename factoryT, + template typename factoryT, int _num_types = type_dispatch::num_types> inline void init_dispatch_vector(dispatchT dispatch_vector[]) { @@ -223,8 +222,7 @@ inline void init_dispatch_vector(dispatchT dispatch_vector[]) } template - typename factoryT, + template typename factoryT, int _num_types = 
type_dispatch::num_types> inline void init_dispatch_table(dispatchT dispatch_table[][_num_types]) { diff --git a/dpnp/backend/extensions/common/ext/dispatch_table.hpp b/dpnp/backend/extensions/common/ext/dispatch_table.hpp index 4cfe1bd57250..6655f054f355 100644 --- a/dpnp/backend/extensions/common/ext/dispatch_table.hpp +++ b/dpnp/backend/extensions/common/ext/dispatch_table.hpp @@ -99,8 +99,7 @@ using SupportedDTypeList2 = std::vector; template - typename Func> + template typename Func> struct TableBuilder { template @@ -125,8 +124,7 @@ struct TableBuilder template - typename Func> + template typename Func> struct TableBuilder2 { template @@ -232,8 +230,7 @@ class DispatchTable2 } template - typename Func> + template typename Func> void populate_dispatch_table() { using TBulder = typename TableBuilder2::type; diff --git a/dpnp/backend/extensions/elementwise_functions/common.hpp b/dpnp/backend/extensions/elementwise_functions/common.hpp index df2b3afe53b9..f3b15c8d6774 100644 --- a/dpnp/backend/extensions/elementwise_functions/common.hpp +++ b/dpnp/backend/extensions/elementwise_functions/common.hpp @@ -131,8 +131,7 @@ struct UnaryTwoOutputsContigFunctor else if constexpr (enable_sg_loadstore && UnaryTwoOutputsOpT::supports_sg_loadstore::value && UnaryTwoOutputsOpT::supports_vec::value && - (vec_sz > 1)) - { + (vec_sz > 1)) { auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; @@ -171,8 +170,7 @@ struct UnaryTwoOutputsContigFunctor } else if constexpr (enable_sg_loadstore && UnaryTwoOutputsOpT::supports_sg_loadstore::value && - std::is_same_v) - { + std::is_same_v) { // default: use scalar-value function auto sg = ndit.get_sub_group(); @@ -214,8 +212,7 @@ struct UnaryTwoOutputsContigFunctor } } else if constexpr (enable_sg_loadstore && - UnaryTwoOutputsOpT::supports_sg_loadstore::value) - { + UnaryTwoOutputsOpT::supports_sg_loadstore::value) { // default: use scalar-value function auto sg = ndit.get_sub_group(); @@ -359,8 
+356,7 @@ struct BinaryTwoOutputsContigFunctor if constexpr (enable_sg_loadstore && BinaryOperatorT::supports_sg_loadstore::value && - BinaryOperatorT::supports_vec::value && (vec_sz > 1)) - { + BinaryOperatorT::supports_vec::value && (vec_sz > 1)) { auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_max_local_range()[0]; @@ -405,8 +401,7 @@ struct BinaryTwoOutputsContigFunctor } } else if constexpr (enable_sg_loadstore && - BinaryOperatorT::supports_sg_loadstore::value) - { + BinaryOperatorT::supports_sg_loadstore::value) { auto sg = ndit.get_sub_group(); const std::uint16_t sgSize = sg.get_max_local_range()[0]; @@ -528,21 +523,18 @@ struct BinaryTwoOutputsStridedFunctor * dpctl::tensor::kernels::elementwise_common namespace. */ template - class UnaryTwoOutputsType, + template class UnaryTwoOutputsType, template - class UnaryTwoOutputsContigFunctorT, + bool enable> class UnaryTwoOutputsContigFunctorT, template - class kernel_name, + std::uint8_t nv> class kernel_name, std::uint8_t vec_sz = 4u, std::uint8_t n_vecs = 2u> sycl::event @@ -576,8 +568,7 @@ sycl::event if (is_aligned(arg_p) && is_aligned(res1_p) && - is_aligned(res2_p)) - { + is_aligned(res2_p)) { static constexpr bool enable_sg_loadstore = true; using KernelName = BaseKernelName; using Impl = @@ -613,12 +604,15 @@ sycl::event * dpctl::tensor::kernels::elementwise_common namespace. */ template - class UnaryTwoOutputsType, - template - class UnaryTwoOutputsStridedFunctorT, - template - class kernel_name> + template class UnaryTwoOutputsType, + template class UnaryTwoOutputsStridedFunctorT, + template class kernel_name> sycl::event unary_two_outputs_strided_impl( sycl::queue &exec_q, std::size_t nelems, @@ -665,27 +659,25 @@ sycl::event unary_two_outputs_strided_impl( * @note It extends binary_contig_impl from * dpctl::tensor::kernels::elementwise_common namespace. 
*/ -template - class BinaryTwoOutputsType, - template - class BinaryTwoOutputsContigFunctorT, - template - class kernel_name, - std::uint8_t vec_sz = 4u, - std::uint8_t n_vecs = 2u> +template < + typename argTy1, + typename argTy2, + template class BinaryTwoOutputsType, + template class BinaryTwoOutputsContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> sycl::event binary_two_outputs_contig_impl(sycl::queue &exec_q, std::size_t nelems, @@ -726,8 +718,7 @@ sycl::event if (is_aligned(arg1_tp) && is_aligned(arg2_tp) && is_aligned(res1_tp) && - is_aligned(res2_tp)) - { + is_aligned(res2_tp)) { static constexpr bool enable_sg_loadstore = true; using KernelName = BaseKernelName; using Impl = BinaryTwoOutputsContigFunctorT - class BinaryTwoOutputsType, - template - class BinaryTwoOutputsStridedFunctorT, - template - class kernel_name> +template class BinaryTwoOutputsType, + template class BinaryTwoOutputsStridedFunctorT, + template class kernel_name> sycl::event binary_two_outputs_strided_impl( sycl::queue &exec_q, std::size_t nelems, diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp index c996ac07df02..6a29c9a33c5a 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp @@ -309,8 +309,7 @@ std::pair // check that types are supported if (dst1_typeid != func_output_typeids.first || - dst2_typeid != func_output_typeids.second) - { + dst2_typeid != func_output_typeids.second) { throw py::value_error( "One of destination arrays has unexpected elemental data type."); } @@ -362,8 +361,7 @@ std::pair dpctl::tensor::overlap::SameLogicalTensors(); if ((overlap(src, dst1) && !same_logical_tensors(src, dst1)) || (overlap(src, dst2) && !same_logical_tensors(src, dst2)) || - (overlap(dst1, dst2) && 
!same_logical_tensors(dst1, dst2))) - { + (overlap(dst1, dst2) && !same_logical_tensors(dst1, dst2))) { throw py::value_error("Arrays index overlapping segments of memory"); } @@ -430,8 +428,7 @@ std::pair simplified_dst2_strides, src_offset, dst1_offset, dst2_offset); if (nd == 1 && simplified_src_strides[0] == 1 && - simplified_dst1_strides[0] == 1 && simplified_dst2_strides[0] == 1) - { + simplified_dst1_strides[0] == 1 && simplified_dst2_strides[0] == 1) { // Special case of contiguous data auto contig_fn = contig_dispatch_vector[src_typeid]; @@ -625,8 +622,7 @@ std::pair py_binary_ufunc( auto const &same_logical_tensors = dpctl::tensor::overlap::SameLogicalTensors(); if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) || - (overlap(src2, dst) && !same_logical_tensors(src2, dst))) - { + (overlap(src2, dst) && !same_logical_tensors(src2, dst))) { throw py::value_error("Arrays index overlapping segments of memory"); } // check memory overlap @@ -693,8 +689,7 @@ std::pair py_binary_ufunc( if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) && isEqual(simplified_src2_strides, unit_stride) && - isEqual(simplified_dst_strides, unit_stride)) - { + isEqual(simplified_dst_strides, unit_stride)) { auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; if (contig_fn != nullptr) { @@ -716,8 +711,7 @@ std::pair py_binary_ufunc( // special case of C-contiguous matrix and a row if (isEqual(simplified_src2_strides, zero_one_strides) && isEqual(simplified_src1_strides, {simplified_shape[1], one}) && - isEqual(simplified_dst_strides, {simplified_shape[1], one})) - { + isEqual(simplified_dst_strides, {simplified_shape[1], one})) { auto matrix_row_broadcast_fn = contig_matrix_row_broadcast_dispatch_table[src1_typeid] [src2_typeid]; @@ -731,8 +725,7 @@ std::pair py_binary_ufunc( is_aligned( src2_data + src2_offset * src2_itemsize) && is_aligned( - dst_data + dst_offset * dst_itemsize)) - { + dst_data + dst_offset * dst_itemsize)) { std::size_t n0 = 
simplified_shape[0]; std::size_t n1 = simplified_shape[1]; sycl::event comp_ev = matrix_row_broadcast_fn( @@ -749,8 +742,7 @@ std::pair py_binary_ufunc( } if (isEqual(simplified_src1_strides, one_zero_strides) && isEqual(simplified_src2_strides, {one, simplified_shape[0]}) && - isEqual(simplified_dst_strides, {one, simplified_shape[0]})) - { + isEqual(simplified_dst_strides, {one, simplified_shape[0]})) { auto row_matrix_broadcast_fn = contig_row_matrix_broadcast_dispatch_table[src1_typeid] [src2_typeid]; @@ -765,8 +757,7 @@ std::pair py_binary_ufunc( is_aligned( src2_data + src2_offset * src2_itemsize) && is_aligned( - dst_data + dst_offset * dst_itemsize)) - { + dst_data + dst_offset * dst_itemsize)) { std::size_t n0 = simplified_shape[1]; std::size_t n1 = simplified_shape[0]; sycl::event comp_ev = row_matrix_broadcast_fn( @@ -839,8 +830,7 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, } if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || - src2_typeid >= td_ns::num_types) - { + src2_typeid >= td_ns::num_types) { throw std::runtime_error("binary output type lookup failed"); } int dst_typeid = output_types_table[src1_typeid][src2_typeid]; @@ -898,8 +888,8 @@ std::pair } // check that queues are compatible - if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst1, dst2})) - { + if (!dpctl::utils::queues_are_compatible(exec_q, + {src1, src2, dst1, dst2})) { throw py::value_error( "Execution queue is not compatible with allocation queues"); } @@ -955,8 +945,7 @@ std::pair (overlap(src1, dst2) && !same_logical_tensors(src1, dst2)) || (overlap(src2, dst1) && !same_logical_tensors(src2, dst1)) || (overlap(src2, dst2) && !same_logical_tensors(src2, dst2)) || - (overlap(dst1, dst2))) - { + (overlap(dst1, dst2))) { throw py::value_error("Arrays index overlapping segments of memory"); } @@ -1031,8 +1020,7 @@ std::pair if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) && 
isEqual(simplified_src2_strides, unit_stride) && isEqual(simplified_dst1_strides, unit_stride) && - isEqual(simplified_dst2_strides, unit_stride)) - { + isEqual(simplified_dst2_strides, unit_stride)) { auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; if (contig_fn != nullptr) { @@ -1107,8 +1095,7 @@ std::pair py_binary_two_outputs_ufunc_result_type( } if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || - src2_typeid >= td_ns::num_types) - { + src2_typeid >= td_ns::num_types) { throw std::runtime_error("binary output type lookup failed"); } std::pair dst_typeids = @@ -1263,8 +1250,7 @@ std::pair std::initializer_list{1}; if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) && - isEqual(simplified_lhs_strides, unit_stride)) - { + isEqual(simplified_lhs_strides, unit_stride)) { auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid]; if (contig_fn != nullptr) { @@ -1283,8 +1269,7 @@ std::pair static constexpr py::ssize_t one{1}; // special case of C-contiguous matrix and a row if (isEqual(simplified_rhs_strides, one_zero_strides) && - isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) - { + isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) { auto row_matrix_broadcast_fn = contig_row_matrix_broadcast_dispatch_table[rhs_typeid] [lhs_typeid]; diff --git a/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp b/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp index e34cb74fcb0a..c60602ccb01d 100644 --- a/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp +++ b/dpnp/backend/extensions/elementwise_functions/simplify_iteration_space.cpp @@ -292,8 +292,7 @@ void simplify_iteration_space_4( simplified_dst_strides.reserve(nd); if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && - (src3_strides[0] < 0) && (dst_strides[0] < 0)) - { + (src3_strides[0] < 0) && (dst_strides[0] < 0)) { 
simplified_src1_strides.push_back(-src1_strides[0]); simplified_src2_strides.push_back(-src2_strides[0]); simplified_src3_strides.push_back(-src3_strides[0]); diff --git a/dpnp/backend/extensions/fft/common.hpp b/dpnp/backend/extensions/fft/common.hpp index f76da9721316..44f0b43f8597 100644 --- a/dpnp/backend/extensions/fft/common.hpp +++ b/dpnp/backend/extensions/fft/common.hpp @@ -56,8 +56,7 @@ class DescriptorWrapper { mkl_dft::precision fft_prec = get_precision(); if (fft_prec == mkl_dft::precision::DOUBLE && - !q.get_device().has(sycl::aspect::fp64)) - { + !q.get_device().has(sycl::aspect::fp64)) { throw py::value_error("Descriptor is double precision but the " "device does not support double precision."); } @@ -66,10 +65,7 @@ class DescriptorWrapper queue_ptr_ = std::make_unique(q); } - descr_type &get_descriptor() - { - return descr_; - } + descr_type &get_descriptor() { return descr_; } const sycl::queue &get_queue() const { diff --git a/dpnp/backend/extensions/fft/out_of_place.tpp b/dpnp/backend/extensions/fft/out_of_place.tpp index 290408dc60bc..ed5cd37df7f1 100644 --- a/dpnp/backend/extensions/fft/out_of_place.tpp +++ b/dpnp/backend/extensions/fft/out_of_place.tpp @@ -82,9 +82,8 @@ std::pair } sycl::queue exec_q = descr.get_queue(); - if (!dpctl::utils::queues_are_compatible(exec_q, - {in.get_queue(), out.get_queue()})) - { + if (!dpctl::utils::queues_are_compatible( + exec_q, {in.get_queue(), out.get_queue()})) { throw py::value_error("USM allocations are not compatible with the " "execution queue of the descriptor."); } diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index 370d59f95585..e1bc34c9ae8b 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -62,7 +62,7 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ) # treat below headers as system to suppress the warnings there during the build diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp index 99d91744366f..3b2df73f46ef 100644 --- a/dpnp/backend/extensions/indexing/choose.cpp +++ b/dpnp/backend/extensions/indexing/choose.cpp @@ -30,41 +30,123 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include #include #include -#include "choose_kernel.hpp" +#include + #include "dpctl4pybind11.hpp" +#include +#include -// utils extension header #include "ext/common.hpp" +#include "kernels/indexing/choose.hpp" // dpctl tensor headers #include "utils/indexing_utils.hpp" #include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" namespace dpnp::extensions::indexing { +namespace py = pybind11; +namespace impl +{ namespace td_ns = dpctl::tensor::type_dispatch; -static kernels::choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static kernels::choose_fn_ptr_t choose_wrap_dispatch_table[td_ns::num_types] - [td_ns::num_types]; +using dpctl::tensor::ssize_t; + +typedef sycl::event (*choose_fn_ptr_t)(sycl::queue &, + size_t, + ssize_t, + int, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +static choose_fn_ptr_t choose_clip_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static choose_fn_ptr_t choose_wrap_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +sycl::event choose_impl(sycl::queue &q, + size_t nelems, + ssize_t n_chcs, + int nd, + const ssize_t *shape_and_strides, + const char *ind_cp, + char *dst_cp, + char **chcs_cp, + ssize_t ind_offset, + ssize_t dst_offset, + const ssize_t 
*chc_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); -namespace py = pybind11; + const indTy *ind_tp = reinterpret_cast(ind_cp); + Ty *dst_tp = reinterpret_cast(dst_cp); -namespace detail + sycl::event choose_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InOutIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const InOutIndexerT ind_out_indexer{nd, ind_offset, dst_offset, + shape_and_strides}; + + using NthChoiceIndexerT = + dpnp::kernels::choose::strides::NthStrideOffsetUnpacked; + const NthChoiceIndexerT choices_indexer{ + nd, chc_offsets, shape_and_strides, shape_and_strides + 3 * nd}; + + using ChooseFunc = + dpnp::kernels::choose::ChooseFunctor; + + cgh.parallel_for(sycl::range<1>(nelems), + ChooseFunc(ind_tp, dst_tp, chcs_cp, n_chcs, + ind_out_indexer, + choices_indexer)); + }); + + return choose_ev; +} + +template +struct ChooseFactory { + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + fnT fn = choose_impl; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; +namespace detail +{ using host_ptrs_allocator_t = dpctl::tensor::alloc_utils::usm_host_allocator; using ptrs_t = std::vector; @@ -191,7 +273,6 @@ std::vector parse_py_chcs(const sycl::queue &q, return res; } - } // namespace detail std::pair @@ -412,23 +493,6 @@ std::pair return std::make_pair(arg_cleanup_ev, choose_generic_ev); } -template -struct ChooseFactory -{ - fnT get() - { - if constexpr (std::is_integral::value && - !std::is_same::value) { - fnT fn = kernels::choose_impl; - return fn; - } - else { - fnT fn = nullptr; - return fn; - } - } -}; - using dpctl::tensor::indexing_utils::ClipIndex; using dpctl::tensor::indexing_utils::WrapIndex; @@ -441,23 +505,22 @@ using ChooseClipFactory = ChooseFactory>; void init_choose_dispatch_tables(void) { using ext::common::init_dispatch_table; - using kernels::choose_fn_ptr_t; init_dispatch_table( 
choose_clip_dispatch_table); init_dispatch_table( choose_wrap_dispatch_table); } +} // namespace impl void init_choose(py::module_ m) { - dpnp::extensions::indexing::init_choose_dispatch_tables(); + impl::init_choose_dispatch_tables(); - m.def("_choose", &py_choose, "", py::arg("src"), py::arg("chcs"), + m.def("_choose", &impl::py_choose, "", py::arg("src"), py::arg("chcs"), py::arg("dst"), py::arg("mode"), py::arg("sycl_queue"), py::arg("depends") = py::list()); return; } - } // namespace dpnp::extensions::indexing diff --git a/dpnp/backend/extensions/indexing/choose_kernel.hpp b/dpnp/backend/extensions/indexing/choose_kernel.hpp deleted file mode 100644 index 6b1ac8005054..000000000000 --- a/dpnp/backend/extensions/indexing/choose_kernel.hpp +++ /dev/null @@ -1,191 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2025, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. -//***************************************************************************** - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#include "kernels/dpctl_tensor_types.hpp" -#include "utils/indexing_utils.hpp" -#include "utils/offset_utils.hpp" -#include "utils/strided_iters.hpp" -#include "utils/type_utils.hpp" - -namespace dpnp::extensions::indexing::strides_detail -{ - -struct NthStrideOffsetUnpacked -{ - NthStrideOffsetUnpacked(int common_nd, - dpctl::tensor::ssize_t const *_offsets, - dpctl::tensor::ssize_t const *_shape, - dpctl::tensor::ssize_t const *_strides) - : _ind(common_nd), nd(common_nd), offsets(_offsets), shape(_shape), - strides(_strides) - { - } - - template - size_t operator()(dpctl::tensor::ssize_t gid, nT n) const - { - dpctl::tensor::ssize_t relative_offset(0); - _ind.get_displacement( - gid, shape, strides + (n * nd), relative_offset); - - return relative_offset + offsets[n]; - } - -private: - dpctl::tensor::strides::CIndexer_vector _ind; - - int nd; - dpctl::tensor::ssize_t const *offsets; - dpctl::tensor::ssize_t const *shape; - dpctl::tensor::ssize_t const *strides; -}; - -static_assert(sycl::is_device_copyable_v); - -} // namespace dpnp::extensions::indexing::strides_detail - -namespace dpnp::extensions::indexing::kernels -{ - -template -class ChooseFunctor -{ -private: - const IndT *ind = nullptr; - T *dst = nullptr; - char **chcs = nullptr; - dpctl::tensor::ssize_t 
n_chcs; - const IndOutIndexerT ind_out_indexer; - const ChoicesIndexerT chcs_indexer; - -public: - ChooseFunctor(const IndT *ind_, - T *dst_, - char **chcs_, - dpctl::tensor::ssize_t n_chcs_, - const IndOutIndexerT &ind_out_indexer_, - const ChoicesIndexerT &chcs_indexer_) - : ind(ind_), dst(dst_), chcs(chcs_), n_chcs(n_chcs_), - ind_out_indexer(ind_out_indexer_), chcs_indexer(chcs_indexer_) - { - } - - void operator()(sycl::id<1> id) const - { - const ProjectorT proj{}; - - dpctl::tensor::ssize_t i = id[0]; - - auto ind_dst_offsets = ind_out_indexer(i); - dpctl::tensor::ssize_t ind_offset = ind_dst_offsets.get_first_offset(); - dpctl::tensor::ssize_t dst_offset = ind_dst_offsets.get_second_offset(); - - IndT chc_idx = ind[ind_offset]; - // proj produces an index in the range of n_chcs - dpctl::tensor::ssize_t projected_idx = proj(n_chcs, chc_idx); - - dpctl::tensor::ssize_t chc_offset = chcs_indexer(i, projected_idx); - - T *chc = reinterpret_cast(chcs[projected_idx]); - - dst[dst_offset] = chc[chc_offset]; - } -}; - -typedef sycl::event (*choose_fn_ptr_t)(sycl::queue &, - size_t, - dpctl::tensor::ssize_t, - int, - const dpctl::tensor::ssize_t *, - const char *, - char *, - char **, - dpctl::tensor::ssize_t, - dpctl::tensor::ssize_t, - const dpctl::tensor::ssize_t *, - const std::vector &); - -template -sycl::event choose_impl(sycl::queue &q, - size_t nelems, - dpctl::tensor::ssize_t n_chcs, - int nd, - const dpctl::tensor::ssize_t *shape_and_strides, - const char *ind_cp, - char *dst_cp, - char **chcs_cp, - dpctl::tensor::ssize_t ind_offset, - dpctl::tensor::ssize_t dst_offset, - const dpctl::tensor::ssize_t *chc_offsets, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(q); - - const indTy *ind_tp = reinterpret_cast(ind_cp); - Ty *dst_tp = reinterpret_cast(dst_cp); - - sycl::event choose_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InOutIndexerT = - 
dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - const InOutIndexerT ind_out_indexer{nd, ind_offset, dst_offset, - shape_and_strides}; - - using NthChoiceIndexerT = strides_detail::NthStrideOffsetUnpacked; - const NthChoiceIndexerT choices_indexer{ - nd, chc_offsets, shape_and_strides, shape_and_strides + 3 * nd}; - - using ChooseFunc = ChooseFunctor; - - cgh.parallel_for(sycl::range<1>(nelems), - ChooseFunc(ind_tp, dst_tp, chcs_cp, n_chcs, - ind_out_indexer, - choices_indexer)); - }); - - return choose_ev; -} - -} // namespace dpnp::extensions::indexing::kernels diff --git a/dpnp/backend/extensions/lapack/evd_batch_common.hpp b/dpnp/backend/extensions/lapack/evd_batch_common.hpp index e1debdc35934..d2edffcf520a 100644 --- a/dpnp/backend/extensions/lapack/evd_batch_common.hpp +++ b/dpnp/backend/extensions/lapack/evd_batch_common.hpp @@ -75,8 +75,7 @@ std::pair expected_eig_vecs_nd, expected_eig_vals_nd); if (eig_vecs_shape[2] != eig_vals_shape[0] || - eig_vecs_shape[0] != eig_vals_shape[1]) - { + eig_vecs_shape[0] != eig_vals_shape[1]) { throw py::value_error( "The shape of 'eig_vals' must be (batch_size, n), " "where batch_size = " + diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index e0821e23e440..033c3db01b10 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -98,13 +98,13 @@ static sycl::event geqrf_batch_impl(sycl::queue &exec_q, geqrf_batch_event = mkl_lapack::geqrf_batch( exec_q, - m, // The number of rows in each matrix in the batch; (0 ≤ m). - // It must be a non-negative integer. - n, // The number of columns in each matrix in the batch; (0 ≤ n). - // It must be a non-negative integer. - a, // Pointer to the batch of matrices, each of size (m x n). - lda, // The leading dimension of each matrix in the batch. - // For row major layout, lda ≥ max(1, m). + m, // The number of rows in each matrix in the batch; (0 ≤ m). 
+ // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). stride_a, // Stride between consecutive matrices in the batch. tau, // Pointer to the array of scalar factors of the elementary // reflectors for each matrix in the batch. diff --git a/dpnp/backend/extensions/lapack/gesv.cpp b/dpnp/backend/extensions/lapack/gesv.cpp index 0569fab2c350..bec24db585a6 100644 --- a/dpnp/backend/extensions/lapack/gesv.cpp +++ b/dpnp/backend/extensions/lapack/gesv.cpp @@ -114,14 +114,14 @@ static sycl::event gesv_impl(sycl::queue &exec_q, try { getrf_event = mkl_lapack::getrf( exec_q, - n, // The order of the square matrix A (0 ≤ n). - // It must be a non-negative integer. - n, // The number of columns in the square matrix A (0 ≤ n). - // It must be a non-negative integer. - a, // Pointer to the square matrix A (n x n). - lda, // The leading dimension of matrix A. - // It must be at least max(1, n). - ipiv, // Pointer to the output array of pivot indices. + n, // The order of the square matrix A (0 ≤ n). + // It must be a non-negative integer. + n, // The number of columns in the square matrix A (0 ≤ n). + // It must be a non-negative integer. + a, // Pointer to the square matrix A (n x n). + lda, // The leading dimension of matrix A. + // It must be at least max(1, n). + ipiv, // Pointer to the output array of pivot indices. scratchpad, // Pointer to scratchpad memory to be used by MKL // routine for storing intermediate results. 
scratchpad_size, depends); @@ -242,8 +242,7 @@ std::pair // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty // events if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) || - helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) - { + helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); } diff --git a/dpnp/backend/extensions/lapack/gesv_batch.cpp b/dpnp/backend/extensions/lapack/gesv_batch.cpp index ce02f8517eb5..893279245344 100644 --- a/dpnp/backend/extensions/lapack/gesv_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesv_batch.cpp @@ -258,10 +258,10 @@ static sycl::event gesv_batch_impl(sycl::queue &exec_q, try { gesv_event = mkl_lapack::gesv( exec_q, - n, // The order of the square matrix A - // and the number of rows in matrix B (0 ≤ n). - nrhs, // The number of right-hand sides, - // i.e., the number of columns in matrix B (0 ≤ nrhs). + n, // The order of the square matrix A + // and the number of rows in matrix B (0 ≤ n). + nrhs, // The number of right-hand sides, + // i.e., the number of columns in matrix B (0 ≤ nrhs). a_batch, // Pointer to the square coefficient matrix A (n x n). lda, // The leading dimension of a, must be at least max(1, n). 
current_ipiv, // The pivot indices that define the permutation @@ -341,8 +341,7 @@ std::pair // Ensure `batch_size`, `n` and 'nrhs' are non-zero, otherwise return empty // events if (helper::check_zeros_shape(coeff_matrix_nd, coeff_matrix_shape) || - helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) - { + helper::check_zeros_shape(dependent_vals_nd, dependent_vals_shape)) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); } diff --git a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp index d86d7e29413e..62f1e9589a0b 100644 --- a/dpnp/backend/extensions/lapack/gesv_common_utils.hpp +++ b/dpnp/backend/extensions/lapack/gesv_common_utils.hpp @@ -64,8 +64,7 @@ inline void common_gesv_checks(sycl::queue &exec_q, } if (dependent_vals_nd < min_dependent_vals_ndim || - dependent_vals_nd > max_dependent_vals_ndim) - { + dependent_vals_nd > max_dependent_vals_ndim) { throw py::value_error("The dependent values array has ndim=" + std::to_string(dependent_vals_nd) + ", but a " + std::to_string(min_dependent_vals_ndim) + @@ -95,8 +94,7 @@ inline void common_gesv_checks(sycl::queue &exec_q, // check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, - {coeff_matrix, dependent_vals})) - { + {coeff_matrix, dependent_vals})) { throw py::value_error( "Execution queue is not compatible with allocation queues."); } diff --git a/dpnp/backend/extensions/lapack/gesvd.cpp b/dpnp/backend/extensions/lapack/gesvd.cpp index d46179ac3b9a..e347837e3cfe 100644 --- a/dpnp/backend/extensions/lapack/gesvd.cpp +++ b/dpnp/backend/extensions/lapack/gesvd.cpp @@ -171,8 +171,7 @@ std::pair // Ensure `m` and 'n' are non-zero, otherwise return empty // events if (gesvd_utils::check_zeros_shape_gesvd(a_array, out_s, out_u, out_vt, - jobu_val, jobvt_val)) - { + jobu_val, jobvt_val)) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); } @@ 
-223,8 +222,8 @@ struct GesvdContigFactory { fnT get() { - if constexpr (types::GesvdTypePairSupportFactory::is_defined) - { + if constexpr (types::GesvdTypePairSupportFactory::is_defined) { return gesvd_impl; } else { diff --git a/dpnp/backend/extensions/lapack/gesvd_batch.cpp b/dpnp/backend/extensions/lapack/gesvd_batch.cpp index eb9903ba6e1e..868facc200e2 100644 --- a/dpnp/backend/extensions/lapack/gesvd_batch.cpp +++ b/dpnp/backend/extensions/lapack/gesvd_batch.cpp @@ -102,8 +102,7 @@ static sycl::event gesvd_batch_impl(sycl::queue &exec_q, std::int64_t vt_size = 0; if (jobu == oneapi::mkl::jobsvd::somevec || - jobu == oneapi::mkl::jobsvd::vectorsina) - { + jobu == oneapi::mkl::jobsvd::vectorsina) { u_size = m * k; vt_size = k * n; } @@ -238,8 +237,7 @@ std::pair // Ensure `batch_size`, `m` and 'n' are non-zero, otherwise return empty // events if (gesvd_utils::check_zeros_shape_gesvd(a_array, out_s, out_u, out_vt, - jobu_val, jobvt_val)) - { + jobu_val, jobvt_val)) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); } @@ -293,8 +291,8 @@ struct GesvdBatchContigFactory { fnT get() { - if constexpr (types::GesvdTypePairSupportFactory::is_defined) - { + if constexpr (types::GesvdTypePairSupportFactory::is_defined) { return gesvd_batch_impl; } else { diff --git a/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp b/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp index ce2d9c1eb474..1cd2c8ac4997 100644 --- a/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp +++ b/dpnp/backend/extensions/lapack/gesvd_common_utils.hpp @@ -122,8 +122,7 @@ inline void common_gesvd_checks(sycl::queue &exec_q, // check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, - {a_array, out_s, out_u, out_vt})) - { + {a_array, out_s, out_u, out_vt})) { throw py::value_error( "Execution queue is not compatible with allocation queues."); } @@ -131,8 +130,7 @@ inline void common_gesvd_checks(sycl::queue &exec_q, 
auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); if (overlap(a_array, out_s) || overlap(a_array, out_u) || overlap(a_array, out_vt) || overlap(out_s, out_u) || - overlap(out_s, out_vt) || overlap(out_u, out_vt)) - { + overlap(out_s, out_vt) || overlap(out_u, out_vt)) { throw py::value_error("Arrays have overlapping segments of memory"); } diff --git a/dpnp/backend/extensions/lapack/getrf.cpp b/dpnp/backend/extensions/lapack/getrf.cpp index abf20aff643a..870ccc8e811a 100644 --- a/dpnp/backend/extensions/lapack/getrf.cpp +++ b/dpnp/backend/extensions/lapack/getrf.cpp @@ -91,14 +91,14 @@ static sycl::event getrf_impl(sycl::queue &exec_q, getrf_event = mkl_lapack::getrf( exec_q, - m, // The number of rows in the input matrix A (0 ≤ m). - // It must be a non-negative integer. - n, // The number of columns in the input matrix A (0 ≤ n). - // It must be a non-negative integer. - a, // Pointer to the input matrix A (m x n). - lda, // The leading dimension of matrix A. - // It must be at least max(1, m). - ipiv, // Pointer to the output array of pivot indices. + m, // The number of rows in the input matrix A (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in the input matrix A (0 ≤ n). + // It must be a non-negative integer. + a, // Pointer to the input matrix A (m x n). + lda, // The leading dimension of matrix A. + // It must be at least max(1, m). + ipiv, // Pointer to the output array of pivot indices. scratchpad, // Pointer to scratchpad memory to be used by MKL // routine for storing intermediate results. 
scratchpad_size, depends); diff --git a/dpnp/backend/extensions/lapack/getrs.cpp b/dpnp/backend/extensions/lapack/getrs.cpp index 8108afd97003..94e1a1027898 100644 --- a/dpnp/backend/extensions/lapack/getrs.cpp +++ b/dpnp/backend/extensions/lapack/getrs.cpp @@ -208,8 +208,7 @@ std::pair // check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, - {a_array, b_array, ipiv_array})) - { + {a_array, b_array, ipiv_array})) { throw py::value_error( "Execution queue is not compatible with allocation queues"); } diff --git a/dpnp/backend/extensions/lapack/getrs_batch.cpp b/dpnp/backend/extensions/lapack/getrs_batch.cpp index 9fc6ce1a5dfc..f4fb446c328d 100644 --- a/dpnp/backend/extensions/lapack/getrs_batch.cpp +++ b/dpnp/backend/extensions/lapack/getrs_batch.cpp @@ -253,8 +253,7 @@ std::pair // check compatibility of execution queue and allocation queue if (!dpctl::utils::queues_are_compatible(exec_q, - {a_array, b_array, ipiv_array})) - { + {a_array, b_array, ipiv_array})) { throw py::value_error( "Execution queue is not compatible with allocation queues"); } diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp index 5990e5344a17..96d6a03e9b8e 100644 --- a/dpnp/backend/extensions/lapack/heevd.cpp +++ b/dpnp/backend/extensions/lapack/heevd.cpp @@ -124,8 +124,8 @@ struct HeevdContigFactory { fnT get() { - if constexpr (types::HeevdTypePairSupportFactory::is_defined) - { + if constexpr (types::HeevdTypePairSupportFactory::is_defined) { return heevd_impl; } else { diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp index e1c1a96bc320..e8614498bd41 100644 --- a/dpnp/backend/extensions/lapack/heevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -161,8 +161,8 @@ struct HeevdBatchContigFactory { fnT get() { - if constexpr (types::HeevdTypePairSupportFactory::is_defined) - { + if constexpr 
(types::HeevdTypePairSupportFactory::is_defined) { return heevd_batch_impl; } else { diff --git a/dpnp/backend/extensions/lapack/linalg_exceptions.hpp b/dpnp/backend/extensions/lapack/linalg_exceptions.hpp index d087adfbd2b6..c823d1995a4e 100644 --- a/dpnp/backend/extensions/lapack/linalg_exceptions.hpp +++ b/dpnp/backend/extensions/lapack/linalg_exceptions.hpp @@ -37,10 +37,7 @@ class LinAlgError : public std::exception public: explicit LinAlgError(const char *message) : msg_(message) {} - const char *what() const noexcept override - { - return msg_.c_str(); - } + const char *what() const noexcept override { return msg_.c_str(); } private: std::string msg_; diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index ef1c85b91f4a..a29fe9b342fc 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -100,15 +100,15 @@ static sycl::event orgqr_batch_impl(sycl::queue &exec_q, orgqr_batch_event = mkl_lapack::orgqr_batch( exec_q, - m, // The number of rows in each matrix in the batch; (0 ≤ m). - // It must be a non-negative integer. - n, // The number of columns in each matrix in the batch; (0 ≤ n). - // It must be a non-negative integer. - k, // The number of elementary reflectors - // whose product defines the matrices Qi; (0 ≤ k ≤ n). - a, // Pointer to the batch of matrices, each of size (m x n). - lda, // The leading dimension of each matrix in the batch. - // For row major layout, lda ≥ max(1, m). + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + k, // The number of elementary reflectors + // whose product defines the matrices Qi; (0 ≤ k ≤ n). + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. 
+ // For row major layout, lda ≥ max(1, m). stride_a, // Stride between consecutive matrices in the batch. tau, // Pointer to the array of scalar factors of the elementary // reflectors for each matrix in the batch. diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp index af69cf9e6b7e..3ecd386299ac 100644 --- a/dpnp/backend/extensions/lapack/syevd.cpp +++ b/dpnp/backend/extensions/lapack/syevd.cpp @@ -124,8 +124,8 @@ struct SyevdContigFactory { fnT get() { - if constexpr (types::SyevdTypePairSupportFactory::is_defined) - { + if constexpr (types::SyevdTypePairSupportFactory::is_defined) { return syevd_impl; } else { diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp index 0c326e5d79bb..13237d27a35c 100644 --- a/dpnp/backend/extensions/lapack/syevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -161,8 +161,8 @@ struct SyevdBatchContigFactory { fnT get() { - if constexpr (types::SyevdTypePairSupportFactory::is_defined) - { + if constexpr (types::SyevdTypePairSupportFactory::is_defined) { return syevd_batch_impl; } else { diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp b/dpnp/backend/extensions/lapack/ungqr_batch.cpp index 7c890d968b0a..04de27cb257c 100644 --- a/dpnp/backend/extensions/lapack/ungqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp @@ -100,15 +100,15 @@ static sycl::event ungqr_batch_impl(sycl::queue &exec_q, ungqr_batch_event = mkl_lapack::ungqr_batch( exec_q, - m, // The number of rows in each matrix in the batch; (0 ≤ m). - // It must be a non-negative integer. - n, // The number of columns in each matrix in the batch; (0 ≤ n). - // It must be a non-negative integer. - k, // The number of elementary reflectors - // whose product defines the matrices Qi; (0 ≤ k ≤ n). - a, // Pointer to the batch of matrices, each of size (m x n). - lda, // The leading dimension of each matrix in the batch. 
- // For row major layout, lda ≥ max(1, m). + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + k, // The number of elementary reflectors + // whose product defines the matrices Qi; (0 ≤ k ≤ n). + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). stride_a, // Stride between consecutive matrices in the batch. tau, // Pointer to the array of scalar factors of the elementary // reflectors for each matrix in the batch. diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index 7ccb05238ae4..36786c8cbaf3 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -67,7 +67,7 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ) # treat below headers as system to suppress the warnings there during the build diff --git a/dpnp/backend/extensions/statistics/bincount.cpp b/dpnp/backend/extensions/statistics/bincount.cpp index ba258cd55447..9bfe5c2a2449 100644 --- a/dpnp/backend/extensions/statistics/bincount.cpp +++ b/dpnp/backend/extensions/statistics/bincount.cpp @@ -59,10 +59,7 @@ struct BincountEdges { } - boundsT get_bounds() const - { - return {min, max}; - } + boundsT get_bounds() const { return {min, max}; } template size_t get_bin(const sycl::nd_item<_Dims> &, diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp index 539b42475fbf..8091e8874d17 100644 --- a/dpnp/backend/extensions/statistics/histogram_common.hpp +++ 
b/dpnp/backend/extensions/statistics/histogram_common.hpp @@ -28,24 +28,26 @@ #pragma once +#include +#include +#include +#include + #include +#include "dpctl4pybind11.hpp" + #include "ext/common.hpp" +#include "kernels/statistics/histogram.hpp" -namespace dpctl::tensor +namespace statistics::histogram { -class usm_ndarray; -} - using dpctl::tensor::usm_ndarray; using ext::common::AtomicOp; using ext::common::IsNan; using ext::common::Less; -namespace statistics::histogram -{ - template struct CachedData { @@ -64,37 +66,28 @@ struct CachedData local_data = LocalData(shape, cgh); } - T *get_ptr() const - { - return &local_data[0]; - } + T *get_ptr() const { return &local_data[0]; } template void init(const sycl::nd_item<_Dims> &item) const { - uint32_t llid = item.get_local_linear_id(); + std::uint32_t llid = item.get_local_linear_id(); auto local_ptr = &local_data[0]; - uint32_t size = local_data.size(); + std::uint32_t size = local_data.size(); auto group = item.get_group(); - uint32_t local_size = group.get_local_linear_range(); + std::uint32_t local_size = group.get_local_linear_range(); - for (uint32_t i = llid; i < size; i += local_size) { + for (std::uint32_t i = llid; i < size; i += local_size) { local_ptr[i] = global_data[i]; } } - size_t size() const - { - return local_data.size(); - } + std::size_t size() const { return local_data.size(); } - T &operator[](const sycl::id &id) const - { - return local_data[id]; - } + T &operator[](const sycl::id &id) const { return local_data[id]; } template > - T &operator[](const size_t id) const + T &operator[](const std::size_t id) const { return local_data[id]; } @@ -119,28 +112,19 @@ struct UncachedData _shape = shape; } - T *get_ptr() const - { - return global_data; - } + T *get_ptr() const { return global_data; } template void init(const sycl::nd_item<_Dims> &) const { } - size_t size() const - { - return _shape.size(); - } + std::size_t size() const { return _shape.size(); } - T &operator[](const sycl::id &id) 
const - { - return global_data[id]; - } + T &operator[](const sycl::id &id) const { return global_data[id]; } template > - T &operator[](const size_t id) const + T &operator[](const std::size_t id) const { return global_data[id]; } @@ -157,15 +141,15 @@ struct HistLocalType }; template <> -struct HistLocalType +struct HistLocalType { - using type = uint32_t; + using type = std::uint32_t; }; template <> -struct HistLocalType +struct HistLocalType { - using type = int32_t; + using type = std::int32_t; }; template ::type> @@ -177,8 +161,8 @@ struct HistWithLocalCopies using LocalHist = sycl::local_accessor; HistWithLocalCopies(T *global_data, - size_t bins_count, - int32_t copies_count, + std::size_t bins_count, + std::int32_t copies_count, sycl::handler &cgh) { local_hist = LocalHist(sycl::range<2>(copies_count, bins_count), cgh); @@ -188,23 +172,25 @@ struct HistWithLocalCopies template void init(const sycl::nd_item<_Dims> &item, localT val = 0) const { - uint32_t llid = item.get_local_linear_id(); + std::uint32_t llid = item.get_local_linear_id(); auto *local_ptr = &local_hist[0][0]; - uint32_t size = local_hist.size(); + std::uint32_t size = local_hist.size(); auto group = item.get_group(); - uint32_t local_size = group.get_local_linear_range(); + std::uint32_t local_size = group.get_local_linear_range(); - for (uint32_t i = llid; i < size; i += local_size) { + for (std::uint32_t i = llid; i < size; i += local_size) { local_ptr[i] = val; } } template - void add(const sycl::nd_item<_Dims> &item, int32_t bin, localT value) const + void add(const sycl::nd_item<_Dims> &item, + std::int32_t bin, + localT value) const { - int32_t llid = item.get_local_linear_id(); - int32_t local_hist_count = local_hist.get_range().get(0); - int32_t local_copy_id = + std::int32_t llid = item.get_local_linear_id(); + std::int32_t local_hist_count = local_hist.get_range().get(0); + std::int32_t local_copy_id = local_hist_count == 1 ? 
0 : llid % local_hist_count; AtomicOp void finalize(const sycl::nd_item<_Dims> &item) const { - uint32_t llid = item.get_local_linear_id(); - uint32_t bins_count = local_hist.get_range().get(1); - uint32_t local_hist_count = local_hist.get_range().get(0); + std::uint32_t llid = item.get_local_linear_id(); + std::uint32_t bins_count = local_hist.get_range().get(1); + std::uint32_t local_hist_count = local_hist.get_range().get(0); auto group = item.get_group(); - uint32_t local_size = group.get_local_linear_range(); + std::uint32_t local_size = group.get_local_linear_range(); - for (uint32_t i = llid; i < bins_count; i += local_size) { + for (std::uint32_t i = llid; i < bins_count; i += local_size) { auto value = local_hist[0][i]; - for (uint32_t lhc = 1; lhc < local_hist_count; ++lhc) { + for (std::uint32_t lhc = 1; lhc < local_hist_count; ++lhc) { value += local_hist[lhc][i]; } if (value != T(0)) { @@ -235,10 +221,7 @@ struct HistWithLocalCopies } } - uint32_t size() const - { - return local_hist.size(); - } + std::uint32_t size() const { return local_hist.size(); } private: LocalHist local_hist; @@ -251,10 +234,7 @@ struct HistGlobalMemory static constexpr bool const sync_after_init = false; static constexpr bool const sync_before_finalize = false; - HistGlobalMemory(T *global_data) - { - global_hist = global_data; - } + HistGlobalMemory(T *global_data) { global_hist = global_data; } template void init(const sycl::nd_item<_Dims> &) const @@ -262,7 +242,7 @@ struct HistGlobalMemory } template - void add(const sycl::nd_item<_Dims> &, int32_t bin, T value) const + void add(const sycl::nd_item<_Dims> &, std::int32_t bin, T value) const { AtomicOp::add(global_hist[bin], value); @@ -277,27 +257,18 @@ struct HistGlobalMemory T *global_hist = nullptr; }; -template +template struct NoWeights { - constexpr T get(size_t) const - { - return 1; - } + constexpr T get(std::size_t) const { return 1; } }; template struct Weights { - Weights(T *weights) - { - data = weights; - } + 
Weights(T *weights) { data = weights; } - T get(size_t id) const - { - return data[id]; - } + T get(std::size_t id) const { return data[id]; } private: T *data = nullptr; @@ -310,55 +281,23 @@ bool check_in_bounds(const dT &val, const dT &min, const dT &max) return !_less(val, min) && !_less(max, val) && !IsNan
::isnan(val); } -template -class histogram_kernel; - template void submit_histogram(const T *in, - const size_t size, - const size_t dims, - const uint32_t WorkPI, + const std::size_t size, + const std::size_t dims, + const std::uint32_t WorkPI, const HistImpl &hist, const Edges &edges, const Weights &weights, sycl::nd_range<1> nd_range, sycl::handler &cgh) { - cgh.parallel_for>( - nd_range, [=](sycl::nd_item<1> item) { - auto id = item.get_group_linear_id(); - auto lid = item.get_local_linear_id(); - auto group = item.get_group(); - auto local_size = item.get_local_range(0); - - hist.init(item); - edges.init(item); - - if constexpr (HistImpl::sync_after_init || Edges::sync_after_init) { - sycl::group_barrier(group, sycl::memory_scope::work_group); - } - - auto bounds = edges.get_bounds(); - - for (uint32_t i = 0; i < WorkPI; ++i) { - auto data_idx = id * WorkPI * local_size + i * local_size + lid; - if (data_idx < size) { - auto *d = &in[data_idx * dims]; - - if (edges.in_bounds(d, bounds)) { - auto bin = edges.get_bin(item, d, bounds); - auto weight = weights.get(data_idx); - hist.add(item, bin, weight); - } - } - } - - if constexpr (HistImpl::sync_before_finalize) { - sycl::group_barrier(group, sycl::memory_scope::work_group); - } + using HistogramKernel = + dpnp::kernels::histogram::HistogramFunctor; - hist.finalize(item); - }); + cgh.parallel_for( + nd_range, + HistogramKernel(in, size, dims, WorkPI, hist, edges, weights)); } void validate(const usm_ndarray &sample, @@ -366,8 +305,8 @@ void validate(const usm_ndarray &sample, const std::optional &weights, const usm_ndarray &histogram); -uint32_t get_local_hist_copies_count(uint32_t loc_mem_size_in_items, - uint32_t local_size, - uint32_t hist_size_in_items); +std::uint32_t get_local_hist_copies_count(std::uint32_t loc_mem_size_in_items, + std::uint32_t local_size, + std::uint32_t hist_size_in_items); } // namespace statistics::histogram diff --git a/dpnp/backend/extensions/statistics/histogramdd.cpp 
b/dpnp/backend/extensions/statistics/histogramdd.cpp index a5ed4a8c7d1c..bd2177073333 100644 --- a/dpnp/backend/extensions/statistics/histogramdd.cpp +++ b/dpnp/backend/extensions/statistics/histogramdd.cpp @@ -90,10 +90,7 @@ struct EdgesDd } } - boundsT get_bounds() const - { - return {&min[0], &max[0]}; - } + boundsT get_bounds() const { return {&min[0], &max[0]}; } auto get_bin_for_dim(const EdgesT &val, const EdgesT *edges_data, diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp index c5a5bac111dd..329c96dfc1c6 100644 --- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp +++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp @@ -28,25 +28,21 @@ #pragma once -#include - -#include "utils/math_utils.hpp" -#include +#include +#include #include -#include - -#include "ext/common.hpp" +#include -using dpctl::tensor::usm_ndarray; +#include "dpctl4pybind11.hpp" -using ext::common::Align; -using ext::common::CeilDiv; +#include "kernels/statistics/sliding_window1d.hpp" namespace statistics::sliding_window1d { +using dpctl::tensor::usm_ndarray; -template +template class _RegistryDataStorage { public: @@ -129,37 +125,22 @@ class _RegistryDataStorage return sycl::shift_group_right(sbgroup, data[y], x); } - constexpr SizeT size_y() const - { - return _size; - } + constexpr SizeT size_y() const { return _size; } - SizeT size_x() const - { - return sbgroup.get_max_local_range()[0]; - } + SizeT size_x() const { return sbgroup.get_max_local_range()[0]; } - SizeT total_size() const - { - return size_x() * size_y(); - } + SizeT total_size() const { return size_x() * size_y(); } - ncT *ptr() - { - return data; - } + ncT *ptr() { return data; } - SizeT x() const - { - return sbgroup.get_local_linear_id(); - } + SizeT x() const { return sbgroup.get_local_linear_id(); } protected: const sycl::sub_group sbgroup; ncT data[Size]; }; -template +template struct RegistryData : public 
_RegistryDataStorage { using SizeT = typename _RegistryDataStorage::SizeT; @@ -277,8 +258,7 @@ struct RegistryData : public _RegistryDataStorage T *load(const T *const data, const bool &mask, const T &default_v) { - return load( - data, [mask](auto &&) { return mask; }, default_v); + return load(data, [mask](auto &&) { return mask; }, default_v); } T *load(const T *const data) @@ -349,13 +329,10 @@ struct RegistryData : public _RegistryDataStorage return store(data, [mask](auto &&) { return mask; }); } - T *store(T *const data) - { - return store(data, true); - } + T *store(T *const data) { return store(data, true); } }; -template +template struct RegistryWindow : public RegistryData { using SizeT = typename RegistryData::SizeT; @@ -368,7 +345,7 @@ struct RegistryWindow : public RegistryData static_assert(std::is_integral_v, "shift must be of an integral type"); - uint32_t shift_r = this->size_x() - shift; + std::uint32_t shift_r = this->size_x() - shift; for (SizeT i = 0; i < Size; ++i) { this->data[i] = this->shift_left(i, shift); auto border = @@ -379,10 +356,7 @@ struct RegistryWindow : public RegistryData } } - void advance_left(const T &fill_value) - { - advance_left(1, fill_value); - } + void advance_left(const T &fill_value) { advance_left(1, fill_value); } void advance_left() { @@ -391,7 +365,7 @@ struct RegistryWindow : public RegistryData } }; -template +template class Span { public: @@ -400,38 +374,26 @@ class Span Span(T *const data, const SizeT size) : data_(data), size_(size) {} - T *begin() const - { - return data(); - } + T *begin() const { return data(); } - T *end() const - { - return data() + size(); - } + T *end() const { return data() + size(); } - SizeT size() const - { - return size_; - } + SizeT size() const { return size_; } - T *data() const - { - return data_; - } + T *data() const { return data_; } protected: T *const data_; const SizeT size_; }; -template +template Span make_span(T *const data, const SizeT size) { return Span(data, 
size); } -template +template class PaddedSpan : public Span { public: @@ -443,82 +405,22 @@ class PaddedSpan : public Span { } - T *padded_begin() const - { - return this->begin() - pad(); - } + T *padded_begin() const { return this->begin() - pad(); } - SizeT pad() const - { - return pad_; - } + SizeT pad() const { return pad_; } protected: const SizeT pad_; }; -template +template PaddedSpan make_padded_span(T *const data, const SizeT size, const SizeT offset) { return PaddedSpan(data, size, offset); } -template -void process_block(Results &results, - uint32_t r_size, - AData &a_data, - VData &v_data, - uint32_t block_size, - Op op, - Red red) -{ - for (uint32_t i = 0; i < block_size; ++i) { - auto v_val = v_data.broadcast(i); - for (uint32_t r = 0; r < r_size; ++r) { - results[r] = red(results[r], op(a_data[r], v_val)); - } - a_data.advance_left(); - } -} - -template -SizeT get_global_linear_id(const uint32_t wpi, const sycl::nd_item<1> &item) -{ - auto sbgroup = item.get_sub_group(); - const auto sg_loc_id = sbgroup.get_local_linear_id(); - - const SizeT sg_base_id = wpi * (item.get_global_linear_id() - sg_loc_id); - const SizeT id = sg_base_id + sg_loc_id; - - return id; -} - -template -uint32_t get_results_num(const uint32_t wpi, - const SizeT size, - const SizeT global_id, - const sycl::nd_item<1> &item) -{ - auto sbgroup = item.get_sub_group(); - - const auto sbg_size = sbgroup.get_max_local_range()[0]; - const auto size_ = sycl::sub_sat(size, global_id); - return std::min(SizeT(wpi), CeilDiv(size_, sbg_size)); -} - -template -class sliding_window1d_kernel; - -template &a, sycl::nd_range<1> nd_range, sycl::handler &cgh) { - cgh.parallel_for>( - nd_range, [=](sycl::nd_item<1> item) { - auto glid = get_global_linear_id(WorkPI, item); - - auto results = RegistryData(item); - results.fill(0); - - auto results_num = get_results_num(WorkPI, out.size(), glid, item); - - const auto *a_begin = a.begin(); - const auto *a_end = a.end(); - - auto sbgroup = 
item.get_sub_group(); - - const auto chunks_count = - CeilDiv(v.size(), sbgroup.get_max_local_range()[0]); - - const auto *a_ptr = &a.padded_begin()[glid]; - - auto _a_load_cond = [a_begin, a_end](auto &&ptr) { - return ptr >= a_begin && ptr < a_end; - }; - - auto a_data = RegistryWindow(item); - a_ptr = a_data.load(a_ptr, _a_load_cond, 0); - - const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()]; - auto v_size = v.size(); - - for (uint32_t b = 0; b < chunks_count; ++b) { - auto v_data = RegistryData(item); - v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0); - - uint32_t chunk_size_ = - std::min(v_size, SizeT(v_data.total_size())); - process_block(results, results_num, a_data, v_data, chunk_size_, - op, red); + using SlidingWindow1dKernel = + dpnp::kernels::sliding_window1d::SlidingWindow1dFunctor< + WorkPI, PaddedSpan, Span, Op, Red, + Span, RegistryData, RegistryWindow>; - if (b != chunks_count - 1) { - a_ptr = a_data.load_lane(a_data.size_y() - 1, a_ptr, - _a_load_cond, 0); - v_size -= v_data.total_size(); - } - } - - auto *const out_ptr = out.begin(); - // auto *const out_end = out.end(); - - auto y_start = glid; - auto y_stop = - std::min(y_start + WorkPI * results.size_x(), out.size()); - uint32_t i = 0; - for (uint32_t y = y_start; y < y_stop; y += results.size_x()) { - out_ptr[y] = results[i++]; - } - // while the code itself seems to be valid, inside correlate - // kernel it results in memory corruption. Further investigation - // is needed. 
SAT-7693 - // corruption results.store(&out_ptr[glid], - // [out_end](auto &&ptr) { return ptr < out_end; }); - }); + cgh.parallel_for( + nd_range, SlidingWindow1dKernel(a, v, op, red, out)); } -template -class sliding_window1d_small_kernel; - -template &a, sycl::nd_range<1> nd_range, sycl::handler &cgh) { - cgh.parallel_for>( - nd_range, [=](sycl::nd_item<1> item) { - auto glid = get_global_linear_id(WorkPI, item); - - auto results = RegistryData(item); - results.fill(0); - - auto sbgroup = item.get_sub_group(); - auto sg_size = sbgroup.get_max_local_range()[0]; + using SlidingWindow1dSmallKernel = + dpnp::kernels::sliding_window1d::SlidingWindow1dSmallFunctor< + WorkPI, PaddedSpan, Span, Op, Red, + Span, RegistryData, RegistryWindow>; - const uint32_t to_read = WorkPI * sg_size + v.size(); - const auto *a_begin = a.begin(); - - const auto *a_ptr = &a.padded_begin()[glid]; - const auto *a_end = std::min(a_ptr + to_read, a.end()); - - auto _a_load_cond = [a_begin, a_end](auto &&ptr) { - return ptr >= a_begin && ptr < a_end; - }; - - auto a_data = RegistryWindow(item); - a_data.load(a_ptr, _a_load_cond, 0); - - const auto *v_ptr = &v.begin()[sbgroup.get_local_linear_id()]; - auto v_size = v.size(); - - auto v_data = RegistryData(item); - v_ptr = v_data.load(v_ptr, v_data.x() < v_size, 0); - - auto results_num = get_results_num(WorkPI, out.size(), glid, item); - - process_block(results, results_num, a_data, v_data, v_size, op, - red); - - auto *const out_ptr = out.begin(); - // auto *const out_end = out.end(); - - auto y_start = glid; - auto y_stop = - std::min(y_start + WorkPI * results.size_x(), out.size()); - uint32_t i = 0; - for (uint32_t y = y_start; y < y_stop; y += results.size_x()) { - out_ptr[y] = results[i++]; - } - // while the code itself seems to be valid, inside correlate - // kernel it results in memory corruption. Further investigation - // is needed. 
SAT-7693 - // corruption results.store(&out_ptr[glid], - // [out_end](auto &&ptr) { return ptr < out_end; }); - }); + cgh.parallel_for( + nd_range, SlidingWindow1dSmallKernel(a, v, op, red, out)); } void validate(const usm_ndarray &a, const usm_ndarray &v, const usm_ndarray &out, - const size_t l_pad, - const size_t r_pad); + const std::size_t l_pad, + const std::size_t r_pad); } // namespace statistics::sliding_window1d diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp index 5254e50d3faf..6f10e651fe25 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp @@ -184,8 +184,7 @@ using ew_cmn_ns::unary_strided_impl_fn_ptr_t; }; template