JavaTypedScript
diff --git a/‎3rdparty/hal_rvv/hal_rvv.hpp
Copy file name to clipboardExpand all lines: 3rdparty/hal_rvv/hal_rvv.hpp
+1Lines changed: 1 addition & 0 deletions b/‎3rdparty/hal_rvv/hal_rvv.hpp
Copy file name to clipboardExpand all lines: 3rdparty/hal_rvv/hal_rvv.hpp
+1Lines changed: 1 addition & 0 deletions
diff --git a/‎3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
Copy file name to clipboard
+122Lines changed: 122 additions & 0 deletions b/‎3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
Copy file name to clipboard
+122Lines changed: 122 additions & 0 deletions
@@ -38,6 +38,7 @@
 #include "hal_rvv_1p0/cholesky.hpp" // core
 #include "hal_rvv_1p0/qr.hpp" // core
 #include "hal_rvv_1p0/svd.hpp" // core
+#include "hal_rvv_1p0/sqrt.hpp" // core
 
 #include "hal_rvv_1p0/filter.hpp" // imgproc
 #include "hal_rvv_1p0/pyramids.hpp" // imgproc
 
@@ -0,0 +1,122 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level
+// directory of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
+#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
+
+#include <riscv_vector.h>
+#include <cmath>
+#include "hal_rvv_1p0/types.hpp"
+
+namespace cv { namespace cv_hal_rvv {
+
+// Drop any earlier definitions of the sqrt / invSqrt HAL hooks so they can be redirected to the
+// template instantiations provided by this header.
+#undef cv_hal_sqrt32f
+#undef cv_hal_sqrt64f
+#undef cv_hal_invSqrt32f
+#undef cv_hal_invSqrt64f
+
+// sqrt hooks: instantiated with the M8 type wrappers for both element widths.
+#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
+#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
+
+#ifdef __clang__
+// Workaround for clang: in invSqrt, clang uses 2 LMUL registers to store the mask, which causes
+// memory accesses. A smaller LMUL (M4 instead of M8) is therefore used for the invSqrt hooks.
+#    define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M4>>
+#    define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M4>>
+#else
+#    define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
+#    define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
+#endif
+
+namespace detail {
+
+// Element-wise square root, computed as x * (1 / sqrt(x)).
+// The reciprocal square root starts from the __riscv_vfrsqrt7 estimate and is refined with
+// iter_times Newton-Raphson steps: y = y * (1.5 - 0.5 * x * y * y).
+// Lanes where x == 0 or x == INFINITY are returned unchanged instead of being multiplied by y.
+// Use 4 LMUL registers
+//   iter_times - number of Newton-Raphson refinement steps
+//   x          - input vector
+//   vl         - vector length forwarded to every intrinsic
+// NOTE(review): for subnormal x the initial estimate y looks large enough for y * y to overflow,
+// and such lanes are not excluded by the mask below - confirm whether this needs handling.
+template <size_t iter_times, typename VEC_T>
+inline VEC_T sqrt(VEC_T x, size_t vl)
+{
+    // x2 = x / 2 is loop-invariant, so it is computed once up front.
+    auto x2 = __riscv_vfmul(x, 0.5, vl);
+    auto y = __riscv_vfrsqrt7(x, vl);
+// Plain "#pragma unroll" is not recognised by GCC (it is reported under -Wunknown-pragmas),
+// so it is only emitted for clang.
+#ifdef __clang__
+#pragma unroll
+#endif
+    for (size_t i = 0; i < iter_times; i++)
+    {
+        auto t = __riscv_vfmul(y, y, vl);
+        t = __riscv_vfmul(t, x2, vl);
+        t = __riscv_vfrsub(t, 1.5, vl);  // t = 1.5 - t
+        y = __riscv_vfmul(t, y, vl);
+    }
+    // just to prevent the compiler from calculating mask before the inverse square root above has
+    // been computed, which will run out of registers and cause memory access.
+    asm volatile("" ::: "memory");
+    // mask = (x != 0) && (x != INFINITY): the second compare only updates lanes already set.
+    auto mask = __riscv_vmfne(x, 0.0, vl);
+    mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
+    // Masked-off lanes keep x itself; all other lanes become x * y.
+    return __riscv_vfmul_mu(mask, x, x, y, vl);
+}
+
+// Element-wise reciprocal square root, 1 / sqrt(x).
+// Starts from the __riscv_vfrsqrt7 estimate and refines it with iter_times Newton-Raphson steps:
+// y = y * (1.5 - 0.5 * x * y * y).
+// Lanes where x == 0 or x == INFINITY are excluded from the refinement and keep the initial
+// estimate.
+// Use 3 LMUL registers and 1 mask register
+//   iter_times - number of Newton-Raphson refinement steps
+//   x          - input vector
+//   vl         - vector length forwarded to every intrinsic
+// NOTE(review): for subnormal x the initial estimate y looks large enough for y * y to overflow,
+// and such lanes are not excluded by the mask - confirm whether this needs handling.
+template <size_t iter_times, typename VEC_T>
+inline VEC_T invSqrt(VEC_T x, size_t vl)
+{
+    // mask = (x != 0) && (x != INFINITY): the second compare only updates lanes already set.
+    auto mask = __riscv_vmfne(x, 0.0, vl);
+    mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
+    // x2 = x / 2 is loop-invariant, so it is computed once up front.
+    auto x2 = __riscv_vfmul(x, 0.5, vl);
+    auto y = __riscv_vfrsqrt7(x, vl);
+// Plain "#pragma unroll" is not recognised by GCC (it is reported under -Wunknown-pragmas),
+// so it is only emitted for clang.
+#ifdef __clang__
+#pragma unroll
+#endif
+    for (size_t i = 0; i < iter_times; i++)
+    {
+        auto t = __riscv_vfmul(y, y, vl);
+        t = __riscv_vfmul(t, x2, vl);
+        t = __riscv_vfrsub(t, 1.5, vl);  // t = 1.5 - t
+        // Masked-off lanes keep the previous y; all other lanes become t * y.
+        y = __riscv_vfmul_mu(mask, y, t, y, vl);
+    }
+    return y;
+}
+
+}  // namespace detail
+
+// Kernel configuration used by the 32-bit float hooks (cv_hal_sqrt32f / cv_hal_invSqrt32f).
+//   T          - the RVV type wrapper the kernel operates on
+//   iter_times - number of Newton-Raphson refinement steps handed to the detail:: kernels
+template <typename RVV_T>
+struct Sqrt32f
+{
+    static constexpr size_t iter_times = 2;
+    typedef RVV_T T;
+};
+
+// Kernel configuration used by the 64-bit float hooks (cv_hal_sqrt64f / cv_hal_invSqrt64f).
+//   T          - the RVV type wrapper the kernel operates on
+//   iter_times - number of Newton-Raphson refinement steps handed to the detail:: kernels
+template <typename RVV_T>
+struct Sqrt64f
+{
+    static constexpr size_t iter_times = 3;
+    typedef RVV_T T;
+};
+
+// HAL entry point: writes the square root of each of the _len elements of src into dst.
+// The array is processed in chunks whose size is chosen by SQRT_T::T::setvl on every pass.
+//   SQRT_T - configuration struct providing T (RVV type wrapper) and iter_times
+//   src    - input array
+//   dst    - output array
+//   _len   - number of elements to process
+// Always returns CV_HAL_ERROR_OK.
+template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
+inline int sqrt(const Elem* src, Elem* dst, int _len)
+{
+    using Vec = typename SQRT_T::T;
+
+    size_t remaining = _len;
+    while (remaining > 0)
+    {
+        // Number of elements handled in this pass.
+        size_t step = Vec::setvl(remaining);
+        auto values = Vec::vload(src, step);
+        Vec::vstore(dst, detail::sqrt<SQRT_T::iter_times>(values, step), step);
+
+        src += step;
+        dst += step;
+        remaining -= step;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// HAL entry point: writes the reciprocal square root of each of the _len elements of src into dst.
+// The array is processed in chunks whose size is chosen by SQRT_T::T::setvl on every pass.
+//   SQRT_T - configuration struct providing T (RVV type wrapper) and iter_times
+//   src    - input array
+//   dst    - output array
+//   _len   - number of elements to process
+// Always returns CV_HAL_ERROR_OK.
+template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
+inline int invSqrt(const Elem* src, Elem* dst, int _len)
+{
+    using Vec = typename SQRT_T::T;
+
+    size_t remaining = _len;
+    while (remaining > 0)
+    {
+        // Number of elements handled in this pass.
+        size_t step = Vec::setvl(remaining);
+        auto values = Vec::vload(src, step);
+        Vec::vstore(dst, detail::invSqrt<SQRT_T::iter_times>(values, step), step);
+
+        src += step;
+        dst += step;
+        remaining -= step;
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+}}  // namespace cv::cv_hal_rvv
+
+#endif  // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED