Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit e2c93c7

Browse files
update HWY namespace
1 parent dbbc8ac commit e2c93c7
Copy full SHA for e2c93c7

File tree

Expand file treeCollapse file tree

1 file changed

+68
-58
lines changed
Filter options
Expand file treeCollapse file tree

1 file changed

+68
-58
lines changed

numpy/_core/src/umath/loops_arithmetic.dispatch.cpp

Copy file name to clipboard. Expand all lines: numpy/_core/src/umath/loops_arithmetic.dispatch.cpp
+68-58Lines changed: 68 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
#define _MULTIARRAYMODULE
33
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
44

5-
#include "hwy/highway.h"
65
#include "loops_utils.h"
76
#include "loops.h"
87
#include <cstring> // for memcpy
@@ -13,12 +12,12 @@
1312
#include "numpy/npy_math.h"
1413
#include <cstdio>
1514

16-
using namespace hwy::HWY_NAMESPACE;
15+
#include <hwy/highway.h>
16+
namespace hn = hwy::HWY_NAMESPACE;
1717

1818
HWY_BEFORE_NAMESPACE();
1919
namespace HWY_NAMESPACE {
2020

21-
2221
// Helper function to set float status
2322
inline void set_float_status(bool overflow, bool divbyzero) {
2423
if (overflow) {
@@ -32,9 +31,9 @@ inline void set_float_status(bool overflow, bool divbyzero) {
3231
// Signed integer division
3332
template <typename T>
3433
void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) {
35-
using D = ScalableTag<T>;
34+
using D = hn::ScalableTag<T>;
3635
const D d;
37-
const size_t N = Lanes(d);
36+
const size_t N = hn::Lanes(d);
3837

3938
bool raise_overflow = false;
4039
bool raise_divbyzero = false;
@@ -43,108 +42,120 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
4342
// Handle division by zero
4443
std::fill(dst, dst + len, static_cast<T>(0));
4544
raise_divbyzero = true;
46-
} else if (scalar == 1) {
45+
}
46+
else if (scalar == 1) {
4747
// Special case for division by 1
4848
memcpy(dst, src, len * sizeof(T));
49-
} else if (scalar == static_cast<T>(-1)) {
50-
const auto vec_min_val = Set(d, std::numeric_limits<T>::min());
49+
}
50+
else if (scalar == static_cast<T>(-1)) {
51+
const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
5152
size_t i = 0;
5253
for (; i + N <= static_cast<size_t>(len); i += N) {
53-
const auto vec_src = LoadU(d, src + i);
54-
const auto is_min_val = Eq(vec_src, vec_min_val);
55-
const auto vec_res = IfThenElse(is_min_val, vec_min_val, Neg(vec_src));
56-
StoreU(vec_res, d, dst + i);
57-
if (!raise_overflow && !AllFalse(d, is_min_val)) {
54+
const auto vec_src = hn::LoadU(d, src + i);
55+
const auto is_min_val = hn::Eq(vec_src, vec_min_val);
56+
const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
57+
hn::StoreU(vec_res, d, dst + i);
58+
if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
5859
raise_overflow = true;
5960
}
6061
}
6162
if (i < static_cast<size_t>(len)) {
6263
const size_t num = len - i;
63-
const auto vec_src = LoadN(d, src + i, num);
64-
const auto is_min_val = Eq(vec_src, vec_min_val);
65-
const auto vec_res = IfThenElse(is_min_val, vec_min_val, Neg(vec_src));
66-
StoreN(vec_res, d, dst + i, num);
67-
if (!raise_overflow && !AllFalse(d, is_min_val)) {
64+
const auto vec_src = hn::LoadN(d, src + i, num);
65+
const auto is_min_val = hn::Eq(vec_src, vec_min_val);
66+
const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
67+
hn::StoreN(vec_res, d, dst + i, num);
68+
if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
6869
raise_overflow = true;
6970
}
7071
}
71-
} else {
72-
const auto vec_scalar = Set(d, scalar);
73-
const auto zero = Zero(d);
72+
}
73+
else {
74+
const auto vec_scalar = hn::Set(d, scalar);
75+
const auto zero = hn::Zero(d);
7476
size_t i = 0;
7577
for (; i + N <= static_cast<size_t>(len); i += N) {
76-
const auto vec_src = LoadU(d, src + i);
77-
auto vec_res = Div(vec_src, vec_scalar);
78-
const auto vec_mul = Mul(vec_res, vec_scalar);
79-
const auto remainder_check = Ne(vec_src, vec_mul);
80-
const auto vec_nsign_src = Lt(vec_src, zero);
81-
const auto vec_nsign_scalar = Lt(vec_scalar, zero);
82-
const auto diff_sign = Xor(vec_nsign_src, vec_nsign_scalar);
83-
vec_res = IfThenElse(And(remainder_check, diff_sign), Sub(vec_res, Set(d, 1)), vec_res);
84-
StoreU(vec_res, d, dst + i);
78+
const auto vec_src = hn::LoadU(d, src + i);
79+
auto vec_res = hn::Div(vec_src, vec_scalar);
80+
const auto vec_mul = hn::Mul(vec_res, vec_scalar);
81+
const auto remainder_check = hn::Ne(vec_src, vec_mul);
82+
const auto vec_nsign_src = hn::Lt(vec_src, zero);
83+
const auto vec_nsign_scalar = hn::Lt(vec_scalar, zero);
84+
const auto diff_sign = hn::Xor(vec_nsign_src, vec_nsign_scalar);
85+
vec_res = hn::IfThenElse(
86+
hn::And(remainder_check, diff_sign),
87+
hn::Sub(vec_res, hn::Set(d, 1)),
88+
vec_res
89+
);
90+
hn::StoreU(vec_res, d, dst + i);
8591
}
8692
if (i < static_cast<size_t>(len)) {
8793
const size_t num = len - i;
88-
const auto vec_src = LoadN(d, src + i, num);
89-
auto vec_res = Div(vec_src, vec_scalar);
90-
const auto vec_mul = Mul(vec_res, vec_scalar);
91-
const auto remainder_check = Ne(vec_src, vec_mul);
92-
const auto vec_nsign_src = Lt(vec_src, zero);
93-
const auto vec_nsign_scalar = Lt(vec_scalar, zero);
94-
const auto diff_sign = Xor(vec_nsign_src, vec_nsign_scalar);
95-
vec_res = IfThenElse(And(remainder_check, diff_sign), Sub(vec_res, Set(d, 1)), vec_res);
96-
StoreN(vec_res, d, dst + i, num);
94+
const auto vec_src = hn::LoadN(d, src + i, num);
95+
auto vec_res = hn::Div(vec_src, vec_scalar);
96+
const auto vec_mul = hn::Mul(vec_res, vec_scalar);
97+
const auto remainder_check = hn::Ne(vec_src, vec_mul);
98+
const auto vec_nsign_src = hn::Lt(vec_src, zero);
99+
const auto vec_nsign_scalar = hn::Lt(vec_scalar, zero);
100+
const auto diff_sign = hn::Xor(vec_nsign_src, vec_nsign_scalar);
101+
vec_res = hn::IfThenElse(
102+
hn::And(remainder_check, diff_sign),
103+
hn::Sub(vec_res, hn::Set(d, 1)),
104+
vec_res
105+
);
106+
hn::StoreN(vec_res, d, dst + i, num);
97107
}
98108
}
99109

100110
set_float_status(raise_overflow, raise_divbyzero);
101111
}
102112

103-
104113
// Unsigned integer division
105114
template <typename T>
106115
void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) {
107-
using D = ScalableTag<T>;
116+
using D = hn::ScalableTag<T>;
108117
const D d;
109-
const size_t N = Lanes(d);
118+
const size_t N = hn::Lanes(d);
110119

111120
bool raise_divbyzero = false;
112121

113122
if (scalar == 0) {
114123
// Handle division by zero
115124
std::fill(dst, dst + len, static_cast<T>(0));
116125
raise_divbyzero = true;
117-
} else if (scalar == 1) {
126+
}
127+
else if (scalar == 1) {
118128
// Special case for division by 1
119129
memcpy(dst, src, len * sizeof(T));
120-
} else {
121-
const auto vec_scalar = Set(d, scalar);
130+
}
131+
else {
132+
const auto vec_scalar = hn::Set(d, scalar);
122133
size_t i = 0;
123134
for (; i + N <= static_cast<size_t>(len); i += N) {
124-
const auto vec_src = LoadU(d, src + i);
125-
const auto vec_res = Div(vec_src, vec_scalar);
126-
StoreU(vec_res, d, dst + i);
135+
const auto vec_src = hn::LoadU(d, src + i);
136+
const auto vec_res = hn::Div(vec_src, vec_scalar);
137+
hn::StoreU(vec_res, d, dst + i);
127138
}
128139
if (i < static_cast<size_t>(len)) {
129140
const size_t num = len - i;
130-
const auto vec_src = LoadN(d, src + i, num);
131-
const auto vec_res = Div(vec_src, vec_scalar);
132-
StoreN(vec_res, d, dst + i, num);
141+
const auto vec_src = hn::LoadN(d, src + i, num);
142+
const auto vec_res = hn::Div(vec_src, vec_scalar);
143+
hn::StoreN(vec_res, d, dst + i, num);
133144
}
134145
}
135146

136147
set_float_status(false, raise_divbyzero);
137148
}
138149

139-
140150
// Floor division for signed integers
141151
template <typename T>
142152
T floor_div(T n, T d) {
143153
if (HWY_UNLIKELY(d == 0 || (n == std::numeric_limits<T>::min() && d == -1))) {
144154
if (d == 0) {
145155
npy_set_floatstatus_divbyzero();
146156
return 0;
147-
} else {
157+
}
158+
else {
148159
npy_set_floatstatus_overflow();
149160
return std::numeric_limits<T>::min();
150161
}
@@ -167,7 +178,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
167178
if (steps[0] == sizeof(T) && steps[1] == 0 && steps[2] == sizeof(T)) {
168179
// Contiguous case with scalar divisor
169180
simd_divide_by_scalar_contig_signed(src1, *src2, dst, len);
170-
} else {
181+
}
182+
else {
171183
// Non-contiguous case
172184
for (npy_intp i = 0; i < len; ++i) {
173185
*dst = floor_div(*src1, *src2);
@@ -189,7 +201,8 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
189201
if (steps[0] == sizeof(T) && steps[1] == 0 && steps[2] == sizeof(T)) {
190202
// Contiguous case with scalar divisor
191203
simd_divide_by_scalar_contig_unsigned(src1, *src2, dst, len);
192-
} else {
204+
}
205+
else {
193206
// Non-contiguous case
194207
for (npy_intp i = 0; i < len; ++i) {
195208
if (HWY_UNLIKELY(*src2 == 0)) {
@@ -274,7 +287,6 @@ DEFINE_DIVIDE_FUNCTION(LONG, int64_t)
274287
DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
275288
#endif
276289

277-
278290
#define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \
279291
extern "C" { \
280292
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \
@@ -293,10 +305,8 @@ DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t)
293305
DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
294306
#endif
295307

296-
297308
#undef DEFINE_DIVIDE_FUNCTION
298309
#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED
299-
#undef DEFINE_EXTERNAL_FUNCTIONS
300310

301311
} // namespace HWY_NAMESPACE
302312
HWY_AFTER_NAMESPACE();

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.