2
2
#define _MULTIARRAYMODULE
3
3
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
4
4
5
- #include " hwy/highway.h"
6
5
#include " loops_utils.h"
7
6
#include " loops.h"
8
7
#include < cstring> // for memcpy
13
12
#include " numpy/npy_math.h"
14
13
#include < cstdio>
15
14
16
- using namespace hwy ::HWY_NAMESPACE;
15
+ #include < hwy/highway.h>
16
+ namespace hn = hwy::HWY_NAMESPACE;
17
17
18
18
HWY_BEFORE_NAMESPACE ();
19
19
namespace HWY_NAMESPACE {
20
20
21
-
22
21
// Helper function to set float status
23
22
inline void set_float_status (bool overflow, bool divbyzero) {
24
23
if (overflow) {
@@ -32,9 +31,9 @@ inline void set_float_status(bool overflow, bool divbyzero) {
32
31
// Signed integer division
33
32
template <typename T>
34
33
void simd_divide_by_scalar_contig_signed (T* src, T scalar, T* dst, npy_intp len) {
35
- using D = ScalableTag<T>;
34
+ using D = hn:: ScalableTag<T>;
36
35
const D d;
37
- const size_t N = Lanes (d);
36
+ const size_t N = hn:: Lanes (d);
38
37
39
38
bool raise_overflow = false ;
40
39
bool raise_divbyzero = false ;
@@ -43,108 +42,120 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
43
42
// Handle division by zero
44
43
std::fill (dst, dst + len, static_cast <T>(0 ));
45
44
raise_divbyzero = true ;
46
- } else if (scalar == 1 ) {
45
+ }
46
+ else if (scalar == 1 ) {
47
47
// Special case for division by 1
48
48
memcpy (dst, src, len * sizeof (T));
49
- } else if (scalar == static_cast <T>(-1 )) {
50
- const auto vec_min_val = Set (d, std::numeric_limits<T>::min ());
49
+ }
50
+ else if (scalar == static_cast <T>(-1 )) {
51
+ const auto vec_min_val = hn::Set (d, std::numeric_limits<T>::min ());
51
52
size_t i = 0 ;
52
53
for (; i + N <= static_cast <size_t >(len); i += N) {
53
- const auto vec_src = LoadU (d, src + i);
54
- const auto is_min_val = Eq (vec_src, vec_min_val);
55
- const auto vec_res = IfThenElse (is_min_val, vec_min_val, Neg (vec_src));
56
- StoreU (vec_res, d, dst + i);
57
- if (!raise_overflow && !AllFalse (d, is_min_val)) {
54
+ const auto vec_src = hn:: LoadU (d, src + i);
55
+ const auto is_min_val = hn:: Eq (vec_src, vec_min_val);
56
+ const auto vec_res = hn:: IfThenElse (is_min_val, vec_min_val, hn:: Neg (vec_src));
57
+ hn:: StoreU (vec_res, d, dst + i);
58
+ if (!raise_overflow && !hn:: AllFalse (d, is_min_val)) {
58
59
raise_overflow = true ;
59
60
}
60
61
}
61
62
if (i < static_cast <size_t >(len)) {
62
63
const size_t num = len - i;
63
- const auto vec_src = LoadN (d, src + i, num);
64
- const auto is_min_val = Eq (vec_src, vec_min_val);
65
- const auto vec_res = IfThenElse (is_min_val, vec_min_val, Neg (vec_src));
66
- StoreN (vec_res, d, dst + i, num);
67
- if (!raise_overflow && !AllFalse (d, is_min_val)) {
64
+ const auto vec_src = hn:: LoadN (d, src + i, num);
65
+ const auto is_min_val = hn:: Eq (vec_src, vec_min_val);
66
+ const auto vec_res = hn:: IfThenElse (is_min_val, vec_min_val, hn:: Neg (vec_src));
67
+ hn:: StoreN (vec_res, d, dst + i, num);
68
+ if (!raise_overflow && !hn:: AllFalse (d, is_min_val)) {
68
69
raise_overflow = true ;
69
70
}
70
71
}
71
- } else {
72
- const auto vec_scalar = Set (d, scalar);
73
- const auto zero = Zero (d);
72
+ }
73
+ else {
74
+ const auto vec_scalar = hn::Set (d, scalar);
75
+ const auto zero = hn::Zero (d);
74
76
size_t i = 0 ;
75
77
for (; i + N <= static_cast <size_t >(len); i += N) {
76
- const auto vec_src = LoadU (d, src + i);
77
- auto vec_res = Div (vec_src, vec_scalar);
78
- const auto vec_mul = Mul (vec_res, vec_scalar);
79
- const auto remainder_check = Ne (vec_src, vec_mul);
80
- const auto vec_nsign_src = Lt (vec_src, zero);
81
- const auto vec_nsign_scalar = Lt (vec_scalar, zero);
82
- const auto diff_sign = Xor (vec_nsign_src, vec_nsign_scalar);
83
- vec_res = IfThenElse (And (remainder_check, diff_sign), Sub (vec_res, Set (d, 1 )), vec_res);
84
- StoreU (vec_res, d, dst + i);
78
+ const auto vec_src = hn::LoadU (d, src + i);
79
+ auto vec_res = hn::Div (vec_src, vec_scalar);
80
+ const auto vec_mul = hn::Mul (vec_res, vec_scalar);
81
+ const auto remainder_check = hn::Ne (vec_src, vec_mul);
82
+ const auto vec_nsign_src = hn::Lt (vec_src, zero);
83
+ const auto vec_nsign_scalar = hn::Lt (vec_scalar, zero);
84
+ const auto diff_sign = hn::Xor (vec_nsign_src, vec_nsign_scalar);
85
+ vec_res = hn::IfThenElse (
86
+ hn::And (remainder_check, diff_sign),
87
+ hn::Sub (vec_res, hn::Set (d, 1 )),
88
+ vec_res
89
+ );
90
+ hn::StoreU (vec_res, d, dst + i);
85
91
}
86
92
if (i < static_cast <size_t >(len)) {
87
93
const size_t num = len - i;
88
- const auto vec_src = LoadN (d, src + i, num);
89
- auto vec_res = Div (vec_src, vec_scalar);
90
- const auto vec_mul = Mul (vec_res, vec_scalar);
91
- const auto remainder_check = Ne (vec_src, vec_mul);
92
- const auto vec_nsign_src = Lt (vec_src, zero);
93
- const auto vec_nsign_scalar = Lt (vec_scalar, zero);
94
- const auto diff_sign = Xor (vec_nsign_src, vec_nsign_scalar);
95
- vec_res = IfThenElse (And (remainder_check, diff_sign), Sub (vec_res, Set (d, 1 )), vec_res);
96
- StoreN (vec_res, d, dst + i, num);
94
+ const auto vec_src = hn::LoadN (d, src + i, num);
95
+ auto vec_res = hn::Div (vec_src, vec_scalar);
96
+ const auto vec_mul = hn::Mul (vec_res, vec_scalar);
97
+ const auto remainder_check = hn::Ne (vec_src, vec_mul);
98
+ const auto vec_nsign_src = hn::Lt (vec_src, zero);
99
+ const auto vec_nsign_scalar = hn::Lt (vec_scalar, zero);
100
+ const auto diff_sign = hn::Xor (vec_nsign_src, vec_nsign_scalar);
101
+ vec_res = hn::IfThenElse (
102
+ hn::And (remainder_check, diff_sign),
103
+ hn::Sub (vec_res, hn::Set (d, 1 )),
104
+ vec_res
105
+ );
106
+ hn::StoreN (vec_res, d, dst + i, num);
97
107
}
98
108
}
99
109
100
110
set_float_status (raise_overflow, raise_divbyzero);
101
111
}
102
112
103
-
104
113
// Unsigned integer division
105
114
template <typename T>
106
115
void simd_divide_by_scalar_contig_unsigned (T* src, T scalar, T* dst, npy_intp len) {
107
- using D = ScalableTag<T>;
116
+ using D = hn:: ScalableTag<T>;
108
117
const D d;
109
- const size_t N = Lanes (d);
118
+ const size_t N = hn:: Lanes (d);
110
119
111
120
bool raise_divbyzero = false ;
112
121
113
122
if (scalar == 0 ) {
114
123
// Handle division by zero
115
124
std::fill (dst, dst + len, static_cast <T>(0 ));
116
125
raise_divbyzero = true ;
117
- } else if (scalar == 1 ) {
126
+ }
127
+ else if (scalar == 1 ) {
118
128
// Special case for division by 1
119
129
memcpy (dst, src, len * sizeof (T));
120
- } else {
121
- const auto vec_scalar = Set (d, scalar);
130
+ }
131
+ else {
132
+ const auto vec_scalar = hn::Set (d, scalar);
122
133
size_t i = 0 ;
123
134
for (; i + N <= static_cast <size_t >(len); i += N) {
124
- const auto vec_src = LoadU (d, src + i);
125
- const auto vec_res = Div (vec_src, vec_scalar);
126
- StoreU (vec_res, d, dst + i);
135
+ const auto vec_src = hn:: LoadU (d, src + i);
136
+ const auto vec_res = hn:: Div (vec_src, vec_scalar);
137
+ hn:: StoreU (vec_res, d, dst + i);
127
138
}
128
139
if (i < static_cast <size_t >(len)) {
129
140
const size_t num = len - i;
130
- const auto vec_src = LoadN (d, src + i, num);
131
- const auto vec_res = Div (vec_src, vec_scalar);
132
- StoreN (vec_res, d, dst + i, num);
141
+ const auto vec_src = hn:: LoadN (d, src + i, num);
142
+ const auto vec_res = hn:: Div (vec_src, vec_scalar);
143
+ hn:: StoreN (vec_res, d, dst + i, num);
133
144
}
134
145
}
135
146
136
147
set_float_status (false , raise_divbyzero);
137
148
}
138
149
139
-
140
150
// Floor division for signed integers
141
151
template <typename T>
142
152
T floor_div (T n, T d) {
143
153
if (HWY_UNLIKELY (d == 0 || (n == std::numeric_limits<T>::min () && d == -1 ))) {
144
154
if (d == 0 ) {
145
155
npy_set_floatstatus_divbyzero ();
146
156
return 0 ;
147
- } else {
157
+ }
158
+ else {
148
159
npy_set_floatstatus_overflow ();
149
160
return std::numeric_limits<T>::min ();
150
161
}
@@ -167,7 +178,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
167
178
if (steps[0 ] == sizeof (T) && steps[1 ] == 0 && steps[2 ] == sizeof (T)) {
168
179
// Contiguous case with scalar divisor
169
180
simd_divide_by_scalar_contig_signed (src1, *src2, dst, len);
170
- } else {
181
+ }
182
+ else {
171
183
// Non-contiguous case
172
184
for (npy_intp i = 0 ; i < len; ++i) {
173
185
*dst = floor_div (*src1, *src2);
@@ -189,7 +201,8 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
189
201
if (steps[0 ] == sizeof (T) && steps[1 ] == 0 && steps[2 ] == sizeof (T)) {
190
202
// Contiguous case with scalar divisor
191
203
simd_divide_by_scalar_contig_unsigned (src1, *src2, dst, len);
192
- } else {
204
+ }
205
+ else {
193
206
// Non-contiguous case
194
207
for (npy_intp i = 0 ; i < len; ++i) {
195
208
if (HWY_UNLIKELY (*src2 == 0 )) {
@@ -274,7 +287,6 @@ DEFINE_DIVIDE_FUNCTION(LONG, int64_t)
274
287
DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t )
275
288
#endif
276
289
277
-
278
290
#define DEFINE_DIVIDE_FUNCTION_UNSIGNED (TYPE, SCALAR_TYPE ) \
279
291
extern " C" { \
280
292
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX (TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \
@@ -293,10 +305,8 @@ DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t)
293
305
DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t )
294
306
#endif
295
307
296
-
297
308
#undef DEFINE_DIVIDE_FUNCTION
298
309
#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED
299
- #undef DEFINE_EXTERNAL_FUNCTIONS
300
310
301
311
} // namespace HWY_NAMESPACE
302
312
HWY_AFTER_NAMESPACE ();
0 commit comments