From d2e77689db58cb4aab91b5330a3336dd24930ade Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Mon, 18 Jan 2021 17:59:04 +0600 Subject: [PATCH 01/13] Added support for SIMD operations for int types in numpy.count_nonzero function --- numpy/core/src/multiarray/item_selection.c | 287 ++++++++++++++++++++- 1 file changed, 278 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 8e4b2ebe120e..2d1d6db83b04 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2131,18 +2131,22 @@ count_nonzero_bytes_384(const npy_uint64 * w) #if NPY_SIMD +/* + +*/ + /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */ static NPY_INLINE NPY_GCC_OPT_3 npyv_u8 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count) { - const npyv_u8 vone = npyv_setall_u8(1); - const npyv_u8 vzero = npyv_zero_u8(); + const npyv_u8 vone = npyv_setall_u8(1); + const npyv_u8 vzero = npyv_zero_u8(); - npy_intp lane_max = 0; - npyv_u8 vsum8 = npyv_zero_u8(); + npy_intp lane_max = 0; + npyv_u8 vsum8 = npyv_zero_u8(); while (*d < end && lane_max <= max_count - 1) { // we count zeros because `cmpeq` cheaper than `cmpneq` for most archs - npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); + npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); vt = npyv_and_u8(vt, vone); vsum8 = npyv_add_u8(vsum8, vt); *d += npyv_nlanes_u8; @@ -2155,8 +2159,8 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count) { npyv_u16x2 vsum16; - vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); - npy_intp lane_max = 0; + vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); // Setting a vector of 0s (16 maybe) + npy_intp lane_max = 0; // scalar 0 while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) { npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8); npyv_u16x2 part = npyv_expand_u16_u8(vsum8); @@ -2202,7 +2206,252 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) return unrollx - zero_count; } +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + + +static NPY_INLINE NPY_GCC_OPT_3 npy_uintp +count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx) +{ + npy_uintp zero_count = 0; + const npy_uintp innerloop_jump = NPY_MAX_UINT16; + const npy_int16 *end = d + unrollx; + + const npyv_u16 vone = npyv_setall_u16(1); + const npyv_u16 vzero = npyv_zero_u16(); + + npy_int16 *target = d; + npy_uint16 sums[npyv_nlanes_u16]; + + while (dtype_num == NPY_BOOL) { + + +#if NPY_SIMD + if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) { + return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self), + PyArray_DIMS(self), PyArray_STRIDES(self)); + } + + if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) { + return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self), + PyArray_DIMS(self), PyArray_STRIDES(self)); + } + + if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) { + return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self), + PyArray_DIMS(self), PyArray_STRIDES(self)); + } + +#endif + + if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) { return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), PyArray_DIMS(self), PyArray_STRIDES(self)); } + nonzero = PyArray_DESCR(self)->f->nonzero; /* If it's a trivial one-dimensional loop, don't use an iterator */ From c716a120cba2c8c2b972433604b085d87122823e Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Mon, 18 Jan 2021 18:01:55 +0600 Subject: [PATCH 02/13] Added tests for i1,i2,i4,i8 types for numpy.count_nonzero function --- numpy/core/tests/test_numeric.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 280874d21695..6de9e3764cd9 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -1257,20 +1257,30 @@ def test_nonzero_onedim(self): assert_equal(np.count_nonzero(x), 4) assert_equal(np.nonzero(x), ([0, 2, 3, 6],)) - x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)], - dtype=[('a', 'i4'), ('b', 'i2')]) + # x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)], + # dtype=[('a', 'i4'), ('b', 'i2')]) + x = np.array([(1, 2, -5, -3), (0, 0, 2, 7), (1, 1, 0, 1), (-1, 3, 1, 0), (0, 7, 0, 4)], + dtype=[('a', 'i4'), ('b', 'i2'), ('c', 'i1'), ('d', 'i8')]) assert_equal(np.count_nonzero(x['a']), 3) assert_equal(np.count_nonzero(x['b']), 4) + assert_equal(np.count_nonzero(x['c']), 3) + assert_equal(np.count_nonzero(x['d']), 4) assert_equal(np.nonzero(x['a']), ([0, 2, 3],)) assert_equal(np.nonzero(x['b']), ([0, 2, 3, 4],)) def test_nonzero_twodim(self): x = np.array([[0, 1, 0], [2, 0, 3]]) - assert_equal(np.count_nonzero(x), 3) + assert_equal(np.count_nonzero(x.astype('i1')), 3) + assert_equal(np.count_nonzero(x.astype('i2')), 3) + assert_equal(np.count_nonzero(x.astype('i4')), 3) + assert_equal(np.count_nonzero(x.astype('i8')), 3) assert_equal(np.nonzero(x), ([0, 1, 1], [1, 0, 2])) x = np.eye(3) - assert_equal(np.count_nonzero(x), 3) + assert_equal(np.count_nonzero(x.astype('i1')), 3) + assert_equal(np.count_nonzero(x.astype('i2')), 3) + assert_equal(np.count_nonzero(x.astype('i4')), 3) + assert_equal(np.count_nonzero(x.astype('i8')), 3) assert_equal(np.nonzero(x), ([0, 1, 2], [0, 1, 2])) x = np.array([[(0, 1), (0, 0), (1, 11)], From 15cf37d5394e69fc1847b1efa8d5253de4890cbe Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Tue, 19 Jan 2021 16:19:11 +0600 Subject: [PATCH 03/13] Merged count_nonzero_int16/int32/int64 into count_nonzero_int and added benchmarks --- benchmarks/benchmarks/bench_core.py | 2 +- numpy/core/src/multiarray/item_selection.c | 206 +++++++-------------- 2 files changed, 67 insertions(+), 141 deletions(-) diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py index 1c028542db04..279c6f475920 100644 --- a/benchmarks/benchmarks/bench_core.py +++ b/benchmarks/benchmarks/bench_core.py @@ -136,7 +136,7 @@ class CountNonzero(Benchmark): params = [ [1, 2, 3], [100, 10000, 1000000], - [bool, int, str, object] + [bool, np.int8, np.int16, np.int32, np.int64, str, object] ] def setup(self, numaxes, size, dtype): diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 2d1d6db83b04..01438e27d63a 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2206,9 +2206,6 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) return unrollx - zero_count; } -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - static NPY_INLINE NPY_GCC_OPT_3 npy_uintp count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx) @@ -2225,7 +2222,7 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx) while (dtype_num == NPY_INT16 || dtype->type_num == NPY_UINT16) { - return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); - } +// #if NPY_SIMD +// if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) { +// return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self), +// PyArray_DIMS(self), PyArray_STRIDES(self)); +// } - if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) { - return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); - } +// if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) { +// return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self), +// PyArray_DIMS(self), PyArray_STRIDES(self)); +// } - if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) { - return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); - } +// if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) { +// return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self), +// PyArray_DIMS(self), PyArray_STRIDES(self)); +// } -#endif +// #endif + + if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) { + return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self), + PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num); + } if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) { return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), From 2b41cbf3e46e6d16e84f0fa800500346789dba6d Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Tue, 19 Jan 2021 16:22:54 +0600 Subject: [PATCH 04/13] Removed commented out code from PyArray_CountNonzero --- numpy/core/src/multiarray/item_selection.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 01438e27d63a..373286d2305f 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2467,25 +2467,6 @@ PyArray_CountNonzero(PyArrayObject *self) /* Special low-overhead version specific to the boolean type */ dtype = PyArray_DESCR(self); - -// #if NPY_SIMD -// if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) { -// return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self), -// PyArray_DIMS(self), PyArray_STRIDES(self)); -// } - -// if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) { -// return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self), -// PyArray_DIMS(self), PyArray_STRIDES(self)); -// } - -// if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) { -// return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self), -// PyArray_DIMS(self), PyArray_STRIDES(self)); -// } - -// #endif - if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) { return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self), PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num); From 87c5d51a32b406a9872428b9ca1db6c5242dded6 Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Fri, 5 Feb 2021 16:57:56 +0600 Subject: [PATCH 05/13] Replaced manual sums with horizontal simd sums for count_nonzero_16/64 --- numpy/core/src/multiarray/item_selection.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 373286d2305f..f92327827cca 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2218,8 +2218,6 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx) const npyv_u16 vzero = npyv_zero_u16(); npy_int16 *target = d; - npy_uint16 sums[npyv_nlanes_u16]; - while (d Date: Sun, 7 Feb 2021 12:48:39 +0600 Subject: [PATCH 06/13] fixed CI errors and optimized further simd_16 and simd_32 --- numpy/core/src/multiarray/item_selection.c | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index f92327827cca..b0133983af34 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2159,8 +2159,8 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count) { npyv_u16x2 vsum16; - vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); // Setting a vector of 0s (16 maybe) - npy_intp lane_max = 0; // scalar 0 + vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); + npy_intp lane_max = 0; while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) { npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8); npyv_u16x2 part = npyv_expand_u16_u8(vsum8); @@ -2208,16 +2208,16 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) static NPY_INLINE NPY_GCC_OPT_3 npy_uintp -count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx) +count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) { npy_uintp zero_count = 0; - const npy_uintp innerloop_jump = NPY_MAX_UINT16; - const npy_int16 *end = d + unrollx; + npy_uintp innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16; + npy_uint16 *end = d + unrollx; const npyv_u16 vone = npyv_setall_u16(1); const npyv_u16 vzero = npyv_zero_u16(); - npy_int16 *target = d; + npy_uint16 *target = d; while (dtype_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) { - return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self), + return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self), PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num); } From 022cc66e425b12680b252340d56c11d87d3c8765 Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Sun, 7 Feb 2021 16:43:03 +0600 Subject: [PATCH 07/13] some fixes for the build problems --- numpy/core/src/multiarray/item_selection.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index b0133983af34..d793b64efac0 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -30,6 +30,8 @@ #include "array_coercion.h" #include "simd/simd.h" +#include + static NPY_GCC_OPT_3 NPY_INLINE int npy_fasttake_impl( char *dest, char *src, const npy_intp *indices, @@ -2206,12 +2208,17 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) return unrollx - zero_count; } +#define safe_ptr_addition_uint16(result, ptr, adder) \ + result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; + +#define safe_ptr_addition_uint32(result, ptr, adder) \ + result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; static NPY_INLINE NPY_GCC_OPT_3 npy_uintp count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) { npy_uintp zero_count = 0; - npy_uintp innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16; + uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16; npy_uint16 *end = d + unrollx; const npyv_u16 vone = npyv_setall_u16(1); @@ -2220,7 +2227,8 @@ count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) npy_uint16 *target = d; while (d Date: Sun, 7 Feb 2021 18:04:10 +0600 Subject: [PATCH 08/13] another attempt to fix build issues --- numpy/core/src/multiarray/item_selection.c | 38 ++++++++++------------ 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index d793b64efac0..06513ab9a351 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2208,17 +2208,12 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) return unrollx - zero_count; } -#define safe_ptr_addition_uint16(result, ptr, adder) \ - result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; - -#define safe_ptr_addition_uint32(result, ptr, adder) \ - result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; static NPY_INLINE NPY_GCC_OPT_3 npy_uintp count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) { npy_uintp zero_count = 0; - uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16; + uint64_t innerloop_jump = NPY_MAX_UINT16; npy_uint16 *end = d + unrollx; const npyv_u16 vone = npyv_setall_u16(1); @@ -2227,14 +2222,15 @@ count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) npy_uint16 *target = d; while (d> 1); + target = (npy_uint16*) PyArray_MIN(target_tmp, (uint64_t) end); + for (; d> 2); + target = (npy_uint32*) PyArray_MIN(target_tmp, (uint64_t) end); + for (; d Date: Sun, 7 Feb 2021 18:53:34 +0600 Subject: [PATCH 09/13] removed the target variable and changed the loop as suggested by Sayed Adel --- numpy/core/src/multiarray/item_selection.c | 28 +++++++--------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 06513ab9a351..f2da62ae96d4 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2213,23 +2213,17 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_uintp count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) { npy_uintp zero_count = 0; - uint64_t innerloop_jump = NPY_MAX_UINT16; npy_uint16 *end = d + unrollx; const npyv_u16 vone = npyv_setall_u16(1); const npyv_u16 vzero = npyv_zero_u16(); - npy_uint16 *target = d; while (d> 1); - target = (npy_uint16*) PyArray_MIN(target_tmp, (uint64_t) end); - for (; d> 2); - target = (npy_uint32*) PyArray_MIN(target_tmp, (uint64_t) end); - for (; d Date: Mon, 8 Feb 2021 04:13:45 +0600 Subject: [PATCH 10/13] Modified PyArray_CountNonzero to discriminate between types based on elsize --- numpy/core/src/multiarray/item_selection.c | 59 ++++++++++++++++++---- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index f2da62ae96d4..c0eec4c84f66 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2279,7 +2279,7 @@ count_nonzero_int64_simd(npy_uint64 *d, npy_uintp unrollx) static NPY_INLINE NPY_GCC_OPT_3 npy_intp -count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int type_num) +count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize) { int idim; npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; @@ -2342,13 +2342,13 @@ count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp * NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); - if (type_num == NPY_INT16 || type_num == NPY_UINT16) { + if (elsize == 2) { _ITERATE_I16; } - else if (type_num == NPY_INT32 || type_num == NPY_UINT32) { + else if (elsize == 4) { _ITERATE_I32; } - else if (type_num == NPY_INT64 || type_num == NPY_UINT64) { + else if (elsize == 8) { _ITERATE_I64; } @@ -2453,16 +2453,55 @@ PyArray_CountNonzero(PyArrayObject *self) /* Special low-overhead version specific to the boolean type */ dtype = PyArray_DESCR(self); - if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) { - return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num); - } + // if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) { + // return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self), + // PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num); + // } + + // if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) { + // return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), + // PyArray_DIMS(self), PyArray_STRIDES(self)); + // } + + switch(dtype->kind) { + case 'u': + { + if (dtype->elsize == 1) + return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), + PyArray_DIMS(self), PyArray_STRIDES(self)); + + if (dtype->elsize >=2 && dtype->elsize <= 8) + return count_nonzero_int( + PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), + PyArray_STRIDES(self), dtype->elsize + ); - if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) { - return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), + break; + } + case 'i': + { + if (dtype->elsize == 1) + return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), PyArray_DIMS(self), PyArray_STRIDES(self)); + + if (dtype->elsize >=2 && dtype->elsize <= 8) + return count_nonzero_int( + PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), + PyArray_STRIDES(self), dtype->elsize + ); + + break; + } + case 'b': + { + if (dtype->elsize == 1) + return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), + PyArray_DIMS(self), PyArray_STRIDES(self)); + + } } + nonzero = PyArray_DESCR(self)->f->nonzero; /* If it's a trivial one-dimensional loop, don't use an iterator */ From 1eb91a33202416f582dbf389e44409290922734d Mon Sep 17 00:00:00 2001 From: Touqir Sajed Date: Mon, 8 Feb 2021 05:46:06 +0600 Subject: [PATCH 11/13] Ensured overflow does not happen for 16 and 32 bit ints --- numpy/core/src/multiarray/item_selection.c | 34 +++++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index c0eec4c84f66..9de3446352de 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2133,10 +2133,6 @@ count_nonzero_bytes_384(const npy_uint64 * w) #if NPY_SIMD -/* - -*/ - /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */ static NPY_INLINE NPY_GCC_OPT_3 npyv_u8 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count) @@ -2209,23 +2205,37 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) } +#define safe_ptr_addition_uint16(result, ptr, adder) \ + result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; + +#define safe_ptr_addition_uint32(result, ptr, adder) \ + result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; + static NPY_INLINE NPY_GCC_OPT_3 npy_uintp count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) { npy_uintp zero_count = 0; + uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16; npy_uint16 *end = d + unrollx; const npyv_u16 vone = npyv_setall_u16(1); const npyv_u16 vzero = npyv_zero_u16(); + npy_uint16 *target = d; while (d Date: Sat, 13 Feb 2021 05:46:28 +0200 Subject: [PATCH 12/13] cleanup --- numpy/core/src/multiarray/item_selection.c | 412 ++++++++------------- 1 file changed, 148 insertions(+), 264 deletions(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 9de3446352de..2c57e5643440 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -30,8 +30,6 @@ #include "array_coercion.h" #include "simd/simd.h" -#include - static NPY_GCC_OPT_3 NPY_INLINE int npy_fasttake_impl( char *dest, char *src, const npy_intp *indices, @@ -2132,19 +2130,18 @@ count_nonzero_bytes_384(const npy_uint64 * w) } #if NPY_SIMD - /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */ static NPY_INLINE NPY_GCC_OPT_3 npyv_u8 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count) { - const npyv_u8 vone = npyv_setall_u8(1); - const npyv_u8 vzero = npyv_zero_u8(); + const npyv_u8 vone = npyv_setall_u8(1); + const npyv_u8 vzero = npyv_zero_u8(); - npy_intp lane_max = 0; - npyv_u8 vsum8 = npyv_zero_u8(); + npy_intp lane_max = 0; + npyv_u8 vsum8 = npyv_zero_u8(); while (*d < end && lane_max <= max_count - 1) { // we count zeros because `cmpeq` cheaper than `cmpneq` for most archs - npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); + npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); vt = npyv_and_u8(vt, vone); vsum8 = npyv_add_u8(vsum8, vt); *d += npyv_nlanes_u8; @@ -2157,8 +2154,8 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count) { npyv_u16x2 vsum16; - vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); - npy_intp lane_max = 0; + vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); + npy_intp lane_max = 0; while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) { npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8); npyv_u16x2 part = npyv_expand_u16_u8(vsum8); @@ -2168,18 +2165,18 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c } return vsum16; } - +#endif // NPY_SIMD /* * Counts the number of non-zero values in a raw array. * The one loop process is shown below(take SSE2 with 128bits vector for example): - * |------------16 lanes---------| + * |------------16 lanes---------| *[vsum8] 255 255 255 ... 255 255 255 255 count_zero_bytes_u8: counting 255*16 elements * !! - * |------------8 lanes---------| + * |------------8 lanes---------| *[vsum16] 65535 65535 65535 ... 65535 count_zero_bytes_u16: counting (2*16-1)*16 elements * 65535 65535 65535 ... 65535 * !! - * |------------4 lanes---------| + * |------------4 lanes---------| *[sum_32_0] 65535 65535 65535 65535 count_nonzero_bytes * 65535 65535 65535 65535 *[sum_32_1] 65535 65535 65535 65535 @@ -2188,211 +2185,143 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c * (2*16-1)*16 */ static NPY_INLINE NPY_GCC_OPT_3 npy_intp -count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) +count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len) { - npy_intp zero_count = 0; - const npy_uint8 *end = d + unrollx; - while (d < end) { - npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16); - npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]); - npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]); - zero_count += npyv_sum_u32(npyv_add_u32( - npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]), - npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1]) - )); - } - return unrollx - zero_count; + npy_intp count = 0; + if (bstride == 1)) { + #if NPY_SIMD + npy_uintp len_m = len & -npyv_nlanes_u8; + npy_uintp zcount = 0; + for (const char *end = data + len_m; data < end;) { + npyv_u16x2 vsum16 = count_zero_bytes_u16((const npy_uint8**)&data, (const npy_uint8*)end, NPY_MAX_UINT16); + npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]); + npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]); + zcount += npyv_sum_u32(npyv_add_u32( + npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]), + npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1]) + )); + } + len -= len_m; + count = len_m - zcount; + #else + if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) { + int step = 6 * sizeof(npy_uint64); + int left_bytes = len % step; + for (const char *end = data + len; data < end - left_bytes; data += step) { + count += count_nonzero_bytes_384((const npy_uint64 *)data); + } + len = left_bytes; + } + #endif // NPY_SIMD + } + for (; len > 0; --len, data += bstride) { + count += (*data != 0); + } + return count; } - -#define safe_ptr_addition_uint16(result, ptr, adder) \ - result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; - -#define safe_ptr_addition_uint32(result, ptr, adder) \ - result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; - -static NPY_INLINE NPY_GCC_OPT_3 npy_uintp -count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx) +static NPY_INLINE NPY_GCC_OPT_3 npy_intp +count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len) { - npy_uintp zero_count = 0; - uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16; - npy_uint16 *end = d + unrollx; - - const npyv_u16 vone = npyv_setall_u16(1); - const npyv_u16 vzero = npyv_zero_u16(); - - npy_uint16 *target = d; - while (d 0;) { + npyv_u16 vsum16 = npyv_zero_u16(); + npy_uintp max16 = PyArray_MIN(lenx, NPY_MAX_UINT16*npyv_nlanes_u16); + + for (const char *end = data + max16*bstride; data < end; data += NPY_SIMD_WIDTH) { + npyv_u16 mask = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16((npy_uint16*)data), vzero)); + mask = npyv_and_u16(mask, vone); + vsum16 = npyv_add_u16(vsum16, mask); + } + lenx -= max16; + zcount += npyv_sumup_u16(vsum16); } - - const npyv_u16 maskevn = npyv_reinterpret_u16_u32(npyv_setall_u32(0xffff)); - npyv_u32 odd = npyv_shri_u32(npyv_reinterpret_u32_u16(vsum16), 16); - npyv_u32 even = npyv_reinterpret_u32_u16(npyv_and_u16(vsum16, maskevn)); - zero_count += npyv_sum_u32(npyv_add_u32(odd, even)); + len -= len_m; + count = len_m - zcount; } - - return unrollx - zero_count; +#endif + for (; len > 0; --len, data += bstride) { + count += (*(npy_uint16*)data != 0); + } + return count; } - -static NPY_INLINE NPY_GCC_OPT_3 npy_uintp -count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx) +static NPY_INLINE NPY_GCC_OPT_3 npy_intp +count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len) { - npy_uintp zero_count = 0; - uint64_t innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32; - npy_uint32 *end = d + unrollx; - - const npyv_u32 vone = npyv_setall_u32(1); - const npyv_u32 vzero = npyv_zero_u32(); - - npy_uint32 *target = d; - while (d max_iter ? max_iter : len) & -npyv_nlanes_u32; + const npyv_u32 vone = npyv_setall_u32(1); + const npyv_u32 vzero = npyv_zero_u32(); + + npyv_u32 vsum32 = npyv_zero_u32(); + for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) { + npyv_u32 mask = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32((npy_uint32*)data), vzero)); + mask = npyv_and_u32(mask, vone); + vsum32 = npyv_add_u32(vsum32, mask); } - const npyv_u32 maskevn = npyv_reinterpret_u32_u64(npyv_setall_u64(0xffffffffULL)); npyv_u64 odd = npyv_shri_u64(npyv_reinterpret_u64_u32(vsum32), 32); npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn)); - zero_count += npyv_sum_u64(npyv_add_u64(odd, even)); + count = len_m - npyv_sum_u64(npyv_add_u64(odd, even)); + len -= len_m; } - - return unrollx - zero_count; -} - - -static NPY_INLINE NPY_GCC_OPT_3 npy_uintp -count_nonzero_int64_simd(npy_uint64 *d, npy_uintp unrollx) -{ - npy_uintp zero_count; - const npy_uint64 *end = d + unrollx; - const npyv_u64 vone = npyv_setall_u64(1); - const npyv_u64 vzero = npyv_zero_u64(); - npyv_u64 vsum64 = npyv_zero_u64(); - - for (; d 0; --len, data += bstride) { + count += (*(npy_uint32*)data != 0); } - - zero_count = npyv_sum_u64(vsum64); - - return unrollx - zero_count; + return count; } -#endif - - static NPY_INLINE NPY_GCC_OPT_3 npy_intp -count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize) +count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len) { - int idim; - npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; - npy_intp coord[NPY_MAXDIMS]; npy_intp count = 0; - NPY_BEGIN_THREADS_DEF; - - /* Use raw iteration with no heap memory allocation */ - if (PyArray_PrepareOneRawArrayIter( - ndim, ashape, - data, astrides, - &ndim, shape, - &data, strides) < 0) { - return -1; - } - - /* Handle zero-sized array */ - if (shape[0] == 0) { - return 0; - } - - -#define _ITERATE_INT_SIMPLE(bits) \ - npy_int##bits *d = (npy_int##bits *) data; \ - NPY_RAW_ITER_START(idim, ndim, coord, shape) { \ - /* Process the innermost dimension */ \ - for (npy_intp i = 0; i < shape[0]; ++i, d = (npy_int##bits *) (((npy_int8*) d) + strides[0])) { \ - count += (*d != 0); \ - } \ - d = (npy_int##bits *) data; \ - } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d, strides); - -#define _ITERATE_INT(bits, bytes) \ - if (strides[0] == bytes) { \ - npy_int##bits *d2 = (npy_int##bits *) data; \ - NPY_RAW_ITER_START(idim, ndim, coord, shape) { \ - /* Process the innermost dimension */ \ - npy_uint##bits *d = (npy_uint##bits *) data; \ - const npy_uint##bits *e = ((npy_uint##bits *) data) + shape[0]; \ - npy_uintp stride = shape[0] & -npyv_nlanes_u##bits; \ - count += count_nonzero_int##bits##_simd(d, stride); \ - d += stride; \ - for (; d < e; ++d) { \ - count += (*d != 0); \ - } \ - } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d2, strides); \ - } else { \ - _ITERATE_INT_SIMPLE(bits) \ - } - #if NPY_SIMD - #define _ITERATE_I16 _ITERATE_INT(16, 2) - #define _ITERATE_I32 _ITERATE_INT(32, 4) - #define _ITERATE_I64 _ITERATE_INT(64, 8) -#else - #define _ITERATE_I16 _ITERATE_INT_SIMPLE(16) - #define _ITERATE_I32 _ITERATE_INT_SIMPLE(32) - #define _ITERATE_I64 _ITERATE_INT_SIMPLE(64) -#endif + if (bstride == sizeof(npy_uint64)) { + const npy_uintp len_m = len & -npyv_nlanes_u64; + const npyv_u64 vone = npyv_setall_u64(1); + const npyv_u64 vzero = npyv_zero_u64(); - NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); - - if (elsize == 2) { - _ITERATE_I16; - } - else if (elsize == 4) { - _ITERATE_I32; + npyv_u64 vsum64 = npyv_zero_u64(); + for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) { + npyv_u64 mask = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64((npy_uint64*)data), vzero)); + mask = npyv_and_u64(mask, vone); + vsum64 = npyv_add_u64(vsum64, mask); + } + len -= len_m; + count = len_m - npyv_sum_u64(vsum64); } - else if (elsize == 8) { - _ITERATE_I64; +#endif + for (; len > 0; --len, data += bstride) { + count += (*(npy_uint64*)data != 0); } - - NPY_END_THREADS; - return count; } - - - /* * Counts the number of True values in a raw boolean array. This * is a low-overhead function which does no heap allocations. * * Returns -1 on error. */ -NPY_NO_EXPORT npy_intp -count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides) +static NPY_GCC_OPT_3 npy_intp +count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize) { + assert(elsize <= 8); int idim; npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; - npy_intp i, coord[NPY_MAXDIMS]; - npy_intp count = 0; - NPY_BEGIN_THREADS_DEF; + npy_intp coord[NPY_MAXDIMS]; - /* Use raw iteration with no heap memory allocation */ + // Use raw iteration with no heap memory allocation if (PyArray_PrepareOneRawArrayIter( ndim, ashape, data, astrides, @@ -2401,51 +2330,44 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const return -1; } - /* Handle zero-sized array */ + // Handle zero-sized array if (shape[0] == 0) { return 0; } + NPY_BEGIN_THREADS_DEF; NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); - /* Special case for contiguous inner loop */ - if (strides[0] == 1) { - NPY_RAW_ITER_START(idim, ndim, coord, shape) { - /* Process the innermost dimension */ - const char *d = data; - const char *e = data + shape[0]; -#if NPY_SIMD - npy_uintp stride = shape[0] & -npyv_nlanes_u8; - count += count_nonzero_bytes((const npy_uint8 *)d, stride); - d += stride; -#else - if (!NPY_ALIGNMENT_REQUIRED || - npy_is_aligned(d, sizeof(npy_uint64))) { - npy_uintp stride = 6 * sizeof(npy_uint64); - for (; d < e - (shape[0] % stride); d += stride) { - count += count_nonzero_bytes_384((const npy_uint64 *)d); - } - } -#endif - for (; d < e; ++d) { - count += (*d != 0); - } - } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); - } - /* General inner loop */ - else { - NPY_RAW_ITER_START(idim, ndim, coord, shape) { - char *d = data; - /* Process the innermost dimension */ - for (i = 0; i < shape[0]; ++i, d += strides[0]) { - count += (*d != 0); - } - } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); + + #define NONZERO_CASE(LEN, SFX) \ + case LEN: \ + NPY_RAW_ITER_START(idim, ndim, coord, shape) { \ + count += count_nonzero_##SFX(data, strides[0], shape[0]); \ + } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); \ + break + + npy_intp count = 0; + switch(elsize) { + NONZERO_CASE(1, u8); + NONZERO_CASE(2, u16); + NONZERO_CASE(4, u32); + NONZERO_CASE(8, u64); } + #undef NONZERO_CASE NPY_END_THREADS; - return count; } +/* + * Counts the number of True values in a raw boolean array. This + * is a low-overhead function which does no heap allocations. + * + * Returns -1 on error. + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 npy_intp +count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides) +{ + return count_nonzero_int(ndim, data, ashape, astrides, 1); +} /*NUMPY_API * Counts the number of non-zero elements in the array. @@ -2468,60 +2390,22 @@ PyArray_CountNonzero(PyArrayObject *self) npy_intp *strideptr, *innersizeptr; NPY_BEGIN_THREADS_DEF; - /* Special low-overhead version specific to the boolean type */ + // Special low-overhead version specific to the boolean/int types dtype = PyArray_DESCR(self); - - // if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) { - // return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self), - // PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num); - // } - - // if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) { - // return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), - // PyArray_DIMS(self), PyArray_STRIDES(self)); - // } - switch(dtype->kind) { case 'u': - { - if (dtype->elsize == 1) - return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); - - if (dtype->elsize >=2 && dtype->elsize <= 8) - return count_nonzero_int( - PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), - PyArray_STRIDES(self), dtype->elsize - ); - - break; - } case 'i': - { - if (dtype->elsize == 1) - return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); - - if (dtype->elsize >=2 && dtype->elsize <= 8) - return count_nonzero_int( - PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), - PyArray_STRIDES(self), dtype->elsize - ); - - break; - } case 'b': - { - if (dtype->elsize == 1) - return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); - - } + if (dtype->elsize > 8) { + break; + } + return count_nonzero_int( + PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), + PyArray_STRIDES(self), dtype->elsize + ); } - nonzero = PyArray_DESCR(self)->f->nonzero; - /* If it's a trivial one-dimensional loop, don't use an iterator */ if (PyArray_TRIVIALLY_ITERABLE(self)) { needs_api = PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI); From 85e2ce980fa4883c1add983be924d5e16d3723ec Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Sat, 13 Feb 2021 05:59:04 +0200 Subject: [PATCH 13/13] fix up --- numpy/core/src/multiarray/item_selection.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 2c57e5643440..fb354ce5473a 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2188,7 +2188,7 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_intp count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len) { npy_intp count = 0; - if (bstride == 1)) { + if (bstride == 1) { #if NPY_SIMD npy_uintp len_m = len & -npyv_nlanes_u8; npy_uintp zcount = 0;