From d2e77689db58cb4aab91b5330a3336dd24930ade Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Mon, 18 Jan 2021 17:59:04 +0600
Subject: [PATCH 01/13] Added support for SIMD operations for int types in
 numpy.count_nonzero function

---
 numpy/core/src/multiarray/item_selection.c | 287 ++++++++++++++++++++-
 1 file changed, 278 insertions(+), 9 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 8e4b2ebe120e..2d1d6db83b04 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2131,18 +2131,22 @@ count_nonzero_bytes_384(const npy_uint64 * w)
 
 #if NPY_SIMD
 
+/*
+
+*/
+
 /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
 static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
 {
-    const npyv_u8 vone = npyv_setall_u8(1);
-    const npyv_u8 vzero = npyv_zero_u8();
+    const npyv_u8 vone = npyv_setall_u8(1); 
+    const npyv_u8 vzero = npyv_zero_u8();   
 
-    npy_intp lane_max = 0;
-    npyv_u8 vsum8 = npyv_zero_u8();
+    npy_intp lane_max = 0; 
+    npyv_u8 vsum8 = npyv_zero_u8(); 
     while (*d < end && lane_max <= max_count - 1) {
         // we count zeros because `cmpeq` cheaper than `cmpneq` for most archs
-        npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
+        npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); 
         vt = npyv_and_u8(vt, vone);
         vsum8 = npyv_add_u8(vsum8, vt);
         *d += npyv_nlanes_u8;
@@ -2155,8 +2159,8 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2
 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
 {
     npyv_u16x2 vsum16;
-    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16();
-    npy_intp lane_max = 0;
+    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); // Setting a vector of 0s (16 maybe)
+    npy_intp lane_max = 0; // scalar 0
     while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) {
         npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
         npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
@@ -2202,7 +2206,252 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
     return unrollx - zero_count;
 }
 
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
+count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
+{
+    npy_uintp zero_count = 0;
+    const npy_uintp innerloop_jump = NPY_MAX_UINT16;
+    const npy_int16 *end = d + unrollx;
+
+    const npyv_u16 vone = npyv_setall_u16(1); 
+    const npyv_u16 vzero = npyv_zero_u16();   
+
+    npy_int16 *target = d;
+    npy_uint16 sums[npyv_nlanes_u16];
+
+    while (d<end) {
+        npyv_u16 vsum16 = npyv_zero_u16(); 
+        target = MIN(target+innerloop_jump, end);
+        for (; d<target; d+=npyv_nlanes_u16) {
+            npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
+            vt = npyv_and_u16(vt, vone);
+            vsum16 = npyv_add_u16(vsum16, vt);
+        }
+
+        npyv_store_u16(sums, vsum16);
+        for (int i=0; i<npyv_nlanes_u16; ++i) {
+            zero_count += sums[i];
+        }
+    }
+
+    return unrollx - zero_count;
+}
+
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
+count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
+{
+    npy_uintp zero_count = 0;
+    const npy_uintp innerloop_jump = NPY_MAX_UINT32;
+    const npy_int32 *end = d + unrollx;
+
+    const npyv_u32 vone = npyv_setall_u32(1); 
+    const npyv_u32 vzero = npyv_zero_u32();   
+
+    npy_int32 *target = d;
+    while (d<end) {
+        npyv_u32 vsum32 = npyv_zero_u32(); 
+        target = MIN(target+innerloop_jump, end);
+        for (; d<target; d+=npyv_nlanes_u32) {
+            npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
+            vt = npyv_and_u32(vt, vone);
+            vsum32 = npyv_add_u32(vsum32, vt);
+        }
+        zero_count += npyv_sum_u32(vsum32);    
+    }
+
+    return unrollx - zero_count;
+}
+
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
+count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
+{
+    npy_uintp zero_count = 0;
+    const npy_int64 *end = d + unrollx;
+    const npyv_u64 vone = npyv_setall_u64(1); 
+    const npyv_u64 vzero = npyv_zero_u64();   
+    npyv_u64 vsum64 = npyv_zero_u64(); 
+
+    for (; d<end; d+=npyv_nlanes_u64) {
+        npyv_u64 vt = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64(d), vzero)); 
+        vt = npyv_and_u64(vt, vone);
+        vsum64 = npyv_add_u64(vsum64, vt);
+    }
+
+    npy_uint64 sums[npyv_nlanes_u64];
+    npyv_store_u64(sums, vsum64);
+    for (int i=0; i<npyv_nlanes_u64; ++i) {
+        zero_count += sums[i];
+    }
+
+    return unrollx - zero_count;
+}
+
 #endif
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_int16(int ndim, const npy_int16 *data, const npy_intp *ashape, const npy_intp *astrides)
+{
+    int idim;
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+    npy_intp count = 0;
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Use raw iteration with no heap memory allocation */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, ashape,
+                    data, astrides,
+                    &ndim, shape,
+                    &data, strides) < 0) {
+        return -1;
+    }
+
+    /* Handle zero-sized array */
+    if (shape[0] == 0) {
+        return 0;
+    }
+
+    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
+    if (strides[0] == 2) {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            /* Process the innermost dimension */
+            const npy_int16 *d = data;
+            const npy_int16 *e = data + shape[0];
+            npy_uintp stride = shape[0] & -npyv_nlanes_u16;
+            count += count_nonzero_int16_simd(d, stride);
+            d += stride;
+            for (; d < e; ++d) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    } else {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            npy_int16 *d = data;
+            /* Process the innermost dimension */
+            for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    }
+
+    NPY_END_THREADS;
+
+    return count;
+}
+
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_int32(int ndim, const npy_int32 *data, const npy_intp *ashape, const npy_intp *astrides)
+{
+    int idim;
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+    npy_intp count = 0;
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Use raw iteration with no heap memory allocation */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, ashape,
+                    data, astrides,
+                    &ndim, shape,
+                    &data, strides) < 0) {
+        return -1;
+    }
+
+    /* Handle zero-sized array */
+    if (shape[0] == 0) {
+        return 0;
+    }
+
+    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
+    if (strides[0] == 4) {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            /* Process the innermost dimension */
+            const npy_int32 *d = data;
+            const npy_int32 *e = data + shape[0];
+            npy_uintp stride = shape[0] & -npyv_nlanes_u32;
+            count += count_nonzero_int32_simd(d, stride);
+            d += stride;
+            for (; d < e; ++d) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    } else {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            npy_int32 *d = data;
+            /* Process the innermost dimension */
+            for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    }
+
+    NPY_END_THREADS;
+
+    return count;
+}
+
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_int64(int ndim, const npy_int64 *data, const npy_intp *ashape, const npy_intp *astrides)
+{
+    int idim;
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+    npy_intp count = 0;
+    NPY_BEGIN_THREADS_DEF;
+
+    /* Use raw iteration with no heap memory allocation */
+    if (PyArray_PrepareOneRawArrayIter(
+                    ndim, ashape,
+                    data, astrides,
+                    &ndim, shape,
+                    &data, strides) < 0) {
+        return -1;
+    }
+
+    /* Handle zero-sized array */
+    if (shape[0] == 0) {
+        return 0;
+    }
+
+    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
+
+    if (strides[0] == 8) {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            /* Process the innermost dimension */
+            const npy_int64 *d = data;
+            const npy_int64 *e = data + shape[0];
+            npy_uintp stride = shape[0] & -npyv_nlanes_u64;
+            count += count_nonzero_int64_simd(d, stride);
+            d += stride;
+            for (; d < e; ++d) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    } else {
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
+            npy_int64 *d = data;
+            /* Process the innermost dimension */
+            for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
+                count += (*d != 0);
+            }
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    }
+
+    NPY_END_THREADS;
+
+    return count;
+}
+
+
+
 /*
  * Counts the number of True values in a raw boolean array. This
  * is a low-overhead function which does no heap allocations.
@@ -2212,7 +2461,6 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
 NPY_NO_EXPORT npy_intp
 count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
 {
-    
     int idim;
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
     npy_intp i, coord[NPY_MAXDIMS];
@@ -2297,10 +2545,31 @@ PyArray_CountNonzero(PyArrayObject *self)
 
     /* Special low-overhead version specific to the boolean type */
     dtype = PyArray_DESCR(self);
-    if (dtype->type_num == NPY_BOOL) {
+
+
+#if NPY_SIMD
+    if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
+        return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+    }
+
+    if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
+        return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+    }
+
+    if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
+        return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+    }
+
+#endif
+
+    if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) {
         return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
                         PyArray_DIMS(self), PyArray_STRIDES(self));
     }
+
     nonzero = PyArray_DESCR(self)->f->nonzero;
 
     /* If it's a trivial one-dimensional loop, don't use an iterator */

From c716a120cba2c8c2b972433604b085d87122823e Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Mon, 18 Jan 2021 18:01:55 +0600
Subject: [PATCH 02/13] Added tests for i1, i2, i4, i8 types for
 numpy.count_nonzero function

---
 numpy/core/tests/test_numeric.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 280874d21695..6de9e3764cd9 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -1257,20 +1257,30 @@ def test_nonzero_onedim(self):
         assert_equal(np.count_nonzero(x), 4)
         assert_equal(np.nonzero(x), ([0, 2, 3, 6],))
 
-        x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
-                     dtype=[('a', 'i4'), ('b', 'i2')])
+        # x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
+        #              dtype=[('a', 'i4'), ('b', 'i2')])
+        x = np.array([(1, 2, -5, -3), (0, 0, 2, 7), (1, 1, 0, 1), (-1, 3, 1, 0), (0, 7, 0, 4)],
+                     dtype=[('a', 'i4'), ('b', 'i2'), ('c', 'i1'), ('d', 'i8')])
         assert_equal(np.count_nonzero(x['a']), 3)
         assert_equal(np.count_nonzero(x['b']), 4)
+        assert_equal(np.count_nonzero(x['c']), 3)
+        assert_equal(np.count_nonzero(x['d']), 4)
         assert_equal(np.nonzero(x['a']), ([0, 2, 3],))
         assert_equal(np.nonzero(x['b']), ([0, 2, 3, 4],))
 
     def test_nonzero_twodim(self):
         x = np.array([[0, 1, 0], [2, 0, 3]])
-        assert_equal(np.count_nonzero(x), 3)
+        assert_equal(np.count_nonzero(x.astype('i1')), 3)
+        assert_equal(np.count_nonzero(x.astype('i2')), 3)
+        assert_equal(np.count_nonzero(x.astype('i4')), 3)
+        assert_equal(np.count_nonzero(x.astype('i8')), 3)
         assert_equal(np.nonzero(x), ([0, 1, 1], [1, 0, 2]))
 
         x = np.eye(3)
-        assert_equal(np.count_nonzero(x), 3)
+        assert_equal(np.count_nonzero(x.astype('i1')), 3)
+        assert_equal(np.count_nonzero(x.astype('i2')), 3)
+        assert_equal(np.count_nonzero(x.astype('i4')), 3)
+        assert_equal(np.count_nonzero(x.astype('i8')), 3)
         assert_equal(np.nonzero(x), ([0, 1, 2], [0, 1, 2]))
 
         x = np.array([[(0, 1), (0, 0), (1, 11)],

From 15cf37d5394e69fc1847b1efa8d5253de4890cbe Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Tue, 19 Jan 2021 16:19:11 +0600
Subject: [PATCH 03/13] Merged count_nonzero_int16/int32/int64 into
 count_nonzero_int and added benchmarks

---
 benchmarks/benchmarks/bench_core.py        |   2 +-
 numpy/core/src/multiarray/item_selection.c | 206 +++++++--------------
 2 files changed, 67 insertions(+), 141 deletions(-)

diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 1c028542db04..279c6f475920 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -136,7 +136,7 @@ class CountNonzero(Benchmark):
     params = [
         [1, 2, 3],
         [100, 10000, 1000000],
-        [bool, int, str, object]
+        [bool, np.int8, np.int16, np.int32, np.int64, str, object]
     ]
 
     def setup(self, numaxes, size, dtype):
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 2d1d6db83b04..01438e27d63a 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2206,9 +2206,6 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
     return unrollx - zero_count;
 }
 
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
@@ -2225,7 +2222,7 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
 
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
-        target = MIN(target+innerloop_jump, end);
+        target = PyArray_MIN(target+innerloop_jump, end);
         for (; d<target; d+=npyv_nlanes_u16) {
             npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
             vt = npyv_and_u16(vt, vone);
@@ -2255,7 +2252,7 @@ count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
     npy_int32 *target = d;
     while (d<end) {
         npyv_u32 vsum32 = npyv_zero_u32(); 
-        target = MIN(target+innerloop_jump, end);
+        target = PyArray_MIN(target+innerloop_jump, end);
         for (; d<target; d+=npyv_nlanes_u32) {
             npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
             vt = npyv_and_u32(vt, vone);
@@ -2294,60 +2291,9 @@ count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
 
 #endif
 
-static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_int16(int ndim, const npy_int16 *data, const npy_intp *ashape, const npy_intp *astrides)
-{
-    int idim;
-    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
-    npy_intp coord[NPY_MAXDIMS];
-    npy_intp count = 0;
-    NPY_BEGIN_THREADS_DEF;
-
-    /* Use raw iteration with no heap memory allocation */
-    if (PyArray_PrepareOneRawArrayIter(
-                    ndim, ashape,
-                    data, astrides,
-                    &ndim, shape,
-                    &data, strides) < 0) {
-        return -1;
-    }
-
-    /* Handle zero-sized array */
-    if (shape[0] == 0) {
-        return 0;
-    }
-
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-    if (strides[0] == 2) {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            /* Process the innermost dimension */
-            const npy_int16 *d = data;
-            const npy_int16 *e = data + shape[0];
-            npy_uintp stride = shape[0] & -npyv_nlanes_u16;
-            count += count_nonzero_int16_simd(d, stride);
-            d += stride;
-            for (; d < e; ++d) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    } else {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            npy_int16 *d = data;
-            /* Process the innermost dimension */
-            for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    }
-
-    NPY_END_THREADS;
-
-    return count;
-}
-
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_int32(int ndim, const npy_int32 *data, const npy_intp *ashape, const npy_intp *astrides)
+count_nonzero_int(int ndim, void *data, const npy_intp *ashape, const npy_intp *astrides, int type_num)
 {
     int idim;
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
@@ -2369,83 +2315,58 @@ count_nonzero_int32(int ndim, const npy_int32 *data, const npy_intp *ashape, con
         return 0;
     }
 
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-    if (strides[0] == 4) {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            /* Process the innermost dimension */
-            const npy_int32 *d = data;
-            const npy_int32 *e = data + shape[0];
-            npy_uintp stride = shape[0] & -npyv_nlanes_u32;
-            count += count_nonzero_int32_simd(d, stride);
-            d += stride;
-            for (; d < e; ++d) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    } else {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            npy_int32 *d = data;
-            /* Process the innermost dimension */
-            for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    }
-
-    NPY_END_THREADS;
 
-    return count;
-}
+#define _ITERATE_INT_SIMPLE(bits) \
+    npy_int##bits *d = (npy_int##bits *) data; \
+    NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
+        /* Process the innermost dimension */ \
+        for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) { \
+            count += (*d != 0); \
+        } \
+        d = (npy_int##bits *) data; \
+    } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d, strides); 
+
+#define _ITERATE_INT(bits, bytes) \
+    if (strides[0] == bytes) { \
+        npy_int##bits *d2 = (npy_int##bits *) data; \
+        NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
+            /* Process the innermost dimension */ \
+            const npy_int##bits *d = (npy_int##bits *) data; \
+            const npy_int##bits *e = ((npy_int##bits *) data) + shape[0]; \
+            npy_uintp stride = shape[0] & -npyv_nlanes_u##bits; \
+            count += count_nonzero_int##bits##_simd(d, stride); \
+            d += stride; \
+            for (; d < e; ++d) { \
+                count += (*d != 0); \
+            } \
+        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d2, strides); \
+    } else { \
+        _ITERATE_INT_SIMPLE(bits) \
+    } 
 
+#if NPY_SIMD
+    #define _ITERATE_I16 _ITERATE_INT(16, 2)
+    #define _ITERATE_I32 _ITERATE_INT(32, 4)
+    #define _ITERATE_I64 _ITERATE_INT(64, 8)
+#else
+    #define _ITERATE_I16 _ITERATE_INT_SIMPLE(16)
+    #define _ITERATE_I32 _ITERATE_INT_SIMPLE(32)
+    #define _ITERATE_I64 _ITERATE_INT_SIMPLE(64)
+#endif
 
-static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_int64(int ndim, const npy_int64 *data, const npy_intp *ashape, const npy_intp *astrides)
-{
-    int idim;
-    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
-    npy_intp coord[NPY_MAXDIMS];
-    npy_intp count = 0;
-    NPY_BEGIN_THREADS_DEF;
+    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); 
 
-    /* Use raw iteration with no heap memory allocation */
-    if (PyArray_PrepareOneRawArrayIter(
-                    ndim, ashape,
-                    data, astrides,
-                    &ndim, shape,
-                    &data, strides) < 0) {
-        return -1;
+    if (type_num == NPY_INT16 || type_num == NPY_UINT16) {
+        _ITERATE_I16;
     }
-
-    /* Handle zero-sized array */
-    if (shape[0] == 0) {
-        return 0;
+    else if (type_num == NPY_INT32 || type_num == NPY_UINT32) {
+        _ITERATE_I32;
     }
-
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-
-    if (strides[0] == 8) {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            /* Process the innermost dimension */
-            const npy_int64 *d = data;
-            const npy_int64 *e = data + shape[0];
-            npy_uintp stride = shape[0] & -npyv_nlanes_u64;
-            count += count_nonzero_int64_simd(d, stride);
-            d += stride;
-            for (; d < e; ++d) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    } else {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            npy_int64 *d = data;
-            /* Process the innermost dimension */
-            for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+    else if (type_num == NPY_INT64 || type_num == NPY_UINT64) {
+        _ITERATE_I64;
     }
 
-    NPY_END_THREADS;
+    NPY_END_THREADS; 
 
     return count;
 }
@@ -2547,23 +2468,28 @@ PyArray_CountNonzero(PyArrayObject *self)
     dtype = PyArray_DESCR(self);
 
 
-#if NPY_SIMD
-    if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
-        return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
-    }
+// #if NPY_SIMD
+//     if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
+//         return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
+//                         PyArray_DIMS(self), PyArray_STRIDES(self));
+//     }
 
-    if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
-        return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
-    }
+//     if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
+//         return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
+//                         PyArray_DIMS(self), PyArray_STRIDES(self));
+//     }
 
-    if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
-        return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
-    }
+//     if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
+//         return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
+//                         PyArray_DIMS(self), PyArray_STRIDES(self));
+//     }
 
-#endif
+// #endif
+
+    if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
+        return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);
+    }
 
     if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) {
         return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),

From 2b41cbf3e46e6d16e84f0fa800500346789dba6d Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Tue, 19 Jan 2021 16:22:54 +0600
Subject: [PATCH 04/13] Removed commented out code from PyArray_CountNonzero

---
 numpy/core/src/multiarray/item_selection.c | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 01438e27d63a..373286d2305f 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2467,25 +2467,6 @@ PyArray_CountNonzero(PyArrayObject *self)
     /* Special low-overhead version specific to the boolean type */
     dtype = PyArray_DESCR(self);
 
-
-// #if NPY_SIMD
-//     if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
-//         return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
-//                         PyArray_DIMS(self), PyArray_STRIDES(self));
-//     }
-
-//     if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
-//         return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
-//                         PyArray_DIMS(self), PyArray_STRIDES(self));
-//     }
-
-//     if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
-//         return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
-//                         PyArray_DIMS(self), PyArray_STRIDES(self));
-//     }
-
-// #endif
-
     if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
         return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self),
                         PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);

From 87c5d51a32b406a9872428b9ca1db6c5242dded6 Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Fri, 5 Feb 2021 16:57:56 +0600
Subject: [PATCH 05/13] Replaced manual sums with horizontal simd sums for
 count_nonzero_16/64

---
 numpy/core/src/multiarray/item_selection.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 373286d2305f..f92327827cca 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2218,8 +2218,6 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
     const npyv_u16 vzero = npyv_zero_u16();   
 
     npy_int16 *target = d;
-    npy_uint16 sums[npyv_nlanes_u16];
-
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
         target = PyArray_MIN(target+innerloop_jump, end);
@@ -2229,10 +2227,7 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
             vsum16 = npyv_add_u16(vsum16, vt);
         }
 
-        npyv_store_u16(sums, vsum16);
-        for (int i=0; i<npyv_nlanes_u16; ++i) {
-            zero_count += sums[i];
-        }
+        zero_count += npyv_sumup_u16(vsum16);
     }
 
     return unrollx - zero_count;
@@ -2268,7 +2263,7 @@ count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
 {
-    npy_uintp zero_count = 0;
+    npy_uintp zero_count;
     const npy_int64 *end = d + unrollx;
     const npyv_u64 vone = npyv_setall_u64(1); 
     const npyv_u64 vzero = npyv_zero_u64();   
@@ -2280,11 +2275,7 @@ count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
         vsum64 = npyv_add_u64(vsum64, vt);
     }
 
-    npy_uint64 sums[npyv_nlanes_u64];
-    npyv_store_u64(sums, vsum64);
-    for (int i=0; i<npyv_nlanes_u64; ++i) {
-        zero_count += sums[i];
-    }
+    zero_count = npyv_sum_u64(vsum64);
 
     return unrollx - zero_count;
 }

From 65892ef68f3a075578516b0630023d2cd4b832bc Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Sun, 7 Feb 2021 12:48:39 +0600
Subject: [PATCH 06/13] Fixed CI errors and further optimized simd_16 and
 simd_32

---
 numpy/core/src/multiarray/item_selection.c | 34 +++++++++++-----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index f92327827cca..b0133983af34 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2159,8 +2159,8 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2
 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
 {
     npyv_u16x2 vsum16;
-    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); // Setting a vector of 0s (16 maybe)
-    npy_intp lane_max = 0; // scalar 0
+    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); 
+    npy_intp lane_max = 0; 
     while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) {
         npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
         npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
@@ -2208,16 +2208,16 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
-count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
+count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    const npy_uintp innerloop_jump = NPY_MAX_UINT16;
-    const npy_int16 *end = d + unrollx;
+    npy_uintp innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16;
+    npy_uint16 *end = d + unrollx;
 
     const npyv_u16 vone = npyv_setall_u16(1); 
     const npyv_u16 vzero = npyv_zero_u16();   
 
-    npy_int16 *target = d;
+    npy_uint16 *target = d;
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
         target = PyArray_MIN(target+innerloop_jump, end);
@@ -2235,16 +2235,16 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
-count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
+count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    const npy_uintp innerloop_jump = NPY_MAX_UINT32;
-    const npy_int32 *end = d + unrollx;
+    npy_uintp innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32;
+    npy_uint32 *end = d + unrollx;
 
     const npyv_u32 vone = npyv_setall_u32(1); 
     const npyv_u32 vzero = npyv_zero_u32();   
 
-    npy_int32 *target = d;
+    npy_uint32 *target = d;
     while (d<end) {
         npyv_u32 vsum32 = npyv_zero_u32(); 
         target = PyArray_MIN(target+innerloop_jump, end);
@@ -2261,10 +2261,10 @@ count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
-count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
+count_nonzero_int64_simd(npy_uint64 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count;
-    const npy_int64 *end = d + unrollx;
+    const npy_uint64 *end = d + unrollx;
     const npyv_u64 vone = npyv_setall_u64(1); 
     const npyv_u64 vzero = npyv_zero_u64();   
     npyv_u64 vsum64 = npyv_zero_u64(); 
@@ -2284,7 +2284,7 @@ count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_int(int ndim, void *data, const npy_intp *ashape, const npy_intp *astrides, int type_num)
+count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int type_num)
 {
     int idim;
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
@@ -2311,7 +2311,7 @@ count_nonzero_int(int ndim, void *data, const npy_intp *ashape, const npy_intp *
     npy_int##bits *d = (npy_int##bits *) data; \
     NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
         /* Process the innermost dimension */ \
-        for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) { \
+        for (npy_intp i = 0; i < shape[0]; ++i, d = (npy_int##bits *) (((npy_int8*) d) + strides[0])) { \
             count += (*d != 0); \
         } \
         d = (npy_int##bits *) data; \
@@ -2322,8 +2322,8 @@ count_nonzero_int(int ndim, void *data, const npy_intp *ashape, const npy_intp *
         npy_int##bits *d2 = (npy_int##bits *) data; \
         NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
             /* Process the innermost dimension */ \
-            const npy_int##bits *d = (npy_int##bits *) data; \
-            const npy_int##bits *e = ((npy_int##bits *) data) + shape[0]; \
+            npy_uint##bits *d = (npy_uint##bits *) data; \
+            const npy_uint##bits *e = ((npy_uint##bits *) data) + shape[0]; \
             npy_uintp stride = shape[0] & -npyv_nlanes_u##bits; \
             count += count_nonzero_int##bits##_simd(d, stride); \
             d += stride; \
@@ -2459,7 +2459,7 @@ PyArray_CountNonzero(PyArrayObject *self)
     dtype = PyArray_DESCR(self);
 
     if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
-        return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self),
+        return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self),
                         PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);
     }
 

From 022cc66e425b12680b252340d56c11d87d3c8765 Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Sun, 7 Feb 2021 16:43:03 +0600
Subject: [PATCH 07/13] some fixes for the build problems

---
 numpy/core/src/multiarray/item_selection.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index b0133983af34..d793b64efac0 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -30,6 +30,8 @@
 #include "array_coercion.h"
 #include "simd/simd.h"
 
+#include <stdlib.h>
+
 static NPY_GCC_OPT_3 NPY_INLINE int
 npy_fasttake_impl(
         char *dest, char *src, const npy_intp *indices,
@@ -2206,12 +2208,17 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
     return unrollx - zero_count;
 }
 
+#define safe_ptr_addition_uint16(result, ptr, adder) \
+    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; 
+
+#define safe_ptr_addition_uint32(result, ptr, adder) \
+    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    npy_uintp innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16;
+    uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16;
     npy_uint16 *end = d + unrollx;
 
     const npyv_u16 vone = npyv_setall_u16(1); 
@@ -2220,7 +2227,8 @@ count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
     npy_uint16 *target = d;
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
-        target = PyArray_MIN(target+innerloop_jump, end);
+        safe_ptr_addition_uint16(target, target, innerloop_jump)
+        target = PyArray_MIN(target, end);
         for (; d<target; d+=npyv_nlanes_u16) {
             npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
             vt = npyv_and_u16(vt, vone);
@@ -2238,7 +2246,7 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    npy_uintp innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32;
+    uint64_t innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32;
     npy_uint32 *end = d + unrollx;
 
     const npyv_u32 vone = npyv_setall_u32(1); 
@@ -2247,7 +2255,8 @@ count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
     npy_uint32 *target = d;
     while (d<end) {
         npyv_u32 vsum32 = npyv_zero_u32(); 
-        target = PyArray_MIN(target+innerloop_jump, end);
+        safe_ptr_addition_uint32(target, target, innerloop_jump)
+        target = PyArray_MIN(target, end);
         for (; d<target; d+=npyv_nlanes_u32) {
             npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
             vt = npyv_and_u32(vt, vone);

From 6895bab181c1adbee4b9f8863844b6f788dd947c Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Sun, 7 Feb 2021 18:04:10 +0600
Subject: [PATCH 08/13] another attempt to fix build issues

---
 numpy/core/src/multiarray/item_selection.c | 38 ++++++++++------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index d793b64efac0..06513ab9a351 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2208,17 +2208,12 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
     return unrollx - zero_count;
 }
 
-#define safe_ptr_addition_uint16(result, ptr, adder) \
-    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; 
-
-#define safe_ptr_addition_uint32(result, ptr, adder) \
-    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16;
+    uint64_t innerloop_jump = NPY_MAX_UINT16;
     npy_uint16 *end = d + unrollx;
 
     const npyv_u16 vone = npyv_setall_u16(1); 
@@ -2227,14 +2222,15 @@ count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
     npy_uint16 *target = d;
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
-        safe_ptr_addition_uint16(target, target, innerloop_jump)
-        target = PyArray_MIN(target, end);
-        for (; d<target; d+=npyv_nlanes_u16) {
-            npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
-            vt = npyv_and_u16(vt, vone);
-            vsum16 = npyv_add_u16(vsum16, vt);
+        for (int i=0; i<npyv_nlanes_u16 && d<end; ++i) {
+            uint64_t target_tmp = ((uint64_t)target)+(innerloop_jump >> 1);
+            target = (npy_uint16*) PyArray_MIN(target_tmp, (uint64_t) end);
+            for (; d<target; d+=npyv_nlanes_u16) {
+                npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
+                vt = npyv_and_u16(vt, vone);
+                vsum16 = npyv_add_u16(vsum16, vt);
+            }
         }
-
         zero_count += npyv_sumup_u16(vsum16);
     }
 
@@ -2246,7 +2242,7 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    uint64_t innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32;
+    uint64_t innerloop_jump = NPY_MAX_UINT32;
     npy_uint32 *end = d + unrollx;
 
     const npyv_u32 vone = npyv_setall_u32(1); 
@@ -2255,12 +2251,14 @@ count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
     npy_uint32 *target = d;
     while (d<end) {
         npyv_u32 vsum32 = npyv_zero_u32(); 
-        safe_ptr_addition_uint32(target, target, innerloop_jump)
-        target = PyArray_MIN(target, end);
-        for (; d<target; d+=npyv_nlanes_u32) {
-            npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
-            vt = npyv_and_u32(vt, vone);
-            vsum32 = npyv_add_u32(vsum32, vt);
+        for (int i=0; i<npyv_nlanes_u32 && d<end; ++i) {
+            uint64_t target_tmp = ((uint64_t)target)+(innerloop_jump >> 2);
+            target = (npy_uint32*) PyArray_MIN(target_tmp, (uint64_t) end);
+            for (; d<target; d+=npyv_nlanes_u32) {
+                npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
+                vt = npyv_and_u32(vt, vone);
+                vsum32 = npyv_add_u32(vsum32, vt);
+            }
         }
         zero_count += npyv_sum_u32(vsum32);    
     }

From 89d6e55595ab37d6670e6f2771eb28258b8e7b98 Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Sun, 7 Feb 2021 18:53:34 +0600
Subject: [PATCH 09/13] removed the target variable and changed the loop as
 suggested by Sayed Adel

---
 numpy/core/src/multiarray/item_selection.c | 28 +++++++---------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 06513ab9a351..f2da62ae96d4 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2213,23 +2213,17 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    uint64_t innerloop_jump = NPY_MAX_UINT16;
     npy_uint16 *end = d + unrollx;
 
     const npyv_u16 vone = npyv_setall_u16(1); 
     const npyv_u16 vzero = npyv_zero_u16();   
 
-    npy_uint16 *target = d;
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
-        for (int i=0; i<npyv_nlanes_u16 && d<end; ++i) {
-            uint64_t target_tmp = ((uint64_t)target)+(innerloop_jump >> 1);
-            target = (npy_uint16*) PyArray_MIN(target_tmp, (uint64_t) end);
-            for (; d<target; d+=npyv_nlanes_u16) {
-                npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
-                vt = npyv_and_u16(vt, vone);
-                vsum16 = npyv_add_u16(vsum16, vt);
-            }
+        for (npy_intp i = 0; d < end && i < NPY_MAX_UINT16; ++i, d += npyv_nlanes_u16) {
+            npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
+            vt = npyv_and_u16(vt, vone);
+            vsum16 = npyv_add_u16(vsum16, vt);
         }
         zero_count += npyv_sumup_u16(vsum16);
     }
@@ -2242,23 +2236,17 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
-    uint64_t innerloop_jump = NPY_MAX_UINT32;
     npy_uint32 *end = d + unrollx;
 
     const npyv_u32 vone = npyv_setall_u32(1); 
     const npyv_u32 vzero = npyv_zero_u32();   
 
-    npy_uint32 *target = d;
     while (d<end) {
         npyv_u32 vsum32 = npyv_zero_u32(); 
-        for (int i=0; i<npyv_nlanes_u32 && d<end; ++i) {
-            uint64_t target_tmp = ((uint64_t)target)+(innerloop_jump >> 2);
-            target = (npy_uint32*) PyArray_MIN(target_tmp, (uint64_t) end);
-            for (; d<target; d+=npyv_nlanes_u32) {
-                npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
-                vt = npyv_and_u32(vt, vone);
-                vsum32 = npyv_add_u32(vsum32, vt);
-            }
+        for (npy_intp i = 0; d < end && i < NPY_MAX_UINT32; ++i, d += npyv_nlanes_u32) {
+            npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
+            vt = npyv_and_u32(vt, vone);
+            vsum32 = npyv_add_u32(vsum32, vt);
         }
         zero_count += npyv_sum_u32(vsum32);    
     }

From 534132e17cb2ab071507053bb40f1261d993b4b2 Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Mon, 8 Feb 2021 04:13:45 +0600
Subject: [PATCH 10/13] Modified PyArray_CountNonzero to discriminate between
 types based on elsize

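---
Notes (review; ignored by git-am, not part of the commit message):
    * PyArray_CountNonzero: this hunk leaves the previous type_num-based
      dispatch behind as commented-out code; it can simply be deleted.
    * The 'u' and 'i' switch cases have identical bodies and could share
      stacked case labels. `case 'b'` has no `break`, which is harmless
      only because it is the last case in the switch.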
 numpy/core/src/multiarray/item_selection.c | 59 ++++++++++++++++++----
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index f2da62ae96d4..c0eec4c84f66 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2279,7 +2279,7 @@ count_nonzero_int64_simd(npy_uint64 *d, npy_uintp unrollx)
 
 
 static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int type_num)
+count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize)
 {
     int idim;
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
@@ -2342,13 +2342,13 @@ count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *
 
     NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); 
 
-    if (type_num == NPY_INT16 || type_num == NPY_UINT16) {
+    if (elsize == 2) {
         _ITERATE_I16;
     }
-    else if (type_num == NPY_INT32 || type_num == NPY_UINT32) {
+    else if (elsize == 4) {
         _ITERATE_I32;
     }
-    else if (type_num == NPY_INT64 || type_num == NPY_UINT64) {
+    else if (elsize == 8) {
         _ITERATE_I64;
     }
 
@@ -2453,16 +2453,55 @@ PyArray_CountNonzero(PyArrayObject *self)
     /* Special low-overhead version specific to the boolean type */
     dtype = PyArray_DESCR(self);
 
-    if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
-        return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);
-    }
+    // if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
+    //     return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self),
+    //                     PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);
+    // }
+
+    // if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) {
+    //     return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
+    //                     PyArray_DIMS(self), PyArray_STRIDES(self));
+    // }
+
+    switch(dtype->kind) {
+        case 'u':
+        {
+            if (dtype->elsize == 1) 
+                return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+
+            if (dtype->elsize >=2 && dtype->elsize <= 8)
+                return count_nonzero_int(
+                    PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
+                    PyArray_STRIDES(self), dtype->elsize
+                );
 
-    if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) {
-        return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
+            break;
+        }
+        case 'i':
+        {
+            if (dtype->elsize == 1) 
+                return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
                         PyArray_DIMS(self), PyArray_STRIDES(self));
+
+            if (dtype->elsize >=2 && dtype->elsize <= 8)
+                return count_nonzero_int(
+                    PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
+                    PyArray_STRIDES(self), dtype->elsize
+                );
+            
+            break;
+        }
+        case 'b':
+        {
+           if (dtype->elsize == 1) 
+                return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
+                        PyArray_DIMS(self), PyArray_STRIDES(self));
+           
+        }
     }
 
+
     nonzero = PyArray_DESCR(self)->f->nonzero;
 
     /* If it's a trivial one-dimensional loop, don't use an iterator */

From 1eb91a33202416f582dbf389e44409290922734d Mon Sep 17 00:00:00 2001
From: Touqir Sajed <touqir@ualberta.ca>
Date: Mon, 8 Feb 2021 05:46:06 +0600
Subject: [PATCH 11/13] Ensured overflow does not happen for 16 and 32 bit ints

---
 numpy/core/src/multiarray/item_selection.c | 34 +++++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index c0eec4c84f66..9de3446352de 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2133,10 +2133,6 @@ count_nonzero_bytes_384(const npy_uint64 * w)
 
 #if NPY_SIMD
 
-/*
-
-*/
-
 /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
 static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
@@ -2209,23 +2205,37 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
 }
 
 
+#define safe_ptr_addition_uint16(result, ptr, adder) \
+    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; 
+
+#define safe_ptr_addition_uint32(result, ptr, adder) \
+    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; 
+
 static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
+    uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16;
     npy_uint16 *end = d + unrollx;
 
     const npyv_u16 vone = npyv_setall_u16(1); 
     const npyv_u16 vzero = npyv_zero_u16();   
 
+    npy_uint16 *target = d;
     while (d<end) {
         npyv_u16 vsum16 = npyv_zero_u16(); 
-        for (npy_intp i = 0; d < end && i < NPY_MAX_UINT16; ++i, d += npyv_nlanes_u16) {
+        safe_ptr_addition_uint16(target, target, innerloop_jump)
+        target = PyArray_MIN(target, end);
+        for (; d<target; d+=npyv_nlanes_u16) {
             npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
             vt = npyv_and_u16(vt, vone);
             vsum16 = npyv_add_u16(vsum16, vt);
         }
-        zero_count += npyv_sumup_u16(vsum16);
+
+        const npyv_u16 maskevn = npyv_reinterpret_u16_u32(npyv_setall_u32(0xffff));
+        npyv_u32 odd  = npyv_shri_u32(npyv_reinterpret_u32_u16(vsum16), 16);
+        npyv_u32 even = npyv_reinterpret_u32_u16(npyv_and_u16(vsum16, maskevn));
+        zero_count   += npyv_sum_u32(npyv_add_u32(odd, even));        
     }
 
     return unrollx - zero_count;
@@ -2236,19 +2246,27 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
 count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
 {
     npy_uintp zero_count = 0;
+    uint64_t innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32;
     npy_uint32 *end = d + unrollx;
 
     const npyv_u32 vone = npyv_setall_u32(1); 
     const npyv_u32 vzero = npyv_zero_u32();   
 
+    npy_uint32 *target = d;
     while (d<end) {
         npyv_u32 vsum32 = npyv_zero_u32(); 
-        for (npy_intp i = 0; d < end && i < NPY_MAX_UINT32; ++i, d += npyv_nlanes_u32) {
+        safe_ptr_addition_uint32(target, target, innerloop_jump)
+        target = PyArray_MIN(target, end);
+        for (; d<target; d+=npyv_nlanes_u32) {
             npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
             vt = npyv_and_u32(vt, vone);
             vsum32 = npyv_add_u32(vsum32, vt);
         }
-        zero_count += npyv_sum_u32(vsum32);    
+
+        const npyv_u32 maskevn = npyv_reinterpret_u32_u64(npyv_setall_u64(0xffffffffULL));
+        npyv_u64 odd  = npyv_shri_u64(npyv_reinterpret_u64_u32(vsum32), 32);
+        npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn));
+        zero_count   += npyv_sum_u64(npyv_add_u64(odd, even));        
     }
 
     return unrollx - zero_count;

From d20870266a0afb9ceb083d4e5804984182492cf9 Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Sat, 13 Feb 2021 05:46:28 +0200
Subject: [PATCH 12/13] cleanup

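---
Notes (review; ignored by git-am, not part of the commit message):
    * count_nonzero_u8: the added line `if (bstride == 1)) {` has an
      unbalanced `)` and does not compile as written; a follow-up must
      drop the extra parenthesis.
    * count_nonzero_u32: `NPY_MAX_UINT32*npyv_nlanes_u32` is presumably
      evaluated in 32-bit unsigned arithmetic before being widened to
      npy_uintp, so max_iter would wrap rather than scale with the lane
      count; confirm the intended bound.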
 numpy/core/src/multiarray/item_selection.c | 412 ++++++++-------------
 1 file changed, 148 insertions(+), 264 deletions(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 9de3446352de..2c57e5643440 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -30,8 +30,6 @@
 #include "array_coercion.h"
 #include "simd/simd.h"
 
-#include <stdlib.h>
-
 static NPY_GCC_OPT_3 NPY_INLINE int
 npy_fasttake_impl(
         char *dest, char *src, const npy_intp *indices,
@@ -2132,19 +2130,18 @@ count_nonzero_bytes_384(const npy_uint64 * w)
 }
 
 #if NPY_SIMD
-
 /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
 static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
 {
-    const npyv_u8 vone = npyv_setall_u8(1); 
-    const npyv_u8 vzero = npyv_zero_u8();   
+    const npyv_u8 vone = npyv_setall_u8(1);
+    const npyv_u8 vzero = npyv_zero_u8();
 
-    npy_intp lane_max = 0; 
-    npyv_u8 vsum8 = npyv_zero_u8(); 
+    npy_intp lane_max = 0;
+    npyv_u8 vsum8 = npyv_zero_u8();
     while (*d < end && lane_max <= max_count - 1) {
         // we count zeros because `cmpeq` cheaper than `cmpneq` for most archs
-        npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero)); 
+        npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
         vt = npyv_and_u8(vt, vone);
         vsum8 = npyv_add_u8(vsum8, vt);
         *d += npyv_nlanes_u8;
@@ -2157,8 +2154,8 @@ static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2
 count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
 {
     npyv_u16x2 vsum16;
-    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16(); 
-    npy_intp lane_max = 0; 
+    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16();
+    npy_intp lane_max = 0;
     while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) {
         npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
         npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
@@ -2168,18 +2165,18 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
     }
     return vsum16;
 }
-
+#endif // NPY_SIMD
 /*
  * Counts the number of non-zero values in a raw array.
  * The one loop process is shown below(take SSE2 with 128bits vector for example):
- *          |------------16 lanes---------|          
+ *          |------------16 lanes---------|
  *[vsum8]   255 255 255 ... 255 255 255 255 count_zero_bytes_u8: counting 255*16 elements
  *                          !!
- *           |------------8 lanes---------|          
+ *           |------------8 lanes---------|
  *[vsum16]   65535 65535 65535 ...   65535  count_zero_bytes_u16: counting (2*16-1)*16 elements
  *           65535 65535 65535 ...   65535
  *                          !!
- *           |------------4 lanes---------|          
+ *           |------------4 lanes---------|
  *[sum_32_0] 65535    65535   65535   65535  count_nonzero_bytes
  *           65535    65535   65535   65535
  *[sum_32_1] 65535    65535   65535   65535
@@ -2188,211 +2185,143 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
  *                     (2*16-1)*16
 */
 static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
+count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
 {
-    npy_intp zero_count = 0;
-    const npy_uint8 *end = d + unrollx;
-    while (d < end) {
-        npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16);
-        npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
-        npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
-        zero_count += npyv_sum_u32(npyv_add_u32(
-                npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
-                npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
-        ));
-    }
-    return unrollx - zero_count;
+    npy_intp count = 0;
+    if (bstride == 1)) {
+    #if NPY_SIMD
+        npy_uintp len_m = len & -npyv_nlanes_u8;
+        npy_uintp zcount = 0;
+        for (const char *end = data + len_m; data < end;) {
+            npyv_u16x2 vsum16 = count_zero_bytes_u16((const npy_uint8**)&data, (const npy_uint8*)end, NPY_MAX_UINT16);
+            npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
+            npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
+            zcount += npyv_sum_u32(npyv_add_u32(
+                    npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
+                    npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
+            ));
+        }
+        len  -= len_m;
+        count = len_m - zcount;
+    #else
+        if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) {
+            int step = 6 * sizeof(npy_uint64);
+            int left_bytes = len % step;
+            for (const char *end = data + len; data < end - left_bytes; data += step) {
+                 count += count_nonzero_bytes_384((const npy_uint64 *)data);
+            }
+            len = left_bytes;
+        }
+    #endif // NPY_SIMD
+    }
+    for (; len > 0; --len, data += bstride) {
+        count += (*data != 0);
+    }
+    return count;
 }
 
-
-#define safe_ptr_addition_uint16(result, ptr, adder) \
-    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 1)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint16 *) NPY_MAX_UINTP; 
-
-#define safe_ptr_addition_uint32(result, ptr, adder) \
-    result = ((((uint64_t) ptr) + (((uint64_t) adder) << 2)) == ((uint64_t) (ptr + adder))) ? (ptr+adder) : (npy_uint32 *) NPY_MAX_UINTP; 
-
-static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
-count_nonzero_int16_simd(npy_uint16 *d, npy_uintp unrollx)
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
 {
-    npy_uintp zero_count = 0;
-    uint64_t innerloop_jump = NPY_MAX_UINT16 * npyv_nlanes_u16;
-    npy_uint16 *end = d + unrollx;
-
-    const npyv_u16 vone = npyv_setall_u16(1); 
-    const npyv_u16 vzero = npyv_zero_u16();   
-
-    npy_uint16 *target = d;
-    while (d<end) {
-        npyv_u16 vsum16 = npyv_zero_u16(); 
-        safe_ptr_addition_uint16(target, target, innerloop_jump)
-        target = PyArray_MIN(target, end);
-        for (; d<target; d+=npyv_nlanes_u16) {
-            npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero)); 
-            vt = npyv_and_u16(vt, vone);
-            vsum16 = npyv_add_u16(vsum16, vt);
+    npy_intp count = 0;
+#if NPY_SIMD
+    if (bstride == sizeof(npy_uint16)) {
+        npy_uintp zcount = 0, len_m = len & -npyv_nlanes_u16;
+        const npyv_u16 vone  = npyv_setall_u16(1);
+        const npyv_u16 vzero = npyv_zero_u16();
+
+        for (npy_uintp lenx = len_m; lenx > 0;) {
+            npyv_u16 vsum16 = npyv_zero_u16();
+            npy_uintp max16 = PyArray_MIN(lenx, NPY_MAX_UINT16*npyv_nlanes_u16);
+
+            for (const char *end = data + max16*bstride; data < end; data += NPY_SIMD_WIDTH) {
+                npyv_u16 mask = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16((npy_uint16*)data), vzero));
+                         mask = npyv_and_u16(mask, vone);
+                       vsum16 = npyv_add_u16(vsum16, mask);
+            }
+            lenx   -= max16;
+            zcount += npyv_sumup_u16(vsum16);
         }
-
-        const npyv_u16 maskevn = npyv_reinterpret_u16_u32(npyv_setall_u32(0xffff));
-        npyv_u32 odd  = npyv_shri_u32(npyv_reinterpret_u32_u16(vsum16), 16);
-        npyv_u32 even = npyv_reinterpret_u32_u16(npyv_and_u16(vsum16, maskevn));
-        zero_count   += npyv_sum_u32(npyv_add_u32(odd, even));        
+        len  -= len_m;
+        count = len_m - zcount;
     }
-
-    return unrollx - zero_count;
+#endif
+    for (; len > 0; --len, data += bstride) {
+        count += (*(npy_uint16*)data != 0);
+    }
+    return count;
 }
 
-
-static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
-count_nonzero_int32_simd(npy_uint32 *d, npy_uintp unrollx)
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
 {
-    npy_uintp zero_count = 0;
-    uint64_t innerloop_jump = NPY_MAX_UINT32 * npyv_nlanes_u32;
-    npy_uint32 *end = d + unrollx;
-
-    const npyv_u32 vone = npyv_setall_u32(1); 
-    const npyv_u32 vzero = npyv_zero_u32();   
-
-    npy_uint32 *target = d;
-    while (d<end) {
-        npyv_u32 vsum32 = npyv_zero_u32(); 
-        safe_ptr_addition_uint32(target, target, innerloop_jump)
-        target = PyArray_MIN(target, end);
-        for (; d<target; d+=npyv_nlanes_u32) {
-            npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero)); 
-            vt = npyv_and_u32(vt, vone);
-            vsum32 = npyv_add_u32(vsum32, vt);
+    npy_intp count = 0;
+#if NPY_SIMD
+    if (bstride == sizeof(npy_uint32)) {
+        const npy_uintp max_iter = NPY_MAX_UINT32*npyv_nlanes_u32;
+        const npy_uintp len_m = (len > max_iter ? max_iter : len) & -npyv_nlanes_u32;
+        const npyv_u32 vone   = npyv_setall_u32(1);
+        const npyv_u32 vzero  = npyv_zero_u32();
+
+        npyv_u32 vsum32 = npyv_zero_u32();
+        for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) {
+            npyv_u32 mask = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32((npy_uint32*)data), vzero));
+                     mask = npyv_and_u32(mask, vone);
+                   vsum32 = npyv_add_u32(vsum32, mask);
         }
-
         const npyv_u32 maskevn = npyv_reinterpret_u32_u64(npyv_setall_u64(0xffffffffULL));
         npyv_u64 odd  = npyv_shri_u64(npyv_reinterpret_u64_u32(vsum32), 32);
         npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn));
-        zero_count   += npyv_sum_u64(npyv_add_u64(odd, even));        
+        count = len_m - npyv_sum_u64(npyv_add_u64(odd, even));
+        len  -= len_m;
     }
-
-    return unrollx - zero_count;
-}
-
-
-static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
-count_nonzero_int64_simd(npy_uint64 *d, npy_uintp unrollx)
-{
-    npy_uintp zero_count;
-    const npy_uint64 *end = d + unrollx;
-    const npyv_u64 vone = npyv_setall_u64(1); 
-    const npyv_u64 vzero = npyv_zero_u64();   
-    npyv_u64 vsum64 = npyv_zero_u64(); 
-
-    for (; d<end; d+=npyv_nlanes_u64) {
-        npyv_u64 vt = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64(d), vzero)); 
-        vt = npyv_and_u64(vt, vone);
-        vsum64 = npyv_add_u64(vsum64, vt);
+#endif
+    for (; len > 0; --len, data += bstride) {
+        count += (*(npy_uint32*)data != 0);
     }
-
-    zero_count = npyv_sum_u64(vsum64);
-
-    return unrollx - zero_count;
+    return count;
 }
 
-#endif
-
-
 static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize)
+count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
 {
-    int idim;
-    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
-    npy_intp coord[NPY_MAXDIMS];
     npy_intp count = 0;
-    NPY_BEGIN_THREADS_DEF;
-
-    /* Use raw iteration with no heap memory allocation */
-    if (PyArray_PrepareOneRawArrayIter(
-                    ndim, ashape,
-                    data, astrides,
-                    &ndim, shape,
-                    &data, strides) < 0) {
-        return -1;
-    }
-
-    /* Handle zero-sized array */
-    if (shape[0] == 0) {
-        return 0;
-    }
-
-
-#define _ITERATE_INT_SIMPLE(bits) \
-    npy_int##bits *d = (npy_int##bits *) data; \
-    NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
-        /* Process the innermost dimension */ \
-        for (npy_intp i = 0; i < shape[0]; ++i, d = (npy_int##bits *) (((npy_int8*) d) + strides[0])) { \
-            count += (*d != 0); \
-        } \
-        d = (npy_int##bits *) data; \
-    } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d, strides); 
-
-#define _ITERATE_INT(bits, bytes) \
-    if (strides[0] == bytes) { \
-        npy_int##bits *d2 = (npy_int##bits *) data; \
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
-            /* Process the innermost dimension */ \
-            npy_uint##bits *d = (npy_uint##bits *) data; \
-            const npy_uint##bits *e = ((npy_uint##bits *) data) + shape[0]; \
-            npy_uintp stride = shape[0] & -npyv_nlanes_u##bits; \
-            count += count_nonzero_int##bits##_simd(d, stride); \
-            d += stride; \
-            for (; d < e; ++d) { \
-                count += (*d != 0); \
-            } \
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d2, strides); \
-    } else { \
-        _ITERATE_INT_SIMPLE(bits) \
-    } 
-
 #if NPY_SIMD
-    #define _ITERATE_I16 _ITERATE_INT(16, 2)
-    #define _ITERATE_I32 _ITERATE_INT(32, 4)
-    #define _ITERATE_I64 _ITERATE_INT(64, 8)
-#else
-    #define _ITERATE_I16 _ITERATE_INT_SIMPLE(16)
-    #define _ITERATE_I32 _ITERATE_INT_SIMPLE(32)
-    #define _ITERATE_I64 _ITERATE_INT_SIMPLE(64)
-#endif
+    if (bstride == sizeof(npy_uint64)) {
+        const npy_uintp len_m = len & -npyv_nlanes_u64;
+        const npyv_u64 vone   = npyv_setall_u64(1);
+        const npyv_u64 vzero  = npyv_zero_u64();
 
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); 
-
-    if (elsize == 2) {
-        _ITERATE_I16;
-    }
-    else if (elsize == 4) {
-        _ITERATE_I32;
+        npyv_u64 vsum64 = npyv_zero_u64();
+        for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) {
+            npyv_u64 mask = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64((npy_uint64*)data), vzero));
+                     mask = npyv_and_u64(mask, vone);
+                   vsum64 = npyv_add_u64(vsum64, mask);
+        }
+        len  -= len_m;
+        count = len_m - npyv_sum_u64(vsum64);
     }
-    else if (elsize == 8) {
-        _ITERATE_I64;
+#endif
+    for (; len > 0; --len, data += bstride) {
+        count += (*(npy_uint64*)data != 0);
     }
-
-    NPY_END_THREADS; 
-
     return count;
 }
-
-
-
 /*
  * Counts the number of True values in a raw boolean array. This
  * is a low-overhead function which does no heap allocations.
  *
  * Returns -1 on error.
  */
-NPY_NO_EXPORT npy_intp
-count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
+static NPY_GCC_OPT_3 npy_intp
+count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize)
 {
+    assert(elsize <= 8);
     int idim;
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
-    npy_intp i, coord[NPY_MAXDIMS];
-    npy_intp count = 0;
-    NPY_BEGIN_THREADS_DEF;
+    npy_intp coord[NPY_MAXDIMS];
 
-    /* Use raw iteration with no heap memory allocation */
+    // Use raw iteration with no heap memory allocation
     if (PyArray_PrepareOneRawArrayIter(
                     ndim, ashape,
                     data, astrides,
@@ -2401,51 +2330,44 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
         return -1;
     }
 
-    /* Handle zero-sized array */
+    // Handle zero-sized array
     if (shape[0] == 0) {
         return 0;
     }
 
+    NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-    /* Special case for contiguous inner loop */
-    if (strides[0] == 1) {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            /* Process the innermost dimension */
-            const char *d = data;
-            const char *e = data + shape[0];
-#if NPY_SIMD
-            npy_uintp stride = shape[0] & -npyv_nlanes_u8;
-            count += count_nonzero_bytes((const npy_uint8 *)d, stride);
-            d += stride;
-#else
-            if (!NPY_ALIGNMENT_REQUIRED ||
-                    npy_is_aligned(d, sizeof(npy_uint64))) {
-                npy_uintp stride = 6 * sizeof(npy_uint64);
-                for (; d < e - (shape[0] % stride); d += stride) {
-                    count += count_nonzero_bytes_384((const npy_uint64 *)d);
-                }
-            }
-#endif
-            for (; d < e; ++d) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    }
-    /* General inner loop */
-    else {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            char *d = data;
-            /* Process the innermost dimension */
-            for (i = 0; i < shape[0]; ++i, d += strides[0]) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+
+    #define NONZERO_CASE(LEN, SFX) \
+        case LEN: \
+            NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
+                count += count_nonzero_##SFX(data, strides[0], shape[0]); \
+            } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); \
+            break
+
+    npy_intp count = 0;
+    switch(elsize) {
+        NONZERO_CASE(1, u8);
+        NONZERO_CASE(2, u16);
+        NONZERO_CASE(4, u32);
+        NONZERO_CASE(8, u64);
     }
+    #undef NONZERO_CASE
 
     NPY_END_THREADS;
-
     return count;
 }
+/*
+ * Counts the True (non-zero) bytes of a raw boolean array by forwarding to
+ * count_nonzero_int() with elsize 1; it performs no heap allocation itself.
+ *
+ * Returns -1 on error (propagated from count_nonzero_int()).
+ */
+NPY_NO_EXPORT NPY_GCC_OPT_3 npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
+{
+    return count_nonzero_int(ndim, data, ashape, astrides, 1); // elsize: one byte per element
+}
 
 /*NUMPY_API
  * Counts the number of non-zero elements in the array.
@@ -2468,60 +2390,22 @@ PyArray_CountNonzero(PyArrayObject *self)
     npy_intp *strideptr, *innersizeptr;
     NPY_BEGIN_THREADS_DEF;
 
-    /* Special low-overhead version specific to the boolean type */
+    // Special low-overhead version specific to the boolean/int types
     dtype = PyArray_DESCR(self);
-
-    // if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
-    //     return count_nonzero_int(PyArray_NDIM(self), (char *) PyArray_DATA(self),
-    //                     PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);
-    // }
-
-    // if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) {
-    //     return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
-    //                     PyArray_DIMS(self), PyArray_STRIDES(self));
-    // }
-
     switch(dtype->kind) {
         case 'u':
-        {
-            if (dtype->elsize == 1) 
-                return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
-
-            if (dtype->elsize >=2 && dtype->elsize <= 8)
-                return count_nonzero_int(
-                    PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
-                    PyArray_STRIDES(self), dtype->elsize
-                );
-
-            break;
-        }
         case 'i':
-        {
-            if (dtype->elsize == 1) 
-                return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
-
-            if (dtype->elsize >=2 && dtype->elsize <= 8)
-                return count_nonzero_int(
-                    PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
-                    PyArray_STRIDES(self), dtype->elsize
-                );
-            
-            break;
-        }
         case 'b':
-        {
-           if (dtype->elsize == 1) 
-                return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
-           
-        }
+            if (dtype->elsize > 8) {
+                break;
+            }
+            return count_nonzero_int(
+                PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
+                PyArray_STRIDES(self), dtype->elsize
+            );
     }
 
-
     nonzero = PyArray_DESCR(self)->f->nonzero;
-
     /* If it's a trivial one-dimensional loop, don't use an iterator */
     if (PyArray_TRIVIALLY_ITERABLE(self)) {
         needs_api = PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI);

From 85e2ce980fa4883c1add983be924d5e16d3723ec Mon Sep 17 00:00:00 2001
From: Sayed Adel <seiko@imavr.com>
Date: Sat, 13 Feb 2021 05:59:04 +0200
Subject: [PATCH 13/13] fix up

---
 numpy/core/src/multiarray/item_selection.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 2c57e5643440..fb354ce5473a 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2188,7 +2188,7 @@ static NPY_INLINE NPY_GCC_OPT_3 npy_intp
 count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
 {
     npy_intp count = 0;
-    if (bstride == 1)) {
+    if (bstride == 1) {
     #if NPY_SIMD
         npy_uintp len_m = len & -npyv_nlanes_u8;
         npy_uintp zcount = 0;