Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

ENH: Improve performance of numpy.nonzero for 1D/2D contiguous arrays #27519

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: main
Choose a base branch
Loading
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
review comments
  • Loading branch information
eendebakpt committed Feb 19, 2025
commit 53a9ef386d6f00230316821d1412fb151e288cd9
2 changes: 1 addition & 1 deletion 2 numpy/_core/src/common/lowlevel_strided_loops.h
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr
stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
}

NPY_NO_EXPORT npy_bool nonzero_idxs_dispatcher(void * data, npy_intp* idxs, int dim, const npy_intp* shape,
NPY_NO_EXPORT npy_bool nonzero_idxs_dispatcher(void const* data, npy_intp* idxs, int dim, const npy_intp* shape,
const npy_intp* strides, int dtype, npy_intp nonzero_count);


Expand Down
19 changes: 11 additions & 8 deletions 19 numpy/_core/src/multiarray/item_selection.c
Original file line number Diff line number Diff line change
Expand Up @@ -2801,7 +2801,9 @@ PyArray_CountNonzero(PyArrayObject *self)
return nonzero_count;
}

static inline void nonzero_idxs_1D_bool(npy_intp count, npy_intp nonzero_count, char *data, npy_intp stride, npy_intp* multi_index)
static inline void
nonzero_idxs_1D_bool(npy_intp count, npy_intp nonzero_count, char *data,
npy_intp stride, npy_intp* multi_index)
{
/*
* use fast memchr variant for sparse data, see gh-4370
Expand Down Expand Up @@ -2910,14 +2912,11 @@ PyArray_Nonzero(PyArrayObject *self)
goto finish;
}

PyArrayObject* original_array = self;
if (PyArray_BASE(self) != NULL) {
original_array = (PyArrayObject*) PyArray_BASE(self);
}
int bytes_not_swapped = PyArray_ISNOTSWAPPED(self) && PyArray_ISNOTSWAPPED(original_array);
int bytes_not_swapped = PyArray_ISNOTSWAPPED(self);

// do not add ndim=1, dtype->kind == 'b', since we have a separate fast path for it
int optimized_count = PyArray_TRIVIALLY_ITERABLE(self) && ! (ndim == 1 && dtype->kind=='b') && bytes_not_swapped;
int optimized_count = PyArray_TRIVIALLY_ITERABLE(self) &&
!(ndim == 1 && dtype->kind=='b') && bytes_not_swapped;
if (optimized_count ) {
npy_intp * multi_index = (npy_intp *)PyArray_DATA(ret);
char * data = PyArray_BYTES(self);
Expand All @@ -2926,9 +2925,14 @@ PyArray_Nonzero(PyArrayObject *self)
const npy_intp* M_strides = PyArray_STRIDES(self);
int M_type_num = dtype->type_num;

NPY_BEGIN_THREADS_DEF;
if (!needs_api) {
NPY_BEGIN_THREADS_THRESHOLDED(PyArray_DIM(self, 0));
}
bool executed = nonzero_idxs_dispatcher((void*)data, multi_index, M_dim,
M_shape, M_strides, M_type_num, nonzero_count);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This branch is missing the thresholded GIL release (internals of it I guess).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Releasing the GIL (or running the cpython free-threading build) could result in a buffer overflow if another thread changes the number of non-zero entries in the array. I added a guard for that, but we should rerun the benchmarks (I still expect a good speedup though).

Should a test be added for this? (probably here: https://github.com/numpy/numpy/blob/main/numpy/_core/tests/test_multithreading.py)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, a case where we would actually want to lock down a whole array (but I think this is super hard with buffer exchange, so in practice...).

Would be very cool if you add a test. Larger use of free-threading is in the future after-all, and maybe not a too distant one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests have been added in #28361


NPY_END_THREADS;
if (executed) {
added_count = nonzero_count;
goto finish;
Expand All @@ -2942,7 +2946,6 @@ PyArray_Nonzero(PyArrayObject *self)
npy_intp stride = PyArray_STRIDE(self, 0);
npy_intp count = PyArray_DIM(self, 0);
NPY_BEGIN_THREADS_DEF;

if (!needs_api) {
NPY_BEGIN_THREADS_THRESHOLDED(count);
}
Expand Down
9 changes: 3 additions & 6 deletions 9 numpy/_core/src/multiarray/lowlevel_strided_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1887,7 +1887,7 @@ mapiter_@name@(
* #name = bool, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64#
*/

static inline void nonzero_idxs_trivial_@name@(@dtype@ * data, npy_intp* idxs, const npy_intp* shape, const npy_intp* strides, npy_intp nonzero_count) {
static inline void nonzero_idxs_trivial_@name@(@dtype@ const* data, npy_intp* idxs, const npy_intp* shape, const npy_intp* strides, npy_intp nonzero_count) {
npy_intp stride = strides[0];
npy_intp added_count = 0;
npy_intp a = 0;
Expand All @@ -1903,7 +1903,7 @@ static inline void nonzero_idxs_trivial_@name@(@dtype@ * data, npy_intp* idxs, c
}
}

static inline void nonzero_idxs_2D_@name@(@dtype@ * data, npy_intp* idxs, const npy_intp* shape, const npy_intp* strides, npy_intp nonzero_count) {
static inline void nonzero_idxs_2D_@name@(@dtype@ const* data, npy_intp* idxs, const npy_intp* shape, const npy_intp* strides, npy_intp nonzero_count) {
npy_intp idxs_stride = 2;
npy_intp size_1 = shape[1];
npy_intp stride_1 = strides[1];
Expand Down Expand Up @@ -1932,9 +1932,8 @@ static inline void nonzero_idxs_2D_@name@(@dtype@ * data, npy_intp* idxs, const

/**end repeat**/

npy_bool nonzero_idxs_dispatcher(void * data, npy_intp* idxs, int ndim, const npy_intp* shape, const npy_intp* strides, int dtype, npy_intp nonzero_count)
npy_bool nonzero_idxs_dispatcher(void const * data, npy_intp* idxs, int ndim, const npy_intp* shape, const npy_intp* strides, int dtype, npy_intp nonzero_count)
{

if (ndim==1) {
switch(dtype) {
/**begin repeat
Expand All @@ -1943,7 +1942,6 @@ npy_bool nonzero_idxs_dispatcher(void * data, npy_intp* idxs, int ndim, const np
* #dtypeID = NPY_BOOL, NPY_UINT8, NPY_INT8, NPY_UINT16, NPY_INT16, NPY_UINT32, NPY_INT32, NPY_UINT64, NPY_INT64, NPY_FLOAT32, NPY_FLOAT64#
* #name = bool, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64#
*/

case @dtypeID@:
{
@dtype@ * data_ptr = (@dtype@ *) data;
Expand All @@ -1962,7 +1960,6 @@ npy_bool nonzero_idxs_dispatcher(void * data, npy_intp* idxs, int ndim, const np
* #dtypeID = NPY_BOOL, NPY_UINT8, NPY_INT8, NPY_UINT16, NPY_INT16, NPY_UINT32, NPY_INT32, NPY_UINT64, NPY_INT64, NPY_FLOAT32, NPY_FLOAT64#
* #name = bool, u8, i8, u16, i16, u32, i32, u64, i64, f32, f64#
*/

case @dtypeID@:
{
@dtype@ * data_ptr = (@dtype@ *) data;
Expand Down
11 changes: 11 additions & 0 deletions 11 numpy/_core/tests/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -1956,6 +1956,17 @@ def __bool__(self):
a = np.array([[ThrowsAfter(15)]] * 10)
assert_raises(ValueError, np.nonzero, a)

def test_nonzero_byteorder(self):
values = [np.array([0., -0., 1, float('nan')]), np.array([0, 1]),
np.array([0, 12.3], dtype=np.float16)]
expected = [[2, 3], [1], [1]]

for A, expected in zip(values, expected):
A_byteswapped = (A.view(A.dtype.newbyteorder()).byteswap()).copy()

assert_equal(np.nonzero(A)[0], expected)
assert_equal(np.nonzero(A_byteswapped)[0], expected)


class TestIndex:
def test_boolean(self):
Expand Down
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.