diff --git a/numpy/_core/src/multiarray/mapping.c b/numpy/_core/src/multiarray/mapping.c index f17e4ffa65c1..1861241a040e 100644 --- a/numpy/_core/src/multiarray/mapping.c +++ b/numpy/_core/src/multiarray/mapping.c @@ -1580,7 +1580,7 @@ array_subscript(PyArrayObject *self, PyObject *op) if (PyArray_GetDTypeTransferFunction(is_aligned, itemsize, itemsize, - PyArray_DESCR(self), PyArray_DESCR(self), + PyArray_DESCR(self), PyArray_DESCR((PyArrayObject *)result), 0, &cast_info, &transfer_flags) != NPY_SUCCEED) { goto finish; } diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c index 37b2f4860b1a..4946465617bc 100644 --- a/numpy/_core/src/multiarray/multiarraymodule.c +++ b/numpy/_core/src/multiarray/multiarraymodule.c @@ -3258,7 +3258,8 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) return NULL; } - NPY_cast_info cast_info = {.func = NULL}; + NPY_cast_info x_cast_info = {.func = NULL}; + NPY_cast_info y_cast_info = {.func = NULL}; ax = (PyArrayObject*)PyArray_FROM_O(x); if (ax == NULL) { @@ -3282,13 +3283,33 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) NPY_ITER_READONLY | NPY_ITER_ALIGNED, NPY_ITER_READONLY | NPY_ITER_ALIGNED }; + common_dt = PyArray_ResultType(2, &op_in[2], 0, NULL); if (common_dt == NULL) { goto fail; } + npy_intp itemsize = common_dt->elsize; + + // If x and y don't have references, we ask the iterator to create buffers + // using the common data type of x and y and then do fast trivial copies + // in the loop below. + // Otherwise trivial copies aren't possible and we handle the cast item by item + // in the loop. + PyArray_Descr *x_dt, *y_dt; + int trivial_copy_loop = !PyDataType_REFCHK(common_dt) && + ((itemsize == 16) || (itemsize == 8) || (itemsize == 4) || + (itemsize == 2) || (itemsize == 1)); + if (trivial_copy_loop) { + x_dt = common_dt; + y_dt = common_dt; + } + else { + x_dt = PyArray_DESCR(op_in[2]); + y_dt = PyArray_DESCR(op_in[3]); + } /* `PyArray_DescrFromType` cannot fail for simple builtin types: */ - PyArray_Descr * op_dt[4] = {common_dt, PyArray_DescrFromType(NPY_BOOL), - common_dt, common_dt}; + PyArray_Descr * op_dt[4] = {common_dt, PyArray_DescrFromType(NPY_BOOL), x_dt, y_dt}; + NpyIter * iter; NPY_BEGIN_THREADS_DEF; @@ -3302,26 +3323,27 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) /* Get the result from the iterator object array */ ret = (PyObject*)NpyIter_GetOperandArray(iter)[0]; - - npy_intp itemsize = common_dt->elsize; - - int has_ref = PyDataType_REFCHK(common_dt); + PyArray_Descr *ret_dt = PyArray_DESCR((PyArrayObject *)ret); NPY_ARRAYMETHOD_FLAGS transfer_flags = 0; - npy_intp transfer_strides[2] = {itemsize, itemsize}; + npy_intp x_strides[2] = {x_dt->elsize, itemsize}; + npy_intp y_strides[2] = {y_dt->elsize, itemsize}; npy_intp one = 1; - if (has_ref || ((itemsize != 16) && (itemsize != 8) && (itemsize != 4) && - (itemsize != 2) && (itemsize != 1))) { + if (!trivial_copy_loop) { // The iterator has NPY_ITER_ALIGNED flag so no need to check alignment // of the input arrays. - // - // There's also no need to set up a cast for y, since the iterator - // ensures both casts are identical. if (PyArray_GetDTypeTransferFunction( - 1, itemsize, itemsize, common_dt, common_dt, 0, - &cast_info, &transfer_flags) != NPY_SUCCEED) { + 1, x_strides[0], x_strides[1], + PyArray_DESCR(op_in[2]), ret_dt, 0, + &x_cast_info, &transfer_flags) != NPY_SUCCEED) { + goto fail; + } + if (PyArray_GetDTypeTransferFunction( + 1, y_strides[0], y_strides[1], + PyArray_DESCR(op_in[3]), ret_dt, 0, + &y_cast_info, &transfer_flags) != NPY_SUCCEED) { goto fail; } } @@ -3353,19 +3375,19 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) npy_intp ystride = strides[3]; /* constant sizes so compiler replaces memcpy */ - if (!has_ref && itemsize == 16) { + if (trivial_copy_loop && itemsize == 16) { INNER_WHERE_LOOP(16); } - else if (!has_ref && itemsize == 8) { + else if (trivial_copy_loop && itemsize == 8) { INNER_WHERE_LOOP(8); } - else if (!has_ref && itemsize == 4) { + else if (trivial_copy_loop && itemsize == 4) { INNER_WHERE_LOOP(4); } - else if (!has_ref && itemsize == 2) { + else if (trivial_copy_loop && itemsize == 2) { INNER_WHERE_LOOP(2); } - else if (!has_ref && itemsize == 1) { + else if (trivial_copy_loop && itemsize == 1) { INNER_WHERE_LOOP(1); } else { @@ -3374,18 +3396,18 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) if (*csrc) { char *args[2] = {xsrc, dst}; - if (cast_info.func( - &cast_info.context, args, &one, - transfer_strides, cast_info.auxdata) < 0) { + if (x_cast_info.func( + &x_cast_info.context, args, &one, + x_strides, x_cast_info.auxdata) < 0) { goto fail; } } else { char *args[2] = {ysrc, dst}; - if (cast_info.func( - &cast_info.context, args, &one, - transfer_strides, cast_info.auxdata) < 0) { + if (y_cast_info.func( + &y_cast_info.context, args, &one, + y_strides, y_cast_info.auxdata) < 0) { goto fail; } } @@ -3405,7 +3427,8 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) Py_DECREF(ax); Py_DECREF(ay); Py_DECREF(common_dt); - NPY_cast_info_xfree(&cast_info); + NPY_cast_info_xfree(&x_cast_info); + NPY_cast_info_xfree(&y_cast_info); if (NpyIter_Deallocate(iter) != NPY_SUCCEED) { Py_DECREF(ret); @@ -3419,7 +3442,8 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) Py_XDECREF(ax); Py_XDECREF(ay); Py_XDECREF(common_dt); - NPY_cast_info_xfree(&cast_info); + NPY_cast_info_xfree(&x_cast_info); + NPY_cast_info_xfree(&y_cast_info); return NULL; } diff --git a/numpy/_core/src/multiarray/stringdtype/casts.c b/numpy/_core/src/multiarray/stringdtype/casts.c index b896d09ace65..42c588199890 100644 --- a/numpy/_core/src/multiarray/stringdtype/casts.c +++ b/numpy/_core/src/multiarray/stringdtype/casts.c @@ -395,6 +395,7 @@ string_to_bool(PyArrayMethod_Context *context, char *const data[], npy_string_allocator *allocator = NpyString_acquire_allocator(descr); int has_null = descr->na_object != NULL; int has_string_na = descr->has_string_na; + int has_nan_na = descr->has_nan_na; const npy_static_string *default_string = &descr->default_string; npy_intp N = dimensions[0]; @@ -415,8 +416,13 @@ string_to_bool(PyArrayMethod_Context *context, char *const data[], } else if (is_null) { if (has_null && !has_string_na) { - // numpy treats NaN as truthy, following python - *out = NPY_TRUE; + if (has_nan_na) { + // numpy treats NaN as truthy, following python + *out = NPY_TRUE; + } + else { + *out = NPY_FALSE; + } } else { *out = (npy_bool)(default_string->size == 0); diff --git a/numpy/_core/src/multiarray/stringdtype/dtype.c b/numpy/_core/src/multiarray/stringdtype/dtype.c index 6e12084a9707..bcaeaa5be5f8 100644 --- a/numpy/_core/src/multiarray/stringdtype/dtype.c +++ b/numpy/_core/src/multiarray/stringdtype/dtype.c @@ -415,8 +415,23 @@ stringdtype_getitem(PyArray_StringDTypeObject *descr, char **dataptr) // PyArray_NonzeroFunc // Unicode strings are nonzero if their length is nonzero. npy_bool -nonzero(void *data, void *NPY_UNUSED(arr)) +nonzero(void *data, void *arr) { + PyArray_StringDTypeObject *descr = (PyArray_StringDTypeObject *)PyArray_DESCR(arr); + int has_null = descr->na_object != NULL; + int has_nan_na = descr->has_nan_na; + int has_string_na = descr->has_string_na; + if (has_null && NpyString_isnull((npy_packed_static_string *)data)) { + if (!has_string_na) { + if (has_nan_na) { + // numpy treats NaN as truthy, following python + return 1; + } + else { + return 0; + } + } + } return NpyString_size((npy_packed_static_string *)data) != 0; } diff --git a/numpy/_core/tests/test_stringdtype.py b/numpy/_core/tests/test_stringdtype.py index b5d91d402a10..9c0324c91f71 100644 --- a/numpy/_core/tests/test_stringdtype.py +++ b/numpy/_core/tests/test_stringdtype.py @@ -40,8 +40,7 @@ def na_object(request): return request.param -@pytest.fixture() -def dtype(na_object, coerce): +def get_dtype(na_object, coerce=True): # explicit is check for pd_NA because != with pd_NA returns pd_NA if na_object is pd_NA or na_object != "unset": return StringDType(na_object=na_object, coerce=coerce) @@ -49,6 +48,11 @@ def dtype(na_object, coerce): return StringDType(coerce=coerce) +@pytest.fixture() +def dtype(na_object, coerce): + return get_dtype(na_object, coerce) + + # second copy for cast tests to do a cartesian product over dtypes @pytest.fixture(params=[True, False]) def coerce2(request): @@ -456,11 +460,41 @@ def test_sort(strings, arr_sorted): ["", "a", "😸", "ááðfáíóåéë"], ], ) -def test_nonzero(strings): - arr = np.array(strings, dtype="T") - is_nonzero = np.array([i for i, item in enumerate(arr) if len(item) != 0]) +def test_nonzero(strings, na_object): + dtype = get_dtype(na_object) + arr = np.array(strings, dtype=dtype) + is_nonzero = np.array( + [i for i, item in enumerate(strings) if len(item) != 0]) assert_array_equal(arr.nonzero()[0], is_nonzero) + if na_object is not pd_NA and na_object == 'unset': + return + + strings_with_na = np.array(strings + [na_object], dtype=dtype) + is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0] + + if is_nan: + assert strings_with_na.nonzero()[0][-1] == 4 + else: + assert strings_with_na.nonzero()[0][-1] == 3 + + # check that the casting to bool and nonzero give consistent results + assert_array_equal(strings_with_na[strings_with_na.nonzero()], + strings_with_na[strings_with_na.astype(bool)]) + + +def test_where(string_list, na_object): + dtype = get_dtype(na_object) + a = np.array(string_list, dtype=dtype) + b = a[::-1] + res = np.where([True, False, True, False, True, False], a, b) + assert_array_equal(res, [a[0], b[1], a[2], b[3], a[4], b[5]]) + + +def test_fancy_indexing(string_list): + sarr = np.array(string_list, dtype="T") + assert_array_equal(sarr, sarr[np.arange(sarr.shape[0])]) + def test_creation_functions(): assert_array_equal(np.zeros(3, dtype="T"), ["", "", ""])