From 5ffae65f6f44a104a2061c6cdeb6475f904c4d23 Mon Sep 17 00:00:00 2001 From: warren Date: Sun, 19 Jun 2022 20:47:53 -0400 Subject: [PATCH 01/14] ENH: lib: Allow usecols to be a callable in loadtxt(). --- numpy/core/setup.py | 1 + .../src/multiarray/textreading/readtext.c | 51 ++++++------- numpy/core/src/multiarray/textreading/rows.c | 75 ++++++++++++++++++- numpy/core/src/multiarray/textreading/rows.h | 3 +- .../textreading/seq_to_ssize_c_array.c | 52 +++++++++++++ .../textreading/seq_to_ssize_c_array.h | 6 ++ numpy/lib/npyio.py | 34 ++++++++- numpy/lib/tests/test_loadtxt.py | 72 ++++++++++++++++-- 8 files changed, 250 insertions(+), 44 deletions(-) create mode 100644 numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c create mode 100644 numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 543b6ae39650..c4341cc91303 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1017,6 +1017,7 @@ def opts_if_msvc(build_cmd): join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), join('src', 'multiarray', 'textreading', 'str_to_int.c'), join('src', 'multiarray', 'textreading', 'tokenize.cpp'), + join('src', 'multiarray', 'textreading', 'seq_to_ssize_c_array.c'), ] ####################################################################### diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index a5db1cb7761a..8c78cfead2a0 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -11,6 +11,7 @@ #include "common.h" #include "conversion_utils.h" +#include "textreading/seq_to_ssize_c_array.h" #include "textreading/parser_config.h" #include "textreading/stream_pyobject.h" #include "textreading/field_types.h" @@ -19,8 +20,8 @@ // -// `usecols` must point to a Python object that is Py_None or a 1-d contiguous -// numpy array with data type int32. +// If the argument `usecols_obj` is not Py_None, it must be a callable. +// In that case, the argument `usecols` must be NULL. // // `dtype` must point to a Python object that is Py_None or a numpy dtype // instance. If the latter, code and sizes must be arrays of length @@ -37,6 +38,7 @@ static PyObject * _readtext_from_stream(stream *s, parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], + PyObject *usecols_obj, Py_ssize_t skiplines, Py_ssize_t max_rows, PyObject *converters, PyObject *dtype) { @@ -69,7 +71,8 @@ _readtext_from_stream(stream *s, arr = read_rows( s, max_rows, num_fields, ft, pc, - num_usecols, usecols, skiplines, converters, + num_usecols, usecols, usecols_obj, + skiplines, converters, NULL, out_dtype, homogeneous); if (arr == NULL) { goto finish; @@ -263,38 +266,25 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), */ Py_ssize_t num_usecols = -1; Py_ssize_t *usecols = NULL; - if (usecols_obj != Py_None) { + if (usecols_obj != Py_None && !PyCallable_Check(usecols_obj)) { num_usecols = PySequence_Length(usecols_obj); if (num_usecols < 0) { return NULL; } - /* Calloc just to not worry about overflow */ - usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); + usecols = seq_to_ssize_c_array(num_usecols, usecols_obj, + "usecols must be an int or a sequence of ints but " + "it contains at least one element of type '%s'"); if (usecols == NULL) { - PyErr_NoMemory(); return NULL; } - for (Py_ssize_t i = 0; i < num_usecols; i++) { - PyObject *tmp = PySequence_GetItem(usecols_obj, i); - if (tmp == NULL) { - PyMem_FREE(usecols); - return NULL; - } - usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); - if (error_converting(usecols[i])) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) { - PyErr_Format(PyExc_TypeError, - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type '%s'", - Py_TYPE(tmp)->tp_name); - } - Py_DECREF(tmp); - PyMem_FREE(usecols); - return NULL; - } - Py_DECREF(tmp); - } + /* + * The given usecols_obj is a Python sequence; it has been processed to + * give the usecols array, so reset the Python object to None. + */ + usecols_obj = Py_None; } + assert(usecols == NULL || usecols_obj == Py_None); + /* At this point, if usecols_obj is not Py_None, it must be a callable object. */ stream *s; if (filelike) { @@ -304,14 +294,15 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), s = stream_python_iterable(file, encoding); } if (s == NULL) { - PyMem_FREE(usecols); + PyMem_Free(usecols); return NULL; } arr = _readtext_from_stream( - s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype); + s, &pc, num_usecols, usecols, usecols_obj, skiplines, max_rows, + converters, dtype); stream_close(s); - PyMem_FREE(usecols); + PyMem_Free(usecols); return arr; } diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index a72fb79d93cc..449d85da5c93 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -11,6 +11,7 @@ #include #include +#include "textreading/seq_to_ssize_c_array.h" #include "textreading/stream.h" #include "textreading/tokenize.h" #include "textreading/conversions.h" @@ -134,6 +135,11 @@ create_conv_funcs( * @param usecols An array of length `num_usecols` or NULL. If given indicates * which column is read for each individual row (negative columns are * accepted). + * @param usecols_obj Either `Py_None` or a callable Python object. If + * callable, the function must accept a single integer argument (the + * number of columns in the file), and return a sequence of integers + * (the sequence of column indices to use). When `usecols_obj` is not + * `Py_None`, `usecols` MUST be NULL. * @param skiplines The number of lines to skip, these lines are ignored. * @param converters Python dictionary of converters. Finalizing converters * is difficult without information about the number of columns. @@ -155,7 +161,8 @@ create_conv_funcs( NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types, - parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, + parser_config *pconfig, + Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj, Py_ssize_t skiplines, PyObject *converters, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous) @@ -179,6 +186,10 @@ read_rows(stream *s, /* We give a warning if max_rows is used and an empty line is encountered */ bool give_empty_row_warning = max_rows >= 0; + // If the caller passes in a callable for usecols_obj, then they must + // also pass in NULL for usecols. + assert(usecols == NULL || usecols_obj == Py_None); + int ts_result = 0; tokenizer_state ts; if (tokenizer_init(&ts, pconfig) < 0) { @@ -246,6 +257,56 @@ read_rows(stream *s, // We've deferred some of the initialization tasks to here, // because we've now read the first line, and we definitively // know how many fields (i.e. columns) we will be processing. + + if (usecols_obj != Py_None) { + // Call the Python function provided by the caller to + // create the usecols array. + PyObject *seq = PyObject_CallFunction(usecols_obj, "n", + current_num_fields); + if (PyErr_Occurred()) { + // User-provided function failed. + goto error; + } + // The user-defined usecols function must return a sequence + // of integers. + num_usecols = PySequence_Length(seq); + if (num_usecols == -1) { + // User-provided function did not return a sequence. + PyErr_Clear(); + PyErr_Format(PyExc_TypeError, + "the user-provided callable usecols must return a " + "sequence of ints, but it returned an instance of " + "type '%s'", Py_TYPE(seq)->tp_name); + Py_DECREF(seq); + goto error; + } + + if (!homogeneous && num_field_types != num_usecols) { + // A structured dtype was provided, and the length of + // the sequence returned by the user-provided function + // does not have the same length as the number of fields + // in the dtype. + Py_DECREF(seq); + PyErr_Format(PyExc_RuntimeError, + "length of the sequence returned by the callable " + "usecols (%d) does not equal the number of fields " + "in the given dtype (%d)", + num_usecols, num_field_types); + goto error; + } + + // Convert the sequence to a C array of Py_ssize_t ints. + usecols = seq_to_ssize_c_array(num_usecols, seq, + "the user-provided callable usecols must return a " + "sequence of ints, but it returned a sequence " + "containing at least one occurrence of type '%s'"); + Py_DECREF(seq); + if (usecols == NULL) { + goto error; + } + actual_num_fields = num_usecols; + } + if (actual_num_fields == -1) { actual_num_fields = current_num_fields; } @@ -431,6 +492,12 @@ read_rows(stream *s, data_ptr += row_size; } + if (usecols_obj != Py_None) { + // This function owns usecols if a callable usecols_obj was given. + PyMem_Free(usecols); + usecols = NULL; // An overabundance of caution... + } + tokenizer_clear(&ts); if (conv_funcs != NULL) { for (Py_ssize_t i = 0; i < actual_num_fields; i++) { @@ -479,6 +546,12 @@ read_rows(stream *s, return data_array; error: + if (usecols_obj != Py_None) { + // If the error occurred early enough in the function, we might + // not have allocated usecols yet, but that's OK, because we know + // that usecols is NULL in that case. + PyMem_Free(usecols); + } if (conv_funcs != NULL) { for (Py_ssize_t i = 0; i < actual_num_fields; i++) { Py_XDECREF(conv_funcs[i]); diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h index 20eb9e186a19..1a707a2d3918 100644 --- a/numpy/core/src/multiarray/textreading/rows.h +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -14,7 +14,8 @@ NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types, - parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, + parser_config *pconfig, + Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj, Py_ssize_t skiplines, PyObject *converters, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous); diff --git a/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c b/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c new file mode 100644 index 000000000000..67a2effef45a --- /dev/null +++ b/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c @@ -0,0 +1,52 @@ +#define PY_SSIZE_T_CLEAN +#include + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" // For NPY_NO_EXPORT + +// +// Convert a Python sequence to a C array of Py_ssize_t integers. +// +// `seq` must be a Python sequence of length `len`. It is assumed that the +// caller has already checked the length of the sequence, so the length is an +// argument instead of being inferred from `seq` itself. +// +// `errtext` must be provided, and it must contain one occurrence of the +// format code sequence '%s'. This text is used as the text of the TypeError +// that is raised when an element of `seq` is found that cannot be converted +// to an a Py_ssize_t integer. The '%s' format code will be replaced with +// the type of the object that failed the conversion. +// +// Returns NULL with an exception set if the conversion fails. +// +// The memory for the array is allocated with PyMem_Calloc. +// The caller must free the memory with PyMem_FREE or PyMem_Free. +// +NPY_NO_EXPORT Py_ssize_t * +seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *errtext) +{ + Py_ssize_t *arr = PyMem_Calloc(len, sizeof(Py_ssize_t)); + if (arr == NULL) { + PyErr_NoMemory(); + return NULL; + } + for (Py_ssize_t i = 0; i < len; ++i) { + PyObject *tmp = PySequence_GetItem(seq, i); + if (tmp == NULL) { + PyMem_Free(arr); + return NULL; + } + arr[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); + if (arr[i] == -1 && PyErr_Occurred()) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, errtext, Py_TYPE(tmp)->tp_name); + } + Py_DECREF(tmp); + PyMem_Free(arr); + return NULL; + } + Py_DECREF(tmp); + } + return arr; +} diff --git a/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h b/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h new file mode 100644 index 000000000000..a58d7a01a93c --- /dev/null +++ b/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h @@ -0,0 +1,6 @@ +#ifndef SEQ_TO_SSIZE_C_ARRAY_H +#define SEQ_TO_SSIZE_C_ARRAY_H + +Py_ssize_t *seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *initialtext); + +#endif diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 210c0ea94a7e..f2240cbd5a11 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -888,9 +888,10 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', read_dtype_via_object_chunks = dtype dtype = np.dtype(object) - if usecols is not None: - # Allow usecols to be a single int or a sequence of ints, the C-code - # handles the rest + if usecols is not None and not callable(usecols): + # If usecols is not callable, it must be an int or a sequence of ints. + # Process usecols so that when it is not a callable, it is a list; the + # C code will handle the rest of the validation. try: usecols = list(usecols) except TypeError: @@ -1099,15 +1100,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Default: None. skiprows : int, optional Skip the first `skiprows` lines, including comments; default: 0. - usecols : int or sequence, optional + usecols : int, sequence of ints, or callable, optional Which columns to read, with 0 being the first. For example, ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. The default, None, results in all columns being read. + `usecols` may be a callable function that accepts a single integer + argument that gives the number of columns in the file. The callable + must return a sequence of integers that indicate which columns to + include in the output array. + .. versionchanged:: 1.11.0 When a single column has to be read it is possible to use an integer instead of a tuple. E.g ``usecols = 3`` reads the fourth column the same way as ``usecols = (3,)`` would. + + .. versionchanged:: 1.24.0 + Added the option for `usecols` to be a callable. + unpack : bool, optional If True, the returned array is transposed, so that arguments may be unpacked using ``x, y, z = loadtxt(...)``. When used with a @@ -1272,6 +1282,22 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"') array('Hello, my name is "Monty"!', dtype='>> s1 = StringIO('10.25 ABC 1.5 3.5\n12.50 XYZ 2.5 8.0') + >>> np.loadtxt(s1, usecols=lambda n: [0] + list(range(2, n))) + array([[10.25, 1.5 , 3.5 ], + [12.5 , 2.5 , 8. ]]) + + The caller does not have to know the number of columns in advance. + The same example works with this input, which has more columns. + + >>> s2 = StringIO('10.25 ABC 1.5 3.5 5.5\n12.50 XYZ 2.5 8.0 9.5') + >>> np.loadtxt(s2, usecols=lambda n: [0] + list(range(2, n))) + array([[10.25, 1.5 , 3.5 , 5.5 ], + [12.5 , 2.5 , 8. , 9.5 ]]) + """ if like is not None: diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 0b8fe3c479c6..042f8a44ef6c 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -225,7 +225,8 @@ def test_converters_negative_indices(): assert_equal(res, expected) -def test_converters_negative_indices_with_usecols(): +@pytest.mark.parametrize('usecols', [[0, -1], lambda n: [0, -1]]) +def test_converters_negative_indices_with_usecols(usecols): txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) @@ -234,37 +235,88 @@ def test_converters_negative_indices_with_usecols(): dtype=np.float64, delimiter=",", converters=conv, - usecols=[0, -1], + usecols=usecols, encoding=None, ) assert_equal(res, expected) # Second test with variable number of rows: res = np.loadtxt(StringIO('''0,1,2\n0,1,2,3,4'''), delimiter=",", - usecols=[0, -1], converters={-1: (lambda x: -1)}) + usecols=usecols, converters={-1: (lambda x: -1)}) assert_array_equal(res, [[0, -1], [0, -1]]) -def test_ragged_usecols(): + +@pytest.mark.parametrize('usecols', [[0, -2], lambda n: [0, -2]]) +def test_ragged_usecols(usecols): # usecols, and negative ones, work even with varying number of columns. txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") expected = np.array([[0, 0], [0, 0], [0, 0]]) - res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=usecols) assert_equal(res, expected) txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n") with pytest.raises(ValueError, match="invalid column index -2 at row 2 with 1 columns"): # There is no -2 column in the second row: - np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + np.loadtxt(txt, dtype=float, delimiter=",", usecols=usecols) -def test_empty_usecols(): +@pytest.mark.parametrize('usecols', [[], lambda n: []]) +def test_empty_usecols(usecols): txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") - res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) + res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=usecols) assert res.shape == (3,) assert res.dtype == np.dtype([]) +def test_callable_usecols_basic(): + txt = StringIO('1 2 3 4 5\n6 7 8 9 1\n2 3 4 5 6\n') + res = np.loadtxt(txt, dtype=int, usecols=lambda n: list(range(n)[::2])) + expected = np.array([[1, 3, 5], [6, 8, 1], [2, 4, 6]]) + assert_equal(res, expected) + + +def test_callable_usecols_method(): + + class Foo: + + def usecols(self, n): + return list(range(n)[1::2]) + + def load(self): + return np.loadtxt(StringIO('1 2 3 4\n5 6 7 8'), + usecols=self.usecols) + + res = Foo().load() + expected = np.array([[2.0, 4.0], [6.0, 8.0]]) + assert_equal(res, expected) + + +def test_callable_usecols_raises_exception(): + with pytest.raises(ZeroDivisionError, match='division by zero'): + np.loadtxt(StringIO('1 2\n4 5'), usecols=lambda n: 1/0) + + +def test_callable_usecols_not_a_sequence(): + with pytest.raises(TypeError, + match="returned an instance of type 'module'"): + np.loadtxt(StringIO('1 2\n4 5'), usecols=lambda n: np) + + +def test_callable_usecols_bad_element_type(): + with pytest.raises(TypeError, + match="at least one occurrence of type 'module'"): + np.loadtxt(StringIO('1 2\n3 4'), usecols=lambda n: [np]) + + +def test_callable_usecols_bad_len_with_structured_dtype(): + dt = np.dtype([('x', float), ('y', float)]) + txt = StringIO('1 2 3 4\n5 6 7 8\n') + with pytest.raises(RuntimeError, + match='the number of fields in the given dtype'): + np.loadtxt(txt, usecols=lambda n: [0, 2, 3], dtype=dt) + + @pytest.mark.parametrize("c1", ["a", "の", "🫕"]) @pytest.mark.parametrize("c2", ["a", "の", "🫕"]) def test_large_unicode_characters(c1, c2): @@ -415,6 +467,7 @@ def test_implicit_cast_float_to_int_fails(dtype): with pytest.raises(ValueError): np.loadtxt(txt, dtype=dtype, delimiter=",") + @pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) @pytest.mark.parametrize("with_parens", (False, True)) def test_complex_parsing(dtype, with_parens): @@ -610,6 +663,7 @@ def test_quoted_field_is_not_empty(): res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') assert_equal(res, expected) + def test_quoted_field_is_not_empty_nonstrict(): # Same as test_quoted_field_is_not_empty but check that we are not strict # about missing closing quote (this is the `csv.reader` default also) @@ -618,6 +672,7 @@ def test_quoted_field_is_not_empty_nonstrict(): res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') assert_equal(res, expected) + def test_consecutive_quotechar_escaped(): txt = StringIO('"Hello, my name is ""Monty""!"') expected = np.array('Hello, my name is "Monty"!', dtype="U40") @@ -649,6 +704,7 @@ def test_warn_on_no_data(data, ndmin, usecols): res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) assert res.shape == expected_shape + @pytest.mark.parametrize("skiprows", (2, 3)) def test_warn_on_skipped_data(skiprows): data = "1 2 3\n4 5 6" From 10f1d08920586deb191856dde68443336deed3a1 Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 21 Jun 2022 10:27:26 -0400 Subject: [PATCH 02/14] Fix check of the result of PyObject_CallFunction() --- numpy/core/src/multiarray/textreading/rows.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 449d85da5c93..a75c3f8aeacd 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -263,7 +263,7 @@ read_rows(stream *s, // create the usecols array. PyObject *seq = PyObject_CallFunction(usecols_obj, "n", current_num_fields); - if (PyErr_Occurred()) { + if (seq == NULL) { // User-provided function failed. goto error; } From 0d8d3c55c80952ba45d671ee27888e48db1f1bea Mon Sep 17 00:00:00 2001 From: warren Date: Thu, 23 Jun 2022 14:11:28 -0400 Subject: [PATCH 03/14] Lots of updates. (Squash this commit!) --- numpy/core/setup.py | 1 - numpy/core/src/multiarray/conversion_utils.c | 63 ++++ numpy/core/src/multiarray/conversion_utils.h | 5 + .../src/multiarray/textreading/readtext.c | 107 +------ numpy/core/src/multiarray/textreading/rows.c | 299 +++++++++++------- numpy/core/src/multiarray/textreading/rows.h | 10 +- .../textreading/seq_to_ssize_c_array.c | 52 --- .../textreading/seq_to_ssize_c_array.h | 6 - .../src/multiarray/textreading/tokenize.cpp | 2 + numpy/lib/tests/test_io.py | 4 +- numpy/lib/tests/test_loadtxt.py | 44 +++ 11 files changed, 314 insertions(+), 279 deletions(-) delete mode 100644 numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c delete mode 100644 numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h diff --git a/numpy/core/setup.py b/numpy/core/setup.py index c4341cc91303..543b6ae39650 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1017,7 +1017,6 @@ def opts_if_msvc(build_cmd): join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), join('src', 'multiarray', 'textreading', 'str_to_int.c'), join('src', 'multiarray', 'textreading', 'tokenize.cpp'), - join('src', 'multiarray', 'textreading', 'seq_to_ssize_c_array.c'), ] ####################################################################### diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index 96afb6c00ce2..bb4a14b6d7d3 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -1147,6 +1147,69 @@ PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals) } +// +// Convert a Python sequence to a C array of Py_ssize_t integers. +// +// `seq` must be a Python sequence. +// +// `seq_typeerror_text` and `element_typeerror_text` must be provided, and +// each must contain one occurrence of the format code sequence '%s'. The +// '%s' format code will be replaced with the type of the object that failed +// the conversion. +// * `seq_typeerror_text` is used if the attempt to get the length of `seq` +// fails. +// * `element_typerror_text` is used if the attempt to convert an element +// of `seq` to a Py_ssize_t fails. +// +// On success, the function assigns the allocated memory to *parr, and +// returns the length of the sequence. +// +// The memory for the array is allocated with PyMem_Calloc. +// The caller must free the memory with PyMem_FREE or PyMem_Free. +// +// Returns -1 with an exception set and with *parr set to NULL if the +// conversion fails. +// +NPY_NO_EXPORT Py_ssize_t +PyArray_SeqToSsizeCArray(PyObject *seq, Py_ssize_t **parr, + char *seq_typeerror_text, + char *element_typeerror_text) +{ + *parr = NULL; + Py_ssize_t len = PySequence_Length(seq); + if (len == -1) { + PyErr_Format(PyExc_TypeError, seq_typeerror_text, + Py_TYPE(seq)->tp_name); + return -1; + } + Py_ssize_t *arr = PyMem_Calloc(len, sizeof(Py_ssize_t)); + if (arr == NULL) { + PyErr_NoMemory(); + return -1; + } + for (Py_ssize_t i = 0; i < len; ++i) { + PyObject *tmp = PySequence_GetItem(seq, i); + if (tmp == NULL) { + PyMem_Free(arr); + return -1; + } + arr[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); + if (error_converting(arr[i])) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, element_typeerror_text, + Py_TYPE(tmp)->tp_name); + } + Py_DECREF(tmp); + PyMem_Free(arr); + return -1; + } + Py_DECREF(tmp); + } + *parr = arr; + return len; +} + + /** * WARNING: This flag is a bad idea, but was the only way to both * 1) Support unpickling legacy pickles with object types. diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h index 4d0fbb8941ba..4c27ca31a052 100644 --- a/numpy/core/src/multiarray/conversion_utils.h +++ b/numpy/core/src/multiarray/conversion_utils.h @@ -48,6 +48,11 @@ PyArray_IntpFromIndexSequence(PyObject *seq, npy_intp *vals, npy_intp maxvals); NPY_NO_EXPORT int PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals); +NPY_NO_EXPORT Py_ssize_t +PyArray_SeqToSsizeCArray(PyObject *seq, Py_ssize_t **parr, + char *seq_typeerror_text, + char *element_typeerror_text); + NPY_NO_EXPORT int PyArray_TypestrConvert(int itemsize, int gentype); diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 8c78cfead2a0..0d61bac1324d 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -11,78 +11,9 @@ #include "common.h" #include "conversion_utils.h" -#include "textreading/seq_to_ssize_c_array.h" #include "textreading/parser_config.h" #include "textreading/stream_pyobject.h" -#include "textreading/field_types.h" #include "textreading/rows.h" -#include "textreading/str_to_int.h" - - -// -// If the argument `usecols_obj` is not Py_None, it must be a callable. -// In that case, the argument `usecols` must be NULL. -// -// `dtype` must point to a Python object that is Py_None or a numpy dtype -// instance. If the latter, code and sizes must be arrays of length -// num_dtype_fields, holding the flattened data field type codes and byte -// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype, -// but we do that in Python code.) -// -// If both `usecols` and `dtype` are not None, and the data type is compound, -// then len(usecols) must equal num_dtype_fields. -// -// If `dtype` is given and it is compound, and `usecols` is None, then the -// number of columns in the file must match the number of fields in `dtype`. -// -static PyObject * -_readtext_from_stream(stream *s, - parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], - PyObject *usecols_obj, - Py_ssize_t skiplines, Py_ssize_t max_rows, - PyObject *converters, PyObject *dtype) -{ - PyArrayObject *arr = NULL; - PyArray_Descr *out_dtype = NULL; - field_type *ft = NULL; - - /* - * If dtypes[0] is dtype the input was not structured and the result - * is considered "homogeneous" and we have to discover the number of - * columns/ - */ - out_dtype = (PyArray_Descr *)dtype; - Py_INCREF(out_dtype); - - Py_ssize_t num_fields = field_types_create(out_dtype, &ft); - if (num_fields < 0) { - goto finish; - } - bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; - - if (!homogeneous && usecols != NULL && num_usecols != num_fields) { - PyErr_Format(PyExc_TypeError, - "If a structured dtype is used, the number of columns in " - "`usecols` must match the effective number of fields. " - "But %zd usecols were given and the number of fields is %zd.", - num_usecols, num_fields); - goto finish; - } - - arr = read_rows( - s, max_rows, num_fields, ft, pc, - num_usecols, usecols, usecols_obj, - skiplines, converters, - NULL, out_dtype, homogeneous); - if (arr == NULL) { - goto finish; - } - - finish: - Py_XDECREF(out_dtype); - field_types_xclear(num_fields, ft); - return (PyObject *)arr; -} static int @@ -208,8 +139,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), }; bool filelike = true; - PyObject *arr = NULL; - NPY_PREPARE_ARGPARSER; if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, "file", NULL, &file, @@ -260,32 +189,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), } } - /* - * Parse usecols, the rest of NumPy has no clear helper for this, so do - * it here manually. - */ - Py_ssize_t num_usecols = -1; - Py_ssize_t *usecols = NULL; - if (usecols_obj != Py_None && !PyCallable_Check(usecols_obj)) { - num_usecols = PySequence_Length(usecols_obj); - if (num_usecols < 0) { - return NULL; - } - usecols = seq_to_ssize_c_array(num_usecols, usecols_obj, - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type '%s'"); - if (usecols == NULL) { - return NULL; - } - /* - * The given usecols_obj is a Python sequence; it has been processed to - * give the usecols array, so reset the Python object to None. - */ - usecols_obj = Py_None; - } - assert(usecols == NULL || usecols_obj == Py_None); - /* At this point, if usecols_obj is not Py_None, it must be a callable object. */ - stream *s; if (filelike) { s = stream_python_file(file, encoding); @@ -294,15 +197,13 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), s = stream_python_iterable(file, encoding); } if (s == NULL) { - PyMem_Free(usecols); return NULL; } - arr = _readtext_from_stream( - s, &pc, num_usecols, usecols, usecols_obj, skiplines, max_rows, - converters, dtype); + PyArrayObject *arr = read_rows(s, max_rows, &pc, usecols_obj, skiplines, + converters, NULL, dtype); + stream_close(s); - PyMem_Free(usecols); - return arr; + return (PyObject *)arr; } diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index a75c3f8aeacd..c92878adc1d4 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -11,7 +11,6 @@ #include #include -#include "textreading/seq_to_ssize_c_array.h" #include "textreading/stream.h" #include "textreading/tokenize.h" #include "textreading/conversions.h" @@ -26,6 +25,36 @@ #define MIN_BLOCK_SIZE (1 << 13) +/* + * Call the user-defined usecols callable and create the C array of + * Py_ssize_t integers. + * + * Returns the size of the array, and sets *p_usecols_arr to the + * C array, allocated with PyMem_Calloc. The caller is responsible + * for freeing this memory with PyMem_Free(). + * + * Returns -1 with an exception set and *p_usecols_arr set to NULL + * if the call or conversion fails. + */ +static Py_ssize_t +get_usecols_arr_from_callable(PyObject *usecols_obj, Py_ssize_t n, + Py_ssize_t **p_usecols_arr) +{ + *p_usecols_arr = NULL; + PyObject *seq = PyObject_CallFunction(usecols_obj, "n", n); + if (seq == NULL) { + return -1; + } + Py_ssize_t num_usecols = PyArray_SeqToSsizeCArray(seq, p_usecols_arr, + "the user-provided callable usecols must return a " + "sequence of ints, but it returned an instance of " + "type '%s'", + "the user-provided callable usecols must return a " + "sequence of ints, but it returned a sequence " + "containing at least one occurrence of type '%s'"); + Py_DECREF(seq); + return num_usecols; +} /* * Create the array of converter functions from the Python converters. @@ -118,6 +147,17 @@ create_conv_funcs( return NULL; } +static void +free_conv_funcs(Py_ssize_t num_fields, Py_ssize_t *conv_funcs) +{ + if (conv_funcs != NULL) { + for (Py_ssize_t i = 0; i < num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + } +} + /** * Read a file into the provided array, or create (and possibly grow) an * array to read into. @@ -126,51 +166,88 @@ create_conv_funcs( * the tokenizer. * @param max_rows The number of rows to read, or -1. If negative * all rows are read. - * @param num_field_types The number of field types stored in `field_types`. - * @param field_types Information about the dtype for each column (or one if - * `homogeneous`). * @param pconfig Pointer to the parser config object used by both the * tokenizer and the conversion functions. - * @param num_usecols The number of columns in `usecols`. - * @param usecols An array of length `num_usecols` or NULL. If given indicates - * which column is read for each individual row (negative columns are - * accepted). - * @param usecols_obj Either `Py_None` or a callable Python object. If - * callable, the function must accept a single integer argument (the - * number of columns in the file), and return a sequence of integers - * (the sequence of column indices to use). When `usecols_obj` is not - * `Py_None`, `usecols` MUST be NULL. + * @param usecols_obj The `usecols` object provided by the `loadtxt()` caller. * @param skiplines The number of lines to skip, these lines are ignored. * @param converters Python dictionary of converters. Finalizing converters * is difficult without information about the number of columns. * @param data_array An array to be filled or NULL. In either case a new * reference is returned (the reference to `data_array` is not stolen). - * @param out_descr The dtype used for allocating a new array. This is not + * @param dtype The dtype used for allocating a new array. This is not * used if `data_array` is provided. Note that the actual dtype of the * returned array can differ for strings. - * @param num_cols Pointer in which the actual (discovered) number of columns - * is returned. This is only relevant if `homogeneous` is true. - * @param homogeneous Whether the datatype of the array is not homogeneous, - * i.e. not structured. In this case the number of columns has to be - * discovered an the returned array will be 2-dimensional rather than - * 1-dimensional. * * @returns Returns the result as an array object or NULL on error. The result * is always a new reference (even when `data_array` was passed in). */ NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, - npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types, - parser_config *pconfig, - Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj, + npy_intp max_rows, parser_config *pconfig, PyObject *usecols_obj, Py_ssize_t skiplines, PyObject *converters, - PyArrayObject *data_array, PyArray_Descr *out_descr, - bool homogeneous) + PyArrayObject *data_array, PyObject *dtype) { char *data_ptr = NULL; Py_ssize_t current_num_fields; - npy_intp row_size = out_descr->elsize; + Py_ssize_t prev_num_fields; PyObject **conv_funcs = NULL; + npy_intp row_size; + Py_ssize_t num_usecols = -1; + Py_ssize_t *usecols_arr = NULL; + PyArray_Descr *out_descr = NULL; + field_type *field_types = NULL; + + int ts_result = 0; + tokenizer_state ts; + if (tokenizer_init(&ts, pconfig) < 0) { + goto error; + } + + bool usecols_iscallable = PyCallable_Check(usecols_obj); + + /* + * Parse usecols_obj, if it is not None or callable. + * + * Note: we don't have to handle the case of usecols being a scalar int, + * because such an argument is wrapped in a list in the Python code + * before it gets here. + */ + if (!(usecols_obj == Py_None || usecols_iscallable)) { + num_usecols = PyArray_SeqToSsizeCArray(usecols_obj, &usecols_arr, + "usecols must be an int, a sequence of ints, or a callable, " + "but type '%s' was given", + "usecols must contain only ints, but at least one occurrence " + "of type '%s' was found"); + if (num_usecols == -1) { + goto error; + } + } + + /* + * Parse dtype. + * + * If dtypes[0] is dtype the input was not structured and the result + * is considered "homogeneous" and we have to discover the number of + * columns. + */ + out_descr = (PyArray_Descr *)dtype; + /* Hold on to a reference while processing the file. */ + Py_INCREF(out_descr); + + Py_ssize_t num_field_types = field_types_create(out_descr, &field_types); + if (num_field_types < 0) { + goto error; + } + bool homogeneous = num_field_types == 1 && field_types[0].descr == out_descr; + + if (!homogeneous && usecols_arr != NULL && num_usecols != num_field_types) { + PyErr_Format(PyExc_TypeError, + "If a structured dtype is used, the number of columns in " + "`usecols` must match the effective number of fields. " + "But %zd usecols were given and the number of fields is %zd.", + num_usecols, num_field_types); + goto error; + } bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); @@ -186,19 +263,9 @@ read_rows(stream *s, /* We give a warning if max_rows is used and an empty line is encountered */ bool give_empty_row_warning = max_rows >= 0; - // If the caller passes in a callable for usecols_obj, then they must - // also pass in NULL for usecols. - assert(usecols == NULL || usecols_obj == Py_None); - - int ts_result = 0; - tokenizer_state ts; - if (tokenizer_init(&ts, pconfig) < 0) { - goto error; - } - /* Set the actual number of fields if it is already known, otherwise -1 */ Py_ssize_t actual_num_fields = -1; - if (usecols != NULL) { + if (usecols_arr != NULL) { assert(homogeneous || num_field_types == num_usecols); actual_num_fields = num_usecols; } @@ -258,52 +325,22 @@ read_rows(stream *s, // because we've now read the first line, and we definitively // know how many fields (i.e. columns) we will be processing. - if (usecols_obj != Py_None) { - // Call the Python function provided by the caller to - // create the usecols array. - PyObject *seq = PyObject_CallFunction(usecols_obj, "n", - current_num_fields); - if (seq == NULL) { - // User-provided function failed. - goto error; - } - // The user-defined usecols function must return a sequence - // of integers. - num_usecols = PySequence_Length(seq); - if (num_usecols == -1) { - // User-provided function did not return a sequence. - PyErr_Clear(); - PyErr_Format(PyExc_TypeError, - "the user-provided callable usecols must return a " - "sequence of ints, but it returned an instance of " - "type '%s'", Py_TYPE(seq)->tp_name); - Py_DECREF(seq); + prev_num_fields = current_num_fields; + + if (usecols_iscallable) { + num_usecols = get_usecols_arr_from_callable(usecols_obj, + current_num_fields, &usecols_arr); + if (num_usecols < 0) { goto error; } - if (!homogeneous && num_field_types != num_usecols) { - // A structured dtype was provided, and the length of - // the sequence returned by the user-provided function - // does not have the same length as the number of fields - // in the dtype. - Py_DECREF(seq); PyErr_Format(PyExc_RuntimeError, "length of the sequence returned by the callable " - "usecols (%d) does not equal the number of fields " - "in the given dtype (%d)", + "usecols (%zd) does not equal the number of fields " + "in the given dtype (%zd)", num_usecols, num_field_types); goto error; } - - // Convert the sequence to a C array of Py_ssize_t ints. - usecols = seq_to_ssize_c_array(num_usecols, seq, - "the user-provided callable usecols must return a " - "sequence of ints, but it returned a sequence " - "containing at least one occurrence of type '%s'"); - Py_DECREF(seq); - if (usecols == NULL) { - goto error; - } actual_num_fields = num_usecols; } @@ -313,7 +350,7 @@ read_rows(stream *s, if (converters != Py_None) { conv_funcs = create_conv_funcs( - converters, actual_num_fields, usecols); + converters, actual_num_fields, usecols_arr); if (conv_funcs == NULL) { goto error; } @@ -321,6 +358,11 @@ read_rows(stream *s, /* Note that result_shape[1] is only used if homogeneous is true */ result_shape[1] = actual_num_fields; + /* + * We can now set row_size, the number of bytes in each "row" of + * the output array. + */ + row_size = out_descr->elsize; if (homogeneous) { row_size *= actual_num_fields; } @@ -378,12 +420,66 @@ read_rows(stream *s, data_ptr = PyArray_BYTES(data_array); } - if (!usecols && (actual_num_fields != current_num_fields)) { - PyErr_Format(PyExc_ValueError, - "the number of columns changed from %zd to %zd at row %zd; " - "use `usecols` to select a subset and avoid this error", - actual_num_fields, current_num_fields, row_count+1); - goto error; + if (!usecols_arr) { + if (actual_num_fields != current_num_fields) { + PyErr_Format(PyExc_ValueError, + "the number of columns changed from %zd to %zd at row " + "%zd; use `usecols` to select a subset and avoid this " + "error", + actual_num_fields, current_num_fields, row_count+1); + goto error; + } + } + else { + if (usecols_iscallable && current_num_fields != prev_num_fields) { + /* + * The number of fields in this row is not the same as in the + * previous row. Call the user-defined function to update + * usecols_arr, and possibly the array of converters. + */ + Py_ssize_t new_num_usecols; + Py_ssize_t *new_usecols_arr = NULL; + new_num_usecols = get_usecols_arr_from_callable(usecols_obj, + current_num_fields, &new_usecols_arr); + if (new_num_usecols < 0) { + goto error; + } + if (new_num_usecols != num_usecols) { + PyErr_Format(PyExc_RuntimeError, + "the length of the sequence returned by the " + "user-defined usecols function (%zd) is not the " + "same as in the previous call (%zd)", + new_num_usecols, num_usecols); + PyMem_Free(new_usecols_arr); + goto error; + } + bool usecols_changed = false; + for (Py_ssize_t i = 0; i < num_usecols; ++i) { + if (new_usecols_arr[i] != usecols_arr[i]) { + usecols_changed = true; + break; + } + } + if (!usecols_changed) { + PyMem_Free(new_usecols_arr); + } + else { + PyMem_Free(usecols_arr); + usecols_arr = new_usecols_arr; + if (conv_funcs != NULL) { + /* + * Changing usecols can change the conv_funcs array, so + * free the old one and create a new one. + */ + free_conv_funcs(actual_num_fields, conv_funcs); + conv_funcs = create_conv_funcs( + converters, actual_num_fields, usecols_arr); + if (conv_funcs == NULL) { + goto error; + } + } + } + } } if (NPY_UNLIKELY(data_allocated_rows == row_count)) { @@ -432,11 +528,11 @@ read_rows(stream *s, item_ptr = data_ptr + field_types[f].structured_offset; } - if (usecols == NULL) { + if (usecols_arr == NULL) { col = i; } else { - col = usecols[i]; + col = usecols_arr[i]; if (col < 0) { // Python-like column indexing: k = -1 means the last column. col += current_num_fields; @@ -445,7 +541,7 @@ read_rows(stream *s, PyErr_Format(PyExc_ValueError, "invalid column index %zd at row %zd with %zd " "columns", - usecols[i], row_count+1, current_num_fields); + usecols_arr[i], row_count+1, current_num_fields); goto error; } } @@ -488,23 +584,16 @@ read_rows(stream *s, } } + prev_num_fields = current_num_fields; ++row_count; data_ptr += row_size; } - if (usecols_obj != Py_None) { - // This function owns usecols if a callable usecols_obj was given. - PyMem_Free(usecols); - usecols = NULL; // An overabundance of caution... - } + PyMem_Free(usecols_arr); tokenizer_clear(&ts); - if (conv_funcs != NULL) { - for (Py_ssize_t i = 0; i < actual_num_fields; i++) { - Py_XDECREF(conv_funcs[i]); - } - PyMem_FREE(conv_funcs); - } + free_conv_funcs(actual_num_fields, conv_funcs); + field_types_xclear(num_field_types, field_types); if (data_array == NULL) { assert(row_count == 0 && result_shape[0] == 0); @@ -543,21 +632,15 @@ read_rows(stream *s, ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count; } + Py_DECREF(out_descr); + return data_array; error: - if (usecols_obj != Py_None) { - // If the error occurred early enough in the function, we might - // not have allocated usecols yet, but that's OK, because we know - // that usecols is NULL in that case. - PyMem_Free(usecols); - } - if (conv_funcs != NULL) { - for (Py_ssize_t i = 0; i < actual_num_fields; i++) { - Py_XDECREF(conv_funcs[i]); - } - PyMem_FREE(conv_funcs); - } + PyMem_Free(usecols_arr); + free_conv_funcs(actual_num_fields, conv_funcs); + Py_XDECREF(out_descr); + field_types_xclear(num_field_types, field_types); tokenizer_clear(&ts); Py_XDECREF(data_array); return NULL; diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h index 1a707a2d3918..68c894ffb903 100644 --- a/numpy/core/src/multiarray/textreading/rows.h +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -12,12 +12,8 @@ NPY_NO_EXPORT PyArrayObject * -read_rows(stream *s, - npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types, - parser_config *pconfig, - Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj, - Py_ssize_t skiplines, PyObject *converters, - PyArrayObject *data_array, PyArray_Descr *out_descr, - bool homogeneous); +read_rows(stream *s, npy_intp nrows, parser_config *pconfig, + PyObject *usecols_obj, Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyObject *dtype); #endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c b/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c deleted file mode 100644 index 67a2effef45a..000000000000 --- a/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c +++ /dev/null @@ -1,52 +0,0 @@ -#define PY_SSIZE_T_CLEAN -#include - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include "numpy/arrayobject.h" // For NPY_NO_EXPORT - -// -// Convert a Python sequence to a C array of Py_ssize_t integers. -// -// `seq` must be a Python sequence of length `len`. It is assumed that the -// caller has already checked the length of the sequence, so the length is an -// argument instead of being inferred from `seq` itself. -// -// `errtext` must be provided, and it must contain one occurrence of the -// format code sequence '%s'. This text is used as the text of the TypeError -// that is raised when an element of `seq` is found that cannot be converted -// to an a Py_ssize_t integer. The '%s' format code will be replaced with -// the type of the object that failed the conversion. -// -// Returns NULL with an exception set if the conversion fails. -// -// The memory for the array is allocated with PyMem_Calloc. -// The caller must free the memory with PyMem_FREE or PyMem_Free. -// -NPY_NO_EXPORT Py_ssize_t * -seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *errtext) -{ - Py_ssize_t *arr = PyMem_Calloc(len, sizeof(Py_ssize_t)); - if (arr == NULL) { - PyErr_NoMemory(); - return NULL; - } - for (Py_ssize_t i = 0; i < len; ++i) { - PyObject *tmp = PySequence_GetItem(seq, i); - if (tmp == NULL) { - PyMem_Free(arr); - return NULL; - } - arr[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); - if (arr[i] == -1 && PyErr_Occurred()) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) { - PyErr_Format(PyExc_TypeError, errtext, Py_TYPE(tmp)->tp_name); - } - Py_DECREF(tmp); - PyMem_Free(arr); - return NULL; - } - Py_DECREF(tmp); - } - return arr; -} diff --git a/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h b/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h deleted file mode 100644 index a58d7a01a93c..000000000000 --- a/numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef SEQ_TO_SSIZE_C_ARRAY_H -#define SEQ_TO_SSIZE_C_ARRAY_H - -Py_ssize_t *seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *initialtext); - -#endif diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp index d0d9cf844912..200b7f438c0d 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.cpp +++ b/numpy/core/src/multiarray/textreading/tokenize.cpp @@ -446,6 +446,8 @@ tokenizer_init(tokenizer_state *ts, parser_config *config) ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields)); if (ts->fields == nullptr) { + PyMem_Free(ts->field_buffer); + ts->field_buffer = nullptr; PyErr_NoMemory(); return -1; } diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 38a751d11511..c99c37bc0171 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -874,13 +874,13 @@ def __index__(self): bogus_idx = 1.5 assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx).__name__, + '^usecols must contain.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=bogus_idx ) assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx).__name__, + '^usecols must contain.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=[0, bogus_idx, 0] ) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 042f8a44ef6c..dbb66385740a 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -269,6 +269,12 @@ def test_empty_usecols(usecols): assert res.dtype == np.dtype([]) +def test_usecols_dtype_num_fields_mismatch(): + dt = np.dtype([('x', float), ('y', float)]) + with pytest.raises(TypeError, match='the number of fields is 2'): + np.loadtxt(StringIO('1 2 3 4\n5 6 7 8'), dtype=dt, usecols=[1, 2, 3]) + + def test_callable_usecols_basic(): txt = StringIO('1 2 3 4 5\n6 7 8 9 1\n2 3 4 5 6\n') res = np.loadtxt(txt, dtype=int, usecols=lambda n: list(range(n)[::2])) @@ -292,11 +298,43 @@ def load(self): assert_equal(res, expected) +def test_callable_usecols_ragged_input_columns(): + txt = StringIO('1 2 3\n2 4 6 8 10\n99\n21 22 23') + res = np.loadtxt(txt, dtype=int, usecols=lambda n: [0, n - 1, n//2]) + expected = np.array([[1, 3, 2], [2, 10, 6], [99, 99, 99], [21, 23, 22]]) + assert_equal(res, expected) + + +# The second 'text' instance has varying numbers of fields in the rows. +@pytest.mark.parametrize('text', ['1 2 3 4 5\n6 7 8 9 1\n2 3 5 7 9', + '1 3 5\n6 7 8 9 1\n2 3 7 9']) +@pytest.mark.parametrize('usecols, expected', + [(lambda n: [0, n-1], [[10, 5], [60, 1], [20, 9]]), + (lambda n: [0, -1], [[10, -5], [60, -1], [20, -9]])]) +def test_callable_usecols_with_converters(text, usecols, expected): + # converters contains the keys 0 and -1. When usecols returns [0, n-1], + # the last column is *not* passed through a converter, because the code + # that processes converters treats the key -1 and the positive key n-1 + # as distinct, and only uses the key that exactly matches what is + # given in usecols. + converters = {0: lambda s: 10*float(s), + -1: lambda s: -float(s)} + res = np.loadtxt(StringIO(text), dtype=int, usecols=usecols, + converters=converters) + assert_equal(res, expected) + + def test_callable_usecols_raises_exception(): with pytest.raises(ZeroDivisionError, match='division by zero'): np.loadtxt(StringIO('1 2\n4 5'), usecols=lambda n: 1/0) +def test_callable_usecols_second_call_raises_exception(): + with pytest.raises(ZeroDivisionError, match='division by zero'): + np.loadtxt(StringIO('1 2\n3 4 5'), + usecols=lambda n: [0, 1] if n == 2 else 1/0) + + def test_callable_usecols_not_a_sequence(): with pytest.raises(TypeError, match="returned an instance of type 'module'"): @@ -317,6 +355,12 @@ def test_callable_usecols_bad_len_with_structured_dtype(): np.loadtxt(txt, usecols=lambda n: [0, 2, 3], dtype=dt) +def test_callable_usecols_returned_num_fields_changed(): + txt = StringIO('1 2 3\n6 7 8 9\n2 3 4 5\n') + with pytest.raises(RuntimeError, match="not the same as in the previous"): + np.loadtxt(txt, dtype=int, usecols=lambda n: range(n - 1)) + + @pytest.mark.parametrize("c1", ["a", "の", "🫕"]) @pytest.mark.parametrize("c2", ["a", "の", "🫕"]) def test_large_unicode_characters(c1, c2): From 571a5e6334d4465a719f80a6ef8e4cda1c30b931 Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 14:26:26 -0400 Subject: [PATCH 04/14] BUG: Fix free_conv_funcs() declaration and a few compiler warnings. --- numpy/core/src/multiarray/textreading/rows.c | 22 +++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index c92878adc1d4..49b4d47cb6ef 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -11,6 +11,7 @@ #include #include +#include "conversion_utils.h" #include "textreading/stream.h" #include "textreading/tokenize.h" #include "textreading/conversions.h" @@ -148,7 +149,7 @@ create_conv_funcs( } static void -free_conv_funcs(Py_ssize_t num_fields, Py_ssize_t *conv_funcs) +free_conv_funcs(Py_ssize_t num_fields, PyObject **conv_funcs) { if (conv_funcs != NULL) { for (Py_ssize_t i = 0; i < num_fields; i++) { @@ -195,6 +196,7 @@ read_rows(stream *s, Py_ssize_t num_usecols = -1; Py_ssize_t *usecols_arr = NULL; PyArray_Descr *out_descr = NULL; + Py_ssize_t num_field_types = 0; field_type *field_types = NULL; int ts_result = 0; @@ -234,7 +236,7 @@ read_rows(stream *s, /* Hold on to a reference while processing the file. */ Py_INCREF(out_descr); - Py_ssize_t num_field_types = field_types_create(out_descr, &field_types); + num_field_types = field_types_create(out_descr, &field_types); if (num_field_types < 0) { goto error; } @@ -251,6 +253,15 @@ read_rows(stream *s, bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); + /* + * Initial guess of the number of bytes in each "row" of the output array. + * This is correct if the user provided a structure data type. For a + * simple data type (i.e the "homogeneous" case), this will have to be + * multiplied by the number of fields in each row (i.e. the number of + * columns in the output array). + */ + row_size = out_descr->elsize; + int ndim = homogeneous ? 2 : 1; npy_intp result_shape[2] = {0, 1}; @@ -358,12 +369,9 @@ read_rows(stream *s, /* Note that result_shape[1] is only used if homogeneous is true */ result_shape[1] = actual_num_fields; - /* - * We can now set row_size, the number of bytes in each "row" of - * the output array. - */ - row_size = out_descr->elsize; + if (homogeneous) { + /* We're create a 2-d output array, so scale up row_size. */ row_size *= actual_num_fields; } From 6d69407125ada3b95a7c11f7e884c2d6ee061cf9 Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 15:05:00 -0400 Subject: [PATCH 05/14] BUG: remove unnecessary (and incorrect) clause from an assert() --- numpy/core/src/multiarray/textreading/rows.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 49b4d47cb6ef..e7a27d99817e 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -281,7 +281,7 @@ read_rows(stream *s, actual_num_fields = num_usecols; } else if (!homogeneous) { - assert(usecols == NULL || num_field_types == num_usecols); + assert(num_field_types == num_usecols); actual_num_fields = num_field_types; } From 3e30cd825754bcb018575b51eb9bf89f9b48dc3c Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 15:08:17 -0400 Subject: [PATCH 06/14] Initialize a var to avoid a compiler warning. --- numpy/core/src/multiarray/textreading/rows.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index e7a27d99817e..75903d4858b2 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -190,7 +190,7 @@ read_rows(stream *s, { char *data_ptr = NULL; Py_ssize_t current_num_fields; - Py_ssize_t prev_num_fields; + Py_ssize_t prev_num_fields = 0; /* Set value to avoid compiler warning */ PyObject **conv_funcs = NULL; npy_intp row_size; Py_ssize_t num_usecols = -1; From 231321ef1a38326188296b14cc7aae63b0908e65 Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 17:24:05 -0400 Subject: [PATCH 07/14] Ensure PY_SSIZE_T_CLEAN is set before including Python.h --- numpy/core/src/multiarray/textreading/conversions.c | 1 + numpy/core/src/multiarray/textreading/str_to_int.c | 1 + numpy/core/src/multiarray/textreading/tokenize.cpp | 2 +- numpy/core/src/multiarray/textreading/tokenize.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index 11f4210f7283..772b5f9c0022 100644 --- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -1,4 +1,5 @@ +#define PY_SSIZE_T_CLEAN #include #include diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c index 0dd6c0b0e66a..0fab15eeabc4 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.c +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -1,4 +1,5 @@ +#define PY_SSIZE_T_CLEAN #include #define NPY_NO_DEPRECATED_API NPY_API_VERSION diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp index 200b7f438c0d..bc27e0d4cd34 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.cpp +++ b/numpy/core/src/multiarray/textreading/tokenize.cpp @@ -1,4 +1,4 @@ - +#define PY_SSIZE_T_CLEAN #include #define NPY_NO_DEPRECATED_API NPY_API_VERSION diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h index a78c6d936c2a..e374e8460ee4 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.h +++ b/numpy/core/src/multiarray/textreading/tokenize.h @@ -2,6 +2,7 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ #define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ +#define PY_SSIZE_T_CLEAN #include #include "numpy/ndarraytypes.h" From d48333c437c2d3ebf7f7964c29cc121ab351899c Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 20:24:08 -0400 Subject: [PATCH 08/14] TMP: Check sizes of integers --- numpy/core/src/multiarray/textreading/readtext.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 0d61bac1324d..bf48476f4eaf 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -159,6 +159,13 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), return NULL; } + if (sizeof(npy_intp) != sizeof(Py_ssize_t)) { + PyErr_Format(PyExc_RuntimeError, + "sizeof(npy_intp) = %zu, sizeof(Py_ssize_t) = %zu\n", + sizeof(npy_intp), sizeof(Py_ssize_t)); + return NULL; + } + /* Reject matching control characters, they just rarely make sense anyway */ if (error_if_matching_control_characters( pc.delimiter, pc.quote, pc.comment) < 0) { From b28370900807bf14764cee0abb1e3071fb8c8df0 Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 20:29:47 -0400 Subject: [PATCH 09/14] TMP: Move code that checks sizes of integers --- numpy/core/src/multiarray/textreading/readtext.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index bf48476f4eaf..eebeba65e325 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -139,6 +139,13 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), }; bool filelike = true; + if (sizeof(npy_intp) != sizeof(Py_ssize_t)) { + PyErr_Format(PyExc_RuntimeError, + "sizeof(npy_intp) = %zu, sizeof(Py_ssize_t) = %zu\n", + sizeof(npy_intp), sizeof(Py_ssize_t)); + return NULL; + } + NPY_PREPARE_ARGPARSER; if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, "file", NULL, &file, @@ -159,13 +166,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), return NULL; } - if (sizeof(npy_intp) != sizeof(Py_ssize_t)) { - PyErr_Format(PyExc_RuntimeError, - "sizeof(npy_intp) = %zu, sizeof(Py_ssize_t) = %zu\n", - sizeof(npy_intp), sizeof(Py_ssize_t)); - return NULL; - } - /* Reject matching control characters, they just rarely make sense anyway */ if (error_if_matching_control_characters( pc.delimiter, pc.quote, pc.comment) < 0) { From 700dcb74db172fd70aee48d56356eb92b919852c Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 21:54:10 -0400 Subject: [PATCH 10/14] Revert previous TMP commit. --- numpy/core/src/multiarray/textreading/readtext.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index eebeba65e325..0d61bac1324d 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -139,13 +139,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), }; bool filelike = true; - if (sizeof(npy_intp) != sizeof(Py_ssize_t)) { - PyErr_Format(PyExc_RuntimeError, - "sizeof(npy_intp) = %zu, sizeof(Py_ssize_t) = %zu\n", - sizeof(npy_intp), sizeof(Py_ssize_t)); - return NULL; - } - NPY_PREPARE_ARGPARSER; if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, "file", NULL, &file, From 9c7bf60a0796c7f3ddd20a07eb3cd8d7c6877a1d Mon Sep 17 00:00:00 2001 From: warren Date: Tue, 28 Jun 2022 21:56:09 -0400 Subject: [PATCH 11/14] Remove invalid assert() --- numpy/core/src/multiarray/textreading/rows.c | 1 - 1 file changed, 1 deletion(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 75903d4858b2..b5c49396af24 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -281,7 +281,6 @@ read_rows(stream *s, actual_num_fields = num_usecols; } else if (!homogeneous) { - assert(num_field_types == num_usecols); actual_num_fields = num_field_types; } From 69ab6c3b70adab24df44a9a4edfa4928868af1af Mon Sep 17 00:00:00 2001 From: warren Date: Fri, 1 Jul 2022 01:31:07 -0400 Subject: [PATCH 12/14] get_usecols_arr_from_callable now handles the initial and subsequent calls. --- numpy/core/src/multiarray/textreading/rows.c | 162 +++++++++++-------- 1 file changed, 95 insertions(+), 67 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index b5c49396af24..14bb70b8c2aa 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -25,36 +25,89 @@ */ #define MIN_BLOCK_SIZE (1 << 13) - +#define USECOLS_ERROR -1 +#define USECOLS_NOCHANGE 0 +#define USECOLS_CHANGED 1 /* - * Call the user-defined usecols callable and create the C array of - * Py_ssize_t integers. + * Call the user-defined usecols callable and create or update the + * C array of Py_ssize_t integers. + * + * On input, *p_usecols_arr must be either NULL or the result of + * a previous call to this function. If *p_usecols is not NULL, + * *p_num_usecols must be the length of the array pointed to by + * *p_usecols_arr. * - * Returns the size of the array, and sets *p_usecols_arr to the - * C array, allocated with PyMem_Calloc. The caller is responsible - * for freeing this memory with PyMem_Free(). + * Return values: + * USECOLS_ERROR (-1) + * Error. An exception has been set. The values pointed to by + * p_num_usecols and p_usecols_arr have not been changed. + * USECOLS_NOCHANGE (0) + * The usecols data has not changed. This value can be returned + * only if p_usecols_arr was not NULL. + * USECOLS_CHANGED (1) + * The usecols data has changed. * - * Returns -1 with an exception set and *p_usecols_arr set to NULL - * if the call or conversion fails. + * If no error occurs and *p_usecols_arr was NULL on input, then + * on return, *p_usecol_arr points to an array with length + * *p_num_usecols. If *p_usecols_arr was not NULL, then on + * output *p_num_usecols is not change, and whether or not + * *p_usecol_arr has changed is indicated by the return value. */ -static Py_ssize_t +static int get_usecols_arr_from_callable(PyObject *usecols_obj, Py_ssize_t n, + Py_ssize_t *p_num_usecols, Py_ssize_t **p_usecols_arr) { - *p_usecols_arr = NULL; + Py_ssize_t new_num_usecols; + Py_ssize_t *new_usecols_arr = NULL; PyObject *seq = PyObject_CallFunction(usecols_obj, "n", n); if (seq == NULL) { - return -1; + return USECOLS_ERROR; } - Py_ssize_t num_usecols = PyArray_SeqToSsizeCArray(seq, p_usecols_arr, - "the user-provided callable usecols must return a " - "sequence of ints, but it returned an instance of " - "type '%s'", - "the user-provided callable usecols must return a " - "sequence of ints, but it returned a sequence " - "containing at least one occurrence of type '%s'"); + new_num_usecols = PyArray_SeqToSsizeCArray(seq, &new_usecols_arr, + "the user-provided callable usecols must return a " + "sequence of ints, but it returned an instance of " + "type '%s'", + "the user-provided callable usecols must return a " + "sequence of ints, but it returned a sequence " + "containing at least one occurrence of type '%s'"); Py_DECREF(seq); - return num_usecols; + if (new_num_usecols < 0) { + return USECOLS_ERROR; + } + if (*p_usecols_arr == NULL) { + /* This is an initial call. */ + *p_num_usecols = new_num_usecols; + *p_usecols_arr = new_usecols_arr; + return USECOLS_CHANGED; + } + /* This is a update call. */ + if (new_num_usecols != *p_num_usecols) { + /* A new call of usecols returned a different length. */ + PyErr_Format(PyExc_RuntimeError, + "the length of the sequence returned by the " + "user-defined usecols function (%zd) is not the " + "same as in the previous call (%zd)", + new_num_usecols, *p_num_usecols); + PyMem_Free(new_usecols_arr); + return USECOLS_ERROR; + } + /* Check if the usecols data is different. */ + bool usecols_changed = false; + for (Py_ssize_t i = 0; i < new_num_usecols; ++i) { + if (new_usecols_arr[i] != *p_usecols_arr[i]) { + usecols_changed = true; + break; + } + } + if (!usecols_changed) { + PyMem_Free(new_usecols_arr); + return USECOLS_NOCHANGE; + } + /* The data is different; update the pointer to the new array. */ + PyMem_Free(*p_usecols_arr); + *p_usecols_arr = new_usecols_arr; + return USECOLS_CHANGED; } /* @@ -338,9 +391,9 @@ read_rows(stream *s, prev_num_fields = current_num_fields; if (usecols_iscallable) { - num_usecols = get_usecols_arr_from_callable(usecols_obj, - current_num_fields, &usecols_arr); - if (num_usecols < 0) { + int status = get_usecols_arr_from_callable(usecols_obj, + current_num_fields, &num_usecols, &usecols_arr); + if (status == USECOLS_ERROR) { goto error; } if (!homogeneous && num_field_types != num_usecols) { @@ -437,55 +490,30 @@ read_rows(stream *s, goto error; } } - else { - if (usecols_iscallable && current_num_fields != prev_num_fields) { + else if (NPY_UNLIKELY(current_num_fields != prev_num_fields) + && usecols_iscallable) { + /* + * The number of fields in this row is not the same as in the + * previous row. Call the user-defined function to update + * usecols_arr. Then update the array of converters, if + * necessary. + */ + int status = get_usecols_arr_from_callable(usecols_obj, + current_num_fields, &num_usecols, &usecols_arr); + if (status == USECOLS_ERROR) { + goto error; + } + if (status == USECOLS_CHANGED && conv_funcs != NULL) { /* - * The number of fields in this row is not the same as in the - * previous row. Call the user-defined function to update - * usecols_arr, and possibly the array of converters. + * Changing usecols can change the conv_funcs array, so + * free the old one and create a new one. */ - Py_ssize_t new_num_usecols; - Py_ssize_t *new_usecols_arr = NULL; - new_num_usecols = get_usecols_arr_from_callable(usecols_obj, - current_num_fields, &new_usecols_arr); - if (new_num_usecols < 0) { - goto error; - } - if (new_num_usecols != num_usecols) { - PyErr_Format(PyExc_RuntimeError, - "the length of the sequence returned by the " - "user-defined usecols function (%zd) is not the " - "same as in the previous call (%zd)", - new_num_usecols, num_usecols); - PyMem_Free(new_usecols_arr); + free_conv_funcs(actual_num_fields, conv_funcs); + conv_funcs = create_conv_funcs( + converters, actual_num_fields, usecols_arr); + if (conv_funcs == NULL) { goto error; } - bool usecols_changed = false; - for (Py_ssize_t i = 0; i < num_usecols; ++i) { - if (new_usecols_arr[i] != usecols_arr[i]) { - usecols_changed = true; - break; - } - } - if (!usecols_changed) { - PyMem_Free(new_usecols_arr); - } - else { - PyMem_Free(usecols_arr); - usecols_arr = new_usecols_arr; - if (conv_funcs != NULL) { - /* - * Changing usecols can change the conv_funcs array, so - * free the old one and create a new one. - */ - free_conv_funcs(actual_num_fields, conv_funcs); - conv_funcs = create_conv_funcs( - converters, actual_num_fields, usecols_arr); - if (conv_funcs == NULL) { - goto error; - } - } - } } } From b7e02ddc4ed2106dc693b688bc6b84ba7da073f2 Mon Sep 17 00:00:00 2001 From: warren Date: Wed, 6 Jul 2022 00:21:57 -0400 Subject: [PATCH 13/14] Expand some code comments. --- numpy/core/src/multiarray/conversion_utils.c | 8 ++++++++ numpy/core/src/multiarray/textreading/rows.c | 7 +++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index bb4a14b6d7d3..a4914c9f144c 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -1167,6 +1167,14 @@ PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals) // The memory for the array is allocated with PyMem_Calloc. // The caller must free the memory with PyMem_FREE or PyMem_Free. // +// If the length of the sequence is 0, a non-NULL pointer is returned +// in *parr, but it must not be dereferenced. To quote the PyMem_Calloc +// documentation: +// +// Requesting zero elements or elements of size zero bytes returns a +// distinct non-NULL pointer if possible, as if PyMem_Calloc(1, 1) had +// been called instead. +// // Returns -1 with an exception set and with *parr set to NULL if the // conversion fails. // diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 14bb70b8c2aa..3cf41381701e 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -10,6 +10,7 @@ #include #include +#include #include "conversion_utils.h" #include "textreading/stream.h" @@ -32,8 +33,8 @@ * Call the user-defined usecols callable and create or update the * C array of Py_ssize_t integers. * - * On input, *p_usecols_arr must be either NULL or the result of - * a previous call to this function. If *p_usecols is not NULL, + * On input, *p_usecols_arr must be either NULL or the result of a + * previous call to this function. If *p_usecols_arr is not NULL, * *p_num_usecols must be the length of the array pointed to by * *p_usecols_arr. * @@ -112,6 +113,8 @@ get_usecols_arr_from_callable(PyObject *usecols_obj, Py_ssize_t n, /* * Create the array of converter functions from the Python converters. + * Elements of conv_funcs for which a corresponding converter has not + * been given will be NULL. */ static PyObject ** create_conv_funcs( From 28c32cadaf480cad872688c829fa06f323033058 Mon Sep 17 00:00:00 2001 From: warren Date: Wed, 6 Jul 2022 00:22:22 -0400 Subject: [PATCH 14/14] Split a test in test_loadtxt.py --- numpy/lib/tests/test_loadtxt.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index dbb66385740a..c51e6fececbd 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -226,7 +226,7 @@ def test_converters_negative_indices(): @pytest.mark.parametrize('usecols', [[0, -1], lambda n: [0, -1]]) -def test_converters_negative_indices_with_usecols(usecols): +def test_converters_neg_indices_usecols(usecols): txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) @@ -240,8 +240,10 @@ def test_converters_negative_indices_with_usecols(usecols): ) assert_equal(res, expected) - # Second test with variable number of rows: - res = np.loadtxt(StringIO('''0,1,2\n0,1,2,3,4'''), delimiter=",", + +@pytest.mark.parametrize('usecols', [[0, -1], lambda n: [0, -1]]) +def test_converters_neg_indices_usecols_varying_fields(usecols): + res = np.loadtxt(StringIO('0,1,2\n0,1,2,3,4'), delimiter=",", usecols=usecols, converters={-1: (lambda x: -1)}) assert_array_equal(res, [[0, -1], [0, -1]])