numpy · WarrenWeckesser · Jun 20, 2022 · Jun 21, 2022 · Jun 23, 2022 · Jun 28, 2022
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
@@ -1147,6 +1147,69 @@ PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals)
 }


+//
+// Convert a Python sequence to a newly allocated C array of Py_ssize_t
+// integers.
+//
+// `seq` must be a Python sequence.
+//
+// `seq_typeerror_text` and `element_typeerror_text` must be provided.  Each
+// is passed to PyErr_Format as the format string together with exactly one
+// string argument, so each must contain one occurrence of the format code
+// '%s' and no other format codes.  The '%s' is replaced with the type name
+// of the object that failed the conversion.
+// * `seq_typeerror_text` is used if the attempt to get the length of `seq`
+//   fails with a TypeError.
+// * `element_typeerror_text` is used if the attempt to convert an element
+//   of `seq` to a Py_ssize_t fails with a TypeError.
+//
+// Any exception that is not a TypeError (for example one raised while
+// computing the length or fetching an item, or the OverflowError requested
+// from PyNumber_AsSsize_t) is left in place and propagated unchanged.
+//
+// On success, the function assigns the allocated memory to *parr, and
+// returns the length of the sequence.  This includes the empty sequence:
+// the return value is then 0 and *parr still holds an allocation that the
+// caller must release.
+//
+// The memory for the array is allocated with PyMem_Calloc.
+// The caller must free the memory with PyMem_FREE or PyMem_Free.
+//
+// Returns -1 with an exception set and with *parr set to NULL if the
+// conversion fails.
+//
+NPY_NO_EXPORT Py_ssize_t
+PyArray_SeqToSsizeCArray(PyObject *seq, Py_ssize_t **parr,
+                         char *seq_typeerror_text,
+                         char *element_typeerror_text)
+{
+    // Make the failure contract hold on every early return below.
+    *parr = NULL;
+    Py_ssize_t len = PySequence_Length(seq);
+    if (len == -1) {
+        // Only replace the message when the failure really is a type
+        // problem; other exceptions must not be masked as a TypeError.
+        // This mirrors the handling of element conversion errors below.
+        if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+            PyErr_Format(PyExc_TypeError, seq_typeerror_text,
+                         Py_TYPE(seq)->tp_name);
+        }
+        return -1;
+    }
+    // Calloc takes the element count and size separately, so no manual
+    // `len * sizeof(...)` multiplication is needed here.
+    Py_ssize_t *arr = PyMem_Calloc(len, sizeof(Py_ssize_t));
+    if (arr == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    for (Py_ssize_t i = 0; i < len; ++i) {
+        // PySequence_GetItem returns a new reference (or NULL with the
+        // exception already set, which is propagated as is).
+        PyObject *tmp = PySequence_GetItem(seq, i);
+        if (tmp == NULL) {
+            PyMem_Free(arr);
+            return -1;
+        }
+        arr[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
+        // NOTE(review): error_converting is defined elsewhere; presumably
+        // it tells a failed conversion apart from a legitimate value of -1.
+        if (error_converting(arr[i])) {
+            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+                PyErr_Format(PyExc_TypeError, element_typeerror_text,
+                             Py_TYPE(tmp)->tp_name);
+            }
+            Py_DECREF(tmp);
+            PyMem_Free(arr);
+            return -1;
+        }
+        Py_DECREF(tmp);
+    }
+    // Ownership of the buffer passes to the caller only on full success.
+    *parr = arr;
+    return len;
+}
+
+
 /**
 * WARNING: This flag is a bad idea, but was the only way to both
 *   1) Support unpickling legacy pickles with object types.

diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
@@ -48,6 +48,11 @@ PyArray_IntpFromIndexSequence(PyObject *seq, npy_intp *vals, npy_intp maxvals);
 NPY_NO_EXPORT int
 PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals);

+NPY_NO_EXPORT Py_ssize_t
+PyArray_SeqToSsizeCArray(PyObject *seq, Py_ssize_t **parr,
+                         char *seq_typeerror_text,
+                         char *element_typeerror_text);
+
 NPY_NO_EXPORT int
 PyArray_TypestrConvert(int itemsize, int gentype);


diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
@@ -13,73 +13,7 @@

 #include "textreading/parser_config.h"
 #include "textreading/stream_pyobject.h"
-#include "textreading/field_types.h"
 #include "textreading/rows.h"
-#include "textreading/str_to_int.h"
-
-
-//
-// `usecols` must point to a Python object that is Py_None or a 1-d contiguous
-// numpy array with data type int32.
-//
-// `dtype` must point to a Python object that is Py_None or a numpy dtype
-// instance.  If the latter, code and sizes must be arrays of length
-// num_dtype_fields, holding the flattened data field type codes and byte
-// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype,
-// but we do that in Python code.)
-//
-// If both `usecols` and `dtype` are not None, and the data type is compound,
-// then len(usecols) must equal num_dtype_fields.
-//
-// If `dtype` is given and it is compound, and `usecols` is None, then the
-// number of columns in the file must match the number of fields in `dtype`.
-//
-static PyObject *
-_readtext_from_stream(stream *s,
-        parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
-        Py_ssize_t skiplines, Py_ssize_t max_rows,
-        PyObject *converters, PyObject *dtype)
-{
-    PyArrayObject *arr = NULL;
-    PyArray_Descr *out_dtype = NULL;
-    field_type *ft = NULL;
-
-    /*
-     * If dtypes[0] is dtype the input was not structured and the result
-     * is considered "homogeneous" and we have to discover the number of
-     * columns/
-     */
-    out_dtype = (PyArray_Descr *)dtype;
-    Py_INCREF(out_dtype);
-
-    Py_ssize_t num_fields = field_types_create(out_dtype, &ft);
-    if (num_fields < 0) {
-        goto finish;
-    }
-    bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype;
-
-    if (!homogeneous && usecols != NULL && num_usecols != num_fields) {
-        PyErr_Format(PyExc_TypeError,
-                "If a structured dtype is used, the number of columns in "
-                "`usecols` must match the effective number of fields. "
-                "But %zd usecols were given and the number of fields is %zd.",
-                num_usecols, num_fields);
-        goto finish;
-    }
-
-    arr = read_rows(
-            s, max_rows, num_fields, ft, pc,
-            num_usecols, usecols, skiplines, converters,
-            NULL, out_dtype, homogeneous);
-    if (arr == NULL) {
-        goto finish;
-    }
-
-  finish:
-    Py_XDECREF(out_dtype);
-    field_types_xclear(num_fields, ft);
-    return (PyObject *)arr;
-}


 static int
@@ -205,8 +139,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
    };
    bool filelike = true;

-    PyObject *arr = NULL;
-
    NPY_PREPARE_ARGPARSER;
    if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames,
            "file", NULL, &file,
@@ -257,45 +189,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
        }
    }

-    /*
-     * Parse usecols, the rest of NumPy has no clear helper for this, so do
-     * it here manually.
-     */
-    Py_ssize_t num_usecols = -1;
-    Py_ssize_t *usecols = NULL;
-    if (usecols_obj != Py_None) {
-        num_usecols = PySequence_Length(usecols_obj);
-        if (num_usecols < 0) {
-            return NULL;
-        }
-        /* Calloc just to not worry about overflow */
-        usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
-        if (usecols == NULL) {
-            PyErr_NoMemory();
-            return NULL;
-        }
-        for (Py_ssize_t i = 0; i < num_usecols; i++) {
-            PyObject *tmp = PySequence_GetItem(usecols_obj, i);
-            if (tmp == NULL) {
-                PyMem_FREE(usecols);
-                return NULL;
-            }
-            usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
-            if (error_converting(usecols[i])) {
-                if (PyErr_ExceptionMatches(PyExc_TypeError)) {
-                    PyErr_Format(PyExc_TypeError,
-                            "usecols must be an int or a sequence of ints but "
-                            "it contains at least one element of type '%s'",
-                            Py_TYPE(tmp)->tp_name);
-                }
-                Py_DECREF(tmp);
-                PyMem_FREE(usecols);
-                return NULL;
-            }
-            Py_DECREF(tmp);
-        }
-    }
-
    stream *s;
    if (filelike) {
        s = stream_python_file(file, encoding);
@@ -304,14 +197,13 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
        s = stream_python_iterable(file, encoding);
    }
    if (s == NULL) {
-        PyMem_FREE(usecols);
        return NULL;
    }

-    arr = _readtext_from_stream(
-            s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype);
+    PyArrayObject *arr = read_rows(s, max_rows, &pc, usecols_obj, skiplines,
+                                   converters, NULL, dtype);
+
    stream_close(s);
-    PyMem_FREE(usecols);
-    return arr;
+    return (PyObject *)arr;
 }