numpy
diff --git a/‎numpy/core/setup.py
Copy file name to clipboardExpand all lines: numpy/core/setup.py
+1Lines changed: 1 addition & 0 deletions b/‎numpy/core/setup.py
Copy file name to clipboardExpand all lines: numpy/core/setup.py
+1Lines changed: 1 addition & 0 deletions
diff --git a/‎numpy/core/src/multiarray/textreading/readtext.c
Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/readtext.c
+21-30Lines changed: 21 additions & 30 deletions b/‎numpy/core/src/multiarray/textreading/readtext.c
Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/readtext.c
+21-30Lines changed: 21 additions & 30 deletions
diff --git a/‎numpy/core/src/multiarray/textreading/rows.c
Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/rows.c
+74-1Lines changed: 74 additions & 1 deletion b/‎numpy/core/src/multiarray/textreading/rows.c
Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/rows.c
+74-1Lines changed: 74 additions & 1 deletion
diff --git a/‎numpy/core/src/multiarray/textreading/rows.h
Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/rows.h
+2-1Lines changed: 2 additions & 1 deletion b/‎numpy/core/src/multiarray/textreading/rows.h
Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/rows.h
+2-1Lines changed: 2 additions & 1 deletion
diff --git a/‎numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c
Copy file name to clipboard
+52Lines changed: 52 additions & 0 deletions b/‎numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.c
Copy file name to clipboard
+52Lines changed: 52 additions & 0 deletions
diff --git a/‎numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h
Copy file name to clipboard
+6Lines changed: 6 additions & 0 deletions b/‎numpy/core/src/multiarray/textreading/seq_to_ssize_c_array.h
Copy file name to clipboard
+6Lines changed: 6 additions & 0 deletions
diff --git a/‎numpy/lib/npyio.py
Copy file name to clipboardExpand all lines: numpy/lib/npyio.py
+28-4Lines changed: 28 additions & 4 deletions b/‎numpy/lib/npyio.py
Copy file name to clipboardExpand all lines: numpy/lib/npyio.py
+28-4Lines changed: 28 additions & 4 deletions
@@ -1017,6 +1017,7 @@ def opts_if_msvc(build_cmd):
             join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
             join('src', 'multiarray', 'textreading', 'str_to_int.c'),
             join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
+            join('src', 'multiarray', 'textreading', 'seq_to_ssize_c_array.c'),
             ]
 
     #######################################################################
 
@@ -11,6 +11,7 @@
 #include "common.h"
 #include "conversion_utils.h"
 
+#include "textreading/seq_to_ssize_c_array.h"
 #include "textreading/parser_config.h"
 #include "textreading/stream_pyobject.h"
 #include "textreading/field_types.h"
@@ -19,8 +20,8 @@
 
 
 //
-// `usecols` must point to a Python object that is Py_None or a 1-d contiguous
-// numpy array with data type int32.
+// If the argument `usecols_obj` is not Py_None, it must be a callable.
+// In that case, the argument `usecols` must be NULL.
 //
 // `dtype` must point to a Python object that is Py_None or a numpy dtype
 // instance.  If the latter, code and sizes must be arrays of length
@@ -37,6 +38,7 @@
 static PyObject *
 _readtext_from_stream(stream *s,
         parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
+        PyObject *usecols_obj,
         Py_ssize_t skiplines, Py_ssize_t max_rows,
         PyObject *converters, PyObject *dtype)
 {
@@ -69,7 +71,8 @@ _readtext_from_stream(stream *s,
 
     arr = read_rows(
             s, max_rows, num_fields, ft, pc,
-            num_usecols, usecols, skiplines, converters,
+            num_usecols, usecols, usecols_obj,
+            skiplines, converters,
             NULL, out_dtype, homogeneous);
     if (arr == NULL) {
         goto finish;
@@ -263,38 +266,25 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
      */
     Py_ssize_t num_usecols = -1;
     Py_ssize_t *usecols = NULL;
-    if (usecols_obj != Py_None) {
+    if (usecols_obj != Py_None && !PyCallable_Check(usecols_obj)) {
         num_usecols = PySequence_Length(usecols_obj);
         if (num_usecols < 0) {
             return NULL;
         }
-        /* Calloc just to not worry about overflow */
-        usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
+        usecols = seq_to_ssize_c_array(num_usecols, usecols_obj,
+                    "usecols must be an int or a sequence of ints but "
+                    "it contains at least one element of type '%s'");
         if (usecols == NULL) {
-            PyErr_NoMemory();
             return NULL;
         }
-        for (Py_ssize_t i = 0; i < num_usecols; i++) {
-            PyObject *tmp = PySequence_GetItem(usecols_obj, i);
-            if (tmp == NULL) {
-                PyMem_FREE(usecols);
-                return NULL;
-            }
-            usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
-            if (error_converting(usecols[i])) {
-                if (PyErr_ExceptionMatches(PyExc_TypeError)) {
-                    PyErr_Format(PyExc_TypeError,
-                            "usecols must be an int or a sequence of ints but "
-                            "it contains at least one element of type '%s'",
-                            Py_TYPE(tmp)->tp_name);
-                }
-                Py_DECREF(tmp);
-                PyMem_FREE(usecols);
-                return NULL;
-            }
-            Py_DECREF(tmp);
-        }
+        /*
+         *  The given usecols_obj is a Python sequence; it has been processed to
+         *  give the usecols array, so reset the Python object to None.
+         */
+        usecols_obj = Py_None;
     }
+    assert(usecols == NULL || usecols_obj == Py_None);
+    /* At this point, if usecols_obj is not Py_None, it must be a callable object. */
 
     stream *s;
     if (filelike) {
@@ -304,14 +294,15 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
         s = stream_python_iterable(file, encoding);
     }
     if (s == NULL) {
-        PyMem_FREE(usecols);
+        PyMem_Free(usecols);
         return NULL;
     }
 
     arr = _readtext_from_stream(
-            s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype);
+            s, &pc, num_usecols, usecols, usecols_obj, skiplines, max_rows,
+            converters, dtype);
     stream_close(s);
-    PyMem_FREE(usecols);
+    PyMem_Free(usecols);
     return arr;
 }
 
@@ -11,6 +11,7 @@
 #include <string.h>
 #include <stdbool.h>
 
+#include "textreading/seq_to_ssize_c_array.h"
 #include "textreading/stream.h"
 #include "textreading/tokenize.h"
 #include "textreading/conversions.h"
@@ -134,6 +135,11 @@ create_conv_funcs(
  * @param usecols An array of length `num_usecols` or NULL.  If given indicates
  *        which column is read for each individual row (negative columns are
  *        accepted).
+ * @param usecols_obj Either `Py_None` or a callable Python object.  If
+ *        callable, the function must accept a single integer argument (the
+ *        number of columns in the file), and return a sequence of integers
+ *        (the sequence of column indices to use).  When `usecols_obj` is not
+ *        `Py_None`, `usecols` MUST be NULL.
  * @param skiplines The number of lines to skip, these lines are ignored.
  * @param converters Python dictionary of converters.  Finalizing converters
  *        is difficult without information about the number of columns.
@@ -155,7 +161,8 @@ create_conv_funcs(
 NPY_NO_EXPORT PyArrayObject *
 read_rows(stream *s,
         npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types,
-        parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
+        parser_config *pconfig,
+        Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj,
         Py_ssize_t skiplines, PyObject *converters,
         PyArrayObject *data_array, PyArray_Descr *out_descr,
         bool homogeneous)
@@ -179,6 +186,10 @@ read_rows(stream *s,
     /* We give a warning if max_rows is used and an empty line is encountered */
     bool give_empty_row_warning = max_rows >= 0;
 
+    // If the caller passes in a callable for usecols_obj, then they must
+    // also pass in NULL for usecols.
+    assert(usecols == NULL || usecols_obj == Py_None);
+
     int ts_result = 0;
     tokenizer_state ts;
     if (tokenizer_init(&ts, pconfig) < 0) {
@@ -246,6 +257,56 @@ read_rows(stream *s,
             // We've deferred some of the initialization tasks to here,
             // because we've now read the first line, and we definitively
             // know how many fields (i.e. columns) we will be processing.
+
+            if (usecols_obj != Py_None) {
+                // usecols was given as a callable: call it with the number
+                // of fields in this row to obtain the column indices.
+                PyObject *seq = PyObject_CallFunction(usecols_obj, "n",
+                                                      current_num_fields);
+                if (seq == NULL) {
+                    // User-provided function failed; its exception is set.
+                    goto error;
+                }
+                // The user-defined usecols function must return a sequence
+                // of integers.
+                num_usecols = PySequence_Length(seq);
+                if (num_usecols == -1) {
+                    // No usable length: report it as "not a sequence".
+                    PyErr_Clear();
+                    PyErr_Format(PyExc_TypeError,
+                        "the user-provided callable usecols must return a "
+                        "sequence of ints, but it returned an instance of "
+                        "type '%s'", Py_TYPE(seq)->tp_name);
+                    Py_DECREF(seq);
+                    goto error;
+                }
+
+                if (!homogeneous && num_field_types != num_usecols) {
+                    // A structured dtype was provided, and the sequence
+                    // returned by the user-provided function does not have
+                    // one entry per dtype field.  Both counts are
+                    // Py_ssize_t, so they are formatted with %zd (not %d).
+                    Py_DECREF(seq);
+                    PyErr_Format(PyExc_RuntimeError,
+                        "length of the sequence returned by the callable "
+                        "usecols (%zd) does not equal the number of fields "
+                        "in the given dtype (%zd)",
+                        num_usecols, num_field_types);
+                    goto error;
+                }
+
+                // Convert the sequence to a C array of Py_ssize_t ints.
+                usecols = seq_to_ssize_c_array(num_usecols, seq,
+                        "the user-provided callable usecols must return a "
+                        "sequence of ints, but it returned a sequence "
+                        "containing at least one occurrence of type '%s'");
+                Py_DECREF(seq);
+                if (usecols == NULL) {
+                    goto error;
+                }
+                actual_num_fields = num_usecols;
+            }
+
             if (actual_num_fields == -1) {
                 actual_num_fields = current_num_fields;
             }
@@ -431,6 +492,12 @@ read_rows(stream *s,
         data_ptr += row_size;
     }
 
+    if (usecols_obj != Py_None) {
+        // This function owns usecols if a callable usecols_obj was given.
+        PyMem_Free(usecols);
+        usecols = NULL;  // An overabundance of caution...
+    }
+
     tokenizer_clear(&ts);
     if (conv_funcs != NULL) {
         for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
@@ -479,6 +546,12 @@ read_rows(stream *s,
     return data_array;
 
   error:
+    if (usecols_obj != Py_None) {
+        // If the error occurred early enough in the function, we might
+        // not have allocated usecols yet, but that's OK, because we know
+        // that usecols is NULL in that case.
+        PyMem_Free(usecols);
+    }
     if (conv_funcs != NULL) {
         for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
             Py_XDECREF(conv_funcs[i]);
 
@@ -14,7 +14,8 @@
 NPY_NO_EXPORT PyArrayObject *
 read_rows(stream *s,
         npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types,
-        parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
+        parser_config *pconfig,
+        Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj,
         Py_ssize_t skiplines, PyObject *converters,
         PyArrayObject *data_array, PyArray_Descr *out_descr,
         bool homogeneous);
 
@@ -0,0 +1,52 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"  // For NPY_NO_EXPORT
+
+//
+// Build a C array of Py_ssize_t values from the items of a Python sequence.
+//
+// `len` is the number of items to read from `seq`.  The caller is expected
+// to have obtained it already, so it is passed in rather than recomputed.
+//
+// `errtext` is a format string that contains exactly one '%s'.  When an
+// item is rejected with a TypeError, that error is replaced by a TypeError
+// with this text, with '%s' filled in by the item's type name.  Any other
+// exception raised while reading or converting an item is left as is.
+//
+// Returns a PyMem_Calloc'ed array that the caller must release with
+// PyMem_Free or PyMem_FREE, or NULL with an exception set on failure.
+//
+NPY_NO_EXPORT Py_ssize_t *
+seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *errtext)
+{
+    Py_ssize_t *result = PyMem_Calloc(len, sizeof(Py_ssize_t));
+    if (!result) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    for (Py_ssize_t idx = 0; idx < len; idx++) {
+        PyObject *item = PySequence_GetItem(seq, idx);
+        if (!item) {
+            goto fail;
+        }
+        Py_ssize_t value = PyNumber_AsSsize_t(item, PyExc_OverflowError);
+        if (value == -1 && PyErr_Occurred()) {
+            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+                // Swap the generic TypeError for the caller's message.
+                PyErr_Format(PyExc_TypeError, errtext, Py_TYPE(item)->tp_name);
+            }
+            Py_DECREF(item);
+            goto fail;
+        }
+        Py_DECREF(item);
+        result[idx] = value;
+    }
+    return result;
+
+  fail:
+    PyMem_Free(result);
+    return NULL;
+}
@@ -0,0 +1,6 @@
+#ifndef SEQ_TO_SSIZE_C_ARRAY_H
+#define SEQ_TO_SSIZE_C_ARRAY_H
+
+Py_ssize_t *seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *errtext);
+
+#endif
@@ -888,9 +888,10 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
         read_dtype_via_object_chunks = dtype
         dtype = np.dtype(object)
 
-    if usecols is not None:
-        # Allow usecols to be a single int or a sequence of ints, the C-code
-        # handles the rest
+    if usecols is not None and not callable(usecols):
+        # If usecols is not callable, it must be an int or a sequence of ints.
+        # Process usecols so that when it is not a callable, it is a list; the
+        # C code will handle the rest of the validation.
         try:
             usecols = list(usecols)
         except TypeError:
@@ -1099,7 +1100,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         Default: None.
     skiprows : int, optional
         Skip the first `skiprows` lines, including comments; default: 0.
-    usecols : int or sequence, optional
+    usecols : int, sequence of ints, or callable, optional
         Which columns to read, with 0 being the first. For example,
         ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
         The default, None, results in all columns being read.
@@ -1108,6 +1109,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             When a single column has to be read it is possible to use
             an integer instead of a tuple. E.g ``usecols = 3`` reads the
             fourth column the same way as ``usecols = (3,)`` would.
+
+        .. versionchanged:: 1.24.0
+            `usecols` may be a callable function that accepts a single
+            integer argument that gives the number of columns in the file.
+            The callable must return a sequence of integers that indicate
+            which columns to include in the output array.
+
     unpack : bool, optional
         If True, the returned array is transposed, so that arguments may be
         unpacked using ``x, y, z = loadtxt(...)``.  When used with a
@@ -1272,6 +1280,22 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"')
     array('Hello, my name is "Monty"!', dtype='<U26')
 
+    The parameter `usecols` can be a callable function.  This example shows
+    how `usecols` can be used to skip the second column of the input file.
+
+    >>> s1 = StringIO('10.25 ABC 1.5 3.5\n12.50 XYZ 2.5 8.0')
+    >>> np.loadtxt(s1, usecols=lambda n: [0] + list(range(2, n)))
+    array([[10.25,  1.5 ,  3.5 ],
+           [12.5 ,  2.5 ,  8.  ]])
+
+    The caller does not have to know the number of columns in advance.
+    The same example works with this input, which has more columns.
+
+    >>> s2 = StringIO('10.25 ABC 1.5 3.5 5.5\n12.50 XYZ 2.5 8.0 9.5')
+    >>> np.loadtxt(s2, usecols=lambda n: [0] + list(range(2, n)))
+    array([[10.25,  1.5 ,  3.5 ,  5.5 ],
+           [12.5 ,  2.5 ,  8.  ,  9.5 ]])
+
     """
 
     if like is not None:
Original file line number	Diff line number	Diff line change
`@@ -1017,6 +1017,7 @@ def opts_if_msvc(build_cmd):`
`1017`	`1017`	`join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),`
`1018`	`1018`	`join('src', 'multiarray', 'textreading', 'str_to_int.c'),`
`1019`	`1019`	`join('src', 'multiarray', 'textreading', 'tokenize.cpp'),`
	`1020`	`+ join('src', 'multiarray', 'textreading', 'seq_to_ssize_c_array.c'),`
`1020`	`1021`	`]`
`1021`	`1022`
`1022`	`1023`	`#######################################################################`