Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 5b4165b

Browse filesBrowse files
ENH: lib: Allow usecols to be a callable in loadtxt().
1 parent 7fdb7a4 commit 5b4165b
Copy full SHA for 5b4165b

File tree

Expand file treeCollapse file tree

8 files changed

+239
-44
lines changed
Filter options
Expand file treeCollapse file tree

8 files changed

+239
-44
lines changed

‎numpy/core/setup.py

Copy file name to clipboardExpand all lines: numpy/core/setup.py
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,7 @@ def opts_if_msvc(build_cmd):
10171017
join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
10181018
join('src', 'multiarray', 'textreading', 'str_to_int.c'),
10191019
join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
1020+
join('src', 'multiarray', 'textreading', 'seq_to_ssize_c_array.c'),
10201021
]
10211022

10221023
#######################################################################

‎numpy/core/src/multiarray/textreading/readtext.c

Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/readtext.c
+21-30Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "common.h"
1212
#include "conversion_utils.h"
1313

14+
#include "textreading/seq_to_ssize_c_array.h"
1415
#include "textreading/parser_config.h"
1516
#include "textreading/stream_pyobject.h"
1617
#include "textreading/field_types.h"
@@ -19,8 +20,8 @@
1920

2021

2122
//
22-
// `usecols` must point to a Python object that is Py_None or a 1-d contiguous
23-
// numpy array with data type int32.
23+
// If the argument `usecols_obj` is not Py_None, it must be a callable.
24+
// In that case, the argument `usecols` must be NULL.
2425
//
2526
// `dtype` must point to a Python object that is Py_None or a numpy dtype
2627
// instance. If the latter, code and sizes must be arrays of length
@@ -37,6 +38,7 @@
3738
static PyObject *
3839
_readtext_from_stream(stream *s,
3940
parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
41+
PyObject *usecols_obj,
4042
Py_ssize_t skiplines, Py_ssize_t max_rows,
4143
PyObject *converters, PyObject *dtype)
4244
{
@@ -69,7 +71,8 @@ _readtext_from_stream(stream *s,
6971

7072
arr = read_rows(
7173
s, max_rows, num_fields, ft, pc,
72-
num_usecols, usecols, skiplines, converters,
74+
num_usecols, usecols, usecols_obj,
75+
skiplines, converters,
7376
NULL, out_dtype, homogeneous);
7477
if (arr == NULL) {
7578
goto finish;
@@ -263,38 +266,25 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
263266
*/
264267
Py_ssize_t num_usecols = -1;
265268
Py_ssize_t *usecols = NULL;
266-
if (usecols_obj != Py_None) {
269+
if (usecols_obj != Py_None && !PyCallable_Check(usecols_obj)) {
267270
num_usecols = PySequence_Length(usecols_obj);
268271
if (num_usecols < 0) {
269272
return NULL;
270273
}
271-
/* Calloc just to not worry about overflow */
272-
usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
274+
usecols = seq_to_ssize_c_array(num_usecols, usecols_obj,
275+
"usecols must be an int or a sequence of ints but "
276+
"it contains at least one element of type '%s'");
273277
if (usecols == NULL) {
274-
PyErr_NoMemory();
275278
return NULL;
276279
}
277-
for (Py_ssize_t i = 0; i < num_usecols; i++) {
278-
PyObject *tmp = PySequence_GetItem(usecols_obj, i);
279-
if (tmp == NULL) {
280-
PyMem_FREE(usecols);
281-
return NULL;
282-
}
283-
usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
284-
if (error_converting(usecols[i])) {
285-
if (PyErr_ExceptionMatches(PyExc_TypeError)) {
286-
PyErr_Format(PyExc_TypeError,
287-
"usecols must be an int or a sequence of ints but "
288-
"it contains at least one element of type '%s'",
289-
Py_TYPE(tmp)->tp_name);
290-
}
291-
Py_DECREF(tmp);
292-
PyMem_FREE(usecols);
293-
return NULL;
294-
}
295-
Py_DECREF(tmp);
296-
}
280+
/*
281+
* The given usecols_obj is a Python sequence; it has been processed to
282+
* give the usecols array, so reset the Python object to None.
283+
*/
284+
usecols_obj = Py_None;
297285
}
286+
assert(usecols == NULL || usecols_obj == Py_None);
287+
/* At this point, if usecols_obj is not Py_None, it must be a callable object. */
298288

299289
stream *s;
300290
if (filelike) {
@@ -304,14 +294,15 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
304294
s = stream_python_iterable(file, encoding);
305295
}
306296
if (s == NULL) {
307-
PyMem_FREE(usecols);
297+
PyMem_Free(usecols);
308298
return NULL;
309299
}
310300

311301
arr = _readtext_from_stream(
312-
s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype);
302+
s, &pc, num_usecols, usecols, usecols_obj, skiplines, max_rows,
303+
converters, dtype);
313304
stream_close(s);
314-
PyMem_FREE(usecols);
305+
PyMem_Free(usecols);
315306
return arr;
316307
}
317308

‎numpy/core/src/multiarray/textreading/rows.c

Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/rows.c
+74-1Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <string.h>
1212
#include <stdbool.h>
1313

14+
#include "textreading/seq_to_ssize_c_array.h"
1415
#include "textreading/stream.h"
1516
#include "textreading/tokenize.h"
1617
#include "textreading/conversions.h"
@@ -134,6 +135,11 @@ create_conv_funcs(
134135
* @param usecols An array of length `num_usecols` or NULL. If given indicates
135136
* which column is read for each individual row (negative columns are
136137
* accepted).
138+
* @param usecols_obj Either `Py_None` or a callable Python object. If
139+
* callable, the function must accept a single integer argument (the
140+
* number of columns in the file), and return a sequence of integers
141+
* (the sequence of column indices to use). When `usecols_obj` is not
142+
* `Py_None`, `usecols` MUST be NULL.
137143
* @param skiplines The number of lines to skip, these lines are ignored.
138144
* @param converters Python dictionary of converters. Finalizing converters
139145
* is difficult without information about the number of columns.
@@ -155,7 +161,8 @@ create_conv_funcs(
155161
NPY_NO_EXPORT PyArrayObject *
156162
read_rows(stream *s,
157163
npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types,
158-
parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
164+
parser_config *pconfig,
165+
Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj,
159166
Py_ssize_t skiplines, PyObject *converters,
160167
PyArrayObject *data_array, PyArray_Descr *out_descr,
161168
bool homogeneous)
@@ -179,6 +186,10 @@ read_rows(stream *s,
179186
/* We give a warning if max_rows is used and an empty line is encountered */
180187
bool give_empty_row_warning = max_rows >= 0;
181188

189+
// If the caller passes in a callable for usecols_obj, then they must
190+
// also pass in NULL for usecols.
191+
assert(usecols == NULL || usecols_obj == Py_None);
192+
182193
int ts_result = 0;
183194
tokenizer_state ts;
184195
if (tokenizer_init(&ts, pconfig) < 0) {
@@ -246,6 +257,56 @@ read_rows(stream *s,
246257
// We've deferred some of the initialization tasks to here,
247258
// because we've now read the first line, and we definitively
248259
// know how many fields (i.e. columns) we will be processing.
260+
261+
if (usecols_obj != Py_None) {
262+
// Call the Python function provided by the caller to
263+
// create the usecols array.
264+
PyObject *seq = PyObject_CallFunction(usecols_obj, "n",
265+
current_num_fields);
266+
if (PyErr_Occurred()) {
267+
// User-provided function failed.
268+
goto error;
269+
}
270+
// The user-defined usecols function must return a sequence
271+
// of integers.
272+
num_usecols = PySequence_Length(seq);
273+
if (num_usecols == -1) {
274+
// User-provided function did not return a sequence.
275+
PyErr_Clear();
276+
PyErr_Format(PyExc_TypeError,
277+
"the user-provided callable usecols must return a "
278+
"sequence of ints, but it returned an instance of "
279+
"type '%s'", Py_TYPE(seq)->tp_name);
280+
Py_DECREF(seq);
281+
goto error;
282+
}
283+
284+
if (!homogeneous && num_field_types != num_usecols) {
285+
// A structured dtype was provided, and the length of
286+
// the sequence returned by the user-provided function
287+
// does not equal the number of fields
288+
// in the dtype.
289+
Py_DECREF(seq);
290+
PyErr_Format(PyExc_RuntimeError,
291+
"length of the sequence returned by the callable "
292+
"usecols (%d) does not equal the number of fields "
293+
"in the given dtype (%d)",
294+
num_usecols, num_field_types);
295+
goto error;
296+
}
297+
298+
// Convert the sequence to a C array of Py_ssize_t ints.
299+
usecols = seq_to_ssize_c_array(num_usecols, seq,
300+
"the user-provided callable usecols must return a "
301+
"sequence of ints, but it returned a sequence "
302+
"containing at least one occurrence of type '%s'");
303+
Py_DECREF(seq);
304+
if (usecols == NULL) {
305+
goto error;
306+
}
307+
actual_num_fields = num_usecols;
308+
}
309+
249310
if (actual_num_fields == -1) {
250311
actual_num_fields = current_num_fields;
251312
}
@@ -431,6 +492,12 @@ read_rows(stream *s,
431492
data_ptr += row_size;
432493
}
433494

495+
if (usecols_obj != Py_None) {
496+
// This function owns usecols if a callable usecols_obj was given.
497+
PyMem_Free(usecols);
498+
usecols = NULL; // An overabundance of caution...
499+
}
500+
434501
tokenizer_clear(&ts);
435502
if (conv_funcs != NULL) {
436503
for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
@@ -479,6 +546,12 @@ read_rows(stream *s,
479546
return data_array;
480547

481548
error:
549+
if (usecols_obj != Py_None) {
550+
// If the error occurred early enough in the function, we might
551+
// not have allocated usecols yet, but that's OK, because we know
552+
// that usecols is NULL in that case.
553+
PyMem_Free(usecols);
554+
}
482555
if (conv_funcs != NULL) {
483556
for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
484557
Py_XDECREF(conv_funcs[i]);

‎numpy/core/src/multiarray/textreading/rows.h

Copy file name to clipboardExpand all lines: numpy/core/src/multiarray/textreading/rows.h
+2-1Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
NPY_NO_EXPORT PyArrayObject *
1515
read_rows(stream *s,
1616
npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types,
17-
parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols,
17+
parser_config *pconfig,
18+
Py_ssize_t num_usecols, Py_ssize_t *usecols, PyObject *usecols_obj,
1819
Py_ssize_t skiplines, PyObject *converters,
1920
PyArrayObject *data_array, PyArray_Descr *out_descr,
2021
bool homogeneous);
+52Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#define PY_SSIZE_T_CLEAN
2+
#include <Python.h>
3+
4+
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
5+
#define _MULTIARRAYMODULE
6+
#include "numpy/arrayobject.h" // For NPY_NO_EXPORT
7+
8+
//
9+
// Convert a Python sequence to a C array of Py_ssize_t integers.
10+
//
11+
// `seq` must be a Python sequence of length `len`. It is assumed that the
12+
// caller has already checked the length of the sequence, so the length is an
13+
// argument instead of being inferred from `seq` itself.
14+
//
15+
// `errtext` must be provided, and it must contain one occurrence of the
16+
// format code sequence '%s'. This text is used as the text of the TypeError
17+
// that is raised when an element of `seq` is found that cannot be converted
18+
// to a Py_ssize_t integer. The '%s' format code will be replaced with
19+
// the type of the object that failed the conversion.
20+
//
21+
// Returns NULL with an exception set if the conversion fails.
22+
//
23+
// The memory for the array is allocated with PyMem_Calloc.
24+
// The caller must free the memory with PyMem_FREE or PyMem_Free.
25+
//
26+
NPY_NO_EXPORT Py_ssize_t *
seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *errtext)
{
    /* Zero-initialized buffer with one slot per element of `seq`. */
    Py_ssize_t *values = PyMem_Calloc(len, sizeof(Py_ssize_t));
    if (values == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    Py_ssize_t pos = 0;
    while (pos < len) {
        PyObject *item = PySequence_GetItem(seq, pos);
        if (item == NULL) {
            /* The exception set by PySequence_GetItem is propagated as is. */
            goto fail;
        }

        Py_ssize_t converted = PyNumber_AsSsize_t(item, PyExc_OverflowError);
        values[pos] = converted;
        if (converted == -1 && PyErr_Occurred()) {
            /*
             * Only a TypeError is replaced with the caller-supplied message
             * (formatted with the offending element's type name); any other
             * exception, e.g. OverflowError, is left untouched.
             */
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
                PyErr_Format(PyExc_TypeError, errtext, Py_TYPE(item)->tp_name);
            }
            Py_DECREF(item);
            goto fail;
        }

        Py_DECREF(item);
        pos++;
    }
    return values;

fail:
    /* Shared failure exit: release the partially filled array. */
    PyMem_Free(values);
    return NULL;
}
+6Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#ifndef SEQ_TO_SSIZE_C_ARRAY_H
2+
#define SEQ_TO_SSIZE_C_ARRAY_H
3+
4+
/*
 * Convert the Python sequence `seq`, of length `len`, to a newly allocated
 * C array of Py_ssize_t values.
 *
 * `errtext` is the message of the TypeError raised when an element cannot be
 * converted to an integer; it must contain exactly one '%s', which is
 * replaced with the type name of the offending element.
 *
 * Returns NULL with a Python exception set on failure. On success the caller
 * owns the returned array and must release it with PyMem_Free.
 */
Py_ssize_t *seq_to_ssize_c_array(Py_ssize_t len, PyObject *seq, char *errtext);
5+
6+
#endif

‎numpy/lib/npyio.py

Copy file name to clipboardExpand all lines: numpy/lib/npyio.py
+28-4Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -888,9 +888,10 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
888888
read_dtype_via_object_chunks = dtype
889889
dtype = np.dtype(object)
890890

891-
if usecols is not None:
892-
# Allow usecols to be a single int or a sequence of ints, the C-code
893-
# handles the rest
891+
if usecols is not None and not callable(usecols):
892+
# If usecols is not callable, it must be an int or a sequence of ints.
893+
# Process usecols so that when it is not a callable, it is a list; the
894+
# C code will handle the rest of the validation.
894895
try:
895896
usecols = list(usecols)
896897
except TypeError:
@@ -1099,7 +1100,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
10991100
Default: None.
11001101
skiprows : int, optional
11011102
Skip the first `skiprows` lines, including comments; default: 0.
1102-
usecols : int or sequence, optional
1103+
usecols : int, sequence of ints, or callable, optional
11031104
Which columns to read, with 0 being the first. For example,
11041105
``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
11051106
The default, None, results in all columns being read.
@@ -1108,6 +1109,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
11081109
When a single column has to be read it is possible to use
11091110
an integer instead of a tuple. E.g ``usecols = 3`` reads the
11101111
fourth column the same way as ``usecols = (3,)`` would.
1112+
1113+
.. versionchanged:: 1.24.0
1114+
`usecols` may be a callable function that accepts a single
1115+
integer argument that gives the number of columns in the file.
1116+
The callable must return a sequence of integers that indicate
1117+
which columns to include in the output array.
1118+
11111119
unpack : bool, optional
11121120
If True, the returned array is transposed, so that arguments may be
11131121
unpacked using ``x, y, z = loadtxt(...)``. When used with a
@@ -1272,6 +1280,22 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
12721280
>>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"')
12731281
array('Hello, my name is "Monty"!', dtype='<U26')
12741282
1283+
The parameter `usecols` can be a callable function. This example shows
1284+
how `usecols` can be used to skip the second column of the input file.
1285+
1286+
>>> s1 = StringIO('10.25 ABC 1.5 3.5\n12.50 XYZ 2.5 8.0')
1287+
>>> np.loadtxt(s1, usecols=lambda n: [0] + list(range(2, n)))
1288+
array([[10.25, 1.5 , 3.5 ],
1289+
[12.5 , 2.5 , 8. ]])
1290+
1291+
The caller does not have to know the number of columns in advance.
1292+
The same example works with this input, which has more columns.
1293+
1294+
>>> s2 = StringIO('10.25 ABC 1.5 3.5 5.5\n12.50 XYZ 2.5 8.0 9.5')
1295+
>>> np.loadtxt(s2, usecols=lambda n: [0] + list(range(2, n)))
1296+
array([[10.25, 1.5 , 3.5 , 5.5 ],
1297+
[12.5 , 2.5 , 8. , 9.5 ]])
1298+
12751299
"""
12761300

12771301
if like is not None:

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.