Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

ENH: lib: Allow usecols to be a callable in loadtxt(). #21800

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 15 commits into
base: main
Choose a base branch
Loading
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions 63 numpy/core/src/multiarray/conversion_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -1147,6 +1147,69 @@ PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals)
}


//
// Build a freshly allocated C array of Py_ssize_t from a Python sequence.
//
// Parameters:
//   * `seq` is the Python sequence to convert.
//   * `parr` receives the allocated array on success.  It is set to NULL
//     on entry, so it is NULL whenever the function fails.
//   * `seq_typeerror_text` is the format string of the TypeError raised
//     when the length of `seq` cannot be obtained.
//   * `element_typeerror_text` is the format string of the TypeError raised
//     when converting an element of `seq` to Py_ssize_t fails with a
//     TypeError.
//
// Both format strings are handed directly to PyErr_Format together with a
// single string argument (the `tp_name` of the offending object), so each
// must contain exactly one '%s' format code.
//
// NOTE: any failure of PySequence_Length is reported as a TypeError built
// from `seq_typeerror_text`.  For elements, only a TypeError is rewritten;
// other exceptions raised while fetching or converting an element (for
// example an OverflowError) are propagated unchanged.
//
// The array is allocated with PyMem_Calloc; the caller owns it and must
// release it with PyMem_FREE or PyMem_Free.
//
// Returns the length of the sequence on success.  On failure, returns -1
// with an exception set.
//
NPY_NO_EXPORT Py_ssize_t
PyArray_SeqToSsizeCArray(PyObject *seq, Py_ssize_t **parr,
                         char *seq_typeerror_text,
                         char *element_typeerror_text)
{
    Py_ssize_t *values = NULL;
    Py_ssize_t count;

    *parr = NULL;

    count = PySequence_Length(seq);
    if (count == -1) {
        PyErr_Format(PyExc_TypeError, seq_typeerror_text,
                     Py_TYPE(seq)->tp_name);
        return -1;
    }

    // Calloc checks the count * size multiplication for overflow.
    values = PyMem_Calloc(count, sizeof *values);
    if (values == NULL) {
        PyErr_NoMemory();
        return -1;
    }

    for (Py_ssize_t idx = 0; idx < count; ++idx) {
        PyObject *item = PySequence_GetItem(seq, idx);
        if (item == NULL) {
            goto fail;
        }

        Py_ssize_t converted = PyNumber_AsSsize_t(item, PyExc_OverflowError);
        if (error_converting(converted)) {
            // Only a TypeError gets the caller-supplied message; the type
            // name must be read before the reference to `item` is dropped.
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
                PyErr_Format(PyExc_TypeError, element_typeerror_text,
                             Py_TYPE(item)->tp_name);
            }
            Py_DECREF(item);
            goto fail;
        }
        Py_DECREF(item);
        values[idx] = converted;
    }

    *parr = values;
    return count;

fail:
    PyMem_Free(values);
    return -1;
}


/**
* WARNING: This flag is a bad idea, but was the only way to both
* 1) Support unpickling legacy pickles with object types.
Expand Down
5 changes: 5 additions & 0 deletions 5 numpy/core/src/multiarray/conversion_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ PyArray_IntpFromIndexSequence(PyObject *seq, npy_intp *vals, npy_intp maxvals);
NPY_NO_EXPORT int
PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals);

/*
 * Convert the Python sequence `seq` to a newly allocated C array of
 * Py_ssize_t, stored in `*parr`.
 *
 * `seq_typeerror_text` and `element_typeerror_text` are PyErr_Format format
 * strings, each containing one '%s' that is replaced by the type name of
 * the object that failed (the sequence itself, or one of its elements).
 *
 * Returns the sequence length on success.  On failure returns -1 with an
 * exception set and `*parr` set to NULL.  The array is allocated with
 * PyMem_Calloc and must be released by the caller with PyMem_Free.
 */
NPY_NO_EXPORT Py_ssize_t
PyArray_SeqToSsizeCArray(PyObject *seq, Py_ssize_t **parr,
char *seq_typeerror_text,
char *element_typeerror_text);

NPY_NO_EXPORT int
PyArray_TypestrConvert(int itemsize, int gentype);

Expand Down
116 changes: 4 additions & 112 deletions 116 numpy/core/src/multiarray/textreading/readtext.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,73 +13,7 @@

#include "textreading/parser_config.h"
#include "textreading/stream_pyobject.h"
#include "textreading/field_types.h"
#include "textreading/rows.h"
#include "textreading/str_to_int.h"


//
// Read rows from the stream `s` into a new array by delegating to
// read_rows().
//
// `dtype` is cast to a PyArray_Descr * without any type check, and a
// reference to it is held for the duration of the call, so it is expected
// to be a (non-NULL) numpy dtype instance.  The dtype is expanded into
// field types with field_types_create().
//
// The input is treated as "homogeneous" when that expansion yields exactly
// one field whose descriptor is `dtype` itself.
//
// `usecols` is either NULL or a C array holding `num_usecols` entries.
// If `usecols` is not NULL and the dtype is not homogeneous, `num_usecols`
// must equal the number of fields; otherwise a TypeError is raised.
//
// `pc`, `skiplines`, `max_rows` and `converters` are forwarded to
// read_rows() unchanged.
//
// Returns the array produced by read_rows() cast to PyObject *, or NULL if
// field_types_create() fails, the usecols check fails, or read_rows()
// returns NULL.
//
static PyObject *
_readtext_from_stream(stream *s,
        parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
        Py_ssize_t skiplines, Py_ssize_t max_rows,
        PyObject *converters, PyObject *dtype)
{
    PyArrayObject *result = NULL;
    field_type *fields = NULL;

    // Keep the descriptor alive while the rows are being read.
    PyArray_Descr *descr = (PyArray_Descr *)dtype;
    Py_INCREF(descr);

    Py_ssize_t nfields = field_types_create(descr, &fields);
    if (nfields >= 0) {
        // A single field that is the dtype itself means the input was not
        // structured, so the number of columns has to be discovered.
        bool homogeneous = (nfields == 1) && (fields[0].descr == descr);
        bool usecols_mismatch = (
                !homogeneous && usecols != NULL && num_usecols != nfields);

        if (usecols_mismatch) {
            PyErr_Format(PyExc_TypeError,
                    "If a structured dtype is used, the number of columns in "
                    "`usecols` must match the effective number of fields. "
                    "But %zd usecols were given and the number of fields is %zd.",
                    num_usecols, nfields);
        }
        else {
            result = read_rows(
                    s, max_rows, nfields, fields, pc,
                    num_usecols, usecols, skiplines, converters,
                    NULL, descr, homogeneous);
        }
    }

    // Common cleanup for the success path and every failure path.
    Py_XDECREF(descr);
    field_types_xclear(nfields, fields);
    return (PyObject *)result;
}


static int
Expand Down Expand Up @@ -205,8 +139,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
};
bool filelike = true;

PyObject *arr = NULL;

NPY_PREPARE_ARGPARSER;
if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames,
"file", NULL, &file,
Expand Down Expand Up @@ -257,45 +189,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
}
}

/*
* Parse usecols, the rest of NumPy has no clear helper for this, so do
* it here manually.
*/
Py_ssize_t num_usecols = -1;
Py_ssize_t *usecols = NULL;
if (usecols_obj != Py_None) {
num_usecols = PySequence_Length(usecols_obj);
if (num_usecols < 0) {
return NULL;
}
/* Calloc just to not worry about overflow */
usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
if (usecols == NULL) {
PyErr_NoMemory();
return NULL;
}
for (Py_ssize_t i = 0; i < num_usecols; i++) {
PyObject *tmp = PySequence_GetItem(usecols_obj, i);
if (tmp == NULL) {
PyMem_FREE(usecols);
return NULL;
}
usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError);
if (error_converting(usecols[i])) {
if (PyErr_ExceptionMatches(PyExc_TypeError)) {
PyErr_Format(PyExc_TypeError,
"usecols must be an int or a sequence of ints but "
"it contains at least one element of type '%s'",
Py_TYPE(tmp)->tp_name);
}
Py_DECREF(tmp);
PyMem_FREE(usecols);
return NULL;
}
Py_DECREF(tmp);
}
}

stream *s;
if (filelike) {
s = stream_python_file(file, encoding);
Expand All @@ -304,14 +197,13 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
s = stream_python_iterable(file, encoding);
}
if (s == NULL) {
PyMem_FREE(usecols);
return NULL;
}

arr = _readtext_from_stream(
s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype);
PyArrayObject *arr = read_rows(s, max_rows, &pc, usecols_obj, skiplines,
converters, NULL, dtype);

stream_close(s);
PyMem_FREE(usecols);
return arr;
return (PyObject *)arr;
}

Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.