Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 8aa73b7

Browse filesBrowse files
committed
gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()
Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions.
1 parent 5c4235c commit 8aa73b7
Copy full SHA for 8aa73b7

File tree

5 files changed

+209
-3
lines changed
Filter options

5 files changed

+209
-3
lines changed

‎Doc/c-api/unicode.rst

Copy file name to clipboardExpand all lines: Doc/c-api/unicode.rst
+30-3Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,9 +1551,17 @@ object.
15511551
On success, return ``0``.
15521552
On error, set an exception, leave the writer unchanged, and return ``-1``.
15531553
1554-
To use a different error handler than ``strict``,
1555-
:c:func:`PyUnicode_DecodeUTF8` can be used with
1556-
:c:func:`PyUnicodeWriter_WriteStr`.
1554+
See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
1555+
1556+
.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
1557+
1558+
Write the wide string *str* into *writer*.
1559+
1560+
*size* is the number of wide characters. If *size* is equal to ``-1``, call
1561+
``wcslen(str)`` to get the string length.
1562+
1563+
On success, return ``0``.
1564+
On error, set an exception, leave the writer unchanged, and return ``-1``.
15571565
15581566
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
15591567
@@ -1586,3 +1594,22 @@ object.
15861594
15871595
On success, return ``0``.
15881596
On error, set an exception, leave the writer unchanged, and return ``-1``.
1597+
1598+
.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
1599+
1600+
Decode the string *string* from UTF-8 with the *errors* error handler and write the
1601+
output into *writer*.
1602+
1603+
*length* is the string length in bytes. If *length* is equal to ``-1``, call
1604+
``strlen(string)`` to get the string length.
1605+
1606+
*errors* is an error handler name, such as ``"replace"``. If *errors* is
1607+
``NULL``, use the strict error handler.
1608+
1609+
If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
1610+
bytes on success.
1611+
1612+
On success, return ``0``.
1613+
On error, set an exception, leave the writer unchanged, and return ``-1``.
1614+
1615+
See also :c:func:`PyUnicodeWriter_WriteUTF8`.

‎Doc/whatsnew/3.14.rst

Copy file name to clipboardExpand all lines: Doc/whatsnew/3.14.rst
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,10 +291,12 @@ New Features
291291
* :c:func:`PyUnicodeWriter_Finish`.
292292
* :c:func:`PyUnicodeWriter_WriteChar`.
293293
* :c:func:`PyUnicodeWriter_WriteUTF8`.
294+
* :c:func:`PyUnicodeWriter_WriteWideChar`.
294295
* :c:func:`PyUnicodeWriter_WriteStr`.
295296
* :c:func:`PyUnicodeWriter_WriteRepr`.
296297
* :c:func:`PyUnicodeWriter_WriteSubstring`.
297298
* :c:func:`PyUnicodeWriter_Format`.
299+
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
298300

299301
(Contributed by Victor Stinner in :gh:`119182`.)
300302

‎Include/cpython/unicodeobject.h

Copy file name to clipboardExpand all lines: Include/cpython/unicodeobject.h
+10Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
459459
PyUnicodeWriter *writer,
460460
const char *str,
461461
Py_ssize_t size);
462+
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
463+
PyUnicodeWriter *writer,
464+
wchar_t *str,
465+
Py_ssize_t size);
462466

463467
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
464468
PyUnicodeWriter *writer,
@@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
475479
PyUnicodeWriter *writer,
476480
const char *format,
477481
...);
482+
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
483+
PyUnicodeWriter *writer,
484+
const char *string, /* UTF-8 encoded string */
485+
Py_ssize_t length, /* size of string */
486+
const char *errors, /* error handling */
487+
Py_ssize_t *consumed); /* bytes consumed */
478488

479489

480490
/* --- Private _PyUnicodeWriter API --------------------------------------- */

‎Modules/_testcapi/unicode.c

Copy file name to clipboardExpand all lines: Modules/_testcapi/unicode.c
+121Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,88 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
374374
}
375375

376376

377+
// Check PyUnicodeWriter_DecodeUTF8Stateful() with the "ignore" and "replace"
// error handlers on inputs that contain a 0xFF byte.
//
// Returns None when every writer call succeeds, NULL otherwise.
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }

    // The steps run in order; || stops at the first one that fails.
    int failed =
        PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0;
    if (failed) {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(writer);
    if (text == NULL) {
        return NULL;
    }
    // Expected: 0xFF dropped by "ignore", turned into "\xef\xbf\xbd" by "replace".
    assert(PyUnicode_EqualToUTF8(text, "ignore-replace\xef\xbf\xbd"));
    Py_DECREF(text);

    Py_RETURN_NONE;
}
408+
409+
410+
// Test the *consumed* output argument of PyUnicodeWriter_DecodeUTF8Stateful().
//
// Before each call, consumed is set to the sentinel 12345 so the assertions
// can tell that the function really wrote to it.
//
// Returns None when every expected-success writer call succeeds,
// NULL otherwise.
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;
    int res;

    // valid string
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);

    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails
    consumed = 12345;
    // Keep the call outside assert(): if assertions are compiled out
    // (NDEBUG), a call placed inside assert() would not be executed at all.
    res = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed);
    assert(res < 0);
    (void)res;  // silence "unused variable" when assert() expands to nothing
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 5);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    // The failed "invalid\xFF" write must not have left anything in the writer.
    assert(PyUnicode_EqualToUTF8(result, "text-more"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}
457+
458+
377459
static PyObject *
378460
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
379461
{
@@ -436,6 +518,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
436518
}
437519

438520

521+
// Check PyUnicodeWriter_WriteWideChar() with explicit sizes and with
// size -1, then compare the finished string against its UTF-8 encoding.
//
// Returns None when every writer call succeeds, NULL otherwise.
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }

    // The steps run in order; || stops at the first one that fails.
    // The first write passes size 8, so only "latin1=\xE9" is expected
    // in the output and the " IGNORED" tail is not.
    int failed =
        PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(writer, '.') < 0;
    if (failed) {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(writer);
    if (text == NULL) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(text,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(text);

    Py_RETURN_NONE;
}
555+
556+
439557
static PyMethodDef TestMethods[] = {
440558
{"unicode_new", unicode_new, METH_VARARGS},
441559
{"unicode_fill", unicode_fill, METH_VARARGS},
@@ -448,8 +566,11 @@ static PyMethodDef TestMethods[] = {
448566
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
449567
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
450568
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
569+
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
570+
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
451571
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
452572
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
573+
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
453574
{NULL},
454575
};
455576

‎Objects/unicodeobject.c

Copy file name to clipboardExpand all lines: Objects/unicodeobject.c
+46Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13500,6 +13500,52 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
1350013500
return res;
1350113501
}
1350213502

13503+
13504+
int
13505+
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
13506+
const char *string,
13507+
Py_ssize_t length,
13508+
const char *errors,
13509+
Py_ssize_t *consumed)
13510+
{
13511+
if (length < 0) {
13512+
length = strlen(string);
13513+
}
13514+
13515+
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13516+
Py_ssize_t old_pos = _writer->pos;
13517+
int res = unicode_decode_utf8_writer(_writer, string, length,
13518+
_Py_ERROR_UNKNOWN, errors, consumed);
13519+
if (res < 0) {
13520+
_writer->pos = old_pos;
13521+
if (consumed) {
13522+
*consumed = 0;
13523+
}
13524+
}
13525+
return res;
13526+
}
13527+
13528+
13529+
int
13530+
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
13531+
wchar_t *str,
13532+
Py_ssize_t size)
13533+
{
13534+
if (size < 0) {
13535+
size = wcslen(str);
13536+
}
13537+
PyObject *obj = PyUnicode_FromWideChar(str, size);
13538+
if (obj == NULL) {
13539+
return -1;
13540+
}
13541+
13542+
_PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer;
13543+
int res = _PyUnicodeWriter_WriteStr(_writer, obj);
13544+
Py_DECREF(obj);
13545+
return res;
13546+
}
13547+
13548+
1350313549
int
1350413550
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
1350513551
const char *str, Py_ssize_t len)

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.