-
-
Notifications
You must be signed in to change notification settings - Fork 32k
gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() #120639
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8aa73b7
788a85f
e67a8b4
de56475
e48eec7
75fa8ba
3f284f8
1e018d2
6f29c53
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -374,6 +374,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args)) | |
} | ||
|
||
|
||
// Test PyUnicodeWriter_DecodeUTF8Stateful() with a NULL "consumed" argument:
// decode three byte strings containing invalid or truncated UTF-8 using the
// "ignore" and "replace" error handlers, separated by '-', then compare the
// finished string against the expected UTF-8 bytes.
// Returns None on success, or NULL if any writer call fails.
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *w = PyUnicodeWriter_Create(0);
    if (!w) {
        return NULL;
    }

    // The || chain short-circuits: the writes run strictly in order and
    // evaluation stops at the first call that reports an error.
    int failed =
        PyUnicodeWriter_DecodeUTF8Stateful(w, "ign\xFFore", -1, "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(w, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(w, "replace\xFF", -1, "replace", NULL) < 0
        || PyUnicodeWriter_WriteChar(w, '-') < 0
        // incomplete trailing UTF-8 sequence
        || PyUnicodeWriter_DecodeUTF8Stateful(w, "incomplete\xC3", -1, "replace", NULL) < 0;
    if (failed) {
        PyUnicodeWriter_Discard(w);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(w);
    if (!text) {
        return NULL;
    }
    // "\xef\xbf\xbd" is U+FFFD encoded as UTF-8: one per byte that the
    // "replace" handler substituted.
    assert(PyUnicode_EqualToUTF8(text,
                                 "ignore-replace\xef\xbf\xbd"
                                 "-incomplete\xef\xbf\xbd"));
    Py_DECREF(text);

    Py_RETURN_NONE;
}
|
||
|
||
// Test the "consumed" output argument of PyUnicodeWriter_DecodeUTF8Stateful().
//
// Before every call, "consumed" is reset to the sentinel 12345 so that a call
// which leaves it untouched is caught by the following assertion.
// Returns None on success, or NULL if a writer call fails unexpectedly.
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;
    int res;

    // valid string
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // non-ASCII
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 6);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails
    consumed = 12345;
    // The call is made outside assert() so that it is executed regardless of
    // whether assertions are compiled in. (Review note: assertions are always
    // built in _testcapi.c because parts.h undefines NDEBUG early; keeping the
    // side effect out of assert() avoids relying on that.)
    res = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed);
    assert(res < 0);
    (void)res;  // silence "unused variable" if assert() expands to nothing
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    // The ignored invalid byte still counts as consumed: 4 + 1 bytes.
    assert(consumed == 5);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // incomplete trailing UTF-8 sequence
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    // With a non-NULL "consumed", the truncated trailing byte is left
    // unconsumed: only the 10 ASCII bytes are counted.
    assert(consumed == 10);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    // The failed "invalid\xFF" decode contributes nothing to the result.
    assert(PyUnicode_EqualToUTF8(result,
                                 "text-\xC3\xA9-\xE2\x82\xAC-"
                                 "more-incomplete"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}
|
||
|
||
static PyObject * | ||
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) | ||
{ | ||
|
@@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args | |
} | ||
|
||
|
||
// Test PyUnicodeWriter_WriteWideChar() with an explicit length and with
// length -1, then compare the finished string against the expected UTF-8
// bytes. Returns None on success, or NULL if any writer call fails.
//
// NOTE(review): a reviewer flagged that surrogate pairs, non-BMP characters,
// and writes following a UCS2 or UCS4 string are not covered by this test.
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *w = PyUnicodeWriter_Create(0);
    if (!w) {
        return NULL;
    }

    // The || chain short-circuits: the writes run strictly in order and
    // evaluation stops at the first call that reports an error.
    int failed =
        // explicit length 8: only "latin1=\xE9" is written, " IGNORED" is not
        PyUnicodeWriter_WriteWideChar(w, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(w, L"-", 1) < 0
        // length -1: the whole wide string is written
        || PyUnicodeWriter_WriteWideChar(w, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(w, '.') < 0;
    if (failed) {
        PyUnicodeWriter_Discard(w);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(w);
    if (!text) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(text,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(text);

    Py_RETURN_NONE;
}
|
||
|
||
static PyMethodDef TestMethods[] = { | ||
{"unicode_new", unicode_new, METH_VARARGS}, | ||
{"unicode_fill", unicode_fill, METH_VARARGS}, | ||
|
@@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = { | |
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS}, | ||
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS}, | ||
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS}, | ||
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS}, | ||
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS}, | ||
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS}, | ||
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS}, | ||
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS}, | ||
{NULL}, | ||
}; | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.