From c85e59c423383f3ad0f650ab5d1b9d26ed342e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:30:28 +0100 Subject: [PATCH 01/11] fix `codecs.backslashreplace_errors` --- Lib/test/test_capi/test_codecs.py | 7 +- Python/codecs.c | 131 +++++++++++++++++------------- 2 files changed, 78 insertions(+), 60 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index a557e35e68915d..9baede2293b224 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -847,18 +847,19 @@ def test_codec_xmlcharrefreplace_errors_handler(self): def test_codec_backslashreplace_errors_handler(self): handler = _testcapi.codec_backslashreplace_errors - self.do_test_codec_errors_handler(handler, self.all_unicode_errors) + self.do_test_codec_errors_handler(handler, self.all_unicode_errors, + safe=True) def test_codec_namereplace_errors_handler(self): handler = _testlimitedcapi.codec_namereplace_errors self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) - def do_test_codec_errors_handler(self, handler, exceptions): + def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False): at_least_one = False for exc in exceptions: # See https://github.com/python/cpython/issues/123378 and related # discussion and issues for details. - if self._exception_may_crash(exc): + if not safe and self._exception_may_crash(exc): continue at_least_one = True diff --git a/Python/codecs.c b/Python/codecs.c index 2cb3875db35058..93c3558733ecf7 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -853,109 +853,126 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { - PyObject *object; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; - Py_UCS1 *outp; - int ressize; - Py_UCS4 c; - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - const unsigned char *p; - if (PyUnicodeDecodeError_GetStart(exc, &start)) + Py_ssize_t start, end; + if (PyUnicodeDecodeError_GetStart(exc, &start)) { return NULL; - if (PyUnicodeDecodeError_GetEnd(exc, &end)) + } + if (PyUnicodeDecodeError_GetEnd(exc, &end)) { return NULL; - if (!(object = PyUnicodeDecodeError_GetObject(exc))) + } + if (end <= start) { + goto oob; + } + PyObject *obj = PyUnicodeDecodeError_GetObject(exc); + if (obj == NULL) { return NULL; - p = (const unsigned char*)PyBytes_AS_STRING(object); - res = PyUnicode_New(4 * (end - start), 127); + } + const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); + PyObject *res = PyUnicode_New(4 * (end - start), 127); if (res == NULL) { - Py_DECREF(object); + Py_DECREF(obj); return NULL; } - outp = PyUnicode_1BYTE_DATA(res); - for (i = start; i < end; i++, outp += 4) { - unsigned char c = p[i]; + Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + for (Py_ssize_t i = start; i < end; i++, outp += 4) { + const unsigned char ch = p[i]; outp[0] = '\\'; outp[1] = 'x'; - outp[2] = Py_hexdigits[(c>>4)&0xf]; - outp[3] = Py_hexdigits[c&0xf]; + outp[2] = Py_hexdigits[(ch >> 4) & 0xf]; + outp[3] = Py_hexdigits[ch & 0xf]; } - assert(_PyUnicode_CheckConsistency(res, 1)); - Py_DECREF(object); + Py_DECREF(obj); return Py_BuildValue("(Nn)", res, end); } + + PyObject *obj = NULL; + Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (PyUnicodeEncodeError_GetStart(exc, &start)) + if (PyUnicodeEncodeError_GetStart(exc, &start)) { return NULL; - if (PyUnicodeEncodeError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeEncodeError_GetObject(exc))) + } + if (PyUnicodeEncodeError_GetEnd(exc, &end)) { return NULL; + } + obj = PyUnicodeEncodeError_GetObject(exc); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - if (PyUnicodeTranslateError_GetStart(exc, &start)) + if (PyUnicodeTranslateError_GetStart(exc, &start)) { return NULL; - if (PyUnicodeTranslateError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeTranslateError_GetObject(exc))) + } + if (PyUnicodeTranslateError_GetEnd(exc, &end)) { return NULL; + } + obj = PyUnicodeTranslateError_GetObject(exc); } else { wrong_exception_type(exc); return NULL; } - if (end - start > PY_SSIZE_T_MAX / (1+1+8)) - end = start + PY_SSIZE_T_MAX / (1+1+8); - for (i = start, ressize = 0; i < end; ++i) { + if (obj == NULL) { + return NULL; + } + if (end <= start) { + Py_DECREF(obj); + goto oob; + } + if (end - start > PY_SSIZE_T_MAX / 10) { + end = start + PY_SSIZE_T_MAX / 10; + } + end = Py_MIN(end, PyUnicode_GET_LENGTH(obj)); + + Py_ssize_t ressize = 0; + for (Py_ssize_t i = start; i < end; ++i) { /* object is guaranteed to be "ready" */ - c = PyUnicode_READ_CHAR(object, i); + Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); if (c >= 0x10000) { - ressize += 1+1+8; + ressize += 10; } else if (c >= 0x100) { - ressize += 1+1+4; + ressize += 6; + } + else { + ressize += 4; } - else - ressize += 1+1+2; } - res = PyUnicode_New(ressize, 127); + PyObject *res = PyUnicode_New(ressize, 127); if (res == NULL) { - Py_DECREF(object); + Py_DECREF(obj); return NULL; } - outp = PyUnicode_1BYTE_DATA(res); - for (i = start; i < end; ++i) { - c = PyUnicode_READ_CHAR(object, i); + Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + for (Py_ssize_t i = start; i < end; ++i) { + Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); *outp++ = '\\'; if (c >= 0x00010000) { *outp++ = 'U'; - *outp++ = Py_hexdigits[(c>>28)&0xf]; - *outp++ = Py_hexdigits[(c>>24)&0xf]; - *outp++ = Py_hexdigits[(c>>20)&0xf]; - *outp++ = Py_hexdigits[(c>>16)&0xf]; - *outp++ = Py_hexdigits[(c>>12)&0xf]; - *outp++ = Py_hexdigits[(c>>8)&0xf]; + *outp++ = Py_hexdigits[(c >> 28) & 0xf]; + *outp++ = Py_hexdigits[(c >> 24) & 0xf]; + *outp++ = Py_hexdigits[(c >> 20) & 0xf]; + *outp++ = Py_hexdigits[(c >> 16) & 0xf]; + *outp++ = Py_hexdigits[(c >> 12) & 0xf]; + *outp++ = Py_hexdigits[(c >> 8) & 0xf]; } else if (c >= 0x100) { *outp++ = 'u'; - *outp++ = Py_hexdigits[(c>>12)&0xf]; - *outp++ = Py_hexdigits[(c>>8)&0xf]; + *outp++ = Py_hexdigits[(c >> 12) & 0xf]; + *outp++ = Py_hexdigits[(c >> 8) & 0xf]; } - else + else { *outp++ = 'x'; - *outp++ = Py_hexdigits[(c>>4)&0xf]; - *outp++ = Py_hexdigits[c&0xf]; + } + *outp++ = Py_hexdigits[(c >> 4) & 0xf]; + *outp++ = Py_hexdigits[c & 0xf]; } - assert(_PyUnicode_CheckConsistency(res, 1)); - Py_DECREF(object); + Py_DECREF(obj); return Py_BuildValue("(Nn)", res, end); + +oob: + return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } PyObject *PyCodec_NameReplaceErrors(PyObject *exc) From cfc4c9bf96a1c6570b4a38b712646a10da3b2aa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:31:01 +0100 Subject: [PATCH 02/11] blurb --- .../2024-12-06-11-30-58.gh-issue-126004.-p8MAS.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-30-58.gh-issue-126004.-p8MAS.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-30-58.gh-issue-126004.-p8MAS.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-30-58.gh-issue-126004.-p8MAS.rst new file mode 100644 index 00000000000000..619d73042a9bb8 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-30-58.gh-issue-126004.-p8MAS.rst @@ -0,0 +1,3 @@ +Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end` +values in the :func:`codecs.backslashreplace_errors` error handler. Patch by +Bénédikt Tran. From 2649ab753dc02f4ba8e857b29ac8160bbe551de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:59:53 +0100 Subject: [PATCH 03/11] fix warnings --- Python/codecs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 93c3558733ecf7..e83b01dcd039ba 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -853,8 +853,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { + Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - Py_ssize_t start, end; if (PyUnicodeDecodeError_GetStart(exc, &start)) { return NULL; } @@ -888,7 +888,6 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } PyObject *obj = NULL; - Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { if (PyUnicodeEncodeError_GetStart(exc, &start)) { return NULL; From 95589281e7a53f6fb3b1d7acc49c465eacfd93b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:55:53 +0100 Subject: [PATCH 04/11] cosmetic changes --- Python/codecs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index e83b01dcd039ba..4710d080bbee0a 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -855,10 +855,10 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { Py_ssize_t start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (PyUnicodeDecodeError_GetStart(exc, &start)) { + if (PyUnicodeDecodeError_GetStart(exc, &start) < 0) { return NULL; } - if (PyUnicodeDecodeError_GetEnd(exc, &end)) { + if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { return NULL; } if (end <= start) { @@ -889,19 +889,19 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) PyObject *obj = NULL; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (PyUnicodeEncodeError_GetStart(exc, &start)) { + if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) { return NULL; } - if (PyUnicodeEncodeError_GetEnd(exc, &end)) { + if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) { return NULL; } obj = PyUnicodeEncodeError_GetObject(exc); } else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - if (PyUnicodeTranslateError_GetStart(exc, &start)) { + if (PyUnicodeTranslateError_GetStart(exc, &start) < 0) { return NULL; } - if (PyUnicodeTranslateError_GetEnd(exc, &end)) { + if (PyUnicodeTranslateError_GetEnd(exc, &end) < 0) { return NULL; } obj = PyUnicodeTranslateError_GetObject(exc); From 08be0a7c247ef2184619254c3b44af078816b6f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:08:27 +0100 Subject: [PATCH 05/11] use internal `_PyUnicodeError_GetParams` helper --- Python/codecs.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 4710d080bbee0a..f4ed514ea6c1bd 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -853,21 +853,18 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { - Py_ssize_t start, end; + PyObject *obj; + Py_ssize_t objlen, start, end; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - if (PyUnicodeDecodeError_GetStart(exc, &start) < 0) { - return NULL; - } - if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) { + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, &start, &end, true) < 0) + { return NULL; } if (end <= start) { + Py_DECREF(obj); goto oob; } - PyObject *obj = PyUnicodeDecodeError_GetObject(exc); - if (obj == NULL) { - return NULL; - } const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); PyObject *res = PyUnicode_New(4 * (end - start), 127); if (res == NULL) { @@ -887,33 +884,21 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return Py_BuildValue("(Nn)", res, end); } - PyObject *obj = NULL; - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) { - return NULL; - } - if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) { - return NULL; - } - obj = PyUnicodeEncodeError_GetObject(exc); - } - else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { - if (PyUnicodeTranslateError_GetStart(exc, &start) < 0) { - return NULL; - } - if (PyUnicodeTranslateError_GetEnd(exc, &end) < 0) { + if ( + PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError) + || PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError) + ) { + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, &start, &end, false) < 0) + { return NULL; } - obj = PyUnicodeTranslateError_GetObject(exc); } else { wrong_exception_type(exc); return NULL; } - if (obj == NULL) { - return NULL; - } if (end <= start) { Py_DECREF(obj); goto oob; From 564ba70b1a2bcb8c1ca75e43d7354efd277a3537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 3 Jan 2025 19:41:51 +0100 Subject: [PATCH 06/11] Update Python/codecs.c --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index f4ed514ea6c1bd..b4f86b48fa4c6b 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -906,7 +906,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) if (end - start > PY_SSIZE_T_MAX / 10) { end = start + PY_SSIZE_T_MAX / 10; } - end = Py_MIN(end, PyUnicode_GET_LENGTH(obj)); + end = Py_MIN(end, objlen); Py_ssize_t ressize = 0; for (Py_ssize_t i = start; i < end; ++i) { From da54d9b06e479474b4905845ba99f7e501627435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:54:05 +0100 Subject: [PATCH 07/11] update usages of `_PyUnicodeError_GetParams` --- Python/codecs.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index b4f86b48fa4c6b..e6e51d687dda29 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -851,27 +851,25 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } } -PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) +PyObject * +PyCodec_BackslashReplaceErrors(PyObject *exc) { PyObject *obj; - Py_ssize_t objlen, start, end; + Py_ssize_t objlen, start, end, slen; if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { if (_PyUnicodeError_GetParams(exc, - &obj, &objlen, &start, &end, true) < 0) + &obj, &objlen, + &start, &end, &slen, true) < 0) { return NULL; } - if (end <= start) { - Py_DECREF(obj); - goto oob; - } - const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); - PyObject *res = PyUnicode_New(4 * (end - start), 127); + PyObject *res = PyUnicode_New(4 * slen, 127); if (res == NULL) { Py_DECREF(obj); return NULL; } Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); for (Py_ssize_t i = start; i < end; i++, outp += 4) { const unsigned char ch = p[i]; outp[0] = '\\'; @@ -889,7 +887,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) || PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError) ) { if (_PyUnicodeError_GetParams(exc, - &obj, &objlen, &start, &end, false) < 0) + &obj, &objlen, + &start, &end, &slen, false) < 0) { return NULL; } @@ -899,11 +898,11 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return NULL; } - if (end <= start) { + if (slen == 0) { // end <= start Py_DECREF(obj); - goto oob; + return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } - if (end - start > PY_SSIZE_T_MAX / 10) { + if (slen > PY_SSIZE_T_MAX / 10) { end = start + PY_SSIZE_T_MAX / 10; } end = Py_MIN(end, objlen); @@ -954,9 +953,6 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) assert(_PyUnicode_CheckConsistency(res, 1)); Py_DECREF(obj); return Py_BuildValue("(Nn)", res, end); - -oob: - return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } PyObject *PyCodec_NameReplaceErrors(PyObject *exc) From 98b4ec02012054f336c073e9db1dac842f8d8fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:01:09 +0100 Subject: [PATCH 08/11] amend some cosmetic changes to be consistent --- Python/codecs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index e6e51d687dda29..628f84080900ba 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -851,8 +851,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } } -PyObject * -PyCodec_BackslashReplaceErrors(PyObject *exc) +PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { PyObject *obj; Py_ssize_t objlen, start, end, slen; From a19b4e336435a0daef89d0382fdd90e999d9ed98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:26:58 +0100 Subject: [PATCH 09/11] fix bounds --- Python/codecs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 628f84080900ba..cc20c8249e388d 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -903,8 +903,9 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } if (slen > PY_SSIZE_T_MAX / 10) { end = start + PY_SSIZE_T_MAX / 10; + end = Py_MIN(end, objlen); + slen = Py_MAX(0, end - start); } - end = Py_MIN(end, objlen); Py_ssize_t ressize = 0; for (Py_ssize_t i = start; i < end; ++i) { From 3eff7b8ed732279959dd087a71bc3fa2e6dae883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 23 Jan 2025 12:06:06 +0100 Subject: [PATCH 10/11] leave optimization to the compiler --- Python/codecs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 8a65be1d312016..da8f2b9f7e35a3 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -908,10 +908,6 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return NULL; } - if (slen == 0) { // end <= start - Py_DECREF(obj); - return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); - } if (slen > PY_SSIZE_T_MAX / 10) { end = start + PY_SSIZE_T_MAX / 10; end = Py_MIN(end, objlen); From 5e5ecbcfdf51f461a7c42c0f069d82efe0f80d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 23 Jan 2025 12:50:08 +0100 Subject: [PATCH 11/11] remove magic constants --- Python/codecs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index da8f2b9f7e35a3..07eaa8ecddcae0 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -908,8 +908,14 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return NULL; } - if (slen > PY_SSIZE_T_MAX / 10) { - end = start + PY_SSIZE_T_MAX / 10; + // The number of characters that each character 'ch' contributes + // in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch} + // and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS, + // where the number of hexdigits is either 2, 4, or 8 (not 6). + // Since the Unicode range is below 10^7, we choose k = 8 whence + // each "block" requires at most 1 + 1 + 8 characters. + if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) { + end = start + PY_SSIZE_T_MAX / (1 + 1 + 8); end = Py_MIN(end, objlen); slen = Py_MAX(0, end - start); } @@ -919,13 +925,13 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) /* object is guaranteed to be "ready" */ Py_UCS4 c = PyUnicode_READ_CHAR(obj, i); if (c >= 0x10000) { - ressize += 10; + ressize += 1 + 1 + 8; } else if (c >= 0x100) { - ressize += 6; + ressize += 1 + 1 + 4; } else { - ressize += 4; + ressize += 1 + 1 + 2; } } PyObject *res = PyUnicode_New(ressize, 127);