Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 25a614a

Browse files
authored
gh-126004: Fix positions handling in codecs.backslashreplace_errors (#127676)
This fixes how `PyCodec_BackslashReplaceErrors` handles the `start` and `end` attributes of `UnicodeError` objects via the `_PyUnicodeError_GetParams` helper.
1 parent 5c9a63f commit 25a614a
Copy full SHA for 25a614a

File tree

Expand file tree / Collapse file tree

3 files changed

+69
-66
lines changed
Filter options
Expand file tree / Collapse file tree

3 files changed

+69
-66
lines changed

‎Lib/test/test_capi/test_codecs.py

Copy file name to clipboard | Expand all lines: Lib/test/test_capi/test_codecs.py
+2 -1 | Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -849,7 +849,8 @@ def test_codec_xmlcharrefreplace_errors_handler(self):
849849

850850
def test_codec_backslashreplace_errors_handler(self):
851851
handler = _testcapi.codec_backslashreplace_errors
852-
self.do_test_codec_errors_handler(handler, self.all_unicode_errors)
852+
self.do_test_codec_errors_handler(handler, self.all_unicode_errors,
853+
safe=True)
853854

854855
def test_codec_namereplace_errors_handler(self):
855856
handler = _testlimitedcapi.codec_namereplace_errors
+3 -0 | Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
2+
values in the :func:`codecs.backslashreplace_errors` error handler. Patch by
3+
Bénédikt Tran.

‎Python/codecs.c

Copy file name to clipboard | Expand all lines: Python/codecs.c
+64 -65 | Lines changed: 64 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -864,108 +864,107 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
864864

865865
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
866866
{
867-
PyObject *object;
868-
Py_ssize_t i;
869-
Py_ssize_t start;
870-
Py_ssize_t end;
871-
PyObject *res;
872-
Py_UCS1 *outp;
873-
int ressize;
874-
Py_UCS4 c;
875-
867+
PyObject *obj;
868+
Py_ssize_t objlen, start, end, slen;
876869
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
877-
const unsigned char *p;
878-
if (PyUnicodeDecodeError_GetStart(exc, &start))
879-
return NULL;
880-
if (PyUnicodeDecodeError_GetEnd(exc, &end))
881-
return NULL;
882-
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
870+
if (_PyUnicodeError_GetParams(exc,
871+
&obj, &objlen,
872+
&start, &end, &slen, true) < 0)
873+
{
883874
return NULL;
884-
p = (const unsigned char*)PyBytes_AS_STRING(object);
885-
res = PyUnicode_New(4 * (end - start), 127);
875+
}
876+
PyObject *res = PyUnicode_New(4 * slen, 127);
886877
if (res == NULL) {
887-
Py_DECREF(object);
878+
Py_DECREF(obj);
888879
return NULL;
889880
}
890-
outp = PyUnicode_1BYTE_DATA(res);
891-
for (i = start; i < end; i++, outp += 4) {
892-
unsigned char c = p[i];
881+
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
882+
const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
883+
for (Py_ssize_t i = start; i < end; i++, outp += 4) {
884+
const unsigned char ch = p[i];
893885
outp[0] = '\\';
894886
outp[1] = 'x';
895-
outp[2] = Py_hexdigits[(c>>4)&0xf];
896-
outp[3] = Py_hexdigits[c&0xf];
887+
outp[2] = Py_hexdigits[(ch >> 4) & 0xf];
888+
outp[3] = Py_hexdigits[ch & 0xf];
897889
}
898-
899890
assert(_PyUnicode_CheckConsistency(res, 1));
900-
Py_DECREF(object);
891+
Py_DECREF(obj);
901892
return Py_BuildValue("(Nn)", res, end);
902893
}
903-
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
904-
if (PyUnicodeEncodeError_GetStart(exc, &start))
905-
return NULL;
906-
if (PyUnicodeEncodeError_GetEnd(exc, &end))
907-
return NULL;
908-
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
909-
return NULL;
910-
}
911-
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
912-
if (PyUnicodeTranslateError_GetStart(exc, &start))
913-
return NULL;
914-
if (PyUnicodeTranslateError_GetEnd(exc, &end))
915-
return NULL;
916-
if (!(object = PyUnicodeTranslateError_GetObject(exc)))
894+
895+
if (
896+
PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)
897+
|| PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)
898+
) {
899+
if (_PyUnicodeError_GetParams(exc,
900+
&obj, &objlen,
901+
&start, &end, &slen, false) < 0)
902+
{
917903
return NULL;
904+
}
918905
}
919906
else {
920907
wrong_exception_type(exc);
921908
return NULL;
922909
}
923910

924-
if (end - start > PY_SSIZE_T_MAX / (1+1+8))
925-
end = start + PY_SSIZE_T_MAX / (1+1+8);
926-
for (i = start, ressize = 0; i < end; ++i) {
911+
// The number of characters that each character 'ch' contributes
912+
// in the result is 1 + 1 + k, where k >= min{t >= 1 | 16^t > ch}
913+
// and will be formatted as "\\" + ('U'|'u'|'x') + HEXDIGITS,
914+
// where the number of hexdigits is either 2, 4, or 8 (not 6).
915+
// Since the Unicode range is below 10^7, we choose k = 8 whence
916+
// each "block" requires at most 1 + 1 + 8 characters.
917+
if (slen > PY_SSIZE_T_MAX / (1 + 1 + 8)) {
918+
end = start + PY_SSIZE_T_MAX / (1 + 1 + 8);
919+
end = Py_MIN(end, objlen);
920+
slen = Py_MAX(0, end - start);
921+
}
922+
923+
Py_ssize_t ressize = 0;
924+
for (Py_ssize_t i = start; i < end; ++i) {
927925
/* object is guaranteed to be "ready" */
928-
c = PyUnicode_READ_CHAR(object, i);
926+
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
929927
if (c >= 0x10000) {
930-
ressize += 1+1+8;
928+
ressize += 1 + 1 + 8;
931929
}
932930
else if (c >= 0x100) {
933-
ressize += 1+1+4;
931+
ressize += 1 + 1 + 4;
932+
}
933+
else {
934+
ressize += 1 + 1 + 2;
934935
}
935-
else
936-
ressize += 1+1+2;
937936
}
938-
res = PyUnicode_New(ressize, 127);
937+
PyObject *res = PyUnicode_New(ressize, 127);
939938
if (res == NULL) {
940-
Py_DECREF(object);
939+
Py_DECREF(obj);
941940
return NULL;
942941
}
943-
outp = PyUnicode_1BYTE_DATA(res);
944-
for (i = start; i < end; ++i) {
945-
c = PyUnicode_READ_CHAR(object, i);
942+
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
943+
for (Py_ssize_t i = start; i < end; ++i) {
944+
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
946945
*outp++ = '\\';
947946
if (c >= 0x00010000) {
948947
*outp++ = 'U';
949-
*outp++ = Py_hexdigits[(c>>28)&0xf];
950-
*outp++ = Py_hexdigits[(c>>24)&0xf];
951-
*outp++ = Py_hexdigits[(c>>20)&0xf];
952-
*outp++ = Py_hexdigits[(c>>16)&0xf];
953-
*outp++ = Py_hexdigits[(c>>12)&0xf];
954-
*outp++ = Py_hexdigits[(c>>8)&0xf];
948+
*outp++ = Py_hexdigits[(c >> 28) & 0xf];
949+
*outp++ = Py_hexdigits[(c >> 24) & 0xf];
950+
*outp++ = Py_hexdigits[(c >> 20) & 0xf];
951+
*outp++ = Py_hexdigits[(c >> 16) & 0xf];
952+
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
953+
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
955954
}
956955
else if (c >= 0x100) {
957956
*outp++ = 'u';
958-
*outp++ = Py_hexdigits[(c>>12)&0xf];
959-
*outp++ = Py_hexdigits[(c>>8)&0xf];
957+
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
958+
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
960959
}
961-
else
960+
else {
962961
*outp++ = 'x';
963-
*outp++ = Py_hexdigits[(c>>4)&0xf];
964-
*outp++ = Py_hexdigits[c&0xf];
962+
}
963+
*outp++ = Py_hexdigits[(c >> 4) & 0xf];
964+
*outp++ = Py_hexdigits[c & 0xf];
965965
}
966-
967966
assert(_PyUnicode_CheckConsistency(res, 1));
968-
Py_DECREF(object);
967+
Py_DECREF(obj);
969968
return Py_BuildValue("(Nn)", res, end);
970969
}
971970

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.