Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 0c33e5b

Browse files
[3.11] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944)
If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after that temporary bytes object is destroyed. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handler registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58) (cherry picked from commit 6279eb8) (cherry picked from commit a75953b) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 461ca2c commit 0c33e5b
Copy full SHA for 0c33e5b

File tree

8 files changed

+198
-57
lines changed
Filter options

8 files changed

+198
-57
lines changed

‎Include/cpython/bytesobject.h

Copy file name to clipboardExpand all lines: Include/cpython/bytesobject.h
+4Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
2525
int use_bytearray);
2626

2727
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
28+
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
29+
const char *,
30+
int *, const char **);
31+
// Export for binary compatibility.
2832
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
2933
const char *, const char **);
3034

‎Include/cpython/unicodeobject.h

Copy file name to clipboardExpand all lines: Include/cpython/unicodeobject.h
+13Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
914914
);
915915
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
916916
chars. */
917+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
918+
const char *string, /* Unicode-Escape encoded string */
919+
Py_ssize_t length, /* size of string */
920+
const char *errors, /* error handling */
921+
Py_ssize_t *consumed, /* bytes consumed */
922+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
923+
invalid escaped char (<= 0xff) or invalid
924+
octal escape (> 0xff) in string. */
925+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
926+
point to the first invalid escaped
927+
char in string.
928+
May be NULL if errors is not NULL. */
929+
// Export for binary compatibility.
917930
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
918931
const char *string, /* Unicode-Escape encoded string */
919932
Py_ssize_t length, /* size of string */

‎Lib/test/test_codeccallbacks.py

Copy file name to clipboardExpand all lines: Lib/test/test_codeccallbacks.py
+38-1Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import codecs
22
import html.entities
33
import itertools
4+
import re
45
import sys
56
import unicodedata
67
import unittest
@@ -1124,7 +1125,7 @@ def test_bug828737(self):
11241125
text = 'abc<def>ghi'*n
11251126
text.translate(charmap)
11261127

1127-
def test_mutatingdecodehandler(self):
1128+
def test_mutating_decode_handler(self):
11281129
baddata = [
11291130
("ascii", b"\xff"),
11301131
("utf-7", b"++"),
@@ -1159,6 +1160,42 @@ def mutating(exc):
11591160
for (encoding, data) in baddata:
11601161
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
11611162

1163+
def test_mutating_decode_handler_unicode_escape(self):
1164+
decode = codecs.unicode_escape_decode
1165+
def mutating(exc):
1166+
if isinstance(exc, UnicodeDecodeError):
1167+
r = data.get(exc.object[:exc.end])
1168+
if r is not None:
1169+
exc.object = r[0] + exc.object[exc.end:]
1170+
return ('\u0404', r[1])
1171+
raise AssertionError("don't know how to handle %r" % exc)
1172+
1173+
codecs.register_error('test.mutating2', mutating)
1174+
data = {
1175+
br'\x0': (b'\\', 0),
1176+
br'\x3': (b'xxx\\', 3),
1177+
br'\x5': (b'x\\', 1),
1178+
}
1179+
def check(input, expected, msg):
1180+
with self.assertWarns(DeprecationWarning) as cm:
1181+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1182+
self.assertIn(msg, str(cm.warning))
1183+
1184+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1185+
check(br'\x0n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1186+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1187+
1188+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1189+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1190+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1191+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1192+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1193+
1194+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1195+
check(br'\x5n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1196+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1197+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1198+
11621199
# issue32583
11631200
def test_crashing_decode_handler(self):
11641201
# better generating one more character to fill the extra space slot

‎Lib/test/test_codecs.py

Copy file name to clipboardExpand all lines: Lib/test/test_codecs.py
+42-10Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,23 +1198,39 @@ def test_escape(self):
11981198
check(br"[\1010]", b"[A0]")
11991199
check(br"[\x41]", b"[A]")
12001200
check(br"[\x410]", b"[A0]")
1201+
1202+
def test_warnings(self):
1203+
decode = codecs.escape_decode
1204+
check = coding_checker(self, decode)
12011205
for i in range(97, 123):
12021206
b = bytes([i])
12031207
if b not in b'abfnrtvx':
1204-
with self.assertWarns(DeprecationWarning):
1208+
with self.assertWarnsRegex(DeprecationWarning,
1209+
r"invalid escape sequence '\\%c'" % i):
12051210
check(b"\\" + b, b"\\" + b)
1206-
with self.assertWarns(DeprecationWarning):
1211+
with self.assertWarnsRegex(DeprecationWarning,
1212+
r"invalid escape sequence '\\%c'" % (i-32)):
12071213
check(b"\\" + b.upper(), b"\\" + b.upper())
1208-
with self.assertWarns(DeprecationWarning):
1214+
with self.assertWarnsRegex(DeprecationWarning,
1215+
r"invalid escape sequence '\\8'"):
12091216
check(br"\8", b"\\8")
12101217
with self.assertWarns(DeprecationWarning):
12111218
check(br"\9", b"\\9")
1212-
with self.assertWarns(DeprecationWarning):
1219+
with self.assertWarnsRegex(DeprecationWarning,
1220+
r"invalid escape sequence '\\\xfa'") as cm:
12131221
check(b"\\\xfa", b"\\\xfa")
12141222
for i in range(0o400, 0o1000):
1215-
with self.assertWarns(DeprecationWarning):
1223+
with self.assertWarnsRegex(DeprecationWarning,
1224+
r"invalid octal escape sequence '\\%o'" % i):
12161225
check(rb'\%o' % i, bytes([i & 0o377]))
12171226

1227+
with self.assertWarnsRegex(DeprecationWarning,
1228+
r"invalid escape sequence '\\z'"):
1229+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1230+
with self.assertWarnsRegex(DeprecationWarning,
1231+
r"invalid octal escape sequence '\\501'"):
1232+
self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
1233+
12181234
def test_errors(self):
12191235
decode = codecs.escape_decode
12201236
self.assertRaises(ValueError, decode, br"\x")
@@ -2487,24 +2503,40 @@ def test_escape_decode(self):
24872503
check(br"[\x410]", "[A0]")
24882504
check(br"\u20ac", "\u20ac")
24892505
check(br"\U0001d120", "\U0001d120")
2506+
2507+
def test_decode_warnings(self):
2508+
decode = codecs.unicode_escape_decode
2509+
check = coding_checker(self, decode)
24902510
for i in range(97, 123):
24912511
b = bytes([i])
24922512
if b not in b'abfnrtuvx':
2493-
with self.assertWarns(DeprecationWarning):
2513+
with self.assertWarnsRegex(DeprecationWarning,
2514+
r"invalid escape sequence '\\%c'" % i):
24942515
check(b"\\" + b, "\\" + chr(i))
24952516
if b.upper() not in b'UN':
2496-
with self.assertWarns(DeprecationWarning):
2517+
with self.assertWarnsRegex(DeprecationWarning,
2518+
r"invalid escape sequence '\\%c'" % (i-32)):
24972519
check(b"\\" + b.upper(), "\\" + chr(i-32))
2498-
with self.assertWarns(DeprecationWarning):
2520+
with self.assertWarnsRegex(DeprecationWarning,
2521+
r"invalid escape sequence '\\8'"):
24992522
check(br"\8", "\\8")
25002523
with self.assertWarns(DeprecationWarning):
25012524
check(br"\9", "\\9")
2502-
with self.assertWarns(DeprecationWarning):
2525+
with self.assertWarnsRegex(DeprecationWarning,
2526+
r"invalid escape sequence '\\\xfa'") as cm:
25032527
check(b"\\\xfa", "\\\xfa")
25042528
for i in range(0o400, 0o1000):
2505-
with self.assertWarns(DeprecationWarning):
2529+
with self.assertWarnsRegex(DeprecationWarning,
2530+
r"invalid octal escape sequence '\\%o'" % i):
25062531
check(rb'\%o' % i, chr(i))
25072532

2533+
with self.assertWarnsRegex(DeprecationWarning,
2534+
r"invalid escape sequence '\\z'"):
2535+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
2536+
with self.assertWarnsRegex(DeprecationWarning,
2537+
r"invalid octal escape sequence '\\501'"):
2538+
self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
2539+
25082540
def test_decode_errors(self):
25092541
decode = codecs.unicode_escape_decode
25102542
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

‎Objects/bytesobject.c

Copy file name to clipboardExpand all lines: Objects/bytesobject.c
+38-18Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,10 +1057,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
10571057
}
10581058

10591059
/* Unescape a backslash-escaped string. */
1060-
PyObject *_PyBytes_DecodeEscape(const char *s,
1060+
PyObject *_PyBytes_DecodeEscape2(const char *s,
10611061
Py_ssize_t len,
10621062
const char *errors,
1063-
const char **first_invalid_escape)
1063+
int *first_invalid_escape_char,
1064+
const char **first_invalid_escape_ptr)
10641065
{
10651066
int c;
10661067
char *p;
@@ -1074,7 +1075,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
10741075
return NULL;
10751076
writer.overallocate = 1;
10761077

1077-
*first_invalid_escape = NULL;
1078+
*first_invalid_escape_char = -1;
1079+
*first_invalid_escape_ptr = NULL;
10781080

10791081
end = s + len;
10801082
while (s < end) {
@@ -1112,9 +1114,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11121114
c = (c<<3) + *s++ - '0';
11131115
}
11141116
if (c > 0377) {
1115-
if (*first_invalid_escape == NULL) {
1116-
*first_invalid_escape = s-3; /* Back up 3 chars, since we've
1117-
already incremented s. */
1117+
if (*first_invalid_escape_char == -1) {
1118+
*first_invalid_escape_char = c;
1119+
/* Back up 3 chars, since we've already incremented s. */
1120+
*first_invalid_escape_ptr = s - 3;
11181121
}
11191122
}
11201123
*p++ = c;
@@ -1155,9 +1158,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11551158
break;
11561159

11571160
default:
1158-
if (*first_invalid_escape == NULL) {
1159-
*first_invalid_escape = s-1; /* Back up one char, since we've
1160-
already incremented s. */
1161+
if (*first_invalid_escape_char == -1) {
1162+
*first_invalid_escape_char = (unsigned char)s[-1];
1163+
/* Back up one char, since we've already incremented s. */
1164+
*first_invalid_escape_ptr = s - 1;
11611165
}
11621166
*p++ = '\\';
11631167
s--;
@@ -1171,23 +1175,39 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11711175
return NULL;
11721176
}
11731177

1178+
// Export for binary compatibility.
1179+
PyObject *_PyBytes_DecodeEscape(const char *s,
1180+
Py_ssize_t len,
1181+
const char *errors,
1182+
const char **first_invalid_escape)
1183+
{
1184+
int first_invalid_escape_char;
1185+
return _PyBytes_DecodeEscape2(
1186+
s, len, errors,
1187+
&first_invalid_escape_char,
1188+
first_invalid_escape);
1189+
}
1190+
11741191
PyObject *PyBytes_DecodeEscape(const char *s,
11751192
Py_ssize_t len,
11761193
const char *errors,
11771194
Py_ssize_t Py_UNUSED(unicode),
11781195
const char *Py_UNUSED(recode_encoding))
11791196
{
1180-
const char* first_invalid_escape;
1181-
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
1182-
&first_invalid_escape);
1197+
int first_invalid_escape_char;
1198+
const char *first_invalid_escape_ptr;
1199+
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
1200+
&first_invalid_escape_char,
1201+
&first_invalid_escape_ptr);
11831202
if (result == NULL)
11841203
return NULL;
1185-
if (first_invalid_escape != NULL) {
1186-
unsigned char c = *first_invalid_escape;
1187-
if ('4' <= c && c <= '7') {
1204+
if (first_invalid_escape_char != -1) {
1205+
if (first_invalid_escape_char > 0xff) {
1206+
char buf[12] = "";
1207+
snprintf(buf, sizeof buf, "%o", first_invalid_escape_char);
11881208
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
1189-
"invalid octal escape sequence '\\%.3s'",
1190-
first_invalid_escape) < 0)
1209+
"invalid octal escape sequence '\\%s'",
1210+
buf) < 0)
11911211
{
11921212
Py_DECREF(result);
11931213
return NULL;
@@ -1196,7 +1216,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
11961216
else {
11971217
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
11981218
"invalid escape sequence '\\%c'",
1199-
c) < 0)
1219+
first_invalid_escape_char) < 0)
12001220
{
12011221
Py_DECREF(result);
12021222
return NULL;

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.