Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit a75953b

Browse files
[3.12] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944)
If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after destroying that temporary bytes object. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58) (cherry picked from commit 6279eb8) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 310cd89 commit a75953b
Copy full SHA for a75953b

File tree

8 files changed

+194
-57
lines changed
Filter options

8 files changed

+194
-57
lines changed

‎Include/cpython/bytesobject.h

Copy file name to clipboardExpand all lines: Include/cpython/bytesobject.h
+4Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
2525
int use_bytearray);
2626

2727
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
28+
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
29+
const char *,
30+
int *, const char **);
31+
// Export for binary compatibility.
2832
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
2933
const char *, const char **);
3034

‎Include/cpython/unicodeobject.h

Copy file name to clipboardExpand all lines: Include/cpython/unicodeobject.h
+13Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
684684
);
685685
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
686686
chars. */
687+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
688+
const char *string, /* Unicode-Escape encoded string */
689+
Py_ssize_t length, /* size of string */
690+
const char *errors, /* error handling */
691+
Py_ssize_t *consumed, /* bytes consumed */
692+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
693+
invalid escaped char (<= 0xff) or invalid
694+
octal escape (> 0xff) in string. */
695+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
696+
point to the first invalid escaped
697+
char in string.
698+
May be NULL if errors is not NULL. */
699+
// Export for binary compatibility.
687700
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
688701
const char *string, /* Unicode-Escape encoded string */
689702
Py_ssize_t length, /* size of string */

‎Lib/test/test_codeccallbacks.py

Copy file name to clipboardExpand all lines: Lib/test/test_codeccallbacks.py
+38-1Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import codecs
22
import html.entities
33
import itertools
4+
import re
45
import sys
56
import unicodedata
67
import unittest
@@ -1124,7 +1125,7 @@ def test_bug828737(self):
11241125
text = 'abc<def>ghi'*n
11251126
text.translate(charmap)
11261127

1127-
def test_mutatingdecodehandler(self):
1128+
def test_mutating_decode_handler(self):
11281129
baddata = [
11291130
("ascii", b"\xff"),
11301131
("utf-7", b"++"),
@@ -1159,6 +1160,42 @@ def mutating(exc):
11591160
for (encoding, data) in baddata:
11601161
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
11611162

1163+
def test_mutating_decode_handler_unicode_escape(self):
1164+
decode = codecs.unicode_escape_decode
1165+
def mutating(exc):
1166+
if isinstance(exc, UnicodeDecodeError):
1167+
r = data.get(exc.object[:exc.end])
1168+
if r is not None:
1169+
exc.object = r[0] + exc.object[exc.end:]
1170+
return ('\u0404', r[1])
1171+
raise AssertionError("don't know how to handle %r" % exc)
1172+
1173+
codecs.register_error('test.mutating2', mutating)
1174+
data = {
1175+
br'\x0': (b'\\', 0),
1176+
br'\x3': (b'xxx\\', 3),
1177+
br'\x5': (b'x\\', 1),
1178+
}
1179+
def check(input, expected, msg):
1180+
with self.assertWarns(DeprecationWarning) as cm:
1181+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1182+
self.assertIn(msg, str(cm.warning))
1183+
1184+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1185+
check(br'\x0n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1186+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1187+
1188+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1189+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1190+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1191+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1192+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1193+
1194+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1195+
check(br'\x5n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
1196+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1197+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1198+
11621199
# issue32583
11631200
def test_crashing_decode_handler(self):
11641201
# better generating one more character to fill the extra space slot

‎Lib/test/test_codecs.py

Copy file name to clipboardExpand all lines: Lib/test/test_codecs.py
+42-10Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,23 +1196,39 @@ def test_escape(self):
11961196
check(br"[\1010]", b"[A0]")
11971197
check(br"[\x41]", b"[A]")
11981198
check(br"[\x410]", b"[A0]")
1199+
1200+
def test_warnings(self):
1201+
decode = codecs.escape_decode
1202+
check = coding_checker(self, decode)
11991203
for i in range(97, 123):
12001204
b = bytes([i])
12011205
if b not in b'abfnrtvx':
1202-
with self.assertWarns(DeprecationWarning):
1206+
with self.assertWarnsRegex(DeprecationWarning,
1207+
r"invalid escape sequence '\\%c'" % i):
12031208
check(b"\\" + b, b"\\" + b)
1204-
with self.assertWarns(DeprecationWarning):
1209+
with self.assertWarnsRegex(DeprecationWarning,
1210+
r"invalid escape sequence '\\%c'" % (i-32)):
12051211
check(b"\\" + b.upper(), b"\\" + b.upper())
1206-
with self.assertWarns(DeprecationWarning):
1212+
with self.assertWarnsRegex(DeprecationWarning,
1213+
r"invalid escape sequence '\\8'"):
12071214
check(br"\8", b"\\8")
12081215
with self.assertWarns(DeprecationWarning):
12091216
check(br"\9", b"\\9")
1210-
with self.assertWarns(DeprecationWarning):
1217+
with self.assertWarnsRegex(DeprecationWarning,
1218+
r"invalid escape sequence '\\\xfa'") as cm:
12111219
check(b"\\\xfa", b"\\\xfa")
12121220
for i in range(0o400, 0o1000):
1213-
with self.assertWarns(DeprecationWarning):
1221+
with self.assertWarnsRegex(DeprecationWarning,
1222+
r"invalid octal escape sequence '\\%o'" % i):
12141223
check(rb'\%o' % i, bytes([i & 0o377]))
12151224

1225+
with self.assertWarnsRegex(DeprecationWarning,
1226+
r"invalid escape sequence '\\z'"):
1227+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1228+
with self.assertWarnsRegex(DeprecationWarning,
1229+
r"invalid octal escape sequence '\\501'"):
1230+
self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
1231+
12161232
def test_errors(self):
12171233
decode = codecs.escape_decode
12181234
self.assertRaises(ValueError, decode, br"\x")
@@ -2479,24 +2495,40 @@ def test_escape_decode(self):
24792495
check(br"[\x410]", "[A0]")
24802496
check(br"\u20ac", "\u20ac")
24812497
check(br"\U0001d120", "\U0001d120")
2498+
2499+
def test_decode_warnings(self):
2500+
decode = codecs.unicode_escape_decode
2501+
check = coding_checker(self, decode)
24822502
for i in range(97, 123):
24832503
b = bytes([i])
24842504
if b not in b'abfnrtuvx':
2485-
with self.assertWarns(DeprecationWarning):
2505+
with self.assertWarnsRegex(DeprecationWarning,
2506+
r"invalid escape sequence '\\%c'" % i):
24862507
check(b"\\" + b, "\\" + chr(i))
24872508
if b.upper() not in b'UN':
2488-
with self.assertWarns(DeprecationWarning):
2509+
with self.assertWarnsRegex(DeprecationWarning,
2510+
r"invalid escape sequence '\\%c'" % (i-32)):
24892511
check(b"\\" + b.upper(), "\\" + chr(i-32))
2490-
with self.assertWarns(DeprecationWarning):
2512+
with self.assertWarnsRegex(DeprecationWarning,
2513+
r"invalid escape sequence '\\8'"):
24912514
check(br"\8", "\\8")
24922515
with self.assertWarns(DeprecationWarning):
24932516
check(br"\9", "\\9")
2494-
with self.assertWarns(DeprecationWarning):
2517+
with self.assertWarnsRegex(DeprecationWarning,
2518+
r"invalid escape sequence '\\\xfa'") as cm:
24952519
check(b"\\\xfa", "\\\xfa")
24962520
for i in range(0o400, 0o1000):
2497-
with self.assertWarns(DeprecationWarning):
2521+
with self.assertWarnsRegex(DeprecationWarning,
2522+
r"invalid octal escape sequence '\\%o'" % i):
24982523
check(rb'\%o' % i, chr(i))
24992524

2525+
with self.assertWarnsRegex(DeprecationWarning,
2526+
r"invalid escape sequence '\\z'"):
2527+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
2528+
with self.assertWarnsRegex(DeprecationWarning,
2529+
r"invalid octal escape sequence '\\501'"):
2530+
self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
2531+
25002532
def test_decode_errors(self):
25012533
decode = codecs.unicode_escape_decode
25022534
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

‎Objects/bytesobject.c

Copy file name to clipboardExpand all lines: Objects/bytesobject.c
+36-18Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,10 +1048,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
10481048
}
10491049

10501050
/* Unescape a backslash-escaped string. */
1051-
PyObject *_PyBytes_DecodeEscape(const char *s,
1051+
PyObject *_PyBytes_DecodeEscape2(const char *s,
10521052
Py_ssize_t len,
10531053
const char *errors,
1054-
const char **first_invalid_escape)
1054+
int *first_invalid_escape_char,
1055+
const char **first_invalid_escape_ptr)
10551056
{
10561057
int c;
10571058
char *p;
@@ -1065,7 +1066,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
10651066
return NULL;
10661067
writer.overallocate = 1;
10671068

1068-
*first_invalid_escape = NULL;
1069+
*first_invalid_escape_char = -1;
1070+
*first_invalid_escape_ptr = NULL;
10691071

10701072
end = s + len;
10711073
while (s < end) {
@@ -1103,9 +1105,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11031105
c = (c<<3) + *s++ - '0';
11041106
}
11051107
if (c > 0377) {
1106-
if (*first_invalid_escape == NULL) {
1107-
*first_invalid_escape = s-3; /* Back up 3 chars, since we've
1108-
already incremented s. */
1108+
if (*first_invalid_escape_char == -1) {
1109+
*first_invalid_escape_char = c;
1110+
/* Back up 3 chars, since we've already incremented s. */
1111+
*first_invalid_escape_ptr = s - 3;
11091112
}
11101113
}
11111114
*p++ = c;
@@ -1146,9 +1149,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11461149
break;
11471150

11481151
default:
1149-
if (*first_invalid_escape == NULL) {
1150-
*first_invalid_escape = s-1; /* Back up one char, since we've
1151-
already incremented s. */
1152+
if (*first_invalid_escape_char == -1) {
1153+
*first_invalid_escape_char = (unsigned char)s[-1];
1154+
/* Back up one char, since we've already incremented s. */
1155+
*first_invalid_escape_ptr = s - 1;
11521156
}
11531157
*p++ = '\\';
11541158
s--;
@@ -1162,23 +1166,37 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11621166
return NULL;
11631167
}
11641168

1169+
// Export for binary compatibility.
1170+
PyObject *_PyBytes_DecodeEscape(const char *s,
1171+
Py_ssize_t len,
1172+
const char *errors,
1173+
const char **first_invalid_escape)
1174+
{
1175+
int first_invalid_escape_char;
1176+
return _PyBytes_DecodeEscape2(
1177+
s, len, errors,
1178+
&first_invalid_escape_char,
1179+
first_invalid_escape);
1180+
}
1181+
11651182
PyObject *PyBytes_DecodeEscape(const char *s,
11661183
Py_ssize_t len,
11671184
const char *errors,
11681185
Py_ssize_t Py_UNUSED(unicode),
11691186
const char *Py_UNUSED(recode_encoding))
11701187
{
1171-
const char* first_invalid_escape;
1172-
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
1173-
&first_invalid_escape);
1188+
int first_invalid_escape_char;
1189+
const char *first_invalid_escape_ptr;
1190+
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
1191+
&first_invalid_escape_char,
1192+
&first_invalid_escape_ptr);
11741193
if (result == NULL)
11751194
return NULL;
1176-
if (first_invalid_escape != NULL) {
1177-
unsigned char c = *first_invalid_escape;
1178-
if ('4' <= c && c <= '7') {
1195+
if (first_invalid_escape_char != -1) {
1196+
if (first_invalid_escape_char > 0xff) {
11791197
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
1180-
"invalid octal escape sequence '\\%.3s'",
1181-
first_invalid_escape) < 0)
1198+
"invalid octal escape sequence '\\%o'",
1199+
first_invalid_escape_char) < 0)
11821200
{
11831201
Py_DECREF(result);
11841202
return NULL;
@@ -1187,7 +1205,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
11871205
else {
11881206
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
11891207
"invalid escape sequence '\\%c'",
1190-
c) < 0)
1208+
first_invalid_escape_char) < 0)
11911209
{
11921210
Py_DECREF(result);
11931211
return NULL;

0 commit comments

Comments
0 (0)
Morty Proxy: This is a proxified and sanitized view of the page; visit the original site.