Fix use-after-free in the unicode-escape decoder with an error handler

If the error handler is used, a new bytes object is created to be set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer into the decoded data will become invalid after that temporary bytes object is destroyed. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have this issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal().
python · serhiy-storchaka · May 12, 2025 · Feb 4, 2025 · May 9, 2025 · May 9, 2025
commit 3a939ff2298d147459116f98a09549d0f1954039
diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h
@@ -20,8 +20,8 @@ extern PyObject* _PyBytes_FromHex(

 // Helper for PyBytes_DecodeEscape that detects invalid escape chars.
 // Export for test_peg_generator.
-PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
-                                            const char *, const char **);
+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
+                                            const char *, int *);


 // Substring Search.

diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
@@ -141,14 +141,14 @@ extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful(
 // Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
 // chars.
 // Export for test_peg_generator.
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
    const char *string,     /* Unicode-Escape encoded string */
    Py_ssize_t length,      /* size of string */
    const char *errors,     /* error handling */
    Py_ssize_t *consumed,   /* bytes consumed */
-    const char **first_invalid_escape); /* on return, points to first
-                                           invalid escaped char in
-                                           string. */
+    int *first_invalid_escape); /* on return, if not -1, contain the first
+                                   invalid escaped char (<= 0xff) or invalid
+                                   octal escape (> 0xff) in string. */

 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */


diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
@@ -2,6 +2,7 @@
 import codecs
 import html.entities
 import itertools
+import re
 import sys
 import unicodedata
 import unittest
@@ -1125,7 +1126,7 @@ def test_bug828737(self):
            text = 'abc<def>ghi'*n
            text.translate(charmap)

-    def test_mutatingdecodehandler(self):
+    def test_mutating_decode_handler(self):
        baddata = [
            ("ascii", b"\xff"),
            ("utf-7", b"++"),
@@ -1160,6 +1161,42 @@ def mutating(exc):
        for (encoding, data) in baddata:
            self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")

+    def test_mutating_decode_handler_unicode_escape(self):
+        decode = codecs.unicode_escape_decode
+        def mutating(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                r = data.get(exc.object[:exc.end])
+                if r is not None:
+                    exc.object = r[0] + exc.object[exc.end:]
+                    return ('\u0404', r[1])
+            raise AssertionError("don't know how to handle %r" % exc)
+
+        codecs.register_error('test.mutating2', mutating)
+        data = {
+            br'\x0': (b'\\', 0),
+            br'\x3': (b'xxx\\', 3),
+            br'\x5': (b'x\\', 1),
+        }
+        def check(input, expected, msg):
+            with self.assertWarns(DeprecationWarning) as cm:
+                self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
+            self.assertIn(msg, str(cm.warning))
+
+        check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
+        check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
+        check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')
+
+        check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
+        check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
+        check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
+        check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
+        check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')
+
+        check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
+        check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
+        check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
+        check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')
+
    # issue32583
    def test_crashing_decode_handler(self):
        # better generating one more character to fill the extra space slot

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -1196,23 +1196,39 @@ def test_escape(self):
        check(br"[\1010]", b"[A0]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
+
+    def test_warnings(self):
+        decode = codecs.escape_decode
+        check = coding_checker(self, decode)
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
+                        r'"\\%c" is an invalid escape sequence' % i):
                    check(b"\\" + b, b"\\" + b)
-            with self.assertWarns(DeprecationWarning):
+            with self.assertWarnsRegex(DeprecationWarning,
+                    r'"\\%c" is an invalid escape sequence' % (i-32)):
                check(b"\\" + b.upper(), b"\\" + b.upper())
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\8" is an invalid escape sequence'):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\\xfa" is an invalid escape sequence') as cm:
            check(b"\\\xfa", b"\\\xfa")
        for i in range(0o400, 0o1000):
-            with self.assertWarns(DeprecationWarning):
+            with self.assertWarnsRegex(DeprecationWarning,
+                    r'"\\%o" is an invalid octal escape sequence' % i):
                check(rb'\%o' % i, bytes([i & 0o377]))

+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\z" is an invalid escape sequence'):
+            self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\501" is an invalid octal escape sequence'):
+            self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
+
    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
@@ -2661,24 +2677,40 @@ def test_escape_decode(self):
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
+
+    def test_decode_warnings(self):
+        decode = codecs.unicode_escape_decode
+        check = coding_checker(self, decode)
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
+                        r'"\\%c" is an invalid escape sequence' % i):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
+                        r'"\\%c" is an invalid escape sequence' % (i-32)):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\8" is an invalid escape sequence'):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\\xfa" is an invalid escape sequence') as cm:
            check(b"\\\xfa", "\\\xfa")
        for i in range(0o400, 0o1000):
-            with self.assertWarns(DeprecationWarning):
+            with self.assertWarnsRegex(DeprecationWarning,
+                    r'"\\%o" is an invalid octal escape sequence' % i):
                check(rb'\%o' % i, chr(i))

+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\z" is an invalid escape sequence'):
+            self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
+        with self.assertWarnsRegex(DeprecationWarning,
+                r'"\\501" is an invalid octal escape sequence'):
+            self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
+
    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):

diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
@@ -1076,10 +1076,10 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
 }

 /* Unescape a backslash-escaped string. */
-PyObject *_PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape2(const char *s,
                                Py_ssize_t len,
                                const char *errors,
-                                const char **first_invalid_escape)
+                                int *first_invalid_escape)
 {
    int c;
    char *p;
@@ -1093,7 +1093,7 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
        return NULL;
    writer.overallocate = 1;

-    *first_invalid_escape = NULL;
+    *first_invalid_escape = -1;

    end = s + len;
    while (s < end) {
@@ -1131,9 +1131,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
                    c = (c<<3) + *s++ - '0';
            }
            if (c > 0377) {
-                if (*first_invalid_escape == NULL) {
-                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
-                                                    already incremented s. */
+                if (*first_invalid_escape == -1) {
+                    *first_invalid_escape = c;
                }
            }
            *p++ = c;
@@ -1174,9 +1173,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
            break;

        default:
-            if (*first_invalid_escape == NULL) {
-                *first_invalid_escape = s-1; /* Back up one char, since we've
-                                                already incremented s. */
+            if (*first_invalid_escape == -1) {
+                *first_invalid_escape = (unsigned char)s[-1];
            }
            *p++ = '\\';
            s--;
@@ -1196,16 +1194,15 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                                Py_ssize_t Py_UNUSED(unicode),
                                const char *Py_UNUSED(recode_encoding))
 {
-    const char* first_invalid_escape;
-    PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
+    int first_invalid_escape;
+    PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
                                             &first_invalid_escape);
    if (result == NULL)
        return NULL;
-    if (first_invalid_escape != NULL) {
-        unsigned char c = *first_invalid_escape;
-        if ('4' <= c && c <= '7') {
+    if (first_invalid_escape != -1) {
+        if (first_invalid_escape > 0xff) {
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                                 "b\"\\%.3s\" is an invalid octal escape sequence. "
+                                 "b\"\\%o\" is an invalid octal escape sequence. "
                                 "Such sequences will not work in the future. ",
                                 first_invalid_escape) < 0)
            {
@@ -1217,7 +1214,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                                 "b\"\\%c\" is an invalid escape sequence. "
                                 "Such sequences will not work in the future. ",
-                                 c) < 0)
+                                 first_invalid_escape) < 0)
            {
                Py_DECREF(result);
                return NULL;

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -6599,11 +6599,11 @@ _PyUnicode_GetNameCAPI(void)
 /* --- Unicode Escape Codec ----------------------------------------------- */

 PyObject *
-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
-                               const char **first_invalid_escape)
+                               int *first_invalid_escape)
 {
    const char *starts = s;
    _PyUnicodeWriter writer;
@@ -6613,7 +6613,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
    _PyUnicode_Name_CAPI *ucnhash_capi;

    // so we can remember if we've seen an invalid escape char or not
-    *first_invalid_escape = NULL;
+    *first_invalid_escape = -1;

    if (size == 0) {
        if (consumed) {
@@ -6701,9 +6701,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                }
            }
            if (ch > 0377) {
-                if (*first_invalid_escape == NULL) {
-                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
-                                                    already incremented s. */
+                if (*first_invalid_escape == -1) {
+                    *first_invalid_escape = ch;
                }
            }
            WRITE_CHAR(ch);
@@ -6798,9 +6797,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            goto error;

        default:
-            if (*first_invalid_escape == NULL) {
-                *first_invalid_escape = s-1; /* Back up one char, since we've
-                                                already incremented s. */
+            if (*first_invalid_escape == -1) {
+                *first_invalid_escape = c;
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
@@ -6845,17 +6843,16 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
                              const char *errors,
                              Py_ssize_t *consumed)
 {
-    const char *first_invalid_escape;
-    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+    int first_invalid_escape;
+    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
                                                      consumed,
                                                      &first_invalid_escape);
    if (result == NULL)
        return NULL;
-    if (first_invalid_escape != NULL) {
-        unsigned char c = *first_invalid_escape;
-        if ('4' <= c && c <= '7') {
+    if (first_invalid_escape != -1) {
+        if (first_invalid_escape > 0xff) {
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                                 "\"\\%.3s\" is an invalid octal escape sequence. "
+                                 "\"\\%o\" is an invalid octal escape sequence. "
                                 "Such sequences will not work in the future. ",
                                 first_invalid_escape) < 0)
            {
@@ -6867,7 +6864,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                                 "\"\\%c\" is an invalid escape sequence. "
                                 "Such sequences will not work in the future. ",
-                                 c) < 0)
+                                 first_invalid_escape) < 0)
            {
                Py_DECREF(result);
                return NULL;