Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler #129648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Fix use-after-free in the unicode-escape decoder with error handler
If an error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer into the decoded data becomes invalid
once that temporary bytes object is destroyed, so we need another way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have this issue, because it does not
use the error handler registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
  • Loading branch information
serhiy-storchaka committed Feb 4, 2025
commit 3a939ff2298d147459116f98a09549d0f1954039
4 changes: 2 additions & 2 deletions 4 Include/internal/pycore_bytesobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ extern PyObject* _PyBytes_FromHex(

// Helper for PyBytes_DecodeEscape that detects invalid escape chars.
// Export for test_peg_generator.
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, const char **);
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
const char *, int *);


// Substring Search.
Expand Down
8 changes: 4 additions & 4 deletions 8 Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,14 @@ extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful(
// Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
// chars.
// Export for test_peg_generator.
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed, /* bytes consumed */
const char **first_invalid_escape); /* on return, points to first
invalid escaped char in
string. */
int *first_invalid_escape); /* on return, if not -1, contain the first
invalid escaped char (<= 0xff) or invalid
octal escape (> 0xff) in string. */

/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */

Expand Down
39 changes: 38 additions & 1 deletion 39 Lib/test/test_codeccallbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import codecs
import html.entities
import itertools
import re
import sys
import unicodedata
import unittest
Expand Down Expand Up @@ -1125,7 +1126,7 @@ def test_bug828737(self):
text = 'abc<def>ghi'*n
text.translate(charmap)

def test_mutatingdecodehandler(self):
def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
Expand Down Expand Up @@ -1160,6 +1161,42 @@ def mutating(exc):
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")

def test_mutating_decode_handler_unicode_escape(self):
decode = codecs.unicode_escape_decode
def mutating(exc):
if isinstance(exc, UnicodeDecodeError):
r = data.get(exc.object[:exc.end])
if r is not None:
exc.object = r[0] + exc.object[exc.end:]
return ('\u0404', r[1])
raise AssertionError("don't know how to handle %r" % exc)

codecs.register_error('test.mutating2', mutating)
data = {
br'\x0': (b'\\', 0),
br'\x3': (b'xxx\\', 3),
br'\x5': (b'x\\', 1),
}
def check(input, expected, msg):
with self.assertWarns(DeprecationWarning) as cm:
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
self.assertIn(msg, str(cm.warning))

check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')

check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')

check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')

# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot
Expand Down
52 changes: 42 additions & 10 deletions 52 Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1196,23 +1196,39 @@ def test_escape(self):
check(br"[\1010]", b"[A0]")
check(br"[\x41]", b"[A]")
check(br"[\x410]", b"[A0]")

def test_warnings(self):
decode = codecs.escape_decode
check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtvx':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\%c" is an invalid escape sequence' % i):
check(b"\\" + b, b"\\" + b)
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\%c" is an invalid escape sequence' % (i-32)):
check(b"\\" + b.upper(), b"\\" + b.upper())
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\8" is an invalid escape sequence'):
check(br"\8", b"\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", b"\\9")
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\\xfa" is an invalid escape sequence') as cm:
check(b"\\\xfa", b"\\\xfa")
for i in range(0o400, 0o1000):
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\%o" is an invalid octal escape sequence' % i):
check(rb'\%o' % i, bytes([i & 0o377]))

with self.assertWarnsRegex(DeprecationWarning,
r'"\\z" is an invalid escape sequence'):
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
with self.assertWarnsRegex(DeprecationWarning,
r'"\\501" is an invalid octal escape sequence'):
self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))

def test_errors(self):
decode = codecs.escape_decode
self.assertRaises(ValueError, decode, br"\x")
Expand Down Expand Up @@ -2661,24 +2677,40 @@ def test_escape_decode(self):
check(br"[\x410]", "[A0]")
check(br"\u20ac", "\u20ac")
check(br"\U0001d120", "\U0001d120")

def test_decode_warnings(self):
decode = codecs.unicode_escape_decode
check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtuvx':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\%c" is an invalid escape sequence' % i):
check(b"\\" + b, "\\" + chr(i))
if b.upper() not in b'UN':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\%c" is an invalid escape sequence' % (i-32)):
check(b"\\" + b.upper(), "\\" + chr(i-32))
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\8" is an invalid escape sequence'):
check(br"\8", "\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", "\\9")
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\\xfa" is an invalid escape sequence') as cm:
check(b"\\\xfa", "\\\xfa")
for i in range(0o400, 0o1000):
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r'"\\%o" is an invalid octal escape sequence' % i):
check(rb'\%o' % i, chr(i))

with self.assertWarnsRegex(DeprecationWarning,
r'"\\z" is an invalid escape sequence'):
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
with self.assertWarnsRegex(DeprecationWarning,
r'"\\501" is an invalid octal escape sequence'):
self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))

def test_decode_errors(self):
decode = codecs.unicode_escape_decode
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
Expand Down
29 changes: 13 additions & 16 deletions 29 Objects/bytesobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1076,10 +1076,10 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
}

/* Unescape a backslash-escaped string. */
PyObject *_PyBytes_DecodeEscape(const char *s,
PyObject *_PyBytes_DecodeEscape2(const char *s,
Py_ssize_t len,
const char *errors,
const char **first_invalid_escape)
int *first_invalid_escape)
{
int c;
char *p;
Expand All @@ -1093,7 +1093,7 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
return NULL;
writer.overallocate = 1;

*first_invalid_escape = NULL;
*first_invalid_escape = -1;

end = s + len;
while (s < end) {
Expand Down Expand Up @@ -1131,9 +1131,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
c = (c<<3) + *s++ - '0';
}
if (c > 0377) {
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-3; /* Back up 3 chars, since we've
already incremented s. */
if (*first_invalid_escape == -1) {
*first_invalid_escape = c;
}
}
*p++ = c;
Expand Down Expand Up @@ -1174,9 +1173,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
break;

default:
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
if (*first_invalid_escape == -1) {
*first_invalid_escape = (unsigned char)s[-1];
}
*p++ = '\\';
s--;
Expand All @@ -1196,16 +1194,15 @@ PyObject *PyBytes_DecodeEscape(const char *s,
Py_ssize_t Py_UNUSED(unicode),
const char *Py_UNUSED(recode_encoding))
{
const char* first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
int first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
&first_invalid_escape);
if (result == NULL)
return NULL;
if (first_invalid_escape != NULL) {
unsigned char c = *first_invalid_escape;
if ('4' <= c && c <= '7') {
if (first_invalid_escape != -1) {
if (first_invalid_escape > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"b\"\\%.3s\" is an invalid octal escape sequence. "
"b\"\\%o\" is an invalid octal escape sequence. "
"Such sequences will not work in the future. ",
first_invalid_escape) < 0)
{
Expand All @@ -1217,7 +1214,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"b\"\\%c\" is an invalid escape sequence. "
"Such sequences will not work in the future. ",
c) < 0)
first_invalid_escape) < 0)
{
Py_DECREF(result);
return NULL;
Expand Down
29 changes: 13 additions & 16 deletions 29 Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -6599,11 +6599,11 @@ _PyUnicode_GetNameCAPI(void)
/* --- Unicode Escape Codec ----------------------------------------------- */

PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
int *first_invalid_escape)
{
const char *starts = s;
_PyUnicodeWriter writer;
Expand All @@ -6613,7 +6613,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
_PyUnicode_Name_CAPI *ucnhash_capi;

// so we can remember if we've seen an invalid escape char or not
*first_invalid_escape = NULL;
*first_invalid_escape = -1;

if (size == 0) {
if (consumed) {
Expand Down Expand Up @@ -6701,9 +6701,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
}
}
if (ch > 0377) {
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-3; /* Back up 3 chars, since we've
already incremented s. */
if (*first_invalid_escape == -1) {
*first_invalid_escape = ch;
}
}
WRITE_CHAR(ch);
Expand Down Expand Up @@ -6798,9 +6797,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
goto error;

default:
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
if (*first_invalid_escape == -1) {
*first_invalid_escape = c;
}
WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c);
Expand Down Expand Up @@ -6845,17 +6843,16 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
const char *errors,
Py_ssize_t *consumed)
{
const char *first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
int first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
consumed,
&first_invalid_escape);
if (result == NULL)
return NULL;
if (first_invalid_escape != NULL) {
unsigned char c = *first_invalid_escape;
if ('4' <= c && c <= '7') {
if (first_invalid_escape != -1) {
if (first_invalid_escape > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"\"\\%.3s\" is an invalid octal escape sequence. "
"\"\\%o\" is an invalid octal escape sequence. "
"Such sequences will not work in the future. ",
first_invalid_escape) < 0)
{
Expand All @@ -6867,7 +6864,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"\"\\%c\" is an invalid escape sequence. "
"Such sequences will not work in the future. ",
c) < 0)
first_invalid_escape) < 0)
{
Py_DECREF(result);
return NULL;
Expand Down
Loading
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.