gh-119182: Decode PyUnicode_FromFormat() format from UTF-8

PyUnicode_FromFormat() now decodes the format string from UTF-8 with the "replace" error handler, instead of decoding it from ASCII; a non-ASCII byte in the format string no longer raises ValueError. Also remove the unused 'consumed' parameter of unicode_decode_utf8_writer().
python · vstinner · Jun 7, 2024 · Jun 7, 2024 · Jun 11, 2024 · Jun 11, 2024
commit 3d5bca4d1fdaefcaaaeed7415c8f468fb4a2d8e7
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -387,7 +387,8 @@ APIs:
   arguments, calculate the size of the resulting Python Unicode string and return
   a string with the values formatted into it.  The variable arguments must be C
   types and must correspond exactly to the format characters in the *format*
-   ASCII-encoded string.
+   string. The *format* string is decoded from UTF-8 with the "replace" error
+   handler.

   A conversion specifier contains two or more characters and has the following
   components, which must occur in this order:
@@ -487,7 +488,8 @@ APIs:

      * - ``s``
        - :c:expr:`const char*` or :c:expr:`const wchar_t*`
-        - A null-terminated C character array.
+        - A null-terminated C character array. :c:expr:`const char*` is decoded
+          from UTF-8 with the "replace" error handler.

      * - ``p``
        - :c:expr:`const void*`
@@ -576,6 +578,9 @@ APIs:
   .. versionchanged:: 3.13
      Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added.

+   .. versionchanged:: 3.14
+      The format string is now decoded from UTF-8 instead of ASCII.
+

 .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs)


diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -261,6 +261,10 @@ New Features
 Porting to Python 3.14
 ----------------------

+* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with
+  the "replace" error handler, instead of decoding it from ASCII.
+  (Contributed by Victor Stinner in :gh:`119182`.)
+
 Deprecated
 ----------


diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -384,12 +384,12 @@ def check_format(expected, format, *args):
        check_format('ascii\x7f=unicode\xe9',
                     b'ascii\x7f=%U', 'unicode\xe9')

-        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
-        # raises an error
-        self.assertRaisesRegex(ValueError,
-            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
-            'string, got a non-ASCII byte: 0xe9$',
-            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
+        # Non-ASCII format and non-ASCII arguments are both decoded
+        # from UTF-8/replace
+        check_format('unicode\xe9=\u20ac',
+                     'unicode\xe9=%s'.encode(), '\u20ac'.encode())
+        check_format('invalid\ufffd=abc\ufffd',
+                     b'invalid\xe9=%s', b'abc\xe9')

        # test "%c"
        check_format('\uabcd',

diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst
@@ -0,0 +1,3 @@
+:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with
+the "replace" error handler, instead of decoding it from ASCII. Patch by
+Victor Stinner.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
-                           _Py_error_handler error_handler, const char *errors,
-                           Py_ssize_t *consumed);
+                           _Py_error_handler error_handler, const char *errors);
 #ifdef Py_DEBUG
 static inline int unicode_is_finalizing(void);
 static int unicode_is_singleton(PyObject *unicode);
@@ -2402,7 +2401,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,

    if (width < 0) {
        return unicode_decode_utf8_writer(writer, str, length,
-                                          _Py_ERROR_REPLACE, "replace", NULL);
+                                          _Py_ERROR_REPLACE, "replace");
    }

    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
@@ -2896,28 +2895,21 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
            const char *p;
            Py_ssize_t len;

-            p = f;
-            do
-            {
-                if ((unsigned char)*p > 127) {
-                    PyErr_Format(PyExc_ValueError,
-                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
-                        "string, got a non-ASCII byte: 0x%02x",
-                        (unsigned char)*p);
-                    goto fail;
-                }
-                p++;
+            p = strchr(f, '%');
+            if (p != NULL) {
+                len = p - f;
            }
-            while (*p != '\0' && *p != '%');
-            len = p - f;
-
-            if (*p == '\0')
+            else {
+                len = strlen(f);
                writer.overallocate = 0;
+            }

-            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
+            if (unicode_decode_utf8_writer(&writer, f, len,
+                                           _Py_ERROR_REPLACE, "replace") < 0) {
                goto fail;
+            }

-            f = p;
+            f += len;
        }
    }
    va_end(vargs2);
@@ -4930,13 +4922,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
-                           _Py_error_handler error_handler, const char *errors,
-                           Py_ssize_t *consumed)
+                           _Py_error_handler error_handler, const char *errors)
 {
    if (size == 0) {
-        if (consumed) {
-            *consumed = 0;
-        }
        return 0;
    }

@@ -4954,17 +4942,14 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
        writer->pos += decoded;

        if (decoded == size) {
-            if (consumed) {
-                *consumed = size;
-            }
            return 0;
        }
        s += decoded;
        size -= decoded;
    }

    return unicode_decode_utf8_impl(writer, starts, s, end,
-                                    error_handler, errors, consumed);
+                                    error_handler, errors, NULL);
 }