From 8114fb9e544dad15aa1c6e54efb2af74ca2d50e8 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Fri, 14 Sep 2018 01:13:07 +1000 Subject: [PATCH 01/14] bpo-34589: Move locale coercion back to the CLI app --- Include/coreconfig.h | 5 ++- Include/pylifecycle.h | 3 +- Lib/test/test_embed.py | 3 +- Modules/_testcapimodule.c | 6 ++-- Modules/main.c | 69 ++++++++++++++++++++++++++++----------- Programs/_testembed.c | 5 ++- Python/coreconfig.c | 34 ++++++------------- Python/pylifecycle.c | 25 ++++++++------ 8 files changed, 84 insertions(+), 66 deletions(-) diff --git a/Include/coreconfig.h b/Include/coreconfig.h index ef043ab02df6d43..c18f65210aa315d 100644 --- a/Include/coreconfig.h +++ b/Include/coreconfig.h @@ -63,8 +63,7 @@ typedef struct { int show_alloc_count; /* -X showalloccount */ int dump_refs; /* PYTHONDUMPREFS */ int malloc_stats; /* PYTHONMALLOCSTATS */ - int coerce_c_locale; /* PYTHONCOERCECLOCALE, -1 means unknown */ - int coerce_c_locale_warn; /* PYTHONCOERCECLOCALE=warn */ + int warn_on_c_locale; /* PYTHONCOERCECLOCALE=warn */ /* Python filesystem encoding and error handler: sys.getfilesystemencoding() and sys.getfilesystemencodeerrors(). @@ -314,7 +313,7 @@ typedef struct { .use_hash_seed = -1, \ .faulthandler = -1, \ .tracemalloc = -1, \ - .coerce_c_locale = -1, \ + .warn_on_c_locale = -1, \ .utf8_mode = -1, \ .argc = -1, \ .nmodule_search_path = -1, \ diff --git a/Include/pylifecycle.h b/Include/pylifecycle.h index 04e672e96e17cb5..c08e5374d90d2ca 100644 --- a/Include/pylifecycle.h +++ b/Include/pylifecycle.h @@ -175,7 +175,8 @@ PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size); /* Legacy locale support */ #ifndef Py_LIMITED_API -PyAPI_FUNC(void) _Py_CoerceLegacyLocale(int warn); +PyAPI_FUNC(int) _Py_CoerceLegacyLocale(const char **coercion_target, + const char **coercion_warning); PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void); PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category); #endif diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 9155c40f405ed08..24fe59f197feea0 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -273,8 +273,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'filesystem_errors': None, 'utf8_mode': 0, - 'coerce_c_locale': 0, - 'coerce_c_locale_warn': 0, + 'warn_on_c_locale': 0, 'pycache_prefix': '(null)', 'program_name': './_testembed', diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index add642f223afdc4..f5e067af8ef2521 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -4701,10 +4701,8 @@ get_coreconfig(PyObject *self, PyObject *Py_UNUSED(args)) PyLong_FromLong(config->dump_refs)); SET_ITEM("malloc_stats", PyLong_FromLong(config->malloc_stats)); - SET_ITEM("coerce_c_locale", - PyLong_FromLong(config->coerce_c_locale)); - SET_ITEM("coerce_c_locale_warn", - PyLong_FromLong(config->coerce_c_locale_warn)); + SET_ITEM("warn_on_c_locale", + PyLong_FromLong(config->warn_on_c_locale)); SET_ITEM("filesystem_encoding", FROM_STRING(config->filesystem_encoding)); SET_ITEM("filesystem_errors", diff --git a/Modules/main.c b/Modules/main.c index 455178af8bab12d..cfe2c9dae4f420c 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -318,6 +318,10 @@ typedef struct { char **bytes_argv; wchar_t **wchar_argv; + /* Allow the embedding app to emit a locale coercion message */ + const char *locale_coercion_warning; + const char *locale_coercion_target; + /* Exit status or "exit code": result of pymain_main() */ int status; /* Error message if a function failed */ @@ -1292,7 +1296,6 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, #endif _PyCoreConfig save_config = _PyCoreConfig_INIT; int res = -1; - int locale_coerced = 0; int loops = 0; if (_PyCoreConfig_Copy(&save_config, config) < 0) { @@ -1303,6 +1306,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, /* Set LC_CTYPE to the user preferred locale */ _Py_SetLocaleFromEnv(LC_CTYPE); + /* TODO: With locale coercion moved back to _PyUnix_Main, this can + * potentially be simplified now that it only needs to handle UTF-8 mode + */ while (1) { int utf8_mode = config->utf8_mode; int encoding_changed = 0; @@ -1332,22 +1338,6 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, goto done; } - /* The legacy C locale assumes ASCII as the default text encoding, which - * causes problems not only for the CPython runtime, but also other - * components like GNU readline. - * - * Accordingly, when the CLI detects it, it attempts to coerce it to a - * more capable UTF-8 based alternative. - * - * See the documentation of the PYTHONCOERCECLOCALE setting for more - * details. - */ - if (config->coerce_c_locale && !locale_coerced) { - locale_coerced = 1; - _Py_CoerceLegacyLocale(config->coerce_c_locale_warn); - encoding_changed = 1; - } - if (utf8_mode == -1) { if (config->utf8_mode == 1) { /* UTF-8 Mode enabled */ @@ -1367,7 +1357,6 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, /* Reset the configuration before reading again the configuration, just keep UTF-8 Mode value. */ int new_utf8_mode = config->utf8_mode; - int new_coerce_c_locale = config->coerce_c_locale; if (_PyCoreConfig_Copy(config, &save_config) < 0) { pymain->err = _Py_INIT_NO_MEMORY(); goto done; @@ -1375,7 +1364,6 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, pymain_clear_cmdline(pymain, cmdline); memset(cmdline, 0, sizeof(*cmdline)); config->utf8_mode = new_utf8_mode; - config->coerce_c_locale = new_coerce_c_locale; /* The encoding changed: read again the configuration with the new encoding */ @@ -1729,6 +1717,12 @@ pymain_init(_PyMain *pymain, PyInterpreterState **interp_p) pymain_init_stdio(pymain, config); + if (config->warn_on_c_locale && pymain->locale_coercion_warning != NULL) { + fprintf(stderr, pymain->locale_coercion_warning, pymain->locale_coercion_target); + pymain->locale_coercion_warning = NULL; + pymain->locale_coercion_target = NULL; + } + PyInterpreterState *interp; pymain->err = _Py_InitializeCore(&interp, config); if (_Py_INIT_FAILED(pymain->err)) { @@ -1739,6 +1733,7 @@ pymain_init(_PyMain *pymain, PyInterpreterState **interp_p) pymain_clear_config(config); config = &interp->core_config; + if (pymain_init_python_main(pymain, config, interp) < 0) { _Py_FatalInitError(pymain->err); } @@ -1789,6 +1784,42 @@ _Py_UnixMain(int argc, char **argv) pymain.argc = argc; pymain.bytes_argv = argv; + /* The legacy C locale assumes ASCII as the default text encoding, which + * causes problems not only for the CPython runtime, but also other + * components like GNU readline. + * + * Accordingly, when the CLI detects it, it attempts to coerce it to a + * more capable UTF-8 based alternative. + * + * See the documentation of the PYTHONCOERCECLOCALE setting for more + * details. + */ + _Py_SetLocaleFromEnv(LC_CTYPE); + if (_Py_LegacyLocaleDetected()) { + /* We ignore the Python -E and -I flags here, as the CLI needs to sort out + * the locale settings *before* we try to do anything with the command + * line arguments. For cross-platform debugging purposes, we also need + * to give end users a way to force even scripts that are otherwise + * isolated from their environment to use the legacy ASCII-centric C + * locale (otherwise a user on platform with a valid coercion target, + * like Fedora, can't readily debug a script that runs with -I or -E + * on a platform without a valid coercion target, like CentOS 7). + * + * Ignoring -E and -I is safe from a security perspective, as we only use + * the setting to turn *off* the implicit locale coercion, and anyone with + * access to the process environment already has the ability to set + * `LC_ALL=C` to override the C level locale settings anyway. + */ + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + _Py_CoerceLegacyLocale( + &pymain.locale_coercion_target, + &pymain.locale_coercion_warning + ); + } + } + + return pymain_main(&pymain); } diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 99772eacbdc42af..b4e842bf0389667 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -330,8 +330,7 @@ dump_config(void) printf("filesystem_encoding = %s\n", config->filesystem_encoding); printf("filesystem_errors = %s\n", config->filesystem_errors); - printf("coerce_c_locale = %i\n", config->coerce_c_locale); - printf("coerce_c_locale_warn = %i\n", config->coerce_c_locale_warn); + printf("warn_on_c_locale = %i\n", config->warn_on_c_locale); printf("utf8_mode = %i\n", config->utf8_mode); printf("pycache_prefix = %ls\n", config->pycache_prefix); @@ -607,7 +606,7 @@ static int test_init_isolated(void) _PyCoreConfig config = _PyCoreConfig_INIT; /* Set coerce_c_locale and utf8_mode to not depend on the locale */ - config.coerce_c_locale = 0; + config.warn_on_c_locale = 0; config.utf8_mode = 0; /* Use path starting with "./" avoids a search along the PATH */ config.program_name = L"./_testembed"; diff --git a/Python/coreconfig.c b/Python/coreconfig.c index fae32e533aa9cd5..918ca13921713f8 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -303,8 +303,7 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2) COPY_ATTR(dump_refs); COPY_ATTR(malloc_stats); - COPY_ATTR(coerce_c_locale); - COPY_ATTR(coerce_c_locale_warn); + COPY_ATTR(warn_on_c_locale); COPY_ATTR(utf8_mode); COPY_WSTR_ATTR(pycache_prefix); @@ -810,17 +809,11 @@ config_read_env_vars(_PyCoreConfig *config) const char *env = _PyCoreConfig_GetEnv(config, "PYTHONCOERCECLOCALE"); if (env) { - if (strcmp(env, "0") == 0) { - if (config->coerce_c_locale < 0) { - config->coerce_c_locale = 0; - } - } - else if (strcmp(env, "warn") == 0) { - config->coerce_c_locale_warn = 1; - } - else { - if (config->coerce_c_locale < 0) { - config->coerce_c_locale = 1; + if (config->warn_on_c_locale < 0) { + if (strcmp(env, "warn") == 0) { + config->warn_on_c_locale = 1; + } else { + config->warn_on_c_locale = 0; } } } @@ -967,13 +960,6 @@ config_read_complex_options(_PyCoreConfig *config) static void config_init_locale(_PyCoreConfig *config) { - if (config->coerce_c_locale < 0) { - /* The C locale enables the C locale coercion (PEP 538) */ - if (_Py_LegacyLocaleDetected()) { - config->coerce_c_locale = 1; - } - } - #ifndef MS_WINDOWS if (config->utf8_mode < 0) { /* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */ @@ -1291,7 +1277,7 @@ _PyCoreConfig_Read(_PyCoreConfig *config) } } - if (config->utf8_mode < 0 || config->coerce_c_locale < 0) { + if (config->utf8_mode < 0) { config_init_locale(config); } @@ -1321,8 +1307,8 @@ _PyCoreConfig_Read(_PyCoreConfig *config) if (config->tracemalloc < 0) { config->tracemalloc = 0; } - if (config->coerce_c_locale < 0) { - config->coerce_c_locale = 0; + if (config->warn_on_c_locale < 0) { + config->warn_on_c_locale = 0; } if (config->utf8_mode < 0) { config->utf8_mode = 0; @@ -1343,7 +1329,7 @@ _PyCoreConfig_Read(_PyCoreConfig *config) return err; } - assert(config->coerce_c_locale >= 0); + assert(config->warn_on_c_locale >= 0); assert(config->use_environment >= 0); assert(config->filesystem_encoding != NULL); assert(config->filesystem_errors != NULL); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 379e860b5ba4b4e..1ea49b73a2538b4 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -301,7 +301,7 @@ static const char *_C_LOCALE_WARNING = static void _emit_stderr_warning_for_legacy_locale(const _PyCoreConfig *core_config) { - if (core_config->coerce_c_locale_warn && _Py_LegacyLocaleDetected()) { + if (core_config->warn_on_c_locale && _Py_LegacyLocaleDetected()) { PySys_FormatStderr("%s", _C_LOCALE_WARNING); } } @@ -337,7 +337,7 @@ static const char C_LOCALE_COERCION_WARNING[] = "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; static void -_coerce_default_locale_settings(int warn, const _LocaleCoercionTarget *target) +_coerce_default_locale_settings(const _LocaleCoercionTarget *target) { const char *newloc = target->locale_name; @@ -350,24 +350,25 @@ _coerce_default_locale_settings(int warn, const _LocaleCoercionTarget *target) "Error setting LC_CTYPE, skipping C locale coercion\n"); return; } - if (warn) { - fprintf(stderr, C_LOCALE_COERCION_WARNING, newloc); - } /* Reconfigure with the overridden environment variables */ _Py_SetLocaleFromEnv(LC_ALL); } #endif -void -_Py_CoerceLegacyLocale(int warn) +int +_Py_CoerceLegacyLocale(const char **coercion_target, + const char **coercion_warning) { + int locale_was_coerced = 0; + *coercion_target = NULL; + *coercion_warning = NULL; #ifdef PY_COERCE_C_LOCALE char *oldloc = NULL; oldloc = _PyMem_RawStrdup(setlocale(LC_CTYPE, NULL)); if (oldloc == NULL) { - return; + return -1; } const char *locale_override = getenv("LC_ALL"); @@ -390,18 +391,22 @@ defined(HAVE_LANGINFO_H) && defined(CODESET) } #endif /* Successfully configured locale, so make it the default */ - _coerce_default_locale_settings(warn, target); + _coerce_default_locale_settings(target); + *coercion_target = target->locale_name; + *coercion_warning = C_LOCALE_COERCION_WARNING; + locale_was_coerced = 1; goto done; } } } + /* Failed to find a suitable target locale, so revert to the original */ /* No C locale warning here, as Py_Initialize will emit one later */ - setlocale(LC_CTYPE, oldloc); done: PyMem_RawFree(oldloc); #endif + return locale_was_coerced; } /* _Py_SetLocaleFromEnv() is a wrapper around setlocale(category, "") to From c46457e52bdd082899d7aa0f657be3c3c47a8a0f Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 20 Sep 2018 21:37:11 +1000 Subject: [PATCH 02/14] Add NEWS entry --- .../2018-09-20-21-36-57.bpo-34589.9zERCD.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2018-09-20-21-36-57.bpo-34589.9zERCD.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-09-20-21-36-57.bpo-34589.9zERCD.rst b/Misc/NEWS.d/next/Core and Builtins/2018-09-20-21-36-57.bpo-34589.9zERCD.rst new file mode 100644 index 000000000000000..6da8aa851e18cbb --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-09-20-21-36-57.bpo-34589.9zERCD.rst @@ -0,0 +1,7 @@ +The locale coercion implementation has been adjusted back to being closer to +the design documented in PEP 538. This means locale coercion is once again +solely the responsibility of the application embedding the CPython runtime +(in CPython's case, the CLI wrapper), with the runtime only providing +support to detect legacy locales, actually apply the changes needed to +coerce the locale to a UTF-8 based one, and to emit warnings when requested +via PYTHONCOERCECLOCALE=warn. From 7b453f1fd34e776ad2435972d4d610bcc9c6076a Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 20 Sep 2018 21:37:26 +1000 Subject: [PATCH 03/14] Update documentation --- Doc/using/cmdline.rst | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index b61df8a4b77dff5..ab99ec710201080 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -852,8 +852,46 @@ conflict. Availability: \*nix + .. note:: + + Running in the legacy C locale is `not actually supported`_, as it almost + inevitably leads to text encoding and decoding problems at operating + system interfaces. As such, the ability to disable locale coercion or emit + warnings when it occurs is provided solely as a debugging aid when + behavioural differences are encountered across different platforms + (where some platforms may not define a suitable coercion target), or + embedding applications (where some applications, including older versions + of CPython, will manage the locale differently from the way the CPython + 3.7+ CLI does). + + .. _not actually supported: https://www.python.org/dev/peps/pep-0011/#legacy-c-locale + + .. note:: + + As locale coercion occurs before the command line arguments are processed, + setting ``PYTHONCOERCECLOCALE=0`` will prevent locale coercion even when + the :option:`-E` or :option`-I` option is specified. The + ``PYTHONCOERCECLOCALE=warn`` case is handled later, and respects those + options as usual (i.e. no locale coercion warnings will ever be displayed + even when the :option:`-E` or :option`-I` option is specified). + .. versionadded:: 3.7 - See :pep:`538` for more details. + See :pep:`538` for more details. Note that the 3.7 implementation differs + from the PEP in a few key aspects due to some unintended interactions + with the :pep:`540` implementation: 1) ``LC_ALL=C`` must be used to + disable locale coercion when the :option:`-E` or :option`-I` option is + specified (``PYTHONCOERCECLOCALE=0`` won't work); 2) calling + :c:func:`Py_Initialize` in an embedding application will trigger locale + coercion when running in the C locale; 2) calling :c:func:`Py_Main` in + an embedding application will trigger locale coercion when running in the + C locale. + + .. versionchanged:: 3.8 + The locale coercion implementation has been brought back into line with + the approach described in :pep:`538`: calling :c:func:`Py_Initialize` and + :c:func:`Py_Main` will never implicitly trigger local coercion, and + ``PYTHONCOERCECLOCALE=0`` takes effect even when the :option:`-E` or + :option`-I` option is specified .. envvar:: PYTHONDEVMODE From a71ca03ac8f624412e532bc2a168b09ba1877bcf Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 20 Sep 2018 21:45:45 +1000 Subject: [PATCH 04/14] Add missing period --- Doc/using/cmdline.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index ab99ec710201080..ac1ece5146255a4 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -891,7 +891,7 @@ conflict. the approach described in :pep:`538`: calling :c:func:`Py_Initialize` and :c:func:`Py_Main` will never implicitly trigger local coercion, and ``PYTHONCOERCECLOCALE=0`` takes effect even when the :option:`-E` or - :option`-I` option is specified + :option`-I` option is specified. .. envvar:: PYTHONDEVMODE From bb16468bb69fb721ceadfd89f262d6b3172d4d6b Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 20 Sep 2018 22:21:24 +1000 Subject: [PATCH 05/14] Fix option markup (courtesy of 'make suspicious') --- Doc/using/cmdline.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index ac1ece5146255a4..90f3efdd7adda71 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -870,16 +870,16 @@ conflict. As locale coercion occurs before the command line arguments are processed, setting ``PYTHONCOERCECLOCALE=0`` will prevent locale coercion even when - the :option:`-E` or :option`-I` option is specified. The + the :option:`-E` or :option:`-I` option is specified. The ``PYTHONCOERCECLOCALE=warn`` case is handled later, and respects those options as usual (i.e. no locale coercion warnings will ever be displayed - even when the :option:`-E` or :option`-I` option is specified). + even when the :option:`-E` or :option:`-I` option is specified). .. versionadded:: 3.7 See :pep:`538` for more details. Note that the 3.7 implementation differs from the PEP in a few key aspects due to some unintended interactions with the :pep:`540` implementation: 1) ``LC_ALL=C`` must be used to - disable locale coercion when the :option:`-E` or :option`-I` option is + disable locale coercion when the :option:`-E` or :option:`-I` option is specified (``PYTHONCOERCECLOCALE=0`` won't work); 2) calling :c:func:`Py_Initialize` in an embedding application will trigger locale coercion when running in the C locale; 2) calling :c:func:`Py_Main` in @@ -891,7 +891,7 @@ conflict. the approach described in :pep:`538`: calling :c:func:`Py_Initialize` and :c:func:`Py_Main` will never implicitly trigger local coercion, and ``PYTHONCOERCECLOCALE=0`` takes effect even when the :option:`-E` or - :option`-I` option is specified. + :option:`-I` option is specified. .. envvar:: PYTHONDEVMODE From f4c686babca9870fc832797f2b960a6420c14f38 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 20 Sep 2018 22:44:56 +1000 Subject: [PATCH 06/14] Handle NULL output pointers in _Py_CoerceLegacyLocale --- Python/pylifecycle.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index b251a3687b3c0da..ae3389572111d7e 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -355,8 +355,12 @@ _Py_CoerceLegacyLocale(const char **coercion_target, const char **coercion_warning) { int locale_was_coerced = 0; - *coercion_target = NULL; - *coercion_warning = NULL; + if (coercion_target != NULL) { + *coercion_target = NULL; + } + if (coercion_warning != NULL) { + *coercion_warning = NULL; + } #ifdef PY_COERCE_C_LOCALE char *oldloc = NULL; @@ -386,8 +390,12 @@ defined(HAVE_LANGINFO_H) && defined(CODESET) #endif /* Successfully configured locale, so make it the default */ _coerce_default_locale_settings(target); - *coercion_target = target->locale_name; - *coercion_warning = C_LOCALE_COERCION_WARNING; + if (coercion_target != NULL) { + *coercion_target = target->locale_name; + } + if (coercion_warning != NULL) { + *coercion_warning = C_LOCALE_COERCION_WARNING; + } locale_was_coerced = 1; goto done; } From 5f8b61c7f06366d8e235457d1fb9a78342b8e8d3 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sun, 30 Sep 2018 17:08:05 +1000 Subject: [PATCH 07/14] Further adjustments to locale coercion - PYTHONCOERCECLOCALE is handled the same way as LANG, LC_CTYPE and LC_ALL (i.e. independently of -E and -I) - locale coercion tests are once again running in isolated mode - be explicit that the env var key and value must be encoded as ASCII to have any effect - implement bpo-30672, such that the POSIX locale is always handled the same way as the C locale, even when it isn't a simple platform level alias for the latter --- Doc/using/cmdline.rst | 76 +++++++++++++++++------------- Lib/test/test_c_locale_coercion.py | 11 ++--- Python/coreconfig.c | 42 ++++++++++++----- Python/pylifecycle.c | 18 +++---- 4 files changed, 88 insertions(+), 59 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 90f3efdd7adda71..ef0b3c386af99b0 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -806,23 +806,23 @@ conflict. .. envvar:: PYTHONCOERCECLOCALE - If set to the value ``0``, causes the main Python command line application - to skip coercing the legacy ASCII-based C and POSIX locales to a more - capable UTF-8 based alternative. - - If this variable is *not* set (or is set to a value other than ``0``), the - ``LC_ALL`` locale override environment variable is also not set, and the - current locale reported for the ``LC_CTYPE`` category is either the default - ``C`` locale, or else the explicitly ASCII-based ``POSIX`` locale, then the - Python CLI will attempt to configure the following locales for the - ``LC_CTYPE`` category in the order listed before loading the interpreter - runtime: + If set to the value ``0`` (encoded as ASCII bytes), causes the main Python + command line application to skip coercing the legacy ASCII-based C and + POSIX locales to a more capable UTF-8 based alternative. Locale coercion + is also skipped if the ``LC_ALL`` locale override environment variable is + set (as setting ``LC_CTYPE`` will have no effect in that case). + + Otherwise, when the current locale reported for the ``LC_CTYPE`` category is + either the default ``C`` locale, or else the explicitly ASCII-based ``POSIX`` + locale, then the Python CLI will attempt to configure the following locales + for the ``LC_CTYPE`` category in the order listed before beginning to + initalise the interpreter runtime: * ``C.UTF-8`` * ``C.utf8`` * ``UTF-8`` - If setting one of these locale categories succeeds, then the ``LC_CTYPE`` + If enabling one of these locale categories succeeds, then the ``LC_CTYPE`` environment variable will also be set accordingly in the current process environment before the Python runtime is initialized. This ensures that in addition to being seen by both the interpreter itself and other locale-aware @@ -842,38 +842,44 @@ conflict. For debugging purposes, setting ``PYTHONCOERCECLOCALE=warn`` will cause Python to emit warning messages on ``stderr`` if either the locale coercion activates, or else if a locale that *would* have triggered coercion is - still active when the Python runtime is initialized. + still active when the Python runtime is initialized. (Note: as with the + locale coercion disabling marker, both the environment variable name and + the ``warn`` value must be encoded as ASCII text in order to have the + described effect) Also note that even when locale coercion is disabled, or when it fails to - find a suitable target locale, :envvar:`PYTHONUTF8` will still activate by - default in legacy ASCII-based locales. Both features must be disabled in - order to force the interpreter to use ``ASCII`` instead of ``UTF-8`` for - system interfaces. + find a suitable target locale, UTF-8 mode (as described under + :envvar:`PYTHONUTF8`) will still activate by default in these legacy + ASCII-based locales. Both features must be explicitly disabled in order to + force the interpreter to use ``ASCII`` instead of ``UTF-8`` for system + interfaces. Availability: \*nix .. note:: - Running in the legacy C locale is `not actually supported`_, as it almost - inevitably leads to text encoding and decoding problems at operating - system interfaces. As such, the ability to disable locale coercion or emit - warnings when it occurs is provided solely as a debugging aid when - behavioural differences are encountered across different platforms + While UTF-8 mode is able to handle many of the issues that otherwise + arise, running in the legacy C locale is `not formally supported`_, as it + almost inevitably leads to text encoding and decoding problems at either + operating system interfaces or else between the interpreter runtime and + locale-aware extension modules. As such, the ability to disable locale + coercion or emit warnings when it occurs is provided solely as a debugging + aid when behavioural differences are encountered across different platforms (where some platforms may not define a suitable coercion target), or embedding applications (where some applications, including older versions of CPython, will manage the locale differently from the way the CPython 3.7+ CLI does). - .. _not actually supported: https://www.python.org/dev/peps/pep-0011/#legacy-c-locale + .. _not formally supported: https://www.python.org/dev/peps/pep-0011/#legacy-c-locale .. note:: - As locale coercion occurs before the command line arguments are processed, - setting ``PYTHONCOERCECLOCALE=0`` will prevent locale coercion even when - the :option:`-E` or :option:`-I` option is specified. The - ``PYTHONCOERCECLOCALE=warn`` case is handled later, and respects those - options as usual (i.e. no locale coercion warnings will ever be displayed - even when the :option:`-E` or :option:`-I` option is specified). + As a locale configuration variable, ``PYTHONCOERCECLOCALE`` is processed + in a way that is similar to the C level locale configuration variables + (``LANG``, ``LC_CTYPE``, and ``LC_ALL``): to have any effect, both the + key and the value must be encoded as ASCII bytes, and the setting is + processed even when :option:`-E` or :option:`-I` option is passed on the + command line. .. versionadded:: 3.7 See :pep:`538` for more details. Note that the 3.7 implementation differs @@ -887,11 +893,13 @@ conflict. C locale. .. versionchanged:: 3.8 - The locale coercion implementation has been brought back into line with - the approach described in :pep:`538`: calling :c:func:`Py_Initialize` and - :c:func:`Py_Main` will never implicitly trigger local coercion, and - ``PYTHONCOERCECLOCALE=0`` takes effect even when the :option:`-E` or - :option:`-I` option is specified. + Locale coercion is now triggered by the POSIX locale, even on platforms + where that isn't a simple alias for the C locale. + The locale coercion implementation has also been brought back into line + with the approach described in :pep:`538`: calling :c:func:`Py_Initialize` + and :c:func:`Py_Main` will never implicitly trigger locale coercion, and + ``PYTHONCOERCECLOCALE=0`` and ``PYTHONCOERCECLOCALE=warn`` take effect + even when the :option:`-E` or :option:`-I` option is specified. .. envvar:: PYTHONDEVMODE diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 1db293b9c37359b..8e66efa23185d1b 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -86,7 +86,7 @@ def _set_locale_in_subprocess(locale_name): # If there's no valid CODESET, we expect coercion to be skipped cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))" cmd = cmd_fmt.format(locale_name) - result, py_cmd = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='') + result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) return result.rc == 0 @@ -152,7 +152,7 @@ def get_child_details(cls, env_vars): """ result, py_cmd = run_python_until_end( "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT, - **env_vars + __isolated=True, **env_vars ) if not result.rc == 0: result.fail(py_cmd) @@ -166,15 +166,15 @@ def get_child_details(cls, env_vars): # Details of the shared library warning emitted at runtime LEGACY_LOCALE_WARNING = ( - "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "Python runtime initialized with a legacy locale that uses a default ASCII " + "encoding, which may cause Unicode compatibility problems. Using C.UTF-8, " "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " "locales is recommended." ) # Details of the CLI locale coercion warning emitted at runtime CLI_COERCION_WARNING_FMT = ( - "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale " + "Python detected a legacy locale: LC_CTYPE coerced to {} (set LC_ALL " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)." ) @@ -257,7 +257,6 @@ def test_external_target_locale_configuration(self): "LANG": "", "LC_CTYPE": "", "LC_ALL": "", - "PYTHONCOERCECLOCALE": "", } for env_var in ("LANG", "LC_CTYPE"): for locale_to_set in AVAILABLE_TARGETS: diff --git a/Python/coreconfig.c b/Python/coreconfig.c index 918ca13921713f8..6066b1c3271041f 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -761,6 +761,30 @@ get_env_flag(_PyCoreConfig *config, int *flag, const char *name) } } +static _PyInitError +config_read_unconditional_env_vars(_PyCoreConfig *config) +{ + if (config->warn_on_c_locale < 0) { + /* Special case: similar to the C level locale configuration variables + * like LANG, LC_CTYPE, and LC_ALL, PYTHONCOERCECLOCALE gets processed + * even when the environment is otherwise ignored. + * + * At the runtime level, we only need to determine whether to emit a + * warning or not - actually coercing the legacy C locale (or not) is + * the responsibility of the embedding application. + */ + const char *env = getenv("PYTHONCOERCECLOCALE"); + if (env) { + if (strcmp(env, "warn") == 0) { + config->warn_on_c_locale = 1; + } else { + config->warn_on_c_locale = 0; + } + } + } + + return _Py_INIT_OK(); +} static _PyInitError config_read_env_vars(_PyCoreConfig *config) @@ -807,17 +831,6 @@ config_read_env_vars(_PyCoreConfig *config) config->malloc_stats = 1; } - const char *env = _PyCoreConfig_GetEnv(config, "PYTHONCOERCECLOCALE"); - if (env) { - if (config->warn_on_c_locale < 0) { - if (strcmp(env, "warn") == 0) { - config->warn_on_c_locale = 1; - } else { - config->warn_on_c_locale = 0; - } - } - } - wchar_t *path; int res = _PyCoreConfig_GetEnvDup(config, &path, L"PYTHONPATH", "PYTHONPATH"); @@ -1236,6 +1249,13 @@ _PyCoreConfig_Read(_PyCoreConfig *config) } #endif + /* Read env vars that cannot be masked with -E or -I */ + err = config_read_unconditional_env_vars(config); + if (_Py_INIT_FAILED(err)) { + return err; + } + + /* Read the remaining environment variables */ if (config->use_environment) { err = config_read_env_vars(config); if (_Py_INIT_FAILED(err)) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index ae3389572111d7e..e6d38aae7712c96 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -273,13 +273,15 @@ int _Py_LegacyLocaleDetected(void) { #ifndef MS_WINDOWS - /* On non-Windows systems, the C locale is considered a legacy locale */ - /* XXX (ncoghlan): some platforms (notably Mac OS X) don't appear to treat - * the POSIX locale as a simple alias for the C locale, so - * we may also want to check for that explicitly. + /* On non-Windows systems, the ASCII-based C locale is considered a legacy + * locale that we don't trust to be accurate. + * Some platforms (notably Mac OS X and other *BSD platforms) don't treat + * the POSIX locale as a simple alias for the C locale, so we also check + * for that explicitly. */ const char *ctype_loc = setlocale(LC_CTYPE, NULL); - return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0; + return ctype_loc != NULL && + (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "C") == 0); #else /* Windows uses code pages instead of locales, so no locale is legacy */ return 0; @@ -287,8 +289,8 @@ _Py_LegacyLocaleDetected(void) } static const char *_C_LOCALE_WARNING = - "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "Python runtime initialized with a legacy locale that uses a default ASCII " + "encoding, which may cause Unicode compatibility problems. Using C.UTF-8, " "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " "locales is recommended.\n"; @@ -327,7 +329,7 @@ _Py_IsLocaleCoercionTarget(const char *ctype_loc) #ifdef PY_COERCE_C_LOCALE static const char C_LOCALE_COERCION_WARNING[] = - "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale " + "Python detected a legacy locale: LC_CTYPE coerced to %.20s (set LC_ALL " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; static void From e48e5b7bbab19cd97d0cc45883fd47b2f1f13811 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sun, 30 Sep 2018 17:18:40 +1000 Subject: [PATCH 08/14] bpo-30672: Make POSIX locale coercion tests unconditional --- Lib/test/test_c_locale_coercion.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 8e66efa23185d1b..d9bcfa17141888e 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -15,7 +15,7 @@ ) # Set the list of ways we expect to be able to ask for the "C" locale -EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"] +EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii", "POSIX"] # Set our expectation for the default encoding used in the C locale # for the filesystem encoding and the standard streams @@ -31,12 +31,6 @@ # Android defaults to using UTF-8 for all system interfaces EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8" EXPECTED_C_LOCALE_FS_ENCODING = "utf-8" - else: - # Linux distros typically alias the POSIX locale directly to the C - # locale. - # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be - # able to check this case unconditionally - EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX") elif sys.platform.startswith("aix"): # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1" @@ -53,7 +47,6 @@ # Note that the above expectations are still wrong in some cases, such as: # * Windows when PYTHONLEGACYWINDOWSFSENCODING is set # * Any platform other than AIX that uses latin-1 in the C locale -# * Any Linux distro where POSIX isn't a simple alias for the C locale # * Any Linux distro where the default locale is something other than "C" # # Options for dealing with this: From 454d3797ce11550ad1174275b0b4bb4126e00c1d Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 6 Oct 2018 18:21:32 +1000 Subject: [PATCH 09/14] Address cosmetic review comments --- Modules/main.c | 56 +++++++++++++++++++++++++------------------- Python/coreconfig.c | 10 ++++---- Python/pylifecycle.c | 10 +++++++- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index cfe2c9dae4f420c..6f673612284a97a 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1284,8 +1284,17 @@ pymain_read_conf_impl(_PyMain *pymain, _PyCoreConfig *config, } -/* Read the configuration and initialize the LC_CTYPE locale: - enable UTF-8 mode (PEP 540) and/or coerce the C locale (PEP 538). */ +/* Read the configuration. + * + * Initially processes command line and environment variables based on the + * active LC_CTYPE locale encoding, but will clear and reprocess them as UTF-8 + * if the initial processing indicates that UTF-8 mode should be enabled. + * + * This works because the settings to enable/disable UTF-8 mode are all 7-bit + * ASCII text, and text encodings that actually get used in practice as locale + * encodings are sufficiently compatible with ASCII to always give the same + * UTF-8 mode setting regardless of the active locale encoding. + */ static int pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, _PyCmdline *cmdline) @@ -1733,7 +1742,6 @@ pymain_init(_PyMain *pymain, PyInterpreterState **interp_p) pymain_clear_config(config); config = &interp->core_config; - if (pymain_init_python_main(pymain, config, interp) < 0) { _Py_FatalInitError(pymain->err); } @@ -1796,27 +1804,27 @@ _Py_UnixMain(int argc, char **argv) */ _Py_SetLocaleFromEnv(LC_CTYPE); if (_Py_LegacyLocaleDetected()) { - /* We ignore the Python -E and -I flags here, as the CLI needs to sort out - * the locale settings *before* we try to do anything with the command - * line arguments. For cross-platform debugging purposes, we also need - * to give end users a way to force even scripts that are otherwise - * isolated from their environment to use the legacy ASCII-centric C - * locale (otherwise a user on platform with a valid coercion target, - * like Fedora, can't readily debug a script that runs with -I or -E - * on a platform without a valid coercion target, like CentOS 7). - * - * Ignoring -E and -I is safe from a security perspective, as we only use - * the setting to turn *off* the implicit locale coercion, and anyone with - * access to the process environment already has the ability to set - * `LC_ALL=C` to override the C level locale settings anyway. - */ - const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); - if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { - _Py_CoerceLegacyLocale( - &pymain.locale_coercion_target, - &pymain.locale_coercion_warning - ); - } + /* We ignore the Python -E and -I flags here, as the CLI needs to sort out + * the locale settings *before* we try to do anything with the command + * line arguments. For cross-platform debugging purposes, we also need + * to give end users a way to force even scripts that are otherwise + * isolated from their environment to use the legacy ASCII-centric C + * locale (otherwise a user on platform with a valid coercion target, + * like Fedora, can't readily debug a script that runs with -I or -E + * on a platform without a valid coercion target, like CentOS 7). + * + * Ignoring -E and -I is safe from a security perspective, as we only use + * the setting to turn *off* the implicit locale coercion, and anyone with + * access to the process environment already has the ability to set + * `LC_ALL=C` to override the C level locale settings anyway. + */ + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + _Py_CoerceLegacyLocale( + &pymain.locale_coercion_target, + &pymain.locale_coercion_warning + ); + } } diff --git a/Python/coreconfig.c b/Python/coreconfig.c index 6066b1c3271041f..e50544256d7896d 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -775,12 +775,12 @@ config_read_unconditional_env_vars(_PyCoreConfig *config) */ const char *env = getenv("PYTHONCOERCECLOCALE"); if (env) { - if (strcmp(env, "warn") == 0) { - config->warn_on_c_locale = 1; - } else { - config->warn_on_c_locale = 0; - } + if (strcmp(env, "warn") == 0) { + config->warn_on_c_locale = 1; + } else { + config->warn_on_c_locale = 0; } + } } return _Py_INIT_OK(); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index e6d38aae7712c96..54a36da607023c3 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -272,6 +272,10 @@ initexternalimport(PyInterpreterState *interp) int _Py_LegacyLocaleDetected(void) { + /* This API is expected to be called before the Python runtime is + * initialised, so it needs to be very conservative as to which other APIs + * it calls, and which process resources it attempts to access. + */ #ifndef MS_WINDOWS /* On non-Windows systems, the ASCII-based C locale is considered a legacy * locale that we don't trust to be accurate. @@ -281,7 +285,7 @@ _Py_LegacyLocaleDetected(void) */ const char *ctype_loc = setlocale(LC_CTYPE, NULL); return ctype_loc != NULL && - (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "C") == 0); + (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0); #else /* Windows uses code pages instead of locales, so no locale is legacy */ return 0; @@ -356,6 +360,10 @@ int _Py_CoerceLegacyLocale(const char **coercion_target, const char **coercion_warning) { + /* This API is expected to be called before the Python runtime is + * initialised, so it needs to be very conservative as to which other APIs + * it calls, and which process resources it attempts to access. + */ int locale_was_coerced = 0; if (coercion_target != NULL) { *coercion_target = NULL; From c46a686ec42e09f01a3a1b34e98f4f38d90d7469 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 6 Dec 2018 23:00:51 +1000 Subject: [PATCH 10/14] Fix remaining merge issue --- Python/coreconfig.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Python/coreconfig.c b/Python/coreconfig.c index b9be390e5642b08..3febbd1aa1b3d5c 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -1518,8 +1518,7 @@ _PyCoreConfig_AsDict(const _PyCoreConfig *config) SET_ITEM_INT(show_alloc_count); SET_ITEM_INT(dump_refs); SET_ITEM_INT(malloc_stats); - SET_ITEM_INT(coerce_c_locale); - SET_ITEM_INT(coerce_c_locale_warn); + SET_ITEM_INT(warn_on_c_locale); SET_ITEM_STR(filesystem_encoding); SET_ITEM_STR(filesystem_errors); SET_ITEM_INT(utf8_mode); From 5a07ecb8d955e08e229ce836859172bb1ba47b52 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Thu, 6 Dec 2018 23:04:36 +1000 Subject: [PATCH 11/14] Note another potential test fragility resolution --- Lib/test/test_c_locale_coercion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index c1ccbe8751c7a61..4b153ec663d6d6e 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -56,6 +56,8 @@ # * Don't set the PY_COERCE_C_LOCALE preprocessor definition on # such platforms (e.g. it isn't set on Windows) # * Fix the test expectations to match the actual platform behaviour +# * Change the tests to be simply "Don't crash" tests when run on systems +# where we're less certain about the expected locale coercion behaviour # In order to get the warning messages to match up as expected, the candidate # order here must much the target locale order in Python/pylifecycle.c From 06b6849cf12d650e4c60e089eefd08790ab94b24 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Fri, 7 Dec 2018 00:02:29 +1000 Subject: [PATCH 12/14] Ensure there's no implicit C locale coercion --- Lib/test/test_embed.py | 10 ++++++++++ Modules/main.c | 2 +- Programs/_testembed.c | 45 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 645f7920e1da1ff..1b4dae3113619d1 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -596,6 +596,16 @@ def test_init_isolated(self): } self.check_config("init_isolated", config) +class NoImplicitLocaleCoercion(EmbeddingTestsMixin, unittest.TestCase): + + def test_no_implicit_locale_coercion_in_Py_Initialize(self): + out, err = self.run_embedded_interpreter("init_api_does_not_coerce_c_locale") + self.assertEqual(err, '') + + def test_no_implicit_locale_coercion_in_Py_Main(self): + out, err = self.run_embedded_interpreter("main_api_does_not_coerce_c_locale") + self.assertTrue(out.startswith('Python ')) + self.assertEqual(err, '') if __name__ == "__main__": unittest.main() diff --git a/Modules/main.c b/Modules/main.c index db3b5a455f32642..66408edd11d902a 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1318,7 +1318,7 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config, /* Set LC_CTYPE to the user preferred locale */ _Py_SetLocaleFromEnv(LC_CTYPE); - /* TODO: With locale coercion moved back to _PyUnix_Main, this can + /* TODO: With locale coercion moved back to _PyUnixMain, this can * potentially be simplified now that it only needs to handle UTF-8 mode */ while (1) { diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 0ef02ef74c349f3..f6e307bac928dd6 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -1,6 +1,7 @@ #include #include "pythread.h" #include +#include #include #include @@ -457,7 +458,7 @@ static int test_init_from_config(void) putenv("PYTHONMALLOCSTATS=0"); config.malloc_stats = 1; - /* FIXME: test coerce_c_locale and coerce_c_locale_warn */ + /* FIXME: explcitily test warn_on_c_locale */ putenv("PYTHONUTF8=0"); Py_UTF8Mode = 0; @@ -643,6 +644,46 @@ static int test_init_dev_mode(void) } +/* C locale coercion is a property of the Python CLI only when running + * as a standalone application, NOT when used as a support library (as + * arbitrary other code may have executed in the original locale before + * the Python runtime is started). + * + * Accordingly, the follow tests ensure C locale coercion does NOT occur as a + * side effect of initialising the library. + */ +static int _c_locale_is_active(void) +{ + const char *ctype_loc = setlocale(LC_CTYPE, NULL); + return (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0); +} + +static int test_init_api_does_not_coerce_c_locale(void) +{ + putenv("PYTHONCOERCECLOCALE=1"); + putenv("LANG=C"); + putenv("LC_CTYPE=C"); + _testembed_Py_Initialize(); + Py_Finalize(); + return _c_locale_is_active() ? 0 : -1; +} + +static int test_main_api_does_not_coerce_c_locale(void) +{ + int result = 0; + wchar_t *argv[] = {L"./_testembed", L"-V"}; + + putenv("PYTHONCOERCECLOCALE=1"); + putenv("LANG=C"); + putenv("LC_CTYPE=C"); + result = Py_Main(Py_ARRAY_LENGTH(argv), argv); + if (result) { + return result; + } + return _c_locale_is_active() ? 0 : -1; +} + + /* ********************************************************* * List of test cases and the function that implements it. * @@ -675,6 +716,8 @@ static struct TestCase TestCases[] = { { "init_env", test_init_env }, { "init_dev_mode", test_init_dev_mode }, { "init_isolated", test_init_isolated }, + { "init_api_does_not_coerce_c_locale", test_init_api_does_not_coerce_c_locale }, + { "main_api_does_not_coerce_c_locale", test_main_api_does_not_coerce_c_locale }, { NULL, NULL } }; From 590e64e5a89522389bf53cc5ddcd32380ba1e3cf Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Fri, 7 Dec 2018 01:08:01 +1000 Subject: [PATCH 13/14] Reinstate support for -E/-I affecting PYTHONCOERCECLOCALE --- Include/cpython/pylifecycle.h | 3 +- Modules/main.c | 27 ++++-------------- Python/pylifecycle.c | 52 ++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/Include/cpython/pylifecycle.h b/Include/cpython/pylifecycle.h index cd558a336610c94..de9a1844f37e6c5 100644 --- a/Include/cpython/pylifecycle.h +++ b/Include/cpython/pylifecycle.h @@ -90,9 +90,10 @@ PyAPI_FUNC(int) _PyOS_URandom(void *buffer, Py_ssize_t size); PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size); /* Legacy locale support */ +PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void); +PyAPI_FUNC(int) _Py_LegacyLocaleCoercionEnabled(int argc, char **argv); PyAPI_FUNC(int) _Py_CoerceLegacyLocale(const char **coercion_target, const char **coercion_warning); -PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void); PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category); diff --git a/Modules/main.c b/Modules/main.c index 66408edd11d902a..c1878a5691a71d6 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1867,28 +1867,11 @@ _Py_UnixMain(int argc, char **argv) * details. */ _Py_SetLocaleFromEnv(LC_CTYPE); - if (_Py_LegacyLocaleDetected()) { - /* We ignore the Python -E and -I flags here, as the CLI needs to sort out - * the locale settings *before* we try to do anything with the command - * line arguments. For cross-platform debugging purposes, we also need - * to give end users a way to force even scripts that are otherwise - * isolated from their environment to use the legacy ASCII-centric C - * locale (otherwise a user on platform with a valid coercion target, - * like Fedora, can't readily debug a script that runs with -I or -E - * on a platform without a valid coercion target, like CentOS 7). - * - * Ignoring -E and -I is safe from a security perspective, as we only use - * the setting to turn *off* the implicit locale coercion, and anyone with - * access to the process environment already has the ability to set - * `LC_ALL=C` to override the C level locale settings anyway. - */ - const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); - if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { - _Py_CoerceLegacyLocale( - &pymain.locale_coercion_target, - &pymain.locale_coercion_warning - ); - } + if (_Py_LegacyLocaleDetected() && _Py_LegacyLocaleCoercionEnabled(argc, argv)) { + _Py_CoerceLegacyLocale( + &pymain.locale_coercion_target, + &pymain.locale_coercion_warning + ); } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index c80c34462cd3ab0..00b3ad0464745ea 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -248,7 +248,7 @@ initexternalimport(PyInterpreterState *interp) * Accordingly, when the CLI detects it, it attempts to coerce it to a * more capable UTF-8 based alternative as follows: * - * if (_Py_LegacyLocaleDetected()) { + * if (_Py_LegacyLocaleDetected() && _Py_LegacyLocaleCoercionEnabled(argc, argv)) { * _Py_CoerceLegacyLocale(); * } * @@ -282,6 +282,56 @@ _Py_LegacyLocaleDetected(void) #endif } +int +_Py_LegacyLocaleCoercionEnabled(int argc, char **argv) +{ + int enable_coercion = -1; + if (argc >= 1 && argv != NULL) { + /* Rudimentary arg parsing to allow -E and -I to disable + * PYTHONCOERCECLOCALE, even though locale coercion occurs + * well in advance of the main arg parsing loop. + * + * Ignore argv[0], as that's the program name. + */ + for (int i = 1; i < argc; i++) { + const char* arg = argv[i]; + if (arg[0] != '-') { + /* Non-option arg is a path to execute */ + break; + } else { + size_t arg_len = strlen(arg); + if (arg_len == 1) { + /* '-' is the end of the Python arg list */ + break; + } + if (arg_len == 2) { + char option = arg[1]; + if (option == 'X' || option == 'W') { + /* Skip argument to -X or -W, and check next arg */ + i++; + continue; + } + if (option == 'E' || option == 'I') { + /* If the environment is ignored, locale coercion is always on */ + enable_coercion = 1; + break; + } + if (option == 'c' || option == 'm') { + /* -c and -m also mark the end of the Python arg list */ + break; + } + } + } + } + } + if (enable_coercion < 0) { + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + enable_coercion = (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0); + } + return enable_coercion; +} + + static const char *_C_LOCALE_WARNING = "Python runtime initialized with a legacy locale that uses a default ASCII " "encoding, which may cause Unicode compatibility problems. Using C.UTF-8, " From da12f9d9c318d17ba662b4aa4acee85ac5fc7fad Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Fri, 7 Dec 2018 22:27:17 +1000 Subject: [PATCH 14/14] Address self-review comments --- Lib/test/test_c_locale_coercion.py | 54 ++++++++++++--- Programs/_testembed.c | 2 + Python/coreconfig.c | 44 ++++-------- Python/pylifecycle.c | 103 +++++++++++++++++------------ 4 files changed, 121 insertions(+), 82 deletions(-) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 4b153ec663d6d6e..a0b9420b822593e 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -137,7 +137,7 @@ def _handle_output_variations(data): return data @classmethod - def get_child_details(cls, env_vars): + def get_child_details(cls, env_vars, isolated): """Retrieves fsencoding and standard stream details from a child process Returns (encoding_details, stderr_lines): @@ -150,7 +150,7 @@ def get_child_details(cls, env_vars): """ result, py_cmd = run_python_until_end( "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT, - __isolated=True, **env_vars + __isolated=isolated, **env_vars ) if not result.rc == 0: result.fail(py_cmd) @@ -219,7 +219,8 @@ def _check_child_encoding_details(self, expected_fs_encoding, expected_stream_encoding, expected_warnings, - coercion_expected): + coercion_expected, + isolated): """Check the C locale handling for the given process environment Parameters: @@ -227,7 +228,7 @@ def _check_child_encoding_details(self, expected_stream_encoding: expected encoding for standard streams expected_warning: stderr output to expect (if any) """ - result = EncodingDetails.get_child_details(env_vars) + result = EncodingDetails.get_child_details(env_vars, isolated=isolated) encoding_details, stderr_lines = result expected_details = EncodingDetails.get_expected_details( coercion_expected, @@ -281,7 +282,14 @@ def test_external_target_locale_configuration(self): expected_fs_encoding, expected_stream_encoding, expected_warnings=None, - coercion_expected=False) + coercion_expected=False, + isolated=False) + self._check_child_encoding_details(var_dict, + expected_fs_encoding, + expected_stream_encoding, + expected_warnings=None, + coercion_expected=False, + isolated=True) @@ -296,6 +304,7 @@ def _check_c_locale_coercion(self, coerce_c_locale, expected_warnings=None, coercion_expected=True, + isolated=False, **extra_vars): """Check the C locale handling for various configurations @@ -348,7 +357,8 @@ def _check_c_locale_coercion(self, fs_encoding, stream_encoding, _expected_warnings, - _coercion_expected) + _coercion_expected, + isolated) # Check behaviour for explicitly configured locales for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS: @@ -363,7 +373,8 @@ def _check_c_locale_coercion(self, fs_encoding, stream_encoding, expected_warnings, - coercion_expected) + coercion_expected, + isolated) def test_PYTHONCOERCECLOCALE_not_set(self): # This should coerce to the first available target locale by default @@ -381,6 +392,18 @@ def test_PYTHONCOERCECLOCALE_set_to_warn(self): coerce_c_locale="warn", expected_warnings=[CLI_COERCION_WARNING]) + def test_PYTHONCOERCECLOCALE_set_to_warn_when_isolated(self): + # PYTHONCOERCECLOCALE=warn should be ignored by the -I switch + self._check_c_locale_coercion("utf-8", "utf-8", + coerce_c_locale="warn", + isolated=True) + # Setting LC_ALL=C should still render the locale coercion ineffective + self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING, + EXPECTED_C_LOCALE_STREAM_ENCODING, + coerce_c_locale="warn", + LC_ALL="C", + isolated=True, + coercion_expected=False) def test_PYTHONCOERCECLOCALE_set_to_zero(self): # The setting "0" should result in the locale coercion being disabled @@ -395,6 +418,19 @@ def test_PYTHONCOERCECLOCALE_set_to_zero(self): LC_ALL="C", coercion_expected=False) + def test_PYTHONCOERCECLOCALE_set_to_zero_when_isolated(self): + # The setting "0" should be ignored by the -I switch + self._check_c_locale_coercion("utf-8", "utf-8", + coerce_c_locale="0", + isolated=True) + # Setting LC_ALL=C should still render the locale coercion ineffective + self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING, + EXPECTED_C_LOCALE_STREAM_ENCODING, + coerce_c_locale="0", + LC_ALL="C", + isolated=True, + coercion_expected=False) + def test_LC_ALL_set_to_C(self): # Setting LC_ALL should render the locale coercion ineffective self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING, @@ -415,13 +451,13 @@ def test_PYTHONCOERCECLOCALE_set_to_one(self): old_loc = locale.setlocale(locale.LC_CTYPE, None) self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc) loc = locale.setlocale(locale.LC_CTYPE, "") - if loc == "C": + if loc in ("C", "POSIX"): self.skipTest("test requires LC_CTYPE locale different than C") if loc in TARGET_LOCALES : self.skipTest("coerced LC_CTYPE locale: %s" % loc) # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale - # if it's not equal to "C" + # if it's not equal to "C" or "POSIX" code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))' env = dict(os.environ, PYTHONCOERCECLOCALE='1') cmd = subprocess.run([sys.executable, '-c', code], diff --git a/Programs/_testembed.c b/Programs/_testembed.c index f6e307bac928dd6..b9fc671102b45a6 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -663,6 +663,7 @@ static int test_init_api_does_not_coerce_c_locale(void) putenv("PYTHONCOERCECLOCALE=1"); putenv("LANG=C"); putenv("LC_CTYPE=C"); + putenv("LC_ALL="); _testembed_Py_Initialize(); Py_Finalize(); return _c_locale_is_active() ? 0 : -1; @@ -676,6 +677,7 @@ static int test_main_api_does_not_coerce_c_locale(void) putenv("PYTHONCOERCECLOCALE=1"); putenv("LANG=C"); putenv("LC_CTYPE=C"); + putenv("LC_ALL="); result = Py_Main(Py_ARRAY_LENGTH(argv), argv); if (result) { return result; diff --git a/Python/coreconfig.c b/Python/coreconfig.c index 3febbd1aa1b3d5c..43d98361152e12c 100644 --- a/Python/coreconfig.c +++ b/Python/coreconfig.c @@ -855,31 +855,6 @@ get_env_flag(_PyCoreConfig *config, int *flag, const char *name) } } -static _PyInitError -config_read_unconditional_env_vars(_PyCoreConfig *config) -{ - if (config->warn_on_c_locale < 0) { - /* Special case: similar to the C level locale configuration variables - * like LANG, LC_CTYPE, and LC_ALL, PYTHONCOERCECLOCALE gets processed - * even when the environment is otherwise ignored. - * - * At the runtime level, we only need to determine whether to emit a - * warning or not - actually coercing the legacy C locale (or not) is - * the responsibility of the embedding application. - */ - const char *env = getenv("PYTHONCOERCECLOCALE"); - if (env) { - if (strcmp(env, "warn") == 0) { - config->warn_on_c_locale = 1; - } else { - config->warn_on_c_locale = 0; - } - } - } - - return _Py_INIT_OK(); -} - static _PyInitError config_read_env_vars(_PyCoreConfig *config) { @@ -914,6 +889,17 @@ config_read_env_vars(_PyCoreConfig *config) "PYTHONLEGACYWINDOWSSTDIO"); #endif + if (config->warn_on_c_locale < 0) { + const char *env = _PyCoreConfig_GetEnv(config, "PYTHONCOERCECLOCALE"); + if (env) { + if (strcmp(env, "warn") == 0) { + config->warn_on_c_locale = 1; + } else { + config->warn_on_c_locale = 0; + } + } + } + if (config->allocator == NULL) { config->allocator = _PyCoreConfig_GetEnv(config, "PYTHONMALLOC"); } @@ -1345,13 +1331,7 @@ _PyCoreConfig_Read(_PyCoreConfig *config) } #endif - /* Read env vars that cannot be masked with -E or -I */ - err = config_read_unconditional_env_vars(config); - if (_Py_INIT_FAILED(err)) { - return err; - } - - /* Read the remaining environment variables */ + /* Read the environment variables */ if (config->use_environment) { err = config_read_env_vars(config); if (_Py_INIT_FAILED(err)) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 00b3ad0464745ea..77ac8e3c60a6675 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -282,47 +282,68 @@ _Py_LegacyLocaleDetected(void) #endif } +static int +_parse_arg_for_locale_coercion_options(const char *arg, int *enable_coercion) +{ + int skip_next_arg = 0; + size_t arg_len = strlen(arg); + if (arg_len < 2 || arg[0] != '-') { + /* Empty strings, '-', and a non-option string terminate the arg list */ + skip_next_arg = -1; + } else if (arg[1] != '-') { + /* Short option string to check for locale coercion related settings */ + for (size_t optind = 1; optind < arg_len; optind++) { + char option = arg[optind]; + if (option == 'X' || option == 'W') { + /* Skip checking argument to -X or -W */ + skip_next_arg = 1; + break; + } + if (option == 'c' || option == 'm') { + /* -c and -m also mark the end of the Python arg list */ + skip_next_arg = -1; + break; + } + if (option == 'E' || option == 'I') { + /* If the environment is ignored, locale coercion is always on */ + *enable_coercion = 1; + break; + } + } + } + + return skip_next_arg; +} + +static int +_parse_cmdline_for_locale_coercion(int argc, char **argv) +{ + /* Rudimentary arg parsing to allow -E and -I to disable + * PYTHONCOERCECLOCALE, even though locale coercion occurs + * well in advance of the main arg parsing loop. + * + * Ignores argv[0], as that's the program name. + */ + int enable_coercion = -1; + for (int i = 1; i < argc; i++) { + const char* arg = argv[i]; + int skip_next_arg = _parse_arg_for_locale_coercion_options(arg, &enable_coercion); + if (skip_next_arg < 0) { + break; /* Skip checking all remaining args */ + } + if (skip_next_arg) { + i++; /* Skip checking a single arg */ + } + } + return enable_coercion; +} + int _Py_LegacyLocaleCoercionEnabled(int argc, char **argv) { int enable_coercion = -1; if (argc >= 1 && argv != NULL) { - /* Rudimentary arg parsing to allow -E and -I to disable - * PYTHONCOERCECLOCALE, even though locale coercion occurs - * well in advance of the main arg parsing loop. - * - * Ignore argv[0], as that's the program name. - */ - for (int i = 1; i < argc; i++) { - const char* arg = argv[i]; - if (arg[0] != '-') { - /* Non-option arg is a path to execute */ - break; - } else { - size_t arg_len = strlen(arg); - if (arg_len == 1) { - /* '-' is the end of the Python arg list */ - break; - } - if (arg_len == 2) { - char option = arg[1]; - if (option == 'X' || option == 'W') { - /* Skip argument to -X or -W, and check next arg */ - i++; - continue; - } - if (option == 'E' || option == 'I') { - /* If the environment is ignored, locale coercion is always on */ - enable_coercion = 1; - break; - } - if (option == 'c' || option == 'm') { - /* -c and -m also mark the end of the Python arg list */ - break; - } - } - } - } + enable_coercion = _parse_cmdline_for_locale_coercion(argc, argv); } if (enable_coercion < 0) { const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); @@ -414,6 +435,9 @@ _Py_CoerceLegacyLocale(const char **coercion_target, #ifdef PY_COERCE_C_LOCALE char *oldloc = NULL; + /* Everything the raw allocator needs is statically allocated, so it can + * be used here to preserve the original locale for possible restoration. + */ oldloc = _PyMem_RawStrdup(setlocale(LC_CTYPE, NULL)); if (oldloc == NULL) { return -1; @@ -472,9 +496,6 @@ _Py_SetLocaleFromEnv(int category) #ifdef __ANDROID__ const char *locale; const char **pvar; -#ifdef PY_COERCE_C_LOCALE - const char *coerce_c_locale; -#endif const char *utf8_locale = "C.UTF-8"; const char *env_var_set[] = { "LC_ALL", @@ -504,8 +525,7 @@ _Py_SetLocaleFromEnv(int category) * string, the implementation-defined default locale shall be used." */ #ifdef PY_COERCE_C_LOCALE - coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); - if (coerce_c_locale == NULL || strcmp(coerce_c_locale, "0") != 0) { + if (_Py_LegacyLocaleCoercionEnabled(0, NULL)) { /* Some other ported code may check the environment variables (e.g. in * extension modules), so we make sure that they match the locale * configuration */ @@ -515,6 +535,7 @@ _Py_SetLocaleFromEnv(int category) } } #endif + res = setlocale(category, utf8_locale); #else /* !defined(__ANDROID__) */ res = setlocale(category, "");