From 10a60941ede0a0113165d8358afe2f30889812f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 20:58:19 +0200 Subject: [PATCH 01/97] add implementation --- Modules/_fnmatchmodule.c | 246 ++++++++++++++++++++++++++++++ Modules/clinic/_fnmatchmodule.c.h | 192 +++++++++++++++++++++++ 2 files changed, 438 insertions(+) create mode 100644 Modules/_fnmatchmodule.c create mode 100644 Modules/clinic/_fnmatchmodule.c.h diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatchmodule.c new file mode 100644 index 00000000000000..a5b77ec4c8d4f2 --- /dev/null +++ b/Modules/_fnmatchmodule.c @@ -0,0 +1,246 @@ +/* + * C accelerator for the 'fnmatch' module. + * + * Most functions expect string or bytes instances, and thus the Python + * implementation should first pre-process path-like objects, and possibly + * applying normalizations depending on the platform if needed. + */ + +#include "Python.h" + +#include "clinic/_fnmatchmodule.c.h" + +/*[clinic input] +module _fnmatch +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ + +#include <fnmatch.h> + +static inline int +validate_encoded_object(PyObject *name) +{ + if (!PyBytes_Check(name)) { + PyErr_Format(PyExc_TypeError, + "name must be a bytes object, got %.200s", + Py_TYPE(name)->tp_name); + return 0; + } + return 1; +} + +static inline int +validate_unicode_object(PyObject *name) +{ + if (!PyUnicode_Check(name)) { + PyErr_Format(PyExc_TypeError, + "name must be a string object, got %.200s", + Py_TYPE(name)->tp_name); + return 0; + } + return 1; +} + +static inline int +posix_fnmatch_encoded(const char *pattern, PyObject *name) +{ + if (!validate_encoded_object(name)) { + return -1; + } + // case-insensitive match +#ifdef FNM_CASEFOLD + return fnmatch(pattern, PyBytes_AS_STRING(name), FNM_CASEFOLD) == 0; +#else + // todo: fallback to Python implementation + return -1; +#endif +} + +static inline 
int +posix_fnmatchcase_encoded(const char *pattern, PyObject *name) +{ + if (!validate_encoded_object(name)) { + return -1; + } + // case-sensitive match + return fnmatch(pattern, PyBytes_AS_STRING(name), 0) == 0; +} + +static inline int +posix_fnmatch_unicode(const char *pattern, PyObject *name) +{ + if (!validate_unicode_object(name)) { + return -1; + } + // case-insensitive match +#ifdef FNM_CASEFOLD + return fnmatch(pattern, PyUnicode_AsUTF8(name), FNM_CASEFOLD) == 0; +#else + // todo: fallback to Python implementation + return -1; +#endif +} + +static inline int +posix_fnmatchcase_unicode(const char *pattern, PyObject *name) +{ + if (!validate_unicode_object(name)) { + return -1; + } + // case-sensitive match + return fnmatch(pattern, PyUnicode_AsUTF8(name), 0) == 0; +} + +static PyObject * +_fnmatch_filter_generic_impl(PyObject *module, + PyObject *names, + const char *pattern, + int (*match)(const char *, PyObject *)) +{ + PyObject *iter = PyObject_GetIter(names); + if (iter == NULL) { + return NULL; + } + + PyObject *res = PyList_New(0); + if (res == NULL) { + Py_DECREF(iter); + return NULL; + } + + PyObject *name = NULL; + while ((name = PyIter_Next(iter))) { + int rc = match(pattern, name); + if (rc < 0) { + goto abort; + } + if (rc == 1) { + if (PyList_Append(res, name) < 0) { + goto abort; + } + } + Py_DECREF(name); + if (PyErr_Occurred()) { + Py_DECREF(res); + Py_DECREF(iter); + return NULL; + } + } + Py_DECREF(iter); + return res; +abort: + Py_DECREF(name); + Py_DECREF(iter); + Py_DECREF(res); + return NULL; +} + +/*[clinic input] +_fnmatch.filter -> object + + names: object + pat: object + +[clinic start generated code]*/ + +static PyObject * +_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) +/*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ +{ + // todo: handle os.path.normcase(...) 
+ if (PyBytes_Check(pat)) { + const char *pattern = PyBytes_AS_STRING(pat); + return _fnmatch_filter_generic_impl(module, names, pattern, + &posix_fnmatch_encoded); + } + if (PyUnicode_Check(pat)) { + const char *pattern = PyUnicode_AsUTF8(pat); + return _fnmatch_filter_generic_impl(module, names, pattern, + &posix_fnmatch_unicode); + } + PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); + return NULL; +} + +/*[clinic input] +_fnmatch.fnmatch -> bool + + name: object + pat: object + +[clinic start generated code]*/ + +static int +_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) +/*[clinic end generated code: output=b4cd0bd911e8bc93 input=c45e0366489540b8]*/ +{ + // todo: handle os.path.normcase(...) + if (PyBytes_Check(pat)) { + const char *pattern = PyBytes_AS_STRING(pat); + return posix_fnmatch_encoded(pattern, name); + } + if (PyUnicode_Check(pat)) { + const char *pattern = PyUnicode_AsUTF8(pat); + return posix_fnmatch_unicode(pattern, name); + } + PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); + return NULL; +} + +/*[clinic input] +_fnmatch.fnmatchcase -> bool + + name: object + pat: object + +Test whether `name` matches `pattern`, including case. + +This is a version of fnmatch() which doesn't case-normalize +its arguments. 
+ +[clinic start generated code]*/ + +static int +_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) +/*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ +{ + if (PyBytes_Check(pat)) { + const char *pattern = PyBytes_AS_STRING(pat); + return posix_fnmatchcase_encoded(pattern, name); + } + if (PyUnicode_Check(pat)) { + const char *pattern = PyUnicode_AsUTF8(pat); + return posix_fnmatchcase_unicode(pattern, name); + } + PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); + return NULL; +} + +static PyMethodDef _fnmatch_methods[] = { + _FNMATCH_FILTER_METHODDEF + _FNMATCH_FNMATCH_METHODDEF + _FNMATCH_FNMATCHCASE_METHODDEF + {NULL, NULL} +}; + +static struct PyModuleDef_Slot _fnmatch_slots[] = { + {0, NULL} +}; + +static struct PyModuleDef _fnmatchmodule = { + PyModuleDef_HEAD_INIT, + "_fnmatch", + NULL, + 0, + _fnmatch_methods, + _fnmatch_slots, + NULL, + NULL, + NULL, +}; + +PyMODINIT_FUNC +PyInit__fnmatch(void) +{ + return PyModuleDef_Init(&_fnmatchmodule); +} diff --git a/Modules/clinic/_fnmatchmodule.c.h b/Modules/clinic/_fnmatchmodule.c.h new file mode 100644 index 00000000000000..a693bccee18ff5 --- /dev/null +++ b/Modules/clinic/_fnmatchmodule.c.h @@ -0,0 +1,192 @@ +/*[clinic input] +preserve +[clinic start generated code]*/ + +#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) +# include "pycore_gc.h" // PyGC_Head +# include "pycore_runtime.h" // _Py_ID() +#endif +#include "pycore_modsupport.h" // _PyArg_UnpackKeywords() + +PyDoc_STRVAR(_fnmatch_filter__doc__, +"filter($module, /, names, pat)\n" +"--\n" +"\n"); + +#define _FNMATCH_FILTER_METHODDEF \ + {"filter", _PyCFunction_CAST(_fnmatch_filter), METH_FASTCALL|METH_KEYWORDS, _fnmatch_filter__doc__}, + +static PyObject * +_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat); + +static PyObject * +_fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject 
*return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(names), &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"names", "pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "filter", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *names; + PyObject *pat; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + if (!args) { + goto exit; + } + names = args[0]; + pat = args[1]; + return_value = _fnmatch_filter_impl(module, names, pat); + +exit: + return return_value; +} + +PyDoc_STRVAR(_fnmatch_fnmatch__doc__, +"fnmatch($module, /, name, pat)\n" +"--\n" +"\n"); + +#define _FNMATCH_FNMATCH_METHODDEF \ + {"fnmatch", _PyCFunction_CAST(_fnmatch_fnmatch), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatch__doc__}, + +static int +_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat); + +static PyObject * +_fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = 
{"name", "pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "fnmatch", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *name; + PyObject *pat; + int _return_value; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + if (!args) { + goto exit; + } + name = args[0]; + pat = args[1]; + _return_value = _fnmatch_fnmatch_impl(module, name, pat); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(_fnmatch_fnmatchcase__doc__, +"fnmatchcase($module, /, name, pat)\n" +"--\n" +"\n" +"Test whether `name` matches `pattern`, including case.\n" +"\n" +"This is a version of fnmatch() which doesn\'t case-normalize\n" +"its arguments."); + +#define _FNMATCH_FNMATCHCASE_METHODDEF \ + {"fnmatchcase", _PyCFunction_CAST(_fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatchcase__doc__}, + +static int +_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat); + +static PyObject * +_fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"name", "pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "fnmatchcase", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *name; + PyObject *pat; + 
int _return_value; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + if (!args) { + goto exit; + } + name = args[0]; + pat = args[1]; + _return_value = _fnmatch_fnmatchcase_impl(module, name, pat); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} +/*[clinic end generated code: output=fd6cc9541aa95a9a input=a9049054013a1b77]*/ From ca0338811424ed6369ab511d4ea75a362deebc10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 20:57:37 +0200 Subject: [PATCH 02/97] add initial tests --- Lib/test/test_fnmatch.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 10ed496d4e2f37..b086495b78c785 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -5,10 +5,11 @@ import string import warnings +import _fnmatch as c_fnmatch +import fnmatch as py_fnmatch from fnmatch import fnmatch, fnmatchcase, translate, filter class FnmatchTestCase(unittest.TestCase): - def check_match(self, filename, pattern, should_match=True, fn=fnmatch): if should_match: self.assertTrue(fn(filename, pattern), @@ -250,18 +251,25 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) -class FilterTestCase(unittest.TestCase): + +class FilterTestCaseMixin: + fnmatch = None def test_filter(self): + filter = self.fnmatch.filter self.assertEqual(filter(['Python', 'Ruby', 'Perl', 'Tcl'], 'P*'), ['Python', 'Perl']) self.assertEqual(filter([b'Python', b'Ruby', b'Perl', b'Tcl'], b'P*'), [b'Python', b'Perl']) def test_mix_bytes_str(self): + filter = self.fnmatch.filter self.assertRaises(TypeError, filter, ['test'], b'*') self.assertRaises(TypeError, filter, [b'test'], '*') +class 
PurePythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): + fnmatch = py_fnmatch + def test_case(self): ignorecase = os.path.normcase('P') == os.path.normcase('p') self.assertEqual(filter(['Test.py', 'Test.rb', 'Test.PL'], '*.p*'), @@ -276,6 +284,9 @@ def test_sep(self): self.assertEqual(filter(['usr/bin', 'usr', 'usr\\lib'], 'usr\\*'), ['usr/bin', 'usr\\lib'] if normsep else ['usr\\lib']) +class CPythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): + fnmatch = c_fnmatch + if __name__ == "__main__": unittest.main() From adb6ed040279bc592b3c86133eb6569470758cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 20:57:17 +0200 Subject: [PATCH 03/97] add generated objects --- Include/internal/pycore_global_objects_fini_generated.h | 2 ++ Include/internal/pycore_global_strings.h | 2 ++ Include/internal/pycore_runtime_init_generated.h | 2 ++ Include/internal/pycore_unicodeobject_generated.h | 8 ++++++++ 4 files changed, 14 insertions(+) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index c0840f9eb7eca2..77b2a8e2e7a7dc 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1087,6 +1087,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_unnamed_fields)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(name)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(name_from)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(names)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(namespace_separator)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(namespaces)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(narg)); @@ -1129,6 +1130,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pages)); 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(parent)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(password)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pat)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(path)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pattern)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(peek)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 51735a8a726e11..4896f6343087d3 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -576,6 +576,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(n_unnamed_fields) STRUCT_FOR_ID(name) STRUCT_FOR_ID(name_from) + STRUCT_FOR_ID(names) STRUCT_FOR_ID(namespace_separator) STRUCT_FOR_ID(namespaces) STRUCT_FOR_ID(narg) @@ -618,6 +619,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(pages) STRUCT_FOR_ID(parent) STRUCT_FOR_ID(password) + STRUCT_FOR_ID(pat) STRUCT_FOR_ID(path) STRUCT_FOR_ID(pattern) STRUCT_FOR_ID(peek) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index c5be67c6d80b9d..1249957fb29d1e 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1085,6 +1085,7 @@ extern "C" { INIT_ID(n_unnamed_fields), \ INIT_ID(name), \ INIT_ID(name_from), \ + INIT_ID(names), \ INIT_ID(namespace_separator), \ INIT_ID(namespaces), \ INIT_ID(narg), \ @@ -1127,6 +1128,7 @@ extern "C" { INIT_ID(pages), \ INIT_ID(parent), \ INIT_ID(password), \ + INIT_ID(pat), \ INIT_ID(path), \ INIT_ID(pattern), \ INIT_ID(peek), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 0e0ad6518771e9..0bd57f5db64ea9 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -2104,6 +2104,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { 
_PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(names); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(namespace_separator); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2272,6 +2276,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(pat); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(path); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); From e95c255bc92ef15cde2e6ae252700aa32b98a7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 20:57:56 +0200 Subject: [PATCH 04/97] update PC/config.c --- PC/config.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PC/config.c b/PC/config.c index b744f711b0d636..f08a847a3f1206 100644 --- a/PC/config.c +++ b/PC/config.c @@ -12,6 +12,7 @@ extern PyObject* PyInit_cmath(void); extern PyObject* PyInit_errno(void); extern PyObject* PyInit_faulthandler(void); extern PyObject* PyInit__tracemalloc(void); +extern PyObject* PyInit_fnmatch(void); extern PyObject* PyInit_gc(void); extern PyObject* PyInit_math(void); extern PyObject* PyInit__md5(void); @@ -92,6 +93,7 @@ struct _inittab _PyImport_Inittab[] = { {"cmath", PyInit_cmath}, {"errno", PyInit_errno}, {"faulthandler", PyInit_faulthandler}, + {"fnmatch", PyInit_fnmatch}, {"gc", PyInit_gc}, {"math", PyInit_math}, {"nt", PyInit_nt}, /* Use the NT os functions, not posix */ From 9b1c20dcdc8fe562c2a7287b67ee20483e164b4c Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 20:58:07 +0200 Subject: [PATCH 05/97] update Modules/Setup --- Modules/Setup | 1 + 1 file changed, 1 insertion(+) diff --git a/Modules/Setup b/Modules/Setup index e4acf6bc7de8ea..acb542b70946ea 100644 --- a/Modules/Setup +++ b/Modules/Setup @@ -137,6 +137,7 @@ PYTHONPATH=$(COREPYTHONPATH) #_datetime _datetimemodule.c #_decimal _decimal/_decimal.c #_heapq _heapqmodule.c +_fnmatch _fnmatchmodule.c #_interpchannels _interpchannelsmodule.c #_interpqueues _interpqueuesmodule.c #_interpreters _interpretersmodule.c From 85fa9533e9780c244eac5f22eeee432daeebc647 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 20:58:25 +0200 Subject: [PATCH 06/97] update VCX project files --- PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 +++ 2 files changed, 4 insertions(+) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index f36fcb8caece33..639c497767cfa4 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -450,6 +450,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index a1b43addf9e36a..27f4905e6d5263 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -920,6 +920,9 @@ Modules + + Modules + Modules From 363ec36f747e453595bc5663fa44d2cfe99b9a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 21:44:06 +0200 Subject: [PATCH 07/97] fix return value --- Modules/_fnmatchmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatchmodule.c index a5b77ec4c8d4f2..19118fd2a4a740 100644 --- a/Modules/_fnmatchmodule.c +++ b/Modules/_fnmatchmodule.c @@ -184,7 +184,7 @@ 
_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) return posix_fnmatch_unicode(pattern, name); } PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); - return NULL; + return -1; } /*[clinic input] @@ -213,7 +213,7 @@ _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) return posix_fnmatchcase_unicode(pattern, name); } PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); - return NULL; + return -1; } static PyMethodDef _fnmatch_methods[] = { From 42b019f9bb208e716cb87ba2c25f9c67e8178c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 21:45:09 +0200 Subject: [PATCH 08/97] fix typo in pythoncore.vcxproj --- PCbuild/pythoncore.vcxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 639c497767cfa4..db9f960c61ce6c 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -449,8 +449,8 @@ + - From 4120a95dcf5b163f48f9d8973c36853635103777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 6 Jul 2024 21:45:50 +0200 Subject: [PATCH 09/97] Update pythoncore.vcxproj.filters --- PCbuild/pythoncore.vcxproj.filters | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 27f4905e6d5263..24384e355f46ec 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -920,9 +920,6 @@ Modules - - Modules - Modules @@ -998,6 +995,9 @@ Modules + + Modules + Modules From 36394bbc19c6a8fe5d4c5559fd35227adea895b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 7 Jul 2024 00:37:56 +0200 Subject: [PATCH 10/97] Amend un-necessary modifications in 
`test_fnmatch.py` --- Lib/test/test_fnmatch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index b086495b78c785..94ec41958b07c0 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -10,6 +10,7 @@ from fnmatch import fnmatch, fnmatchcase, translate, filter class FnmatchTestCase(unittest.TestCase): + def check_match(self, filename, pattern, should_match=True, fn=fnmatch): if should_match: self.assertTrue(fn(filename, pattern), From 2c2f9f102d085bb0aef8fa98c5117c48bda0b346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:57:08 +0200 Subject: [PATCH 11/97] update configurations --- configure.ac | 24 ++++++++++++++++++++++++ pyconfig.h.in | 3 +++ 2 files changed, 27 insertions(+) diff --git a/configure.ac b/configure.ac index 705f8752597b96..d4fdb81d34890a 100644 --- a/configure.ac +++ b/configure.ac @@ -3854,6 +3854,30 @@ if test "$ac_cv_c_complex_supported" = "yes"; then [Defined if _Complex C type is available.]) fi +# check for fnmatch(3) support +# +# We test for the plain POSIX implementation (case-sensitive match). +# +# To ensure that the implementation of fnmatch(3) is compliant +# we run some tests to make sure that everything works well. +# +# Note that MSVC does not support fnmatch(3). 
+AC_CACHE_CHECK([for case-sensitive fnmatch(3)], [ac_cv_fnmatch_supported], +[AC_RUN_IFELSE( + [AC_LANG_PROGRAM([@%:@include <fnmatch.h>], [[ + exit(!( + fnmatch("a*", "abc", 0) != FNM_NOMATCH && + fnmatch("a*", "Abc", 0) == FNM_NOMATCH + )); + ]])], [ac_cv_fnmatch_supported=yes], + [ac_cv_fnmatch_supported=no], + [ac_cv_fnmatch_supported=no] +)]) +if test "$ac_cv_fnmatch_supported" = "yes"; then + AC_DEFINE([Py_HAVE_FNMATCH], [1], + [Defined if case-sensitive fnmatch(3) is supported.]) +fi + # check for systems that require aligned memory access AC_CACHE_CHECK([aligned memory access is required], [ac_cv_aligned_required], [AC_RUN_IFELSE([AC_LANG_SOURCE([[ diff --git a/pyconfig.h.in b/pyconfig.h.in index 8fbba7ed3b949e..0997722334867c 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1689,6 +1689,9 @@ /* Defined if _Complex C type is available. */ #undef Py_HAVE_C_COMPLEX +/* Defined if case-sensitive fnmatch(3) is supported. */ +#undef Py_HAVE_FNMATCH + /* Define if year with century should be normalized for strftime. */ #undef Py_NORMALIZE_CENTURY From ecf8146a566c692cae9930f56285f0d722de741c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:57:28 +0200 Subject: [PATCH 12/97] add C implementation --- Modules/_fnmatchmodule.c | 865 ++++++++++++++++++++++++++---- Modules/clinic/_fnmatchmodule.c.h | 69 ++- 2 files changed, 799 insertions(+), 135 deletions(-) diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatchmodule.c index 19118fd2a4a740..07d10f9112bc0e 100644 --- a/Modules/_fnmatchmodule.c +++ b/Modules/_fnmatchmodule.c @@ -1,8 +1,8 @@ /* - * C accelerator for the 'fnmatch' module. + * C accelerator for the 'fnmatch' module (POSIX only). 
* * Most functions expect string or bytes instances, and thus the Python - * implementation should first pre-process path-like objects, and possibly + * implementation should first pre-process path-like objects, possibly * applying normalizations depending on the platform if needed. */ @@ -10,92 +10,225 @@ #include "clinic/_fnmatchmodule.c.h" +#define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" + +// module state functions + +typedef struct { + PyObject *re_module; // 're' module + PyObject *os_module; // 'os' module + + PyObject *lru_cache; // optional cache for regex patterns, if needed +} fnmatchmodule_state; + +static inline fnmatchmodule_state * +get_fnmatchmodulestate_state(PyObject *module) +{ + void *state = PyModule_GetState(module); + assert(state != NULL); + return (fnmatchmodule_state *) state; +} + +static int +fnmatchmodule_clear(PyObject *m) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + Py_CLEAR(st->os_module); + Py_CLEAR(st->re_module); + Py_CLEAR(st->lru_cache); + return 0; +} + +static int +fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + Py_VISIT(st->os_module); + Py_VISIT(st->re_module); + Py_VISIT(st->lru_cache); + return 0; +} + +static void +fnmatchmodule_free(void *m) +{ + fnmatchmodule_clear((PyObject *) m); +} + +static int +fnmatchmodule_exec(PyObject *m) +{ + fnmatchmodule_state *state = get_fnmatchmodulestate_state(m); + + // imports + state->os_module = PyImport_ImportModule("os"); + if (state->os_module == NULL) { + return -1; + } + state->re_module = PyImport_ImportModule("re"); + if (state->re_module == NULL) { + return -1; + } + + // helpers + state->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + if (state->lru_cache == NULL) { + return -1; + } + // todo: handle LRU cache + return 0; +} + /*[clinic input] module _fnmatch [clinic start generated code]*/ /*[clinic end generated code: 
output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ +#ifdef Py_HAVE_FNMATCH #include <fnmatch.h> + +#define VERIFY_NAME_ARG_TYPE(name, check, expecting) \ + do { \ + if (!check) { \ + PyErr_Format(PyExc_TypeError, \ + "name must be a %s object, got %.200s", \ + expecting, Py_TYPE(name)->tp_name); \ + return -1; \ + } \ + } while (0) + +#define PROCESS_MATCH_RESULT(r) \ + do { \ + int res = (r); /* avoid variable capture */ \ + if (res < 0) { \ + return res; \ + } \ + return res != FNM_NOMATCH; \ + } while (0) + +/* + * Perform a case-sensitive match using fnmatch(3). + * + * Parameters + * + * pattern A UNIX shell pattern. + * name The filename to match (bytes object). + * + * Returns 1 if the 'name' matches the 'pattern' and 0 otherwise. + * + * Returns -1 if (1) 'name' is not a `bytes` object, and + * sets a TypeError exception, or (2) something went wrong. + */ static inline int -validate_encoded_object(PyObject *name) +posix_fnmatch_encoded(const char *pattern, PyObject *name) { - if (!PyBytes_Check(name)) { - PyErr_Format(PyExc_TypeError, - "name must be a bytes object, got %.200s", - Py_TYPE(name)->tp_name); - return 0; - } - return 1; + VERIFY_NAME_ARG_TYPE(name, PyBytes_Check(name), "bytes"); + PROCESS_MATCH_RESULT(fnmatch(pattern, PyBytes_AS_STRING(name), 0)); } +/* Same as `posix_fnmatch_encoded` but for string-like objects. 
*/ static inline int -validate_unicode_object(PyObject *name) +posix_fnmatch_unicode(const char *pattern, PyObject *name) { - if (!PyUnicode_Check(name)) { - PyErr_Format(PyExc_TypeError, - "name must be a string object, got %.200s", - Py_TYPE(name)->tp_name); - return 0; - } - return 1; + VERIFY_NAME_ARG_TYPE(name, PyUnicode_Check(name), "string"); + PROCESS_MATCH_RESULT(fnmatch(pattern, PyUnicode_AsUTF8(name), 0)); } -static inline int -posix_fnmatch_encoded(const char *pattern, PyObject *name) +static PyObject * +posix_fnmatch_filter(const char *pattern, PyObject *names, + int (*match)(const char *, PyObject *)) { - if (!validate_encoded_object(name)) { - return -1; + PyObject *iter = PyObject_GetIter(names); + if (iter == NULL) { + return NULL; } - // case-insensitive match -#ifdef FNM_CASEFOLD - return fnmatch(pattern, PyBytes_AS_STRING(name), FNM_CASEFOLD) == 0; -#else - // todo: fallback to Python implementation - return -1; -#endif -} -static inline int -posix_fnmatchcase_encoded(const char *pattern, PyObject *name) -{ - if (!validate_encoded_object(name)) { - return -1; + PyObject *res = PyList_New(0); + if (res == NULL) { + Py_DECREF(iter); + return NULL; } - // case-sensitive match - return fnmatch(pattern, PyBytes_AS_STRING(name), 0) == 0; + + PyObject *name = NULL; + while ((name = PyIter_Next(iter))) { + int rc = match(pattern, name); + if (rc < 0) { + goto abort; + } + if (rc == 1) { + if (PyList_Append(res, name) < 0) { + goto abort; + } + } + Py_DECREF(name); + if (PyErr_Occurred()) { + Py_DECREF(res); + Py_DECREF(iter); + return NULL; + } + } + Py_DECREF(iter); + return res; +abort: + Py_XDECREF(name); + Py_DECREF(iter); + Py_DECREF(res); + return NULL; } +#else -static inline int -posix_fnmatch_unicode(const char *pattern, PyObject *name) +static PyObject * +get_match_function(PyObject *module, PyObject *pattern) { - if (!validate_unicode_object(name)) { - return -1; + PyObject *expr = _fnmatch_translate_impl(module, pattern); + if (expr == NULL) 
{ + return NULL; } - // case-insensitive match -#ifdef FNM_CASEFOLD - return fnmatch(pattern, PyUnicode_AsUTF8(name), FNM_CASEFOLD) == 0; -#else - // todo: fallback to Python implementation - return -1; -#endif + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + PyObject *compiled = PyObject_CallMethod(st->re_module, "compile", "O", expr); + Py_DECREF(expr); + if (compiled == NULL) { + return NULL; + } + PyObject *matcher = PyObject_GetAttr(compiled, &_Py_ID(match)); + Py_DECREF(compiled); + return matcher; } +static PyMethodDef get_match_function_method_def = { + "get_match_function", + _PyCFunction_CAST(get_match_function), + METH_O, + NULL +}; + +/* + * Perform a case-sensitive match using regular expressions. + * + * Parameters + * + * pattern A translated regular expression. + * name The filename to match. + * + * Returns 1 if the 'name' matches the 'pattern' and 0 otherwise. + * Returns -1 if something went wrong. + */ static inline int -posix_fnmatchcase_unicode(const char *pattern, PyObject *name) +regex_fnmatch_generic(PyObject *matcher, PyObject *name) { - if (!validate_unicode_object(name)) { + // If 'name' is of incorrect type, it will be detected when calling + // the matcher function (we emulate 're.compile(...).match(name)'). 
+ PyObject *match = PyObject_CallFunction(matcher, "O", name); + if (match == NULL) { return -1; } - // case-sensitive match - return fnmatch(pattern, PyUnicode_AsUTF8(name), 0) == 0; + int matching = match != Py_None; + Py_DECREF(match); + return matching; } static PyObject * -_fnmatch_filter_generic_impl(PyObject *module, - PyObject *names, - const char *pattern, - int (*match)(const char *, PyObject *)) +regex_fnmatch_filter(PyObject *matcher, PyObject *names) { PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { @@ -110,7 +243,7 @@ _fnmatch_filter_generic_impl(PyObject *module, PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - int rc = match(pattern, name); + int rc = regex_fnmatch_generic(matcher, name); if (rc < 0) { goto abort; } @@ -129,11 +262,12 @@ _fnmatch_filter_generic_impl(PyObject *module, Py_DECREF(iter); return res; abort: - Py_DECREF(name); + Py_XDECREF(name); Py_DECREF(iter); Py_DECREF(res); return NULL; } +#endif /*[clinic input] _fnmatch.filter -> object @@ -147,34 +281,59 @@ static PyObject * _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) /*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ { - // todo: handle os.path.normcase(...) +#ifndef Py_HAVE_FNMATCH + PyObject *matcher = get_match_function(module, pat); + if (matcher == NULL) { + return NULL; + } + PyObject *result = regex_fnmatch_filter(matcher, names); + Py_DECREF(matcher); + return result; +#else + // Note that the Python implementation of fnmatch.filter() does not + // call os.fspath() on the names being matched, whereas it does on NT. 
if (PyBytes_Check(pat)) { const char *pattern = PyBytes_AS_STRING(pat); - return _fnmatch_filter_generic_impl(module, names, pattern, - &posix_fnmatch_encoded); + return posix_fnmatch_filter(pattern, names, &posix_fnmatch_encoded); } if (PyUnicode_Check(pat)) { const char *pattern = PyUnicode_AsUTF8(pat); - return _fnmatch_filter_generic_impl(module, names, pattern, - &posix_fnmatch_unicode); + return posix_fnmatch_filter(pattern, names, &posix_fnmatch_unicode); } - PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); + PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return NULL; +#endif } /*[clinic input] -_fnmatch.fnmatch -> bool +_fnmatch.fnmatchcase -> bool name: object pat: object +Test whether `name` matches `pattern`, including case. + +This is a version of fnmatch() which doesn't case-normalize +its arguments. + [clinic start generated code]*/ static int -_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) -/*[clinic end generated code: output=b4cd0bd911e8bc93 input=c45e0366489540b8]*/ +_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) +/*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ { - // todo: handle os.path.normcase(...) +#ifndef Py_HAVE_FNMATCH + PyObject *matcher = get_match_function(module, pat); + if (matcher == NULL) { + return -1; + } + int res = regex_fnmatch_generic(matcher, name); + Py_DECREF(matcher); + return res; +#else + // This function does not transform path-like objects, nor does it + // case-normalize 'name' or 'pattern' (whether it is the Python or + // the C implementation). 
if (PyBytes_Check(pat)) { const char *pattern = PyBytes_AS_STRING(pat); return posix_fnmatch_encoded(pattern, name); @@ -183,60 +342,572 @@ _fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) const char *pattern = PyUnicode_AsUTF8(pat); return posix_fnmatch_unicode(pattern, name); } - PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); + PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return -1; +#endif } -/*[clinic input] -_fnmatch.fnmatchcase -> bool - name: object - pat: object +static inline int /* number of written characters or -1 on error */ +write_normal_character(PyObject *re, _PyUnicodeWriter *writer, PyObject *cp) +{ + PyObject *ch = PyObject_CallMethodOneArg(re, &_Py_ID(escape), cp); + if (ch == NULL) { + return -1; + } + int written = PyUnicode_GetLength(ch); + int rc = _PyUnicodeWriter_WriteStr(writer, ch); + Py_DECREF(ch); + if (rc < 0) { + return -1; + } + assert(written > 0); + return written; +} -Test whether `name` matches `pattern`, including case. +static inline int /* number of written characters or -1 on error */ +write_translated_group(_PyUnicodeWriter *writer, PyObject *group) +{ +#define WRITE_ASCII(str, len) \ + do { \ + if (_PyUnicodeWriter_WriteASCIIString(writer, (str), (len)) < 0) { \ + return -1; \ + } \ + } while (0) -This is a version of fnmatch() which doesn't case-normalize -its arguments. 
+#define WRITE_CHAR(c) \ + do { \ + if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) { \ + return -1; \ + } \ + } while (0) + + Py_ssize_t grouplen; + const char *buffer = PyUnicode_AsUTF8AndSize(group, &grouplen); + if (grouplen == 0) { + /* empty range: never match */ + WRITE_ASCII("(?!)", 4); + return 4; + } + else if (grouplen == 1 && buffer[0] == '!') { + /* negated empty range: match any character */ + WRITE_CHAR('.'); + return 1; + } + else { + int extra = 0; + WRITE_CHAR('['); + switch (buffer[0]) { + case '!': { + WRITE_CHAR('^'); + if (_PyUnicodeWriter_WriteSubstring(writer, group, 1, grouplen) < 0) { + return -1; + } + break; + } + case '^': + case '[': { + WRITE_CHAR('\\'); + extra = 1; + break; + } + default: + if (_PyUnicodeWriter_WriteStr(writer, group) < 0) { + return -1; + } + break; + } + WRITE_CHAR(']'); + return 2 + grouplen + extra; + } +#undef WRITE_CHAR +#undef WRITE_ASCII +} + +static PyObject * +get_translated_group(PyObject *unicode, + Py_ssize_t i /* unicode[i-1] == '[' (incl.) */, + Py_ssize_t j /* unicode[j] == ']' (excl.) */) +{ + PyObject *chunks = PyList_New(0); + if (chunks == NULL) { + return NULL; + } + PyObject *chr = PySequence_GetItem(unicode, i); + if (chr == NULL) { + goto error; + } + Py_ssize_t k = PyUnicode_CompareWithASCIIString(chr, "!") == 0 ? 
i + 2 : i + 1; + Py_DECREF(chr); + Py_ssize_t chunkscount = 0; + while (k < j) { + PyObject *eobj = PyObject_CallMethod(unicode, "find", "ii", k, j); + if (eobj == NULL) { + goto error; + } + Py_ssize_t t = PyLong_AsSsize_t(eobj); + Py_DECREF(eobj); + if (t < 0) { + goto error; + } + PyObject *sub = PyUnicode_Substring(unicode, i, t); + if (sub == NULL) { + goto error; + } + int rc = PyList_Append(chunks, sub); + Py_DECREF(sub); + if (rc < 0) { + goto error; + } + chunkscount += 1; + i = t + 1; + k = t + 3; + } + if (i >= j) { + assert(chunkscount > 0); + PyObject *chunk = PyList_GET_ITEM(chunks, chunkscount - 1); + PyObject *hyphen = PyUnicode_FromOrdinal('-'); + if (hyphen == NULL) { + goto error; + } + PyObject *repl = PyUnicode_Concat(chunk, hyphen); + Py_DECREF(hyphen); + int rc = PyList_SetItem(chunks, chunkscount - 1, repl); + Py_DECREF(repl); + if (rc < 0) { + goto error; + } + } + else { + PyObject *sub = PyUnicode_Substring(unicode, i, j); + if (sub == NULL) { + goto error; + } + int rc = PyList_Append(chunks, sub); + Py_DECREF(sub); + if (rc < 0) { + goto error; + } + chunkscount += 1; + } + // remove empty ranges (they are not valid in RE) + Py_ssize_t c = chunkscount; + while (--c) { + PyObject *c1 = PyList_GET_ITEM(chunks, c - 1); + assert(c1 != NULL); + Py_ssize_t c1len = 0; + const char *c1buf = PyUnicode_AsUTF8AndSize(c1, &c1len); + if (c1buf == NULL) { + goto error; + } + assert(c1len > 0); + + PyObject *c2 = PyList_GET_ITEM(chunks, c); + assert(c2 != NULL); + Py_ssize_t c2len = 0; + const char *c2buf = PyUnicode_AsUTF8AndSize(c2, &c2len); + if (c2buf == NULL) { + goto error; + } + assert(c2len > 0); + + if (c1buf[c1len - 1] > c2buf[0]) { + // all but the last character in the chunk + PyObject *c1sub = PyUnicode_Substring(c1, 0, c1len - 1); + // all but the first character in the chunk + PyObject *c2sub = PyUnicode_Substring(c2, 1, c2len); + if (c1sub == NULL || c2sub == NULL) { + Py_XDECREF(c1sub); + Py_XDECREF(c2sub); + goto error; + } + 
PyObject *merged = PyUnicode_Concat(c1sub, c2sub); + Py_DECREF(c1sub); + Py_DECREF(c2sub); + if (merged == NULL) { + goto error; + } + int rc = PyList_SetItem(chunks, c - 1, merged); + Py_DECREF(merged); + if (rc < 0) { + goto error; + } + if (PySequence_DelItem(chunks, c) < 0) { + goto error; + } + chunkscount--; + } + } + // Escape backslashes and hyphens for set difference (--), + // but hyphens that create ranges should not be escaped. + for (c = 0; c < chunkscount; ++c) { + PyObject *s0 = PyList_GetItem(chunks, c); + if (s0 == NULL) { + goto error; + } + PyObject *s1 = PyObject_CallMethod(s0, "replace", "ss", "\\", "\\\\"); + if (s1 == NULL) { + goto error; + } + PyObject *s2 = PyObject_CallMethod(s1, "replace", "ss", "-", "\\-"); + Py_DECREF(s1); + if (s2 == NULL) { + goto error; + } + if (PyList_SetItem(chunks, c, s2) < 0) { + goto error; + } + } + PyObject *hyphen = PyUnicode_FromString("-"); + if (hyphen == NULL) { + goto error; + } + PyObject *res = PyUnicode_Join(hyphen, chunks); + Py_DECREF(hyphen); + if (res == NULL) { + goto error; + } + Py_DECREF(chunks); + return res; +error: + Py_XDECREF(chunks); + return NULL; +} + +static PyObject * +join_translated_parts(PyObject *parts, PyObject *indices) +{ +#define LOAD_STAR_INDEX(var, k) \ + do { \ + ind = PyList_GET_ITEM(indices, (k)); \ + var = PyLong_AsSsize_t(ind); \ + if (var < 0) { \ + goto abort; \ + } \ + } while (0) + +#define WRITE_SUBSTRING(i, j) \ + do { \ + if ((i) < (j)) { \ + if (_PyUnicodeWriter_WriteSubstring(_writer, parts, (i), (j)) < 0) { \ + goto abort; \ + } \ + } \ + } while (0) + +#define WRITE_WILDCARD() \ + do { \ + if (_PyUnicodeWriter_WriteASCIIString(_writer, ".*", 2) < 0) { \ + goto abort; \ + } \ + } while (0) + +#define WRITE_ATOMIC_SUBSTRING(i, j) \ + do { \ + if ((_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || \ + (_PyUnicodeWriter_WriteSubstring(_writer, parts, (i), (j)) < 0) || \ + (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) \ + { \ + goto abort; \ + } 
\ + } while (0) + + const Py_ssize_t m = PyList_GET_SIZE(indices); + if (m == 0) { + // just write fr'(?s:{parts} + ")\Z" + return PyUnicode_FromFormat("(?s:%S)\\Z", parts); + } + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); + + /* + * Special cases: indices[0] == 0 or indices[-1] + 1 == n + * + * If indices[0] == 0 write (?>.*?group_1) instead of abcdef + * If indices[-1] == n - 1 write '.*' instead of empty string + */ + PyObject *ind; + Py_ssize_t i, j, n = PyUnicode_GetLength(parts); + // handle the first group + LOAD_STAR_INDEX(i, 0); + if (i == 0) { + if (m == 1) { // pattern = '*TAIL' + WRITE_WILDCARD(); + WRITE_SUBSTRING(1, n); // write TAIL part + goto finalize; + } + else { // pattern = '*BODY*...' + LOAD_STAR_INDEX(j, 1); + WRITE_ATOMIC_SUBSTRING(i + 1, j); + i = j + 1; + } + } + else { + if (m == 1) { // pattern = 'HEAD*' or 'HEAD*TAIL' + WRITE_SUBSTRING(0, i); // write HEAD part + WRITE_WILDCARD(); + WRITE_SUBSTRING(i + 1, n); // write TAIL part (if any) + goto finalize; + } + else { // pattern = 'HEAD*STRING*...' 
+ WRITE_SUBSTRING(0, i); // write HEAD part + i++; + } + } + // handle the inner groups + for (Py_ssize_t k = 1; k < m - 1; ++k) { + LOAD_STAR_INDEX(j, k + 1); + assert(i < j); + WRITE_ATOMIC_SUBSTRING(i, j); + i = j + 1; + } + // handle the last group + WRITE_WILDCARD(); + WRITE_SUBSTRING(i, n); // write TAIL part ( +finalize: + ; // empty statement for allowing a label before a declaration + PyObject *res = PyUnicodeWriter_Finish(writer); + if (res == NULL) { + return NULL; + } + return PyUnicode_FromFormat("(?s:%S)\\Z", res); +abort: + PyUnicodeWriter_Discard(writer); + return NULL; +} + +static PyObject * +translate(PyObject *module, PyObject *unicode) +/* new reference */ +{ + fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); + PyObject *re = state->re_module; + + Py_ssize_t estimate = 0; + PyUnicodeWriter *writer = PyUnicodeWriter_Create(estimate); + if (writer == NULL) { + return NULL; + } + _PyUnicodeWriter *_writer = (_PyUnicodeWriter *) (writer); + + // list containing the indices where '*' has a special meaning + PyObject *indices = PyList_New(0); + if (indices == NULL) { + goto abort; + } + + Py_ssize_t n = PyUnicode_GetLength(unicode); + if (n < 0) { + goto abort; + } + Py_ssize_t h = 0, i = 0; + PyObject *peek = NULL; + while (i < n) { + PyObject *chr = PySequence_GetItem(unicode, i); + if (chr == NULL) { + goto abort; + } + if (PyUnicode_CompareWithASCIIString(chr, "*") == 0) { + Py_DECREF(chr); + if (_PyUnicodeWriter_WriteChar(_writer, '*') < 0) { + goto abort; + } + // drop all other '*' that can be found afterwards + while (++i < n) { + peek = PySequence_GetItem(unicode, i); + if (peek == NULL) { + goto abort; + } + if (PyUnicode_CompareWithASCIIString(peek, "*") != 0) { + Py_DECREF(peek); + break; + } + Py_DECREF(peek); + } + PyObject *index = PyLong_FromLong(h++); + if (index == NULL) { + goto abort; + } + int rc = PyList_Append(indices, index); + Py_DECREF(index); + if (rc < 0) { + goto abort; + } + } + else if 
(PyUnicode_CompareWithASCIIString(chr, "?") == 0) { + Py_DECREF(chr); + // translate optional '?' (fnmatch) into optional '.' (regex) + if (_PyUnicodeWriter_WriteChar(_writer, '.') < 0) { + goto abort; + } + ++i; // advance for the next iteration + ++h; // increase the expected result's length + } + else if (PyUnicode_CompareWithASCIIString(chr, "[") == 0) { + Py_DECREF(chr); + // check the next characters (peek) + Py_ssize_t j = ++i; + if (j < n) { + peek = PySequence_GetItem(unicode, j); + if (peek == NULL) { + goto abort; + } + if (PyUnicode_CompareWithASCIIString(peek, "!") == 0) {// [! + ++j; + } + Py_DECREF(peek); + } + if (j < n) { + peek = PySequence_GetItem(unicode, j); + if (peek == NULL) { + goto abort; + } + if (PyUnicode_CompareWithASCIIString(peek, "]") == 0) { // [!] or [] + ++j; + } + Py_DECREF(peek); + } + while (j < n) { + peek = PySequence_GetItem(unicode, j); + if (peek == NULL) { + goto abort; + } + // locate the closing ']' + if (PyUnicode_CompareWithASCIIString(peek, "]") != 0) { + ++j; + } + Py_DECREF(peek); + } + if (j >= n) { + if (_PyUnicodeWriter_WriteASCIIString(_writer, "\\[", 2) < 0) { + goto abort; + } + h += 2; // we just wrote 2 characters + } + else { + // v--- pattern[j] (exclusive) + // '[' * ... 
* ']' + // ^----- pattern[i] (inclusive) + PyObject *s1 = NULL, *s2 = NULL; + if (PyUnicode_FindChar(unicode, '-', i, j, 1) >= 0) { + PyObject *group = PyUnicode_Substring(unicode, i, j); + if (group == NULL) { + goto abort; + } + s1 = PyObject_CallMethod(group, "replace", "ss", "\\", "\\\\"); + Py_DECREF(group); + } + else { + s1 = get_translated_group(unicode, i, j); + } + if (s1 == NULL) { + goto abort; + } + s2 = PyObject_CallMethod(re, "sub", "ssO", "([&~|])", "\\\\\\1", s1); + Py_DECREF(s1); + if (s2 == NULL) { + goto abort; + } + int difflen = write_translated_group(_writer, s2); + Py_DECREF(s2); + if (difflen < 0) { + goto abort; + } + h += difflen; + i = j + 1; // jump to the character after ']' + } + } + else { + int difflen = write_normal_character(re, _writer, chr); + Py_DECREF(chr); + if (difflen < 0) { + goto abort; + } + h += difflen; + ++i; + } + } + PyObject *parts = PyUnicodeWriter_Finish(writer); + if (parts == NULL) { + Py_DECREF(indices); + return NULL; + } + assert(h == PyUnicode_GET_LENGTH(parts)); + PyObject *res = join_translated_parts(parts, indices); + Py_DECREF(parts); + Py_DECREF(indices); + return res; +abort: + Py_XDECREF(indices); + PyUnicodeWriter_Discard(writer); + return NULL; +} + +/*[clinic input] +_fnmatch.translate -> object + + pat as pattern: object [clinic start generated code]*/ -static int -_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) -/*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ +static PyObject * +_fnmatch_translate_impl(PyObject *module, PyObject *pattern) +/*[clinic end generated code: output=2d9e3bbcbcc6e90e input=56e39f7beea97810]*/ { - if (PyBytes_Check(pat)) { - const char *pattern = PyBytes_AS_STRING(pat); - return posix_fnmatchcase_encoded(pattern, name); + if (PyBytes_Check(pattern)) { + PyObject *unicode = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(pattern), + PyBytes_GET_SIZE(pattern), + "strict"); + if (unicode == NULL) { + return NULL; + } + // 
translated regular expression as a str object + PyObject *str_expr = translate(module, unicode); + Py_DECREF(unicode); + if (str_expr == NULL) { + return NULL; + } + PyObject *expr = PyUnicode_AsLatin1String(str_expr); + Py_DECREF(str_expr); + return expr; } - if (PyUnicode_Check(pat)) { - const char *pattern = PyUnicode_AsUTF8(pat); - return posix_fnmatchcase_unicode(pattern, name); + else if (PyUnicode_Check(pattern)) { + return translate(module, pattern); + } + else { + PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); + return NULL; } - PyErr_Format(PyExc_TypeError, "pattern must be a string or a bytes object"); - return -1; } -static PyMethodDef _fnmatch_methods[] = { +static PyMethodDef fnmatchmodule_methods[] = { _FNMATCH_FILTER_METHODDEF - _FNMATCH_FNMATCH_METHODDEF _FNMATCH_FNMATCHCASE_METHODDEF + _FNMATCH_TRANSLATE_METHODDEF {NULL, NULL} }; -static struct PyModuleDef_Slot _fnmatch_slots[] = { - {0, NULL} +static struct PyModuleDef_Slot fnmatchmodule_slots[] = { + {Py_mod_exec, fnmatchmodule_exec}, + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, + {0, NULL}, }; static struct PyModuleDef _fnmatchmodule = { PyModuleDef_HEAD_INIT, "_fnmatch", NULL, - 0, - _fnmatch_methods, - _fnmatch_slots, - NULL, - NULL, - NULL, + .m_size = sizeof(fnmatchmodule_state), + .m_methods = fnmatchmodule_methods, + .m_slots = fnmatchmodule_slots, + .m_traverse = fnmatchmodule_traverse, + .m_clear = fnmatchmodule_clear, + .m_free = fnmatchmodule_free, }; PyMODINIT_FUNC diff --git a/Modules/clinic/_fnmatchmodule.c.h b/Modules/clinic/_fnmatchmodule.c.h index a693bccee18ff5..4b12f33113d3fb 100644 --- a/Modules/clinic/_fnmatchmodule.c.h +++ b/Modules/clinic/_fnmatchmodule.c.h @@ -64,19 +64,23 @@ _fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -PyDoc_STRVAR(_fnmatch_fnmatch__doc__, -"fnmatch($module, /, name, pat)\n" +PyDoc_STRVAR(_fnmatch_fnmatchcase__doc__, 
+"fnmatchcase($module, /, name, pat)\n" "--\n" -"\n"); +"\n" +"Test whether `name` matches `pattern`, including case.\n" +"\n" +"This is a version of fnmatch() which doesn\'t case-normalize\n" +"its arguments."); -#define _FNMATCH_FNMATCH_METHODDEF \ - {"fnmatch", _PyCFunction_CAST(_fnmatch_fnmatch), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatch__doc__}, +#define _FNMATCH_FNMATCHCASE_METHODDEF \ + {"fnmatchcase", _PyCFunction_CAST(_fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatchcase__doc__}, static int -_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat); +_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat); static PyObject * -_fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +_fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) @@ -100,7 +104,7 @@ _fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb static const char * const _keywords[] = {"name", "pat", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, - .fname = "fnmatch", + .fname = "fnmatchcase", .kwtuple = KWTUPLE, }; #undef KWTUPLE @@ -115,7 +119,7 @@ _fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb } name = args[0]; pat = args[1]; - _return_value = _fnmatch_fnmatch_impl(module, name, pat); + _return_value = _fnmatch_fnmatchcase_impl(module, name, pat); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -125,35 +129,31 @@ _fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb return return_value; } -PyDoc_STRVAR(_fnmatch_fnmatchcase__doc__, -"fnmatchcase($module, /, name, pat)\n" +PyDoc_STRVAR(_fnmatch_translate__doc__, +"translate($module, /, pat)\n" "--\n" -"\n" -"Test whether `name` matches `pattern`, including case.\n" -"\n" -"This is a version of 
fnmatch() which doesn\'t case-normalize\n" -"its arguments."); +"\n"); -#define _FNMATCH_FNMATCHCASE_METHODDEF \ - {"fnmatchcase", _PyCFunction_CAST(_fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatchcase__doc__}, +#define _FNMATCH_TRANSLATE_METHODDEF \ + {"translate", _PyCFunction_CAST(_fnmatch_translate), METH_FASTCALL|METH_KEYWORDS, _fnmatch_translate__doc__}, -static int -_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat); +static PyObject * +_fnmatch_translate_impl(PyObject *module, PyObject *pattern); static PyObject * -_fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +_fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 2 + #define NUM_KEYWORDS 1 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD PyObject *ob_item[NUM_KEYWORDS]; } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, + .ob_item = { &_Py_ID(pat), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -162,31 +162,24 @@ _fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"name", "pat", NULL}; + static const char * const _keywords[] = {"pat", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, - .fname = "fnmatchcase", + .fname = "translate", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[2]; - PyObject *name; - PyObject *pat; - int _return_value; + PyObject *argsbuf[1]; + PyObject *pattern; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf); if (!args) { goto exit; } - name = args[0]; - pat 
= args[1]; - _return_value = _fnmatch_fnmatchcase_impl(module, name, pat); - if ((_return_value == -1) && PyErr_Occurred()) { - goto exit; - } - return_value = PyBool_FromLong((long)_return_value); + pattern = args[0]; + return_value = _fnmatch_translate_impl(module, pattern); exit: return return_value; } -/*[clinic end generated code: output=fd6cc9541aa95a9a input=a9049054013a1b77]*/ +/*[clinic end generated code: output=b0366b259b101bdf input=a9049054013a1b77]*/ From cb16b6ac5bad479ea80933ebe8b43bb682408d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:57:37 +0200 Subject: [PATCH 13/97] update Python implementation --- Lib/fnmatch.py | 289 +++++++++++++++++++++++++------------------------ 1 file changed, 148 insertions(+), 141 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 73acb1fe8d4106..96487bc53fb2de 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -45,148 +45,155 @@ def _compile_pattern(pat): res = translate(pat) return re.compile(res).match -def filter(names, pat): - """Construct a list from those elements of the iterable NAMES that match PAT.""" - result = [] - pat = os.path.normcase(pat) - match = _compile_pattern(pat) - if os.path is posixpath: - # normcase on posix is NOP. Optimize it away from the loop. - for name in names: - if match(name): - result.append(name) - else: - for name in names: - if match(os.path.normcase(name)): - result.append(name) - return result - -def fnmatchcase(name, pat): - """Test whether FILENAME matches PATTERN, including case. - - This is a version of fnmatch() which doesn't case-normalize - its arguments. - """ - match = _compile_pattern(pat) - return match(name) is not None - - -def translate(pat): - """Translate a shell PATTERN to a regular expression. - - There is no way to quote meta-characters. 
- """ - - STAR = object() - parts = _translate(pat, STAR, '.') - return _join_translated_parts(parts, STAR) - - -def _translate(pat, STAR, QUESTION_MARK): - res = [] - add = res.append - i, n = 0, len(pat) - while i < n: - c = pat[i] - i = i+1 - if c == '*': - # compress consecutive `*` into one - if (not res) or res[-1] is not STAR: - add(STAR) - elif c == '?': - add(QUESTION_MARK) - elif c == '[': - j = i - if j < n and pat[j] == '!': - j = j+1 - if j < n and pat[j] == ']': - j = j+1 - while j < n and pat[j] != ']': - j = j+1 - if j >= n: - add('\\[') - else: - stuff = pat[i:j] - if '-' not in stuff: - stuff = stuff.replace('\\', r'\\') +try: + from _fnmatch import filter +except ImportError: + def filter(names, pat): + """Construct a list from those elements of the iterable NAMES that match PAT.""" + result = [] + pat = os.path.normcase(pat) + match = _compile_pattern(pat) + if os.path is posixpath: + # normcase on posix is NOP. Optimize it away from the loop. + for name in names: + if match(name): + result.append(name) + else: + for name in names: + if match(os.path.normcase(name)): + result.append(name) + return result + +try: + from _fnmatch import fnmatchcase +except ImportError: + def fnmatchcase(name, pat): + """Test whether FILENAME matches PATTERN, including case. + + This is a version of fnmatch() which doesn't case-normalize + its arguments. + """ + match = _compile_pattern(pat) + return match(name) is not None + +try: + from _fnmatch import translate +except ImportError: + def translate(pat): + """Translate a shell PATTERN to a regular expression. + + There is no way to quote meta-characters. 
+ """ + + STAR = object() + parts = _translate(pat, STAR, '.') + return _join_translated_parts(parts, STAR) + + def _translate(pat, STAR, QUESTION_MARK): + res = [] + add = res.append + i, n = 0, len(pat) + while i < n: + c = pat[i] + i = i+1 + if c == '*': + # compress consecutive `*` into one + if (not res) or res[-1] is not STAR: + add(STAR) + elif c == '?': + add(QUESTION_MARK) + elif c == '[': + j = i + if j < n and pat[j] == '!': + j = j+1 + if j < n and pat[j] == ']': + j = j+1 + while j < n and pat[j] != ']': + j = j+1 + if j >= n: + add('\\[') else: - chunks = [] - k = i+2 if pat[i] == '!' else i+1 - while True: - k = pat.find('-', k, j) - if k < 0: - break - chunks.append(pat[i:k]) - i = k+1 - k = k+3 - chunk = pat[i:j] - if chunk: - chunks.append(chunk) + stuff = pat[i:j] + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') else: - chunks[-1] += '-' - # Remove empty ranges -- invalid in RE. - for k in range(len(chunks)-1, 0, -1): - if chunks[k-1][-1] > chunks[k][0]: - chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] - del chunks[k] - # Escape backslashes and hyphens for set difference (--). - # Hyphens that create ranges shouldn't be escaped. - stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') - for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) - i = j+1 - if not stuff: - # Empty range: never match. - add('(?!)') - elif stuff == '!': - # Negated empty range: match any character. - add('.') - else: - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - add(f'[{stuff}]') - else: - add(re.escape(c)) - assert i == n - return res - - -def _join_translated_parts(inp, STAR): - # Deal with STARs. - res = [] - add = res.append - i, n = 0, len(inp) - # Fixed pieces at the start? - while i < n and inp[i] is not STAR: - add(inp[i]) - i += 1 - # Now deal with STAR fixed STAR fixed ... 
- # For an interior `STAR fixed` pairing, we want to do a minimal - # .*? match followed by `fixed`, with no possibility of backtracking. - # Atomic groups ("(?>...)") allow us to spell that directly. - # Note: people rely on the undocumented ability to join multiple - # translate() results together via "|" to build large regexps matching - # "one of many" shell patterns. - while i < n: - assert inp[i] is STAR - i += 1 - if i == n: - add(".*") - break - assert inp[i] is not STAR - fixed = [] + chunks = [] + k = i+2 if pat[i] == '!' else i+1 + while True: + k = pat.find('-', k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k+1 + k = k+3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + i = j+1 + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add('.') + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') + else: + add(re.escape(c)) + assert i == n + return res + + + def _join_translated_parts(inp, STAR): + # Deal with STARs. + res = [] + add = res.append + i, n = 0, len(inp) + # Fixed pieces at the start? while i < n and inp[i] is not STAR: - fixed.append(inp[i]) + add(inp[i]) i += 1 - fixed = "".join(fixed) - if i == n: - add(".*") - add(fixed) - else: - add(f"(?>.*?{fixed})") - assert i == n - res = "".join(res) - return fr'(?s:{res})\Z' + # Now deal with STAR fixed STAR fixed ... 
+ # For an interior `STAR fixed` pairing, we want to do a minimal + # .*? match followed by `fixed`, with no possibility of backtracking. + # Atomic groups ("(?>...)") allow us to spell that directly. + # Note: people rely on the undocumented ability to join multiple + # translate() results together via "|" to build large regexps matching + # "one of many" shell patterns. + while i < n: + assert inp[i] is STAR + i += 1 + if i == n: + add(".*") + break + assert inp[i] is not STAR + fixed = [] + while i < n and inp[i] is not STAR: + fixed.append(inp[i]) + i += 1 + fixed = "".join(fixed) + if i == n: + add(".*") + add(fixed) + else: + add(f"(?>.*?{fixed})") + assert i == n + res = "".join(res) + return fr'(?s:{res})\Z' From 751c06906bead6192bc79bfe9f3db67136929502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:57:42 +0200 Subject: [PATCH 14/97] update tests --- Lib/test/test_fnmatch.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 94ec41958b07c0..f7e9391722ac38 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -1,13 +1,18 @@ """Test cases for the fnmatch module.""" - -import unittest import os import string +import unittest import warnings -import _fnmatch as c_fnmatch -import fnmatch as py_fnmatch -from fnmatch import fnmatch, fnmatchcase, translate, filter +import test.support.import_helper + +c_fnmatch = test.support.import_helper.import_fresh_module("_fnmatch", blocked=["fnmatch"]) +py_fnmatch = test.support.import_helper.import_fresh_module("fnmatch", blocked=["_fnmatch"]) + +fnmatch = py_fnmatch.fnmatch +fnmatchcase = py_fnmatch.fnmatchcase +translate = py_fnmatch.translate +filter = py_fnmatch.filter class FnmatchTestCase(unittest.TestCase): From 92580688a0999401dd0b11cf5683d43bceb45d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= 
<10796600+picnixz@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:57:50 +0200 Subject: [PATCH 15/97] add generated objects --- Include/internal/pycore_global_objects_fini_generated.h | 1 + Include/internal/pycore_global_strings.h | 1 + Include/internal/pycore_runtime_init_generated.h | 1 + Include/internal/pycore_unicodeobject_generated.h | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 77b2a8e2e7a7dc..8e3d405fc7c04b 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -916,6 +916,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(entrypoint)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(env)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(errors)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(escape)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 4896f6343087d3..7cbc1941ffa0ee 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -405,6 +405,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(entrypoint) STRUCT_FOR_ID(env) STRUCT_FOR_ID(errors) + STRUCT_FOR_ID(escape) STRUCT_FOR_ID(event) STRUCT_FOR_ID(eventmask) STRUCT_FOR_ID(exc_type) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 1249957fb29d1e..4164c6cae7a8bf 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -914,6 +914,7 @@ extern "C" { INIT_ID(entrypoint), \ INIT_ID(env), \ INIT_ID(errors), \ + INIT_ID(escape), \ 
INIT_ID(event), \ INIT_ID(eventmask), \ INIT_ID(exc_type), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 0bd57f5db64ea9..b15845cd16e814 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1420,6 +1420,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(escape); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(event); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); From 5a7183c80ed3ebcec0a04a6af8e2f5eaf3270fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:01:33 +0200 Subject: [PATCH 16/97] re-expose private API --- Lib/fnmatch.py | 208 ++++++++++++++++++++++++------------------------- 1 file changed, 104 insertions(+), 104 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 96487bc53fb2de..ffa15825954f5a 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -89,111 +89,111 @@ def translate(pat): parts = _translate(pat, STAR, '.') return _join_translated_parts(parts, STAR) - def _translate(pat, STAR, QUESTION_MARK): - res = [] - add = res.append - i, n = 0, len(pat) - while i < n: - c = pat[i] - i = i+1 - if c == '*': - # compress consecutive `*` into one - if (not res) or res[-1] is not STAR: - add(STAR) - elif c == '?': - add(QUESTION_MARK) - elif c == '[': - j = i - if j < n and pat[j] == '!': - j = j+1 - if j < n and pat[j] == ']': - j = j+1 - while j < n and pat[j] != ']': - j = j+1 - if j >= n: - add('\\[') +def _translate(pat, STAR, QUESTION_MARK): + res = [] + add = res.append + i, n = 0, len(pat) + while 
i < n: + c = pat[i] + i = i+1 + if c == '*': + # compress consecutive `*` into one + if (not res) or res[-1] is not STAR: + add(STAR) + elif c == '?': + add(QUESTION_MARK) + elif c == '[': + j = i + if j < n and pat[j] == '!': + j = j+1 + if j < n and pat[j] == ']': + j = j+1 + while j < n and pat[j] != ']': + j = j+1 + if j >= n: + add('\\[') + else: + stuff = pat[i:j] + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') else: - stuff = pat[i:j] - if '-' not in stuff: - stuff = stuff.replace('\\', r'\\') - else: - chunks = [] - k = i+2 if pat[i] == '!' else i+1 - while True: - k = pat.find('-', k, j) - if k < 0: - break - chunks.append(pat[i:k]) - i = k+1 - k = k+3 - chunk = pat[i:j] - if chunk: - chunks.append(chunk) - else: - chunks[-1] += '-' - # Remove empty ranges -- invalid in RE. - for k in range(len(chunks)-1, 0, -1): - if chunks[k-1][-1] > chunks[k][0]: - chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] - del chunks[k] - # Escape backslashes and hyphens for set difference (--). - # Hyphens that create ranges shouldn't be escaped. - stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') - for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) - i = j+1 - if not stuff: - # Empty range: never match. - add('(?!)') - elif stuff == '!': - # Negated empty range: match any character. - add('.') + chunks = [] + k = i+2 if pat[i] == '!' else i+1 + while True: + k = pat.find('-', k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k+1 + k = k+3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) else: - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - add(f'[{stuff}]') - else: - add(re.escape(c)) - assert i == n - return res - - - def _join_translated_parts(inp, STAR): - # Deal with STARs. - res = [] - add = res.append - i, n = 0, len(inp) - # Fixed pieces at the start? + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. 
+ for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + i = j+1 + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add('.') + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') + else: + add(re.escape(c)) + assert i == n + return res + + +def _join_translated_parts(inp, STAR): + # Deal with STARs. + res = [] + add = res.append + i, n = 0, len(inp) + # Fixed pieces at the start? + while i < n and inp[i] is not STAR: + add(inp[i]) + i += 1 + # Now deal with STAR fixed STAR fixed ... + # For an interior `STAR fixed` pairing, we want to do a minimal + # .*? match followed by `fixed`, with no possibility of backtracking. + # Atomic groups ("(?>...)") allow us to spell that directly. + # Note: people rely on the undocumented ability to join multiple + # translate() results together via "|" to build large regexps matching + # "one of many" shell patterns. + while i < n: + assert inp[i] is STAR + i += 1 + if i == n: + add(".*") + break + assert inp[i] is not STAR + fixed = [] while i < n and inp[i] is not STAR: - add(inp[i]) - i += 1 - # Now deal with STAR fixed STAR fixed ... - # For an interior `STAR fixed` pairing, we want to do a minimal - # .*? match followed by `fixed`, with no possibility of backtracking. - # Atomic groups ("(?>...)") allow us to spell that directly. - # Note: people rely on the undocumented ability to join multiple - # translate() results together via "|" to build large regexps matching - # "one of many" shell patterns. 
- while i < n: - assert inp[i] is STAR + fixed.append(inp[i]) i += 1 - if i == n: - add(".*") - break - assert inp[i] is not STAR - fixed = [] - while i < n and inp[i] is not STAR: - fixed.append(inp[i]) - i += 1 - fixed = "".join(fixed) - if i == n: - add(".*") - add(fixed) - else: - add(f"(?>.*?{fixed})") - assert i == n - res = "".join(res) - return fr'(?s:{res})\Z' + fixed = "".join(fixed) + if i == n: + add(".*") + add(fixed) + else: + add(f"(?>.*?{fixed})") + assert i == n + res = "".join(res) + return fr'(?s:{res})\Z' From 2a8020046cf62fb2ff6b7d92205f609d7d6856cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 11:54:59 +0200 Subject: [PATCH 17/97] fix implementation? --- Modules/_fnmatchmodule.c | 475 ++++++++++++++++++++------------------- 1 file changed, 248 insertions(+), 227 deletions(-) diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatchmodule.c index 07d10f9112bc0e..251b71ea5f5cfd 100644 --- a/Modules/_fnmatchmodule.c +++ b/Modules/_fnmatchmodule.c @@ -7,6 +7,7 @@ */ #include "Python.h" +#include "pycore_call.h" // for _PyObject_CallMethod #include "clinic/_fnmatchmodule.c.h" @@ -19,6 +20,10 @@ typedef struct { PyObject *os_module; // 'os' module PyObject *lru_cache; // optional cache for regex patterns, if needed + + PyObject *str_atomic_bgroup; // (?>.*? 
+ PyObject *str_atomic_egroup; // ) + PyObject *str_wildcard; // * } fnmatchmodule_state; static inline fnmatchmodule_state * @@ -36,6 +41,10 @@ fnmatchmodule_clear(PyObject *m) Py_CLEAR(st->os_module); Py_CLEAR(st->re_module); Py_CLEAR(st->lru_cache); + + Py_CLEAR(st->str_atomic_bgroup); + Py_CLEAR(st->str_atomic_egroup); + Py_CLEAR(st->str_wildcard); return 0; } @@ -46,6 +55,10 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) Py_VISIT(st->os_module); Py_VISIT(st->re_module); Py_VISIT(st->lru_cache); + + Py_VISIT(st->str_atomic_bgroup); + Py_VISIT(st->str_atomic_egroup); + Py_VISIT(st->str_wildcard); return 0; } @@ -58,17 +71,27 @@ fnmatchmodule_free(void *m) static int fnmatchmodule_exec(PyObject *m) { +#define IMPORT_MODULE(attr, name) \ + do { \ + state->attr = PyImport_ImportModule((name)); \ + if (state->attr == NULL) { \ + return -1; \ + } \ + } while (0) + +#define INTERN_STRING(attr, str) \ + do { \ + state->attr = PyUnicode_InternFromString((str)); \ + if (state->attr == NULL) { \ + return -1; \ + } \ + } while (0) + fnmatchmodule_state *state = get_fnmatchmodulestate_state(m); // imports - state->os_module = PyImport_ImportModule("os"); - if (state->os_module == NULL) { - return -1; - } - state->re_module = PyImport_ImportModule("re"); - if (state->re_module == NULL) { - return -1; - } + IMPORT_MODULE(os_module, "os"); + IMPORT_MODULE(re_module, "re"); // helpers state->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); @@ -76,6 +99,15 @@ fnmatchmodule_exec(PyObject *m) return -1; } // todo: handle LRU cache + + // interned strings + INTERN_STRING(str_atomic_bgroup, "(?>.*?"); + INTERN_STRING(str_atomic_egroup, ")"); + INTERN_STRING(str_wildcard, "*"); + +#undef INTERN_STRING +#undef IMPORT_MODULE + return 0; } @@ -347,17 +379,48 @@ _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) #endif } +/* + * Convert Py_UCS4 to (PyObject *). + * + * This creates a new reference. 
+ * + * Note: this is 'unicode_char' taken from Objects/unicodeobject.c. + */ +static PyObject * +get_unicode_character(Py_UCS4 ch) +{ + assert(ch <= MAX_UNICODE); + if (ch < 256) { + PyObject *o = _Py_LATIN1_CHR(ch); + assert(_Py_IsImmortal(o)); + return o; + } + PyObject *unicode = PyUnicode_New(1, ch); + if (unicode == NULL) { + return NULL; + } + assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); + if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { + PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2) ch; + } + else { + assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); + PyUnicode_4BYTE_DATA(unicode)[0] = ch; + } + assert(_PyUnicode_CheckConsistency(unicode, 1)); + return unicode; +} -static inline int /* number of written characters or -1 on error */ -write_normal_character(PyObject *re, _PyUnicodeWriter *writer, PyObject *cp) +static Py_ssize_t /* number of written characters or -1 on error */ +write_escaped_string(PyObject *re, _PyUnicodeWriter *writer, PyObject *str) { - PyObject *ch = PyObject_CallMethodOneArg(re, &_Py_ID(escape), cp); - if (ch == NULL) { + PyObject *escaped = PyObject_CallMethodOneArg(re, &_Py_ID(escape), str); + if (escaped == NULL) { return -1; } - int written = PyUnicode_GetLength(ch); - int rc = _PyUnicodeWriter_WriteStr(writer, ch); - Py_DECREF(ch); + Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); + int rc = _PyUnicodeWriter_WriteStr(writer, escaped); + Py_DECREF(escaped); if (rc < 0) { return -1; } @@ -365,7 +428,7 @@ write_normal_character(PyObject *re, _PyUnicodeWriter *writer, PyObject *cp) return written; } -static inline int /* number of written characters or -1 on error */ +static Py_ssize_t /* number of written characters or -1 on error */ write_translated_group(_PyUnicodeWriter *writer, PyObject *group) { #define WRITE_ASCII(str, len) \ @@ -395,7 +458,7 @@ write_translated_group(_PyUnicodeWriter *writer, PyObject *group) return 1; } else { - int extra = 0; + Py_ssize_t extra = 2; // '[' and ']' 
WRITE_CHAR('['); switch (buffer[0]) { case '!': { @@ -408,7 +471,7 @@ write_translated_group(_PyUnicodeWriter *writer, PyObject *group) case '^': case '[': { WRITE_CHAR('\\'); - extra = 1; + extra++; break; } default: @@ -418,30 +481,25 @@ write_translated_group(_PyUnicodeWriter *writer, PyObject *group) break; } WRITE_CHAR(']'); - return 2 + grouplen + extra; + return grouplen + extra; } #undef WRITE_CHAR #undef WRITE_ASCII } static PyObject * -get_translated_group(PyObject *unicode, - Py_ssize_t i /* unicode[i-1] == '[' (incl.) */, - Py_ssize_t j /* unicode[j] == ']' (excl.) */) +get_translated_group(PyObject *pattern, + Py_ssize_t i /* pattern[i-1] == '[' (incl.) */, + Py_ssize_t j /* pattern[j] == ']' (excl.) */) { PyObject *chunks = PyList_New(0); if (chunks == NULL) { return NULL; } - PyObject *chr = PySequence_GetItem(unicode, i); - if (chr == NULL) { - goto error; - } - Py_ssize_t k = PyUnicode_CompareWithASCIIString(chr, "!") == 0 ? i + 2 : i + 1; - Py_DECREF(chr); + Py_ssize_t k = (PyUnicode_READ_CHAR(pattern, i) == '!') ? 
i + 2 : i + 1; Py_ssize_t chunkscount = 0; while (k < j) { - PyObject *eobj = PyObject_CallMethod(unicode, "find", "ii", k, j); + PyObject *eobj = _PyObject_CallMethod(pattern, &_Py_ID(find), "ii", k, j); if (eobj == NULL) { goto error; } @@ -450,7 +508,7 @@ get_translated_group(PyObject *unicode, if (t < 0) { goto error; } - PyObject *sub = PyUnicode_Substring(unicode, i, t); + PyObject *sub = PyUnicode_Substring(pattern, i, t); if (sub == NULL) { goto error; } @@ -479,7 +537,7 @@ get_translated_group(PyObject *unicode, } } else { - PyObject *sub = PyUnicode_Substring(unicode, i, j); + PyObject *sub = PyUnicode_Substring(pattern, i, j); if (sub == NULL) { goto error; } @@ -494,24 +552,16 @@ get_translated_group(PyObject *unicode, Py_ssize_t c = chunkscount; while (--c) { PyObject *c1 = PyList_GET_ITEM(chunks, c - 1); - assert(c1 != NULL); - Py_ssize_t c1len = 0; - const char *c1buf = PyUnicode_AsUTF8AndSize(c1, &c1len); - if (c1buf == NULL) { - goto error; - } assert(c1len > 0); + Py_ssize_t c1len = PyUnicode_GET_LENGTH(c1); + assert(c1 != NULL); PyObject *c2 = PyList_GET_ITEM(chunks, c); assert(c2 != NULL); - Py_ssize_t c2len = 0; - const char *c2buf = PyUnicode_AsUTF8AndSize(c2, &c2len); - if (c2buf == NULL) { - goto error; - } + Py_ssize_t c2len = PyUnicode_GET_LENGTH(c2); assert(c2len > 0); - if (c1buf[c1len - 1] > c2buf[0]) { + if (PyUnicode_READ_CHAR(c1, c1len - 1) > PyUnicode_READ_CHAR(c2, 0)) { // all but the last character in the chunk PyObject *c1sub = PyUnicode_Substring(c1, 0, c1len - 1); // all but the first character in the chunk @@ -558,7 +608,7 @@ get_translated_group(PyObject *unicode, goto error; } } - PyObject *hyphen = PyUnicode_FromString("-"); + PyObject *hyphen = PyUnicode_FromOrdinal('-'); if (hyphen == NULL) { goto error; } @@ -575,101 +625,79 @@ get_translated_group(PyObject *unicode, } static PyObject * -join_translated_parts(PyObject *parts, PyObject *indices) +join_translated_parts(PyObject *module, PyObject *strings, PyObject 
*indices) { -#define LOAD_STAR_INDEX(var, k) \ - do { \ - ind = PyList_GET_ITEM(indices, (k)); \ - var = PyLong_AsSsize_t(ind); \ - if (var < 0) { \ - goto abort; \ - } \ - } while (0) - #define WRITE_SUBSTRING(i, j) \ do { \ if ((i) < (j)) { \ - if (_PyUnicodeWriter_WriteSubstring(_writer, parts, (i), (j)) < 0) { \ + if (_PyUnicodeWriter_WriteSubstring(_writer, strings, (i), (j)) < 0) { \ goto abort; \ } \ } \ } while (0) -#define WRITE_WILDCARD() \ - do { \ - if (_PyUnicodeWriter_WriteASCIIString(_writer, ".*", 2) < 0) { \ - goto abort; \ - } \ - } while (0) - -#define WRITE_ATOMIC_SUBSTRING(i, j) \ - do { \ - if ((_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || \ - (_PyUnicodeWriter_WriteSubstring(_writer, parts, (i), (j)) < 0) || \ - (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) \ - { \ - goto abort; \ - } \ - } while (0) - const Py_ssize_t m = PyList_GET_SIZE(indices); if (m == 0) { // just write fr'(?s:{parts} + ")\Z" - return PyUnicode_FromFormat("(?s:%S)\\Z", parts); + return PyUnicode_FromFormat("(?s:%S)\\Z", strings); } - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); - if (writer == NULL) { - return NULL; - } - _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); - /* * Special cases: indices[0] == 0 or indices[-1] + 1 == n * - * If indices[0] == 0 write (?>.*?group_1) instead of abcdef + * If indices[0] == 0 write (?>.*?abcdef) instead of abcdef * If indices[-1] == n - 1 write '.*' instead of empty string */ PyObject *ind; - Py_ssize_t i, j, n = PyUnicode_GetLength(parts); - // handle the first group - LOAD_STAR_INDEX(i, 0); - if (i == 0) { - if (m == 1) { // pattern = '*TAIL' - WRITE_WILDCARD(); - WRITE_SUBSTRING(1, n); // write TAIL part - goto finalize; - } - else { // pattern = '*BODY*...' - LOAD_STAR_INDEX(j, 1); - WRITE_ATOMIC_SUBSTRING(i + 1, j); - i = j + 1; - } + Py_ssize_t i = 0, j, n = PyUnicode_GET_LENGTH(strings); + /* + * If the pattern starts with '*', we will write everything + * before it. 
So we will write at least indices[0] characters. + * + * For the inner groups 'STAR STRING ...' we always surround + * the STRING by "(?>.*?" and ")", and thus we will write at + * least 7 + len(STRING) characters. + * + * We write one additional '.*' if indices[-1] + 1 = n. + * + * Since the result is surrounded by "(?s:" and ")\Z", we + * write at least "indices[0] + 7m + n + 6" characters, + * where 'm' is the number of stars and 'n' the length + * of the translated pattern. + */ + PyObject *jobj = PyList_GET_ITEM(indices, 0); + j = PyLong_AsSsize_t(jobj); // get the first position of '*' + if (j < 0) { + return NULL; } - else { - if (m == 1) { // pattern = 'HEAD*' or 'HEAD*TAIL' - WRITE_SUBSTRING(0, i); // write HEAD part - WRITE_WILDCARD(); - WRITE_SUBSTRING(i + 1, n); // write TAIL part (if any) - goto finalize; - } - else { // pattern = 'HEAD*STRING*...' - WRITE_SUBSTRING(0, i); // write HEAD part - i++; - } + Py_ssize_t estimate = j + 7 * m + n + 6; + PyUnicodeWriter *writer = PyUnicodeWriter_Create(estimate); + if (writer == NULL) { + return NULL; } - // handle the inner groups - for (Py_ssize_t k = 1; k < m - 1; ++k) { - LOAD_STAR_INDEX(j, k + 1); - assert(i < j); - WRITE_ATOMIC_SUBSTRING(i, j); + _PyUnicodeWriter *_writer = (_PyUnicodeWriter *) (writer); + + WRITE_SUBSTRING(i, j); // write stuff before '*' if needed + i = j + 1; // jump after the star + for (Py_ssize_t k = 1; k < m; ++k) { + ind = PyList_GET_ITEM(indices, k); + j = PyLong_AsSsize_t(ind); + assert(j < 0 || i > j); + if (j < 0 || + (_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || + (_PyUnicodeWriter_WriteSubstring(_writer, strings, i, j) < 0) || + (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) { + goto abort; + } i = j + 1; } // handle the last group - WRITE_WILDCARD(); - WRITE_SUBSTRING(i, n); // write TAIL part ( -finalize: - ; // empty statement for allowing a label before a declaration + if (_PyUnicodeWriter_WriteASCIIString(_writer, ".*", 2) < 0) { + goto abort; + } + 
WRITE_SUBSTRING(i, n); // write TAIL part + +#undef WRITE_SUBSTRING + PyObject *res = PyUnicodeWriter_Finish(writer); if (res == NULL) { return NULL; @@ -681,163 +709,156 @@ join_translated_parts(PyObject *parts, PyObject *indices) } static PyObject * -translate(PyObject *module, PyObject *unicode) +translate(PyObject *module, PyObject *pattern) /* new reference */ { +#define READ(ind) PyUnicode_READ(kind, data, (ind)) + +#define ADVANCE_IF_CHAR(ch, ind, maxind) \ + do { \ + if ((ind) < (maxind) && READ(ind) == (ch)) { \ + ++(ind); \ + } \ + } while (0) + +#define _WHILE_READ_CMP(ch, ind, maxind, cmp) \ + do { \ + while ((ind) < (maxind) && READ(ind) cmp (ch)) { \ + ++(ind); \ + } \ + } while (0) + +#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, !=) +#define DROP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, ==) + fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); PyObject *re = state->re_module; - - Py_ssize_t estimate = 0; - PyUnicodeWriter *writer = PyUnicodeWriter_Create(estimate); + const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); + // We would write less data if there are successive '*', which should + // not be the case in general. Otherwise, we write >= n characters + // since escaping them would always add more characters so we will + // overestimate a bit the number of characters to write. + // + // TODO(picnixz): should we limit the estimation or not? 
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create((Py_ssize_t) (1.05 * n)); if (writer == NULL) { return NULL; } _PyUnicodeWriter *_writer = (_PyUnicodeWriter *) (writer); - // list containing the indices where '*' has a special meaning PyObject *indices = PyList_New(0); if (indices == NULL) { goto abort; } - - Py_ssize_t n = PyUnicode_GetLength(unicode); - if (n < 0) { - goto abort; - } + const int kind = PyUnicode_KIND(pattern); + const void *data = PyUnicode_DATA(pattern); Py_ssize_t h = 0, i = 0; - PyObject *peek = NULL; while (i < n) { - PyObject *chr = PySequence_GetItem(unicode, i); - if (chr == NULL) { - goto abort; - } - if (PyUnicode_CompareWithASCIIString(chr, "*") == 0) { - Py_DECREF(chr); - if (_PyUnicodeWriter_WriteChar(_writer, '*') < 0) { - goto abort; - } - // drop all other '*' that can be found afterwards - while (++i < n) { - peek = PySequence_GetItem(unicode, i); - if (peek == NULL) { - goto abort; - } - if (PyUnicode_CompareWithASCIIString(peek, "*") != 0) { - Py_DECREF(peek); - break; - } - Py_DECREF(peek); - } - PyObject *index = PyLong_FromLong(h++); - if (index == NULL) { - goto abort; - } - int rc = PyList_Append(indices, index); - Py_DECREF(index); - if (rc < 0) { - goto abort; - } - } - else if (PyUnicode_CompareWithASCIIString(chr, "?") == 0) { - Py_DECREF(chr); - // translate optional '?' (fnmatch) into optional '.' (regex) - if (_PyUnicodeWriter_WriteChar(_writer, '.') < 0) { - goto abort; - } - ++i; // advance for the next iteration - ++h; // increase the expected result's length - } - else if (PyUnicode_CompareWithASCIIString(chr, "[") == 0) { - Py_DECREF(chr); - // check the next characters (peek) - Py_ssize_t j = ++i; - if (j < n) { - peek = PySequence_GetItem(unicode, j); - if (peek == NULL) { + // read and advance to the next character + Py_UCS4 chr = READ(i++); + switch (chr) { + case '*': { + if (_PyUnicodeWriter_WriteChar(_writer, chr) < 0) { goto abort; } - if (PyUnicode_CompareWithASCIIString(peek, "!") == 0) {// [! 
- ++j; - } - Py_DECREF(peek); - } - if (j < n) { - peek = PySequence_GetItem(unicode, j); - if (peek == NULL) { + DROP_DUPLICATES('*', i, n); + PyObject *index = PyLong_FromSsize_t(h++); + if (index == NULL) { goto abort; } - if (PyUnicode_CompareWithASCIIString(peek, "]") == 0) { // [!] or [] - ++j; - } - Py_DECREF(peek); - } - while (j < n) { - peek = PySequence_GetItem(unicode, j); - if (peek == NULL) { + int rc = PyList_Append(indices, index); + Py_DECREF(index); + if (rc < 0) { goto abort; } - // locate the closing ']' - if (PyUnicode_CompareWithASCIIString(peek, "]") != 0) { - ++j; - } - Py_DECREF(peek); + break; } - if (j >= n) { - if (_PyUnicodeWriter_WriteASCIIString(_writer, "\\[", 2) < 0) { + case '?': { + // translate optional '?' (fnmatch) into optional '.' (regex) + if (_PyUnicodeWriter_WriteChar(_writer, '.') < 0) { goto abort; } - h += 2; // we just wrote 2 characters + ++h; // increase the expected result's length + break; } - else { - // v--- pattern[j] (exclusive) - // '[' * ... * ']' - // ^----- pattern[i] (inclusive) - PyObject *s1 = NULL, *s2 = NULL; - if (PyUnicode_FindChar(unicode, '-', i, j, 1) >= 0) { - PyObject *group = PyUnicode_Substring(unicode, i, j); - if (group == NULL) { + case '[': { + Py_ssize_t j = i; // 'i' is already at next char + ADVANCE_IF_CHAR('!', j, n); // [! + ADVANCE_IF_CHAR(']', j, n); // [!] or [] + ADVANCE_TO_NEXT(']', j, n); // locate closing ']' + if (j >= n) { + if (_PyUnicodeWriter_WriteASCIIString(_writer, "\\[", 2) < 0) { goto abort; } - s1 = PyObject_CallMethod(group, "replace", "ss", "\\", "\\\\"); - Py_DECREF(group); + h += 2; // we just wrote 2 characters + break; // early break for clarity } else { - s1 = get_translated_group(unicode, i, j); - } - if (s1 == NULL) { - goto abort; + // v--- pattern[j] (exclusive) + // '[' * ... 
* ']' + // ^----- pattern[i] (inclusive) + PyObject *s1 = NULL, *s2 = NULL; + int rc = PyUnicode_FindChar(pattern, '-', i, j, 1); + if (rc == -2) { + goto abort; + } + if (rc == -1) { + PyObject *group = PyUnicode_Substring(pattern, i, j); + if (group == NULL) { + goto abort; + } + s1 = _PyObject_CallMethod(group, &_Py_ID(replace), "ss", "\\", "\\\\"); + Py_DECREF(group); + } + else { + assert(rc >= 0); + s1 = get_translated_group(pattern, i, j); + } + if (s1 == NULL) { + goto abort; + } + s2 = _PyObject_CallMethod(re, &_Py_ID(sub), "ssO", "([&~|])", "\\\\\\1", s1); + Py_DECREF(s1); + if (s2 == NULL) { + goto abort; + } + int difflen = write_translated_group(_writer, s2); + Py_DECREF(s2); + if (difflen < 0) { + goto abort; + } + h += difflen; + i = j + 1; // jump to the character after ']' + break; // early break for clarity } - s2 = PyObject_CallMethod(re, "sub", "ssO", "([&~|])", "\\\\\\1", s1); - Py_DECREF(s1); - if (s2 == NULL) { + } + default: { + PyObject *str = get_unicode_character(chr); + if (str == NULL) { goto abort; } - int difflen = write_translated_group(_writer, s2); - Py_DECREF(s2); + int difflen = write_escaped_string(re, _writer, str); + Py_DECREF(str); if (difflen < 0) { goto abort; } h += difflen; - i = j + 1; // jump to the character after ']' - } - } - else { - int difflen = write_normal_character(re, _writer, chr); - Py_DECREF(chr); - if (difflen < 0) { - goto abort; + break; } - h += difflen; - ++i; } } +#undef DROP_DUPLICATES +#undef ADVANCE_TO_NEXT +#undef _WHILE_READ_CMP +#undef ADVANCE_IF_CHAR +#undef READ PyObject *parts = PyUnicodeWriter_Finish(writer); if (parts == NULL) { Py_DECREF(indices); return NULL; } assert(h == PyUnicode_GET_LENGTH(parts)); - PyObject *res = join_translated_parts(parts, indices); + PyObject *res = join_translated_parts(module, parts, indices); Py_DECREF(parts); Py_DECREF(indices); return res; From 36432e82044ecd3f39469a6270df7313a4c082de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= 
<10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 11:55:05 +0200 Subject: [PATCH 18/97] update generated objects --- Include/internal/pycore_global_objects_fini_generated.h | 2 ++ Include/internal/pycore_global_strings.h | 2 ++ Include/internal/pycore_runtime_init_generated.h | 2 ++ Include/internal/pycore_unicodeobject_generated.h | 8 ++++++++ 4 files changed, 14 insertions(+) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 8e3d405fc7c04b..fc82cfc1536feb 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -945,6 +945,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filter)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filters)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(final)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(find)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(find_class)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fix_imports)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flags)); @@ -1229,6 +1230,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strict)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strict_mode)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(string)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sub)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sub_key)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(symmetric_difference_update)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tabsize)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 7cbc1941ffa0ee..78f56bdd2f6238 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -434,6 +434,7 @@ struct _Py_global_strings { 
STRUCT_FOR_ID(filter) STRUCT_FOR_ID(filters) STRUCT_FOR_ID(final) + STRUCT_FOR_ID(find) STRUCT_FOR_ID(find_class) STRUCT_FOR_ID(fix_imports) STRUCT_FOR_ID(flags) @@ -718,6 +719,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(strict) STRUCT_FOR_ID(strict_mode) STRUCT_FOR_ID(string) + STRUCT_FOR_ID(sub) STRUCT_FOR_ID(sub_key) STRUCT_FOR_ID(symmetric_difference_update) STRUCT_FOR_ID(tabsize) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 4164c6cae7a8bf..2c965ecd99fcf5 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -943,6 +943,7 @@ extern "C" { INIT_ID(filter), \ INIT_ID(filters), \ INIT_ID(final), \ + INIT_ID(find), \ INIT_ID(find_class), \ INIT_ID(fix_imports), \ INIT_ID(flags), \ @@ -1227,6 +1228,7 @@ extern "C" { INIT_ID(strict), \ INIT_ID(strict_mode), \ INIT_ID(string), \ + INIT_ID(sub), \ INIT_ID(sub_key), \ INIT_ID(symmetric_difference_update), \ INIT_ID(tabsize), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index b15845cd16e814..0307d1f4806ba7 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1536,6 +1536,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(find); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(find_class); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2672,6 +2676,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); 
assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sub); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(sub_key); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); From 4a369809c65673e2d571574962049dcb8fa28409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 12:01:54 +0200 Subject: [PATCH 19/97] FIX BUILD --- Modules/_fnmatchmodule.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatchmodule.c index 251b71ea5f5cfd..82458b9499ee35 100644 --- a/Modules/_fnmatchmodule.c +++ b/Modules/_fnmatchmodule.c @@ -389,7 +389,7 @@ _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) static PyObject * get_unicode_character(Py_UCS4 ch) { - assert(ch <= MAX_UNICODE); + assert(ch <= 0x10ffff); if (ch < 256) { PyObject *o = _Py_LATIN1_CHR(ch); assert(_Py_IsImmortal(o)); @@ -552,9 +552,9 @@ get_translated_group(PyObject *pattern, Py_ssize_t c = chunkscount; while (--c) { PyObject *c1 = PyList_GET_ITEM(chunks, c - 1); - assert(c1len > 0); - Py_ssize_t c1len = PyUnicode_GET_LENGTH(c1); assert(c1 != NULL); + Py_ssize_t c1len = PyUnicode_GET_LENGTH(c1); + assert(c1len > 0); PyObject *c2 = PyList_GET_ITEM(chunks, c); assert(c2 != NULL); From 4881f1cbb548497ef4acf526f44e2a2e9458de3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 12:14:03 +0200 Subject: [PATCH 20/97] remove interned strings --- Modules/_fnmatchmodule.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatchmodule.c index 82458b9499ee35..60528f18b1b658 100644 --- a/Modules/_fnmatchmodule.c +++ b/Modules/_fnmatchmodule.c @@ -20,10 
+20,6 @@ typedef struct { PyObject *os_module; // 'os' module PyObject *lru_cache; // optional cache for regex patterns, if needed - - PyObject *str_atomic_bgroup; // (?>.*? - PyObject *str_atomic_egroup; // ) - PyObject *str_wildcard; // * } fnmatchmodule_state; static inline fnmatchmodule_state * @@ -41,10 +37,6 @@ fnmatchmodule_clear(PyObject *m) Py_CLEAR(st->os_module); Py_CLEAR(st->re_module); Py_CLEAR(st->lru_cache); - - Py_CLEAR(st->str_atomic_bgroup); - Py_CLEAR(st->str_atomic_egroup); - Py_CLEAR(st->str_wildcard); return 0; } @@ -55,10 +47,6 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) Py_VISIT(st->os_module); Py_VISIT(st->re_module); Py_VISIT(st->lru_cache); - - Py_VISIT(st->str_atomic_bgroup); - Py_VISIT(st->str_atomic_egroup); - Py_VISIT(st->str_wildcard); return 0; } @@ -100,12 +88,6 @@ fnmatchmodule_exec(PyObject *m) } // todo: handle LRU cache - // interned strings - INTERN_STRING(str_atomic_bgroup, "(?>.*?"); - INTERN_STRING(str_atomic_egroup, ")"); - INTERN_STRING(str_wildcard, "*"); - -#undef INTERN_STRING #undef IMPORT_MODULE return 0; @@ -212,12 +194,13 @@ posix_fnmatch_filter(const char *pattern, PyObject *names, static PyObject * get_match_function(PyObject *module, PyObject *pattern) { + // TODO(picnixz): use LRU-cache PyObject *expr = _fnmatch_translate_impl(module, pattern); if (expr == NULL) { return NULL; } fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); - PyObject *compiled = PyObject_CallMethod(st->re_module, "compile", "O", expr); + PyObject *compiled = _PyObject_CallMethod(st->re_module, &_Py_ID(compile), "O", expr); Py_DECREF(expr); if (compiled == NULL) { return NULL; @@ -678,12 +661,15 @@ join_translated_parts(PyObject *module, PyObject *strings, PyObject *indices) WRITE_SUBSTRING(i, j); // write stuff before '*' if needed i = j + 1; // jump after the star + + fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); for (Py_ssize_t k = 1; k < m; ++k) { ind = PyList_GET_ITEM(indices, 
k); j = PyLong_AsSsize_t(ind); - assert(j < 0 || i > j); - if (j < 0 || - (_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || + if (j < 0 || i > j) { + goto abort; + } + if ((_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || (_PyUnicodeWriter_WriteSubstring(_writer, strings, i, j) < 0) || (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) { goto abort; From ec5a922aa7a7333f8841cfccbd3e058eba01cfc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 16:14:03 +0200 Subject: [PATCH 21/97] revert addition --- Modules/Setup | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/Setup b/Modules/Setup index acb542b70946ea..e4acf6bc7de8ea 100644 --- a/Modules/Setup +++ b/Modules/Setup @@ -137,7 +137,6 @@ PYTHONPATH=$(COREPYTHONPATH) #_datetime _datetimemodule.c #_decimal _decimal/_decimal.c #_heapq _heapqmodule.c -_fnmatch _fnmatchmodule.c #_interpchannels _interpchannelsmodule.c #_interpqueues _interpqueuesmodule.c #_interpreters _interpretersmodule.c From b29ccb49042626ef3d1d205146d533b7de968de8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:49:16 +0200 Subject: [PATCH 22/97] make it fail on Windows for now --- PCbuild/pythoncore.vcxproj | 1 - PCbuild/pythoncore.vcxproj.filters | 3 --- 2 files changed, 4 deletions(-) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index db9f960c61ce6c..f36fcb8caece33 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -449,7 +449,6 @@ - diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 24384e355f46ec..a1b43addf9e36a 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -995,9 +995,6 @@ Modules - - Modules - Modules From a91f689db9c418e259c5be0da5b4194bfb7f1b87 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:33:07 +0200 Subject: [PATCH 23/97] Update configurations? --- Makefile.pre.in | 9 +++++++++ Modules/Setup.bootstrap.in | 3 +++ configure.ac | 2 ++ 3 files changed, 14 insertions(+) diff --git a/Makefile.pre.in b/Makefile.pre.in index 94cfb74138a3d9..97aa13bcc34409 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -350,6 +350,12 @@ IO_OBJS= \ Modules/_io/bytesio.o \ Modules/_io/stringio.o +FNMATCH_H= Modules/_fnmatch/_fnmatchmodule.h + +FNMATCH_OBJS= \ + Modules/_fnmatch/_fnmatchmodule.o \ + Modules/_fnmatch/posix.o \ + Modules/_fnmatch/regex.o \ ########################################################################## # mimalloc @@ -1740,6 +1746,8 @@ Python/sysmodule.o: $(srcdir)/Python/sysmodule.c Makefile $(srcdir)/Include/pydt $(MULTIARCH_CPPFLAGS) \ -o $@ $(srcdir)/Python/sysmodule.c +$(FNMATCH_OBJS): $(FNMATCH_H) + $(IO_OBJS): $(IO_H) .PHONY: regen-pegen-metaparser @@ -3131,6 +3139,7 @@ MODULE__CTYPES_TEST_DEPS=$(srcdir)/Modules/_ctypes/_ctypes_test_generated.c.h MODULE__CTYPES_MALLOC_CLOSURE=@MODULE__CTYPES_MALLOC_CLOSURE@ MODULE__DECIMAL_DEPS=$(srcdir)/Modules/_decimal/docstrings.h @LIBMPDEC_INTERNAL@ MODULE__ELEMENTTREE_DEPS=$(srcdir)/Modules/pyexpat.c @LIBEXPAT_INTERNAL@ +MODULE__FNMATCH_DEPS=$(srcdir)/Modules/_fnmatch/_fnmatchmodule.h MODULE__HASHLIB_DEPS=$(srcdir)/Modules/hashlib.h MODULE__IO_DEPS=$(srcdir)/Modules/_io/_iomodule.h MODULE__MD5_DEPS=$(srcdir)/Modules/hashlib.h $(LIBHACL_HEADERS) Modules/_hacl/Hacl_Hash_MD5.h Modules/_hacl/Hacl_Hash_MD5.c diff --git a/Modules/Setup.bootstrap.in b/Modules/Setup.bootstrap.in index aa4e60e272653b..c54cd207aec57d 100644 --- a/Modules/Setup.bootstrap.in +++ b/Modules/Setup.bootstrap.in @@ -34,5 +34,8 @@ _operator _operator.c _stat _stat.c _symtable symtablemodule.c +# miscellaneous accelerators +_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/posix.c _fnmatch/regex.c _fnmatch/translate.c + # for systems 
without $HOME env, used by site._getuserbase() @MODULE_PWD_TRUE@pwd pwdmodule.c diff --git a/configure.ac b/configure.ac index d4fdb81d34890a..6093c994bd13af 100644 --- a/configure.ac +++ b/configure.ac @@ -7025,6 +7025,7 @@ SRCDIRS="\ Modules/_ctypes \ Modules/_decimal \ Modules/_decimal/libmpdec \ + Modules/_fnmatch \ Modules/_hacl \ Modules/_io \ Modules/_multiprocessing \ @@ -7701,6 +7702,7 @@ AC_DEFUN([PY_STDLIB_MOD_SIMPLE], [ ]) dnl static modules in Modules/Setup.bootstrap +PY_STDLIB_MOD_SIMPLE([_fnmatch], [-I\$(srcdir)/Modules/_fnmatch], []) PY_STDLIB_MOD_SIMPLE([_io], [-I\$(srcdir)/Modules/_io], []) PY_STDLIB_MOD_SIMPLE([time], [], [$TIMEMODULE_LIB]) From 9e93b589ba21e3a3f5da7517c8a233530b1ffa57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:39:44 +0200 Subject: [PATCH 24/97] split implementation --- Modules/_fnmatch/_fnmatchmodule.c | 259 +++++ Modules/_fnmatch/_fnmatchmodule.h | 73 ++ Modules/_fnmatch/clinic/_fnmatchmodule.c.h | 185 ++++ Modules/_fnmatch/posix.c | 82 ++ Modules/_fnmatch/regex.c | 67 ++ .../translate.c} | 979 ++++++------------ 6 files changed, 967 insertions(+), 678 deletions(-) create mode 100644 Modules/_fnmatch/_fnmatchmodule.c create mode 100644 Modules/_fnmatch/_fnmatchmodule.h create mode 100644 Modules/_fnmatch/clinic/_fnmatchmodule.c.h create mode 100644 Modules/_fnmatch/posix.c create mode 100644 Modules/_fnmatch/regex.c rename Modules/{_fnmatchmodule.c => _fnmatch/translate.c} (50%) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c new file mode 100644 index 00000000000000..6e566991188861 --- /dev/null +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -0,0 +1,259 @@ +/* + * C accelerator for the 'fnmatch' module (POSIX only). 
+ * + * Most functions expect string or bytes instances, and thus the Python + * implementation should first pre-process path-like objects, possibly + * applying normalizations depending on the platform if needed. + */ + +#include "Python.h" +#include "pycore_call.h" // for _PyObject_CallMethod + +#include "_fnmatchmodule.h" +#include "clinic/_fnmatchmodule.c.h" + +#define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" + +// module state functions + +static int +fnmatchmodule_clear(PyObject *m) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + Py_CLEAR(st->os_module); + Py_CLEAR(st->re_module); + Py_CLEAR(st->lru_cache); + return 0; +} + +static int +fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + Py_VISIT(st->os_module); + Py_VISIT(st->re_module); + Py_VISIT(st->lru_cache); + return 0; +} + +static void +fnmatchmodule_free(void *m) +{ + fnmatchmodule_clear((PyObject *) m); +} + +static int +fnmatchmodule_exec(PyObject *m) +{ +#define IMPORT_MODULE(attr, name) \ + do { \ + state->attr = PyImport_ImportModule((name)); \ + if (state->attr == NULL) { \ + return -1; \ + } \ + } while (0) + +#define INTERN_STRING(attr, str) \ + do { \ + state->attr = PyUnicode_InternFromString((str)); \ + if (state->attr == NULL) { \ + return -1; \ + } \ + } while (0) + + fnmatchmodule_state *state = get_fnmatchmodulestate_state(m); + + // imports + IMPORT_MODULE(os_module, "os"); + IMPORT_MODULE(re_module, "re"); + + // helpers + state->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + if (state->lru_cache == NULL) { + return -1; + } + // todo: handle LRU cache + +#undef IMPORT_MODULE +#undef INTERN_STRING + + return 0; +} + +/*[clinic input] +module _fnmatch +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ + +static PyObject * +get_match_function(PyObject *module, PyObject *pattern) +{ 
+ // TODO(picnixz): use LRU-cache + PyObject *expr = _fnmatch_translate_impl(module, pattern); + if (expr == NULL) { + return NULL; + } + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + PyObject *compiled = _PyObject_CallMethod(st->re_module, &_Py_ID(compile), "O", expr); + Py_DECREF(expr); + if (compiled == NULL) { + return NULL; + } + PyObject *matcher = PyObject_GetAttr(compiled, &_Py_ID(match)); + Py_DECREF(compiled); + return matcher; +} + +static PyMethodDef get_match_function_method_def = { + "get_match_function", + _PyCFunction_CAST(get_match_function), + METH_O, + NULL +}; + +/*[clinic input] +_fnmatch.filter -> object + + names: object + pat: object + +[clinic start generated code]*/ + +static PyObject * +_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) +/*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ +{ +#ifndef Py_HAVE_FNMATCH + PyObject *matcher = get_match_function(module, pat); + if (matcher == NULL) { + return NULL; + } + PyObject *result = _regex_fnmatch_filter(matcher, names); + Py_DECREF(matcher); + return result; +#else + // Note that the Python implementation of fnmatch.filter() does not + // call os.fspath() on the names being matched, whereas it does on NT. + if (PyBytes_Check(pat)) { + const char *pattern = PyBytes_AS_STRING(pat); + return _posix_fnmatch_filter(pattern, names, &_posix_fnmatch_encoded); + } + if (PyUnicode_Check(pat)) { + const char *pattern = PyUnicode_AsUTF8(pat); + return _posix_fnmatch_filter(pattern, names, &_posix_fnmatch_unicode); + } + PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); + return NULL; +#endif +} + +/*[clinic input] +_fnmatch.fnmatchcase -> bool + + name: object + pat: object + +Test whether `name` matches `pattern`, including case. + +This is a version of fnmatch() which doesn't case-normalize +its arguments. 
+ +[clinic start generated code]*/ + +static int +_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) +/*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ +{ +#ifndef Py_HAVE_FNMATCH + PyObject *matcher = get_match_function(module, pat); + if (matcher == NULL) { + return -1; + } + int res = _regex_fnmatch_generic(matcher, name); + Py_DECREF(matcher); + return res; +#else + // This function does not transform path-like objects, nor does it + // case-normalize 'name' or 'pattern' (whether it is the Python or + // the C implementation). + if (PyBytes_Check(pat)) { + const char *pattern = PyBytes_AS_STRING(pat); + return _posix_fnmatch_encoded(pattern, name); + } + if (PyUnicode_Check(pat)) { + const char *pattern = PyUnicode_AsUTF8(pat); + return _posix_fnmatch_unicode(pattern, name); + } + PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); + return -1; +#endif +} + +/*[clinic input] +_fnmatch.translate -> object + + pat as pattern: object + +[clinic start generated code]*/ + +static PyObject * +_fnmatch_translate_impl(PyObject *module, PyObject *pattern) +/*[clinic end generated code: output=2d9e3bbcbcc6e90e input=56e39f7beea97810]*/ +{ + if (PyBytes_Check(pattern)) { + PyObject *unicode = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(pattern), + PyBytes_GET_SIZE(pattern), + "strict"); + if (unicode == NULL) { + return NULL; + } + // translated regular expression as a str object + PyObject *str_expr = translate(module, unicode); + Py_DECREF(unicode); + if (str_expr == NULL) { + return NULL; + } + PyObject *expr = PyUnicode_AsLatin1String(str_expr); + Py_DECREF(str_expr); + return expr; + } + else if (PyUnicode_Check(pattern)) { + return translate(module, pattern); + } + else { + PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); + return NULL; + } +} + +static PyMethodDef fnmatchmodule_methods[] = { + _FNMATCH_FILTER_METHODDEF + _FNMATCH_FNMATCHCASE_METHODDEF + _FNMATCH_TRANSLATE_METHODDEF + {NULL, NULL} +}; + 
+static struct PyModuleDef_Slot fnmatchmodule_slots[] = { + {Py_mod_exec, fnmatchmodule_exec}, + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, + {0, NULL}, +}; + +static struct PyModuleDef _fnmatchmodule = { + PyModuleDef_HEAD_INIT, + "_fnmatch", + NULL, + .m_size = sizeof(fnmatchmodule_state), + .m_methods = fnmatchmodule_methods, + .m_slots = fnmatchmodule_slots, + .m_traverse = fnmatchmodule_traverse, + .m_clear = fnmatchmodule_clear, + .m_free = fnmatchmodule_free, +}; + +PyMODINIT_FUNC +PyInit__fnmatch(void) +{ + return PyModuleDef_Init(&_fnmatchmodule); +} diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h new file mode 100644 index 00000000000000..af271703791be3 --- /dev/null +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -0,0 +1,73 @@ +#ifndef _FNMATCHMODULE_H +#define _FNMATCHMODULE_H + +#include "Python.h" + +typedef struct { + PyObject *re_module; // 're' module + PyObject *os_module; // 'os' module + + PyObject *lru_cache; // optional cache for regex patterns, if needed +} fnmatchmodule_state; + +static inline fnmatchmodule_state * +get_fnmatchmodulestate_state(PyObject *module) +{ + void *state = PyModule_GetState(module); + assert(state != NULL); + return (fnmatchmodule_state *)state; +} + +/* + * The filter() function works differently depending on whether fnmatch(3) + * is present or not. + * + * If fnmatch(3) is present, the match is performed without using regular + * expressions. The functions being used are the _posix_fnmatch_*() ones. + * + * If fnmatch(3) is not present, the match is performed using regular + * expressions. + */ + +#ifdef Py_HAVE_FNMATCH +/* + * Type for a matching function. + * + * The function must take as input a pattern and a name, + * and is used to determine whether the name matches the + * pattern or not. + * + * If the pattern is obtained from str() types, then 'name' + * must be a string (the task of validating this is left + * to the matcher). 
+ */ +typedef int (*Matcher)(const char *, PyObject *); + +extern PyObject * +_posix_fnmatch_filter(const char *pattern, PyObject *names, Matcher match); + +/* + * Perform a case-sensitive match using fnmatch(3). + * + * Parameters + * + * pattern A UNIX shell pattern. + * string The string to match (bytes object). + * + * Returns 1 if the 'string' matches the 'pattern' and 0 otherwise. + * + * Returns -1 if (1) 'string' is not a `bytes` object, and + * sets a TypeError exception, or (2) something went wrong. + */ +extern int _posix_fnmatch_encoded(const char *pattern, PyObject *string); +/* Same as _posix_fnmatch_encoded() but for unicode inputs. */ +extern int _posix_fnmatch_unicode(const char *pattern, PyObject *string); +#else +extern int _regex_fnmatch_generic(PyObject *matcher, PyObject *name); +extern PyObject * +_regex_fnmatch_filter(PyObject *matcher, PyObject *names); +#endif + +extern PyObject *translate(PyObject *module, PyObject *pattern); + +#endif // _FNMATCHMODULE_H diff --git a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h new file mode 100644 index 00000000000000..4b12f33113d3fb --- /dev/null +++ b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h @@ -0,0 +1,185 @@ +/*[clinic input] +preserve +[clinic start generated code]*/ + +#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) +# include "pycore_gc.h" // PyGC_Head +# include "pycore_runtime.h" // _Py_ID() +#endif +#include "pycore_modsupport.h" // _PyArg_UnpackKeywords() + +PyDoc_STRVAR(_fnmatch_filter__doc__, +"filter($module, /, names, pat)\n" +"--\n" +"\n"); + +#define _FNMATCH_FILTER_METHODDEF \ + {"filter", _PyCFunction_CAST(_fnmatch_filter), METH_FASTCALL|METH_KEYWORDS, _fnmatch_filter__doc__}, + +static PyObject * +_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat); + +static PyObject * +_fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if 
defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(names), &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"names", "pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "filter", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *names; + PyObject *pat; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + if (!args) { + goto exit; + } + names = args[0]; + pat = args[1]; + return_value = _fnmatch_filter_impl(module, names, pat); + +exit: + return return_value; +} + +PyDoc_STRVAR(_fnmatch_fnmatchcase__doc__, +"fnmatchcase($module, /, name, pat)\n" +"--\n" +"\n" +"Test whether `name` matches `pattern`, including case.\n" +"\n" +"This is a version of fnmatch() which doesn\'t case-normalize\n" +"its arguments."); + +#define _FNMATCH_FNMATCHCASE_METHODDEF \ + {"fnmatchcase", _PyCFunction_CAST(_fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatchcase__doc__}, + +static int +_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat); + +static PyObject * +_fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define 
KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"name", "pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "fnmatchcase", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *name; + PyObject *pat; + int _return_value; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + if (!args) { + goto exit; + } + name = args[0]; + pat = args[1]; + _return_value = _fnmatch_fnmatchcase_impl(module, name, pat); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(_fnmatch_translate__doc__, +"translate($module, /, pat)\n" +"--\n" +"\n"); + +#define _FNMATCH_TRANSLATE_METHODDEF \ + {"translate", _PyCFunction_CAST(_fnmatch_translate), METH_FASTCALL|METH_KEYWORDS, _fnmatch_translate__doc__}, + +static PyObject * +_fnmatch_translate_impl(PyObject *module, PyObject *pattern); + +static PyObject * +_fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "translate", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject *pattern; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, 
kwnames, &_parser, 1, 1, 0, argsbuf); + if (!args) { + goto exit; + } + pattern = args[0]; + return_value = _fnmatch_translate_impl(module, pattern); + +exit: + return return_value; +} +/*[clinic end generated code: output=b0366b259b101bdf input=a9049054013a1b77]*/ diff --git a/Modules/_fnmatch/posix.c b/Modules/_fnmatch/posix.c new file mode 100644 index 00000000000000..30d0845d7bae88 --- /dev/null +++ b/Modules/_fnmatch/posix.c @@ -0,0 +1,82 @@ +#ifdef Py_HAVE_FNMATCH + +#include <fnmatch.h> // for fnmatch(3) + +#include "Python.h" +#include "_fnmatchmodule.h" // for Matcher + +#define INVALID_TYPE_FOR_NAME "name must be a %s object, got %.200s" + +#define VERIFY_NAME_ARG_TYPE(name, check, expecting) \ + do { \ + if (!check) { \ + PyErr_Format(PyExc_TypeError, INVALID_TYPE_FOR_NAME, \ + expecting, Py_TYPE(name)->tp_name); \ + return -1; \ + } \ + } while (0) + +#define PROCESS_MATCH_RESULT(r) \ + do { \ + int res = (r); /* avoid variable capture */ \ + if (res < 0) { \ + return res; \ + } \ + return res != FNM_NOMATCH; \ + } while (0) + +inline int +_posix_fnmatch_encoded(const char *pattern, PyObject *string) +{ + VERIFY_NAME_ARG_TYPE(string, PyBytes_Check(string), "bytes"); + PROCESS_MATCH_RESULT(fnmatch(pattern, PyBytes_AS_STRING(string), 0)); +} + +inline int +_posix_fnmatch_unicode(const char *pattern, PyObject *string) +{ + VERIFY_NAME_ARG_TYPE(string, PyUnicode_Check(string), "string"); + PROCESS_MATCH_RESULT(fnmatch(pattern, PyUnicode_AsUTF8(string), 0)); +} + +PyObject * +_posix_fnmatch_filter(const char *pattern, PyObject *names, Matcher match) +{ + PyObject *iter = PyObject_GetIter(names); + if (iter == NULL) { + return NULL; + } + + PyObject *res = PyList_New(0); + if (res == NULL) { + Py_DECREF(iter); + return NULL; + } + + PyObject *name = NULL; + while ((name = PyIter_Next(iter))) { + int rc = match(pattern, name); + if (rc < 0) { + goto abort; + } + if (rc == 1) { + if (PyList_Append(res, name) < 0) { + goto abort; + } + } + Py_DECREF(name); + if 
(PyErr_Occurred()) { + Py_DECREF(res); + Py_DECREF(iter); + return NULL; + } + } + Py_DECREF(iter); + return res; +abort: + Py_XDECREF(name); + Py_DECREF(iter); + Py_DECREF(res); + return NULL; +} +#endif diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c new file mode 100644 index 00000000000000..5ba96a214bc267 --- /dev/null +++ b/Modules/_fnmatch/regex.c @@ -0,0 +1,67 @@ +#include "Python.h" + +/* + * Perform a case-sensitive match using regular expressions. + * + * Parameters + * + * matcher A callable emulating 're.compile(...).match'. + * name The filename to match. + * + * Returns 1 if the 'matcher' accepts the 'name' and 0 otherwise. + * Returns -1 if something went wrong. + */ +int +_regex_fnmatch_generic(PyObject *matcher, PyObject *name) +{ + // If 'name' is of incorrect type, it will be detected when calling + // the matcher function (we emulate 're.compile(...).match(name)'). + PyObject *match = PyObject_CallFunction(matcher, "O", name); + if (match == NULL) { + return -1; + } + int matching = match != Py_None; + Py_DECREF(match); + return matching; +} + +PyObject * +_regex_fnmatch_filter(PyObject *matcher, PyObject *names) +{ + PyObject *iter = PyObject_GetIter(names); + if (iter == NULL) { + return NULL; + } + + PyObject *res = PyList_New(0); + if (res == NULL) { + Py_DECREF(iter); + return NULL; + } + + PyObject *name = NULL; + while ((name = PyIter_Next(iter))) { + int rc = _regex_fnmatch_generic(matcher, name); + if (rc < 0) { + goto abort; + } + if (rc == 1) { + if (PyList_Append(res, name) < 0) { + goto abort; + } + } + Py_DECREF(name); + if (PyErr_Occurred()) { + Py_DECREF(res); + Py_DECREF(iter); + return NULL; + } + } + Py_DECREF(iter); + return res; +abort: + Py_XDECREF(name); + Py_DECREF(iter); + Py_DECREF(res); + return NULL; +} diff --git a/Modules/_fnmatchmodule.c b/Modules/_fnmatch/translate.c similarity index 50% rename from Modules/_fnmatchmodule.c rename to Modules/_fnmatch/translate.c index 60528f18b1b658..8ac45d546826fc 100644 --- 
a/Modules/_fnmatchmodule.c +++ b/Modules/_fnmatch/translate.c @@ -1,375 +1,228 @@ /* - * C accelerator for the 'fnmatch' module (POSIX only). - * - * Most functions expect string or bytes instances, and thus the Python - * implementation should first pre-process path-like objects, possibly - * applying normalizations depending on the platform if needed. + * C accelerator for the translation function from UNIX shell patterns + * to RE patterns. This accelerator is platform-independent but can be + * disabled on demand. */ #include "Python.h" -#include "pycore_call.h" // for _PyObject_CallMethod - -#include "clinic/_fnmatchmodule.c.h" +#include "pycore_call.h" // for _PyObject_CallMethod() -#define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" +#include "_fnmatchmodule.h" // for get_fnmatchmodulestate_state() -// module state functions +// ==== Helper declarations ================================================== -typedef struct { - PyObject *re_module; // 're' module - PyObject *os_module; // 'os' module +/* + * Creates a new Unicode object from a Py_UCS4 character. + * + * Note: this is 'unicode_char' taken from Objects/unicodeobject.c. + */ +static PyObject * +get_unicode_character(Py_UCS4 ch); - PyObject *lru_cache; // optional cache for regex patterns, if needed -} fnmatchmodule_state; +/* + * Construct a regular expression out of a UNIX-style expression. + * + * The expression to translate is the content of an '[(BLOCK)]' expression + * or '[!(BLOCK)]' expression. The BLOCK contains single unicode characters + * or character ranges (e.g., 'a-z'). + * + * By convention 'start' and 'stop' represent the INCLUSIVE start index + * and EXCLUSIVE stop index of BLOCK in the full 'pattern'. Note that + * we always have pattern[stop] == ']' and pattern[start] == BLOCK[0]. 
+ * + * For instance, for "ab[c-f]g[!1-5]", the values of 'start' and 'stop' + * for the sub-pattern '[c-f]' are 3 and 6 respectively, whereas their + * values for '[!1-5]' are 10 (not 9) and 13 respectively. + */ +static PyObject * +translate_expression(PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); -static inline fnmatchmodule_state * -get_fnmatchmodulestate_state(PyObject *module) -{ - void *state = PyModule_GetState(module); - assert(state != NULL); - return (fnmatchmodule_state *) state; -} +/* + * Write an escaped string using re.escape(). + * + * This returns the number of written characters, or -1 if an error occurred. + */ +static Py_ssize_t +write_literal(fnmatchmodule_state *state, + _PyUnicodeWriter *writer, + PyObject *unicode); -static int -fnmatchmodule_clear(PyObject *m) -{ - fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); - Py_CLEAR(st->os_module); - Py_CLEAR(st->re_module); - Py_CLEAR(st->lru_cache); - return 0; -} +/* + * Write the translated pattern obtained by translate_expression(). + * + * This returns the number of written characters, or -1 if an error occurred. + */ +static Py_ssize_t +write_expression(_PyUnicodeWriter *writer, PyObject *expression); -static int -fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) -{ - fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); - Py_VISIT(st->os_module); - Py_VISIT(st->re_module); - Py_VISIT(st->lru_cache); - return 0; -} +/* + * Build the final regular expression by processing the wildcards. + * + * The position of each wildcard in 'pattern' is given by 'indices'. 
+ */ +static PyObject * +process_wildcards(PyObject *pattern, PyObject *indices); -static void -fnmatchmodule_free(void *m) -{ - fnmatchmodule_clear((PyObject *) m); -} +// ==== API implementation ==================================================== -static int -fnmatchmodule_exec(PyObject *m) +PyObject * +translate(PyObject *module, PyObject *pattern) { -#define IMPORT_MODULE(attr, name) \ - do { \ - state->attr = PyImport_ImportModule((name)); \ - if (state->attr == NULL) { \ - return -1; \ - } \ - } while (0) - -#define INTERN_STRING(attr, str) \ - do { \ - state->attr = PyUnicode_InternFromString((str)); \ - if (state->attr == NULL) { \ - return -1; \ - } \ - } while (0) - - fnmatchmodule_state *state = get_fnmatchmodulestate_state(m); - - // imports - IMPORT_MODULE(os_module, "os"); - IMPORT_MODULE(re_module, "re"); - - // helpers - state->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); - if (state->lru_cache == NULL) { - return -1; - } - // todo: handle LRU cache - -#undef IMPORT_MODULE - - return 0; -} - -/*[clinic input] -module _fnmatch -[clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ - -#ifdef Py_HAVE_FNMATCH -#include - -#define VERIFY_NAME_ARG_TYPE(name, check, expecting) \ +#define READ(ind) PyUnicode_READ(kind, data, (ind)) +#define ADVANCE_IF_CHAR(ch, ind, maxind) \ do { \ - if (!check) { \ - PyErr_Format(PyExc_TypeError, \ - "name must be a %s object, got %.200s", \ - expecting, Py_TYPE(name)->tp_name); \ - return -1; \ + if ((ind) < (maxind) && READ(ind) == (ch)) { \ + ++(ind); \ } \ } while (0) - -#define PROCESS_MATCH_RESULT(r) \ +#define _WHILE_READ_CMP(ch, ind, maxind, cmp) \ do { \ - int res = (r); /* avoid variable capture */ \ - if (res < 0) { \ - return res; \ + while ((ind) < (maxind) && READ(ind) cmp (ch)) { \ + ++(ind); \ } \ - return res != FNM_NOMATCH; \ } while (0) +#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, !=) 
+#define DROP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, ==) -/* - * Perform a case-sensitive match using fnmatch(3). - * - * Parameters - * - * pattern A UNIX shell pattern. - * name The filename to match (bytes object). - * - * Returns 1 if the 'name' matches the 'pattern' and 0 otherwise. - * - * Returns -1 if (1) 'name' is not a `bytes` object, and - * sets a TypeError exception, or (2) something went wrong. - */ -static inline int -posix_fnmatch_encoded(const char *pattern, PyObject *name) -{ - VERIFY_NAME_ARG_TYPE(name, PyBytes_Check(name), "bytes"); - PROCESS_MATCH_RESULT(fnmatch(pattern, PyBytes_AS_STRING(name), 0)); -} - -/* Same as `posix_fnmatch_encoded` but for string-like objects. */ -static inline int -posix_fnmatch_unicode(const char *pattern, PyObject *name) -{ - VERIFY_NAME_ARG_TYPE(name, PyUnicode_Check(name), "string"); - PROCESS_MATCH_RESULT(fnmatch(pattern, PyUnicode_AsUTF8(name), 0)); -} - -static PyObject * -posix_fnmatch_filter(const char *pattern, PyObject *names, - int (*match)(const char *, PyObject *)) -{ - PyObject *iter = PyObject_GetIter(names); - if (iter == NULL) { + fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); + PyObject *re = state->re_module; + const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); + // We would write less data if there are successive '*', which should + // not be the case in general. Otherwise, we write >= n characters + // since escaping them would always add more characters so we will + // overestimate a bit the number of characters to write. + // + // TODO(picnixz): should we limit the estimation or not? 
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create((Py_ssize_t)(1.05 * n)); + if (writer == NULL) { return NULL; } - - PyObject *res = PyList_New(0); - if (res == NULL) { - Py_DECREF(iter); - return NULL; + _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); + // list containing the indices where '*' has a special meaning + PyObject *indices = PyList_New(0); + if (indices == NULL) { + goto abort; } - - PyObject *name = NULL; - while ((name = PyIter_Next(iter))) { - int rc = match(pattern, name); - if (rc < 0) { - goto abort; - } - if (rc == 1) { - if (PyList_Append(res, name) < 0) { - goto abort; + const int kind = PyUnicode_KIND(pattern); + const void *data = PyUnicode_DATA(pattern); + Py_ssize_t h = 0, i = 0; + while (i < n) { + // read and advance to the next character + Py_UCS4 chr = READ(i++); + switch (chr) { + case '*': { + if (_PyUnicodeWriter_WriteChar(_writer, chr) < 0) { + goto abort; + } + DROP_DUPLICATES('*', i, n); + PyObject *index = PyLong_FromSsize_t(h++); + if (index == NULL) { + goto abort; + } + int rc = PyList_Append(indices, index); + Py_DECREF(index); + if (rc < 0) { + goto abort; + } + break; + } + case '?': { + // translate '?' (fnmatch: any single character) into '.' (regex) + if (_PyUnicodeWriter_WriteChar(_writer, '.') < 0) { + goto abort; + } + ++h; // increase the expected result's length + break; + } + case '[': { + Py_ssize_t j = i; // 'i' is already at next char + ADVANCE_IF_CHAR('!', j, n); // [! + ADVANCE_IF_CHAR(']', j, n); // [!] or [] + ADVANCE_TO_NEXT(']', j, n); // locate closing ']' + if (j >= n) { + if (_PyUnicodeWriter_WriteASCIIString(_writer, "\\[", 2) < 0) { + goto abort; + } + h += 2; // we just wrote 2 characters + break; // early break for clarity + } + else { + // v--- pattern[j] (exclusive) + // '[' * ... 
* ']' + // ^----- pattern[i] (inclusive) + int rc = PyUnicode_FindChar(pattern, '-', i, j, 1); + if (rc == -2) { + goto abort; + } + PyObject *s1 = NULL, *s2 = NULL; + if (rc == -1) { + PyObject *group = PyUnicode_Substring(pattern, i, j); + if (group == NULL) { + goto abort; + } + s1 = _PyObject_CallMethod(group, &_Py_ID(replace), "ss", "\\", "\\\\"); + Py_DECREF(group); + } + else { + assert(rc >= 0); + s1 = translate_expression(pattern, i, j); + } + if (s1 == NULL) { + goto abort; + } + s2 = _PyObject_CallMethod(re, &_Py_ID(sub), "ssO", "([&~|])", "\\\\\\1", s1); + Py_DECREF(s1); + if (s2 == NULL) { + goto abort; + } + int difflen = write_expression(_writer, s2); + Py_DECREF(s2); + if (difflen < 0) { + goto abort; + } + h += difflen; + i = j + 1; // jump to the character after ']' + break; // early break for clarity + } + } + default: { + PyObject *str = get_unicode_character(chr); + if (str == NULL) { + goto abort; + } + int difflen = write_literal(state, _writer, str); + Py_DECREF(str); + if (difflen < 0) { + goto abort; + } + h += difflen; + break; } } - Py_DECREF(name); - if (PyErr_Occurred()) { - Py_DECREF(res); - Py_DECREF(iter); - return NULL; - } - } - Py_DECREF(iter); - return res; -abort: - Py_XDECREF(name); - Py_DECREF(iter); - Py_DECREF(res); - return NULL; -} -#else - -static PyObject * -get_match_function(PyObject *module, PyObject *pattern) -{ - // TODO(picnixz): use LRU-cache - PyObject *expr = _fnmatch_translate_impl(module, pattern); - if (expr == NULL) { - return NULL; - } - fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); - PyObject *compiled = _PyObject_CallMethod(st->re_module, &_Py_ID(compile), "O", expr); - Py_DECREF(expr); - if (compiled == NULL) { - return NULL; - } - PyObject *matcher = PyObject_GetAttr(compiled, &_Py_ID(match)); - Py_DECREF(compiled); - return matcher; -} - -static PyMethodDef get_match_function_method_def = { - "get_match_function", - _PyCFunction_CAST(get_match_function), - METH_O, - NULL -}; - -/* - 
* Perform a case-sensitive match using regular expressions. - * - * Parameters - * - * pattern A translated regular expression. - * name The filename to match. - * - * Returns 1 if the 'name' matches the 'pattern' and 0 otherwise. - * Returns -1 if something went wrong. - */ -static inline int -regex_fnmatch_generic(PyObject *matcher, PyObject *name) -{ - // If 'name' is of incorrect type, it will be detected when calling - // the matcher function (we emulate 're.compile(...).match(name)'). - PyObject *match = PyObject_CallFunction(matcher, "O", name); - if (match == NULL) { - return -1; - } - int matching = match != Py_None; - Py_DECREF(match); - return matching; -} - -static PyObject * -regex_fnmatch_filter(PyObject *matcher, PyObject *names) -{ - PyObject *iter = PyObject_GetIter(names); - if (iter == NULL) { - return NULL; } - - PyObject *res = PyList_New(0); - if (res == NULL) { - Py_DECREF(iter); +#undef DROP_DUPLICATES +#undef ADVANCE_TO_NEXT +#undef _WHILE_READ_CMP +#undef ADVANCE_IF_CHAR +#undef READ + PyObject *translated = PyUnicodeWriter_Finish(writer); + if (translated == NULL) { + Py_DECREF(indices); return NULL; } - - PyObject *name = NULL; - while ((name = PyIter_Next(iter))) { - int rc = regex_fnmatch_generic(matcher, name); - if (rc < 0) { - goto abort; - } - if (rc == 1) { - if (PyList_Append(res, name) < 0) { - goto abort; - } - } - Py_DECREF(name); - if (PyErr_Occurred()) { - Py_DECREF(res); - Py_DECREF(iter); - return NULL; - } - } - Py_DECREF(iter); + PyObject *res = process_wildcards(translated, indices); + Py_DECREF(translated); + Py_DECREF(indices); return res; abort: - Py_XDECREF(name); - Py_DECREF(iter); - Py_DECREF(res); - return NULL; -} -#endif - -/*[clinic input] -_fnmatch.filter -> object - - names: object - pat: object - -[clinic start generated code]*/ - -static PyObject * -_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) -/*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ -{ 
-#ifndef Py_HAVE_FNMATCH - PyObject *matcher = get_match_function(module, pat); - if (matcher == NULL) { - return NULL; - } - PyObject *result = regex_fnmatch_filter(matcher, names); - Py_DECREF(matcher); - return result; -#else - // Note that the Python implementation of fnmatch.filter() does not - // call os.fspath() on the names being matched, whereas it does on NT. - if (PyBytes_Check(pat)) { - const char *pattern = PyBytes_AS_STRING(pat); - return posix_fnmatch_filter(pattern, names, &posix_fnmatch_encoded); - } - if (PyUnicode_Check(pat)) { - const char *pattern = PyUnicode_AsUTF8(pat); - return posix_fnmatch_filter(pattern, names, &posix_fnmatch_unicode); - } - PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); + PyUnicodeWriter_Discard(writer); + Py_XDECREF(indices); return NULL; -#endif } -/*[clinic input] -_fnmatch.fnmatchcase -> bool - - name: object - pat: object - -Test whether `name` matches `pattern`, including case. +// ==== Helper implementations ================================================ -This is a version of fnmatch() which doesn't case-normalize -its arguments. - -[clinic start generated code]*/ - -static int -_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) -/*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ -{ -#ifndef Py_HAVE_FNMATCH - PyObject *matcher = get_match_function(module, pat); - if (matcher == NULL) { - return -1; - } - int res = regex_fnmatch_generic(matcher, name); - Py_DECREF(matcher); - return res; -#else - // This function does not transform path-like objects, nor does it - // case-normalize 'name' or 'pattern' (whether it is the Python or - // the C implementation). 
- if (PyBytes_Check(pat)) { - const char *pattern = PyBytes_AS_STRING(pat); - return posix_fnmatch_encoded(pattern, name); - } - if (PyUnicode_Check(pat)) { - const char *pattern = PyUnicode_AsUTF8(pat); - return posix_fnmatch_unicode(pattern, name); - } - PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); - return -1; -#endif -} - -/* - * Convert Py_UCS4 to (PyObject *). - * - * This creates a new reference. - * - * Note: this is 'unicode_char' taken from Objects/unicodeobject.c. - */ -static PyObject * +PyObject * get_unicode_character(Py_UCS4 ch) { assert(ch <= 0x10ffff); @@ -382,98 +235,20 @@ get_unicode_character(Py_UCS4 ch) if (unicode == NULL) { return NULL; } - assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); - if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { - PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2) ch; - } - else { - assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); - PyUnicode_4BYTE_DATA(unicode)[0] = ch; - } - assert(_PyUnicode_CheckConsistency(unicode, 1)); - return unicode; -} - -static Py_ssize_t /* number of written characters or -1 on error */ -write_escaped_string(PyObject *re, _PyUnicodeWriter *writer, PyObject *str) -{ - PyObject *escaped = PyObject_CallMethodOneArg(re, &_Py_ID(escape), str); - if (escaped == NULL) { - return -1; - } - Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); - int rc = _PyUnicodeWriter_WriteStr(writer, escaped); - Py_DECREF(escaped); - if (rc < 0) { - return -1; - } - assert(written > 0); - return written; -} - -static Py_ssize_t /* number of written characters or -1 on error */ -write_translated_group(_PyUnicodeWriter *writer, PyObject *group) -{ -#define WRITE_ASCII(str, len) \ - do { \ - if (_PyUnicodeWriter_WriteASCIIString(writer, (str), (len)) < 0) { \ - return -1; \ - } \ - } while (0) - -#define WRITE_CHAR(c) \ - do { \ - if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) { \ - return -1; \ - } \ - } while (0) - - Py_ssize_t grouplen; - const char *buffer = 
PyUnicode_AsUTF8AndSize(group, &grouplen); - if (grouplen == 0) { - /* empty range: never match */ - WRITE_ASCII("(?!)", 4); - return 4; - } - else if (grouplen == 1 && buffer[0] == '!') { - /* negated empty range: match any character */ - WRITE_CHAR('.'); - return 1; - } - else { - Py_ssize_t extra = 2; // '[' and ']' - WRITE_CHAR('['); - switch (buffer[0]) { - case '!': { - WRITE_CHAR('^'); - if (_PyUnicodeWriter_WriteSubstring(writer, group, 1, grouplen) < 0) { - return -1; - } - break; - } - case '^': - case '[': { - WRITE_CHAR('\\'); - extra++; - break; - } - default: - if (_PyUnicodeWriter_WriteStr(writer, group) < 0) { - return -1; - } - break; - } - WRITE_CHAR(']'); - return grouplen + extra; + assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); + if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { + PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; } -#undef WRITE_CHAR -#undef WRITE_ASCII + else { + assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); + PyUnicode_4BYTE_DATA(unicode)[0] = ch; + } + assert(_PyUnicode_CheckConsistency(unicode, 1)); + return unicode; } -static PyObject * -get_translated_group(PyObject *pattern, - Py_ssize_t i /* pattern[i-1] == '[' (incl.) */, - Py_ssize_t j /* pattern[j] == ']' (excl.) */) +PyObject * +translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) { PyObject *chunks = PyList_New(0); if (chunks == NULL) { @@ -482,7 +257,7 @@ get_translated_group(PyObject *pattern, Py_ssize_t k = (PyUnicode_READ_CHAR(pattern, i) == '!') ? 
i + 2 : i + 1; Py_ssize_t chunkscount = 0; while (k < j) { - PyObject *eobj = _PyObject_CallMethod(pattern, &_Py_ID(find), "ii", k, j); + PyObject *eobj = _PyObject_CallMethod(pattern, &_Py_ID(find), "sii", "-", k, j); if (eobj == NULL) { goto error; } @@ -607,13 +382,93 @@ get_translated_group(PyObject *pattern, return NULL; } -static PyObject * -join_translated_parts(PyObject *module, PyObject *strings, PyObject *indices) +Py_ssize_t +write_literal(fnmatchmodule_state *state, + _PyUnicodeWriter *writer, + PyObject *unicode) +{ + PyObject *escaped = PyObject_CallMethodOneArg(state->re_module, + &_Py_ID(escape), + unicode); + if (escaped == NULL) { + return -1; + } + Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); + int rc = _PyUnicodeWriter_WriteStr(writer, escaped); + Py_DECREF(escaped); + if (rc < 0) { + return -1; + } + assert(written > 0); + return written; +} + +Py_ssize_t +write_expression(_PyUnicodeWriter *writer, PyObject *expression) +{ +#define WRITE_ASCII(str, len) \ + do { \ + if (_PyUnicodeWriter_WriteASCIIString(writer, (str), (len)) < 0) { \ + return -1; \ + } \ + } while (0) + +#define WRITE_CHAR(c) \ + do { \ + if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) { \ + return -1; \ + } \ + } while (0) + + Py_ssize_t grouplen; + const char *buffer = PyUnicode_AsUTF8AndSize(expression, &grouplen); + if (grouplen == 0) { + /* empty range: never match */ + WRITE_ASCII("(?!)", 4); + return 4; + } + else if (grouplen == 1 && buffer[0] == '!') { + /* negated empty range: match any character */ + WRITE_CHAR('.'); + return 1; + } + else { + Py_ssize_t extra = 2; // '[' and ']' + WRITE_CHAR('['); + switch (buffer[0]) { + case '!': { + WRITE_CHAR('^'); + if (_PyUnicodeWriter_WriteSubstring(writer, expression, 1, grouplen) < 0) { + return -1; + } + break; + } + case '^': + case '[': { + WRITE_CHAR('\\'); + extra++; + break; + } + default: + if (_PyUnicodeWriter_WriteStr(writer, expression) < 0) { + return -1; + } + break; + } + WRITE_CHAR(']'); + return 
grouplen + extra; + } +#undef WRITE_CHAR +#undef WRITE_ASCII +} + +PyObject * +process_wildcards(PyObject *pattern, PyObject *indices) { #define WRITE_SUBSTRING(i, j) \ do { \ - if ((i) < (j)) { \ - if (_PyUnicodeWriter_WriteSubstring(_writer, strings, (i), (j)) < 0) { \ + if ((i) < (j)) { /* write the substring if non-empty */ \ + if (_PyUnicodeWriter_WriteSubstring(_writer, pattern, (i), (j)) < 0) { \ goto abort; \ } \ } \ @@ -622,7 +477,7 @@ join_translated_parts(PyObject *module, PyObject *strings, PyObject *indices) const Py_ssize_t m = PyList_GET_SIZE(indices); if (m == 0) { // just write fr'(?s:{parts} + ")\Z" - return PyUnicode_FromFormat("(?s:%S)\\Z", strings); + return PyUnicode_FromFormat("(?s:%S)\\Z", pattern); } /* * Special cases: indices[0] == 0 or indices[-1] + 1 == n @@ -630,8 +485,7 @@ join_translated_parts(PyObject *module, PyObject *strings, PyObject *indices) * If indices[0] == 0 write (?>.*?abcdef) instead of abcdef * If indices[-1] == n - 1 write '.*' instead of empty string */ - PyObject *ind; - Py_ssize_t i = 0, j, n = PyUnicode_GET_LENGTH(strings); + Py_ssize_t i = 0, j, n = PyUnicode_GET_LENGTH(pattern); /* * If the pattern starts with '*', we will write everything * before it. So we will write at least indices[0] characters. 
@@ -657,21 +511,19 @@ join_translated_parts(PyObject *module, PyObject *strings, PyObject *indices) if (writer == NULL) { return NULL; } - _PyUnicodeWriter *_writer = (_PyUnicodeWriter *) (writer); + _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); WRITE_SUBSTRING(i, j); // write stuff before '*' if needed - i = j + 1; // jump after the star - - fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); + i = j + 1; // jump after the '*' for (Py_ssize_t k = 1; k < m; ++k) { - ind = PyList_GET_ITEM(indices, k); + PyObject *ind = PyList_GET_ITEM(indices, k); j = PyLong_AsSsize_t(ind); - if (j < 0 || i > j) { - goto abort; - } - if ((_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || - (_PyUnicodeWriter_WriteSubstring(_writer, strings, i, j) < 0) || - (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) { + assert(j < 0 || i < j); + if (j < 0 || + (_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || + (_PyUnicodeWriter_WriteSubstring(_writer, pattern, i, j) < 0) || + (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) + { goto abort; } i = j + 1; @@ -680,245 +532,16 @@ join_translated_parts(PyObject *module, PyObject *strings, PyObject *indices) if (_PyUnicodeWriter_WriteASCIIString(_writer, ".*", 2) < 0) { goto abort; } - WRITE_SUBSTRING(i, n); // write TAIL part - + WRITE_SUBSTRING(i, n); // write the remaining substring #undef WRITE_SUBSTRING - PyObject *res = PyUnicodeWriter_Finish(writer); if (res == NULL) { return NULL; } - return PyUnicode_FromFormat("(?s:%S)\\Z", res); -abort: - PyUnicodeWriter_Discard(writer); - return NULL; -} - -static PyObject * -translate(PyObject *module, PyObject *pattern) -/* new reference */ -{ -#define READ(ind) PyUnicode_READ(kind, data, (ind)) - -#define ADVANCE_IF_CHAR(ch, ind, maxind) \ - do { \ - if ((ind) < (maxind) && READ(ind) == (ch)) { \ - ++(ind); \ - } \ - } while (0) - -#define _WHILE_READ_CMP(ch, ind, maxind, cmp) \ - do { \ - while ((ind) < (maxind) && READ(ind) cmp (ch)) { \ - ++(ind); 
\ - } \ - } while (0) - -#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, !=) -#define DROP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, ==) - - fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); - PyObject *re = state->re_module; - const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); - // We would write less data if there are successive '*', which should - // not be the case in general. Otherwise, we write >= n characters - // since escaping them would always add more characters so we will - // overestimate a bit the number of characters to write. - // - // TODO(picnixz): should we limit the estimation or not? - PyUnicodeWriter *writer = PyUnicodeWriter_Create((Py_ssize_t) (1.05 * n)); - if (writer == NULL) { - return NULL; - } - _PyUnicodeWriter *_writer = (_PyUnicodeWriter *) (writer); - // list containing the indices where '*' has a special meaning - PyObject *indices = PyList_New(0); - if (indices == NULL) { - goto abort; - } - const int kind = PyUnicode_KIND(pattern); - const void *data = PyUnicode_DATA(pattern); - Py_ssize_t h = 0, i = 0; - while (i < n) { - // read and advance to the next character - Py_UCS4 chr = READ(i++); - switch (chr) { - case '*': { - if (_PyUnicodeWriter_WriteChar(_writer, chr) < 0) { - goto abort; - } - DROP_DUPLICATES('*', i, n); - PyObject *index = PyLong_FromSsize_t(h++); - if (index == NULL) { - goto abort; - } - int rc = PyList_Append(indices, index); - Py_DECREF(index); - if (rc < 0) { - goto abort; - } - break; - } - case '?': { - // translate optional '?' (fnmatch) into optional '.' (regex) - if (_PyUnicodeWriter_WriteChar(_writer, '.') < 0) { - goto abort; - } - ++h; // increase the expected result's length - break; - } - case '[': { - Py_ssize_t j = i; // 'i' is already at next char - ADVANCE_IF_CHAR('!', j, n); // [! - ADVANCE_IF_CHAR(']', j, n); // [!] 
or [] - ADVANCE_TO_NEXT(']', j, n); // locate closing ']' - if (j >= n) { - if (_PyUnicodeWriter_WriteASCIIString(_writer, "\\[", 2) < 0) { - goto abort; - } - h += 2; // we just wrote 2 characters - break; // early break for clarity - } - else { - // v--- pattern[j] (exclusive) - // '[' * ... * ']' - // ^----- pattern[i] (inclusive) - PyObject *s1 = NULL, *s2 = NULL; - int rc = PyUnicode_FindChar(pattern, '-', i, j, 1); - if (rc == -2) { - goto abort; - } - if (rc == -1) { - PyObject *group = PyUnicode_Substring(pattern, i, j); - if (group == NULL) { - goto abort; - } - s1 = _PyObject_CallMethod(group, &_Py_ID(replace), "ss", "\\", "\\\\"); - Py_DECREF(group); - } - else { - assert(rc >= 0); - s1 = get_translated_group(pattern, i, j); - } - if (s1 == NULL) { - goto abort; - } - s2 = _PyObject_CallMethod(re, &_Py_ID(sub), "ssO", "([&~|])", "\\\\\\1", s1); - Py_DECREF(s1); - if (s2 == NULL) { - goto abort; - } - int difflen = write_translated_group(_writer, s2); - Py_DECREF(s2); - if (difflen < 0) { - goto abort; - } - h += difflen; - i = j + 1; // jump to the character after ']' - break; // early break for clarity - } - } - default: { - PyObject *str = get_unicode_character(chr); - if (str == NULL) { - goto abort; - } - int difflen = write_escaped_string(re, _writer, str); - Py_DECREF(str); - if (difflen < 0) { - goto abort; - } - h += difflen; - break; - } - } - } -#undef DROP_DUPLICATES -#undef ADVANCE_TO_NEXT -#undef _WHILE_READ_CMP -#undef ADVANCE_IF_CHAR -#undef READ - PyObject *parts = PyUnicodeWriter_Finish(writer); - if (parts == NULL) { - Py_DECREF(indices); - return NULL; - } - assert(h == PyUnicode_GET_LENGTH(parts)); - PyObject *res = join_translated_parts(module, parts, indices); - Py_DECREF(parts); - Py_DECREF(indices); - return res; + PyObject *formatted = PyUnicode_FromFormat("(?s:%S)\\Z", res); + Py_DECREF(res); + return formatted; abort: - Py_XDECREF(indices); PyUnicodeWriter_Discard(writer); return NULL; } - -/*[clinic input] -_fnmatch.translate 
-> object - - pat as pattern: object - -[clinic start generated code]*/ - -static PyObject * -_fnmatch_translate_impl(PyObject *module, PyObject *pattern) -/*[clinic end generated code: output=2d9e3bbcbcc6e90e input=56e39f7beea97810]*/ -{ - if (PyBytes_Check(pattern)) { - PyObject *unicode = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(pattern), - PyBytes_GET_SIZE(pattern), - "strict"); - if (unicode == NULL) { - return NULL; - } - // translated regular expression as a str object - PyObject *str_expr = translate(module, unicode); - Py_DECREF(unicode); - if (str_expr == NULL) { - return NULL; - } - PyObject *expr = PyUnicode_AsLatin1String(str_expr); - Py_DECREF(str_expr); - return expr; - } - else if (PyUnicode_Check(pattern)) { - return translate(module, pattern); - } - else { - PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); - return NULL; - } -} - -static PyMethodDef fnmatchmodule_methods[] = { - _FNMATCH_FILTER_METHODDEF - _FNMATCH_FNMATCHCASE_METHODDEF - _FNMATCH_TRANSLATE_METHODDEF - {NULL, NULL} -}; - -static struct PyModuleDef_Slot fnmatchmodule_slots[] = { - {Py_mod_exec, fnmatchmodule_exec}, - {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, - {Py_mod_gil, Py_MOD_GIL_NOT_USED}, - {0, NULL}, -}; - -static struct PyModuleDef _fnmatchmodule = { - PyModuleDef_HEAD_INIT, - "_fnmatch", - NULL, - .m_size = sizeof(fnmatchmodule_state), - .m_methods = fnmatchmodule_methods, - .m_slots = fnmatchmodule_slots, - .m_traverse = fnmatchmodule_traverse, - .m_clear = fnmatchmodule_clear, - .m_free = fnmatchmodule_free, -}; - -PyMODINIT_FUNC -PyInit__fnmatch(void) -{ - return PyModuleDef_Init(&_fnmatchmodule); -} From 5cfd5801b78291796eedf22ab053ebc139c2be8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:39:54 +0200 Subject: [PATCH 25/97] regenerated objects --- Include/internal/pycore_global_objects_fini_generated.h | 1 + 
Include/internal/pycore_global_strings.h | 1 + Include/internal/pycore_runtime_init_generated.h | 1 + Include/internal/pycore_unicodeobject_generated.h | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index fc82cfc1536feb..ec11eec5eec27d 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -863,6 +863,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 78f56bdd2f6238..f27bdeb0183aec 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -352,6 +352,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(col_offset) STRUCT_FOR_ID(command) STRUCT_FOR_ID(comment_factory) + STRUCT_FOR_ID(compile) STRUCT_FOR_ID(compile_mode) STRUCT_FOR_ID(consts) STRUCT_FOR_ID(context) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 2c965ecd99fcf5..ab94af0cfb90c9 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -861,6 +861,7 @@ extern "C" { INIT_ID(col_offset), \ INIT_ID(command), \ INIT_ID(comment_factory), \ + INIT_ID(compile), \ INIT_ID(compile_mode), \ INIT_ID(consts), \ INIT_ID(context), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h 
b/Include/internal/pycore_unicodeobject_generated.h index 0307d1f4806ba7..a0e532edc1bfc6 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1208,6 +1208,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(compile); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(compile_mode); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); From 891a36820164714dfdc8d5c93adc0bfd2688eb81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:47:49 +0200 Subject: [PATCH 26/97] maybe it's like that? --- PC/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PC/config.c b/PC/config.c index f08a847a3f1206..8f49d9255b4fbe 100644 --- a/PC/config.c +++ b/PC/config.c @@ -92,8 +92,8 @@ struct _inittab _PyImport_Inittab[] = { {"binascii", PyInit_binascii}, {"cmath", PyInit_cmath}, {"errno", PyInit_errno}, + {"_fnmatch", PyInit_fnmatch}, {"faulthandler", PyInit_faulthandler}, - {"fnmatch", PyInit_fnmatch}, {"gc", PyInit_gc}, {"math", PyInit_math}, {"nt", PyInit_nt}, /* Use the NT os functions, not posix */ From adc18bd7f58fc9bc0f8f2b79193096a7023d883a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 9 Jul 2024 19:00:54 +0200 Subject: [PATCH 27/97] maybe like this...? 
--- Makefile.pre.in | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.pre.in b/Makefile.pre.in index 97aa13bcc34409..ec99023f36b2b3 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -356,6 +356,7 @@ FNMATCH_OBJS= \ Modules/_fnmatch/_fnmatchmodule.o \ Modules/_fnmatch/posix.o \ Modules/_fnmatch/regex.o \ + Modules/_fnmatch/translate.o ########################################################################## # mimalloc From 658fb81ad684c40254d7356111c393d767da7f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 10 Jul 2024 10:14:01 +0200 Subject: [PATCH 28/97] regenerate configure script --- configure | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/configure b/configure index 131ca5f7f897a7..0fefae0032587f 100755 --- a/configure +++ b/configure @@ -817,6 +817,8 @@ MODULE_TIME_FALSE MODULE_TIME_TRUE MODULE__IO_FALSE MODULE__IO_TRUE +MODULE__FNMATCH_FALSE +MODULE__FNMATCH_TRUE MODULE_BUILDTYPE TEST_MODULES LIBB2_LIBS @@ -14042,6 +14044,60 @@ printf "%s\n" "#define Py_HAVE_C_COMPLEX 1" >>confdefs.h fi +# check for fnmatch(3) support +# +# We test for the plain POSIX implementation (case-sensitive match). +# +# To ensure that the implementation of fnmatch(3) is compliant +# we run some tests to make sure that everything works well. +# +# Note that MSVC does not support fnmatch(3). +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for case-sensititve fnmatch(3)" >&5 +printf %s "checking for case-sensititve fnmatch(3)... " >&6; } +if test ${ac_cv_fnmatch_supported+y} +then : + printf %s "(cached) " >&6 +else $as_nop + if test "$cross_compiling" = yes +then : + ac_cv_fnmatch_supported=no + +else $as_nop + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main (void) +{ + + exit(!( + fnmatch("a*", "abc", 0) != FNM_NOMATCH && + fnmatch("a*", "Abc", 0) == FNM_NOMATCH + )); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO" +then : + ac_cv_fnmatch_supported=yes +else $as_nop + ac_cv_fnmatch_supported=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fnmatch_supported" >&5 +printf "%s\n" "$ac_cv_fnmatch_supported" >&6; } +if test "$ac_cv_fnmatch_supported" = "yes"; then + +printf "%s\n" "#define Py_HAVE_FNMATCH 1" >>confdefs.h + +fi + # check for systems that require aligned memory access { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking aligned memory access is required" >&5 printf %s "checking aligned memory access is required... " >&6; } @@ -27681,6 +27737,7 @@ SRCDIRS="\ Modules/_ctypes \ Modules/_decimal \ Modules/_decimal/libmpdec \ + Modules/_fnmatch \ Modules/_hacl \ Modules/_io \ Modules/_multiprocessing \ @@ -29013,6 +29070,28 @@ MODULE_BLOCK= + if test "$py_cv_module__fnmatch" != "n/a" +then : + py_cv_module__fnmatch=yes +fi + if test "$py_cv_module__fnmatch" = yes; then + MODULE__FNMATCH_TRUE= + MODULE__FNMATCH_FALSE='#' +else + MODULE__FNMATCH_TRUE='#' + MODULE__FNMATCH_FALSE= +fi + + as_fn_append MODULE_BLOCK "MODULE__FNMATCH_STATE=$py_cv_module__fnmatch$as_nl" + if test "x$py_cv_module__fnmatch" = xyes +then : + + as_fn_append MODULE_BLOCK "MODULE__FNMATCH_CFLAGS=-I\$(srcdir)/Modules/_fnmatch$as_nl" + + +fi + + if test "$py_cv_module__io" != "n/a" then : py_cv_module__io=yes @@ -31744,6 +31823,10 @@ LTLIBOBJS=$ac_ltlibobjs +if test -z "${MODULE__FNMATCH_TRUE}" && test -z "${MODULE__FNMATCH_FALSE}"; then + as_fn_error $? "conditional \"MODULE__FNMATCH\" was never defined. +Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 +fi if test -z "${MODULE__IO_TRUE}" && test -z "${MODULE__IO_FALSE}"; then as_fn_error $? "conditional \"MODULE__IO\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 From cb29bd30546191c06d82799462e59f3607757eae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:52:00 +0200 Subject: [PATCH 29/97] update tests --- Lib/test/test_fnmatch.py | 104 ++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index f7e9391722ac38..6d72df182af862 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -1,4 +1,6 @@ """Test cases for the fnmatch module.""" +import itertools + import os import string import unittest @@ -6,25 +8,22 @@ import test.support.import_helper -c_fnmatch = test.support.import_helper.import_fresh_module("_fnmatch", blocked=["fnmatch"]) +c_fnmatch = test.support.import_helper.import_fresh_module("_fnmatch") py_fnmatch = test.support.import_helper.import_fresh_module("fnmatch", blocked=["_fnmatch"]) -fnmatch = py_fnmatch.fnmatch -fnmatchcase = py_fnmatch.fnmatchcase -translate = py_fnmatch.translate -filter = py_fnmatch.filter +class FnmatchTestCaseMixin: + fnmatch = None -class FnmatchTestCase(unittest.TestCase): + def check_match(self, filename, pattern, should_match=True, func=None): + if func is None: + func = self.fnmatch.fnmatch - def check_match(self, filename, pattern, should_match=True, fn=fnmatch): - if should_match: - self.assertTrue(fn(filename, pattern), - "expected %r to match pattern %r" - % (filename, pattern)) - else: - self.assertFalse(fn(filename, pattern), - "expected %r not to match pattern %r" - % (filename, pattern)) + with self.subTest(fn=func, name=filename, pattern=pattern): + res = func(filename, pattern) + if should_match: + self.assertTrue(res, f"expected {filename!r} to 
match pattern {pattern!r}") + else: + self.assertFalse(res, f"expected {filename!r} not to match pattern {pattern!r}") def test_fnmatch(self): check = self.check_match @@ -61,13 +60,17 @@ def test_slow_fnmatch(self): check('a' * 50 + 'b', '*a*a*a*a*a*a*a*a*a*a', False) def test_mix_bytes_str(self): + fnmatch = self.fnmatch.fnmatch self.assertRaises(TypeError, fnmatch, 'test', b'*') self.assertRaises(TypeError, fnmatch, b'test', '*') + + fnmatchcase = self.fnmatch.fnmatchcase self.assertRaises(TypeError, fnmatchcase, 'test', b'*') self.assertRaises(TypeError, fnmatchcase, b'test', '*') def test_fnmatchcase(self): check = self.check_match + fnmatchcase = self.fnmatch.fnmatchcase check('abc', 'abc', True, fnmatchcase) check('AbC', 'abc', False, fnmatchcase) check('abc', 'AbC', False, fnmatchcase) @@ -223,11 +226,18 @@ def test_warnings(self): check(',', '[a-z+--A-Z]') check('.', '[a-z--/A-Z]') +class PurePythonFnmatchTestCase(FnmatchTestCaseMixin, unittest.TestCase): + fnmatch = py_fnmatch + +class CPythonFnmatchTestCase(FnmatchTestCaseMixin, unittest.TestCase): + fnmatch = c_fnmatch -class TranslateTestCase(unittest.TestCase): +class TranslateTestCaseMixin: + fnmatch = None def test_translate(self): import re + translate = self.fnmatch.translate self.assertEqual(translate('*'), r'(?s:.*)\Z') self.assertEqual(translate('?'), r'(?s:.)\Z') self.assertEqual(translate('a?b*'), r'(?s:a.b.*)\Z') @@ -257,6 +267,11 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) +class PurePythonTranslateTestCase(TranslateTestCaseMixin, unittest.TestCase): + fnmatch = py_fnmatch + +class CPythonTranslateTestCase(TranslateTestCaseMixin, unittest.TestCase): + fnmatch = c_fnmatch class FilterTestCaseMixin: fnmatch = None @@ -268,31 +283,72 @@ def test_filter(self): self.assertEqual(filter([b'Python', b'Ruby', b'Perl', b'Tcl'], b'P*'), [b'Python', b'Perl']) - def test_mix_bytes_str(self): - filter = self.fnmatch.filter - 
self.assertRaises(TypeError, filter, ['test'], b'*') - self.assertRaises(TypeError, filter, [b'test'], '*') - -class PurePythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): - fnmatch = py_fnmatch - def test_case(self): ignorecase = os.path.normcase('P') == os.path.normcase('p') + filter = self.fnmatch.filter self.assertEqual(filter(['Test.py', 'Test.rb', 'Test.PL'], '*.p*'), ['Test.py', 'Test.PL'] if ignorecase else ['Test.py']) self.assertEqual(filter(['Test.py', 'Test.rb', 'Test.PL'], '*.P*'), ['Test.py', 'Test.PL'] if ignorecase else ['Test.PL']) def test_sep(self): + filter = self.fnmatch.filter normsep = os.path.normcase('\\') == os.path.normcase('/') self.assertEqual(filter(['usr/bin', 'usr', 'usr\\lib'], 'usr/*'), ['usr/bin', 'usr\\lib'] if normsep else ['usr/bin']) self.assertEqual(filter(['usr/bin', 'usr', 'usr\\lib'], 'usr\\*'), ['usr/bin', 'usr\\lib'] if normsep else ['usr\\lib']) + def test_mix_bytes_str(self): + filter = self.fnmatch.filter + self.assertRaises(TypeError, filter, ['test'], b'*') + self.assertRaises(TypeError, filter, [b'test'], '*') + +class PurePythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): + fnmatch = py_fnmatch + class CPythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): fnmatch = c_fnmatch + @staticmethod + def translate_func(pattern): + STAR = object() + parts = py_fnmatch._translate(pattern, STAR, '.') + return py_fnmatch._join_translated_parts(parts, STAR) + + def test_translate(self): + # We want to check that the C implementation is EXACTLY the same + # as the Python implementation. For that, we will need to cover + # a lot of cases. 
+ translate = self.fnmatch.translate + + for choice in itertools.combinations_with_replacement('*?.', 5): + for suffix in ['', '!']: + pat = suffix + ''.join(choice) + with self.subTest(pattern=pat): + self.assertEqual(translate(pat), self.translate_func(pat)) + + for pat in [ + '', + '!!a*', '!\\!a*', '!a*', '*', '**', '*******?', '*******c', '*****??', '**/', + '*.js', '*/man*/bash.*', '*???', '?', '?*****??', '?*****?c', '?***?****', + '?***?****?', '?***?****c', '?*?', '??', '???', '???*', '[!\\]', + '[*', '[-abc]', '[[]b', '[[a]b', '[\\\\]', '[\\]', '[]-]', '[][!]', + '[]]b', '[]a[]b', '[^a-c]*', '[a-\\z]', + '[a-c]b*', '[a-y]*[^c]', '[abc-]', '\\*', + '[0-4-3-2]', '[b-ac-z9-1]', '[!b-ac-z9-1]', '[!]b-ac-z9-1]', + '[]b-ac-z9-1]', '[]b-ac-z9-1]*', '*[]b-ac-z9-1]', + '\\**', '\\*\\*', 'a*', 'a*****?c', 'a****c**?**??*****', 'a***c', + 'a**?**cd**?**??***k', 'a**?**cd**?**??***k**', 'a**?**cd**?**??k', + 'a**?**cd**?**??k***', 'a*[^c]', + 'a*cd**?**??k', 'a/*', 'a/**', 'a/**/b', + 'a/**/b/**/c', 'a/.*/c', 'a/?', 'a/??', 'a[X-]b', 'a[\\.]c', + 'a[\\b]c', 'a[bc', 'a\\*?/*', 'a\\*b/*', + 'ab[!de]', 'ab[cd]', 'ab[cd]ef', 'abc', 'b*/', 'foo*', + 'man/man1/bash.1' + ]: + with self.subTest(pattern=pat): + self.assertEqual(translate(pat), self.translate_func(pat)) if __name__ == "__main__": unittest.main() From c1fae2425b13c34aef2b4b267f2ca5bb1bb45578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:52:07 +0200 Subject: [PATCH 30/97] update files --- Modules/_fnmatch/_fnmatchmodule.c | 277 +++++++++++++-------- Modules/_fnmatch/_fnmatchmodule.h | 113 ++++++--- Modules/_fnmatch/clinic/_fnmatchmodule.c.h | 63 ++++- Modules/_fnmatch/posix.c | 161 ++++++++++-- Modules/_fnmatch/regex.c | 22 +- Modules/_fnmatch/translate.c | 243 +++++++++--------- 6 files changed, 596 insertions(+), 283 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 
6e566991188861..6e4b23d83cfb53 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,115 +1,166 @@ -/* - * C accelerator for the 'fnmatch' module (POSIX only). - * - * Most functions expect string or bytes instances, and thus the Python - * implementation should first pre-process path-like objects, possibly - * applying normalizations depending on the platform if needed. - */ - #include "Python.h" #include "pycore_call.h" // for _PyObject_CallMethod #include "_fnmatchmodule.h" #include "clinic/_fnmatchmodule.c.h" -#define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" +#define COMPILED_CACHE_SIZE 32768 +#define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" + +// ==== Helper implementations ================================================ + +/* + * Compile a UNIX shell pattern into a RE pattern + * and returns the corresponding 'match()' method. + * + * This function is LRU-cached by the module itself. + */ +static PyObject * +fnmatchmodule_get_matcher_function(PyObject *module, PyObject *pattern) +{ + // translate the pattern into a RE pattern + assert(module != NULL); + PyObject *expr = _fnmatch_translate_impl(module, pattern); + if (expr == NULL) { + return NULL; + } + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + // compile the pattern + PyObject *compiled = _PyObject_CallMethod(st->re_module, &_Py_ID(compile), "O", expr); + Py_DECREF(expr); + if (compiled == NULL) { + return NULL; + } + // get the compiled pattern matcher function + PyObject *matcher = PyObject_GetAttr(compiled, &_Py_ID(match)); + Py_DECREF(compiled); + return matcher; +} -// module state functions +static PyMethodDef get_matcher_function_def = { + "get_matcher_function", + (PyCFunction)(fnmatchmodule_get_matcher_function), + METH_O, + NULL +}; static int -fnmatchmodule_clear(PyObject *m) +fnmatchmodule_load_lru_cache(PyObject *module, fnmatchmodule_state *st) { - fnmatchmodule_state *st = 
get_fnmatchmodulestate_state(m); - Py_CLEAR(st->os_module); - Py_CLEAR(st->re_module); - Py_CLEAR(st->lru_cache); + st->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + if (st->lru_cache == NULL) { + return -1; + } return 0; } static int -fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) +fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) { - fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); - Py_VISIT(st->os_module); - Py_VISIT(st->re_module); - Py_VISIT(st->lru_cache); + assert(st->lru_cache != NULL); + PyObject *maxsize = PyLong_FromLong(COMPILED_CACHE_SIZE); + if (maxsize == NULL) { + return -1; + } + PyObject *args[] = {NULL, maxsize, Py_True}; + size_t nargsf = 2 | PY_VECTORCALL_ARGUMENTS_OFFSET; + PyObject *decorator = PyObject_Vectorcall(st->lru_cache, args + 1, nargsf, NULL); + Py_DECREF(maxsize); + if (decorator == NULL) { + return -1; + } + // TODO(picnixz): should INCREF the refcount of 'module'? + assert(module != NULL); + PyObject *decorated = PyCFunction_New(&get_matcher_function_def, module); + PyObject *translator = PyObject_CallOneArg(decorator, decorated); + Py_DECREF(decorated); + Py_DECREF(decorator); + if (translator == NULL) { + return -1; + } + // reference on 'translator' will be removed upon module cleanup + st->translator = translator; return 0; } -static void -fnmatchmodule_free(void *m) +static inline PyObject * +get_matcher_function(PyObject *module, PyObject *pattern) { - fnmatchmodule_clear((PyObject *) m); + assert(module != NULL); + assert(pattern != NULL); + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + assert(st->translator != NULL); + size_t nargsf = 1 | PY_VECTORCALL_ARGUMENTS_OFFSET; + return PyObject_Vectorcall(st->translator, &pattern, nargsf, NULL); } -static int -fnmatchmodule_exec(PyObject *m) -{ -#define IMPORT_MODULE(attr, name) \ - do { \ - state->attr = PyImport_ImportModule((name)); \ - if (state->attr == NULL) { \ - return -1; \ - 
} \ - } while (0) +// ==== Module state functions ================================================ -#define INTERN_STRING(attr, str) \ +#define IMPORT_MODULE(state, attribute, name) \ do { \ - state->attr = PyUnicode_InternFromString((str)); \ - if (state->attr == NULL) { \ + state->attribute = NULL; \ + state->attribute = PyImport_ImportModule((name)); \ + if (state->attribute == NULL) { \ return -1; \ } \ } while (0) - fnmatchmodule_state *state = get_fnmatchmodulestate_state(m); - - // imports - IMPORT_MODULE(os_module, "os"); - IMPORT_MODULE(re_module, "re"); - - // helpers - state->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); - if (state->lru_cache == NULL) { +static int +fnmatchmodule_exec(PyObject *module) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + st->py_module = NULL; + IMPORT_MODULE(st, py_module, "fnmatch"); + st->os_module = NULL; + IMPORT_MODULE(st, os_module, "os"); + st->re_module = NULL; + IMPORT_MODULE(st, re_module, "re"); + st->lru_cache = NULL; + if (fnmatchmodule_load_lru_cache(module, st) < 0) { return -1; } - // todo: handle LRU cache - + st->translator = NULL; + if (fnmatchmodule_load_translator(module, st) < 0) { + return -1; + } + return 0; +} #undef IMPORT_MODULE -#undef INTERN_STRING +static int +fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + Py_VISIT(st->py_module); + Py_VISIT(st->os_module); + Py_VISIT(st->re_module); + Py_VISIT(st->lru_cache); + Py_VISIT(st->translator); return 0; } -/*[clinic input] -module _fnmatch -[clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ +static int +fnmatchmodule_clear(PyObject *m) +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + Py_CLEAR(st->py_module); + Py_CLEAR(st->os_module); + Py_CLEAR(st->re_module); + Py_CLEAR(st->lru_cache); + Py_CLEAR(st->translator); + return 0; +} -static PyObject 
* -get_match_function(PyObject *module, PyObject *pattern) +static void +fnmatchmodule_free(void *m) { - // TODO(picnixz): use LRU-cache - PyObject *expr = _fnmatch_translate_impl(module, pattern); - if (expr == NULL) { - return NULL; - } - fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); - PyObject *compiled = _PyObject_CallMethod(st->re_module, &_Py_ID(compile), "O", expr); - Py_DECREF(expr); - if (compiled == NULL) { - return NULL; - } - PyObject *matcher = PyObject_GetAttr(compiled, &_Py_ID(match)); - Py_DECREF(compiled); - return matcher; + (void)fnmatchmodule_clear((PyObject *)m); } -static PyMethodDef get_match_function_method_def = { - "get_match_function", - _PyCFunction_CAST(get_match_function), - METH_O, - NULL -}; +/*[clinic input] +module _fnmatch +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ /*[clinic input] _fnmatch.filter -> object @@ -123,30 +174,55 @@ static PyObject * _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) /*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ { -#ifndef Py_HAVE_FNMATCH - PyObject *matcher = get_match_function(module, pat); - if (matcher == NULL) { - return NULL; - } - PyObject *result = _regex_fnmatch_filter(matcher, names); - Py_DECREF(matcher); - return result; -#else +#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) // Note that the Python implementation of fnmatch.filter() does not // call os.fspath() on the names being matched, whereas it does on NT. 
if (PyBytes_Check(pat)) { const char *pattern = PyBytes_AS_STRING(pat); - return _posix_fnmatch_filter(pattern, names, &_posix_fnmatch_encoded); + return _posix_fnmatch_encoded_filter_cached(pattern, names); } if (PyUnicode_Check(pat)) { const char *pattern = PyUnicode_AsUTF8(pat); - return _posix_fnmatch_filter(pattern, names, &_posix_fnmatch_unicode); + return _posix_fnmatch_unicode_filter_cached(pattern, names); } PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return NULL; +#else + PyObject *matcher = get_matcher_function(module, pat); + if (matcher == NULL) { + return NULL; + } + PyObject *result = _regex_fnmatch_filter(matcher, names); + Py_DECREF(matcher); + return result; #endif } +/*[clinic input] +_fnmatch.fnmatch -> bool + + name: object + pat: object + +[clinic start generated code]*/ + +static int +_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) +/*[clinic end generated code: output=b4cd0bd911e8bc93 input=c45e0366489540b8]*/ +{ + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + PyObject *res = _PyObject_CallMethod(st->py_module, &_Py_ID(fnmatch), "OO", name, pat); + if (res == NULL) { + return -1; + } + int matching = PyLong_AsLong(res); + if (matching < 0) { + return -1; + } + Py_DECREF(res); + return matching; +} + /*[clinic input] _fnmatch.fnmatchcase -> bool @@ -164,28 +240,28 @@ static int _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) /*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ { -#ifndef Py_HAVE_FNMATCH - PyObject *matcher = get_match_function(module, pat); - if (matcher == NULL) { - return -1; - } - int res = _regex_fnmatch_generic(matcher, name); - Py_DECREF(matcher); - return res; -#else +#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) // This function does not transform path-like objects, nor does it // case-normalize 'name' or 'pattern' (whether it is the Python or // the C implementation). 
if (PyBytes_Check(pat)) { const char *pattern = PyBytes_AS_STRING(pat); - return _posix_fnmatch_encoded(pattern, name); + return _posix_fnmatch_encoded_cached(pattern, name); } if (PyUnicode_Check(pat)) { const char *pattern = PyUnicode_AsUTF8(pat); - return _posix_fnmatch_unicode(pattern, name); + return _posix_fnmatch_unicode_cached(pattern, name); } PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return -1; +#else + PyObject *matcher = get_matcher_function(module, pat); + if (matcher == NULL) { + return -1; + } + int res = _regex_fnmatch_generic(matcher, name); + Py_DECREF(matcher); + return res; #endif } @@ -208,7 +284,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) return NULL; } // translated regular expression as a str object - PyObject *str_expr = translate(module, unicode); + PyObject *str_expr = _regex_translate(module, unicode); Py_DECREF(unicode); if (str_expr == NULL) { return NULL; @@ -218,7 +294,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) return expr; } else if (PyUnicode_Check(pattern)) { - return translate(module, pattern); + return _regex_translate(module, pattern); } else { PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); @@ -228,6 +304,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) static PyMethodDef fnmatchmodule_methods[] = { _FNMATCH_FILTER_METHODDEF + _FNMATCH_FNMATCH_METHODDEF _FNMATCH_FNMATCHCASE_METHODDEF _FNMATCH_TRANSLATE_METHODDEF {NULL, NULL} @@ -242,8 +319,8 @@ static struct PyModuleDef_Slot fnmatchmodule_slots[] = { static struct PyModuleDef _fnmatchmodule = { PyModuleDef_HEAD_INIT, - "_fnmatch", - NULL, + .m_name = "_fnmatch", + .m_doc = NULL, .m_size = sizeof(fnmatchmodule_state), .m_methods = fnmatchmodule_methods, .m_slots = fnmatchmodule_slots, diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index af271703791be3..2311e35efe691e 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ 
-1,13 +1,35 @@ +/* +* C accelerator for the 'fnmatch' module (POSIX only). + * + * Most functions expect string or bytes instances, and thus the Python + * implementation should first pre-process path-like objects, possibly + * applying normalizations depending on the platform if needed. + */ + #ifndef _FNMATCHMODULE_H #define _FNMATCHMODULE_H #include "Python.h" +#undef Py_USE_FNMATCH_FALLBACK +/* + * For now, only test the C acceleration of the Python implementation. + * + * TODO(picnixz): Currently, I don't know how to handle backslashes + * TODO(picnixz): in fnmatch(3) so that they are treated correctly + * TODO(picnixz): depending on whether the string was a raw string + * TODO(picnixz): or not. To see the bug, comment out the following + * TODO(picnixz): macro and run the tests. + */ +#define Py_USE_FNMATCH_FALLBACK 1 + typedef struct { - PyObject *re_module; // 're' module - PyObject *os_module; // 'os' module + PyObject *py_module; // 'fnmatch' module + PyObject *re_module; // 're' module + PyObject *os_module; // 'os' module - PyObject *lru_cache; // optional cache for regex patterns, if needed + PyObject *lru_cache; // the LRU cache decorator + PyObject *translator; // the translation unit whose calls are cached } fnmatchmodule_state; static inline fnmatchmodule_state * @@ -18,33 +40,22 @@ get_fnmatchmodulestate_state(PyObject *module) return (fnmatchmodule_state *)state; } +#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) /* - * The filter() function works differently depending on whether fnmatch(3) - * is present or not. - * - * If fnmatch(3) is present, the match is performed without using regular - * expressions. The functions being used are - * - * If fnmatch(3) is not present, the match is performed using regular - * expressions. - */ - -#ifdef Py_HAVE_FNMATCH -/* - * Type for a matching function. - * - * The function must take as input a pattern and a name, - * and is used to determine whether the name matches the - * pattern or not. 
- * - * If the pattern is obtained from str() types, then 'name' - * must be a string (it is left to the matcher the task for - * validating this part). + * Construct a list of filtered names using fnmatch(3). */ -typedef int (*Matcher)(const char *, PyObject *); +extern PyObject * +_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names); +/* Same as _posix_fnmatch_encoded_filter() but for unicode inputs. */ +extern PyObject * +_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names); +/* cached 'pattern' version of _posix_fnmatch_encoded_filter() */ +extern PyObject * +_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names); +/* cached 'pattern' version of _posix_fnmatch_unicode_filter() */ extern PyObject * -_posix_fnmatch_filter(const char *pattern, PyObject *names, Matcher match); +_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names); /* * Perform a case-sensitive match using fnmatch(3). @@ -59,15 +70,53 @@ _posix_fnmatch_filter(const char *pattern, PyObject *names, Matcher match); * Returns -1 if (1) 'string' is not a `bytes` object, and * sets a TypeError exception, or (2) something went wrong. */ -extern int _posix_fnmatch_encoded(const char *pattern, PyObject *string); +extern int +_posix_fnmatch_encoded(PyObject *pattern, PyObject *string); /* Same as _posix_fnmatch_encoded() but for unicode inputs. */ -extern int _posix_fnmatch_unicode(const char *pattern, PyObject *string); -#else -extern int _regex_fnmatch_generic(PyObject *matcher, PyObject *name); +extern int +_posix_fnmatch_unicode(PyObject *pattern, PyObject *string); + +/* cached 'pattern' version of _posix_fnmatch_encoded() */ +extern int +_posix_fnmatch_encoded_cached(const char *pattern, PyObject *names); +/* cached 'pattern' version of _posix_fnmatch_unicode() */ +extern int +_posix_fnmatch_unicode_cached(const char *pattern, PyObject *names); +#endif + +/* + * Test whether a name matches a compiled RE pattern. 
+ * + * Parameters + * + * matcher A reference to the 'match()' method of a compiled pattern. + * string The string to match (str or bytes object). + * + * Returns 1 if the 'string' matches the pattern and 0 otherwise. + * + * Returns -1 if (1) 'string' is not a `str` or a `bytes` object, + * and sets a TypeError exception, or (2) something went wrong. + */ +extern int +_regex_fnmatch_generic(PyObject *matcher, PyObject *string); + +/* + * Perform a case-sensitive match using compiled RE patterns. + * + * Parameters + * + * matcher A reference to the 'match()' method of a compiled pattern. + * names An iterable of strings (str or bytes objects) to match. + * + * Returns a list of matched names, or NULL if an error occurred. + */ extern PyObject * _regex_fnmatch_filter(PyObject *matcher, PyObject *names); -#endif -extern PyObject *translate(PyObject *module, PyObject *pattern); +/* + * C accelerator for translating UNIX shell patterns into RE patterns. + */ +extern PyObject * +_regex_translate(PyObject *module, PyObject *pattern); #endif // _FNMATCHMODULE_H diff --git a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h index 4b12f33113d3fb..5250bddbecc273 100644 --- a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h +++ b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h @@ -64,6 +64,67 @@ _fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } +PyDoc_STRVAR(_fnmatch_fnmatch__doc__, +"fnmatch($module, /, name, pat)\n" +"--\n" +"\n"); + +#define _FNMATCH_FNMATCH_METHODDEF \ + {"fnmatch", _PyCFunction_CAST(_fnmatch_fnmatch), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatch__doc__}, + +static int +_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat); + +static PyObject * +_fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define 
NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"name", "pat", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "fnmatch", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject *name; + PyObject *pat; + int _return_value; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); + if (!args) { + goto exit; + } + name = args[0]; + pat = args[1]; + _return_value = _fnmatch_fnmatch_impl(module, name, pat); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + PyDoc_STRVAR(_fnmatch_fnmatchcase__doc__, "fnmatchcase($module, /, name, pat)\n" "--\n" @@ -182,4 +243,4 @@ _fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, Py exit: return return_value; } -/*[clinic end generated code: output=b0366b259b101bdf input=a9049054013a1b77]*/ +/*[clinic end generated code: output=d9bb3df00c5c2b5e input=a9049054013a1b77]*/ diff --git a/Modules/_fnmatch/posix.c b/Modules/_fnmatch/posix.c index 30d0845d7bae88..d4fdbb42ba0210 100644 --- a/Modules/_fnmatch/posix.c +++ b/Modules/_fnmatch/posix.c @@ -1,24 +1,95 @@ -#ifdef Py_HAVE_FNMATCH +#include "Python.h" + +#include "_fnmatchmodule.h" // for pre-declarations + +#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) #include // for fnmatch(3) -#include "Python.h" -#include "_fnmatchmodule.h" // for PosixMatcher +#define INVALID_PATTERN_TYPE "pattern must be a %s object, got %.200s" +#define INVALID_NAME_TYPE "name must be a 
%s object, got %.200s" -#define INVALID_TYPE_FOR_NAME "name must be a %s object, got %.200s" +// ==== Helper declarations =================================================== -#define VERIFY_NAME_ARG_TYPE(name, check, expecting) \ - do { \ - if (!check) { \ - PyErr_Format(PyExc_TypeError, INVALID_TYPE_FOR_NAME, \ - expecting, Py_TYPE(name)->tp_name); \ - return -1; \ - } \ - } while (0) +/* + * Return a bytes object as a "const char *", or NULL on error. + * + * The 'error' message is either INVALID_PATTERN_TYPE or INVALID_NAME_TYPE, + * and is used to set a TypeError if 'arg' is of incorrect type. + */ +static inline const char * +from_encoded(PyObject *arg, const char *error); + +/* + * Return a str object as a "const char *", or NULL on error. + * + * The 'error' message is either INVALID_PATTERN_TYPE or INVALID_NAME_TYPE + * and is used to set a TypeError if 'arg' is of incorrect type. + */ +static inline const char * +from_unicode(PyObject *arg, const char *error); + +/* The type of from_encoded() or from_unicode() conversion functions. 
*/ +typedef const char *(*Converter)(PyObject *string, const char *error); + +static inline PyObject * +_posix_fnmatch_filter(PyObject *pattern, PyObject *names, Converter converter); + +/* cached 'pattern' version of _posix_fnmatch_filter() */ +static /* not inline */ PyObject * +_posix_fnmatch_filter_cached(const char *pattern, PyObject *names, Converter converter); + +// ==== API implementation ==================================================== + +inline PyObject * +_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names) +{ + return _posix_fnmatch_filter(pattern, names, &from_encoded); +} + +inline PyObject * +_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names) +{ + return _posix_fnmatch_filter(pattern, names, &from_unicode); +} + +inline PyObject * +_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names) +{ + assert(pattern != NULL); + return _posix_fnmatch_filter_cached(pattern, names, &from_encoded); +} + +inline PyObject * +_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names) +{ + assert(pattern != NULL); + return _posix_fnmatch_filter_cached(pattern, names, &from_unicode); +} + +inline int +_posix_fnmatch_encoded(PyObject *pattern, PyObject *string) +{ + const char *p = from_encoded(pattern, INVALID_PATTERN_TYPE); + if (p == NULL) { + return -1; + } + return _posix_fnmatch_encoded_cached(p, string); +} + +inline int +_posix_fnmatch_unicode(PyObject *pattern, PyObject *string) +{ + const char *p = from_unicode(pattern, INVALID_PATTERN_TYPE); + if (p == NULL) { + return -1; + } + return _posix_fnmatch_unicode_cached(p, string); +} #define PROCESS_MATCH_RESULT(r) \ do { \ - int res = (r); /* avoid variable capture */ \ + int res = (r); \ if (res < 0) { \ return res; \ } \ @@ -26,40 +97,75 @@ } while (0) inline int -_posix_fnmatch_encoded(const char *pattern, PyObject *string) +_posix_fnmatch_encoded_cached(const char *pattern, PyObject *string) { - VERIFY_NAME_ARG_TYPE(string, 
PyBytes_Check(string), "bytes"); - PROCESS_MATCH_RESULT(fnmatch(pattern, PyBytes_AS_STRING(string), 0)); + assert(pattern != NULL); + const char *s = from_encoded(string, INVALID_NAME_TYPE); + if (s == NULL) { + return -1; + } + PROCESS_MATCH_RESULT(fnmatch(pattern, s, 0)); } inline int -_posix_fnmatch_unicode(const char *pattern, PyObject *string) +_posix_fnmatch_unicode_cached(const char *pattern, PyObject *string) +{ + assert(pattern != NULL); + const char *s = from_unicode(string, INVALID_NAME_TYPE); + if (s == NULL) { + return -1; + } + PROCESS_MATCH_RESULT(fnmatch(pattern, s, 0)); +} + +#undef PROCESS_MATCH_RESULT + +// ==== Helper implementations ================================================ + +#define GENERATE_CONVERTER(function, predicate, converter, expecting) \ + static inline const char * \ + function(PyObject *arg, const char *error) \ + { \ + if (!predicate(arg)) { \ + PyErr_Format(PyExc_TypeError, error, expecting, Py_TYPE(arg)->tp_name); \ + return NULL; \ + } \ + return converter(arg); \ + } +GENERATE_CONVERTER(from_encoded, PyBytes_Check, PyBytes_AS_STRING, "bytes") +GENERATE_CONVERTER(from_unicode, PyUnicode_Check, PyUnicode_AsUTF8, "str") +#undef GENERATE_CONVERTER + +static inline PyObject * +_posix_fnmatch_filter(PyObject *pattern, PyObject *names, Converter converter) { - VERIFY_NAME_ARG_TYPE(string, PyUnicode_Check(string), "string"); - PROCESS_MATCH_RESULT(fnmatch(pattern, PyUnicode_AsUTF8(string), 0)); + const char *p = converter(pattern, INVALID_PATTERN_TYPE); + if (p == NULL) { + return NULL; + } + return _posix_fnmatch_filter_cached(p, names, converter); } -PyObject * -_posix_fnmatch_filter(const char *pattern, PyObject *names, Matcher match) +static PyObject * +_posix_fnmatch_filter_cached(const char *pattern, PyObject *names, Converter converter) { + assert(pattern != NULL); PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { return NULL; } - PyObject *res = PyList_New(0); if (res == NULL) { Py_DECREF(iter); return NULL; 
} - PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - int rc = match(pattern, name); - if (rc < 0) { + const char *n = converter(name, INVALID_NAME_TYPE); + if (n == NULL) { goto abort; } - if (rc == 1) { + if (fnmatch(pattern, n, 0) != FNM_NOMATCH) { if (PyList_Append(res, name) < 0) { goto abort; } @@ -79,4 +185,7 @@ _posix_fnmatch_filter(const char *pattern, PyObject *names, Matcher match) Py_DECREF(res); return NULL; } + +#undef INVALID_NAME_TYPE +#undef INVALID_PATTERN_TYPE #endif diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c index 5ba96a214bc267..807e773635a9da 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/regex.c @@ -1,26 +1,20 @@ #include "Python.h" -/* - * Perform a case-sensitive match using regular expressions. - * - * Parameters - * - * pattern A translated regular expression. - * name The filename to match. - * - * Returns 1 if the 'name' matches the 'pattern' and 0 otherwise. - * Returns -1 if something went wrong. - */ -int +#include "_fnmatchmodule.h" // for pre-declarations + +// ==== API implementation ==================================================== + +inline int _regex_fnmatch_generic(PyObject *matcher, PyObject *name) { // If 'name' is of incorrect type, it will be detected when calling // the matcher function (we emulate 're.compile(...).match(name)'). + assert(PyCallable_Check(matcher)); PyObject *match = PyObject_CallFunction(matcher, "O", name); if (match == NULL) { return -1; } - int matching = match != Py_None; + int matching = match == Py_None ? 
0 : 1; Py_DECREF(match); return matching; } @@ -28,6 +22,7 @@ _regex_fnmatch_generic(PyObject *matcher, PyObject *name) PyObject * _regex_fnmatch_filter(PyObject *matcher, PyObject *names) { + assert(PyCallable_Check(matcher)); PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { return NULL; @@ -43,6 +38,7 @@ _regex_fnmatch_filter(PyObject *matcher, PyObject *names) while ((name = PyIter_Next(iter))) { int rc = _regex_fnmatch_generic(matcher, name); if (rc < 0) { + assert(PyErr_Occurred()); goto abort; } if (rc == 1) { diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 8ac45d546826fc..d4d63a2693b7de 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -11,6 +11,33 @@ // ==== Helper declarations ================================================== +#define _WRITE_OR_FAIL(writeop, onerror) \ + do { \ + if ((writeop) < 0) { \ + onerror; \ + } \ + } while (0) + +#define _WRITE_CHAR _PyUnicodeWriter_WriteChar +#define _WRITE_CHAR_OR(_writer, ch, onerror) \ + _WRITE_OR_FAIL(_WRITE_CHAR((_writer), (ch)), onerror) + +#define _WRITE_ASCII _PyUnicodeWriter_WriteASCIIString +#define _WRITE_ASCII_OR(_writer, ascii, length, onerror) \ + _WRITE_OR_FAIL(_WRITE_ASCII((_writer), (ascii), (length)), onerror) + +#define _WRITE_STRING _PyUnicodeWriter_WriteStr +#define _WRITE_STRING_OR(_writer, string, onerror) \ + _WRITE_OR_FAIL(_WRITE_STRING((_writer), (string)), onerror) + +#define _WRITE_BLOCK _PyUnicodeWriter_WriteSubstring +#define _WRITE_BLOCK_OR(_writer, string, i, j, onerror) \ + do { \ + if ((i) < (j) && _WRITE_BLOCK((_writer), (string), (i), (j)) < 0) { \ + onerror; \ + } \ + } while (0) + /* * Creates a new Unicode object from a Py_UCS4 character. 
* @@ -66,23 +93,23 @@ process_wildcards(PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== PyObject * -translate(PyObject *module, PyObject *pattern) +_regex_translate(PyObject *module, PyObject *pattern) { #define READ(ind) PyUnicode_READ(kind, data, (ind)) #define ADVANCE_IF_CHAR(ch, ind, maxind) \ do { \ if ((ind) < (maxind) && READ(ind) == (ch)) { \ - ++(ind); \ + ++ind; \ } \ } while (0) #define _WHILE_READ_CMP(ch, ind, maxind, cmp) \ do { \ while ((ind) < (maxind) && READ(ind) cmp (ch)) { \ - ++(ind); \ + ++ind; \ } \ } while (0) #define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, !=) -#define DROP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, ==) +#define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, ==) fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); PyObject *re = state->re_module; @@ -111,10 +138,8 @@ translate(PyObject *module, PyObject *pattern) Py_UCS4 chr = READ(i++); switch (chr) { case '*': { - if (_PyUnicodeWriter_WriteChar(_writer, chr) < 0) { - goto abort; - } - DROP_DUPLICATES('*', i, n); + _WRITE_CHAR_OR(_writer, chr, goto abort); + SKIP_DUPLICATES('*', i, n); PyObject *index = PyLong_FromSsize_t(h++); if (index == NULL) { goto abort; @@ -128,9 +153,7 @@ translate(PyObject *module, PyObject *pattern) } case '?': { // translate optional '?' (fnmatch) into optional '.' (regex) - if (_PyUnicodeWriter_WriteChar(_writer, '.') < 0) { - goto abort; - } + _WRITE_CHAR_OR(_writer, '.', goto abort); ++h; // increase the expected result's length break; } @@ -140,9 +163,7 @@ translate(PyObject *module, PyObject *pattern) ADVANCE_IF_CHAR(']', j, n); // [!] 
or [] ADVANCE_TO_NEXT(']', j, n); // locate closing ']' if (j >= n) { - if (_PyUnicodeWriter_WriteASCIIString(_writer, "\\[", 2) < 0) { - goto abort; - } + _WRITE_ASCII_OR(_writer, "\\[", 2, goto abort); h += 2; // we just wrote 2 characters break; // early break for clarity } @@ -165,6 +186,7 @@ translate(PyObject *module, PyObject *pattern) } else { assert(rc >= 0); + assert(READ(j) == ']'); s1 = translate_expression(pattern, i, j); } if (s1 == NULL) { @@ -200,7 +222,7 @@ translate(PyObject *module, PyObject *pattern) } } } -#undef DROP_DUPLICATES +#undef SKIP_DUPLICATES #undef ADVANCE_TO_NEXT #undef _WHILE_READ_CMP #undef ADVANCE_IF_CHAR @@ -222,7 +244,7 @@ translate(PyObject *module, PyObject *pattern) // ==== Helper implementations ================================================ -PyObject * +static PyObject * get_unicode_character(Py_UCS4 ch) { assert(ch <= 0x10ffff); @@ -247,7 +269,7 @@ get_unicode_character(Py_UCS4 ch) return unicode; } -PyObject * +static PyObject * translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) { PyObject *chunks = PyList_New(0); @@ -259,21 +281,26 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) while (k < j) { PyObject *eobj = _PyObject_CallMethod(pattern, &_Py_ID(find), "sii", "-", k, j); if (eobj == NULL) { - goto error; + goto abort; } Py_ssize_t t = PyLong_AsSsize_t(eobj); Py_DECREF(eobj); if (t < 0) { - goto error; + if (PyErr_Occurred()) { + goto abort; + } + // -1 here means that '-' was not found + assert(t == -1); + break; } PyObject *sub = PyUnicode_Substring(pattern, i, t); if (sub == NULL) { - goto error; + goto abort; } int rc = PyList_Append(chunks, sub); Py_DECREF(sub); if (rc < 0) { - goto error; + goto abort; } chunkscount += 1; i = t + 1; @@ -282,27 +309,28 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) if (i >= j) { assert(chunkscount > 0); PyObject *chunk = PyList_GET_ITEM(chunks, chunkscount - 1); + assert(chunk != NULL); PyObject *hyphen = 
PyUnicode_FromOrdinal('-'); if (hyphen == NULL) { - goto error; + goto abort; } PyObject *repl = PyUnicode_Concat(chunk, hyphen); Py_DECREF(hyphen); - int rc = PyList_SetItem(chunks, chunkscount - 1, repl); - Py_DECREF(repl); - if (rc < 0) { - goto error; + // PyList_SetItem() does not create a new reference on 'repl' + // so we should not decref 'repl' after the call (I think?) + if (repl == NULL || PyList_SetItem(chunks, chunkscount - 1, repl) < 0) { + goto abort; } } else { PyObject *sub = PyUnicode_Substring(pattern, i, j); if (sub == NULL) { - goto error; + goto abort; } int rc = PyList_Append(chunks, sub); Py_DECREF(sub); if (rc < 0) { - goto error; + goto abort; } chunkscount += 1; } @@ -327,62 +355,60 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) if (c1sub == NULL || c2sub == NULL) { Py_XDECREF(c1sub); Py_XDECREF(c2sub); - goto error; + goto abort; } PyObject *merged = PyUnicode_Concat(c1sub, c2sub); Py_DECREF(c1sub); Py_DECREF(c2sub); if (merged == NULL) { - goto error; + goto abort; } int rc = PyList_SetItem(chunks, c - 1, merged); - Py_DECREF(merged); if (rc < 0) { - goto error; + goto abort; } if (PySequence_DelItem(chunks, c) < 0) { - goto error; + goto abort; } chunkscount--; } } + assert(chunkscount == PyList_GET_SIZE(chunks)); // Escape backslashes and hyphens for set difference (--), // but hyphens that create ranges should not be escaped. 
for (c = 0; c < chunkscount; ++c) { - PyObject *s0 = PyList_GetItem(chunks, c); - if (s0 == NULL) { - goto error; - } - PyObject *s1 = PyObject_CallMethod(s0, "replace", "ss", "\\", "\\\\"); + PyObject *s0 = PyList_GET_ITEM(chunks, c); + assert(s0 != NULL); + PyObject *s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); if (s1 == NULL) { - goto error; + goto abort; } - PyObject *s2 = PyObject_CallMethod(s1, "replace", "ss", "-", "\\-"); + PyObject *s2 = _PyObject_CallMethod(s1, &_Py_ID(replace), "ss", "-", "\\-"); Py_DECREF(s1); if (s2 == NULL) { - goto error; + goto abort; } if (PyList_SetItem(chunks, c, s2) < 0) { - goto error; + goto abort; } } PyObject *hyphen = PyUnicode_FromOrdinal('-'); if (hyphen == NULL) { - goto error; + goto abort; } PyObject *res = PyUnicode_Join(hyphen, chunks); Py_DECREF(hyphen); if (res == NULL) { - goto error; + goto abort; } Py_DECREF(chunks); return res; -error: +abort: Py_XDECREF(chunks); return NULL; } -Py_ssize_t +static Py_ssize_t write_literal(fnmatchmodule_state *state, _PyUnicodeWriter *writer, PyObject *unicode) @@ -403,77 +429,56 @@ write_literal(fnmatchmodule_state *state, return written; } -Py_ssize_t +static Py_ssize_t write_expression(_PyUnicodeWriter *writer, PyObject *expression) { -#define WRITE_ASCII(str, len) \ - do { \ - if (_PyUnicodeWriter_WriteASCIIString(writer, (str), (len)) < 0) { \ - return -1; \ - } \ - } while (0) - -#define WRITE_CHAR(c) \ - do { \ - if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) { \ - return -1; \ - } \ - } while (0) - - Py_ssize_t grouplen; - const char *buffer = PyUnicode_AsUTF8AndSize(expression, &grouplen); +#define WRITE_CHAR(c) _WRITE_CHAR_OR(writer, c, return -1) +#define WRITE_ASCII(s, n) _WRITE_ASCII_OR(writer, s, n, return -1) +#define WRITE_BLOCK(s, i, j) _WRITE_BLOCK_OR(writer, s, i, j, return -1) +#define WRITE_STRING(s) _WRITE_STRING_OR(writer, s, return -1) + Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { /* empty range: 
never match */ WRITE_ASCII("(?!)", 4); return 4; } - else if (grouplen == 1 && buffer[0] == '!') { + Py_UCS4 token = PyUnicode_READ_CHAR(expression, 0); + if (grouplen == 1 && token == '!') { /* negated empty range: match any character */ WRITE_CHAR('.'); return 1; } - else { - Py_ssize_t extra = 2; // '[' and ']' - WRITE_CHAR('['); - switch (buffer[0]) { - case '!': { - WRITE_CHAR('^'); - if (_PyUnicodeWriter_WriteSubstring(writer, expression, 1, grouplen) < 0) { - return -1; - } - break; - } - case '^': - case '[': { - WRITE_CHAR('\\'); - extra++; - break; - } - default: - if (_PyUnicodeWriter_WriteStr(writer, expression) < 0) { - return -1; - } - break; + Py_ssize_t extra = 2; // '[' and ']' + WRITE_CHAR('['); + switch (token) { + case '!': { + WRITE_CHAR('^'); + WRITE_BLOCK(expression, 1, grouplen); + break; + } + case '^': + case '[': { + WRITE_CHAR('\\'); + ++extra; + WRITE_STRING(expression); + break; + } + default: { + WRITE_STRING(expression); + break; } - WRITE_CHAR(']'); - return grouplen + extra; } -#undef WRITE_CHAR + WRITE_CHAR(']'); + return grouplen + extra; +#undef WRITE_STRING +#undef WRITE_BLOCK #undef WRITE_ASCII +#undef WRITE_CHAR } -PyObject * +static PyObject * process_wildcards(PyObject *pattern, PyObject *indices) { -#define WRITE_SUBSTRING(i, j) \ - do { \ - if ((i) < (j)) { /* write the substring if non-empty */ \ - if (_PyUnicodeWriter_WriteSubstring(_writer, pattern, (i), (j)) < 0) { \ - goto abort; \ - } \ - } \ - } while (0) - const Py_ssize_t m = PyList_GET_SIZE(indices); if (m == 0) { // just write fr'(?s:{parts} + ")\Z" @@ -502,6 +507,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) * of the translated pattern. 
*/ PyObject *jobj = PyList_GET_ITEM(indices, 0); + assert(jobj != NULL); j = PyLong_AsSsize_t(jobj); // get the first position of '*' if (j < 0) { return NULL; @@ -513,27 +519,32 @@ process_wildcards(PyObject *pattern, PyObject *indices) } _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); - WRITE_SUBSTRING(i, j); // write stuff before '*' if needed +#define WRITE_BLOCK(i, j) _WRITE_BLOCK_OR(_writer, pattern, i, j, goto abort) +#define WRITE_ATOMIC_BEGIN() _WRITE_ASCII_OR(_writer, "(?>.*?", 6, goto abort) +#define WRITE_ATOMIC_END() _WRITE_CHAR_OR(_writer, ')', goto abort) + + WRITE_BLOCK(i, j); // write stuff before '*' if needed i = j + 1; // jump after the '*' for (Py_ssize_t k = 1; k < m; ++k) { PyObject *ind = PyList_GET_ITEM(indices, k); + assert(ind != NULL); j = PyLong_AsSsize_t(ind); - assert(j < 0 || i < j); - if (j < 0 || - (_PyUnicodeWriter_WriteASCIIString(_writer, "(?>.*?", 6) < 0) || - (_PyUnicodeWriter_WriteSubstring(_writer, pattern, i, j) < 0) || - (_PyUnicodeWriter_WriteChar(_writer, ')') < 0)) - { + if (j < 0) { goto abort; } + assert(i < j); + // atomic group begin + WRITE_ATOMIC_BEGIN(); + WRITE_BLOCK(i, j); + WRITE_ATOMIC_END(); i = j + 1; } // handle the last group - if (_PyUnicodeWriter_WriteASCIIString(_writer, ".*", 2) < 0) { - goto abort; - } - WRITE_SUBSTRING(i, n); // write the remaining substring -#undef WRITE_SUBSTRING + _WRITE_ASCII_OR(_writer, ".*", 2, goto abort); + WRITE_BLOCK(i, n); // write the remaining substring +#undef WRITE_BLOCK +#undef WRITE_ATOMIC_END +#undef WRITE_ATOMIC_BEGIN PyObject *res = PyUnicodeWriter_Finish(writer); if (res == NULL) { return NULL; @@ -545,3 +556,13 @@ process_wildcards(PyObject *pattern, PyObject *indices) PyUnicodeWriter_Discard(writer); return NULL; } + +#undef _WRITE_BLOCK_OR +#undef _WRITE_BLOCK +#undef _WRITE_STRING_OR +#undef _WRITE_STRING +#undef _WRITE_ASCII_OR +#undef _WRITE_ASCII +#undef _WRITE_CHAR_OR +#undef _WRITE_CHAR +#undef _WRITE_OR_FAIL From 
f9343f3d545a84ec691b08322546403caff7e327 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:52:11 +0200 Subject: [PATCH 31/97] update generated objects --- Include/internal/pycore_global_objects_fini_generated.h | 1 + Include/internal/pycore_global_strings.h | 1 + Include/internal/pycore_runtime_init_generated.h | 1 + Include/internal/pycore_unicodeobject_generated.h | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index ec11eec5eec27d..44820e0ce13fad 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -951,6 +951,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fix_imports)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flags)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flush)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fnmatch)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fold)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(follow_symlinks)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(format)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index f27bdeb0183aec..c21492376395e4 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -440,6 +440,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(fix_imports) STRUCT_FOR_ID(flags) STRUCT_FOR_ID(flush) + STRUCT_FOR_ID(fnmatch) STRUCT_FOR_ID(fold) STRUCT_FOR_ID(follow_symlinks) STRUCT_FOR_ID(format) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index ab94af0cfb90c9..9a99b3645fb717 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ 
b/Include/internal/pycore_runtime_init_generated.h @@ -949,6 +949,7 @@ extern "C" { INIT_ID(fix_imports), \ INIT_ID(flags), \ INIT_ID(flush), \ + INIT_ID(fnmatch), \ INIT_ID(fold), \ INIT_ID(follow_symlinks), \ INIT_ID(format), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index a0e532edc1bfc6..83ece722c6fa9d 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1560,6 +1560,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(fnmatch); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(fold); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); From 46d7744399a314393e41d450608a0904d0107cc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:02:50 +0200 Subject: [PATCH 32/97] update module names --- Python/stdlib_module_names.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h index 9686d10563aa4d..4b20baaf24d0c8 100644 --- a/Python/stdlib_module_names.h +++ b/Python/stdlib_module_names.h @@ -32,6 +32,7 @@ static const char* _Py_stdlib_module_names[] = { "_dbm", "_decimal", "_elementtree", +"_fnmatch", "_frozen_importlib", "_frozen_importlib_external", "_functools", From 5bc902e23a2e05c63be7478e726c5fe0349be1d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 10 Jul 2024 19:17:33 +0200 Subject: [PATCH 33/97] fix smelly names --- Modules/_fnmatch/_fnmatchmodule.c | 16 +++++++-------- 
Modules/_fnmatch/_fnmatchmodule.h | 34 +++++++++++++++---------------- Modules/_fnmatch/posix.c | 20 +++++++++--------- Modules/_fnmatch/regex.c | 6 +++--- Modules/_fnmatch/translate.c | 2 +- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 6e4b23d83cfb53..77afbb9f819272 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -179,11 +179,11 @@ _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) // call os.fspath() on the names being matched, whereas it does on NT. if (PyBytes_Check(pat)) { const char *pattern = PyBytes_AS_STRING(pat); - return _posix_fnmatch_encoded_filter_cached(pattern, names); + return _Py_posix_fnmatch_encoded_filter_cached(pattern, names); } if (PyUnicode_Check(pat)) { const char *pattern = PyUnicode_AsUTF8(pat); - return _posix_fnmatch_unicode_filter_cached(pattern, names); + return _Py_posix_fnmatch_unicode_filter_cached(pattern, names); } PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return NULL; @@ -192,7 +192,7 @@ _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) if (matcher == NULL) { return NULL; } - PyObject *result = _regex_fnmatch_filter(matcher, names); + PyObject *result = _Py_regex_fnmatch_filter(matcher, names); Py_DECREF(matcher); return result; #endif @@ -246,11 +246,11 @@ _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) // the C implementation). 
if (PyBytes_Check(pat)) { const char *pattern = PyBytes_AS_STRING(pat); - return _posix_fnmatch_encoded_cached(pattern, name); + return _Py_posix_fnmatch_encoded_cached(pattern, name); } if (PyUnicode_Check(pat)) { const char *pattern = PyUnicode_AsUTF8(pat); - return _posix_fnmatch_unicode_cached(pattern, name); + return _Py_posix_fnmatch_unicode_cached(pattern, name); } PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return -1; @@ -259,7 +259,7 @@ _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) if (matcher == NULL) { return -1; } - int res = _regex_fnmatch_generic(matcher, name); + int res = _Py_regex_fnmatch_generic(matcher, name); Py_DECREF(matcher); return res; #endif @@ -284,7 +284,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) return NULL; } // translated regular expression as a str object - PyObject *str_expr = _regex_translate(module, unicode); + PyObject *str_expr = _Py_regex_translate(module, unicode); Py_DECREF(unicode); if (str_expr == NULL) { return NULL; @@ -294,7 +294,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) return expr; } else if (PyUnicode_Check(pattern)) { - return _regex_translate(module, pattern); + return _Py_regex_translate(module, pattern); } else { PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 2311e35efe691e..9a2128e6e005ae 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -45,17 +45,17 @@ get_fnmatchmodulestate_state(PyObject *module) * Construct a list of filtered names using fnmatch(3). */ extern PyObject * -_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names); -/* Same as _posix_fnmatch_encoded_filter() but for unicode inputs. */ +_Py_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names); +/* Same as _Py_posix_fnmatch_encoded_filter() but for unicode inputs. 
*/ extern PyObject * -_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names); +_Py_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names); -/* cached 'pattern' version of _posix_fnmatch_encoded_filter() */ +/* cached 'pattern' version of _Py_posix_fnmatch_encoded_filter() */ extern PyObject * -_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names); -/* cached 'pattern' version of _posix_fnmatch_unicode_filter() */ +_Py_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names); +/* cached 'pattern' version of _Py_posix_fnmatch_unicode_filter() */ extern PyObject * -_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names); +_Py_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names); /* * Perform a case-sensitive match using fnmatch(3). @@ -71,17 +71,17 @@ _posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names); * sets a TypeError exception, or (2) something went wrong. */ extern int -_posix_fnmatch_encoded(PyObject *pattern, PyObject *string); -/* Same as _posix_fnmatch_encoded() but for unicode inputs. */ +_Py_posix_fnmatch_encoded(PyObject *pattern, PyObject *string); +/* Same as _Py_posix_fnmatch_encoded() but for unicode inputs. 
*/ extern int -_posix_fnmatch_unicode(PyObject *pattern, PyObject *string); +_Py_posix_fnmatch_unicode(PyObject *pattern, PyObject *string); -/* cached 'pattern' version of _posix_fnmatch_encoded() */ +/* cached 'pattern' version of _Py_posix_fnmatch_encoded() */ extern int -_posix_fnmatch_encoded_cached(const char *pattern, PyObject *names); -/* cached 'pattern' version of _posix_fnmatch_encoded() */ +_Py_posix_fnmatch_encoded_cached(const char *pattern, PyObject *names); +/* cached 'pattern' version of _Py_posix_fnmatch_encoded() */ extern int -_posix_fnmatch_unicode_cached(const char *pattern, PyObject *names); +_Py_posix_fnmatch_unicode_cached(const char *pattern, PyObject *names); #endif /* @@ -98,7 +98,7 @@ _posix_fnmatch_unicode_cached(const char *pattern, PyObject *names); * and sets a TypeError exception, or (2) something went wrong. */ extern int -_regex_fnmatch_generic(PyObject *matcher, PyObject *string); +_Py_regex_fnmatch_generic(PyObject *matcher, PyObject *string); /* * Perform a case-sensitive match using compiled RE patterns. @@ -111,12 +111,12 @@ _regex_fnmatch_generic(PyObject *matcher, PyObject *string); * Returns a list of matched names, or NULL if an error occurred. */ extern PyObject * -_regex_fnmatch_filter(PyObject *matcher, PyObject *names); +_Py_regex_fnmatch_filter(PyObject *matcher, PyObject *names); /* * C accelerator for translating UNIX shell patterns into RE patterns. 
*/ extern PyObject * -_regex_translate(PyObject *module, PyObject *pattern); +_Py_regex_translate(PyObject *module, PyObject *pattern); #endif // _FNMATCHMODULE_H diff --git a/Modules/_fnmatch/posix.c b/Modules/_fnmatch/posix.c index d4fdbb42ba0210..45fe88b5440f74 100644 --- a/Modules/_fnmatch/posix.c +++ b/Modules/_fnmatch/posix.c @@ -42,49 +42,49 @@ _posix_fnmatch_filter_cached(const char *pattern, PyObject *names, Converter con // ==== API implementation ==================================================== inline PyObject * -_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names) +_Py_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names) { return _posix_fnmatch_filter(pattern, names, &from_encoded); } inline PyObject * -_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names) +_Py_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names) { return _posix_fnmatch_filter(pattern, names, &from_unicode); } inline PyObject * -_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names) +_Py_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names) { assert(pattern != NULL); return _posix_fnmatch_filter_cached(pattern, names, &from_encoded); } inline PyObject * -_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names) +_Py_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names) { assert(pattern != NULL); return _posix_fnmatch_filter_cached(pattern, names, &from_unicode); } inline int -_posix_fnmatch_encoded(PyObject *pattern, PyObject *string) +_Py_posix_fnmatch_encoded(PyObject *pattern, PyObject *string) { const char *p = from_encoded(pattern, INVALID_PATTERN_TYPE); if (p == NULL) { return -1; } - return _posix_fnmatch_encoded_cached(p, string); + return _Py_posix_fnmatch_encoded_cached(p, string); } inline int -_posix_fnmatch_unicode(PyObject *pattern, PyObject *string) +_Py_posix_fnmatch_unicode(PyObject *pattern, PyObject *string) { const char *p = 
from_unicode(pattern, INVALID_PATTERN_TYPE); if (p == NULL) { return -1; } - return _posix_fnmatch_unicode_cached(p, string); + return _Py_posix_fnmatch_unicode_cached(p, string); } #define PROCESS_MATCH_RESULT(r) \ @@ -97,7 +97,7 @@ _posix_fnmatch_unicode(PyObject *pattern, PyObject *string) } while (0) inline int -_posix_fnmatch_encoded_cached(const char *pattern, PyObject *string) +_Py_posix_fnmatch_encoded_cached(const char *pattern, PyObject *string) { assert(pattern != NULL); const char *s = from_encoded(string, INVALID_NAME_TYPE); @@ -108,7 +108,7 @@ _posix_fnmatch_encoded_cached(const char *pattern, PyObject *string) } inline int -_posix_fnmatch_unicode_cached(const char *pattern, PyObject *string) +_Py_posix_fnmatch_unicode_cached(const char *pattern, PyObject *string) { assert(pattern != NULL); const char *s = from_unicode(string, INVALID_NAME_TYPE); diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c index 807e773635a9da..b6715bb33283b0 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/regex.c @@ -5,7 +5,7 @@ // ==== API implementation ==================================================== inline int -_regex_fnmatch_generic(PyObject *matcher, PyObject *name) +_Py_regex_fnmatch_generic(PyObject *matcher, PyObject *name) { // If 'name' is of incorrect type, it will be detected when calling // the matcher function (we emulate 're.compile(...).match(name)'). 
@@ -20,7 +20,7 @@ _regex_fnmatch_generic(PyObject *matcher, PyObject *name) } PyObject * -_regex_fnmatch_filter(PyObject *matcher, PyObject *names) +_Py_regex_fnmatch_filter(PyObject *matcher, PyObject *names) { assert(PyCallable_Check(matcher)); PyObject *iter = PyObject_GetIter(names); @@ -36,7 +36,7 @@ _regex_fnmatch_filter(PyObject *matcher, PyObject *names) PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - int rc = _regex_fnmatch_generic(matcher, name); + int rc = _Py_regex_fnmatch_generic(matcher, name); if (rc < 0) { assert(PyErr_Occurred()); goto abort; diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index d4d63a2693b7de..40c42beebbec26 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -93,7 +93,7 @@ process_wildcards(PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== PyObject * -_regex_translate(PyObject *module, PyObject *pattern) +_Py_regex_translate(PyObject *module, PyObject *pattern) { #define READ(ind) PyUnicode_READ(kind, data, (ind)) #define ADVANCE_IF_CHAR(ch, ind, maxind) \ From 78140286e55ee4429708f371f761132f22ac91da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:46:06 +0200 Subject: [PATCH 34/97] fix translation unit - update comments - remove calls to private API in functions - more macro protection - fix refleaks --- Modules/_fnmatch/translate.c | 162 +++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 72 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 40c42beebbec26..8900800f0c933c 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -11,6 +11,8 @@ // ==== Helper declarations ================================================== +typedef fnmatchmodule_state State; + #define _WRITE_OR_FAIL(writeop, onerror) \ do { \ if ((writeop) < 0) 
{ \ @@ -18,22 +20,27 @@ } \ } while (0) -#define _WRITE_CHAR _PyUnicodeWriter_WriteChar -#define _WRITE_CHAR_OR(_writer, ch, onerror) \ - _WRITE_OR_FAIL(_WRITE_CHAR((_writer), (ch)), onerror) +#define _WRITE_CHAR(writer, ch) \ + _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(writer), (ch)) +#define _WRITE_CHAR_OR(writer, ch, onerror) \ + _WRITE_OR_FAIL(_WRITE_CHAR((writer), (ch)), onerror) -#define _WRITE_ASCII _PyUnicodeWriter_WriteASCIIString -#define _WRITE_ASCII_OR(_writer, ascii, length, onerror) \ - _WRITE_OR_FAIL(_WRITE_ASCII((_writer), (ascii), (length)), onerror) +#define _WRITE_ASCII(writer, ascii, length) \ + _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(writer), (ascii), (length)) +#define _WRITE_ASCII_OR(writer, ascii, length, onerror) \ + _WRITE_OR_FAIL(_WRITE_ASCII((writer), (ascii), (length)), onerror) -#define _WRITE_STRING _PyUnicodeWriter_WriteStr -#define _WRITE_STRING_OR(_writer, string, onerror) \ - _WRITE_OR_FAIL(_WRITE_STRING((_writer), (string)), onerror) +#define _WRITE_STRING(writer, string) \ + _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(writer), (string)) +#define _WRITE_STRING_OR(writer, string, onerror) \ + _WRITE_OR_FAIL(_WRITE_STRING((writer), (string)), onerror) -#define _WRITE_BLOCK _PyUnicodeWriter_WriteSubstring -#define _WRITE_BLOCK_OR(_writer, string, i, j, onerror) \ +#define _WRITE_BLOCK(writer, string, i, j) \ + _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(writer), (string), (i), (j)) +#define _WRITE_BLOCK_OR(writer, string, i, j, onerror) \ do { \ - if ((i) < (j) && _WRITE_BLOCK((_writer), (string), (i), (j)) < 0) { \ + Py_ssize_t _i = (i), _j = (j); /* to allow in-place operators on i or j */ \ + if (_i < _j && _WRITE_BLOCK((writer), (string), _i, _j) < 0) { \ onerror; \ } \ } while (0) @@ -70,9 +77,7 @@ translate_expression(PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); * This returns the number of written characters, or -1 if an error occurred. 
*/ static Py_ssize_t -write_literal(fnmatchmodule_state *state, - _PyUnicodeWriter *writer, - PyObject *unicode); +write_literal(State *state, PyUnicodeWriter *writer, PyObject *unicode); /* * Write the translated pattern obtained by translate_expression(). @@ -80,12 +85,12 @@ write_literal(fnmatchmodule_state *state, * This returns the number of written characters, or -1 if an error occurred. */ static Py_ssize_t -write_expression(_PyUnicodeWriter *writer, PyObject *expression); +write_expression(PyUnicodeWriter *writer, PyObject *expression); /* * Build the final regular expression by processing the wildcards. * - * The position of each wildcard in 'strings' is given by 'indices'. + * The position of each wildcard in 'pattern' is given by 'indices'. */ static PyObject * process_wildcards(PyObject *pattern, PyObject *indices); @@ -93,38 +98,52 @@ process_wildcards(PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== PyObject * -_Py_regex_translate(PyObject *module, PyObject *pattern) +_Py_fnmatch_translate(PyObject *module, PyObject *pattern) { #define READ(ind) PyUnicode_READ(kind, data, (ind)) #define ADVANCE_IF_CHAR(ch, ind, maxind) \ do { \ + /* the following forces ind to be a variable name */ \ + Py_ssize_t *Py_UNUSED(_ind) = &ind; \ if ((ind) < (maxind) && READ(ind) == (ch)) { \ ++ind; \ } \ } while (0) #define _WHILE_READ_CMP(ch, ind, maxind, cmp) \ do { \ + /* the following forces ind to be a variable name */ \ + Py_ssize_t *Py_UNUSED(_ind) = &ind; \ while ((ind) < (maxind) && READ(ind) cmp (ch)) { \ ++ind; \ } \ } while (0) -#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, !=) -#define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP(ch, from, maxind, ==) +#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), !=) +#define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), ==) - fnmatchmodule_state *state = 
get_fnmatchmodulestate_state(module); + State *state = get_fnmatchmodulestate_state(module); PyObject *re = state->re_module; const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); - // We would write less data if there are successive '*', which should - // not be the case in general. Otherwise, we write >= n characters - // since escaping them would always add more characters so we will - // overestimate a bit the number of characters to write. + // We would write less data if there are successive '*', + // which should not be the case in general. Otherwise, + // we write >= n characters since escaping them always + // add more characters. + // + // Note that only b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f' need to + // be escaped when translated to RE patterns and '*' and '?' + // are already handled without being escaped. + // + // In general, UNIX style patterns are more likely to contain + // wildcards than characters to be escaped, with the exception + // of '-', '\' and '~' (we usually want to match filenmaes), + // and there is a sparse number of them. Therefore, we only + // estimate the number of characters to be written to be the + // same as the number of characters in the pattern. // - // TODO(picnixz): should we limit the estimation or not? - PyUnicodeWriter *writer = PyUnicodeWriter_Create((Py_ssize_t)(1.05 * n)); + // TODO: (picnixz): should we limit the estimation in case of a failure? 
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(n); if (writer == NULL) { return NULL; } - _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); // list containing the indices where '*' has a special meaning PyObject *indices = PyList_New(0); if (indices == NULL) { @@ -138,7 +157,7 @@ _Py_regex_translate(PyObject *module, PyObject *pattern) Py_UCS4 chr = READ(i++); switch (chr) { case '*': { - _WRITE_CHAR_OR(_writer, chr, goto abort); + _WRITE_CHAR_OR(writer, chr, goto abort); SKIP_DUPLICATES('*', i, n); PyObject *index = PyLong_FromSsize_t(h++); if (index == NULL) { @@ -153,7 +172,7 @@ _Py_regex_translate(PyObject *module, PyObject *pattern) } case '?': { // translate optional '?' (fnmatch) into optional '.' (regex) - _WRITE_CHAR_OR(_writer, '.', goto abort); + _WRITE_CHAR_OR(writer, '.', goto abort); ++h; // increase the expected result's length break; } @@ -163,7 +182,7 @@ _Py_regex_translate(PyObject *module, PyObject *pattern) ADVANCE_IF_CHAR(']', j, n); // [!] or [] ADVANCE_TO_NEXT(']', j, n); // locate closing ']' if (j >= n) { - _WRITE_ASCII_OR(_writer, "\\[", 2, goto abort); + _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); h += 2; // we just wrote 2 characters break; // early break for clarity } @@ -171,21 +190,21 @@ _Py_regex_translate(PyObject *module, PyObject *pattern) // v--- pattern[j] (exclusive) // '[' * ... 
* ']' // ^----- pattern[i] (inclusive) - int rc = PyUnicode_FindChar(pattern, '-', i, j, 1); - if (rc == -2) { + int pos = PyUnicode_FindChar(pattern, '-', i, j, 1); + if (pos == -2) { goto abort; } PyObject *s1 = NULL, *s2 = NULL; - if (rc == -1) { - PyObject *group = PyUnicode_Substring(pattern, i, j); - if (group == NULL) { + if (pos == -1) { + PyObject *s0 = PyUnicode_Substring(pattern, i, j); + if (s0 == NULL) { goto abort; } - s1 = _PyObject_CallMethod(group, &_Py_ID(replace), "ss", "\\", "\\\\"); - Py_DECREF(group); + s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); + Py_DECREF(s0); } else { - assert(rc >= 0); + assert(pos >= 0); assert(READ(j) == ']'); s1 = translate_expression(pattern, i, j); } @@ -197,7 +216,7 @@ _Py_regex_translate(PyObject *module, PyObject *pattern) if (s2 == NULL) { goto abort; } - int difflen = write_expression(_writer, s2); + int difflen = write_expression(writer, s2); Py_DECREF(s2); if (difflen < 0) { goto abort; @@ -212,7 +231,7 @@ _Py_regex_translate(PyObject *module, PyObject *pattern) if (str == NULL) { goto abort; } - int difflen = write_literal(state, _writer, str); + int difflen = write_literal(state, writer, str); Py_DECREF(str); if (difflen < 0) { goto abort; @@ -317,8 +336,10 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) PyObject *repl = PyUnicode_Concat(chunk, hyphen); Py_DECREF(hyphen); // PyList_SetItem() does not create a new reference on 'repl' - // so we should not decref 'repl' after the call (I think?) + // so we should not decref 'repl' after the call, unless there + // is an issue while setting the item. 
if (repl == NULL || PyList_SetItem(chunks, chunkscount - 1, repl) < 0) { + Py_XDECREF(repl); goto abort; } } @@ -360,11 +381,11 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) PyObject *merged = PyUnicode_Concat(c1sub, c2sub); Py_DECREF(c1sub); Py_DECREF(c2sub); - if (merged == NULL) { - goto abort; - } - int rc = PyList_SetItem(chunks, c - 1, merged); - if (rc < 0) { + // PyList_SetItem() does not create a new reference on 'merged' + // so we should not decref 'merged' after the call, unless there + // is an issue while setting the item. + if (merged == NULL || PyList_SetItem(chunks, c - 1, merged) < 0) { + Py_XDECREF(merged); goto abort; } if (PySequence_DelItem(chunks, c) < 0) { @@ -385,10 +406,11 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) } PyObject *s2 = _PyObject_CallMethod(s1, &_Py_ID(replace), "ss", "-", "\\-"); Py_DECREF(s1); - if (s2 == NULL) { - goto abort; - } - if (PyList_SetItem(chunks, c, s2) < 0) { + // PyList_SetItem() does not create a new reference on 's2' + // so we should not decref 's2' after the call, unless there + // is an issue while setting the item. 
+ if (s2 == NULL || PyList_SetItem(chunks, c, s2) < 0) { + Py_XDECREF(s2); goto abort; } } @@ -409,9 +431,7 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) } static Py_ssize_t -write_literal(fnmatchmodule_state *state, - _PyUnicodeWriter *writer, - PyObject *unicode) +write_literal(State *state, PyUnicodeWriter *writer, PyObject *unicode) { PyObject *escaped = PyObject_CallMethodOneArg(state->re_module, &_Py_ID(escape), @@ -420,7 +440,8 @@ write_literal(fnmatchmodule_state *state, return -1; } Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); - int rc = _PyUnicodeWriter_WriteStr(writer, escaped); + assert(written >= 0); + int rc = _WRITE_STRING(writer, escaped); Py_DECREF(escaped); if (rc < 0) { return -1; @@ -430,12 +451,12 @@ write_literal(fnmatchmodule_state *state, } static Py_ssize_t -write_expression(_PyUnicodeWriter *writer, PyObject *expression) +write_expression(PyUnicodeWriter *writer, PyObject *expression) { -#define WRITE_CHAR(c) _WRITE_CHAR_OR(writer, c, return -1) -#define WRITE_ASCII(s, n) _WRITE_ASCII_OR(writer, s, n, return -1) -#define WRITE_BLOCK(s, i, j) _WRITE_BLOCK_OR(writer, s, i, j, return -1) -#define WRITE_STRING(s) _WRITE_STRING_OR(writer, s, return -1) +#define WRITE_CHAR(c) _WRITE_CHAR_OR(writer, (c), return -1) +#define WRITE_ASCII(s, n) _WRITE_ASCII_OR(writer, (s), (n), return -1) +#define WRITE_BLOCK(s, i, j) _WRITE_BLOCK_OR(writer, (s), (i), (j), return -1) +#define WRITE_STRING(s) _WRITE_STRING_OR(writer, (s), return -1) Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { /* empty range: never match */ @@ -452,14 +473,14 @@ write_expression(_PyUnicodeWriter *writer, PyObject *expression) WRITE_CHAR('['); switch (token) { case '!': { - WRITE_CHAR('^'); + WRITE_CHAR('^'); // replace '!' 
by '^' WRITE_BLOCK(expression, 1, grouplen); break; } case '^': case '[': { WRITE_CHAR('\\'); - ++extra; + ++extra; // because we wrote '\\' WRITE_STRING(expression); break; } @@ -499,7 +520,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) * the STRING by "(?>.*?" and ")", and thus we will write at * least 7 + len(STRING) characters. * - * We write one additional '.*' if indices[-1] + 1 = n. + * We write one additional '.*' if indices[-1] + 1 == n. * * Since the result is surrounded by "(?s:" and ")\Z", we * write at least "indices[0] + 7m + n + 6" characters, @@ -517,12 +538,9 @@ process_wildcards(PyObject *pattern, PyObject *indices) if (writer == NULL) { return NULL; } - _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)(writer); - -#define WRITE_BLOCK(i, j) _WRITE_BLOCK_OR(_writer, pattern, i, j, goto abort) -#define WRITE_ATOMIC_BEGIN() _WRITE_ASCII_OR(_writer, "(?>.*?", 6, goto abort) -#define WRITE_ATOMIC_END() _WRITE_CHAR_OR(_writer, ')', goto abort) - +#define WRITE_BLOCK(i, j) _WRITE_BLOCK_OR(writer, pattern, (i), (j), goto abort) +#define WRITE_ATOMIC_BEGIN() _WRITE_ASCII_OR(writer, "(?>.*?", 6, goto abort) +#define WRITE_ATOMIC_END() _WRITE_CHAR_OR(writer, ')', goto abort) WRITE_BLOCK(i, j); // write stuff before '*' if needed i = j + 1; // jump after the '*' for (Py_ssize_t k = 1; k < m; ++k) { @@ -533,15 +551,15 @@ process_wildcards(PyObject *pattern, PyObject *indices) goto abort; } assert(i < j); - // atomic group begin + // write the atomic RE group WRITE_ATOMIC_BEGIN(); WRITE_BLOCK(i, j); WRITE_ATOMIC_END(); i = j + 1; } // handle the last group - _WRITE_ASCII_OR(_writer, ".*", 2, goto abort); - WRITE_BLOCK(i, n); // write the remaining substring + _WRITE_ASCII_OR(writer, ".*", 2, goto abort); + WRITE_BLOCK(i, n); // write the remaining substring (if non-empty) #undef WRITE_BLOCK #undef WRITE_ATOMIC_END #undef WRITE_ATOMIC_BEGIN From 3f075bbc2233e98fb49d68965e9a4bed27607199 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:46:58 +0200 Subject: [PATCH 35/97] remove fnmatch(3) detection --- configure.ac | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/configure.ac b/configure.ac index 6093c994bd13af..bd1440fdd5d6df 100644 --- a/configure.ac +++ b/configure.ac @@ -3854,30 +3854,6 @@ if test "$ac_cv_c_complex_supported" = "yes"; then [Defined if _Complex C type is available.]) fi -# check for fnmatch(3) support -# -# We test for the plain POSIX implementation (case-sensitive match). -# -# To ensure that the implementation of fnmatch(3) is compliant -# we run some tests to make sure that everything works well. -# -# Note that MSVC does not support fnmatch(3). -AC_CACHE_CHECK([for case-sensititve fnmatch(3)], [ac_cv_fnmatch_supported], -[AC_RUN_IFELSE( - [AC_LANG_PROGRAM([@%:@include ], [[ - exit(!( - fnmatch("a*", "abc", 0) != FNM_NOMATCH && - fnmatch("a*", "Abc", 0) == FNM_NOMATCH - )); - ]])], [ac_cv_fnmatch_supported=yes], - [ac_cv_fnmatch_supported=no], - [ac_cv_fnmatch_supported=no] -)]) -if test "$ac_cv_fnmatch_supported" = "yes"; then - AC_DEFINE([Py_HAVE_FNMATCH], [1], - [Defined if case-sensitive fnmatch(3) is supported.]) -fi - # check for systems that require aligned memory access AC_CACHE_CHECK([aligned memory access is required], [ac_cv_aligned_required], [AC_RUN_IFELSE([AC_LANG_SOURCE([[ From c78a813b0e988012718ce1f83ca1edb84d5aaecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:47:42 +0200 Subject: [PATCH 36/97] remove fnmatch(3) layout --- Makefile.pre.in | 1 - Modules/Setup.bootstrap.in | 2 +- Modules/_fnmatch/_fnmatchmodule.c | 39 +----- Modules/_fnmatch/_fnmatchmodule.h | 70 +---------- Modules/_fnmatch/posix.c | 191 ------------------------------ 5 files changed, 11 insertions(+), 292 deletions(-) delete mode 100644 Modules/_fnmatch/posix.c diff 
--git a/Makefile.pre.in b/Makefile.pre.in index ec99023f36b2b3..b751da584fb956 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -354,7 +354,6 @@ FNMATCH_H= Modules/_fnmatch/_fnmatchmodule.h FNMATCH_OBJS= \ Modules/_fnmatch/_fnmatchmodule.o \ - Modules/_fnmatch/posix.o \ Modules/_fnmatch/regex.o \ Modules/_fnmatch/translate.o diff --git a/Modules/Setup.bootstrap.in b/Modules/Setup.bootstrap.in index c54cd207aec57d..4001650e77682c 100644 --- a/Modules/Setup.bootstrap.in +++ b/Modules/Setup.bootstrap.in @@ -35,7 +35,7 @@ _stat _stat.c _symtable symtablemodule.c # miscellaneous accelerators -_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/posix.c _fnmatch/regex.c _fnmatch/translate.c +_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/regex.c _fnmatch/translate.c # for systems without $HOME env, used by site._getuserbase() @MODULE_PWD_TRUE@pwd pwdmodule.c diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 77afbb9f819272..d397785172eee8 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -174,28 +174,13 @@ static PyObject * _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) /*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ { -#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) - // Note that the Python implementation of fnmatch.filter() does not - // call os.fspath() on the names being matched, whereas it does on NT. 
- if (PyBytes_Check(pat)) { - const char *pattern = PyBytes_AS_STRING(pat); - return _Py_posix_fnmatch_encoded_filter_cached(pattern, names); - } - if (PyUnicode_Check(pat)) { - const char *pattern = PyUnicode_AsUTF8(pat); - return _Py_posix_fnmatch_unicode_filter_cached(pattern, names); - } - PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); - return NULL; -#else PyObject *matcher = get_matcher_function(module, pat); if (matcher == NULL) { return NULL; } - PyObject *result = _Py_regex_fnmatch_filter(matcher, names); + PyObject *result = _Py_fnmatch_filter(matcher, names); Py_DECREF(matcher); return result; -#endif } /*[clinic input] @@ -240,29 +225,13 @@ static int _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) /*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ { -#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) - // This function does not transform path-like objects, nor does it - // case-normalize 'name' or 'pattern' (whether it is the Python or - // the C implementation). 
- if (PyBytes_Check(pat)) { - const char *pattern = PyBytes_AS_STRING(pat); - return _Py_posix_fnmatch_encoded_cached(pattern, name); - } - if (PyUnicode_Check(pat)) { - const char *pattern = PyUnicode_AsUTF8(pat); - return _Py_posix_fnmatch_unicode_cached(pattern, name); - } - PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); - return -1; -#else PyObject *matcher = get_matcher_function(module, pat); if (matcher == NULL) { return -1; } - int res = _Py_regex_fnmatch_generic(matcher, name); + int res = _Py_fnmatch_fnmatch(matcher, name); Py_DECREF(matcher); return res; -#endif } /*[clinic input] @@ -284,7 +253,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) return NULL; } // translated regular expression as a str object - PyObject *str_expr = _Py_regex_translate(module, unicode); + PyObject *str_expr = _Py_fnmatch_translate(module, unicode); Py_DECREF(unicode); if (str_expr == NULL) { return NULL; @@ -294,7 +263,7 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) return expr; } else if (PyUnicode_Check(pattern)) { - return _Py_regex_translate(module, pattern); + return _Py_fnmatch_translate(module, pattern); } else { PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 9a2128e6e005ae..cbedaccf80c905 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -1,9 +1,5 @@ /* -* C accelerator for the 'fnmatch' module (POSIX only). - * - * Most functions expect string or bytes instances, and thus the Python - * implementation should first pre-process path-like objects, possibly - * applying normalizations depending on the platform if needed. + * C accelerator for the 'fnmatch' module. */ #ifndef _FNMATCHMODULE_H @@ -11,18 +7,6 @@ #include "Python.h" -#undef Py_USE_FNMATCH_FALLBACK -/* - * For now, only test the C acceleration of the Python implementation. 
- * - * TODO(picnixz): Currently, I don't know how to handle backslashes - * TODO(picnixz): in fnmatch(3) so that they are treated correctly - * TODO(picnixz): depending on whether the string was a raw string - * TODO(picnixz): or not. To see the bug, uncomment the following - * TODO(picnixz): macro and run the tests. - */ -#define Py_USE_FNMATCH_FALLBACK 1 - typedef struct { PyObject *py_module; // 'fnmatch' module PyObject *re_module; // 're' module @@ -40,50 +24,6 @@ get_fnmatchmodulestate_state(PyObject *module) return (fnmatchmodule_state *)state; } -#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) -/* - * Construct a list of filtered names using fnmatch(3). - */ -extern PyObject * -_Py_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names); -/* Same as _Py_posix_fnmatch_encoded_filter() but for unicode inputs. */ -extern PyObject * -_Py_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names); - -/* cached 'pattern' version of _Py_posix_fnmatch_encoded_filter() */ -extern PyObject * -_Py_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names); -/* cached 'pattern' version of _Py_posix_fnmatch_unicode_filter() */ -extern PyObject * -_Py_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names); - -/* - * Perform a case-sensitive match using fnmatch(3). - * - * Parameters - * - * pattern A UNIX shell pattern. - * string The string to match (bytes object). - * - * Returns 1 if the 'string' matches the 'pattern' and 0 otherwise. - * - * Returns -1 if (1) 'string' is not a `bytes` object, and - * sets a TypeError exception, or (2) something went wrong. - */ -extern int -_Py_posix_fnmatch_encoded(PyObject *pattern, PyObject *string); -/* Same as _Py_posix_fnmatch_encoded() but for unicode inputs. 
*/ -extern int -_Py_posix_fnmatch_unicode(PyObject *pattern, PyObject *string); - -/* cached 'pattern' version of _Py_posix_fnmatch_encoded() */ -extern int -_Py_posix_fnmatch_encoded_cached(const char *pattern, PyObject *names); -/* cached 'pattern' version of _Py_posix_fnmatch_encoded() */ -extern int -_Py_posix_fnmatch_unicode_cached(const char *pattern, PyObject *names); -#endif - /* * Test whether a name matches a compiled RE pattern. * @@ -98,7 +38,7 @@ _Py_posix_fnmatch_unicode_cached(const char *pattern, PyObject *names); * and sets a TypeError exception, or (2) something went wrong. */ extern int -_Py_regex_fnmatch_generic(PyObject *matcher, PyObject *string); +_Py_fnmatch_fnmatch(PyObject *matcher, PyObject *string); /* * Perform a case-sensitive match using compiled RE patterns. @@ -111,12 +51,14 @@ _Py_regex_fnmatch_generic(PyObject *matcher, PyObject *string); * Returns a list of matched names, or NULL if an error occurred. */ extern PyObject * -_Py_regex_fnmatch_filter(PyObject *matcher, PyObject *names); +_Py_fnmatch_filter(PyObject *matcher, PyObject *names); /* * C accelerator for translating UNIX shell patterns into RE patterns. + * + * Note: this is the C implementation of fnmatch.translate(). 
*/ extern PyObject * -_Py_regex_translate(PyObject *module, PyObject *pattern); +_Py_fnmatch_translate(PyObject *module, PyObject *pattern); #endif // _FNMATCHMODULE_H diff --git a/Modules/_fnmatch/posix.c b/Modules/_fnmatch/posix.c deleted file mode 100644 index 45fe88b5440f74..00000000000000 --- a/Modules/_fnmatch/posix.c +++ /dev/null @@ -1,191 +0,0 @@ -#include "Python.h" - -#include "_fnmatchmodule.h" // for pre-declarations - -#if defined(Py_HAVE_FNMATCH) && !defined(Py_USE_FNMATCH_FALLBACK) - -#include // for fnmatch(3) - -#define INVALID_PATTERN_TYPE "pattern must be a %s object, got %.200s" -#define INVALID_NAME_TYPE "name must be a %s object, got %.200s" - -// ==== Helper declarations =================================================== - -/* - * Return a bytes object as a "const char *", or NULL on error. - * - * The 'error' message is either INVALID_PATTERN_TYPE or INVALID_NAME_TYPE, - * and is used to set a TypeError if 'arg' is of incorrect type. - */ -static inline const char * -from_encoded(PyObject *arg, const char *error); - -/* - * Return a str object as a "const char *", or NULL on error. - * - * The 'error' message is either INVALID_PATTERN_TYPE or INVALID_NAME_TYPE - * and is used to set a TypeError if 'arg' is of incorrect type. - */ -static inline const char * -from_unicode(PyObject *arg, const char *error); - -/* The type of from_encoded() or from_unicode() conversion functions. 
*/ -typedef const char *(*Converter)(PyObject *string, const char *error); - -static inline PyObject * -_posix_fnmatch_filter(PyObject *pattern, PyObject *names, Converter converter); - -/* cached 'pattern' version of _posix_fnmatch_filter() */ -static /* not inline */ PyObject * -_posix_fnmatch_filter_cached(const char *pattern, PyObject *names, Converter converter); - -// ==== API implementation ==================================================== - -inline PyObject * -_Py_posix_fnmatch_encoded_filter(PyObject *pattern, PyObject *names) -{ - return _posix_fnmatch_filter(pattern, names, &from_encoded); -} - -inline PyObject * -_Py_posix_fnmatch_unicode_filter(PyObject *pattern, PyObject *names) -{ - return _posix_fnmatch_filter(pattern, names, &from_unicode); -} - -inline PyObject * -_Py_posix_fnmatch_encoded_filter_cached(const char *pattern, PyObject *names) -{ - assert(pattern != NULL); - return _posix_fnmatch_filter_cached(pattern, names, &from_encoded); -} - -inline PyObject * -_Py_posix_fnmatch_unicode_filter_cached(const char *pattern, PyObject *names) -{ - assert(pattern != NULL); - return _posix_fnmatch_filter_cached(pattern, names, &from_unicode); -} - -inline int -_Py_posix_fnmatch_encoded(PyObject *pattern, PyObject *string) -{ - const char *p = from_encoded(pattern, INVALID_PATTERN_TYPE); - if (p == NULL) { - return -1; - } - return _Py_posix_fnmatch_encoded_cached(p, string); -} - -inline int -_Py_posix_fnmatch_unicode(PyObject *pattern, PyObject *string) -{ - const char *p = from_unicode(pattern, INVALID_PATTERN_TYPE); - if (p == NULL) { - return -1; - } - return _Py_posix_fnmatch_unicode_cached(p, string); -} - -#define PROCESS_MATCH_RESULT(r) \ - do { \ - int res = (r); \ - if (res < 0) { \ - return res; \ - } \ - return res != FNM_NOMATCH; \ - } while (0) - -inline int -_Py_posix_fnmatch_encoded_cached(const char *pattern, PyObject *string) -{ - assert(pattern != NULL); - const char *s = from_encoded(string, INVALID_NAME_TYPE); - if (s == NULL) { 
- return -1; - } - PROCESS_MATCH_RESULT(fnmatch(pattern, s, 0)); -} - -inline int -_Py_posix_fnmatch_unicode_cached(const char *pattern, PyObject *string) -{ - assert(pattern != NULL); - const char *s = from_unicode(string, INVALID_NAME_TYPE); - if (s == NULL) { - return -1; - } - PROCESS_MATCH_RESULT(fnmatch(pattern, s, 0)); -} - -#undef PROCESS_MATCH_RESULT - -// ==== Helper implementations ================================================ - -#define GENERATE_CONVERTER(function, predicate, converter, expecting) \ - static inline const char * \ - function(PyObject *arg, const char *error) \ - { \ - if (!predicate(arg)) { \ - PyErr_Format(PyExc_TypeError, error, expecting, Py_TYPE(arg)->tp_name); \ - return NULL; \ - } \ - return converter(arg); \ - } -GENERATE_CONVERTER(from_encoded, PyBytes_Check, PyBytes_AS_STRING, "bytes") -GENERATE_CONVERTER(from_unicode, PyUnicode_Check, PyUnicode_AsUTF8, "str") -#undef GENERATE_CONVERTER - -static inline PyObject * -_posix_fnmatch_filter(PyObject *pattern, PyObject *names, Converter converter) -{ - const char *p = converter(pattern, INVALID_PATTERN_TYPE); - if (p == NULL) { - return NULL; - } - return _posix_fnmatch_filter_cached(p, names, converter); -} - -static PyObject * -_posix_fnmatch_filter_cached(const char *pattern, PyObject *names, Converter converter) -{ - assert(pattern != NULL); - PyObject *iter = PyObject_GetIter(names); - if (iter == NULL) { - return NULL; - } - PyObject *res = PyList_New(0); - if (res == NULL) { - Py_DECREF(iter); - return NULL; - } - PyObject *name = NULL; - while ((name = PyIter_Next(iter))) { - const char *n = converter(name, INVALID_NAME_TYPE); - if (n == NULL) { - goto abort; - } - if (fnmatch(pattern, n, 0) != FNM_NOMATCH) { - if (PyList_Append(res, name) < 0) { - goto abort; - } - } - Py_DECREF(name); - if (PyErr_Occurred()) { - Py_DECREF(res); - Py_DECREF(iter); - return NULL; - } - } - Py_DECREF(iter); - return res; -abort: - Py_XDECREF(name); - Py_DECREF(iter); - Py_DECREF(res); - 
return NULL; -} - -#undef INVALID_NAME_TYPE -#undef INVALID_PATTERN_TYPE -#endif From 2e166cce9adaba4a0cf5116c0523d7049486325f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:49:13 +0200 Subject: [PATCH 37/97] update configuration scripts --- configure | 54 --------------------------------------------------- pyconfig.h.in | 3 --- 2 files changed, 57 deletions(-) diff --git a/configure b/configure index 0fefae0032587f..7d3934825cfd6e 100755 --- a/configure +++ b/configure @@ -14044,60 +14044,6 @@ printf "%s\n" "#define Py_HAVE_C_COMPLEX 1" >>confdefs.h fi -# check for fnmatch(3) support -# -# We test for the plain POSIX implementation (case-sensitive match). -# -# To ensure that the implementation of fnmatch(3) is compliant -# we run some tests to make sure that everything works well. -# -# Note that MSVC does not support fnmatch(3). -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for case-sensititve fnmatch(3)" >&5 -printf %s "checking for case-sensititve fnmatch(3)... " >&6; } -if test ${ac_cv_fnmatch_supported+y} -then : - printf %s "(cached) " >&6 -else $as_nop - if test "$cross_compiling" = yes -then : - ac_cv_fnmatch_supported=no - -else $as_nop - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -#include -int -main (void) -{ - - exit(!( - fnmatch("a*", "abc", 0) != FNM_NOMATCH && - fnmatch("a*", "Abc", 0) == FNM_NOMATCH - )); - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO" -then : - ac_cv_fnmatch_supported=yes -else $as_nop - ac_cv_fnmatch_supported=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fnmatch_supported" >&5 -printf "%s\n" "$ac_cv_fnmatch_supported" >&6; } -if test "$ac_cv_fnmatch_supported" = "yes"; then - -printf "%s\n" "#define Py_HAVE_FNMATCH 1" >>confdefs.h - -fi - # check for systems that require aligned memory access { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking aligned memory access is required" >&5 printf %s "checking aligned memory access is required... " >&6; } diff --git a/pyconfig.h.in b/pyconfig.h.in index 0997722334867c..8fbba7ed3b949e 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1689,9 +1689,6 @@ /* Defined if _Complex C type is available. */ #undef Py_HAVE_C_COMPLEX -/* Defined if case-sensitive fnmatch(3) is supported. */ -#undef Py_HAVE_FNMATCH - /* Define if year with century should be normalized for strftime. 
*/ #undef Py_NORMALIZE_CENTURY From 14cd1fde9bdb16bcf295d07ce6408df88bb17bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:51:31 +0200 Subject: [PATCH 38/97] update function names --- Modules/_fnmatch/regex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c index b6715bb33283b0..9ebf0c90dbf746 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/regex.c @@ -5,7 +5,7 @@ // ==== API implementation ==================================================== inline int -_Py_regex_fnmatch_generic(PyObject *matcher, PyObject *name) +_Py_fnmatch_fnmatch(PyObject *matcher, PyObject *name) { // If 'name' is of incorrect type, it will be detected when calling // the matcher function (we emulate 're.compile(...).match(name)'). @@ -20,7 +20,7 @@ _Py_regex_fnmatch_generic(PyObject *matcher, PyObject *name) } PyObject * -_Py_regex_fnmatch_filter(PyObject *matcher, PyObject *names) +_Py_fnmatch_filter(PyObject *matcher, PyObject *names) { assert(PyCallable_Check(matcher)); PyObject *iter = PyObject_GetIter(names); @@ -36,7 +36,7 @@ _Py_regex_fnmatch_filter(PyObject *matcher, PyObject *names) PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - int rc = _Py_regex_fnmatch_generic(matcher, name); + int rc = _Py_fnmatch_fnmatch(matcher, name); if (rc < 0) { assert(PyErr_Occurred()); goto abort; From 2a718f4d71b0733ab04ff29a6e950d6b67329c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 16:57:28 +0200 Subject: [PATCH 39/97] make the C interface equivalent to the Python one --- Modules/_fnmatch/_fnmatchmodule.c | 99 +++++++++++++++++++++---------- Modules/_fnmatch/_fnmatchmodule.h | 9 ++- Modules/_fnmatch/regex.c | 56 +++++++++++++++-- 3 files changed, 125 insertions(+), 39 deletions(-) diff --git 
a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index d397785172eee8..0363fb98830883 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -26,7 +26,7 @@ fnmatchmodule_get_matcher_function(PyObject *module, PyObject *pattern) } fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); // compile the pattern - PyObject *compiled = _PyObject_CallMethod(st->re_module, &_Py_ID(compile), "O", expr); + PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, &_Py_ID(compile), expr); Py_DECREF(expr); if (compiled == NULL) { return NULL; @@ -62,9 +62,7 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) if (maxsize == NULL) { return -1; } - PyObject *args[] = {NULL, maxsize, Py_True}; - size_t nargsf = 2 | PY_VECTORCALL_ARGUMENTS_OFFSET; - PyObject *decorator = PyObject_Vectorcall(st->lru_cache, args + 1, nargsf, NULL); + PyObject *decorator = PyObject_CallFunctionObjArgs(st->lru_cache, maxsize, Py_True, NULL); Py_DECREF(maxsize); if (decorator == NULL) { return -1; @@ -86,35 +84,30 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) static inline PyObject * get_matcher_function(PyObject *module, PyObject *pattern) { - assert(module != NULL); - assert(pattern != NULL); fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); assert(st->translator != NULL); - size_t nargsf = 1 | PY_VECTORCALL_ARGUMENTS_OFFSET; - return PyObject_Vectorcall(st->translator, &pattern, nargsf, NULL); + return PyObject_CallOneArg(st->translator, pattern); } // ==== Module state functions ================================================ -#define IMPORT_MODULE(state, attribute, name) \ +static int +fnmatchmodule_exec(PyObject *module) +{ +#define IMPORT_MODULE(attribute, name) \ do { \ - state->attribute = NULL; \ - state->attribute = PyImport_ImportModule((name)); \ - if (state->attribute == NULL) { \ + st->attribute = NULL; \ + st->attribute = 
PyImport_ImportModule((name)); \ + if (st->attribute == NULL) { \ return -1; \ } \ } while (0) -static int -fnmatchmodule_exec(PyObject *module) -{ fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); - st->py_module = NULL; - IMPORT_MODULE(st, py_module, "fnmatch"); - st->os_module = NULL; - IMPORT_MODULE(st, os_module, "os"); - st->re_module = NULL; - IMPORT_MODULE(st, re_module, "re"); + IMPORT_MODULE(os_module, "os"); + IMPORT_MODULE(posixpath_module, "posixpath"); + IMPORT_MODULE(re_module, "re"); +#undef IMPORT_MODULE st->lru_cache = NULL; if (fnmatchmodule_load_lru_cache(module, st) < 0) { return -1; @@ -125,14 +118,13 @@ fnmatchmodule_exec(PyObject *module) } return 0; } -#undef IMPORT_MODULE static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); - Py_VISIT(st->py_module); Py_VISIT(st->os_module); + Py_VISIT(st->posixpath_module); Py_VISIT(st->re_module); Py_VISIT(st->lru_cache); Py_VISIT(st->translator); @@ -143,8 +135,8 @@ static int fnmatchmodule_clear(PyObject *m) { fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); - Py_CLEAR(st->py_module); Py_CLEAR(st->os_module); + Py_CLEAR(st->posixpath_module); Py_CLEAR(st->re_module); Py_CLEAR(st->lru_cache); Py_CLEAR(st->translator); @@ -174,12 +166,40 @@ static PyObject * _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) /*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ { - PyObject *matcher = get_matcher_function(module, pat); + fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); + if (os_path == NULL) { + return NULL; + } + // filter() always calls os.path.normcase() on the pattern, + // but not on the names being matched if os.path is posixpath + // XXX: maybe this should be changed in Python as well? + // Note: the Python implementation uses the *runtime* os.path.normcase. 
+ PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); + if (normcase == NULL) { + Py_DECREF(os_path); + return NULL; + } + PyObject *patobj = PyObject_CallOneArg(normcase, pat); + if (patobj == NULL) { + Py_DECREF(normcase); + Py_DECREF(os_path); + return NULL; + } + int isposix = Py_Is(os_path, st->posixpath_module); + Py_DECREF(os_path); + // the matcher is cached with respect to the *normalized* pattern + PyObject *matcher = get_matcher_function(module, patobj); + Py_DECREF(patobj); if (matcher == NULL) { + Py_DECREF(normcase); return NULL; } - PyObject *result = _Py_fnmatch_filter(matcher, names); + PyObject *result = isposix + ? _Py_fnmatch_filter(matcher, names) + : _Py_fnmatch_filter_normalized(matcher, names, normcase); Py_DECREF(matcher); + Py_DECREF(normcase); return result; } @@ -196,15 +216,31 @@ _fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) /*[clinic end generated code: output=b4cd0bd911e8bc93 input=c45e0366489540b8]*/ { fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); - PyObject *res = _PyObject_CallMethod(st->py_module, &_Py_ID(fnmatch), "OO", name, pat); - if (res == NULL) { + // use the runtime 'os.path' value and not a cached one + PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); + if (os_path == NULL) { + return -1; + } + PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); + Py_DECREF(os_path); + if (normcase == NULL) { + return -1; + } + // apply case normalization on both arguments + PyObject *nameobj = PyObject_CallOneArg(normcase, name); + if (nameobj == NULL) { + Py_DECREF(normcase); return -1; } - int matching = PyLong_AsLong(res); - if (matching < 0) { + PyObject *patobj = PyObject_CallOneArg(normcase, pat); + Py_DECREF(normcase); + if (patobj == NULL) { + Py_DECREF(nameobj); return -1; } - Py_DECREF(res); + int matching = _fnmatch_fnmatchcase_impl(module, nameobj, patobj); + Py_DECREF(patobj); + Py_DECREF(nameobj); return matching; } @@ -225,6 +261,7 
@@ static int _fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) /*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ { + // fnmatchcase() does not apply any case normalization on the inputs PyObject *matcher = get_matcher_function(module, pat); if (matcher == NULL) { return -1; diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index cbedaccf80c905..b9601e59b6b9fa 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -8,9 +8,9 @@ #include "Python.h" typedef struct { - PyObject *py_module; // 'fnmatch' module - PyObject *re_module; // 're' module - PyObject *os_module; // 'os' module + PyObject *os_module; // 'os' module + PyObject *posixpath_module; // 'posixpath' module + PyObject *re_module; // 're' module PyObject *lru_cache; // the LRU cache decorator PyObject *translator; // the translation unit whose calls are cached @@ -52,6 +52,9 @@ _Py_fnmatch_fnmatch(PyObject *matcher, PyObject *string); */ extern PyObject * _Py_fnmatch_filter(PyObject *matcher, PyObject *names); +/* same as _Py_fnmatch_filter() but calls os.path.normcase() on each name */ +extern PyObject * +_Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase); /* * C accelerator for translating UNIX shell patterns into RE patterns. diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c index 9ebf0c90dbf746..73f4a338927bb1 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/regex.c @@ -10,11 +10,11 @@ _Py_fnmatch_fnmatch(PyObject *matcher, PyObject *name) // If 'name' is of incorrect type, it will be detected when calling // the matcher function (we emulate 're.compile(...).match(name)'). assert(PyCallable_Check(matcher)); - PyObject *match = PyObject_CallFunction(matcher, "O", name); + PyObject *match = PyObject_CallOneArg(matcher, name); if (match == NULL) { return -1; } - int matching = match == Py_None ? 
0 : 1; + int matching = Py_IsNone(match) ? 0 : 1; Py_DECREF(match); return matching; } @@ -48,15 +48,61 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names) } Py_DECREF(name); if (PyErr_Occurred()) { - Py_DECREF(res); - Py_DECREF(iter); - return NULL; + goto error; } } Py_DECREF(iter); return res; abort: Py_XDECREF(name); +error: + Py_DECREF(iter); + Py_DECREF(res); + return NULL; +} + +PyObject * +_Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase) +{ + assert(PyCallable_Check(matcher)); + PyObject *iter = PyObject_GetIter(names); + if (iter == NULL) { + return NULL; + } + + PyObject *res = PyList_New(0); + if (res == NULL) { + Py_DECREF(iter); + return NULL; + } + + PyObject *name = NULL; + while ((name = PyIter_Next(iter))) { + PyObject *normalized = PyObject_CallOneArg(normcase, name); + if (normalized == NULL) { + goto abort; + } + int rc = _Py_fnmatch_fnmatch(matcher, normalized); + Py_DECREF(normalized); + if (rc < 0) { + assert(PyErr_Occurred()); + goto abort; + } + if (rc == 1) { + if (PyList_Append(res, name) < 0) { + goto abort; + } + } + Py_DECREF(name); + if (PyErr_Occurred()) { + goto error; + } + } + Py_DECREF(iter); + return res; +abort: + Py_XDECREF(name); +error: Py_DECREF(iter); Py_DECREF(res); return NULL; From c7a06854bdbb08219e25357c7e927f050cb8ed6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 16:57:44 +0200 Subject: [PATCH 40/97] fixups --- Modules/_fnmatch/_fnmatchmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 0363fb98830883..ffc3c4ca23a6f3 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,5 +1,5 @@ #include "Python.h" -#include "pycore_call.h" // for _PyObject_CallMethod +#include "pycore_call.h" #include "_fnmatchmodule.h" #include "clinic/_fnmatchmodule.c.h" From 
1340fd25735f011c33e48560ad21ec03ecbccddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 16:57:50 +0200 Subject: [PATCH 41/97] update generated objects --- Include/internal/pycore_global_objects_fini_generated.h | 2 +- Include/internal/pycore_global_strings.h | 2 +- Include/internal/pycore_runtime_init_generated.h | 2 +- Include/internal/pycore_unicodeobject_generated.h | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 44820e0ce13fad..99447e2dc06df3 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -951,7 +951,6 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fix_imports)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flags)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flush)); - _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fnmatch)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fold)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(follow_symlinks)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(format)); @@ -1105,6 +1104,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nlocals)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_depth)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_offset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(normcase)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ns)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nstype)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nt)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index c21492376395e4..ff99456786f0d4 100644 --- a/Include/internal/pycore_global_strings.h 
+++ b/Include/internal/pycore_global_strings.h @@ -440,7 +440,6 @@ struct _Py_global_strings { STRUCT_FOR_ID(fix_imports) STRUCT_FOR_ID(flags) STRUCT_FOR_ID(flush) - STRUCT_FOR_ID(fnmatch) STRUCT_FOR_ID(fold) STRUCT_FOR_ID(follow_symlinks) STRUCT_FOR_ID(format) @@ -594,6 +593,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(nlocals) STRUCT_FOR_ID(node_depth) STRUCT_FOR_ID(node_offset) + STRUCT_FOR_ID(normcase) STRUCT_FOR_ID(ns) STRUCT_FOR_ID(nstype) STRUCT_FOR_ID(nt) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 9a99b3645fb717..3c07832b03e270 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -949,7 +949,6 @@ extern "C" { INIT_ID(fix_imports), \ INIT_ID(flags), \ INIT_ID(flush), \ - INIT_ID(fnmatch), \ INIT_ID(fold), \ INIT_ID(follow_symlinks), \ INIT_ID(format), \ @@ -1103,6 +1102,7 @@ extern "C" { INIT_ID(nlocals), \ INIT_ID(node_depth), \ INIT_ID(node_offset), \ + INIT_ID(normcase), \ INIT_ID(ns), \ INIT_ID(nstype), \ INIT_ID(nt), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 83ece722c6fa9d..cfc503079aed57 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1560,10 +1560,6 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); - string = &_Py_ID(fnmatch); - _PyUnicode_InternStatic(interp, &string); - assert(_PyUnicode_CheckConsistency(string, 1)); - assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(fold); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2176,6 +2172,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); 
assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(normcase); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(ns); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); From 2b6fe4f521db8a57aa50f64243ce033afb671e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:05:04 +0200 Subject: [PATCH 42/97] reflect `__all__` ordering --- Lib/fnmatch.py | 61 ++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index ffa15825954f5a..bfdbc78ffce3a6 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -16,35 +16,6 @@ __all__ = ["filter", "fnmatch", "fnmatchcase", "translate"] -def fnmatch(name, pat): - """Test whether FILENAME matches PATTERN. - - Patterns are Unix shell style: - - * matches everything - ? matches any single character - [seq] matches any character in seq - [!seq] matches any char not in seq - - An initial period in FILENAME is not special. - Both FILENAME and PATTERN are first case-normalized - if the operating system requires it. - If you don't want this, use fnmatchcase(FILENAME, PATTERN). 
- """ - name = os.path.normcase(name) - pat = os.path.normcase(pat) - return fnmatchcase(name, pat) - -@functools.lru_cache(maxsize=32768, typed=True) -def _compile_pattern(pat): - if isinstance(pat, bytes): - pat_str = str(pat, 'ISO-8859-1') - res_str = translate(pat_str) - res = bytes(res_str, 'ISO-8859-1') - else: - res = translate(pat) - return re.compile(res).match - try: from _fnmatch import filter except ImportError: @@ -64,6 +35,28 @@ def filter(names, pat): result.append(name) return result +try: + from _fnmatch import fnmatch +except ImportError: + def fnmatch(name, pat): + """Test whether FILENAME matches PATTERN. + + Patterns are Unix shell style: + + * matches everything + ? matches any single character + [seq] matches any character in seq + [!seq] matches any char not in seq + + An initial period in FILENAME is not special. + Both FILENAME and PATTERN are first case-normalized + if the operating system requires it. + If you don't want this, use fnmatchcase(FILENAME, PATTERN). 
+ """ + name = os.path.normcase(name) + pat = os.path.normcase(pat) + return fnmatchcase(name, pat) + try: from _fnmatch import fnmatchcase except ImportError: @@ -89,6 +82,16 @@ def translate(pat): parts = _translate(pat, STAR, '.') return _join_translated_parts(parts, STAR) +@functools.lru_cache(maxsize=32768, typed=True) +def _compile_pattern(pat): + if isinstance(pat, bytes): + pat_str = str(pat, 'ISO-8859-1') + res_str = translate(pat_str) + res = bytes(res_str, 'ISO-8859-1') + else: + res = translate(pat) + return re.compile(res).match + def _translate(pat, STAR, QUESTION_MARK): res = [] add = res.append From 124f8f86d633bfef0db4c25d485872607542a61c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:23:35 +0200 Subject: [PATCH 43/97] update comments --- Lib/test/test_fnmatch.py | 3 ++- Modules/_fnmatch/_fnmatchmodule.c | 2 ++ Modules/_fnmatch/_fnmatchmodule.h | 2 ++ Modules/_fnmatch/regex.c | 13 +++++++------ Modules/_fnmatch/translate.c | 12 ++++++------ 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 6d72df182af862..19f12db4fa2160 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -1,6 +1,6 @@ """Test cases for the fnmatch module.""" -import itertools +import itertools import os import string import unittest @@ -312,6 +312,7 @@ class CPythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): @staticmethod def translate_func(pattern): + # Pure Python implementation of translate() STAR = object() parts = py_fnmatch._translate(pattern, STAR, '.') return py_fnmatch._join_translated_parts(parts, STAR) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index ffc3c4ca23a6f3..d83a2d5a0e7405 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -308,6 +308,8 @@ _fnmatch_translate_impl(PyObject *module, PyObject 
*pattern) } } +// ==== Module specs ========================================================== + static PyMethodDef fnmatchmodule_methods[] = { _FNMATCH_FILTER_METHODDEF _FNMATCH_FNMATCH_METHODDEF diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index b9601e59b6b9fa..10a5811b15cea0 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -24,6 +24,8 @@ get_fnmatchmodulestate_state(PyObject *module) return (fnmatchmodule_state *)state; } +// ==== Helper prototypes ===================================================== + /* * Test whether a name matches a compiled RE pattern. * diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c index 73f4a338927bb1..524dde992ccfc1 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/regex.c @@ -36,12 +36,12 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names) PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - int rc = _Py_fnmatch_fnmatch(matcher, name); - if (rc < 0) { + int matching = _Py_fnmatch_fnmatch(matcher, name); + if (matching < 0) { assert(PyErr_Occurred()); goto abort; } - if (rc == 1) { + if (matching == 1) { if (PyList_Append(res, name) < 0) { goto abort; } @@ -82,13 +82,14 @@ _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *norm if (normalized == NULL) { goto abort; } - int rc = _Py_fnmatch_fnmatch(matcher, normalized); + int matching = _Py_fnmatch_fnmatch(matcher, normalized); Py_DECREF(normalized); - if (rc < 0) { + if (matching < 0) { assert(PyErr_Occurred()); goto abort; } - if (rc == 1) { + if (matching == 1) { + // add the non-normalized name if its normalization matches if (PyList_Append(res, name) < 0) { goto abort; } diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 8900800f0c933c..dea389bd2f7a89 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -9,9 +9,7 @@ #include "_fnmatchmodule.h" // for get_fnmatchmodulestate_state() 
-// ==== Helper declarations ================================================== - -typedef fnmatchmodule_state State; +// ==== Macro definitions ===================================================== #define _WRITE_OR_FAIL(writeop, onerror) \ do { \ @@ -45,6 +43,8 @@ typedef fnmatchmodule_state State; } \ } while (0) +// ==== Helper declarations =================================================== + /* * Creates a new Unicode object from a Py_UCS4 character. * @@ -77,7 +77,7 @@ translate_expression(PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); * This returns the number of written characters, or -1 if an error occurred. */ static Py_ssize_t -write_literal(State *state, PyUnicodeWriter *writer, PyObject *unicode); +write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *unicode); /* * Write the translated pattern obtained by translate_expression(). @@ -120,7 +120,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) #define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), !=) #define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), ==) - State *state = get_fnmatchmodulestate_state(module); + fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); PyObject *re = state->re_module; const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); // We would write less data if there are successive '*', @@ -431,7 +431,7 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) } static Py_ssize_t -write_literal(State *state, PyUnicodeWriter *writer, PyObject *unicode) +write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *unicode) { PyObject *escaped = PyObject_CallMethodOneArg(state->re_module, &_Py_ID(escape), From 7621d6c0248728340bc32025efd28080b6afc0bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:24:48 +0200 Subject: [PATCH 44/97] blurb --- 
.../next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst diff --git a/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst b/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst new file mode 100644 index 00000000000000..639af4fb31ff93 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst @@ -0,0 +1,2 @@ +Improve the performances of :func:`fnmatch.translate` by 50% and of +:func:`fnmatch.filter` by 10%. Patch by Bénédikt Tran. From 9b94fe6e75b172bc21d57390e47bf5c2ad54bedc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:31:24 +0200 Subject: [PATCH 45/97] (hopefully MSVC will be happy...) --- PCbuild/pythoncore.vcxproj | 4 ++++ PCbuild/pythoncore.vcxproj.filters | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index f36fcb8caece33..dbd27c2f0c44d4 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -368,6 +368,7 @@ + @@ -473,6 +474,9 @@ + + + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index a1b43addf9e36a..00300074a1e5ed 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -252,6 +252,9 @@ Modules + + Modules\_fnmatch + Modules\_io @@ -1058,6 +1061,15 @@ Modules + + Modules\_fnmatch + + + Modules\_fnmatch + + + Modules\_fnmatch + Modules\_io From 3903987e830befad80ee4c4a626c2e649aaf1590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:37:06 +0200 Subject: [PATCH 46/97] fix MSVC warnings --- Modules/_fnmatch/translate.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git 
a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index dea389bd2f7a89..3b0427480a54f1 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -151,7 +151,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } const int kind = PyUnicode_KIND(pattern); const void *data = PyUnicode_DATA(pattern); - Py_ssize_t h = 0, i = 0; + // i is the current index, wi is the index of a wildcard + Py_ssize_t i = 0, wi = 0; while (i < n) { // read and advance to the next character Py_UCS4 chr = READ(i++); @@ -159,7 +160,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) case '*': { _WRITE_CHAR_OR(writer, chr, goto abort); SKIP_DUPLICATES('*', i, n); - PyObject *index = PyLong_FromSsize_t(h++); + PyObject *index = PyLong_FromSsize_t(wi++); if (index == NULL) { goto abort; } @@ -173,7 +174,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) case '?': { // translate optional '?' (fnmatch) into optional '.' (regex) _WRITE_CHAR_OR(writer, '.', goto abort); - ++h; // increase the expected result's length + ++wi; // increase the expected result's length break; } case '[': { @@ -183,14 +184,14 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ADVANCE_TO_NEXT(']', j, n); // locate closing ']' if (j >= n) { _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); - h += 2; // we just wrote 2 characters + wi += 2; // we just wrote 2 characters break; // early break for clarity } else { // v--- pattern[j] (exclusive) // '[' * ... 
* ']' // ^----- pattern[i] (inclusive) - int pos = PyUnicode_FindChar(pattern, '-', i, j, 1); + Py_ssize_t pos = PyUnicode_FindChar(pattern, '-', i, j, 1); if (pos == -2) { goto abort; } @@ -216,12 +217,12 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (s2 == NULL) { goto abort; } - int difflen = write_expression(writer, s2); + Py_ssize_t difflen = write_expression(writer, s2); Py_DECREF(s2); if (difflen < 0) { goto abort; } - h += difflen; + wi += difflen; i = j + 1; // jump to the character after ']' break; // early break for clarity } @@ -231,12 +232,12 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (str == NULL) { goto abort; } - int difflen = write_literal(state, writer, str); + Py_ssize_t difflen = write_literal(state, writer, str); Py_DECREF(str); if (difflen < 0) { goto abort; } - h += difflen; + wi += difflen; break; } } From c7422a5b282d095de5638cff505b7bbf575b2dfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:56:27 +0200 Subject: [PATCH 47/97] fixup typo! 
--- PC/config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PC/config.c b/PC/config.c index 8f49d9255b4fbe..7c7c2540118cf2 100644 --- a/PC/config.c +++ b/PC/config.c @@ -10,9 +10,9 @@ extern PyObject* PyInit_array(void); extern PyObject* PyInit_binascii(void); extern PyObject* PyInit_cmath(void); extern PyObject* PyInit_errno(void); +extern PyObject* PyInit__fnmatch(void); extern PyObject* PyInit_faulthandler(void); extern PyObject* PyInit__tracemalloc(void); -extern PyObject* PyInit_fnmatch(void); extern PyObject* PyInit_gc(void); extern PyObject* PyInit_math(void); extern PyObject* PyInit__md5(void); @@ -92,7 +92,7 @@ struct _inittab _PyImport_Inittab[] = { {"binascii", PyInit_binascii}, {"cmath", PyInit_cmath}, {"errno", PyInit_errno}, - {"_fnmatch", PyInit_fnmatch}, + {"_fnmatch", PyInit__fnmatch}, {"faulthandler", PyInit_faulthandler}, {"gc", PyInit_gc}, {"math", PyInit_math}, From 7dbe55c46c4f0e2f76aeb3874c685e5c2a1f2d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 10:16:03 +0200 Subject: [PATCH 48/97] `get_fnmatchmodulestate_state` -> `get_fnmatchmodule_state` --- Modules/_fnmatch/_fnmatchmodule.c | 14 +++++++------- Modules/_fnmatch/_fnmatchmodule.h | 2 +- Modules/_fnmatch/translate.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index d83a2d5a0e7405..c4e402de7c46ee 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -24,7 +24,7 @@ fnmatchmodule_get_matcher_function(PyObject *module, PyObject *pattern) if (expr == NULL) { return NULL; } - fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + fnmatchmodule_state *st = get_fnmatchmodule_state(module); // compile the pattern PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, &_Py_ID(compile), expr); Py_DECREF(expr); @@ -84,7 +84,7 @@ 
fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) static inline PyObject * get_matcher_function(PyObject *module, PyObject *pattern) { - fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + fnmatchmodule_state *st = get_fnmatchmodule_state(module); assert(st->translator != NULL); return PyObject_CallOneArg(st->translator, pattern); } @@ -103,7 +103,7 @@ fnmatchmodule_exec(PyObject *module) } \ } while (0) - fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + fnmatchmodule_state *st = get_fnmatchmodule_state(module); IMPORT_MODULE(os_module, "os"); IMPORT_MODULE(posixpath_module, "posixpath"); IMPORT_MODULE(re_module, "re"); @@ -122,7 +122,7 @@ fnmatchmodule_exec(PyObject *module) static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { - fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + fnmatchmodule_state *st = get_fnmatchmodule_state(m); Py_VISIT(st->os_module); Py_VISIT(st->posixpath_module); Py_VISIT(st->re_module); @@ -134,7 +134,7 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) static int fnmatchmodule_clear(PyObject *m) { - fnmatchmodule_state *st = get_fnmatchmodulestate_state(m); + fnmatchmodule_state *st = get_fnmatchmodule_state(m); Py_CLEAR(st->os_module); Py_CLEAR(st->posixpath_module); Py_CLEAR(st->re_module); @@ -166,7 +166,7 @@ static PyObject * _fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) /*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ { - fnmatchmodule_state *st = get_fnmatchmodulestate_state(module); + fnmatchmodule_state *st = get_fnmatchmodule_state(module); PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); if (os_path == NULL) { return NULL; @@ -215,7 +215,7 @@ static int _fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) /*[clinic end generated code: output=b4cd0bd911e8bc93 input=c45e0366489540b8]*/ { - fnmatchmodule_state *st = 
get_fnmatchmodulestate_state(module); + fnmatchmodule_state *st = get_fnmatchmodule_state(module); // use the runtime 'os.path' value and not a cached one PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); if (os_path == NULL) { diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 10a5811b15cea0..cde36e56dc901f 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -17,7 +17,7 @@ typedef struct { } fnmatchmodule_state; static inline fnmatchmodule_state * -get_fnmatchmodulestate_state(PyObject *module) +get_fnmatchmodule_state(PyObject *module) { void *state = PyModule_GetState(module); assert(state != NULL); diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 3b0427480a54f1..5223f699a5bc82 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -120,7 +120,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) #define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), !=) #define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), ==) - fnmatchmodule_state *state = get_fnmatchmodulestate_state(module); + fnmatchmodule_state *state = get_fnmatchmodule_state(module); PyObject *re = state->re_module; const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); // We would write less data if there are successive '*', From 4a879112249bb9f045b337686838dcf87c919d2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:08:13 +0200 Subject: [PATCH 49/97] remove unused imports --- Modules/_fnmatch/translate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 5223f699a5bc82..7dd5fae64ebe90 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -4,11 +4,10 @@ * disabled on demand. 
*/ -#include "Python.h" -#include "pycore_call.h" // for _PyObject_CallMethod() - #include "_fnmatchmodule.h" // for get_fnmatchmodulestate_state() +#include "pycore_call.h" + // ==== Macro definitions ===================================================== #define _WRITE_OR_FAIL(writeop, onerror) \ From aca2b1b60cf40287d7e2ef214539d9504059f152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:08:31 +0200 Subject: [PATCH 50/97] update state --- Modules/_fnmatch/_fnmatchmodule.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index cde36e56dc901f..9e70e800de36ae 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -8,12 +8,14 @@ #include "Python.h" typedef struct { - PyObject *os_module; // 'os' module - PyObject *posixpath_module; // 'posixpath' module - PyObject *re_module; // 're' module + PyObject *os_module; // import os + PyObject *posixpath_module; // import posixpath + PyObject *re_module; // import re - PyObject *lru_cache; // the LRU cache decorator - PyObject *translator; // the translation unit whose calls are cached + PyObject *lru_cache; // functools.lru_cache() inner decorator + PyObject *translator; // the translation unit whose calls are cached + + PyObject *hyphen_str; // interned hyphen glyph '-' } fnmatchmodule_state; static inline fnmatchmodule_state * @@ -51,10 +53,17 @@ _Py_fnmatch_fnmatch(PyObject *matcher, PyObject *string); * names An iterable of strings (str or bytes objects) to match. * * Returns a list of matched names, or NULL if an error occurred. - */ +*/ extern PyObject * _Py_fnmatch_filter(PyObject *matcher, PyObject *names); -/* same as _Py_fnmatch_filter() but calls os.path.normcase() on each name */ + +/* + * Similar to _Py_fnmatch_filter() but matches os.path.normcase(name) + * instead. 
The returned values are however a sub-sequence of 'names'. + * + * The 'normcase' argument is a callable implementing os.path.normcase(). + * + */ extern PyObject * _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase); From 2ef61ad556f9afb3ce08aced5340d8af0eb14059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:08:52 +0200 Subject: [PATCH 51/97] simplify implementation --- Modules/_fnmatch/regex.c | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/regex.c index 524dde992ccfc1..1b4d55943ef7a5 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/regex.c @@ -1,5 +1,3 @@ -#include "Python.h" - #include "_fnmatchmodule.h" // for pre-declarations // ==== API implementation ==================================================== @@ -9,7 +7,6 @@ _Py_fnmatch_fnmatch(PyObject *matcher, PyObject *name) { // If 'name' is of incorrect type, it will be detected when calling // the matcher function (we emulate 're.compile(...).match(name)'). 
- assert(PyCallable_Check(matcher)); PyObject *match = PyObject_CallOneArg(matcher, name); if (match == NULL) { return -1; @@ -22,40 +19,27 @@ _Py_fnmatch_fnmatch(PyObject *matcher, PyObject *name) PyObject * _Py_fnmatch_filter(PyObject *matcher, PyObject *names) { - assert(PyCallable_Check(matcher)); PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { return NULL; } - PyObject *res = PyList_New(0); if (res == NULL) { Py_DECREF(iter); return NULL; } - PyObject *name = NULL; while ((name = PyIter_Next(iter))) { int matching = _Py_fnmatch_fnmatch(matcher, name); - if (matching < 0) { - assert(PyErr_Occurred()); + if (matching < 0 || (matching == 1 && PyList_Append(res, name) < 0)) { goto abort; } - if (matching == 1) { - if (PyList_Append(res, name) < 0) { - goto abort; - } - } Py_DECREF(name); - if (PyErr_Occurred()) { - goto error; - } } Py_DECREF(iter); return res; abort: - Py_XDECREF(name); -error: + Py_DECREF(name); Py_DECREF(iter); Py_DECREF(res); return NULL; @@ -64,18 +48,15 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names) PyObject * _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase) { - assert(PyCallable_Check(matcher)); PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { return NULL; } - PyObject *res = PyList_New(0); if (res == NULL) { Py_DECREF(iter); return NULL; } - PyObject *name = NULL; while ((name = PyIter_Next(iter))) { PyObject *normalized = PyObject_CallOneArg(normcase, name); @@ -84,26 +65,16 @@ _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *norm } int matching = _Py_fnmatch_fnmatch(matcher, normalized); Py_DECREF(normalized); - if (matching < 0) { - assert(PyErr_Occurred()); + // add the non-normalized name if its normalization matches + if (matching < 0 || (matching == 1 && PyList_Append(res, name) < 0)) { goto abort; } - if (matching == 1) { - // add the non-normalized name if its normalization matches - if (PyList_Append(res, name) < 0) { - goto 
abort; - } - } Py_DECREF(name); - if (PyErr_Occurred()) { - goto error; - } } Py_DECREF(iter); return res; abort: - Py_XDECREF(name); -error: + Py_DECREF(name); Py_DECREF(iter); Py_DECREF(res); return NULL; From 13dc17ec98ec170e58cbc53ef0182a3d68f58028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 14:23:42 +0200 Subject: [PATCH 52/97] harmonize docs --- Lib/fnmatch.py | 15 ++++++++------- Makefile.pre.in | 2 +- PCbuild/pythoncore.vcxproj | 2 +- PCbuild/pythoncore.vcxproj.filters | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index bfdbc78ffce3a6..1dc52f2575ae6c 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -20,7 +20,7 @@ from _fnmatch import filter except ImportError: def filter(names, pat): - """Construct a list from those elements of the iterable NAMES that match PAT.""" + """Construct a list from the names in *names* matching *pat*.""" result = [] pat = os.path.normcase(pat) match = _compile_pattern(pat) @@ -39,7 +39,7 @@ def filter(names, pat): from _fnmatch import fnmatch except ImportError: def fnmatch(name, pat): - """Test whether FILENAME matches PATTERN. + """Test whether *name* matches *pat*. Patterns are Unix shell style: @@ -48,10 +48,11 @@ def fnmatch(name, pat): [seq] matches any character in seq [!seq] matches any char not in seq - An initial period in FILENAME is not special. - Both FILENAME and PATTERN are first case-normalized + An initial period in *name* is not special. + Both *name* and *pat* are first case-normalized if the operating system requires it. - If you don't want this, use fnmatchcase(FILENAME, PATTERN). + + If you don't want this, use fnmatchcase(name, pat). 
""" name = os.path.normcase(name) pat = os.path.normcase(pat) @@ -61,7 +62,7 @@ def fnmatch(name, pat): from _fnmatch import fnmatchcase except ImportError: def fnmatchcase(name, pat): - """Test whether FILENAME matches PATTERN, including case. + """Test whether *name* matches *pat*, including case. This is a version of fnmatch() which doesn't case-normalize its arguments. @@ -73,7 +74,7 @@ def fnmatchcase(name, pat): from _fnmatch import translate except ImportError: def translate(pat): - """Translate a shell PATTERN to a regular expression. + """Translate a shell pattern *pat* to a regular expression. There is no way to quote meta-characters. """ diff --git a/Makefile.pre.in b/Makefile.pre.in index f01f2e852a1d6d..bd5e471c50bd33 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -355,7 +355,7 @@ FNMATCH_H= Modules/_fnmatch/_fnmatchmodule.h FNMATCH_OBJS= \ Modules/_fnmatch/_fnmatchmodule.o \ - Modules/_fnmatch/regex.o \ + Modules/_fnmatch/matcher.o \ Modules/_fnmatch/translate.o ########################################################################## diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index dbd27c2f0c44d4..ea52c20cc66db1 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -475,7 +475,7 @@ - + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 00300074a1e5ed..912407b56ed783 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -1064,7 +1064,7 @@ Modules\_fnmatch - + Modules\_fnmatch From dba784b84197ddf1d7bd7217c5b728bd24f4a93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 14:25:17 +0200 Subject: [PATCH 53/97] improvements - rename `regex.c` -> `matcher.c` - use interned strings - remove redundant macros - add comments to local macros - add some RFE notes --- Modules/Setup.bootstrap.in | 2 +- Modules/_fnmatch/_fnmatchmodule.c | 288 
++++++++++++--------- Modules/_fnmatch/_fnmatchmodule.h | 42 +-- Modules/_fnmatch/clinic/_fnmatchmodule.c.h | 88 ++++--- Modules/_fnmatch/{regex.c => matcher.c} | 12 +- Modules/_fnmatch/translate.c | 160 +++++++----- 6 files changed, 344 insertions(+), 248 deletions(-) rename Modules/_fnmatch/{regex.c => matcher.c} (85%) diff --git a/Modules/Setup.bootstrap.in b/Modules/Setup.bootstrap.in index 7201c857ddba0a..35198091329d01 100644 --- a/Modules/Setup.bootstrap.in +++ b/Modules/Setup.bootstrap.in @@ -36,7 +36,7 @@ _stat _stat.c _symtable symtablemodule.c # miscellaneous accelerators -_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/regex.c _fnmatch/translate.c +_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/matcher.c _fnmatch/translate.c # for systems without $HOME env, used by site._getuserbase() @MODULE_PWD_TRUE@pwd pwdmodule.c diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index c4e402de7c46ee..62a2e7b7f1ed40 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,13 +1,10 @@ -#include "Python.h" -#include "pycore_call.h" - #include "_fnmatchmodule.h" #include "clinic/_fnmatchmodule.c.h" #define COMPILED_CACHE_SIZE 32768 #define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" -// ==== Helper implementations ================================================ +// ==== Cached translation unit =============================================== /* * Compile a UNIX shell pattern into a RE pattern @@ -16,18 +13,20 @@ * This function is LRU-cached by the module itself. 
*/ static PyObject * -fnmatchmodule_get_matcher_function(PyObject *module, PyObject *pattern) +get_matcher_function_impl(PyObject *module, PyObject *pattern) { // translate the pattern into a RE pattern assert(module != NULL); - PyObject *expr = _fnmatch_translate_impl(module, pattern); - if (expr == NULL) { + PyObject *translated = fnmatch_translate_impl(module, pattern); + if (translated == NULL) { return NULL; } fnmatchmodule_state *st = get_fnmatchmodule_state(module); // compile the pattern - PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, &_Py_ID(compile), expr); - Py_DECREF(expr); + PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, + &_Py_ID(compile), + translated); + Py_DECREF(translated); if (compiled == NULL) { return NULL; } @@ -39,49 +38,42 @@ fnmatchmodule_get_matcher_function(PyObject *module, PyObject *pattern) static PyMethodDef get_matcher_function_def = { "get_matcher_function", - (PyCFunction)(fnmatchmodule_get_matcher_function), + (PyCFunction)(get_matcher_function_impl), METH_O, NULL }; -static int -fnmatchmodule_load_lru_cache(PyObject *module, fnmatchmodule_state *st) -{ - st->lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); - if (st->lru_cache == NULL) { - return -1; - } - return 0; -} - static int fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) { - assert(st->lru_cache != NULL); + // make sure that this function is called once + assert(st->translator == NULL); PyObject *maxsize = PyLong_FromLong(COMPILED_CACHE_SIZE); if (maxsize == NULL) { return -1; } - PyObject *decorator = PyObject_CallFunctionObjArgs(st->lru_cache, maxsize, Py_True, NULL); + PyObject *lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + PyObject *decorator = PyObject_CallFunctionObjArgs(lru_cache, maxsize, Py_True, NULL); + Py_DECREF(lru_cache); Py_DECREF(maxsize); if (decorator == NULL) { return -1; } - // TODO(picnixz): should INCREF the refcount of 'module'? 
assert(module != NULL); PyObject *decorated = PyCFunction_New(&get_matcher_function_def, module); - PyObject *translator = PyObject_CallOneArg(decorator, decorated); + // reference on 'translator' will be removed upon module cleanup + st->translator = PyObject_CallOneArg(decorator, decorated); Py_DECREF(decorated); Py_DECREF(decorator); - if (translator == NULL) { + if (st->translator == NULL) { return -1; } - // reference on 'translator' will be removed upon module cleanup - st->translator = translator; return 0; } -static inline PyObject * +// ==== Module data getters =================================================== + +static inline PyObject * /* reference to re.compile(pattern).match() */ get_matcher_function(PyObject *module, PyObject *pattern) { fnmatchmodule_state *st = get_fnmatchmodule_state(module); @@ -89,45 +81,75 @@ get_matcher_function(PyObject *module, PyObject *pattern) return PyObject_CallOneArg(st->translator, pattern); } +static inline PyObject * /* reference to os.path.normcase() */ +get_platform_normcase_function(PyObject *module, bool *isposix) +{ + fnmatchmodule_state *st = get_fnmatchmodule_state(module); + PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); + if (os_path == NULL) { + return NULL; + } + PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); + if (isposix != NULL) { + *isposix = (bool)Py_Is(os_path, st->posixpath_module); + } + Py_DECREF(os_path); + return normcase; +} + // ==== Module state functions ================================================ -static int -fnmatchmodule_exec(PyObject *module) -{ -#define IMPORT_MODULE(attribute, name) \ +#define IMPORT_MODULE(state, attribute, name) \ do { \ - st->attribute = NULL; \ - st->attribute = PyImport_ImportModule((name)); \ - if (st->attribute == NULL) { \ + /* make sure that the attribute is initialized once */ \ + assert(state->attribute == NULL); \ + state->attribute = PyImport_ImportModule((name)); \ + if (state->attribute == NULL) { \ return 
-1; \ } \ } while (0) +#define INTERN_STRING(state, attribute, literal) \ + do { \ + /* make sure that the attribute is initialized once */ \ + assert(state->attribute == NULL); \ + state->attribute = PyUnicode_InternFromString((literal)); \ + if (state->attribute == NULL) { \ + return -1; \ + } \ + } while (0) + +static int +fnmatchmodule_exec(PyObject *module) +{ fnmatchmodule_state *st = get_fnmatchmodule_state(module); - IMPORT_MODULE(os_module, "os"); - IMPORT_MODULE(posixpath_module, "posixpath"); - IMPORT_MODULE(re_module, "re"); -#undef IMPORT_MODULE - st->lru_cache = NULL; - if (fnmatchmodule_load_lru_cache(module, st) < 0) { - return -1; - } - st->translator = NULL; + IMPORT_MODULE(st, os_module, "os"); + IMPORT_MODULE(st, posixpath_module, "posixpath"); + IMPORT_MODULE(st, re_module, "re"); if (fnmatchmodule_load_translator(module, st) < 0) { return -1; } + INTERN_STRING(st, hyphen_str, "-"); + INTERN_STRING(st, re_empty_range_str, "(?!)"); + INTERN_STRING(st, re_atomic_bgroup_str, "(?>.*?"); + INTERN_STRING(st, re_wildcard_str, ".*"); return 0; } +#undef INTERN_STRING +#undef IMPORT_MODULE static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); - Py_VISIT(st->os_module); - Py_VISIT(st->posixpath_module); - Py_VISIT(st->re_module); - Py_VISIT(st->lru_cache); + Py_VISIT(st->re_wildcard_str); + Py_VISIT(st->re_atomic_bgroup_str); + Py_VISIT(st->re_empty_range_str); + Py_VISIT(st->hyphen_str); Py_VISIT(st->translator); + Py_VISIT(st->re_module); + Py_VISIT(st->posixpath_module); + Py_VISIT(st->os_module); return 0; } @@ -135,169 +157,176 @@ static int fnmatchmodule_clear(PyObject *m) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); - Py_CLEAR(st->os_module); - Py_CLEAR(st->posixpath_module); - Py_CLEAR(st->re_module); - Py_CLEAR(st->lru_cache); + Py_CLEAR(st->re_wildcard_str); + Py_CLEAR(st->re_atomic_bgroup_str); + Py_CLEAR(st->re_empty_range_str); + Py_CLEAR(st->hyphen_str); 
Py_CLEAR(st->translator); + Py_CLEAR(st->re_module); + Py_CLEAR(st->posixpath_module); + Py_CLEAR(st->os_module); return 0; } -static void +static inline void fnmatchmodule_free(void *m) { (void)fnmatchmodule_clear((PyObject *)m); } /*[clinic input] -module _fnmatch +module fnmatch [clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=356e324d57d93f08]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=797aa965370a9ef2]*/ /*[clinic input] -_fnmatch.filter -> object +fnmatch.filter -> object names: object - pat: object + pat as pattern: object + +Construct a list from the names in *names* matching *pat*. [clinic start generated code]*/ static PyObject * -_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat) -/*[clinic end generated code: output=7f11aa68436d05fc input=1d233174e1c4157a]*/ +fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern) +/*[clinic end generated code: output=1a68530a2e3cf7d0 input=7ac729daad3b1404]*/ { - fnmatchmodule_state *st = get_fnmatchmodule_state(module); - PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); - if (os_path == NULL) { - return NULL; - } // filter() always calls os.path.normcase() on the pattern, // but not on the names being mathed if os.path is posixmodule // XXX: maybe this should be changed in Python as well? // Note: the Python implementation uses the *runtime* os.path.normcase. 
- PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); + bool isposix = 0; + PyObject *normcase = get_platform_normcase_function(module, &isposix); if (normcase == NULL) { - Py_DECREF(os_path); return NULL; } - PyObject *patobj = PyObject_CallOneArg(normcase, pat); - if (patobj == NULL) { + PyObject *normalized_pattern = PyObject_CallOneArg(normcase, pattern); + if (normalized_pattern == NULL) { Py_DECREF(normcase); - Py_DECREF(os_path); return NULL; } - int isposix = Py_Is(os_path, st->posixpath_module); - Py_DECREF(os_path); // the matcher is cached with respect to the *normalized* pattern - PyObject *matcher = get_matcher_function(module, patobj); - Py_DECREF(patobj); + PyObject *matcher = get_matcher_function(module, normalized_pattern); + Py_DECREF(normalized_pattern); if (matcher == NULL) { Py_DECREF(normcase); return NULL; } - PyObject *result = isposix + PyObject *filtered = isposix ? _Py_fnmatch_filter(matcher, names) : _Py_fnmatch_filter_normalized(matcher, names, normcase); Py_DECREF(matcher); Py_DECREF(normcase); - return result; + return filtered; } /*[clinic input] -_fnmatch.fnmatch -> bool +fnmatch.fnmatch -> bool name: object - pat: object + pat as pattern: object + +Test whether *name* matches *pat*. + +Patterns are Unix shell style: + +* matches everything +? matches any single character +[seq] matches any character in seq +[!seq] matches any char not in seq + +An initial period in *name* is not special. +Both *name* and *pat* are first case-normalized +if the operating system requires it. + +If you don't want this, use fnmatchcase(name, pat). 
[clinic start generated code]*/ static int -_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat) -/*[clinic end generated code: output=b4cd0bd911e8bc93 input=c45e0366489540b8]*/ +fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pattern) +/*[clinic end generated code: output=c9dc542e8d6933b6 input=279a4a4f2ddea6a2]*/ { - fnmatchmodule_state *st = get_fnmatchmodule_state(module); // use the runtime 'os.path' value and not a cached one - PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); - if (os_path == NULL) { - return -1; - } - PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); - Py_DECREF(os_path); + PyObject *normcase = get_platform_normcase_function(module, NULL); if (normcase == NULL) { return -1; } // apply case normalization on both arguments - PyObject *nameobj = PyObject_CallOneArg(normcase, name); - if (nameobj == NULL) { + PyObject *norm_name = PyObject_CallOneArg(normcase, name); + if (norm_name == NULL) { Py_DECREF(normcase); return -1; } - PyObject *patobj = PyObject_CallOneArg(normcase, pat); + PyObject *norm_pattern = PyObject_CallOneArg(normcase, pattern); Py_DECREF(normcase); - if (patobj == NULL) { - Py_DECREF(nameobj); + if (norm_pattern == NULL) { + Py_DECREF(norm_name); return -1; } - int matching = _fnmatch_fnmatchcase_impl(module, nameobj, patobj); - Py_DECREF(patobj); - Py_DECREF(nameobj); + int matching = fnmatch_fnmatchcase_impl(module, norm_name, norm_pattern); + Py_DECREF(norm_pattern); + Py_DECREF(norm_name); return matching; } /*[clinic input] -_fnmatch.fnmatchcase -> bool +fnmatch.fnmatchcase -> bool name: object - pat: object + pat as pattern: object -Test whether `name` matches `pattern`, including case. +Test whether *name* matches *pat*, including case. This is a version of fnmatch() which doesn't case-normalize its arguments. 
- [clinic start generated code]*/ static int -_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat) -/*[clinic end generated code: output=4d1283b1b1fc7cb8 input=b02a6a5c8c5a46e2]*/ +fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pattern) +/*[clinic end generated code: output=4d6b268169001876 input=91d62999c08fd55e]*/ { // fnmatchcase() does not apply any case normalization on the inputs - PyObject *matcher = get_matcher_function(module, pat); + PyObject *matcher = get_matcher_function(module, pattern); if (matcher == NULL) { return -1; } - int res = _Py_fnmatch_fnmatch(matcher, name); + int matching = _Py_fnmatch_match(matcher, name); Py_DECREF(matcher); - return res; + return matching; } /*[clinic input] -_fnmatch.translate -> object +fnmatch.translate -> object pat as pattern: object +Translate a shell pattern *pat* to a regular expression. + +There is no way to quote meta-characters. [clinic start generated code]*/ static PyObject * -_fnmatch_translate_impl(PyObject *module, PyObject *pattern) -/*[clinic end generated code: output=2d9e3bbcbcc6e90e input=56e39f7beea97810]*/ +fnmatch_translate_impl(PyObject *module, PyObject *pattern) +/*[clinic end generated code: output=77e0f5de9fbb59bd input=2cc1203a34c571fd]*/ { if (PyBytes_Check(pattern)) { - PyObject *unicode = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(pattern), + PyObject *decoded = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(pattern), PyBytes_GET_SIZE(pattern), "strict"); - if (unicode == NULL) { + if (decoded == NULL) { return NULL; } - // translated regular expression as a str object - PyObject *str_expr = _Py_fnmatch_translate(module, unicode); - Py_DECREF(unicode); - if (str_expr == NULL) { + PyObject *translated = _Py_fnmatch_translate(module, decoded); + Py_DECREF(decoded); + if (translated == NULL) { return NULL; } - PyObject *expr = PyUnicode_AsLatin1String(str_expr); - Py_DECREF(str_expr); - return expr; + PyObject *res = PyUnicode_AsLatin1String(translated); 
+ Py_DECREF(translated); + return res; } else if (PyUnicode_Check(pattern)) { return _Py_fnmatch_translate(module, pattern); @@ -310,11 +339,30 @@ _fnmatch_translate_impl(PyObject *module, PyObject *pattern) // ==== Module specs ========================================================== +/*[python input] +import fnmatch +import textwrap +fmt = 'PyDoc_STRVAR(fnmatchmodule_doc,\n"%s");' +print(fmt % '\\n\\\n'.join(fnmatch.__doc__.splitlines())) +[python start generated code]*/ +PyDoc_STRVAR(fnmatchmodule_doc, +"Filename matching with shell patterns.\n\ +\n\ +fnmatch(FILENAME, PATTERN) matches according to the local convention.\n\ +fnmatchcase(FILENAME, PATTERN) always takes case in account.\n\ +\n\ +The functions operate by translating the pattern into a regular\n\ +expression. They cache the compiled regular expressions for speed.\n\ +\n\ +The function translate(PATTERN) returns a regular expression\n\ +corresponding to PATTERN. (It does not compile it.)"); +/*[python end generated code: output=b5d0696157f04882 input=8dfe2add227b2686]*/ + static PyMethodDef fnmatchmodule_methods[] = { - _FNMATCH_FILTER_METHODDEF - _FNMATCH_FNMATCH_METHODDEF - _FNMATCH_FNMATCHCASE_METHODDEF - _FNMATCH_TRANSLATE_METHODDEF + FNMATCH_FILTER_METHODDEF + FNMATCH_FNMATCH_METHODDEF + FNMATCH_FNMATCHCASE_METHODDEF + FNMATCH_TRANSLATE_METHODDEF {NULL, NULL} }; @@ -328,7 +376,7 @@ static struct PyModuleDef_Slot fnmatchmodule_slots[] = { static struct PyModuleDef _fnmatchmodule = { PyModuleDef_HEAD_INIT, .m_name = "_fnmatch", - .m_doc = NULL, + .m_doc = fnmatchmodule_doc, .m_size = sizeof(fnmatchmodule_state), .m_methods = fnmatchmodule_methods, .m_slots = fnmatchmodule_slots, diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 9e70e800de36ae..4169967e0961af 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -8,14 +8,17 @@ #include "Python.h" typedef struct { - PyObject *os_module; // import os - PyObject 
*posixpath_module; // import posixpath - PyObject *re_module; // import re + PyObject *os_module; // import os + PyObject *posixpath_module; // import posixpath + PyObject *re_module; // import re - PyObject *lru_cache; // functools.lru_cache() inner decorator - PyObject *translator; // the translation unit whose calls are cached + PyObject *translator; // LRU-cached translation unit - PyObject *hyphen_str; // interned hyphen glyph '-' + // strings used by translate.c + PyObject *hyphen_str; // hyphen glyph '-' + PyObject *re_empty_range_str; // RE empty range '(?!)' + PyObject *re_atomic_bgroup_str; // RE atomic group begin '(?>.*?' + PyObject *re_wildcard_str; // RE wildcard '.*' } fnmatchmodule_state; static inline fnmatchmodule_state * @@ -33,27 +36,26 @@ get_fnmatchmodule_state(PyObject *module) * * Parameters * - * matcher A reference to the 'match()' method of a compiled pattern. - * string The string to match (str or bytes object). + * matcher A reference to the 'match()' method of a compiled pattern. + * string The string to match (str or bytes object). * - * Returns 1 if the 'string' matches the pattern and 0 otherwise. + * Returns * - * Returns -1 if (1) 'string' is not a `str` or a `bytes` object, - * and sets a TypeError exception, or (2) something went wrong. + * -1 if the call 'matcher(string)' failed (e.g., invalid type), + * 0 if the 'string' does NOT match the pattern, + * 1 if the 'string' matches the pattern. */ extern int -_Py_fnmatch_fnmatch(PyObject *matcher, PyObject *string); +_Py_fnmatch_match(PyObject *matcher, PyObject *string); /* - * Perform a case-sensitive match using compiled RE patterns. + * Returns a list of matched names, or NULL if an error occurred. * * Parameters * - * matcher A reference to the 'match()' method of a compiled pattern. - * names An iterable of strings (str or bytes objects) to match. - * - * Returns a list of matched names, or NULL if an error occurred. 
-*/ + * matcher A reference to the 'match()' method of a compiled pattern. + * names An iterable of strings (str or bytes objects) to match. + */ extern PyObject * _Py_fnmatch_filter(PyObject *matcher, PyObject *names); @@ -62,7 +64,6 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names); * instead. The returned values are however a sub-sequence of 'names'. * * The 'normcase' argument is a callable implementing os.path.normcase(). - * */ extern PyObject * _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase); @@ -70,6 +71,9 @@ _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *norm /* * C accelerator for translating UNIX shell patterns into RE patterns. * + * The 'pattern' must be a Unicode object (not a bytes object), + * and the translated pattern will be a Unicode object as well. + * * Note: this is the C implementation of fnmatch.translate(). */ extern PyObject * diff --git a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h index 5250bddbecc273..c611f01673b326 100644 --- a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h +++ b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h @@ -8,19 +8,20 @@ preserve #endif #include "pycore_modsupport.h" // _PyArg_UnpackKeywords() -PyDoc_STRVAR(_fnmatch_filter__doc__, +PyDoc_STRVAR(fnmatch_filter__doc__, "filter($module, /, names, pat)\n" "--\n" -"\n"); +"\n" +"Construct a list from the names in *names* matching *pat*."); -#define _FNMATCH_FILTER_METHODDEF \ - {"filter", _PyCFunction_CAST(_fnmatch_filter), METH_FASTCALL|METH_KEYWORDS, _fnmatch_filter__doc__}, +#define FNMATCH_FILTER_METHODDEF \ + {"filter", _PyCFunction_CAST(fnmatch_filter), METH_FASTCALL|METH_KEYWORDS, fnmatch_filter__doc__}, static PyObject * -_fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pat); +fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern); static PyObject * -_fnmatch_filter(PyObject *module, PyObject *const *args, 
Py_ssize_t nargs, PyObject *kwnames) +fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) @@ -50,33 +51,47 @@ _fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj #undef KWTUPLE PyObject *argsbuf[2]; PyObject *names; - PyObject *pat; + PyObject *pattern; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); if (!args) { goto exit; } names = args[0]; - pat = args[1]; - return_value = _fnmatch_filter_impl(module, names, pat); + pattern = args[1]; + return_value = fnmatch_filter_impl(module, names, pattern); exit: return return_value; } -PyDoc_STRVAR(_fnmatch_fnmatch__doc__, +PyDoc_STRVAR(fnmatch_fnmatch__doc__, "fnmatch($module, /, name, pat)\n" "--\n" -"\n"); +"\n" +"Test whether *name* matches *pat*.\n" +"\n" +"Patterns are Unix shell style:\n" +"\n" +"* matches everything\n" +"? matches any single character\n" +"[seq] matches any character in seq\n" +"[!seq] matches any char not in seq\n" +"\n" +"An initial period in *name* is not special.\n" +"Both *name* and *pat* are first case-normalized\n" +"if the operating system requires it.\n" +"\n" +"If you don\'t want this, use fnmatchcase(name, pat)."); -#define _FNMATCH_FNMATCH_METHODDEF \ - {"fnmatch", _PyCFunction_CAST(_fnmatch_fnmatch), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatch__doc__}, +#define FNMATCH_FNMATCH_METHODDEF \ + {"fnmatch", _PyCFunction_CAST(fnmatch_fnmatch), METH_FASTCALL|METH_KEYWORDS, fnmatch_fnmatch__doc__}, static int -_fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pat); +fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pattern); static PyObject * -_fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; 
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) @@ -106,7 +121,7 @@ _fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb #undef KWTUPLE PyObject *argsbuf[2]; PyObject *name; - PyObject *pat; + PyObject *pattern; int _return_value; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); @@ -114,8 +129,8 @@ _fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb goto exit; } name = args[0]; - pat = args[1]; - _return_value = _fnmatch_fnmatch_impl(module, name, pat); + pattern = args[1]; + _return_value = fnmatch_fnmatch_impl(module, name, pattern); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -125,23 +140,23 @@ _fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyOb return return_value; } -PyDoc_STRVAR(_fnmatch_fnmatchcase__doc__, +PyDoc_STRVAR(fnmatch_fnmatchcase__doc__, "fnmatchcase($module, /, name, pat)\n" "--\n" "\n" -"Test whether `name` matches `pattern`, including case.\n" +"Test whether *name* matches *pat*, including case.\n" "\n" "This is a version of fnmatch() which doesn\'t case-normalize\n" "its arguments."); -#define _FNMATCH_FNMATCHCASE_METHODDEF \ - {"fnmatchcase", _PyCFunction_CAST(_fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, _fnmatch_fnmatchcase__doc__}, +#define FNMATCH_FNMATCHCASE_METHODDEF \ + {"fnmatchcase", _PyCFunction_CAST(fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, fnmatch_fnmatchcase__doc__}, static int -_fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pat); +fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pattern); static PyObject * -_fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) @@ -171,7 +186,7 @@ 
_fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, #undef KWTUPLE PyObject *argsbuf[2]; PyObject *name; - PyObject *pat; + PyObject *pattern; int _return_value; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); @@ -179,8 +194,8 @@ _fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, goto exit; } name = args[0]; - pat = args[1]; - _return_value = _fnmatch_fnmatchcase_impl(module, name, pat); + pattern = args[1]; + _return_value = fnmatch_fnmatchcase_impl(module, name, pattern); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -190,19 +205,22 @@ _fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, return return_value; } -PyDoc_STRVAR(_fnmatch_translate__doc__, +PyDoc_STRVAR(fnmatch_translate__doc__, "translate($module, /, pat)\n" "--\n" -"\n"); +"\n" +"Translate a shell pattern *pat* to a regular expression.\n" +"\n" +"There is no way to quote meta-characters."); -#define _FNMATCH_TRANSLATE_METHODDEF \ - {"translate", _PyCFunction_CAST(_fnmatch_translate), METH_FASTCALL|METH_KEYWORDS, _fnmatch_translate__doc__}, +#define FNMATCH_TRANSLATE_METHODDEF \ + {"translate", _PyCFunction_CAST(fnmatch_translate), METH_FASTCALL|METH_KEYWORDS, fnmatch_translate__doc__}, static PyObject * -_fnmatch_translate_impl(PyObject *module, PyObject *pattern); +fnmatch_translate_impl(PyObject *module, PyObject *pattern); static PyObject * -_fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) @@ -238,9 +256,9 @@ _fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, Py goto exit; } pattern = args[0]; - return_value = _fnmatch_translate_impl(module, pattern); + return_value = fnmatch_translate_impl(module, 
pattern); exit: return return_value; } -/*[clinic end generated code: output=d9bb3df00c5c2b5e input=a9049054013a1b77]*/ +/*[clinic end generated code: output=50f858ef4bfb569a input=a9049054013a1b77]*/ diff --git a/Modules/_fnmatch/regex.c b/Modules/_fnmatch/matcher.c similarity index 85% rename from Modules/_fnmatch/regex.c rename to Modules/_fnmatch/matcher.c index 1b4d55943ef7a5..899fe56ee063d3 100644 --- a/Modules/_fnmatch/regex.c +++ b/Modules/_fnmatch/matcher.c @@ -1,9 +1,11 @@ -#include "_fnmatchmodule.h" // for pre-declarations +/* + * Provide the implementation of the high-level matcher-based functions. + */ -// ==== API implementation ==================================================== +#include "_fnmatchmodule.h" inline int -_Py_fnmatch_fnmatch(PyObject *matcher, PyObject *name) +_Py_fnmatch_match(PyObject *matcher, PyObject *name) { // If 'name' is of incorrect type, it will be detected when calling // the matcher function (we emulate 're.compile(...).match(name)'). @@ -30,7 +32,7 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names) } PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - int matching = _Py_fnmatch_fnmatch(matcher, name); + int matching = _Py_fnmatch_match(matcher, name); if (matching < 0 || (matching == 1 && PyList_Append(res, name) < 0)) { goto abort; } @@ -63,7 +65,7 @@ _Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *norm if (normalized == NULL) { goto abort; } - int matching = _Py_fnmatch_fnmatch(matcher, normalized); + int matching = _Py_fnmatch_match(matcher, normalized); Py_DECREF(normalized); // add the non-normalized name if its normalization matches if (matching < 0 || (matching == 1 && PyList_Append(res, name) < 0)) { diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 7dd5fae64ebe90..f20740bbbb4d37 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -1,7 +1,6 @@ /* * C accelerator for the translation function from UNIX shell 
patterns - * to RE patterns. This accelerator is platform-independent but can be - * disabled on demand. + * to RE patterns. */ #include "_fnmatchmodule.h" // for get_fnmatchmodulestate_state() @@ -9,6 +8,10 @@ #include "pycore_call.h" // ==== Macro definitions ===================================================== +// +// The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs +// since they directly delegate to the _PyUnicodeWriter_Write* underlying +// function. #define _WRITE_OR_FAIL(writeop, onerror) \ do { \ @@ -17,23 +20,31 @@ } \ } while (0) +/* write a character 'ch' */ #define _WRITE_CHAR(writer, ch) \ _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(writer), (ch)) +/* write a character 'ch', or execute 'onerror' if it fails */ #define _WRITE_CHAR_OR(writer, ch, onerror) \ _WRITE_OR_FAIL(_WRITE_CHAR((writer), (ch)), onerror) +/* write an ASCII 'string' of given 'length' */ #define _WRITE_ASCII(writer, ascii, length) \ _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(writer), (ascii), (length)) +/* write an ASCII 'string' of given 'length', or execute 'onerror' if it fails */ #define _WRITE_ASCII_OR(writer, ascii, length, onerror) \ _WRITE_OR_FAIL(_WRITE_ASCII((writer), (ascii), (length)), onerror) +/* write a 'string' */ #define _WRITE_STRING(writer, string) \ _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(writer), (string)) +/* write a 'string', or execute 'onerror' if it fails */ #define _WRITE_STRING_OR(writer, string, onerror) \ _WRITE_OR_FAIL(_WRITE_STRING((writer), (string)), onerror) +/* write the substring string[i:j] */ #define _WRITE_BLOCK(writer, string, i, j) \ _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(writer), (string), (i), (j)) +/* write the substring string[i:j] if i < j, or execute 'onerror' if it fails */ #define _WRITE_BLOCK_OR(writer, string, i, j, onerror) \ do { \ Py_ssize_t _i = (i), _j = (j); /* to allow in-place operators on i or j */ \ @@ -68,7 +79,8 @@ get_unicode_character(Py_UCS4 ch); * values 
for '[!1-5]' are 10 (not 9) and 13 respectively. */ static PyObject * -translate_expression(PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); +translate_expression(fnmatchmodule_state *state, + PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); /* * Write an escaped string using re.escape(). @@ -76,7 +88,8 @@ translate_expression(PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); * This returns the number of written characters, or -1 if an error occurred. */ static Py_ssize_t -write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *unicode); +write_literal(fnmatchmodule_state *state, + PyUnicodeWriter *writer, PyObject *literal); /* * Write the translated pattern obtained by translate_expression(). @@ -84,7 +97,8 @@ write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *uni * This returns the number of written characters, or -1 if an error occurred. */ static Py_ssize_t -write_expression(PyUnicodeWriter *writer, PyObject *expression); +write_expression(fnmatchmodule_state *state, + PyUnicodeWriter *writer, PyObject *expression); /* * Build the final regular expression by processing the wildcards. @@ -92,33 +106,15 @@ write_expression(PyUnicodeWriter *writer, PyObject *expression); * The position of each wildcard in 'pattern' is given by 'indices'. 
*/ static PyObject * -process_wildcards(PyObject *pattern, PyObject *indices); +process_wildcards(fnmatchmodule_state *state, + PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== PyObject * _Py_fnmatch_translate(PyObject *module, PyObject *pattern) { -#define READ(ind) PyUnicode_READ(kind, data, (ind)) -#define ADVANCE_IF_CHAR(ch, ind, maxind) \ - do { \ - /* the following forces ind to be a variable name */ \ - Py_ssize_t *Py_UNUSED(_ind) = &ind; \ - if ((ind) < (maxind) && READ(ind) == (ch)) { \ - ++ind; \ - } \ - } while (0) -#define _WHILE_READ_CMP(ch, ind, maxind, cmp) \ - do { \ - /* the following forces ind to be a variable name */ \ - Py_ssize_t *Py_UNUSED(_ind) = &ind; \ - while ((ind) < (maxind) && READ(ind) cmp (ch)) { \ - ++ind; \ - } \ - } while (0) -#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), !=) -#define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), ==) - + assert(PyUnicode_Check(pattern)); fnmatchmodule_state *state = get_fnmatchmodule_state(module); PyObject *re = state->re_module; const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); @@ -138,7 +134,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // estimate the number of characters to be written to be the // same as the number of characters in the pattern. // - // TODO: (picnixz): should we limit the estimation in case of a failure? + // TODO: (picnixz): should we limit the estimation? 
PyUnicodeWriter *writer = PyUnicodeWriter_Create(n); if (writer == NULL) { return NULL; @@ -150,14 +146,38 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } const int kind = PyUnicode_KIND(pattern); const void *data = PyUnicode_DATA(pattern); - // i is the current index, wi is the index of a wildcard - Py_ssize_t i = 0, wi = 0; + /* declaration of some local helping macros */ +#define READ(ind) PyUnicode_READ(kind, data, (ind)) + /* advance 'ind' if the character is 'ch' */ +#define ADVANCE_IF_CHAR(ch, ind, maxind) \ + do { \ + /* the following forces ind to be a variable name */ \ + Py_ssize_t *Py_UNUSED(_ind) = &ind; \ + if ((ind) < (maxind) && READ(ind) == (ch)) { \ + ++ind; \ + } \ + } while (0) + /* advance 'ind' until the character compares to 'READ[ind] CMPOP ch' */ +#define _WHILE_READ_CMP(ch, ind, maxind, CMPOP) \ + do { \ + /* the following forces ind to be a variable name */ \ + Py_ssize_t *Py_UNUSED(_ind) = &ind; \ + while ((ind) < (maxind) && READ(ind) CMPOP (ch)) { \ + ++ind; \ + } \ + } while (0) + /* advance 'from' as long as READ(from) != ch */ +#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), !=) + /* advance 'from' as long as READ(from) == ch */ +#define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), ==) + Py_ssize_t i = 0; // current index + Py_ssize_t wi = 0; // number of characters written while (i < n) { // read and advance to the next character Py_UCS4 chr = READ(i++); switch (chr) { case '*': { - _WRITE_CHAR_OR(writer, chr, goto abort); + _WRITE_CHAR_OR(writer, '*', goto abort); SKIP_DUPLICATES('*', i, n); PyObject *index = PyLong_FromSsize_t(wi++); if (index == NULL) { @@ -182,7 +202,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ADVANCE_IF_CHAR(']', j, n); // [!] 
or [] ADVANCE_TO_NEXT(']', j, n); // locate closing ']' if (j >= n) { - _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); + _WRITE_CHAR_OR(writer, '\\', goto abort); + _WRITE_CHAR_OR(writer, '[', goto abort); wi += 2; // we just wrote 2 characters break; // early break for clarity } @@ -200,23 +221,27 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (s0 == NULL) { goto abort; } + // NOTE(picnixz): maybe cache the method and intern the arguments? + // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); Py_DECREF(s0); } else { assert(pos >= 0); assert(READ(j) == ']'); - s1 = translate_expression(pattern, i, j); + s1 = translate_expression(state, pattern, i, j); } if (s1 == NULL) { goto abort; } + // NOTE(picnixz): maybe cache the method and intern the arguments? + // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() s2 = _PyObject_CallMethod(re, &_Py_ID(sub), "ssO", "([&~|])", "\\\\\\1", s1); Py_DECREF(s1); if (s2 == NULL) { goto abort; } - Py_ssize_t difflen = write_expression(writer, s2); + Py_ssize_t difflen = write_expression(state, writer, s2); Py_DECREF(s2); if (difflen < 0) { goto abort; @@ -251,7 +276,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) Py_DECREF(indices); return NULL; } - PyObject *res = process_wildcards(translated, indices); + PyObject *res = process_wildcards(state, translated, indices); Py_DECREF(translated); Py_DECREF(indices); return res; @@ -289,7 +314,8 @@ get_unicode_character(Py_UCS4 ch) } static PyObject * -translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) +translate_expression(fnmatchmodule_state *state, + PyObject *pattern, Py_ssize_t i, Py_ssize_t j) { PyObject *chunks = PyList_New(0); if (chunks == NULL) { @@ -329,12 +355,7 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) assert(chunkscount > 0); PyObject *chunk = PyList_GET_ITEM(chunks, chunkscount - 1); assert(chunk != 
NULL); - PyObject *hyphen = PyUnicode_FromOrdinal('-'); - if (hyphen == NULL) { - goto abort; - } - PyObject *repl = PyUnicode_Concat(chunk, hyphen); - Py_DECREF(hyphen); + PyObject *repl = PyUnicode_Concat(chunk, state->hyphen_str); // PyList_SetItem() does not create a new reference on 'repl' // so we should not decref 'repl' after the call, unless there // is an issue while setting the item. @@ -400,10 +421,14 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) for (c = 0; c < chunkscount; ++c) { PyObject *s0 = PyList_GET_ITEM(chunks, c); assert(s0 != NULL); + // NOTE(picnixz): maybe cache the method and intern the arguments? + // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() PyObject *s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); if (s1 == NULL) { goto abort; } + // NOTE(picnixz): maybe cache the method and intern the arguments? + // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() PyObject *s2 = _PyObject_CallMethod(s1, &_Py_ID(replace), "ss", "-", "\\-"); Py_DECREF(s1); // PyList_SetItem() does not create a new reference on 's2' @@ -431,11 +456,11 @@ translate_expression(PyObject *pattern, Py_ssize_t i, Py_ssize_t j) } static Py_ssize_t -write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *unicode) +write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *literal) { PyObject *escaped = PyObject_CallMethodOneArg(state->re_module, &_Py_ID(escape), - unicode); + literal); if (escaped == NULL) { return -1; } @@ -451,7 +476,8 @@ write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *uni } static Py_ssize_t -write_expression(PyUnicodeWriter *writer, PyObject *expression) +write_expression(fnmatchmodule_state *state, + PyUnicodeWriter *writer, PyObject *expression) { #define WRITE_CHAR(c) _WRITE_CHAR_OR(writer, (c), return -1) #define WRITE_ASCII(s, n) _WRITE_ASCII_OR(writer, (s), (n), return -1) @@ -460,7 +486,7 @@ 
write_expression(PyUnicodeWriter *writer, PyObject *expression) Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { /* empty range: never match */ - WRITE_ASCII("(?!)", 4); + WRITE_STRING(state->re_empty_range_str); return 4; } Py_UCS4 token = PyUnicode_READ_CHAR(expression, 0); @@ -498,11 +524,12 @@ write_expression(PyUnicodeWriter *writer, PyObject *expression) } static PyObject * -process_wildcards(PyObject *pattern, PyObject *indices) +process_wildcards(fnmatchmodule_state *state, + PyObject *pattern, PyObject *indices) { const Py_ssize_t m = PyList_GET_SIZE(indices); if (m == 0) { - // just write fr'(?s:{parts} + ")\Z" + // "(?s:" + pattern + ")\Z" return PyUnicode_FromFormat("(?s:%S)\\Z", pattern); } /* @@ -523,9 +550,9 @@ process_wildcards(PyObject *pattern, PyObject *indices) * We write one additional '.*' if indices[-1] + 1 == n. * * Since the result is surrounded by "(?s:" and ")\Z", we - * write at least "indices[0] + 7m + n + 6" characters, + * write at least "indices[0] + 7*m + n + 6" characters, * where 'm' is the number of stars and 'n' the length - * of the translated pattern. + * of the /translated) pattern. */ PyObject *jobj = PyList_GET_ITEM(indices, 0); assert(jobj != NULL); @@ -538,12 +565,10 @@ process_wildcards(PyObject *pattern, PyObject *indices) if (writer == NULL) { return NULL; } -#define WRITE_BLOCK(i, j) _WRITE_BLOCK_OR(writer, pattern, (i), (j), goto abort) -#define WRITE_ATOMIC_BEGIN() _WRITE_ASCII_OR(writer, "(?>.*?", 6, goto abort) -#define WRITE_ATOMIC_END() _WRITE_CHAR_OR(writer, ')', goto abort) - WRITE_BLOCK(i, j); // write stuff before '*' if needed + _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); i = j + 1; // jump after the '*' for (Py_ssize_t k = 1; k < m; ++k) { + // process all but the last wildcard. 
PyObject *ind = PyList_GET_ITEM(indices, k); assert(ind != NULL); j = PyLong_AsSsize_t(ind); @@ -551,25 +576,24 @@ process_wildcards(PyObject *pattern, PyObject *indices) goto abort; } assert(i < j); - // write the atomic RE group - WRITE_ATOMIC_BEGIN(); - WRITE_BLOCK(i, j); - WRITE_ATOMIC_END(); + // write the atomic RE group '(?>.*?' + BLOCK + ')' + _WRITE_STRING_OR(writer, state->re_atomic_bgroup_str, goto abort); + _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); + _WRITE_CHAR_OR(writer, ')', goto abort); i = j + 1; } - // handle the last group - _WRITE_ASCII_OR(writer, ".*", 2, goto abort); - WRITE_BLOCK(i, n); // write the remaining substring (if non-empty) -#undef WRITE_BLOCK -#undef WRITE_ATOMIC_END -#undef WRITE_ATOMIC_BEGIN - PyObject *res = PyUnicodeWriter_Finish(writer); - if (res == NULL) { + // handle the remaining wildcard + _WRITE_STRING_OR(writer, state->re_wildcard_str, goto abort); + // write the remaining substring (if non-empty) + _WRITE_BLOCK_OR(writer, pattern, i, n, goto abort); + PyObject *processed = PyUnicodeWriter_Finish(writer); + if (processed == NULL) { return NULL; } - PyObject *formatted = PyUnicode_FromFormat("(?s:%S)\\Z", res); - Py_DECREF(res); - return formatted; + // "(?s:" + processed + ")\Z" + PyObject *res = PyUnicode_FromFormat("(?s:%S)\\Z", processed); + Py_DECREF(processed); + return res; abort: PyUnicodeWriter_Discard(writer); return NULL; From 5374ff45aee1a274120a6a60e6ee4974ff477321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 12 Jul 2024 14:50:49 +0200 Subject: [PATCH 54/97] revert some improvements that were not improvements --- Modules/_fnmatch/_fnmatchmodule.c | 9 ------- Modules/_fnmatch/_fnmatchmodule.h | 3 --- Modules/_fnmatch/translate.c | 42 +++++++++++++------------------ 3 files changed, 17 insertions(+), 37 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 
62a2e7b7f1ed40..114d3ca6dc252d 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -130,9 +130,6 @@ fnmatchmodule_exec(PyObject *module) return -1; } INTERN_STRING(st, hyphen_str, "-"); - INTERN_STRING(st, re_empty_range_str, "(?!)"); - INTERN_STRING(st, re_atomic_bgroup_str, "(?>.*?"); - INTERN_STRING(st, re_wildcard_str, ".*"); return 0; } #undef INTERN_STRING @@ -142,9 +139,6 @@ static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); - Py_VISIT(st->re_wildcard_str); - Py_VISIT(st->re_atomic_bgroup_str); - Py_VISIT(st->re_empty_range_str); Py_VISIT(st->hyphen_str); Py_VISIT(st->translator); Py_VISIT(st->re_module); @@ -157,9 +151,6 @@ static int fnmatchmodule_clear(PyObject *m) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); - Py_CLEAR(st->re_wildcard_str); - Py_CLEAR(st->re_atomic_bgroup_str); - Py_CLEAR(st->re_empty_range_str); Py_CLEAR(st->hyphen_str); Py_CLEAR(st->translator); Py_CLEAR(st->re_module); diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 4169967e0961af..bae2908969c7d0 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -16,9 +16,6 @@ typedef struct { // strings used by translate.c PyObject *hyphen_str; // hyphen glyph '-' - PyObject *re_empty_range_str; // RE empty range '(?!)' - PyObject *re_atomic_bgroup_str; // RE atomic group begin '(?>.*?' - PyObject *re_wildcard_str; // RE wildcard '.*' } fnmatchmodule_state; static inline fnmatchmodule_state * diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index f20740bbbb4d37..0c3d0757a62174 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -97,8 +97,7 @@ write_literal(fnmatchmodule_state *state, * This returns the number of written characters, or -1 if an error occurred. 
*/ static Py_ssize_t -write_expression(fnmatchmodule_state *state, - PyUnicodeWriter *writer, PyObject *expression); +write_expression(PyUnicodeWriter *writer, PyObject *expression); /* * Build the final regular expression by processing the wildcards. @@ -106,8 +105,7 @@ write_expression(fnmatchmodule_state *state, * The position of each wildcard in 'pattern' is given by 'indices'. */ static PyObject * -process_wildcards(fnmatchmodule_state *state, - PyObject *pattern, PyObject *indices); +process_wildcards(PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== @@ -202,8 +200,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ADVANCE_IF_CHAR(']', j, n); // [!] or [] ADVANCE_TO_NEXT(']', j, n); // locate closing ']' if (j >= n) { - _WRITE_CHAR_OR(writer, '\\', goto abort); - _WRITE_CHAR_OR(writer, '[', goto abort); + _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); wi += 2; // we just wrote 2 characters break; // early break for clarity } @@ -221,7 +218,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (s0 == NULL) { goto abort; } - // NOTE(picnixz): maybe cache the method and intern the arguments? + // NOTE(picnixz): maybe cache the method and intern the arguments // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); Py_DECREF(s0); @@ -234,14 +231,14 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (s1 == NULL) { goto abort; } - // NOTE(picnixz): maybe cache the method and intern the arguments? 
+ // NOTE(picnixz): maybe cache the method and intern the arguments // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() s2 = _PyObject_CallMethod(re, &_Py_ID(sub), "ssO", "([&~|])", "\\\\\\1", s1); Py_DECREF(s1); if (s2 == NULL) { goto abort; } - Py_ssize_t difflen = write_expression(state, writer, s2); + Py_ssize_t difflen = write_expression(writer, s2); Py_DECREF(s2); if (difflen < 0) { goto abort; @@ -276,7 +273,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) Py_DECREF(indices); return NULL; } - PyObject *res = process_wildcards(state, translated, indices); + PyObject *res = process_wildcards(translated, indices); Py_DECREF(translated); Py_DECREF(indices); return res; @@ -421,13 +418,13 @@ translate_expression(fnmatchmodule_state *state, for (c = 0; c < chunkscount; ++c) { PyObject *s0 = PyList_GET_ITEM(chunks, c); assert(s0 != NULL); - // NOTE(picnixz): maybe cache the method and intern the arguments? + // NOTE(picnixz): maybe cache the method and intern the arguments // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() PyObject *s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); if (s1 == NULL) { goto abort; } - // NOTE(picnixz): maybe cache the method and intern the arguments? 
+ // NOTE(picnixz): maybe cache the method and intern the arguments // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() PyObject *s2 = _PyObject_CallMethod(s1, &_Py_ID(replace), "ss", "-", "\\-"); Py_DECREF(s1); @@ -456,7 +453,8 @@ translate_expression(fnmatchmodule_state *state, } static Py_ssize_t -write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *literal) +write_literal(fnmatchmodule_state *state, + PyUnicodeWriter *writer, PyObject *literal) { PyObject *escaped = PyObject_CallMethodOneArg(state->re_module, &_Py_ID(escape), @@ -476,17 +474,14 @@ write_literal(fnmatchmodule_state *state, PyUnicodeWriter *writer, PyObject *lit } static Py_ssize_t -write_expression(fnmatchmodule_state *state, - PyUnicodeWriter *writer, PyObject *expression) +write_expression(PyUnicodeWriter *writer, PyObject *expression) { #define WRITE_CHAR(c) _WRITE_CHAR_OR(writer, (c), return -1) -#define WRITE_ASCII(s, n) _WRITE_ASCII_OR(writer, (s), (n), return -1) -#define WRITE_BLOCK(s, i, j) _WRITE_BLOCK_OR(writer, (s), (i), (j), return -1) #define WRITE_STRING(s) _WRITE_STRING_OR(writer, (s), return -1) Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { /* empty range: never match */ - WRITE_STRING(state->re_empty_range_str); + _WRITE_ASCII_OR(writer, "(?!)", 4, return -1); return 4; } Py_UCS4 token = PyUnicode_READ_CHAR(expression, 0); @@ -500,7 +495,7 @@ write_expression(fnmatchmodule_state *state, switch (token) { case '!': { WRITE_CHAR('^'); // replace '!' 
by '^' - WRITE_BLOCK(expression, 1, grouplen); + _WRITE_BLOCK_OR(writer, expression, 1, grouplen, return -1); break; } case '^': @@ -518,14 +513,11 @@ write_expression(fnmatchmodule_state *state, WRITE_CHAR(']'); return grouplen + extra; #undef WRITE_STRING -#undef WRITE_BLOCK -#undef WRITE_ASCII #undef WRITE_CHAR } static PyObject * -process_wildcards(fnmatchmodule_state *state, - PyObject *pattern, PyObject *indices) +process_wildcards(PyObject *pattern, PyObject *indices) { const Py_ssize_t m = PyList_GET_SIZE(indices); if (m == 0) { @@ -577,13 +569,13 @@ process_wildcards(fnmatchmodule_state *state, } assert(i < j); // write the atomic RE group '(?>.*?' + BLOCK + ')' - _WRITE_STRING_OR(writer, state->re_atomic_bgroup_str, goto abort); + _WRITE_ASCII_OR(writer, "(?>.*?", 6, goto abort); _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); _WRITE_CHAR_OR(writer, ')', goto abort); i = j + 1; } // handle the remaining wildcard - _WRITE_STRING_OR(writer, state->re_wildcard_str, goto abort); + _WRITE_ASCII_OR(writer, ".*", 2, goto abort); // write the remaining substring (if non-empty) _WRITE_BLOCK_OR(writer, pattern, i, n, goto abort); PyObject *processed = PyUnicodeWriter_Finish(writer); From 4ff4f370284b064abbfb50a1459aeda6efb6e8c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 13 Jul 2024 09:49:49 +0200 Subject: [PATCH 55/97] remove incorrect usage of clinic --- Modules/_fnmatch/_fnmatchmodule.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 114d3ca6dc252d..92a24a9681ab02 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -330,24 +330,16 @@ fnmatch_translate_impl(PyObject *module, PyObject *pattern) // ==== Module specs ========================================================== -/*[python input] -import fnmatch -import textwrap 
-fmt = 'PyDoc_STRVAR(fnmatchmodule_doc,\n"%s");' -print(fmt % '\\n\\\n'.join(fnmatch.__doc__.splitlines())) -[python start generated code]*/ +// fmt: off PyDoc_STRVAR(fnmatchmodule_doc, -"Filename matching with shell patterns.\n\ -\n\ -fnmatch(FILENAME, PATTERN) matches according to the local convention.\n\ -fnmatchcase(FILENAME, PATTERN) always takes case in account.\n\ -\n\ -The functions operate by translating the pattern into a regular\n\ -expression. They cache the compiled regular expressions for speed.\n\ -\n\ -The function translate(PATTERN) returns a regular expression\n\ -corresponding to PATTERN. (It does not compile it.)"); -/*[python end generated code: output=b5d0696157f04882 input=8dfe2add227b2686]*/ +"Filename matching with shell patterns.\n" +"fnmatch(FILENAME, PATTERN) matches according to the local convention.\n" +"fnmatchcase(FILENAME, PATTERN) always takes case in account.\n\n" +"The functions operate by translating the pattern into a regular\n" +"expression. They cache the compiled regular expressions for speed.\n\n" +"The function translate(PATTERN) returns a regular expression\n" +"corresponding to PATTERN. (It does not compile it.)"); +// fmt: on static PyMethodDef fnmatchmodule_methods[] = { FNMATCH_FILTER_METHODDEF From 178f2d3599ddc2cc79599ab005a845e7a288410c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 13 Jul 2024 10:12:31 +0200 Subject: [PATCH 56/97] update TODO note --- Modules/_fnmatch/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 0c3d0757a62174..00ad81c030a9e5 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -132,7 +132,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // estimate the number of characters to be written to be the // same as the number of characters in the pattern. 
// - // TODO: (picnixz): should we limit the estimation? + // TODO(picnixz): should we limit the estimation? PyUnicodeWriter *writer = PyUnicodeWriter_Create(n); if (writer == NULL) { return NULL; From 9d237b13be2eac358e36f3cd5fbebe55a8cf3017 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:31:08 +0200 Subject: [PATCH 57/97] use a dynamic module instead --- Modules/Setup.bootstrap.in | 3 --- 1 file changed, 3 deletions(-) diff --git a/Modules/Setup.bootstrap.in b/Modules/Setup.bootstrap.in index 35198091329d01..4dcc0f55176d0e 100644 --- a/Modules/Setup.bootstrap.in +++ b/Modules/Setup.bootstrap.in @@ -35,8 +35,5 @@ _operator _operator.c _stat _stat.c _symtable symtablemodule.c -# miscellaneous accelerators -_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/matcher.c _fnmatch/translate.c - # for systems without $HOME env, used by site._getuserbase() @MODULE_PWD_TRUE@pwd pwdmodule.c From b1568d462b63ff1e263136b47a16d40d142d1067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:31:14 +0200 Subject: [PATCH 58/97] remove un-necessary definitions --- Makefile.pre.in | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Makefile.pre.in b/Makefile.pre.in index bd5e471c50bd33..52e32297d3685c 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -351,13 +351,6 @@ IO_OBJS= \ Modules/_io/bytesio.o \ Modules/_io/stringio.o -FNMATCH_H= Modules/_fnmatch/_fnmatchmodule.h - -FNMATCH_OBJS= \ - Modules/_fnmatch/_fnmatchmodule.o \ - Modules/_fnmatch/matcher.o \ - Modules/_fnmatch/translate.o - ########################################################################## # mimalloc @@ -1747,8 +1740,6 @@ Python/sysmodule.o: $(srcdir)/Python/sysmodule.c Makefile $(srcdir)/Include/pydt $(MULTIARCH_CPPFLAGS) \ -o $@ $(srcdir)/Python/sysmodule.c -$(FNMATCH_OBJS): $(FNMATCH_H) - $(IO_OBJS): $(IO_H) .PHONY: 
regen-pegen-metaparser From 84b0b82cf00545ea60ceade2a9c47be2c23bebf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:32:17 +0200 Subject: [PATCH 59/97] update configuration --- Modules/Setup.stdlib.in | 1 + Modules/_fnmatch/_fnmatchmodule.c | 3 ++ Modules/_fnmatch/_fnmatchmodule.h | 4 +++ configure | 56 +++++++++++++++---------------- configure.ac | 2 +- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index dfc75077650df8..e689d18b70b035 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -33,6 +33,7 @@ @MODULE__BISECT_TRUE@_bisect _bisectmodule.c @MODULE__CONTEXTVARS_TRUE@_contextvars _contextvarsmodule.c @MODULE__CSV_TRUE@_csv _csv.c +@MODULE__FNMATCH_TRUE@_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/matcher.c _fnmatch/translate.c @MODULE__HEAPQ_TRUE@_heapq _heapqmodule.c @MODULE__JSON_TRUE@_json _json.c @MODULE__LSPROF_TRUE@_lsprof _lsprof.c rotatingtree.c diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 92a24a9681ab02..3c14745859c91b 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,4 +1,7 @@ #include "_fnmatchmodule.h" + +#include "pycore_runtime.h" // _Py_ID() + #include "clinic/_fnmatchmodule.c.h" #define COMPILED_CACHE_SIZE 32768 diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index bae2908969c7d0..c4ce1d23201156 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -5,6 +5,10 @@ #ifndef _FNMATCHMODULE_H #define _FNMATCHMODULE_H +#ifndef Py_BUILD_CORE_BUILTIN +# define Py_BUILD_CORE_MODULE 1 +#endif + #include "Python.h" typedef struct { diff --git a/configure b/configure index f1dcfa1fa1dbcd..09042aaf0507bd 100755 --- a/configure +++ b/configure @@ -801,6 +801,8 @@ MODULE__JSON_FALSE MODULE__JSON_TRUE MODULE__HEAPQ_FALSE 
MODULE__HEAPQ_TRUE +MODULE__FNMATCH_FALSE +MODULE__FNMATCH_TRUE MODULE__CSV_FALSE MODULE__CSV_TRUE MODULE__CONTEXTVARS_FALSE @@ -815,8 +817,6 @@ MODULE_TIME_FALSE MODULE_TIME_TRUE MODULE__IO_FALSE MODULE__IO_TRUE -MODULE__FNMATCH_FALSE -MODULE__FNMATCH_TRUE MODULE_BUILDTYPE TEST_MODULES LIBB2_LIBS @@ -29018,28 +29018,6 @@ MODULE_BLOCK= - if test "$py_cv_module__fnmatch" != "n/a" -then : - py_cv_module__fnmatch=yes -fi - if test "$py_cv_module__fnmatch" = yes; then - MODULE__FNMATCH_TRUE= - MODULE__FNMATCH_FALSE='#' -else - MODULE__FNMATCH_TRUE='#' - MODULE__FNMATCH_FALSE= -fi - - as_fn_append MODULE_BLOCK "MODULE__FNMATCH_STATE=$py_cv_module__fnmatch$as_nl" - if test "x$py_cv_module__fnmatch" = xyes -then : - - as_fn_append MODULE_BLOCK "MODULE__FNMATCH_CFLAGS=-I\$(srcdir)/Modules/_fnmatch$as_nl" - - -fi - - if test "$py_cv_module__io" != "n/a" then : py_cv_module__io=yes @@ -29192,6 +29170,28 @@ then : +fi + + + if test "$py_cv_module__fnmatch" != "n/a" +then : + py_cv_module__fnmatch=yes +fi + if test "$py_cv_module__fnmatch" = yes; then + MODULE__FNMATCH_TRUE= + MODULE__FNMATCH_FALSE='#' +else + MODULE__FNMATCH_TRUE='#' + MODULE__FNMATCH_FALSE= +fi + + as_fn_append MODULE_BLOCK "MODULE__FNMATCH_STATE=$py_cv_module__fnmatch$as_nl" + if test "x$py_cv_module__fnmatch" = xyes +then : + + as_fn_append MODULE_BLOCK "MODULE__FNMATCH_CFLAGS=-I\$(srcdir)/Modules/_fnmatch$as_nl" + + fi @@ -31749,10 +31749,6 @@ LTLIBOBJS=$ac_ltlibobjs -if test -z "${MODULE__FNMATCH_TRUE}" && test -z "${MODULE__FNMATCH_FALSE}"; then - as_fn_error $? "conditional \"MODULE__FNMATCH\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi if test -z "${MODULE__IO_TRUE}" && test -z "${MODULE__IO_FALSE}"; then as_fn_error $? "conditional \"MODULE__IO\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 @@ -31781,6 +31777,10 @@ if test -z "${MODULE__CSV_TRUE}" && test -z "${MODULE__CSV_FALSE}"; then as_fn_error $? "conditional \"MODULE__CSV\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${MODULE__FNMATCH_TRUE}" && test -z "${MODULE__FNMATCH_FALSE}"; then + as_fn_error $? "conditional \"MODULE__FNMATCH\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${MODULE__HEAPQ_TRUE}" && test -z "${MODULE__HEAPQ_FALSE}"; then as_fn_error $? "conditional \"MODULE__HEAPQ\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 diff --git a/configure.ac b/configure.ac index 7b8fc287c56c35..a0e476437a827e 100644 --- a/configure.ac +++ b/configure.ac @@ -7681,7 +7681,6 @@ AC_DEFUN([PY_STDLIB_MOD_SIMPLE], [ ]) dnl static modules in Modules/Setup.bootstrap -PY_STDLIB_MOD_SIMPLE([_fnmatch], [-I\$(srcdir)/Modules/_fnmatch], []) PY_STDLIB_MOD_SIMPLE([_io], [-I\$(srcdir)/Modules/_io], []) PY_STDLIB_MOD_SIMPLE([time], [], [$TIMEMODULE_LIB]) @@ -7691,6 +7690,7 @@ PY_STDLIB_MOD_SIMPLE([_asyncio]) PY_STDLIB_MOD_SIMPLE([_bisect]) PY_STDLIB_MOD_SIMPLE([_contextvars]) PY_STDLIB_MOD_SIMPLE([_csv]) +PY_STDLIB_MOD_SIMPLE([_fnmatch], [-I\$(srcdir)/Modules/_fnmatch], []) PY_STDLIB_MOD_SIMPLE([_heapq]) PY_STDLIB_MOD_SIMPLE([_json]) PY_STDLIB_MOD_SIMPLE([_lsprof]) From 86960246867060d805d92fa8d11d9b43e3d14637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:34:09 +0200 Subject: [PATCH 60/97] intern strings & functions --- Modules/_fnmatch/_fnmatchmodule.c | 22 ++++++++++++++++++++++ Modules/_fnmatch/_fnmatchmodule.h | 17 ++++++++++++----- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 3c14745859c91b..e51283a4af60c6 100644 --- 
a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -56,6 +56,10 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) return -1; } PyObject *lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + if (lru_cache == NULL) { + Py_DECREF(maxsize); + return -1; + } PyObject *decorator = PyObject_CallFunctionObjArgs(lru_cache, maxsize, Py_True, NULL); Py_DECREF(lru_cache); Py_DECREF(maxsize); @@ -133,6 +137,11 @@ fnmatchmodule_exec(PyObject *module) return -1; } INTERN_STRING(st, hyphen_str, "-"); + INTERN_STRING(st, hyphen_esc_str, "\\-"); + INTERN_STRING(st, backslash_str, "\\"); + INTERN_STRING(st, backslash_esc_str, "\\\\"); + INTERN_STRING(st, inactive_toks_str, "([&~|])"); + INTERN_STRING(st, inactive_toks_repl_str, "\\\\\\1"); return 0; } #undef INTERN_STRING @@ -142,6 +151,11 @@ static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); + Py_VISIT(st->inactive_toks_repl_str); + Py_VISIT(st->inactive_toks_str); + Py_VISIT(st->backslash_esc_str); + Py_VISIT(st->backslash_str); + Py_VISIT(st->hyphen_esc_str); Py_VISIT(st->hyphen_str); Py_VISIT(st->translator); Py_VISIT(st->re_module); @@ -154,6 +168,11 @@ static int fnmatchmodule_clear(PyObject *m) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); + Py_CLEAR(st->inactive_toks_repl_str); + Py_CLEAR(st->inactive_toks_str); + Py_CLEAR(st->backslash_esc_str); + Py_CLEAR(st->backslash_str); + Py_CLEAR(st->hyphen_esc_str); Py_CLEAR(st->hyphen_str); Py_CLEAR(st->translator); Py_CLEAR(st->re_module); @@ -376,3 +395,6 @@ PyInit__fnmatch(void) { return PyModuleDef_Init(&_fnmatchmodule); } + +#undef INVALID_PATTERN_TYPE +#undef COMPILED_CACHE_SIZE diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index c4ce1d23201156..9c2dd217bb284b 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -12,14 +12,21 @@ #include 
"Python.h" typedef struct { - PyObject *os_module; // import os - PyObject *posixpath_module; // import posixpath - PyObject *re_module; // import re + PyObject *os_module; // import os + PyObject *posixpath_module; // import posixpath + PyObject *re_module; // import re - PyObject *translator; // LRU-cached translation unit + PyObject *translator; // LRU-cached translation unit // strings used by translate.c - PyObject *hyphen_str; // hyphen glyph '-' + PyObject *hyphen_str; // hyphen '-' + PyObject *hyphen_esc_str; // escaped hyphen '\\-' + + PyObject *backslash_str; // backslash '\\' + PyObject *backslash_esc_str; // escaped backslash '\\\\' + + PyObject *inactive_toks_str; // inactive tokens '([&~|])' + PyObject *inactive_toks_repl_str; // replacement pattern '\\\\\\1' } fnmatchmodule_state; static inline fnmatchmodule_state * From b27b6d89c475d35d4eb5dc478af36261e94dc9df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:34:43 +0200 Subject: [PATCH 61/97] various improvements - intern strings - local cached functions - split translate_expression() into helper functions --- Modules/_fnmatch/translate.c | 232 ++++++++++++++++++++++------------- 1 file changed, 147 insertions(+), 85 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 00ad81c030a9e5..eec3c6c14a8e65 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -82,15 +82,6 @@ static PyObject * translate_expression(fnmatchmodule_state *state, PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); -/* - * Write an escaped string using re.escape(). - * - * This returns the number of written characters, or -1 if an error occurred. - */ -static Py_ssize_t -write_literal(fnmatchmodule_state *state, - PyUnicodeWriter *writer, PyObject *literal); - /* * Write the translated pattern obtained by translate_expression(). 
* @@ -114,8 +105,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) { assert(PyUnicode_Check(pattern)); fnmatchmodule_state *state = get_fnmatchmodule_state(module); - PyObject *re = state->re_module; const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); + // We would write less data if there are successive '*', // which should not be the case in general. Otherwise, // we write >= n characters since escaping them always @@ -137,11 +128,25 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (writer == NULL) { return NULL; } + // list containing the indices where '*' has a special meaning - PyObject *indices = PyList_New(0); + PyObject *indices = NULL; + // cached functions (cache is local to the call) + PyObject *re_escape_func = NULL, *re_sub_func = NULL; + + indices = PyList_New(0); if (indices == NULL) { goto abort; } + re_escape_func = PyObject_GetAttr(state->re_module, &_Py_ID(escape)); + if (re_escape_func == NULL) { + goto abort; + } + re_sub_func = PyObject_GetAttr(state->re_module, &_Py_ID(sub)); + if (re_sub_func == NULL) { + goto abort; + } + const int kind = PyUnicode_KIND(pattern); const void *data = PyUnicode_DATA(pattern); /* declaration of some local helping macros */ @@ -218,9 +223,11 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (s0 == NULL) { goto abort; } - // NOTE(picnixz): maybe cache the method and intern the arguments - // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() - s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); + s1 = PyObject_CallMethodObjArgs( + s0, &_Py_ID(replace), + state->backslash_str, state->backslash_esc_str, + NULL + ); Py_DECREF(s0); } else { @@ -231,9 +238,13 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (s1 == NULL) { goto abort; } - // NOTE(picnixz): maybe cache the method and intern the arguments - // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() - s2 = _PyObject_CallMethod(re, &_Py_ID(sub), "ssO", 
"([&~|])", "\\\\\\1", s1); + s2 = PyObject_CallFunctionObjArgs( + re_sub_func, + state->inactive_toks_str, + state->inactive_toks_repl_str, + s1, + NULL + ); Py_DECREF(s1); if (s2 == NULL) { goto abort; @@ -253,12 +264,14 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (str == NULL) { goto abort; } - Py_ssize_t difflen = write_literal(state, writer, str); + PyObject *escchr = PyObject_CallOneArg(re_escape_func, str); Py_DECREF(str); - if (difflen < 0) { + if (escchr == NULL) { goto abort; } - wi += difflen; + _WRITE_STRING_OR(writer, escchr, Py_DECREF(escchr); goto abort); + wi += PyUnicode_GET_LENGTH(escchr); + Py_DECREF(escchr); break; } } @@ -268,6 +281,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) #undef _WHILE_READ_CMP #undef ADVANCE_IF_CHAR #undef READ + Py_DECREF(re_sub_func); + Py_DECREF(re_escape_func); PyObject *translated = PyUnicodeWriter_Finish(writer); if (translated == NULL) { Py_DECREF(indices); @@ -278,8 +293,10 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) Py_DECREF(indices); return res; abort: - PyUnicodeWriter_Discard(writer); + Py_XDECREF(re_sub_func); + Py_XDECREF(re_escape_func); Py_XDECREF(indices); + PyUnicodeWriter_Discard(writer); return NULL; } @@ -310,18 +327,36 @@ get_unicode_character(Py_UCS4 ch) return unicode; } +/* + * Extract a list of chunks from the pattern group described by i and j. + * + * See translate_expression() for its usage. 
+ */ static PyObject * -translate_expression(fnmatchmodule_state *state, - PyObject *pattern, Py_ssize_t i, Py_ssize_t j) +translate_expression_split(fnmatchmodule_state *state, + PyObject *pattern, Py_ssize_t i, Py_ssize_t j) { - PyObject *chunks = PyList_New(0); + PyObject *chunks = NULL; + // local cache for some objects + PyObject *str_find_func = NULL, *max_find_index = NULL; + + chunks = PyList_New(0); if (chunks == NULL) { - return NULL; + goto abort; + } + str_find_func = PyObject_GetAttr(pattern, &_Py_ID(find)); + if (str_find_func == NULL) { + goto abort; + } + max_find_index = PyLong_FromSsize_t(j); + if (max_find_index == NULL) { + goto abort; } + Py_ssize_t k = (PyUnicode_READ_CHAR(pattern, i) == '!') ? i + 2 : i + 1; - Py_ssize_t chunkscount = 0; while (k < j) { - PyObject *eobj = _PyObject_CallMethod(pattern, &_Py_ID(find), "sii", "-", k, j); + PyObject *eobj = PyObject_CallFunction( + str_find_func, "OnO", state->hyphen_str, k, max_find_index); if (eobj == NULL) { goto abort; } @@ -344,11 +379,12 @@ translate_expression(fnmatchmodule_state *state, if (rc < 0) { goto abort; } - chunkscount += 1; i = t + 1; k = t + 3; } + // handle the last group if (i >= j) { + Py_ssize_t chunkscount = PyList_GET_SIZE(chunks); assert(chunkscount > 0); PyObject *chunk = PyList_GET_ITEM(chunks, chunkscount - 1); assert(chunk != NULL); @@ -362,6 +398,7 @@ translate_expression(fnmatchmodule_state *state, } } else { + // add the remaining sub-pattern PyObject *sub = PyUnicode_Substring(pattern, i, j); if (sub == NULL) { goto abort; @@ -371,10 +408,26 @@ translate_expression(fnmatchmodule_state *state, if (rc < 0) { goto abort; } - chunkscount += 1; } - // remove empty ranges (they are not valid in RE) - Py_ssize_t c = chunkscount; + Py_DECREF(max_find_index); + Py_DECREF(str_find_func); + return chunks; +abort: + Py_XDECREF(max_find_index); + Py_XDECREF(str_find_func); + Py_XDECREF(chunks); + return NULL; +} + +/* + * Remove empty ranges (they are invalid in RE). 
+ * + * See translate_expression() for its usage. + */ +static int +translate_expression_simplify(fnmatchmodule_state *st, PyObject *chunks) +{ + Py_ssize_t c = PyList_GET_SIZE(chunks); while (--c) { PyObject *c1 = PyList_GET_ITEM(chunks, c - 1); assert(c1 != NULL); @@ -387,64 +440,94 @@ translate_expression(fnmatchmodule_state *state, assert(c2len > 0); if (PyUnicode_READ_CHAR(c1, c1len - 1) > PyUnicode_READ_CHAR(c2, 0)) { - // all but the last character in the chunk - PyObject *c1sub = PyUnicode_Substring(c1, 0, c1len - 1); - // all but the first character in the chunk - PyObject *c2sub = PyUnicode_Substring(c2, 1, c2len); - if (c1sub == NULL || c2sub == NULL) { - Py_XDECREF(c1sub); - Py_XDECREF(c2sub); - goto abort; + Py_ssize_t olen = c1len + c2len - 2; + assert(olen >= 0); + // see https://github.com/python/cpython/issues/114917 for + // why we need olen + 1 and not olen currently + PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen + 1); + if (writer == NULL) { + return -1; + } + // all but the last character in the first chunk + if (_WRITE_BLOCK(writer, c1, 0, c1len - 1) < 0) { + PyUnicodeWriter_Discard(writer); + return -1; + } + // all but the first character in the second chunk + if (_WRITE_BLOCK(writer, c2, 1, c2len) < 0) { + PyUnicodeWriter_Discard(writer); + return -1; } - PyObject *merged = PyUnicode_Concat(c1sub, c2sub); - Py_DECREF(c1sub); - Py_DECREF(c2sub); // PyList_SetItem() does not create a new reference on 'merged' // so we should not decref 'merged' after the call, unless there // is an issue while setting the item. + PyObject *merged = PyUnicodeWriter_Finish(writer); if (merged == NULL || PyList_SetItem(chunks, c - 1, merged) < 0) { Py_XDECREF(merged); - goto abort; + return -1; } if (PySequence_DelItem(chunks, c) < 0) { - goto abort; + return -1; } - chunkscount--; } } - assert(chunkscount == PyList_GET_SIZE(chunks)); - // Escape backslashes and hyphens for set difference (--), - // but hyphens that create ranges should not be escaped. 
- for (c = 0; c < chunkscount; ++c) { + return 0; +} + +/* + * Escape backslashes and hyphens for set difference (--), + * but hyphens that create ranges should not be escaped. + * + * See translate_expression() for its usage. + */ +static int +translate_expression_escape(fnmatchmodule_state *st, PyObject *chunks) +{ + for (Py_ssize_t c = 0; c < PyList_GET_SIZE(chunks); ++c) { PyObject *s0 = PyList_GET_ITEM(chunks, c); assert(s0 != NULL); - // NOTE(picnixz): maybe cache the method and intern the arguments - // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() - PyObject *s1 = _PyObject_CallMethod(s0, &_Py_ID(replace), "ss", "\\", "\\\\"); + PyObject *s1 = PyObject_CallMethodObjArgs(s0, + &_Py_ID(replace), + st->backslash_str, + st->backslash_esc_str, + NULL); if (s1 == NULL) { - goto abort; + return -1; } - // NOTE(picnixz): maybe cache the method and intern the arguments - // NOTE(picnixz): to be able to use PyObject_CallFunctionObjArgs() - PyObject *s2 = _PyObject_CallMethod(s1, &_Py_ID(replace), "ss", "-", "\\-"); + PyObject *s2 = PyObject_CallMethodObjArgs(s1, + &_Py_ID(replace), + st->hyphen_str, + st->hyphen_esc_str, + NULL); Py_DECREF(s1); // PyList_SetItem() does not create a new reference on 's2' // so we should not decref 's2' after the call, unless there // is an issue while setting the item. 
if (s2 == NULL || PyList_SetItem(chunks, c, s2) < 0) { Py_XDECREF(s2); - goto abort; + return -1; } } - PyObject *hyphen = PyUnicode_FromOrdinal('-'); - if (hyphen == NULL) { + return 0; +} + +static PyObject * +translate_expression(fnmatchmodule_state *state, + PyObject *pattern, Py_ssize_t i, Py_ssize_t j) +{ + PyObject *chunks = translate_expression_split(state, pattern, i, j); + if (chunks == NULL) { + goto abort; + } + // remove empty ranges + if (translate_expression_simplify(state, chunks) < 0) { goto abort; } - PyObject *res = PyUnicode_Join(hyphen, chunks); - Py_DECREF(hyphen); - if (res == NULL) { + // escape backslashes and set differences + if (translate_expression_escape(state, chunks) < 0) { goto abort; } + PyObject *res = PyUnicode_Join(state->hyphen_str, chunks); Py_DECREF(chunks); return res; abort: @@ -452,27 +535,6 @@ translate_expression(fnmatchmodule_state *state, return NULL; } -static Py_ssize_t -write_literal(fnmatchmodule_state *state, - PyUnicodeWriter *writer, PyObject *literal) -{ - PyObject *escaped = PyObject_CallMethodOneArg(state->re_module, - &_Py_ID(escape), - literal); - if (escaped == NULL) { - return -1; - } - Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); - assert(written >= 0); - int rc = _WRITE_STRING(writer, escaped); - Py_DECREF(escaped); - if (rc < 0) { - return -1; - } - assert(written > 0); - return written; -} - static Py_ssize_t write_expression(PyUnicodeWriter *writer, PyObject *expression) { @@ -522,7 +584,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) const Py_ssize_t m = PyList_GET_SIZE(indices); if (m == 0) { // "(?s:" + pattern + ")\Z" - return PyUnicode_FromFormat("(?s:%S)\\Z", pattern); + return PyUnicode_FromFormat("(?s:%U)\\Z", pattern); } /* * Special cases: indices[0] == 0 or indices[-1] + 1 == n @@ -583,7 +645,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) return NULL; } // "(?s:" + processed + ")\Z" - PyObject *res = PyUnicode_FromFormat("(?s:%S)\\Z", processed); + 
PyObject *res = PyUnicode_FromFormat("(?s:%U)\\Z", processed); Py_DECREF(processed); return res; abort: From b564b2210d3c9961b8cf0582d03c6a4c5df5c088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 16 Jul 2024 17:21:26 +0200 Subject: [PATCH 62/97] PEP-7 fixes --- Modules/_fnmatch/_fnmatchmodule.c | 6 +- Modules/_fnmatch/_fnmatchmodule.h | 4 +- Modules/_fnmatch/matcher.c | 4 +- Modules/_fnmatch/translate.c | 161 +++++++++++++++--------------- 4 files changed, 91 insertions(+), 84 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index e51283a4af60c6..3609a59e9bcd7b 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -55,12 +55,14 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) if (maxsize == NULL) { return -1; } - PyObject *lru_cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + PyObject *lru_cache = _PyImport_GetModuleAttrString("functools", + "lru_cache"); if (lru_cache == NULL) { Py_DECREF(maxsize); return -1; } - PyObject *decorator = PyObject_CallFunctionObjArgs(lru_cache, maxsize, Py_True, NULL); + PyObject *decorator = PyObject_CallFunctionObjArgs( + lru_cache, maxsize, Py_True, NULL); Py_DECREF(lru_cache); Py_DECREF(maxsize); if (decorator == NULL) { diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 9c2dd217bb284b..46afd8e0dda630 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -74,7 +74,9 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names); * The 'normcase' argument is a callable implementing os.path.normcase(). 
*/ extern PyObject * -_Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase); +_Py_fnmatch_filter_normalized(PyObject *matcher, + PyObject *names, + PyObject *normcase); /* * C accelerator for translating UNIX shell patterns into RE patterns. diff --git a/Modules/_fnmatch/matcher.c b/Modules/_fnmatch/matcher.c index 899fe56ee063d3..22fdc41d719b59 100644 --- a/Modules/_fnmatch/matcher.c +++ b/Modules/_fnmatch/matcher.c @@ -48,7 +48,9 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names) } PyObject * -_Py_fnmatch_filter_normalized(PyObject *matcher, PyObject *names, PyObject *normcase) +_Py_fnmatch_filter_normalized(PyObject *matcher, + PyObject *names, + PyObject *normcase) { PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index eec3c6c14a8e65..6d841f10840c94 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -11,46 +11,55 @@ // // The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs // since they directly delegate to the _PyUnicodeWriter_Write* underlying -// function. +// function. In particular, the caller is responsible for type safety. 
-#define _WRITE_OR_FAIL(writeop, onerror) \ - do { \ - if ((writeop) < 0) { \ - onerror; \ - } \ +#define _WRITE_OR_FAIL(WRITE_OPERATION, ON_ERROR) \ + do { \ + if ((WRITE_OPERATION) < 0) { \ + ON_ERROR; \ + } \ } while (0) -/* write a character 'ch' */ -#define _WRITE_CHAR(writer, ch) \ - _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(writer), (ch)) -/* write a character 'ch', or execute 'onerror' if it fails */ -#define _WRITE_CHAR_OR(writer, ch, onerror) \ - _WRITE_OR_FAIL(_WRITE_CHAR((writer), (ch)), onerror) - -/* write an ASCII 'string' of given 'length' */ -#define _WRITE_ASCII(writer, ascii, length) \ - _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(writer), (ascii), (length)) -/* write an ASCII 'string' of given 'length', or execute 'onerror' if it fails */ -#define _WRITE_ASCII_OR(writer, ascii, length, onerror) \ - _WRITE_OR_FAIL(_WRITE_ASCII((writer), (ascii), (length)), onerror) - -/* write a 'string' */ -#define _WRITE_STRING(writer, string) \ - _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(writer), (string)) -/* write a 'string', or execute 'onerror' if it fails */ -#define _WRITE_STRING_OR(writer, string, onerror) \ - _WRITE_OR_FAIL(_WRITE_STRING((writer), (string)), onerror) - -/* write the substring string[i:j] */ -#define _WRITE_BLOCK(writer, string, i, j) \ - _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(writer), (string), (i), (j)) -/* write the substring string[i:j] if i < j, or execute 'onerror' if it fails */ -#define _WRITE_BLOCK_OR(writer, string, i, j, onerror) \ - do { \ - Py_ssize_t _i = (i), _j = (j); /* to allow in-place operators on i or j */ \ - if (_i < _j && _WRITE_BLOCK((writer), (string), _i, _j) < 0) { \ - onerror; \ - } \ +/* write a character CHAR */ +#define _WRITE_CHAR(WRITER, CHAR) \ + _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(WRITER), (CHAR)) +/* write a character CHAR or execute the ON_ERROR statements if it fails */ +#define _WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ + 
_WRITE_OR_FAIL(_WRITE_CHAR((WRITER), (CHAR)), ON_ERROR) + +/* write an ASCII string STRING of given length LENGTH */ +#define _WRITE_ASCII(WRITER, ASCII, LENGTH) \ + _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(WRITER), \ + (ASCII), (LENGTH)) +/* + * Write an ASCII string STRING of given length LENGTH, + * or execute the ON_ERROR statements if it fails. + */ +#define _WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ + _WRITE_OR_FAIL(_WRITE_ASCII((WRITER), (ASCII), (LENGTH)), ON_ERROR) + +/* write the string STRING */ +#define _WRITE_STRING(WRITER, STRING) \ + _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(WRITER), (STRING)) +/* write the string STRING or execute the ON_ERROR statements if it fails */ +#define _WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ + _WRITE_OR_FAIL(_WRITE_STRING((WRITER), (STRING)), ON_ERROR) + +/* write the substring STRING[START:STOP] */ +#define _WRITE_BLOCK(WRITER, STRING, START, STOP) \ + _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(WRITER), \ + (STRING), (START), (STOP)) +/* + * Write the substring STRING[START:STOP] if START < STOP, + * or execute the ON_ERROR statements if it fails. 
+ */ +#define _WRITE_BLOCK_OR(WRITER, STRING, START, STOP, ON_ERROR) \ + do { \ + /* intermediate variables to allow in-place operations */ \ + Py_ssize_t _i = (START), _j = (STOP); \ + if (_i < _j && _WRITE_BLOCK((WRITER), (STRING), _i, _j) < 0) { \ + ON_ERROR; \ + } \ } while (0) // ==== Helper declarations =================================================== @@ -147,32 +156,20 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) goto abort; } - const int kind = PyUnicode_KIND(pattern); - const void *data = PyUnicode_DATA(pattern); + const int unicode_kind = PyUnicode_KIND(pattern); + const void *const unicode_data = PyUnicode_DATA(pattern); /* declaration of some local helping macros */ -#define READ(ind) PyUnicode_READ(kind, data, (ind)) - /* advance 'ind' if the character is 'ch' */ -#define ADVANCE_IF_CHAR(ch, ind, maxind) \ - do { \ - /* the following forces ind to be a variable name */ \ - Py_ssize_t *Py_UNUSED(_ind) = &ind; \ - if ((ind) < (maxind) && READ(ind) == (ch)) { \ - ++ind; \ - } \ - } while (0) - /* advance 'ind' until the character compares to 'READ[ind] CMPOP ch' */ -#define _WHILE_READ_CMP(ch, ind, maxind, CMPOP) \ - do { \ - /* the following forces ind to be a variable name */ \ - Py_ssize_t *Py_UNUSED(_ind) = &ind; \ - while ((ind) < (maxind) && READ(ind) CMPOP (ch)) { \ - ++ind; \ - } \ +#define READ(IND) PyUnicode_READ(unicode_kind, unicode_data, (IND)) + /* advance IND if the character is CHAR */ +#define ADVANCE_IF_NEXT_CHAR_IS(CHAR, IND, MAXIND) \ + do { \ + /* the following forces IND to be a variable name */ \ + void *Py_UNUSED(_ind) = &IND; \ + if ((IND) < (MAXIND) && READ(IND) == (CHAR)) { \ + ++IND; \ + } \ } while (0) - /* advance 'from' as long as READ(from) != ch */ -#define ADVANCE_TO_NEXT(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), !=) - /* advance 'from' as long as READ(from) == ch */ -#define SKIP_DUPLICATES(ch, from, maxind) _WHILE_READ_CMP((ch), (from), (maxind), ==) + Py_ssize_t i = 0; // current 
index Py_ssize_t wi = 0; // number of characters written while (i < n) { @@ -181,7 +178,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) switch (chr) { case '*': { _WRITE_CHAR_OR(writer, '*', goto abort); - SKIP_DUPLICATES('*', i, n); + // skip duplicated '*' + for (; i < n && READ(i) == '*'; ++i); PyObject *index = PyLong_FromSsize_t(wi++); if (index == NULL) { goto abort; @@ -200,10 +198,10 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) break; } case '[': { - Py_ssize_t j = i; // 'i' is already at next char - ADVANCE_IF_CHAR('!', j, n); // [! - ADVANCE_IF_CHAR(']', j, n); // [!] or [] - ADVANCE_TO_NEXT(']', j, n); // locate closing ']' + Py_ssize_t j = i; + ADVANCE_IF_NEXT_CHAR_IS('!', j, n); // [! + ADVANCE_IF_NEXT_CHAR_IS(']', j, n); // [!] or [] + for (; j < n && READ(j) != ']'; ++j); // locate closing ']' if (j >= n) { _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); wi += 2; // we just wrote 2 characters @@ -224,8 +222,10 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) goto abort; } s1 = PyObject_CallMethodObjArgs( - s0, &_Py_ID(replace), - state->backslash_str, state->backslash_esc_str, + s0, + &_Py_ID(replace), + state->backslash_str, + state->backslash_esc_str, NULL ); Py_DECREF(s0); @@ -269,17 +269,18 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (escchr == NULL) { goto abort; } - _WRITE_STRING_OR(writer, escchr, Py_DECREF(escchr); goto abort); - wi += PyUnicode_GET_LENGTH(escchr); + Py_ssize_t difflen = PyUnicode_GET_LENGTH(escchr); + int rc = _WRITE_STRING(writer, escchr); Py_DECREF(escchr); + if (rc < 0) { + goto abort; + } + wi += difflen; break; } } } -#undef SKIP_DUPLICATES -#undef ADVANCE_TO_NEXT -#undef _WHILE_READ_CMP -#undef ADVANCE_IF_CHAR +#undef ADVANCE_IF_NEXT_CHAR_IS #undef READ Py_DECREF(re_sub_func); Py_DECREF(re_escape_func); @@ -388,12 +389,12 @@ translate_expression_split(fnmatchmodule_state *state, assert(chunkscount > 0); PyObject *chunk = PyList_GET_ITEM(chunks, 
chunkscount - 1); assert(chunk != NULL); - PyObject *repl = PyUnicode_Concat(chunk, state->hyphen_str); - // PyList_SetItem() does not create a new reference on 'repl' - // so we should not decref 'repl' after the call, unless there + PyObject *str = PyUnicode_Concat(chunk, state->hyphen_str); + // PyList_SetItem() does not create a new reference on 'str' + // so we should not decref 'str' after the call, unless there // is an issue while setting the item. - if (repl == NULL || PyList_SetItem(chunks, chunkscount - 1, repl) < 0) { - Py_XDECREF(repl); + if (str == NULL || PyList_SetItem(chunks, chunkscount - 1, str) < 0) { + Py_XDECREF(str); goto abort; } } @@ -620,7 +621,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) return NULL; } _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); - i = j + 1; // jump after the '*' + i = j + 1; // jump after the '*' for (Py_ssize_t k = 1; k < m; ++k) { // process all but the last wildcard. PyObject *ind = PyList_GET_ITEM(indices, k); From d73f66d5c2b00c78e816bd53d9575047674aa2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:18:11 +0200 Subject: [PATCH 63/97] update comments and names --- Modules/_fnmatch/_fnmatchmodule.c | 12 ++++++------ Modules/_fnmatch/_fnmatchmodule.h | 21 +++++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 3609a59e9bcd7b..acab53951d7b89 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -142,8 +142,8 @@ fnmatchmodule_exec(PyObject *module) INTERN_STRING(st, hyphen_esc_str, "\\-"); INTERN_STRING(st, backslash_str, "\\"); INTERN_STRING(st, backslash_esc_str, "\\\\"); - INTERN_STRING(st, inactive_toks_str, "([&~|])"); - INTERN_STRING(st, inactive_toks_repl_str, "\\\\\\1"); + INTERN_STRING(st, setops_str, "([&~|])"); + INTERN_STRING(st, setops_repl_str, 
"\\\\\\1"); return 0; } #undef INTERN_STRING @@ -153,8 +153,8 @@ static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); - Py_VISIT(st->inactive_toks_repl_str); - Py_VISIT(st->inactive_toks_str); + Py_VISIT(st->setops_repl_str); + Py_VISIT(st->setops_str); Py_VISIT(st->backslash_esc_str); Py_VISIT(st->backslash_str); Py_VISIT(st->hyphen_esc_str); @@ -170,8 +170,8 @@ static int fnmatchmodule_clear(PyObject *m) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); - Py_CLEAR(st->inactive_toks_repl_str); - Py_CLEAR(st->inactive_toks_str); + Py_CLEAR(st->setops_repl_str); + Py_CLEAR(st->setops_str); Py_CLEAR(st->backslash_esc_str); Py_CLEAR(st->backslash_str); Py_CLEAR(st->hyphen_esc_str); diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/_fnmatchmodule.h index 46afd8e0dda630..ae1c01c90f8d26 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/_fnmatchmodule.h @@ -12,21 +12,22 @@ #include "Python.h" typedef struct { - PyObject *os_module; // import os - PyObject *posixpath_module; // import posixpath - PyObject *re_module; // import re + PyObject *os_module; // import os + PyObject *posixpath_module; // import posixpath + PyObject *re_module; // import re - PyObject *translator; // LRU-cached translation unit + PyObject *translator; // LRU-cached translation unit // strings used by translate.c - PyObject *hyphen_str; // hyphen '-' - PyObject *hyphen_esc_str; // escaped hyphen '\\-' + PyObject *hyphen_str; // hyphen '-' + PyObject *hyphen_esc_str; // escaped hyphen '\\-' - PyObject *backslash_str; // backslash '\\' - PyObject *backslash_esc_str; // escaped backslash '\\\\' + PyObject *backslash_str; // backslash '\\' + PyObject *backslash_esc_str; // escaped backslash '\\\\' - PyObject *inactive_toks_str; // inactive tokens '([&~|])' - PyObject *inactive_toks_repl_str; // replacement pattern '\\\\\\1' + /* set operation tokens (&&, ~~ and ||) are not supported 
in regex */ + PyObject *setops_str; // set operation tokens '([&~|])' + PyObject *setops_repl_str; // replacement pattern '\\\\\\1' } fnmatchmodule_state; static inline fnmatchmodule_state * From 6258b71981aae59445c84920073063fe181c734e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:21:01 +0200 Subject: [PATCH 64/97] refactorization: - improve comments - improve variable names - mark local macro definition region - fix and explain the result length formula - use `/* */` for docs and `//` for comments --- Modules/_fnmatch/translate.c | 519 +++++++++++++++++++---------------- 1 file changed, 285 insertions(+), 234 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 6d841f10840c94..47fd72a2de69f2 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -8,24 +8,25 @@ #include "pycore_call.h" // ==== Macro definitions ===================================================== -// + +/* Execute the ON_ERROR statements if "CALL < 0". */ +#define _INTERNAL_CALL_OR_FAIL(CALL, ON_ERROR) \ + do { \ + if ((CALL) < 0) { \ + ON_ERROR; \ + } \ + } while (0) + // The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs // since they directly delegate to the _PyUnicodeWriter_Write* underlying // function. In particular, the caller is responsible for type safety. 
-#define _WRITE_OR_FAIL(WRITE_OPERATION, ON_ERROR) \ - do { \ - if ((WRITE_OPERATION) < 0) { \ - ON_ERROR; \ - } \ - } while (0) - /* write a character CHAR */ #define _WRITE_CHAR(WRITER, CHAR) \ _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(WRITER), (CHAR)) /* write a character CHAR or execute the ON_ERROR statements if it fails */ #define _WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ - _WRITE_OR_FAIL(_WRITE_CHAR((WRITER), (CHAR)), ON_ERROR) + _INTERNAL_CALL_OR_FAIL(_WRITE_CHAR((WRITER), (CHAR)), ON_ERROR) /* write an ASCII string STRING of given length LENGTH */ #define _WRITE_ASCII(WRITER, ASCII, LENGTH) \ @@ -36,14 +37,14 @@ * or execute the ON_ERROR statements if it fails. */ #define _WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ - _WRITE_OR_FAIL(_WRITE_ASCII((WRITER), (ASCII), (LENGTH)), ON_ERROR) + _INTERNAL_CALL_OR_FAIL(_WRITE_ASCII((WRITER), (ASCII), (LENGTH)), ON_ERROR) /* write the string STRING */ #define _WRITE_STRING(WRITER, STRING) \ _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(WRITER), (STRING)) /* write the string STRING or execute the ON_ERROR statements if it fails */ #define _WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ - _WRITE_OR_FAIL(_WRITE_STRING((WRITER), (STRING)), ON_ERROR) + _INTERNAL_CALL_OR_FAIL(_WRITE_STRING((WRITER), (STRING)), ON_ERROR) /* write the substring STRING[START:STOP] */ #define _WRITE_BLOCK(WRITER, STRING, START, STOP) \ @@ -62,6 +63,38 @@ } \ } while (0) +// ==== Inline helpers ======================================================== + +/* replace backslashes in STRING by escaped backslashes */ +#define BACKSLASH_REPLACE(STATE, STRING) \ + PyObject_CallMethodObjArgs( \ + (STRING), \ + &_Py_ID(replace), \ + (STATE)->backslash_str, \ + (STATE)->backslash_esc_str, \ + NULL \ + ) + +/* replace hyphens in STRING by escaped hyphens */ +#define HYPHEN_REPLACE(STATE, STRING) \ + PyObject_CallMethodObjArgs( \ + (STRING), \ + &_Py_ID(replace), \ + (STATE)->hyphen_str, \ + (STATE)->hyphen_esc_str, \ + NULL \ + ) + +/* escape set 
operations in STRING using re.sub() */ +#define SETOPS_REPLACE(STATE, STRING, RE_SUB_FUNC) \ + PyObject_CallFunctionObjArgs( \ + (RE_SUB_FUNC), \ + (STATE)->setops_str, \ + (STATE)->setops_repl_str, \ + (STRING), \ + NULL \ + ) + // ==== Helper declarations =================================================== /* @@ -75,21 +108,25 @@ get_unicode_character(Py_UCS4 ch); /* * Construct a regular expression out of a UNIX-style expression. * - * The expression to translate is the content of an '[(BLOCK)]' expression - * or '[!(BLOCK)]' expression. The BLOCK contains single unicode characters - * or character ranges (e.g., 'a-z'). + * The expression to translate is the content of an '[(BLOCK)]' expression, + * which contains single unicode characters or character ranges (e.g., 'a-z'). + * + * By convention, 'start' and 'stop' represent the INCLUSIVE start index + * and EXCLUSIVE stop index of BLOCK in 'pattern'. Stated otherwise: * - * By convention 'start' and 'stop' represent the INCLUSIVE start index - * and EXCLUSIVE stop index of BLOCK in the full 'pattern'. Note that - * we always have pattern[stop] == ']' and pattern[start] == BLOCK[0]. + * pattern[start] == BLOCK[0] + * pattern[stop] == ']' * * For instance, for "ab[c-f]g[!1-5]", the values of 'start' and 'stop' - * for the sub-pattern '[c-f]' are 3 and 6 respectively, whereas their - * values for '[!1-5]' are 10 (not 9) and 13 respectively. + * for the sub-pattern '[c-f]' are 3 and 6 respectively, while their + * values for '[!1-5]' are 9 and 13 respectively. + * + * The 'pattern_str_find_meth' argument is a reference to pattern.find(). */ static PyObject * translate_expression(fnmatchmodule_state *state, - PyObject *pattern, Py_ssize_t start, Py_ssize_t stop); + PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, + PyObject *pattern_str_find_meth); /* * Write the translated pattern obtained by translate_expression(). 
@@ -114,7 +151,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) { assert(PyUnicode_Check(pattern)); fnmatchmodule_state *state = get_fnmatchmodule_state(module); - const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); + const Py_ssize_t maxind = PyUnicode_GET_LENGTH(pattern); // We would write less data if there are successive '*', // which should not be the case in general. Otherwise, @@ -133,59 +170,61 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // same as the number of characters in the pattern. // // TODO(picnixz): should we limit the estimation? - PyUnicodeWriter *writer = PyUnicodeWriter_Create(n); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(maxind); if (writer == NULL) { return NULL; } // list containing the indices where '*' has a special meaning - PyObject *indices = NULL; + PyObject *wildcard_indices = NULL; // cached functions (cache is local to the call) PyObject *re_escape_func = NULL, *re_sub_func = NULL; + PyObject *pattern_str_find_meth = NULL; // bound method of pattern.find() - indices = PyList_New(0); - if (indices == NULL) { - goto abort; - } - re_escape_func = PyObject_GetAttr(state->re_module, &_Py_ID(escape)); - if (re_escape_func == NULL) { - goto abort; - } - re_sub_func = PyObject_GetAttr(state->re_module, &_Py_ID(sub)); - if (re_sub_func == NULL) { + wildcard_indices = PyList_New(0); + if (wildcard_indices == NULL) { goto abort; } - - const int unicode_kind = PyUnicode_KIND(pattern); - const void *const unicode_data = PyUnicode_DATA(pattern); - /* declaration of some local helping macros */ -#define READ(IND) PyUnicode_READ(unicode_kind, unicode_data, (IND)) +#define CACHE_ATTRIBUTE(DEST, OBJECT, NAME) \ + do { \ + DEST = PyObject_GetAttr((OBJECT), (NAME)); \ + if ((DEST) == NULL) { \ + goto abort; \ + } \ + } while (0); + CACHE_ATTRIBUTE(re_escape_func, state->re_module, &_Py_ID(escape)); + CACHE_ATTRIBUTE(re_sub_func, state->re_module, &_Py_ID(sub)); + CACHE_ATTRIBUTE(pattern_str_find_meth, pattern, 
&_Py_ID(find)); +#undef CACHE_ATTRIBUTE + + const int _unicode_kind = PyUnicode_KIND(pattern); + const void *const _unicode_data = PyUnicode_DATA(pattern); + // ---- def local macros -------------------------------------------------- +#define READ_CHAR(IND) PyUnicode_READ(_unicode_kind, _unicode_data, (IND)) +#define WRITE_CHAR(CHAR) _WRITE_CHAR_OR(writer, (CHAR), goto abort) /* advance IND if the character is CHAR */ -#define ADVANCE_IF_NEXT_CHAR_IS(CHAR, IND, MAXIND) \ - do { \ - /* the following forces IND to be a variable name */ \ - void *Py_UNUSED(_ind) = &IND; \ - if ((IND) < (MAXIND) && READ(IND) == (CHAR)) { \ - ++IND; \ - } \ +#define ADVANCE_IF_CHAR_IS(CHAR, IND, MAXIND) \ + do { \ + if ((IND) < (MAXIND) && READ_CHAR(IND) == (CHAR)) { \ + ++IND; \ + } \ } while (0) - - Py_ssize_t i = 0; // current index - Py_ssize_t wi = 0; // number of characters written - while (i < n) { - // read and advance to the next character - Py_UCS4 chr = READ(i++); + // ------------------------------------------------------------------------ + Py_ssize_t i = 0; // current index + Py_ssize_t written = 0; // number of characters written + while (i < maxind) { + Py_UCS4 chr = READ_CHAR(i++); switch (chr) { case '*': { - _WRITE_CHAR_OR(writer, '*', goto abort); + WRITE_CHAR('*'); // skip duplicated '*' - for (; i < n && READ(i) == '*'; ++i); - PyObject *index = PyLong_FromSsize_t(wi++); - if (index == NULL) { + for (; i < maxind && READ_CHAR(i) == '*'; ++i); + PyObject *wildcard_index = PyLong_FromSsize_t(written++); + if (wildcard_index == NULL) { goto abort; } - int rc = PyList_Append(indices, index); - Py_DECREF(index); + int rc = PyList_Append(wildcard_indices, wildcard_index); + Py_DECREF(wildcard_index); if (rc < 0) { goto abort; } @@ -193,68 +232,55 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } case '?': { // translate optional '?' (fnmatch) into optional '.' 
(regex) - _WRITE_CHAR_OR(writer, '.', goto abort); - ++wi; // increase the expected result's length + WRITE_CHAR('.'); + ++written; // increase the expected result's length break; } case '[': { + assert(i > 0); + assert(READ_CHAR(i - 1) == '['); Py_ssize_t j = i; - ADVANCE_IF_NEXT_CHAR_IS('!', j, n); // [! - ADVANCE_IF_NEXT_CHAR_IS(']', j, n); // [!] or [] - for (; j < n && READ(j) != ']'; ++j); // locate closing ']' - if (j >= n) { + ADVANCE_IF_CHAR_IS('!', j, maxind); // [! + ADVANCE_IF_CHAR_IS(']', j, maxind); // [!] or [] + for (; j < maxind && READ_CHAR(j) != ']'; ++j); // locate ']' + if (j >= maxind) { _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); - wi += 2; // we just wrote 2 characters + written += 2; // we just wrote 2 characters break; // early break for clarity } else { - // v--- pattern[j] (exclusive) - // '[' * ... * ']' - // ^----- pattern[i] (inclusive) + assert(READ_CHAR(j) == ']'); Py_ssize_t pos = PyUnicode_FindChar(pattern, '-', i, j, 1); if (pos == -2) { goto abort; } - PyObject *s1 = NULL, *s2 = NULL; + PyObject *pre_expr = NULL, *expr = NULL; if (pos == -1) { - PyObject *s0 = PyUnicode_Substring(pattern, i, j); - if (s0 == NULL) { + PyObject *tmp = PyUnicode_Substring(pattern, i, j); + if (tmp == NULL) { goto abort; } - s1 = PyObject_CallMethodObjArgs( - s0, - &_Py_ID(replace), - state->backslash_str, - state->backslash_esc_str, - NULL - ); - Py_DECREF(s0); + pre_expr = BACKSLASH_REPLACE(state, tmp); + Py_DECREF(tmp); } else { - assert(pos >= 0); - assert(READ(j) == ']'); - s1 = translate_expression(state, pattern, i, j); + pre_expr = translate_expression(state, pattern, i, j, + pattern_str_find_meth); } - if (s1 == NULL) { + if (pre_expr == NULL) { goto abort; } - s2 = PyObject_CallFunctionObjArgs( - re_sub_func, - state->inactive_toks_str, - state->inactive_toks_repl_str, - s1, - NULL - ); - Py_DECREF(s1); - if (s2 == NULL) { + expr = SETOPS_REPLACE(state, pre_expr, re_sub_func); + Py_DECREF(pre_expr); + if (expr == NULL) { goto abort; } - 
Py_ssize_t difflen = write_expression(writer, s2); - Py_DECREF(s2); - if (difflen < 0) { + Py_ssize_t expr_len = write_expression(writer, expr); + Py_DECREF(expr); + if (expr_len < 0) { goto abort; } - wi += difflen; + written += expr_len; i = j + 1; // jump to the character after ']' break; // early break for clarity } @@ -264,39 +290,42 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (str == NULL) { goto abort; } - PyObject *escchr = PyObject_CallOneArg(re_escape_func, str); + PyObject *escaped = PyObject_CallOneArg(re_escape_func, str); Py_DECREF(str); - if (escchr == NULL) { + if (escaped == NULL) { goto abort; } - Py_ssize_t difflen = PyUnicode_GET_LENGTH(escchr); - int rc = _WRITE_STRING(writer, escchr); - Py_DECREF(escchr); + Py_ssize_t escaped_len = PyUnicode_GET_LENGTH(escaped); + int rc = _WRITE_STRING(writer, escaped); + Py_DECREF(escaped); if (rc < 0) { goto abort; } - wi += difflen; + written += escaped_len; break; } } } -#undef ADVANCE_IF_NEXT_CHAR_IS +#undef ADVANCE_IF_CHAR_IS +#undef WRITE_CHAR #undef READ + Py_DECREF(pattern_str_find_meth); Py_DECREF(re_sub_func); Py_DECREF(re_escape_func); PyObject *translated = PyUnicodeWriter_Finish(writer); if (translated == NULL) { - Py_DECREF(indices); + Py_DECREF(wildcard_indices); return NULL; } - PyObject *res = process_wildcards(translated, indices); + PyObject *res = process_wildcards(translated, wildcard_indices); Py_DECREF(translated); - Py_DECREF(indices); + Py_DECREF(wildcard_indices); return res; abort: + Py_XDECREF(pattern_str_find_meth); Py_XDECREF(re_sub_func); Py_XDECREF(re_escape_func); - Py_XDECREF(indices); + Py_XDECREF(wildcard_indices); PyUnicodeWriter_Discard(writer); return NULL; } @@ -329,67 +358,77 @@ get_unicode_character(Py_UCS4 ch) } /* - * Extract a list of chunks from the pattern group described by i and j. + * Extract a list of chunks from the pattern group described by start and stop. 
+ * + * For instance, the chunks for [a-z0-9] or [!a-z0-9] are ['a', 'z0', '9']. * * See translate_expression() for its usage. */ static PyObject * -translate_expression_split(fnmatchmodule_state *state, - PyObject *pattern, Py_ssize_t i, Py_ssize_t j) +split_expression(fnmatchmodule_state *state, + PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, + PyObject *str_find_func) { - PyObject *chunks = NULL; - // local cache for some objects - PyObject *str_find_func = NULL, *max_find_index = NULL; + PyObject *chunks = NULL, *maxind = NULL; + PyObject *hyphen = state->hyphen_str; chunks = PyList_New(0); if (chunks == NULL) { goto abort; } - str_find_func = PyObject_GetAttr(pattern, &_Py_ID(find)); - if (str_find_func == NULL) { - goto abort; - } - max_find_index = PyLong_FromSsize_t(j); - if (max_find_index == NULL) { + maxind = PyLong_FromSsize_t(stop); + if (maxind == NULL) { goto abort; } - Py_ssize_t k = (PyUnicode_READ_CHAR(pattern, i) == '!') ? i + 2 : i + 1; - while (k < j) { - PyObject *eobj = PyObject_CallFunction( - str_find_func, "OnO", state->hyphen_str, k, max_find_index); - if (eobj == NULL) { + // ---- def local macros -------------------------------------------------- + /* add pattern[START:STOP] to the list of chunks */ +#define ADD_CHUNK(START, STOP) \ + do { \ + PyObject *chunk = PyUnicode_Substring(pattern, (START), (STOP)); \ + if (chunk == NULL) { \ + goto abort; \ + } \ + int rc = PyList_Append(chunks, chunk); \ + Py_DECREF(chunk); \ + if (rc < 0) { \ + goto abort; \ + } \ + } while (0) + // ------------------------------------------------------------------------ + Py_ssize_t chunk_start = start; + bool is_complement = PyUnicode_READ_CHAR(pattern, start) == '!'; + // skip '!' character (it is handled separately in write_expression()) + Py_ssize_t ind = is_complement ? 
start + 2 : start + 1; + while (ind < stop) { + PyObject *p_chunk_stop = PyObject_CallFunction(str_find_func, "OnO", + hyphen, ind, maxind); + if (p_chunk_stop == NULL) { goto abort; } - Py_ssize_t t = PyLong_AsSsize_t(eobj); - Py_DECREF(eobj); - if (t < 0) { + Py_ssize_t chunk_stop = PyLong_AsSsize_t(p_chunk_stop); + Py_DECREF(p_chunk_stop); + if (chunk_stop < 0) { if (PyErr_Occurred()) { goto abort; } // -1 here means that '-' was not found - assert(t == -1); + assert(chunk_stop == -1); break; } - PyObject *sub = PyUnicode_Substring(pattern, i, t); - if (sub == NULL) { - goto abort; - } - int rc = PyList_Append(chunks, sub); - Py_DECREF(sub); - if (rc < 0) { - goto abort; - } - i = t + 1; - k = t + 3; + ADD_CHUNK(chunk_start, chunk_stop); + chunk_start = chunk_stop + 1; // jump after '-' + ind = chunk_stop + 3; // ensure a non-empty next chunk + } + if (chunk_start < stop) { + ADD_CHUNK(chunk_start, stop); } - // handle the last group - if (i >= j) { + else { Py_ssize_t chunkscount = PyList_GET_SIZE(chunks); assert(chunkscount > 0); PyObject *chunk = PyList_GET_ITEM(chunks, chunkscount - 1); assert(chunk != NULL); - PyObject *str = PyUnicode_Concat(chunk, state->hyphen_str); + PyObject *str = PyUnicode_Concat(chunk, hyphen); // PyList_SetItem() does not create a new reference on 'str' // so we should not decref 'str' after the call, unless there // is an issue while setting the item. 
@@ -398,24 +437,11 @@ translate_expression_split(fnmatchmodule_state *state, goto abort; } } - else { - // add the remaining sub-pattern - PyObject *sub = PyUnicode_Substring(pattern, i, j); - if (sub == NULL) { - goto abort; - } - int rc = PyList_Append(chunks, sub); - Py_DECREF(sub); - if (rc < 0) { - goto abort; - } - } - Py_DECREF(max_find_index); - Py_DECREF(str_find_func); +#undef ADD_CHUNK + Py_DECREF(maxind); return chunks; abort: - Py_XDECREF(max_find_index); - Py_XDECREF(str_find_func); + Py_XDECREF(maxind); Py_XDECREF(chunks); return NULL; } @@ -426,16 +452,16 @@ translate_expression_split(fnmatchmodule_state *state, * See translate_expression() for its usage. */ static int -translate_expression_simplify(fnmatchmodule_state *st, PyObject *chunks) +simplify_expression(PyObject *chunks) { - Py_ssize_t c = PyList_GET_SIZE(chunks); - while (--c) { - PyObject *c1 = PyList_GET_ITEM(chunks, c - 1); + // for k in range(len(chunks) - 1, 0, -1): + for (Py_ssize_t k = PyList_GET_SIZE(chunks) - 1; k > 0; --k) { + PyObject *c1 = PyList_GET_ITEM(chunks, k - 1); assert(c1 != NULL); Py_ssize_t c1len = PyUnicode_GET_LENGTH(c1); assert(c1len > 0); - PyObject *c2 = PyList_GET_ITEM(chunks, c); + PyObject *c2 = PyList_GET_ITEM(chunks, k); assert(c2 != NULL); Py_ssize_t c2len = PyUnicode_GET_LENGTH(c2); assert(c2len > 0); @@ -443,31 +469,46 @@ translate_expression_simplify(fnmatchmodule_state *st, PyObject *chunks) if (PyUnicode_READ_CHAR(c1, c1len - 1) > PyUnicode_READ_CHAR(c2, 0)) { Py_ssize_t olen = c1len + c2len - 2; assert(olen >= 0); - // see https://github.com/python/cpython/issues/114917 for - // why we need olen + 1 and not olen currently - PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen + 1); - if (writer == NULL) { - return -1; + PyObject *str = NULL; + if (olen == 0) { // c1[:1] + c2[1:] == '' + str = Py_GetConstant(Py_CONSTANT_EMPTY_STR); + assert(_Py_IsImmortal(str)); } - // all but the last character in the first chunk - if (_WRITE_BLOCK(writer, c1, 0, 
c1len - 1) < 0) { - PyUnicodeWriter_Discard(writer); - return -1; + else if (c1len == 1) { // c1[:1] + c2[1:] == c2[1:] + assert(c2len > 1); + str = PyUnicode_Substring(c2, 1, c2len); } - // all but the first character in the second chunk - if (_WRITE_BLOCK(writer, c2, 1, c2len) < 0) { - PyUnicodeWriter_Discard(writer); - return -1; + else if (c2len == 1) { // c1[:1] + c2[1:] == c1[:1] + assert(c1len > 1); + str = PyUnicode_Substring(c1, 0, c1len - 1); + } + else { + assert(c1len > 1); + assert(c2len > 1); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen); + if (writer == NULL) { + return -1; + } + // all but the last character in the first chunk + if (_WRITE_BLOCK(writer, c1, 0, c1len - 1) < 0) { + PyUnicodeWriter_Discard(writer); + return -1; + } + // all but the first character in the second chunk + if (_WRITE_BLOCK(writer, c2, 1, c2len) < 0) { + PyUnicodeWriter_Discard(writer); + return -1; + } + str = PyUnicodeWriter_Finish(writer); } - // PyList_SetItem() does not create a new reference on 'merged' - // so we should not decref 'merged' after the call, unless there + // PyList_SetItem() does not create a new reference on 'str' + // so we should not decref 'str' after the call, unless there // is an issue while setting the item. - PyObject *merged = PyUnicodeWriter_Finish(writer); - if (merged == NULL || PyList_SetItem(chunks, c - 1, merged) < 0) { - Py_XDECREF(merged); + if (str == NULL || PyList_SetItem(chunks, k - 1, str) < 0) { + Py_XDECREF(str); return -1; } - if (PySequence_DelItem(chunks, c) < 0) { + if (PySequence_DelItem(chunks, k) < 0) { return -1; } } @@ -482,24 +523,16 @@ translate_expression_simplify(fnmatchmodule_state *st, PyObject *chunks) * See translate_expression() for its usage. 
*/ static int -translate_expression_escape(fnmatchmodule_state *st, PyObject *chunks) +escape_expression(fnmatchmodule_state *state, PyObject *chunks) { for (Py_ssize_t c = 0; c < PyList_GET_SIZE(chunks); ++c) { PyObject *s0 = PyList_GET_ITEM(chunks, c); assert(s0 != NULL); - PyObject *s1 = PyObject_CallMethodObjArgs(s0, - &_Py_ID(replace), - st->backslash_str, - st->backslash_esc_str, - NULL); + PyObject *s1 = BACKSLASH_REPLACE(state, s0); if (s1 == NULL) { return -1; } - PyObject *s2 = PyObject_CallMethodObjArgs(s1, - &_Py_ID(replace), - st->hyphen_str, - st->hyphen_esc_str, - NULL); + PyObject *s2 = HYPHEN_REPLACE(state, s1); Py_DECREF(s1); // PyList_SetItem() does not create a new reference on 's2' // so we should not decref 's2' after the call, unless there @@ -514,18 +547,20 @@ translate_expression_escape(fnmatchmodule_state *st, PyObject *chunks) static PyObject * translate_expression(fnmatchmodule_state *state, - PyObject *pattern, Py_ssize_t i, Py_ssize_t j) + PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, + PyObject *pattern_str_find_meth) { - PyObject *chunks = translate_expression_split(state, pattern, i, j); + PyObject *chunks = split_expression(state, pattern, start, stop, + pattern_str_find_meth); if (chunks == NULL) { goto abort; } // remove empty ranges - if (translate_expression_simplify(state, chunks) < 0) { + if (simplify_expression(chunks) < 0) { goto abort; } // escape backslashes and set differences - if (translate_expression_escape(state, chunks) < 0) { + if (escape_expression(state, chunks) < 0) { goto abort; } PyObject *res = PyUnicode_Join(state->hyphen_str, chunks); @@ -539,17 +574,19 @@ translate_expression(fnmatchmodule_state *state, static Py_ssize_t write_expression(PyUnicodeWriter *writer, PyObject *expression) { -#define WRITE_CHAR(c) _WRITE_CHAR_OR(writer, (c), return -1) -#define WRITE_STRING(s) _WRITE_STRING_OR(writer, (s), return -1) + // ---- def local macros -------------------------------------------------- +#define 
WRITE_CHAR(CHAR) _WRITE_CHAR_OR(writer, (CHAR), return -1) +#define WRITE_STRING(STR) _WRITE_STRING_OR(writer, (STR), return -1) + // ------------------------------------------------------------------------ Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { - /* empty range: never match */ + // empty range: never match _WRITE_ASCII_OR(writer, "(?!)", 4, return -1); return 4; } Py_UCS4 token = PyUnicode_READ_CHAR(expression, 0); if (grouplen == 1 && token == '!') { - /* negated empty range: match any character */ + // negated empty range: match any character WRITE_CHAR('.'); return 1; } @@ -582,48 +619,57 @@ write_expression(PyUnicodeWriter *writer, PyObject *expression) static PyObject * process_wildcards(PyObject *pattern, PyObject *indices) { - const Py_ssize_t m = PyList_GET_SIZE(indices); - if (m == 0) { + const Py_ssize_t M = PyList_GET_SIZE(indices); + if (M == 0) { // "(?s:" + pattern + ")\Z" return PyUnicode_FromFormat("(?s:%U)\\Z", pattern); } - /* - * Special cases: indices[0] == 0 or indices[-1] + 1 == n - * - * If indices[0] == 0 write (?>.*?abcdef) instead of abcdef - * If indices[-1] == n - 1 write '.*' instead of empty string - */ - Py_ssize_t i = 0, j, n = PyUnicode_GET_LENGTH(pattern); - /* - * If the pattern starts with '*', we will write everything - * before it. So we will write at least indices[0] characters. - * - * For the inner groups 'STAR STRING ...' we always surround - * the STRING by "(?>.*?" and ")", and thus we will write at - * least 7 + len(STRING) characters. - * - * We write one additional '.*' if indices[-1] + 1 == n. - * - * Since the result is surrounded by "(?s:" and ")\Z", we - * write at least "indices[0] + 7*m + n + 6" characters, - * where 'm' is the number of stars and 'n' the length - * of the /translated) pattern. 
- */ - PyObject *jobj = PyList_GET_ITEM(indices, 0); - assert(jobj != NULL); - j = PyLong_AsSsize_t(jobj); // get the first position of '*' + // Special cases: indices[0] == 0 or indices[-1] + 1 == n + // + // If indices[0] == 0 write (?>.*?abcdef) instead of abcdef + // If indices[-1] == n - 1 write '.*' instead of empty string + Py_ssize_t i = 0, N = PyUnicode_GET_LENGTH(pattern); + // get the first position of '*' + Py_ssize_t j = PyLong_AsSsize_t(PyList_GET_ITEM(indices, 0)); if (j < 0) { return NULL; } - Py_ssize_t estimate = j + 7 * m + n + 6; - PyUnicodeWriter *writer = PyUnicodeWriter_Create(estimate); + // By construction, we have + // + // pattern = [PREFIX] [[(* INNER) ... (* INNER)] (* OUTER)] [*] + // + // where [...] is an optional group and () is required to exist. + // + // Case 1: pattern ends with a wildcard: + // + // - Write the PREFIX. + // - Write any group (* GROUP) as "(?>.*?" + GROUP + ")". + // - Write a final ".*" due to the final wildcard. + // - Number of characters to write: N + 6 * (M - 1) + 1, where + // the +1 is because the '*' in the final ".*" is counted by N. + // + // Case 2: pattern does not end with a wildcard: + // + // - Write the PREFIX. + // - Write an INNER group (* INNER) as "(?>.*?" + INNER + ")". + // - Write the OUTER group (* OUTER) as ".*" + OUTER. + // - Number of characters to write: N + 6 * (M - 1) + 1, where + // the +1 is because the '*' in ".*" + OUTER is counted by N. + // + // In both cases, we write N + 6(M - 1) + 1 characters. Since the final + // result is surrounded by "(?s:" and ")\\Z", we have: + // + // Number of written characters: N + 6(M - 1) + 1 + 7 = N + 6M + 2. 
+ Py_ssize_t output_size = 6 * M + N + 2; + PyUnicodeWriter *writer = PyUnicodeWriter_Create(output_size); if (writer == NULL) { return NULL; } + // write everything before the first wildcard normally _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); i = j + 1; // jump after the '*' - for (Py_ssize_t k = 1; k < m; ++k) { - // process all but the last wildcard. + for (Py_ssize_t k = 1; k < M; ++k) { + // process all but the last wildcard PyObject *ind = PyList_GET_ITEM(indices, k); assert(ind != NULL); j = PyLong_AsSsize_t(ind); @@ -640,13 +686,14 @@ process_wildcards(PyObject *pattern, PyObject *indices) // handle the remaining wildcard _WRITE_ASCII_OR(writer, ".*", 2, goto abort); // write the remaining substring (if non-empty) - _WRITE_BLOCK_OR(writer, pattern, i, n, goto abort); + _WRITE_BLOCK_OR(writer, pattern, i, N, goto abort); PyObject *processed = PyUnicodeWriter_Finish(writer); if (processed == NULL) { return NULL; } - // "(?s:" + processed + ")\Z" + // "(?s:" + processed + ")\\Z" PyObject *res = PyUnicode_FromFormat("(?s:%U)\\Z", processed); + assert(PyUnicode_GET_LENGTH(res) == output_size); Py_DECREF(processed); return res; abort: @@ -654,6 +701,10 @@ process_wildcards(PyObject *pattern, PyObject *indices) return NULL; } +#undef SETOPS_REPLACE +#undef HYPHEN_REPLACE +#undef BACKSLASH_REPLACE + #undef _WRITE_BLOCK_OR #undef _WRITE_BLOCK #undef _WRITE_STRING_OR @@ -662,4 +713,4 @@ process_wildcards(PyObject *pattern, PyObject *indices) #undef _WRITE_ASCII #undef _WRITE_CHAR_OR #undef _WRITE_CHAR -#undef _WRITE_OR_FAIL +#undef _INTERNAL_CALL_OR_FAIL From d595cb449e02ab5c68864e1eeb1ce7a3a707947b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:01:13 +0200 Subject: [PATCH 65/97] improve coverage --- Lib/test/test_fnmatch.py | 135 +++++++++++++++++++++++++++------------ 1 file changed, 94 insertions(+), 41 deletions(-) diff --git a/Lib/test/test_fnmatch.py 
b/Lib/test/test_fnmatch.py index 19f12db4fa2160..4ddfce72ff7652 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -267,12 +267,106 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) + def test_translate_wildcards(self): + for pattern, expect in [ + ('ab*', r'(?s:ab.*)\Z'), + ('ab*cd', r'(?s:ab.*cd)\Z'), + ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'), + ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'), + ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'), + ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'), + ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), + ]: + translated = self.fnmatch.translate(pattern) + self.assertEqual(translated, expect, pattern) + + for pattern, expect in [ + ('*ab', r'(?s:.*ab)\Z'), + ('*ab*', r'(?s:(?>.*?ab).*)\Z'), + ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'), + ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'), + ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'), + ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'), + ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'), + ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), + ]: + translated = self.fnmatch.translate(pattern) + self.assertEqual(translated, expect, pattern) + + def test_translate_expressions(self): + '[', '[-abc]', '[[]b', '[[a]b', '[\\\\]', '[\\]', '[]-]', '[][!]', + '[]]b', '[]a[]b', '[^a-c]*', '[a-\\z]', + '[a-c]b*', '[a-y]*[^c]', '[abc-]', '\\*', + '[0-4-3-2]', '[b-ac-z9-1]', '[!b-ac-z9-1]', '[!]b-ac-z9-1]', + '[]b-ac-z9-1]', '[]b-ac-z9-1]*', '*[]b-ac-z9-1]', + for pattern, expect in [ + ('[', r'(?s:\[)\Z'), + ('[!', r'(?s:\[!)\Z'), + ('[]', r'(?s:\[\])\Z'), + ('[abc', r'(?s:\[abc)\Z'), + ('[!abc', r'(?s:\[!abc)\Z'), + ('[abc]', r'(?s:[abc])\Z'), + ('[!abc]', r'(?s:[^abc])\Z'), + # with [[ + ('[[', r'(?s:\[\[)\Z'), + ('[[a', r'(?s:\[\[a)\Z'), + ('[[]', r'(?s:[\[])\Z'), + ('[[]a', r'(?s:[\[]a)\Z'), + ('[[]]', r'(?s:[\[]\])\Z'), + ('[[]a]', r'(?s:[\[]a\])\Z'), + ('[[a]', 
r'(?s:[\[a])\Z'), + ('[[a]]', r'(?s:[\[a]\])\Z'), + ('[[a]b', r'(?s:[\[a]b)\Z'), + # backslashes + ('[\\', r'(?s:\[\\)\Z'), + (r'[\]', r'(?s:[\\])\Z'), + (r'[\\]', r'(?s:[\\\\])\Z'), + ]: + translated = self.fnmatch.translate(pattern) + self.assertEqual(translated, expect, pattern) + class PurePythonTranslateTestCase(TranslateTestCaseMixin, unittest.TestCase): fnmatch = py_fnmatch class CPythonTranslateTestCase(TranslateTestCaseMixin, unittest.TestCase): fnmatch = c_fnmatch + @staticmethod + def translate_func(pattern): + # Pure Python implementation of translate() + STAR = object() + parts = py_fnmatch._translate(pattern, STAR, '.') + return py_fnmatch._join_translated_parts(parts, STAR) + + def test_translate(self): + # We want to check that the C implementation is EXACTLY the same + # as the Python implementation. For that, we will need to cover + # a lot of cases. + translate = self.fnmatch.translate + + for choice in itertools.combinations_with_replacement('*?.', 5): + for suffix in ['', '!']: + pat = suffix + ''.join(choice) + with self.subTest(pattern=pat): + self.assertEqual(translate(pat), self.translate_func(pat)) + + for pat in [ + '', + '!!a*', '!\\!a*', '!a*', '*', '**', '*******?', '*******c', '*****??', '**/', + '*.js', '*/man*/bash.*', '*???', '?', '?*****??', '?*****?c', '?***?****', + '?***?****?', '?***?****c', '?*?', '??', '???', '???*', '[!\\]', + '\\**', '\\*\\*', 'a*', 'a*****?c', 'a****c**?**??*****', 'a***c', + 'a**?**cd**?**??***k', 'a**?**cd**?**??***k**', 'a**?**cd**?**??k', + 'a**?**cd**?**??k***', 'a*[^c]', + 'a*cd**?**??k', 'a/*', 'a/**', 'a/**/b', + 'a/**/b/**/c', 'a/.*/c', 'a/?', 'a/??', 'a[X-]b', 'a[\\.]c', + 'a[\\b]c', 'a[bc', 'a\\*?/*', 'a\\*b/*', + 'ab[!de]', 'ab[cd]', 'ab[cd]ef', 'abc', 'b*/', 'foo*', + 'man/man1/bash.1' + ]: + with self.subTest(pattern=pat): + self.assertEqual(translate(pat), self.translate_func(pat)) + class FilterTestCaseMixin: fnmatch = None @@ -310,46 +404,5 @@ class 
PurePythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): class CPythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): fnmatch = c_fnmatch - @staticmethod - def translate_func(pattern): - # Pure Python implementation of translate() - STAR = object() - parts = py_fnmatch._translate(pattern, STAR, '.') - return py_fnmatch._join_translated_parts(parts, STAR) - - def test_translate(self): - # We want to check that the C implementation is EXACTLY the same - # as the Python implementation. For that, we will need to cover - # a lot of cases. - translate = self.fnmatch.translate - - for choice in itertools.combinations_with_replacement('*?.', 5): - for suffix in ['', '!']: - pat = suffix + ''.join(choice) - with self.subTest(pattern=pat): - self.assertEqual(translate(pat), self.translate_func(pat)) - - for pat in [ - '', - '!!a*', '!\\!a*', '!a*', '*', '**', '*******?', '*******c', '*****??', '**/', - '*.js', '*/man*/bash.*', '*???', '?', '?*****??', '?*****?c', '?***?****', - '?***?****?', '?***?****c', '?*?', '??', '???', '???*', '[!\\]', - '[*', '[-abc]', '[[]b', '[[a]b', '[\\\\]', '[\\]', '[]-]', '[][!]', - '[]]b', '[]a[]b', '[^a-c]*', '[a-\\z]', - '[a-c]b*', '[a-y]*[^c]', '[abc-]', '\\*', - '[0-4-3-2]', '[b-ac-z9-1]', '[!b-ac-z9-1]', '[!]b-ac-z9-1]', - '[]b-ac-z9-1]', '[]b-ac-z9-1]*', '*[]b-ac-z9-1]', - '\\**', '\\*\\*', 'a*', 'a*****?c', 'a****c**?**??*****', 'a***c', - 'a**?**cd**?**??***k', 'a**?**cd**?**??***k**', 'a**?**cd**?**??k', - 'a**?**cd**?**??k***', 'a*[^c]', - 'a*cd**?**??k', 'a/*', 'a/**', 'a/**/b', - 'a/**/b/**/c', 'a/.*/c', 'a/?', 'a/??', 'a[X-]b', 'a[\\.]c', - 'a[\\b]c', 'a[bc', 'a\\*?/*', 'a\\*b/*', - 'ab[!de]', 'ab[cd]', 'ab[cd]ef', 'abc', 'b*/', 'foo*', - 'man/man1/bash.1' - ]: - with self.subTest(pattern=pat): - self.assertEqual(translate(pat), self.translate_func(pat)) - if __name__ == "__main__": unittest.main() From e4296d8fb21e17c2ae0ec75aacad617a9101c89d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= 
<10796600+picnixz@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:40:02 +0200 Subject: [PATCH 66/97] update benchmarks --- .../Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst b/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst index 639af4fb31ff93..f374f28456d65d 100644 --- a/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst +++ b/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst @@ -1,2 +1,2 @@ -Improve the performances of :func:`fnmatch.translate` by 50% and of -:func:`fnmatch.filter` by 10%. Patch by Bénédikt Tran. +Improve the performances of :func:`fnmatch.translate` by 2x and of +:func:`fnmatch.filter` by 1.1x. Patch by Bénédikt Tran. From cc92c4be8cd61cd6bcae680ffee801a72ec9faf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:41:01 +0200 Subject: [PATCH 67/97] fixup --- Lib/test/test_fnmatch.py | 10 +++++----- Makefile.pre.in | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 4ddfce72ff7652..9e00054d6ab14c 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -377,6 +377,11 @@ def test_filter(self): self.assertEqual(filter([b'Python', b'Ruby', b'Perl', b'Tcl'], b'P*'), [b'Python', b'Perl']) + def test_mix_bytes_str(self): + filter = self.fnmatch.filter + self.assertRaises(TypeError, filter, ['test'], b'*') + self.assertRaises(TypeError, filter, [b'test'], '*') + def test_case(self): ignorecase = os.path.normcase('P') == os.path.normcase('p') filter = self.fnmatch.filter @@ -393,11 +398,6 @@ def test_sep(self): self.assertEqual(filter(['usr/bin', 'usr', 'usr\\lib'], 'usr\\*'), ['usr/bin', 'usr\\lib'] if normsep else ['usr\\lib']) - def test_mix_bytes_str(self): - 
filter = self.fnmatch.filter - self.assertRaises(TypeError, filter, ['test'], b'*') - self.assertRaises(TypeError, filter, [b'test'], '*') - class PurePythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): fnmatch = py_fnmatch diff --git a/Makefile.pre.in b/Makefile.pre.in index 7722873a83ea57..07cbd0a7567233 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -348,6 +348,7 @@ IO_OBJS= \ Modules/_io/bytesio.o \ Modules/_io/stringio.o + ########################################################################## # mimalloc From ee27297ea3b4f4abc411fb866bfb5ea627c3c6ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 20 Jul 2024 12:07:01 +0200 Subject: [PATCH 68/97] refactorization: - split into more files - reduce the number of exported symbols - extract global macros into `macros.h` - simplify `fnmatch.filter()` - make the flow of `fnmatch.translate()` more readable --- Modules/_fnmatch/_fnmatchmodule.c | 84 +++-- Modules/_fnmatch/filter.c | 50 +++ Modules/_fnmatch/macros.h | 119 +++++++ Modules/_fnmatch/matcher.c | 85 ----- Modules/_fnmatch/translate.c | 301 ++++++------------ Modules/_fnmatch/{_fnmatchmodule.h => util.h} | 45 +-- 6 files changed, 342 insertions(+), 342 deletions(-) create mode 100644 Modules/_fnmatch/filter.c create mode 100644 Modules/_fnmatch/macros.h delete mode 100644 Modules/_fnmatch/matcher.c rename Modules/_fnmatch/{_fnmatchmodule.h => util.h} (60%) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index acab53951d7b89..1f03f050f4c831 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,6 +1,35 @@ -#include "_fnmatchmodule.h" +/* + * C accelerator for the 'fnmatch' module. 
+ * + * Currently, the following inconsistencies in the Python implementation exist: + * + * - fnmatch.filter(NAMES, PATTERN) works with pathlib.Path() instances + * in NAMES on Windows but raises a TypeError on POSIX platforms. + * + * The reason is that os.path.normcase() is called on each NAME in NAMES + * but not on POSIX platforms. In particular, os.fspath() is never called: + * + * POSIX fnmatch.filter([Path("a")], "*") -> TypeError + * Windows fnmatch.filter([Path("a")], "*") -> [Path("a")] + * + * - Case normalization uses the runtime value of os.path.normcase(), + * forcing us to query the attribute each time. + * + * The C implementation of fnmatch.filter() uses the same os.path.normcase() + * when iterating over NAMES, ignoring side-effects on os.path.normcase() + * that may occur when processing a NAME in NAMES. + * + * More generally, os.path.normcase() is retrieved at most once per call + * to fnmatch.filter() or fnmatch.fnmatch(). + */ + +#ifndef Py_BUILD_CORE_BUILTIN +# define Py_BUILD_CORE_MODULE 1 +#endif + +#include "util.h" // prototypes -#include "pycore_runtime.h" // _Py_ID() +#include "pycore_runtime.h" // for _Py_ID() #include "clinic/_fnmatchmodule.c.h" @@ -26,9 +55,13 @@ get_matcher_function_impl(PyObject *module, PyObject *pattern) } fnmatchmodule_state *st = get_fnmatchmodule_state(module); // compile the pattern - PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, - &_Py_ID(compile), - translated); + PyObject *compile_func = PyObject_GetAttr(st->re_module, &_Py_ID(compile)); + if (compile_func == NULL) { + Py_DECREF(translated); + return NULL; + } + PyObject *compiled = PyObject_CallOneArg(compile_func, translated); + Py_DECREF(compile_func); Py_DECREF(translated); if (compiled == NULL) { return NULL; @@ -41,7 +74,7 @@ get_matcher_function_impl(PyObject *module, PyObject *pattern) static PyMethodDef get_matcher_function_def = { "get_matcher_function", - (PyCFunction)(get_matcher_function_impl), + get_matcher_function_impl, 
METH_O, NULL }; @@ -55,25 +88,25 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) if (maxsize == NULL) { return -1; } - PyObject *lru_cache = _PyImport_GetModuleAttrString("functools", - "lru_cache"); - if (lru_cache == NULL) { + PyObject *cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + if (cache == NULL) { Py_DECREF(maxsize); return -1; } - PyObject *decorator = PyObject_CallFunctionObjArgs( - lru_cache, maxsize, Py_True, NULL); - Py_DECREF(lru_cache); + PyObject *args[3] = {NULL, maxsize, Py_True}; + size_t nargsf = 2 | PY_VECTORCALL_ARGUMENTS_OFFSET; + PyObject *wrapper = PyObject_Vectorcall(cache, &args[1], nargsf, NULL); Py_DECREF(maxsize); - if (decorator == NULL) { + Py_DECREF(cache); + if (wrapper == NULL) { return -1; } assert(module != NULL); - PyObject *decorated = PyCFunction_New(&get_matcher_function_def, module); + PyObject *wrapped = PyCFunction_New(&get_matcher_function_def, module); // reference on 'translator' will be removed upon module cleanup - st->translator = PyObject_CallOneArg(decorator, decorated); - Py_DECREF(decorated); - Py_DECREF(decorator); + st->translator = PyObject_CallOneArg(wrapper, wrapped); + Py_DECREF(wrapped); + Py_DECREF(wrapper); if (st->translator == NULL) { return -1; } @@ -100,7 +133,7 @@ get_platform_normcase_function(PyObject *module, bool *isposix) } PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); if (isposix != NULL) { - *isposix = (bool)Py_Is(os_path, st->posixpath_module); + *isposix = Py_Is(os_path, st->posixpath_module); } Py_DECREF(os_path); return normcase; @@ -208,10 +241,6 @@ static PyObject * fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern) /*[clinic end generated code: output=1a68530a2e3cf7d0 input=7ac729daad3b1404]*/ { - // filter() always calls os.path.normcase() on the pattern, - // but not on the names being mathed if os.path is posixmodule - // XXX: maybe this should be changed in Python as well? 
- // Note: the Python implementation uses the *runtime* os.path.normcase. bool isposix = 0; PyObject *normcase = get_platform_normcase_function(module, &isposix); if (normcase == NULL) { @@ -229,9 +258,8 @@ fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern) Py_DECREF(normcase); return NULL; } - PyObject *filtered = isposix - ? _Py_fnmatch_filter(matcher, names) - : _Py_fnmatch_filter_normalized(matcher, names, normcase); + PyObject *normalizer = isposix ? NULL : normcase; + PyObject *filtered = _Py_fnmatch_filter(matcher, names, normalizer); Py_DECREF(matcher); Py_DECREF(normcase); return filtered; @@ -308,8 +336,12 @@ fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pattern) if (matcher == NULL) { return -1; } - int matching = _Py_fnmatch_match(matcher, name); + // If 'name' is of incorrect type, it will be detected when calling + // the matcher function (we check 're.compile(pattern).match(name)'). + PyObject *match = PyObject_CallOneArg(matcher, name); Py_DECREF(matcher); + int matching = match == NULL ? -1 : !Py_IsNone(match); + Py_XDECREF(match); return matching; } diff --git a/Modules/_fnmatch/filter.c b/Modules/_fnmatch/filter.c new file mode 100644 index 00000000000000..5b44f6accfc8df --- /dev/null +++ b/Modules/_fnmatch/filter.c @@ -0,0 +1,50 @@ +/* + * Provide the implementation of the high-level matcher-based functions. 
+ */ + +#include "Python.h" + +PyObject * +_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer) +{ + PyObject *iter = PyObject_GetIter(names); + if (iter == NULL) { + return NULL; + } + PyObject *res = PyList_New(0); + if (res == NULL) { + Py_DECREF(iter); + return NULL; + } + PyObject *name = NULL; + while ((name = PyIter_Next(iter))) { + PyObject *match; + if (normalizer == NULL) { + match = PyObject_CallOneArg(matcher, name); + } + else { + PyObject *normalized = PyObject_CallOneArg(normalizer, name); + if (normalized == NULL) { + goto abort; + } + match = PyObject_CallOneArg(matcher, normalized); + Py_DECREF(normalized); + } + if (match == NULL) { + goto abort; + } + int matching = Py_IsNone(match) == 0; + Py_DECREF(match); + if (matching && PyList_Append(res, name) < 0) { + goto abort; + } + Py_DECREF(name); + } + Py_DECREF(iter); + return res; +abort: + Py_DECREF(name); + Py_DECREF(iter); + Py_DECREF(res); + return NULL; +} diff --git a/Modules/_fnmatch/macros.h b/Modules/_fnmatch/macros.h new file mode 100644 index 00000000000000..04bf4a684a3035 --- /dev/null +++ b/Modules/_fnmatch/macros.h @@ -0,0 +1,119 @@ +/* + * This file contains various macro definitions in order to reduce the + * number of lines in translate.c. Do not use them for something else. + */ + +#ifndef _FNMATCH_MACROS_H +#define _FNMATCH_MACROS_H + +// ==== Macro definitions ===================================================== + +// The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs +// since they directly delegate to the _PyUnicodeWriter_Write* underlying +// function. In particular, the caller is responsible for type safety. + +/* Write a character CHAR. */ +#define _WRITE_CHAR(WRITER, CHAR) \ + _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(WRITER), (CHAR)) + +/* Write an ASCII string STRING of given length LENGTH. 
*/ +#define _WRITE_ASCII(WRITER, STRING, LENGTH) \ + _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(WRITER), \ + (STRING), (LENGTH)) +/* Write the string STRING. */ +#define _WRITE_STRING(WRITER, STRING) \ + _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(WRITER), (STRING)) + +/* Write the substring STRING[START:STOP]. */ +#define _WRITE_BLOCK(WRITER, STRING, START, STOP) \ + _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(WRITER), \ + (STRING), (START), (STOP)) + +// ---------------------------------------------------------------------------- + +/* Write a character CHAR or execute the ON_ERROR statements if it fails. */ +#define WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ + do { \ + if (_WRITE_CHAR((WRITER), (CHAR)) < 0) { \ + ON_ERROR; \ + } \ + } while (0) + +/* + * Write an ASCII string STRING of given length LENGTH, + * or execute the ON_ERROR statements if it fails. + */ +#define WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ + do { \ + if (_WRITE_ASCII((WRITER), (ASCII), (LENGTH)) < 0) { \ + ON_ERROR; \ + } \ + } while (0) + +/* Write the string STRING or execute the ON_ERROR statements if it fails. */ +#define WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ + do { \ + if (_WRITE_STRING((WRITER), (STRING)) < 0) { \ + ON_ERROR; \ + } \ + } while (0) + +/* + * Write the substring STRING[START:STOP] if START < STOP, + * or execute the ON_ERROR statements if it fails. + */ +#define WRITE_BLOCK_OR(WRITER, STRING, START, STOP, ON_ERROR) \ + do { \ + /* intermediate variables to allow in-place operations */ \ + Py_ssize_t _i = (START), _j = (STOP); \ + if (_i < _j && _WRITE_BLOCK((WRITER), (STRING), _i, _j) < 0) { \ + ON_ERROR; \ + } \ + } while (0) + +// ---------------------------------------------------------------------------- + +// Macros which execute "goto abort" if an error occurs. 
+ +#define WRITE_CHAR_OR_ABORT(WRITER, CHAR) \ + WRITE_CHAR_OR((WRITER), (CHAR), goto abort) +#define WRITE_ASCII_OR_ABORT(WRITER, STRING, LENGTH) \ + WRITE_ASCII_OR((WRITER), (STRING), (LENGTH), goto abort) +#define WRITE_STRING_OR_ABORT(WRITER, STRING) \ + WRITE_STRING_OR((WRITER), (STRING), goto abort) +#define WRITE_BLOCK_OR_ABORT(WRITER, STRING, START, STOP) \ + WRITE_BLOCK_OR((WRITER), (STRING), (START), (STOP), goto abort) + +// ---------------------------------------------------------------------------- + +/* Replace backslashes in STRING by escaped backslashes. */ +#define BACKSLASH_REPLACE(STATE, STRING) \ + PyObject_CallMethodObjArgs( \ + (STRING), \ + &_Py_ID(replace), \ + (STATE)->backslash_str, \ + (STATE)->backslash_esc_str, \ + NULL \ + ) + +/* Replace hyphens in STRING by escaped hyphens. */ +#define HYPHEN_REPLACE(STATE, STRING) \ + PyObject_CallMethodObjArgs( \ + (STRING), \ + &_Py_ID(replace), \ + (STATE)->hyphen_str, \ + (STATE)->hyphen_esc_str, \ + NULL \ + ) + +/* Escape set operations in STRING using re.sub(). */ +#define SETOPS_REPLACE(STATE, STRING, RE_SUB_FUNC) \ + PyObject_CallFunctionObjArgs( \ + (RE_SUB_FUNC), \ + (STATE)->setops_str, \ + (STATE)->setops_repl_str, \ + (STRING), \ + NULL \ + ) + +#endif // _FNMATCH_MACROS_H diff --git a/Modules/_fnmatch/matcher.c b/Modules/_fnmatch/matcher.c deleted file mode 100644 index 22fdc41d719b59..00000000000000 --- a/Modules/_fnmatch/matcher.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Provide the implementation of the high-level matcher-based functions. - */ - -#include "_fnmatchmodule.h" - -inline int -_Py_fnmatch_match(PyObject *matcher, PyObject *name) -{ - // If 'name' is of incorrect type, it will be detected when calling - // the matcher function (we emulate 're.compile(...).match(name)'). - PyObject *match = PyObject_CallOneArg(matcher, name); - if (match == NULL) { - return -1; - } - int matching = Py_IsNone(match) ? 
0 : 1; - Py_DECREF(match); - return matching; -} - -PyObject * -_Py_fnmatch_filter(PyObject *matcher, PyObject *names) -{ - PyObject *iter = PyObject_GetIter(names); - if (iter == NULL) { - return NULL; - } - PyObject *res = PyList_New(0); - if (res == NULL) { - Py_DECREF(iter); - return NULL; - } - PyObject *name = NULL; - while ((name = PyIter_Next(iter))) { - int matching = _Py_fnmatch_match(matcher, name); - if (matching < 0 || (matching == 1 && PyList_Append(res, name) < 0)) { - goto abort; - } - Py_DECREF(name); - } - Py_DECREF(iter); - return res; -abort: - Py_DECREF(name); - Py_DECREF(iter); - Py_DECREF(res); - return NULL; -} - -PyObject * -_Py_fnmatch_filter_normalized(PyObject *matcher, - PyObject *names, - PyObject *normcase) -{ - PyObject *iter = PyObject_GetIter(names); - if (iter == NULL) { - return NULL; - } - PyObject *res = PyList_New(0); - if (res == NULL) { - Py_DECREF(iter); - return NULL; - } - PyObject *name = NULL; - while ((name = PyIter_Next(iter))) { - PyObject *normalized = PyObject_CallOneArg(normcase, name); - if (normalized == NULL) { - goto abort; - } - int matching = _Py_fnmatch_match(matcher, normalized); - Py_DECREF(normalized); - // add the non-normalized name if its normalization matches - if (matching < 0 || (matching == 1 && PyList_Append(res, name) < 0)) { - goto abort; - } - Py_DECREF(name); - } - Py_DECREF(iter); - return res; -abort: - Py_DECREF(name); - Py_DECREF(iter); - Py_DECREF(res); - return NULL; -} diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 47fd72a2de69f2..10552434a7d616 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -3,97 +3,14 @@ * to RE patterns. 
*/ -#include "_fnmatchmodule.h" // for get_fnmatchmodulestate_state() +#ifndef Py_BUILD_CORE_BUILTIN +# define Py_BUILD_CORE_MODULE 1 +#endif -#include "pycore_call.h" +#include "macros.h" +#include "util.h" // for get_fnmatchmodulestate_state() -// ==== Macro definitions ===================================================== - -/* Execute the ON_ERROR statements if "CALL < 0". */ -#define _INTERNAL_CALL_OR_FAIL(CALL, ON_ERROR) \ - do { \ - if ((CALL) < 0) { \ - ON_ERROR; \ - } \ - } while (0) - -// The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs -// since they directly delegate to the _PyUnicodeWriter_Write* underlying -// function. In particular, the caller is responsible for type safety. - -/* write a character CHAR */ -#define _WRITE_CHAR(WRITER, CHAR) \ - _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(WRITER), (CHAR)) -/* write a character CHAR or execute the ON_ERROR statements if it fails */ -#define _WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ - _INTERNAL_CALL_OR_FAIL(_WRITE_CHAR((WRITER), (CHAR)), ON_ERROR) - -/* write an ASCII string STRING of given length LENGTH */ -#define _WRITE_ASCII(WRITER, ASCII, LENGTH) \ - _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(WRITER), \ - (ASCII), (LENGTH)) -/* - * Write an ASCII string STRING of given length LENGTH, - * or execute the ON_ERROR statements if it fails. 
- */ -#define _WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ - _INTERNAL_CALL_OR_FAIL(_WRITE_ASCII((WRITER), (ASCII), (LENGTH)), ON_ERROR) - -/* write the string STRING */ -#define _WRITE_STRING(WRITER, STRING) \ - _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(WRITER), (STRING)) -/* write the string STRING or execute the ON_ERROR statements if it fails */ -#define _WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ - _INTERNAL_CALL_OR_FAIL(_WRITE_STRING((WRITER), (STRING)), ON_ERROR) - -/* write the substring STRING[START:STOP] */ -#define _WRITE_BLOCK(WRITER, STRING, START, STOP) \ - _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(WRITER), \ - (STRING), (START), (STOP)) -/* - * Write the substring STRING[START:STOP] if START < STOP, - * or execute the ON_ERROR statements if it fails. - */ -#define _WRITE_BLOCK_OR(WRITER, STRING, START, STOP, ON_ERROR) \ - do { \ - /* intermediate variables to allow in-place operations */ \ - Py_ssize_t _i = (START), _j = (STOP); \ - if (_i < _j && _WRITE_BLOCK((WRITER), (STRING), _i, _j) < 0) { \ - ON_ERROR; \ - } \ - } while (0) - -// ==== Inline helpers ======================================================== - -/* replace backslashes in STRING by escaped backslashes */ -#define BACKSLASH_REPLACE(STATE, STRING) \ - PyObject_CallMethodObjArgs( \ - (STRING), \ - &_Py_ID(replace), \ - (STATE)->backslash_str, \ - (STATE)->backslash_esc_str, \ - NULL \ - ) - -/* replace hyphens in STRING by escaped hyphens */ -#define HYPHEN_REPLACE(STATE, STRING) \ - PyObject_CallMethodObjArgs( \ - (STRING), \ - &_Py_ID(replace), \ - (STATE)->hyphen_str, \ - (STATE)->hyphen_esc_str, \ - NULL \ - ) - -/* escape set operations in STRING using re.sub() */ -#define SETOPS_REPLACE(STATE, STRING, RE_SUB_FUNC) \ - PyObject_CallFunctionObjArgs( \ - (RE_SUB_FUNC), \ - (STATE)->setops_str, \ - (STATE)->setops_repl_str, \ - (STRING), \ - NULL \ - ) +#include "pycore_runtime.h" // for _Py_ID() // ==== Helper declarations 
=================================================== @@ -154,11 +71,11 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) const Py_ssize_t maxind = PyUnicode_GET_LENGTH(pattern); // We would write less data if there are successive '*', - // which should not be the case in general. Otherwise, - // we write >= n characters since escaping them always - // add more characters. + // which usually happens once or twice in the pattern. + // Otherwise, we write >= maxind characters since escaping + // them always add more characters. // - // Note that only b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f' need to + // Note that only '()[]{}?*+-|^$\\.&~# \t\n\r\v\f' need to // be escaped when translated to RE patterns and '*' and '?' // are already handled without being escaped. // @@ -168,8 +85,6 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // and there is a sparse number of them. Therefore, we only // estimate the number of characters to be written to be the // same as the number of characters in the pattern. - // - // TODO(picnixz): should we limit the estimation? 
PyUnicodeWriter *writer = PyUnicodeWriter_Create(maxind); if (writer == NULL) { return NULL; @@ -197,11 +112,12 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) CACHE_ATTRIBUTE(pattern_str_find_meth, pattern, &_Py_ID(find)); #undef CACHE_ATTRIBUTE - const int _unicode_kind = PyUnicode_KIND(pattern); - const void *const _unicode_data = PyUnicode_DATA(pattern); + const int pattern_kind = PyUnicode_KIND(pattern); + const void *const pattern_data = PyUnicode_DATA(pattern); // ---- def local macros -------------------------------------------------- -#define READ_CHAR(IND) PyUnicode_READ(_unicode_kind, _unicode_data, (IND)) -#define WRITE_CHAR(CHAR) _WRITE_CHAR_OR(writer, (CHAR), goto abort) +#define READ_CHAR(IND) PyUnicode_READ(pattern_kind, pattern_data, IND) +#define WRITE_CHAR(CHAR) WRITE_CHAR_OR_ABORT(writer, CHAR) +#define WRITE_ASCII(STR, LEN) WRITE_ASCII_OR_ABORT(writer, STR, LEN) /* advance IND if the character is CHAR */ #define ADVANCE_IF_CHAR_IS(CHAR, IND, MAXIND) \ do { \ @@ -219,6 +135,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) WRITE_CHAR('*'); // skip duplicated '*' for (; i < maxind && READ_CHAR(i) == '*'; ++i); + // store the position of the wildcard PyObject *wildcard_index = PyLong_FromSsize_t(written++); if (wildcard_index == NULL) { goto abort; @@ -244,9 +161,9 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ADVANCE_IF_CHAR_IS(']', j, maxind); // [!] 
or [] for (; j < maxind && READ_CHAR(j) != ']'; ++j); // locate ']' if (j >= maxind) { - _WRITE_ASCII_OR(writer, "\\[", 2, goto abort); - written += 2; // we just wrote 2 characters - break; // early break for clarity + WRITE_ASCII("\\[", 2); + written += 2; // we just wrote 2 characters + break; // explicit early break for clarity } else { assert(READ_CHAR(j) == ']'); @@ -282,7 +199,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } written += expr_len; i = j + 1; // jump to the character after ']' - break; // early break for clarity + break; // explicit early break for clarity } } default: { @@ -296,6 +213,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) goto abort; } Py_ssize_t escaped_len = PyUnicode_GET_LENGTH(escaped); + // Do NOT use WRITE_STRING_OR_ABORT() since 'escaped' + // must be first decref'ed in case of an error. int rc = _WRITE_STRING(writer, escaped); Py_DECREF(escaped); if (rc < 0) { @@ -307,8 +226,9 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } } #undef ADVANCE_IF_CHAR_IS +#undef WRITE_ASCII #undef WRITE_CHAR -#undef READ +#undef READ_CHAR Py_DECREF(pattern_str_find_meth); Py_DECREF(re_sub_func); Py_DECREF(re_escape_func); @@ -487,17 +407,17 @@ simplify_expression(PyObject *chunks) assert(c2len > 1); PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen); if (writer == NULL) { - return -1; + goto abort; } // all but the last character in the first chunk if (_WRITE_BLOCK(writer, c1, 0, c1len - 1) < 0) { PyUnicodeWriter_Discard(writer); - return -1; + goto abort; } // all but the first character in the second chunk if (_WRITE_BLOCK(writer, c2, 1, c2len) < 0) { PyUnicodeWriter_Discard(writer); - return -1; + goto abort; } str = PyUnicodeWriter_Finish(writer); } @@ -506,14 +426,16 @@ simplify_expression(PyObject *chunks) // is an issue while setting the item. 
if (str == NULL || PyList_SetItem(chunks, k - 1, str) < 0) { Py_XDECREF(str); - return -1; + goto abort; } if (PySequence_DelItem(chunks, k) < 0) { - return -1; + goto abort; } } } return 0; +abort: + return -1; } /* @@ -530,7 +452,7 @@ escape_expression(fnmatchmodule_state *state, PyObject *chunks) assert(s0 != NULL); PyObject *s1 = BACKSLASH_REPLACE(state, s0); if (s1 == NULL) { - return -1; + goto abort; } PyObject *s2 = HYPHEN_REPLACE(state, s1); Py_DECREF(s1); @@ -539,10 +461,12 @@ escape_expression(fnmatchmodule_state *state, PyObject *chunks) // is an issue while setting the item. if (s2 == NULL || PyList_SetItem(chunks, c, s2) < 0) { Py_XDECREF(s2); - return -1; + goto abort; } } return 0; +abort: + return -1; } static PyObject * @@ -575,13 +499,15 @@ static Py_ssize_t write_expression(PyUnicodeWriter *writer, PyObject *expression) { // ---- def local macros -------------------------------------------------- -#define WRITE_CHAR(CHAR) _WRITE_CHAR_OR(writer, (CHAR), return -1) -#define WRITE_STRING(STR) _WRITE_STRING_OR(writer, (STR), return -1) +#define WRITE_CHAR(CHAR) WRITE_CHAR_OR_ABORT(writer, CHAR) +#define WRITE_ASCII(STR, LEN) WRITE_ASCII_OR_ABORT(writer, STR, LEN) +#define WRITE_STRING(STR) WRITE_STRING_OR_ABORT(writer, STR) +#define WRITE_BLOCK(STR, I, J) WRITE_BLOCK_OR_ABORT(writer, STR, I, J) // ------------------------------------------------------------------------ Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { // empty range: never match - _WRITE_ASCII_OR(writer, "(?!)", 4, return -1); + WRITE_ASCII("(?!)", 4); return 4; } Py_UCS4 token = PyUnicode_READ_CHAR(expression, 0); @@ -595,7 +521,7 @@ write_expression(PyUnicodeWriter *writer, PyObject *expression) switch (token) { case '!': { WRITE_CHAR('^'); // replace '!' 
by '^' - _WRITE_BLOCK_OR(writer, expression, 1, grouplen, return -1); + WRITE_BLOCK(expression, 1, grouplen); break; } case '^': @@ -612,105 +538,90 @@ write_expression(PyUnicodeWriter *writer, PyObject *expression) } WRITE_CHAR(']'); return grouplen + extra; +abort: + return -1; +#undef WRITE_BLOCK #undef WRITE_STRING +#undef WRITE_ASCII #undef WRITE_CHAR } static PyObject * process_wildcards(PyObject *pattern, PyObject *indices) { - const Py_ssize_t M = PyList_GET_SIZE(indices); - if (M == 0) { - // "(?s:" + pattern + ")\Z" - return PyUnicode_FromFormat("(?s:%U)\\Z", pattern); - } - // Special cases: indices[0] == 0 or indices[-1] + 1 == n - // - // If indices[0] == 0 write (?>.*?abcdef) instead of abcdef - // If indices[-1] == n - 1 write '.*' instead of empty string - Py_ssize_t i = 0, N = PyUnicode_GET_LENGTH(pattern); - // get the first position of '*' - Py_ssize_t j = PyLong_AsSsize_t(PyList_GET_ITEM(indices, 0)); - if (j < 0) { - return NULL; - } - // By construction, we have - // - // pattern = [PREFIX] [[(* INNER) ... (* INNER)] (* OUTER)] [*] - // - // where [...] is an optional group and () is required to exist. + const Py_ssize_t n = PyUnicode_GET_LENGTH(pattern); + const Py_ssize_t m = PyList_GET_SIZE(indices); + // Let m = len(indices) and n = len(pattern). By construction, // - // Case 1: pattern ends with a wildcard: + // pattern = [PREFIX] [[(* INNER) ... (* INNER)] (*) [OUTER]] // - // - Write the PREFIX. - // - Write any group (* GROUP) as "(?>.*?" + GROUP + ")". - // - Write a final ".*" due to the final wildcard. - // - Number of characters to write: N + 6 * (M - 1) + 1, where - // the +1 is because the '*' in the final ".*" is counted by N. + // where [...] is an optional group and (...) is a required group. // - // Case 2: pattern does not end with a wildcard: + // The algorithm is as follows: // - // - Write the PREFIX. - // - Write an INNER group (* INNER) as "(?>.*?" + INNER + ")". - // - Write the OUTER group (* OUTER) as ".*" + OUTER. 
- // - Number of characters to write: N + 6 * (M - 1) + 1, where - // the +1 is because the '*' in ".*" + OUTER is counted by N. + // - Write "(?s:". + // - Write the optional PREFIX. + // - Write an INNER group (* INNER) as "(?>.*?" + INNER + ")". + // - Write ".*" instead of the last wildcard. + // - Write an optional OUTER string normally. + // - Write ")\\Z". // - // In both cases, we write N + 6(M - 1) + 1 characters. Since the final - // result is surrounded by "(?s:" and ")\\Z", we have: - // - // Number of written characters: N + 6(M - 1) + 1 + 7 = N + 6M + 2. - Py_ssize_t output_size = 6 * M + N + 2; - PyUnicodeWriter *writer = PyUnicodeWriter_Create(output_size); + // If m = 0, the writer needs n + 7 characters. Otherwise, it requires + // exactly n + 6(m-1) + 1 + 7 = n + 6m + 2 characters, where the "+1" + // is due to the fact that writing ".*" instead of "*" only increases + // the total length of the pattern by 1 (and not by 2). + const Py_ssize_t reslen = m == 0 ? n + 7 : n + 6 * m + 2; + PyUnicodeWriter *writer = PyUnicodeWriter_Create(reslen); if (writer == NULL) { return NULL; } - // write everything before the first wildcard normally - _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); - i = j + 1; // jump after the '*' - for (Py_ssize_t k = 1; k < M; ++k) { - // process all but the last wildcard - PyObject *ind = PyList_GET_ITEM(indices, k); - assert(ind != NULL); - j = PyLong_AsSsize_t(ind); - if (j < 0) { - goto abort; - } - assert(i < j); - // write the atomic RE group '(?>.*?' 
+ BLOCK + ')' - _WRITE_ASCII_OR(writer, "(?>.*?", 6, goto abort); - _WRITE_BLOCK_OR(writer, pattern, i, j, goto abort); - _WRITE_CHAR_OR(writer, ')', goto abort); - i = j + 1; + // ---- def local macros -------------------------------------------------- +#define WRITE_CHAR(CHAR) WRITE_CHAR_OR_ABORT(writer, CHAR) +#define WRITE_ASCII(STR, LEN) WRITE_ASCII_OR_ABORT(writer, STR, LEN) +#define WRITE_STRING(STR) WRITE_STRING_OR_ABORT(writer, STR) +#define WRITE_BLOCK(STR, I, J) WRITE_BLOCK_OR_ABORT(writer, STR, I, J) +#define LOAD_WILDCARD_INDEX(VAR, IND) \ + do { \ + VAR = PyLong_AsSsize_t(PyList_GET_ITEM(indices, IND)); \ + if (VAR < 0) { \ + goto abort; \ + } \ + } while (0) + // ------------------------------------------------------------------------ + WRITE_ASCII("(?s:", 4); + if (m == 0) { + WRITE_STRING(pattern); } - // handle the remaining wildcard - _WRITE_ASCII_OR(writer, ".*", 2, goto abort); - // write the remaining substring (if non-empty) - _WRITE_BLOCK_OR(writer, pattern, i, N, goto abort); - PyObject *processed = PyUnicodeWriter_Finish(writer); - if (processed == NULL) { - return NULL; + else { + Py_ssize_t i = 0, j = -1; + // process the optional PREFIX + LOAD_WILDCARD_INDEX(j, 0); + WRITE_BLOCK(pattern, i, j); + i = j + 1; + for (Py_ssize_t k = 1; k < m; ++k) { + // process the (* INNER) groups + LOAD_WILDCARD_INDEX(j, k); + assert(i < j); + // write the atomic RE group '(?>.*?' 
+ INNER + ')' + WRITE_ASCII("(?>.*?", 6); + WRITE_BLOCK(pattern, i, j); + WRITE_CHAR(')'); + i = j + 1; + } + // handle the (*) [OUTER] part + WRITE_ASCII(".*", 2); + WRITE_BLOCK(pattern, i, n); } - // "(?s:" + processed + ")\\Z" - PyObject *res = PyUnicode_FromFormat("(?s:%U)\\Z", processed); - assert(PyUnicode_GET_LENGTH(res) == output_size); - Py_DECREF(processed); + WRITE_ASCII(")\\Z", 3); + PyObject *res = PyUnicodeWriter_Finish(writer); + assert(res == NULL || PyUnicode_GET_LENGTH(res) == reslen); return res; abort: PyUnicodeWriter_Discard(writer); return NULL; +#undef LOAD_WILDCARD_INDEX +#undef WRITE_BLOCK +#undef WRITE_STRING +#undef WRITE_ASCII +#undef WRITE_CHAR } - -#undef SETOPS_REPLACE -#undef HYPHEN_REPLACE -#undef BACKSLASH_REPLACE - -#undef _WRITE_BLOCK_OR -#undef _WRITE_BLOCK -#undef _WRITE_STRING_OR -#undef _WRITE_STRING -#undef _WRITE_ASCII_OR -#undef _WRITE_ASCII -#undef _WRITE_CHAR_OR -#undef _WRITE_CHAR -#undef _INTERNAL_CALL_OR_FAIL diff --git a/Modules/_fnmatch/_fnmatchmodule.h b/Modules/_fnmatch/util.h similarity index 60% rename from Modules/_fnmatch/_fnmatchmodule.h rename to Modules/_fnmatch/util.h index ae1c01c90f8d26..371930f5c17262 100644 --- a/Modules/_fnmatch/_fnmatchmodule.h +++ b/Modules/_fnmatch/util.h @@ -1,13 +1,9 @@ /* - * C accelerator for the 'fnmatch' module. + * This file contains helper prototypes and structures. */ -#ifndef _FNMATCHMODULE_H -#define _FNMATCHMODULE_H - -#ifndef Py_BUILD_CORE_BUILTIN -# define Py_BUILD_CORE_MODULE 1 -#endif +#ifndef _FNMATCH_UTIL_H +#define _FNMATCH_UTIL_H #include "Python.h" @@ -40,23 +36,6 @@ get_fnmatchmodule_state(PyObject *module) // ==== Helper prototypes ===================================================== -/* - * Test whether a name matches a compiled RE pattern. - * - * Parameters - * - * matcher A reference to the 'match()' method of a compiled pattern. - * string The string to match (str or bytes object). 
- * - * Returns - * - * -1 if the call 'matcher(string)' failed (e.g., invalid type), - * 0 if the 'string' does NOT match the pattern, - * 1 if the 'string' matches the pattern. - */ -extern int -_Py_fnmatch_match(PyObject *matcher, PyObject *string); - /* * Returns a list of matched names, or NULL if an error occurred. * @@ -64,20 +43,14 @@ _Py_fnmatch_match(PyObject *matcher, PyObject *string); * * matcher A reference to the 'match()' method of a compiled pattern. * names An iterable of strings (str or bytes objects) to match. - */ -extern PyObject * -_Py_fnmatch_filter(PyObject *matcher, PyObject *names); - -/* - * Similar to _Py_fnmatch_filter() but matches os.path.normcase(name) - * instead. The returned values are however a sub-sequence of 'names'. + * normalizer Optional normalization function. + * + * This is equivalent to: * - * The 'normcase' argument is a callable implementing os.path.normcase(). + * [name for name in names if matcher(normalizer(name))] */ extern PyObject * -_Py_fnmatch_filter_normalized(PyObject *matcher, - PyObject *names, - PyObject *normcase); +_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer); /* * C accelerator for translating UNIX shell patterns into RE patterns. 
@@ -90,4 +63,4 @@ _Py_fnmatch_filter_normalized(PyObject *matcher, extern PyObject * _Py_fnmatch_translate(PyObject *module, PyObject *pattern); -#endif // _FNMATCHMODULE_H +#endif // _FNMATCH_UTIL_H From 0622be6118734ccb89d8c175117f2169f1241ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 20 Jul 2024 12:07:15 +0200 Subject: [PATCH 69/97] update dependencies --- Makefile.pre.in | 2 +- Modules/Setup.stdlib.in | 2 +- PCbuild/pythoncore.vcxproj | 4 ++-- PCbuild/pythoncore.vcxproj.filters | 7 +++++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Makefile.pre.in b/Makefile.pre.in index 07cbd0a7567233..8bc19b0410fdfe 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -3117,7 +3117,7 @@ MODULE__CTYPES_TEST_DEPS=$(srcdir)/Modules/_ctypes/_ctypes_test_generated.c.h MODULE__CTYPES_MALLOC_CLOSURE=@MODULE__CTYPES_MALLOC_CLOSURE@ MODULE__DECIMAL_DEPS=$(srcdir)/Modules/_decimal/docstrings.h @LIBMPDEC_INTERNAL@ MODULE__ELEMENTTREE_DEPS=$(srcdir)/Modules/pyexpat.c @LIBEXPAT_INTERNAL@ -MODULE__FNMATCH_DEPS=$(srcdir)/Modules/_fnmatch/_fnmatchmodule.h +MODULE__FNMATCH_DEPS=$(srcdir)/Modules/_fnmatch/macros.h $(srcdir)/Modules/_fnmatch/util.h MODULE__HASHLIB_DEPS=$(srcdir)/Modules/hashlib.h MODULE__IO_DEPS=$(srcdir)/Modules/_io/_iomodule.h MODULE__MD5_DEPS=$(srcdir)/Modules/hashlib.h $(LIBHACL_HEADERS) Modules/_hacl/Hacl_Hash_MD5.h Modules/_hacl/Hacl_Hash_MD5.c diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index e689d18b70b035..f33af67aa26499 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -33,7 +33,7 @@ @MODULE__BISECT_TRUE@_bisect _bisectmodule.c @MODULE__CONTEXTVARS_TRUE@_contextvars _contextvarsmodule.c @MODULE__CSV_TRUE@_csv _csv.c -@MODULE__FNMATCH_TRUE@_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/matcher.c _fnmatch/translate.c +@MODULE__FNMATCH_TRUE@_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/filter.c _fnmatch/translate.c 
@MODULE__HEAPQ_TRUE@_heapq _heapqmodule.c @MODULE__JSON_TRUE@_json _json.c @MODULE__LSPROF_TRUE@_lsprof _lsprof.c rotatingtree.c diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index ea52c20cc66db1..0d83ac770348b1 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -368,7 +368,7 @@ - + @@ -475,7 +475,7 @@ - + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 912407b56ed783..d68d8df7aa3ba7 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -252,7 +252,10 @@ Modules - + + Modules\_fnmatch + + Modules\_fnmatch @@ -1064,7 +1067,7 @@ Modules\_fnmatch - + Modules\_fnmatch From 481fae0ecc42a13cc53b95e0106103dba6477cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 20 Jul 2024 12:07:20 +0200 Subject: [PATCH 70/97] merge commit --- Tools/cases_generator/tier1_generator.py | 2 +- Tools/cases_generator/tier2_generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/cases_generator/tier1_generator.py b/Tools/cases_generator/tier1_generator.py index 5dec66e8e0af15..a5a771cbf25392 100644 --- a/Tools/cases_generator/tier1_generator.py +++ b/Tools/cases_generator/tier1_generator.py @@ -51,7 +51,7 @@ def declare_variables(inst: Instruction, out: CWriter) -> None: for var in reversed(part.stack.inputs): stack.pop(var) for var in part.stack.outputs: - stack.push(var) + stack.push(var) except StackError as ex: raise analysis_error(ex.args[0], part.body[0]) from None required = set(stack.defined) diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py index 88ad0fd797f0cc..6e4b92f4c28c6b 100644 --- a/Tools/cases_generator/tier2_generator.py +++ b/Tools/cases_generator/tier2_generator.py @@ -53,7 +53,7 @@ def declare_variables(uop: Uop, out: CWriter) -> None: for var in reversed(uop.stack.inputs): stack.pop(var) for 
var in uop.stack.outputs: - stack.push(var) + stack.push(var) required = set(stack.defined) for var in reversed(uop.stack.inputs): declare_variable(var, uop, required, out) From ff60eeb068c1611b38f7cef52444c9c97d85e292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 20 Jul 2024 12:08:12 +0200 Subject: [PATCH 71/97] Revert "merge commit" This reverts commit 481fae0ecc42a13cc53b95e0106103dba6477cb8. --- Tools/cases_generator/tier1_generator.py | 2 +- Tools/cases_generator/tier2_generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/cases_generator/tier1_generator.py b/Tools/cases_generator/tier1_generator.py index a5a771cbf25392..5dec66e8e0af15 100644 --- a/Tools/cases_generator/tier1_generator.py +++ b/Tools/cases_generator/tier1_generator.py @@ -51,7 +51,7 @@ def declare_variables(inst: Instruction, out: CWriter) -> None: for var in reversed(part.stack.inputs): stack.pop(var) for var in part.stack.outputs: - stack.push(var) + stack.push(var) except StackError as ex: raise analysis_error(ex.args[0], part.body[0]) from None required = set(stack.defined) diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py index 6e4b92f4c28c6b..88ad0fd797f0cc 100644 --- a/Tools/cases_generator/tier2_generator.py +++ b/Tools/cases_generator/tier2_generator.py @@ -53,7 +53,7 @@ def declare_variables(uop: Uop, out: CWriter) -> None: for var in reversed(uop.stack.inputs): stack.pop(var) for var in uop.stack.outputs: - stack.push(var) + stack.push(var) required = set(stack.defined) for var in reversed(uop.stack.inputs): declare_variable(var, uop, required, out) From 372758283504bb4db652ccf3675664a807633c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 20 Jul 2024 12:29:53 +0200 Subject: [PATCH 72/97] update dependencies --- PCbuild/pythoncore.vcxproj | 1 + 1 file 
changed, 1 insertion(+) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 0d83ac770348b1..cef1d0691be9f3 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -368,6 +368,7 @@ + From c89cf47ab8b71e3219f0303f5029348c33dc6f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:41:01 +0200 Subject: [PATCH 73/97] Fix missing exception handler --- Lib/test/test_fnmatch.py | 10 ++++++++++ Modules/_fnmatch/filter.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 9e00054d6ab14c..11b734f266012d 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -377,6 +377,16 @@ def test_filter(self): self.assertEqual(filter([b'Python', b'Ruby', b'Perl', b'Tcl'], b'P*'), [b'Python', b'Perl']) + def test_filter_iter_errors(self): + class BadList: + def __iter__(self): + yield 'abc' + raise ValueError("nope") + + with self.assertRaisesRegex(ValueError, r'^nope$'): + self.fnmatch.filter(BadList(), '*') + + def test_mix_bytes_str(self): filter = self.fnmatch.filter self.assertRaises(TypeError, filter, ['test'], b'*') diff --git a/Modules/_fnmatch/filter.c b/Modules/_fnmatch/filter.c index 5b44f6accfc8df..d3611b7f5f883e 100644 --- a/Modules/_fnmatch/filter.c +++ b/Modules/_fnmatch/filter.c @@ -41,6 +41,9 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer) Py_DECREF(name); } Py_DECREF(iter); + if (PyErr_Occurred()) { + Py_CLEAR(res); + } return res; abort: Py_DECREF(name); From 4fbd06b90da09acf01f5b391240e8c26bc98d156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:41:18 +0200 Subject: [PATCH 74/97] cosmetic changes --- Modules/_fnmatch/_fnmatchmodule.c | 1 + Modules/_fnmatch/macros.h | 11 ++++++----- Modules/_fnmatch/translate.c | 6 +++--- 3 files changed, 10 
insertions(+), 8 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 1f03f050f4c831..a771b406fccc0e 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -179,6 +179,7 @@ fnmatchmodule_exec(PyObject *module) INTERN_STRING(st, setops_repl_str, "\\\\\\1"); return 0; } + #undef INTERN_STRING #undef IMPORT_MODULE diff --git a/Modules/_fnmatch/macros.h b/Modules/_fnmatch/macros.h index 04bf4a684a3035..6c0a9ea72b45c0 100644 --- a/Modules/_fnmatch/macros.h +++ b/Modules/_fnmatch/macros.h @@ -12,19 +12,20 @@ // since they directly delegate to the _PyUnicodeWriter_Write* underlying // function. In particular, the caller is responsible for type safety. -/* Write a character CHAR. */ +/* Cast WRITER and call _PyUnicodeWriter_WriteChar(). */ #define _WRITE_CHAR(WRITER, CHAR) \ _PyUnicodeWriter_WriteChar((_PyUnicodeWriter *)(WRITER), (CHAR)) -/* Write an ASCII string STRING of given length LENGTH. */ +/* Cast WRITER and call _PyUnicodeWriter_WriteASCIIString(). */ #define _WRITE_ASCII(WRITER, STRING, LENGTH) \ _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter *)(WRITER), \ (STRING), (LENGTH)) -/* Write the string STRING. */ + +/* Cast WRITER and call _PyUnicodeWriter_WriteStr(). */ #define _WRITE_STRING(WRITER, STRING) \ _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(WRITER), (STRING)) -/* Write the substring STRING[START:STOP]. */ +/* Cast WRITER and call _PyUnicodeWriter_WriteSubstring(). */ #define _WRITE_BLOCK(WRITER, STRING, START, STOP) \ _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(WRITER), \ (STRING), (START), (STOP)) @@ -59,7 +60,7 @@ } while (0) /* - * Write the substring STRING[START:STOP] if START < STOP, + * Write the substring STRING[START:STOP] (no-op if the substring is empty) * or execute the ON_ERROR statements if it fails. 
*/ #define WRITE_BLOCK_OR(WRITER, STRING, START, STOP, ON_ERROR) \ diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 10552434a7d616..eec2367934e56c 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -477,7 +477,7 @@ translate_expression(fnmatchmodule_state *state, PyObject *chunks = split_expression(state, pattern, start, stop, pattern_str_find_meth); if (chunks == NULL) { - goto abort; + return NULL; } // remove empty ranges if (simplify_expression(chunks) < 0) { @@ -491,7 +491,7 @@ translate_expression(fnmatchmodule_state *state, Py_DECREF(chunks); return res; abort: - Py_XDECREF(chunks); + Py_DECREF(chunks); return NULL; } @@ -570,7 +570,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) // exactly n + 6(m-1) + 1 + 7 = n + 6m + 2 characters, where the "+1" // is due to the fact that writing ".*" instead of "*" only increases // the total length of the pattern by 1 (and not by 2). - const Py_ssize_t reslen = m == 0 ? n + 7 : n + 6 * m + 2; + const Py_ssize_t reslen = m == 0 ? 
(n + 7) : (n + 6 * m + 2); PyUnicodeWriter *writer = PyUnicodeWriter_Create(reslen); if (writer == NULL) { return NULL; From 3b348f528df9d6c305808989e8564899d340f115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 12:14:40 +0200 Subject: [PATCH 75/97] update comments --- Modules/_fnmatch/util.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Modules/_fnmatch/util.h b/Modules/_fnmatch/util.h index 371930f5c17262..4e5228659af4b4 100644 --- a/Modules/_fnmatch/util.h +++ b/Modules/_fnmatch/util.h @@ -7,6 +7,9 @@ #include "Python.h" +typedef struct { +} translatemodule_state; + typedef struct { PyObject *os_module; // import os PyObject *posixpath_module; // import posixpath @@ -55,10 +58,14 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer); /* * C accelerator for translating UNIX shell patterns into RE patterns. * - * The 'pattern' must be a Unicode object (not a bytes) object, - * and the translated pattern will be a Unicode object as well. + * Parameters + * + * module A module with a state given by get_fnmatchmodule_state(). + * pattern A Unicode object to translate. + * + * Returns * - * Note: this is the C implementation of fnmatch.translate(). + * A translated unicode RE pattern. 
*/ extern PyObject * _Py_fnmatch_translate(PyObject *module, PyObject *pattern); From da42703ac99b4f8fb47f165b7c7b124c9536f1a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 12:14:45 +0200 Subject: [PATCH 76/97] remove some macros --- Modules/_fnmatch/translate.c | 79 +++++++++++++----------------------- 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index eec2367934e56c..eb69aba35964cc 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -115,9 +115,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) const int pattern_kind = PyUnicode_KIND(pattern); const void *const pattern_data = PyUnicode_DATA(pattern); // ---- def local macros -------------------------------------------------- -#define READ_CHAR(IND) PyUnicode_READ(pattern_kind, pattern_data, IND) -#define WRITE_CHAR(CHAR) WRITE_CHAR_OR_ABORT(writer, CHAR) -#define WRITE_ASCII(STR, LEN) WRITE_ASCII_OR_ABORT(writer, STR, LEN) +#define READ_CHAR(IND) PyUnicode_READ(pattern_kind, pattern_data, IND) /* advance IND if the character is CHAR */ #define ADVANCE_IF_CHAR_IS(CHAR, IND, MAXIND) \ do { \ @@ -132,7 +130,8 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) Py_UCS4 chr = READ_CHAR(i++); switch (chr) { case '*': { - WRITE_CHAR('*'); + // translate wildcard '*' (fnmatch) into optional '.' (regex) + WRITE_CHAR_OR_ABORT(writer, '*'); // skip duplicated '*' for (; i < maxind && READ_CHAR(i) == '*'; ++i); // store the position of the wildcard @@ -149,7 +148,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } case '?': { // translate optional '?' (fnmatch) into optional '.' 
(regex) - WRITE_CHAR('.'); + WRITE_CHAR_OR_ABORT(writer, '.'); ++written; // increase the expected result's length break; } @@ -161,7 +160,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ADVANCE_IF_CHAR_IS(']', j, maxind); // [!] or [] for (; j < maxind && READ_CHAR(j) != ']'; ++j); // locate ']' if (j >= maxind) { - WRITE_ASCII("\\[", 2); + WRITE_ASCII_OR_ABORT(writer, "\\[", 2); written += 2; // we just wrote 2 characters break; // explicit early break for clarity } @@ -226,8 +225,6 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } } #undef ADVANCE_IF_CHAR_IS -#undef WRITE_ASCII -#undef WRITE_CHAR #undef READ_CHAR Py_DECREF(pattern_str_find_meth); Py_DECREF(re_sub_func); @@ -498,52 +495,42 @@ translate_expression(fnmatchmodule_state *state, static Py_ssize_t write_expression(PyUnicodeWriter *writer, PyObject *expression) { - // ---- def local macros -------------------------------------------------- -#define WRITE_CHAR(CHAR) WRITE_CHAR_OR_ABORT(writer, CHAR) -#define WRITE_ASCII(STR, LEN) WRITE_ASCII_OR_ABORT(writer, STR, LEN) -#define WRITE_STRING(STR) WRITE_STRING_OR_ABORT(writer, STR) -#define WRITE_BLOCK(STR, I, J) WRITE_BLOCK_OR_ABORT(writer, STR, I, J) - // ------------------------------------------------------------------------ Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { // empty range: never match - WRITE_ASCII("(?!)", 4); + WRITE_ASCII_OR_ABORT(writer, "(?!)", 4); return 4; } Py_UCS4 token = PyUnicode_READ_CHAR(expression, 0); if (grouplen == 1 && token == '!') { // negated empty range: match any character - WRITE_CHAR('.'); + WRITE_CHAR_OR_ABORT(writer, '.'); return 1; } Py_ssize_t extra = 2; // '[' and ']' - WRITE_CHAR('['); + WRITE_CHAR_OR_ABORT(writer, '['); switch (token) { case '!': { - WRITE_CHAR('^'); // replace '!' by '^' - WRITE_BLOCK(expression, 1, grouplen); + WRITE_CHAR_OR_ABORT(writer, '^'); // replace '!' 
by '^' + WRITE_BLOCK_OR_ABORT(writer, expression, 1, grouplen); break; } case '^': case '[': { - WRITE_CHAR('\\'); + WRITE_CHAR_OR_ABORT(writer, '\\'); ++extra; // because we wrote '\\' - WRITE_STRING(expression); + WRITE_STRING_OR_ABORT(writer, expression); break; } default: { - WRITE_STRING(expression); + WRITE_STRING_OR_ABORT(writer, expression); break; } } - WRITE_CHAR(']'); + WRITE_CHAR_OR_ABORT(writer, ']'); return grouplen + extra; abort: return -1; -#undef WRITE_BLOCK -#undef WRITE_STRING -#undef WRITE_ASCII -#undef WRITE_CHAR } static PyObject * @@ -576,43 +563,39 @@ process_wildcards(PyObject *pattern, PyObject *indices) return NULL; } // ---- def local macros -------------------------------------------------- -#define WRITE_CHAR(CHAR) WRITE_CHAR_OR_ABORT(writer, CHAR) -#define WRITE_ASCII(STR, LEN) WRITE_ASCII_OR_ABORT(writer, STR, LEN) -#define WRITE_STRING(STR) WRITE_STRING_OR_ABORT(writer, STR) -#define WRITE_BLOCK(STR, I, J) WRITE_BLOCK_OR_ABORT(writer, STR, I, J) -#define LOAD_WILDCARD_INDEX(VAR, IND) \ - do { \ - VAR = PyLong_AsSsize_t(PyList_GET_ITEM(indices, IND)); \ - if (VAR < 0) { \ - goto abort; \ - } \ +#define LOAD_WILDCARD_INDEX(VAR, IND) \ + do { \ + VAR = PyLong_AsSsize_t(PyList_GET_ITEM(indices, (IND))); \ + if (VAR < 0) { \ + goto abort; \ + } \ } while (0) // ------------------------------------------------------------------------ - WRITE_ASCII("(?s:", 4); + WRITE_ASCII_OR_ABORT(writer, "(?s:", 4); if (m == 0) { - WRITE_STRING(pattern); + WRITE_STRING_OR_ABORT(writer, pattern); } else { Py_ssize_t i = 0, j = -1; // process the optional PREFIX LOAD_WILDCARD_INDEX(j, 0); - WRITE_BLOCK(pattern, i, j); + WRITE_BLOCK_OR_ABORT(writer, pattern, 0, j); i = j + 1; for (Py_ssize_t k = 1; k < m; ++k) { // process the (* INNER) groups LOAD_WILDCARD_INDEX(j, k); assert(i < j); // write the atomic RE group '(?>.*?' 
+ INNER + ')' - WRITE_ASCII("(?>.*?", 6); - WRITE_BLOCK(pattern, i, j); - WRITE_CHAR(')'); + WRITE_ASCII_OR_ABORT(writer, "(?>.*?", 6); + WRITE_BLOCK_OR_ABORT(writer, pattern, i, j); + WRITE_CHAR_OR_ABORT(writer, ')'); i = j + 1; } // handle the (*) [OUTER] part - WRITE_ASCII(".*", 2); - WRITE_BLOCK(pattern, i, n); + WRITE_ASCII_OR_ABORT(writer, ".*", 2); + WRITE_BLOCK_OR_ABORT(writer, pattern, i, n); } - WRITE_ASCII(")\\Z", 3); + WRITE_ASCII_OR_ABORT(writer, ")\\Z", 3); PyObject *res = PyUnicodeWriter_Finish(writer); assert(res == NULL || PyUnicode_GET_LENGTH(res) == reslen); return res; @@ -620,8 +603,4 @@ process_wildcards(PyObject *pattern, PyObject *indices) PyUnicodeWriter_Discard(writer); return NULL; #undef LOAD_WILDCARD_INDEX -#undef WRITE_BLOCK -#undef WRITE_STRING -#undef WRITE_ASCII -#undef WRITE_CHAR } From 97ed24d2eeaf64192ec850e1bb3a75e7676f181d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 14:14:19 +0200 Subject: [PATCH 77/97] cleanup --- Lib/test/test_fnmatch.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 11b734f266012d..ea4f86743506db 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -294,11 +294,6 @@ def test_translate_wildcards(self): self.assertEqual(translated, expect, pattern) def test_translate_expressions(self): - '[', '[-abc]', '[[]b', '[[a]b', '[\\\\]', '[\\]', '[]-]', '[][!]', - '[]]b', '[]a[]b', '[^a-c]*', '[a-\\z]', - '[a-c]b*', '[a-y]*[^c]', '[abc-]', '\\*', - '[0-4-3-2]', '[b-ac-z9-1]', '[!b-ac-z9-1]', '[!]b-ac-z9-1]', - '[]b-ac-z9-1]', '[]b-ac-z9-1]*', '*[]b-ac-z9-1]', for pattern, expect in [ ('[', r'(?s:\[)\Z'), ('[!', r'(?s:\[!)\Z'), From d38a0cb664f469df5f3a17d7d293c10f1d7ce948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 14:14:22 +0200 Subject: [PATCH 78/97] 
test empty range --- Lib/test/test_fnmatch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index ea4f86743506db..6ab244021ea20d 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -269,6 +269,7 @@ def test_translate(self): def test_translate_wildcards(self): for pattern, expect in [ + ('', r'(?s:)\Z'), ('ab*', r'(?s:ab.*)\Z'), ('ab*cd', r'(?s:ab.*cd)\Z'), ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'), From db88ff521f04a9bc8b4da8799456523104672666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 14:55:27 +0200 Subject: [PATCH 79/97] update macros.h - add convenience macros - make some macros private --- Modules/_fnmatch/macros.h | 63 +++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/Modules/_fnmatch/macros.h b/Modules/_fnmatch/macros.h index 6c0a9ea72b45c0..14ef4f142e9b98 100644 --- a/Modules/_fnmatch/macros.h +++ b/Modules/_fnmatch/macros.h @@ -8,6 +8,34 @@ // ==== Macro definitions ===================================================== +/* + * Check that INTVAL is >= 0 or execute 'goto abort'. + * + * This macro is provided for convenience and should be + * carefully used if more resources should be released + * before jumping to the 'abort' label. + */ +#define CHECK_INTVAL_OR_ABORT(INTVAL) \ + do { \ + if ((INTVAL) < 0) { \ + goto abort; \ + } \ + } while (0) + +/* + * Check that OBJ is not NULL or execute 'goto abort'. + * + * This macro is provided for convenience and should be + * carefully used if more resources should be released + * before jumping to the 'abort' label. + */ +#define CHECK_NON_NULL_OR_ABORT(OBJ) \ + do { \ + if ((OBJ) == NULL) { \ + goto abort; \ + } \ + } while (0) + // The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs // since they directly delegate to the _PyUnicodeWriter_Write* underlying // function. 
In particular, the caller is responsible for type safety. @@ -33,7 +61,7 @@ // ---------------------------------------------------------------------------- /* Write a character CHAR or execute the ON_ERROR statements if it fails. */ -#define WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ +#define _WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ do { \ if (_WRITE_CHAR((WRITER), (CHAR)) < 0) { \ ON_ERROR; \ @@ -44,7 +72,7 @@ * Write an ASCII string STRING of given length LENGTH, * or execute the ON_ERROR statements if it fails. */ -#define WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ +#define _WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ do { \ if (_WRITE_ASCII((WRITER), (ASCII), (LENGTH)) < 0) { \ ON_ERROR; \ @@ -52,7 +80,7 @@ } while (0) /* Write the string STRING or execute the ON_ERROR statements if it fails. */ -#define WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ +#define _WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ do { \ if (_WRITE_STRING((WRITER), (STRING)) < 0) { \ ON_ERROR; \ @@ -63,7 +91,7 @@ * Write the substring STRING[START:STOP] (no-op if the substring is empty) * or execute the ON_ERROR statements if it fails. */ -#define WRITE_BLOCK_OR(WRITER, STRING, START, STOP, ON_ERROR) \ +#define _WRITE_SUBSTRING_OR(WRITER, STRING, START, STOP, ON_ERROR) \ do { \ /* intermediate variables to allow in-place operations */ \ Py_ssize_t _i = (START), _j = (STOP); \ @@ -77,13 +105,13 @@ // Macros which execute "goto abort" if an error occurs. 
#define WRITE_CHAR_OR_ABORT(WRITER, CHAR) \ - WRITE_CHAR_OR((WRITER), (CHAR), goto abort) + _WRITE_CHAR_OR((WRITER), (CHAR), goto abort) #define WRITE_ASCII_OR_ABORT(WRITER, STRING, LENGTH) \ - WRITE_ASCII_OR((WRITER), (STRING), (LENGTH), goto abort) + _WRITE_ASCII_OR((WRITER), (STRING), (LENGTH), goto abort) #define WRITE_STRING_OR_ABORT(WRITER, STRING) \ - WRITE_STRING_OR((WRITER), (STRING), goto abort) + _WRITE_STRING_OR((WRITER), (STRING), goto abort) #define WRITE_BLOCK_OR_ABORT(WRITER, STRING, START, STOP) \ - WRITE_BLOCK_OR((WRITER), (STRING), (START), (STOP), goto abort) + _WRITE_SUBSTRING_OR((WRITER), (STRING), (START), (STOP), goto abort) // ---------------------------------------------------------------------------- @@ -107,14 +135,17 @@ NULL \ ) -/* Escape set operations in STRING using re.sub(). */ -#define SETOPS_REPLACE(STATE, STRING, RE_SUB_FUNC) \ - PyObject_CallFunctionObjArgs( \ - (RE_SUB_FUNC), \ - (STATE)->setops_str, \ - (STATE)->setops_repl_str, \ - (STRING), \ - NULL \ +/* + * Escape set operations in STRING using re.sub(). + * + * SETOPS_RE_SUB_METH is a reference to re.compile('([&~|])').sub(). + */ +#define SETOPS_REPLACE(STATE, STRING, SETOPS_RE_SUB_METH) \ + PyObject_CallFunctionObjArgs( \ + (SETOPS_RE_SUB_METH), \ + (STATE)->setops_repl_str, \ + (STRING), \ + NULL \ ) #endif // _FNMATCH_MACROS_H From 2caa5b89fbe0dac0233df0eae3f3e10c7d4ea175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 14:56:46 +0200 Subject: [PATCH 80/97] update algorithm Previous improvement: 2.05x. This improvement: 2.8x. 
--- Modules/_fnmatch/translate.c | 188 +++++++++++++++++++++-------------- 1 file changed, 114 insertions(+), 74 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index eb69aba35964cc..798a2e89dfd64b 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -15,12 +15,16 @@ // ==== Helper declarations =================================================== /* - * Creates a new Unicode object from a Py_UCS4 character. + * Write re.escape(pattern[start:stop]). * - * Note: this is 'unicode_char' taken from Objects/unicodeobject.c. + * This returns the number of written characters, or -1 if an error occurred. + * + * @pre 0 <= start < stop <= len(pattern) */ -static PyObject * -get_unicode_character(Py_UCS4 ch); +static inline Py_ssize_t +escape_block(PyUnicodeWriter *writer, + PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, + PyObject *re_escape_func); /* * Construct a regular expression out of a UNIX-style expression. @@ -51,7 +55,9 @@ translate_expression(fnmatchmodule_state *state, * This returns the number of written characters, or -1 if an error occurred. */ static Py_ssize_t -write_expression(PyUnicodeWriter *writer, PyObject *expression); +write_expression(fnmatchmodule_state *state, + PyUnicodeWriter *writer, PyObject *expression, + PyObject *setops_re_sub_meth); /* * Build the final regular expression by processing the wildcards. 
@@ -63,6 +69,17 @@ process_wildcards(PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== +static inline PyObject * +get_setops_re_sub_method(fnmatchmodule_state *state) +{ + PyObject *compiled = PyObject_CallMethodOneArg(state->re_module, + &_Py_ID(compile), + state->setops_str); + PyObject *method = PyObject_GetAttr(compiled, &_Py_ID(sub)); + Py_DECREF(compiled); + return method; +} + PyObject * _Py_fnmatch_translate(PyObject *module, PyObject *pattern) { @@ -90,28 +107,34 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) return NULL; } + // ---- decl local objects ------------------------------------------------ // list containing the indices where '*' has a special meaning PyObject *wildcard_indices = NULL; // cached functions (cache is local to the call) - PyObject *re_escape_func = NULL, *re_sub_func = NULL; - PyObject *pattern_str_find_meth = NULL; // bound method of pattern.find() - + PyObject *re_escape_func = NULL; // re.escape() + PyObject *setops_re_subfn = NULL; // re.compile('([&~|])').sub() + PyObject *pattern_str_find_meth = NULL; // pattern.find() + // ---- def local objects ------------------------------------------------- wildcard_indices = PyList_New(0); if (wildcard_indices == NULL) { goto abort; } -#define CACHE_ATTRIBUTE(DEST, OBJECT, NAME) \ - do { \ - DEST = PyObject_GetAttr((OBJECT), (NAME)); \ - if ((DEST) == NULL) { \ - goto abort; \ - } \ - } while (0); - CACHE_ATTRIBUTE(re_escape_func, state->re_module, &_Py_ID(escape)); - CACHE_ATTRIBUTE(re_sub_func, state->re_module, &_Py_ID(sub)); - CACHE_ATTRIBUTE(pattern_str_find_meth, pattern, &_Py_ID(find)); -#undef CACHE_ATTRIBUTE - + // The Python implementation always takes queries re.escape() and re.sub() + // inside translate() and thus we should at least allow external users to + // mock those functions (thus, we cannot cache them in the module's state). 
+ re_escape_func = PyObject_GetAttr(state->re_module, &_Py_ID(escape)); + if (re_escape_func == NULL) { + goto abort; + } + setops_re_subfn = get_setops_re_sub_method(state); + if (setops_re_subfn == NULL) { + goto abort; + } + pattern_str_find_meth = PyObject_GetAttr(pattern, &_Py_ID(find)); + if (pattern_str_find_meth == NULL) { + goto abort; + } + // ------------------------------------------------------------------------ const int pattern_kind = PyUnicode_KIND(pattern); const void *const pattern_data = PyUnicode_DATA(pattern); // ---- def local macros -------------------------------------------------- @@ -123,13 +146,28 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ++IND; \ } \ } while (0) +#define WRITE_PENDING(ESCSTOP) \ + do { \ + if (escstart != -1) { \ + Py_ssize_t t = escape_block(writer, pattern, \ + escstart, (ESCSTOP), \ + re_escape_func); \ + if (t < 0) { \ + goto abort; \ + } \ + written += t; \ + escstart = -1; \ + } \ + } while (0) // ------------------------------------------------------------------------ - Py_ssize_t i = 0; // current index - Py_ssize_t written = 0; // number of characters written - while (i < maxind) { + Py_ssize_t i = 0; // current index + Py_ssize_t written = 0; // number of characters written + Py_ssize_t escstart = -1, escstop = -1; // start/stop escaping indices + while ((escstop = i) < maxind) { Py_UCS4 chr = READ_CHAR(i++); switch (chr) { case '*': { + WRITE_PENDING(escstop); // translate wildcard '*' (fnmatch) into optional '.' (regex) WRITE_CHAR_OR_ABORT(writer, '*'); // skip duplicated '*' @@ -147,12 +185,14 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) break; } case '?': { + WRITE_PENDING(escstop); // translate optional '?' (fnmatch) into optional '.' 
(regex) WRITE_CHAR_OR_ABORT(writer, '.'); ++written; // increase the expected result's length break; } case '[': { + WRITE_PENDING(escstop); assert(i > 0); assert(READ_CHAR(i - 1) == '['); Py_ssize_t j = i; @@ -170,28 +210,24 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) if (pos == -2) { goto abort; } - PyObject *pre_expr = NULL, *expr = NULL; + PyObject *expr = NULL; if (pos == -1) { PyObject *tmp = PyUnicode_Substring(pattern, i, j); if (tmp == NULL) { goto abort; } - pre_expr = BACKSLASH_REPLACE(state, tmp); + expr = BACKSLASH_REPLACE(state, tmp); Py_DECREF(tmp); } else { - pre_expr = translate_expression(state, pattern, i, j, - pattern_str_find_meth); - } - if (pre_expr == NULL) { - goto abort; + expr = translate_expression(state, pattern, i, j, + pattern_str_find_meth); } - expr = SETOPS_REPLACE(state, pre_expr, re_sub_func); - Py_DECREF(pre_expr); if (expr == NULL) { goto abort; } - Py_ssize_t expr_len = write_expression(writer, expr); + Py_ssize_t expr_len = write_expression(state, writer, expr, + setops_re_subfn); Py_DECREF(expr); if (expr_len < 0) { goto abort; @@ -202,32 +238,20 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } } default: { - PyObject *str = get_unicode_character(chr); - if (str == NULL) { - goto abort; - } - PyObject *escaped = PyObject_CallOneArg(re_escape_func, str); - Py_DECREF(str); - if (escaped == NULL) { - goto abort; + if (escstart == -1) { + assert(i >= 1); + escstart = i - 1; } - Py_ssize_t escaped_len = PyUnicode_GET_LENGTH(escaped); - // Do NOT use WRITE_STRING_OR_ABORT() since 'escaped' - // must be first decref'ed in case of an error. 
- int rc = _WRITE_STRING(writer, escaped); - Py_DECREF(escaped); - if (rc < 0) { - goto abort; - } - written += escaped_len; break; } } } + WRITE_PENDING(maxind); +#undef WRITE_PENDING #undef ADVANCE_IF_CHAR_IS #undef READ_CHAR Py_DECREF(pattern_str_find_meth); - Py_DECREF(re_sub_func); + Py_DECREF(setops_re_subfn); Py_DECREF(re_escape_func); PyObject *translated = PyUnicodeWriter_Finish(writer); if (translated == NULL) { @@ -240,7 +264,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) return res; abort: Py_XDECREF(pattern_str_find_meth); - Py_XDECREF(re_sub_func); + Py_XDECREF(setops_re_subfn); Py_XDECREF(re_escape_func); Py_XDECREF(wildcard_indices); PyUnicodeWriter_Discard(writer); @@ -249,29 +273,35 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // ==== Helper implementations ================================================ -static PyObject * -get_unicode_character(Py_UCS4 ch) +static inline Py_ssize_t +escape_block(PyUnicodeWriter *writer, + PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, + PyObject *re_escape_func) { - assert(ch <= 0x10ffff); - if (ch < 256) { - PyObject *o = _Py_LATIN1_CHR(ch); - assert(_Py_IsImmortal(o)); - return o; +#ifdef Py_DEBUG + if (start < 0 || start >= stop || stop > PyUnicode_GET_LENGTH(pattern)) { + PyErr_BadInternalCall(); + return -1; } - PyObject *unicode = PyUnicode_New(1, ch); - if (unicode == NULL) { - return NULL; +#endif + PyObject *str = PyUnicode_Substring(pattern, start, stop); + if (str == NULL) { + goto abort; } - assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); - if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { - PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; + PyObject *escaped = PyObject_CallOneArg(re_escape_func, str); + Py_DECREF(str); + if (escaped == NULL) { + goto abort; } - else { - assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); - PyUnicode_4BYTE_DATA(unicode)[0] = ch; + Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); + int rc = 
_WRITE_STRING(writer, escaped); + Py_DECREF(escaped); + if (rc < 0) { + goto abort; } - assert(_PyUnicode_CheckConsistency(unicode, 1)); - return unicode; + return written; +abort: + return -1; } /* @@ -493,8 +523,11 @@ translate_expression(fnmatchmodule_state *state, } static Py_ssize_t -write_expression(PyUnicodeWriter *writer, PyObject *expression) +write_expression(fnmatchmodule_state *state, + PyUnicodeWriter *writer, PyObject *expression, + PyObject *setops_re_sub_meth) { + PyObject *safe_expression = NULL; // for the 'goto abort' statements Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); if (grouplen == 0) { // empty range: never match @@ -509,27 +542,34 @@ write_expression(PyUnicodeWriter *writer, PyObject *expression) } Py_ssize_t extra = 2; // '[' and ']' WRITE_CHAR_OR_ABORT(writer, '['); + // escape set operations as late as possible + safe_expression = SETOPS_REPLACE(state, expression, setops_re_sub_meth); + if (safe_expression == NULL) { + goto abort; + } switch (token) { case '!': { WRITE_CHAR_OR_ABORT(writer, '^'); // replace '!' 
by '^' - WRITE_BLOCK_OR_ABORT(writer, expression, 1, grouplen); + WRITE_BLOCK_OR_ABORT(writer, safe_expression, 1, grouplen); break; } case '^': case '[': { WRITE_CHAR_OR_ABORT(writer, '\\'); ++extra; // because we wrote '\\' - WRITE_STRING_OR_ABORT(writer, expression); + WRITE_STRING_OR_ABORT(writer, safe_expression); break; } default: { - WRITE_STRING_OR_ABORT(writer, expression); + WRITE_STRING_OR_ABORT(writer, safe_expression); break; } } + Py_DECREF(safe_expression); WRITE_CHAR_OR_ABORT(writer, ']'); return grouplen + extra; abort: + Py_XDECREF(safe_expression); return -1; } From ef2de2a737059a7afe3ff30e67eb8b967ffe30cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 15:09:49 +0200 Subject: [PATCH 81/97] PEP 7 --- Modules/_fnmatch/_fnmatchmodule.c | 34 ++++++++++++++++--------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index a771b406fccc0e..a050616217c71e 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -141,24 +141,26 @@ get_platform_normcase_function(PyObject *module, bool *isposix) // ==== Module state functions ================================================ -#define IMPORT_MODULE(state, attribute, name) \ - do { \ - /* make sure that the attribute is initialized once */ \ - assert(state->attribute == NULL); \ - state->attribute = PyImport_ImportModule((name)); \ - if (state->attribute == NULL) { \ - return -1; \ - } \ +/* Import a named module and store it in 'STATE->ATTRIBUTE'. 
*/ +#define IMPORT_MODULE(STATE, ATTRIBUTE, MODULE_NAME) \ + do { \ + /* make sure that the attribute is initialized once */ \ + assert(STATE->ATTRIBUTE == NULL); \ + STATE->ATTRIBUTE = PyImport_ImportModule((MODULE_NAME)); \ + if (STATE->ATTRIBUTE == NULL) { \ + return -1; \ + } \ } while (0) -#define INTERN_STRING(state, attribute, literal) \ - do { \ - /* make sure that the attribute is initialized once */ \ - assert(state->attribute == NULL); \ - state->attribute = PyUnicode_InternFromString((literal)); \ - if (state->attribute == NULL) { \ - return -1; \ - } \ +/* Intern a literal STRING and store it in 'STATE->ATTRIBUTE'. */ +#define INTERN_STRING(STATE, ATTRIBUTE, STRING) \ + do { \ + /* make sure that the attribute is initialized once */ \ + assert(STATE->ATTRIBUTE == NULL); \ + STATE->ATTRIBUTE = PyUnicode_InternFromString((STRING)); \ + if (STATE->ATTRIBUTE == NULL) { \ + return -1; \ + } \ } while (0) static int From c00c8f91c0089cda964a27de1557db8a327fb344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 15:16:46 +0200 Subject: [PATCH 82/97] remove un-necessary code --- Modules/_fnmatch/util.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/Modules/_fnmatch/util.h b/Modules/_fnmatch/util.h index 4e5228659af4b4..379fed066de3d8 100644 --- a/Modules/_fnmatch/util.h +++ b/Modules/_fnmatch/util.h @@ -7,9 +7,6 @@ #include "Python.h" -typedef struct { -} translatemodule_state; - typedef struct { PyObject *os_module; // import os PyObject *posixpath_module; // import posixpath From e93cd878031c11bedd44dd057257f64353fe66df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 15:17:33 +0200 Subject: [PATCH 83/97] use convenience macros for aborting flow --- Modules/_fnmatch/translate.c | 111 ++++++++++------------------------- 1 file changed, 30 insertions(+), 81 deletions(-) diff --git 
a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 798a2e89dfd64b..175b04eef5f1c6 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -116,24 +116,16 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) PyObject *pattern_str_find_meth = NULL; // pattern.find() // ---- def local objects ------------------------------------------------- wildcard_indices = PyList_New(0); - if (wildcard_indices == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(wildcard_indices); // The Python implementation always takes queries re.escape() and re.sub() // inside translate() and thus we should at least allow external users to // mock those functions (thus, we cannot cache them in the module's state). re_escape_func = PyObject_GetAttr(state->re_module, &_Py_ID(escape)); - if (re_escape_func == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(re_escape_func); setops_re_subfn = get_setops_re_sub_method(state); - if (setops_re_subfn == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(setops_re_subfn); pattern_str_find_meth = PyObject_GetAttr(pattern, &_Py_ID(find)); - if (pattern_str_find_meth == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(pattern_str_find_meth); // ------------------------------------------------------------------------ const int pattern_kind = PyUnicode_KIND(pattern); const void *const pattern_data = PyUnicode_DATA(pattern); @@ -152,9 +144,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) Py_ssize_t t = escape_block(writer, pattern, \ escstart, (ESCSTOP), \ re_escape_func); \ - if (t < 0) { \ - goto abort; \ - } \ + CHECK_INTVAL_OR_ABORT(t); \ written += t; \ escstart = -1; \ } \ @@ -174,14 +164,10 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) for (; i < maxind && READ_CHAR(i) == '*'; ++i); // store the position of the wildcard PyObject *wildcard_index = PyLong_FromSsize_t(written++); - if (wildcard_index == NULL) { - goto abort; - } + 
CHECK_NON_NULL_OR_ABORT(wildcard_index); int rc = PyList_Append(wildcard_indices, wildcard_index); Py_DECREF(wildcard_index); - if (rc < 0) { - goto abort; - } + CHECK_INTVAL_OR_ABORT(rc); break; } case '?': { @@ -193,8 +179,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } case '[': { WRITE_PENDING(escstop); - assert(i > 0); - assert(READ_CHAR(i - 1) == '['); + assert(READ_CHAR(escstop) == '['); Py_ssize_t j = i; ADVANCE_IF_CHAR_IS('!', j, maxind); // [! ADVANCE_IF_CHAR_IS(']', j, maxind); // [!] or [] @@ -213,9 +198,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) PyObject *expr = NULL; if (pos == -1) { PyObject *tmp = PyUnicode_Substring(pattern, i, j); - if (tmp == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(tmp); expr = BACKSLASH_REPLACE(state, tmp); Py_DECREF(tmp); } @@ -223,15 +206,11 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) expr = translate_expression(state, pattern, i, j, pattern_str_find_meth); } - if (expr == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(expr); Py_ssize_t expr_len = write_expression(state, writer, expr, setops_re_subfn); Py_DECREF(expr); - if (expr_len < 0) { - goto abort; - } + CHECK_INTVAL_OR_ABORT(expr_len); written += expr_len; i = j + 1; // jump to the character after ']' break; // explicit early break for clarity @@ -285,20 +264,14 @@ escape_block(PyUnicodeWriter *writer, } #endif PyObject *str = PyUnicode_Substring(pattern, start, stop); - if (str == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(str); PyObject *escaped = PyObject_CallOneArg(re_escape_func, str); Py_DECREF(str); - if (escaped == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(escaped); Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); int rc = _WRITE_STRING(writer, escaped); Py_DECREF(escaped); - if (rc < 0) { - goto abort; - } + CHECK_INTVAL_OR_ABORT(rc); return written; abort: return -1; @@ -316,31 +289,23 @@ split_expression(fnmatchmodule_state *state, PyObject *pattern, Py_ssize_t 
start, Py_ssize_t stop, PyObject *str_find_func) { + // ---- decl local objects ------------------------------------------------ PyObject *chunks = NULL, *maxind = NULL; PyObject *hyphen = state->hyphen_str; - + // ---- def local objects ------------------------------------------------- chunks = PyList_New(0); - if (chunks == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(chunks); maxind = PyLong_FromSsize_t(stop); - if (maxind == NULL) { - goto abort; - } - + CHECK_NON_NULL_OR_ABORT(maxind); // ---- def local macros -------------------------------------------------- /* add pattern[START:STOP] to the list of chunks */ #define ADD_CHUNK(START, STOP) \ do { \ PyObject *chunk = PyUnicode_Substring(pattern, (START), (STOP)); \ - if (chunk == NULL) { \ - goto abort; \ - } \ + CHECK_NON_NULL_OR_ABORT(chunk); \ int rc = PyList_Append(chunks, chunk); \ Py_DECREF(chunk); \ - if (rc < 0) { \ - goto abort; \ - } \ + CHECK_INTVAL_OR_ABORT(rc); \ } while (0) // ------------------------------------------------------------------------ Py_ssize_t chunk_start = start; @@ -350,9 +315,7 @@ split_expression(fnmatchmodule_state *state, while (ind < stop) { PyObject *p_chunk_stop = PyObject_CallFunction(str_find_func, "OnO", hyphen, ind, maxind); - if (p_chunk_stop == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(p_chunk_stop); Py_ssize_t chunk_stop = PyLong_AsSsize_t(p_chunk_stop); Py_DECREF(p_chunk_stop); if (chunk_stop < 0) { @@ -433,9 +396,7 @@ simplify_expression(PyObject *chunks) assert(c1len > 1); assert(c2len > 1); PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen); - if (writer == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(writer); // all but the last character in the first chunk if (_WRITE_BLOCK(writer, c1, 0, c1len - 1) < 0) { PyUnicodeWriter_Discard(writer); @@ -455,9 +416,7 @@ simplify_expression(PyObject *chunks) Py_XDECREF(str); goto abort; } - if (PySequence_DelItem(chunks, k) < 0) { - goto abort; - } + 
CHECK_INTVAL_OR_ABORT(PySequence_DelItem(chunks, k)); } } return 0; @@ -478,9 +437,7 @@ escape_expression(fnmatchmodule_state *state, PyObject *chunks) PyObject *s0 = PyList_GET_ITEM(chunks, c); assert(s0 != NULL); PyObject *s1 = BACKSLASH_REPLACE(state, s0); - if (s1 == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(s1); PyObject *s2 = HYPHEN_REPLACE(state, s1); Py_DECREF(s1); // PyList_SetItem() does not create a new reference on 's2' @@ -503,22 +460,16 @@ translate_expression(fnmatchmodule_state *state, { PyObject *chunks = split_expression(state, pattern, start, stop, pattern_str_find_meth); - if (chunks == NULL) { - return NULL; - } + CHECK_NON_NULL_OR_ABORT(chunks); // remove empty ranges - if (simplify_expression(chunks) < 0) { - goto abort; - } + CHECK_INTVAL_OR_ABORT(simplify_expression(chunks)); // escape backslashes and set differences - if (escape_expression(state, chunks) < 0) { - goto abort; - } + CHECK_INTVAL_OR_ABORT(escape_expression(state, chunks)); PyObject *res = PyUnicode_Join(state->hyphen_str, chunks); Py_DECREF(chunks); return res; abort: - Py_DECREF(chunks); + Py_XDECREF(chunks); return NULL; } @@ -544,9 +495,7 @@ write_expression(fnmatchmodule_state *state, WRITE_CHAR_OR_ABORT(writer, '['); // escape set operations as late as possible safe_expression = SETOPS_REPLACE(state, expression, setops_re_sub_meth); - if (safe_expression == NULL) { - goto abort; - } + CHECK_NON_NULL_OR_ABORT(safe_expression); switch (token) { case '!': { WRITE_CHAR_OR_ABORT(writer, '^'); // replace '!' 
by '^' @@ -606,7 +555,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) #define LOAD_WILDCARD_INDEX(VAR, IND) \ do { \ VAR = PyLong_AsSsize_t(PyList_GET_ITEM(indices, (IND))); \ - if (VAR < 0) { \ + if ((VAR) < 0 && PyErr_Occurred()) { \ goto abort; \ } \ } while (0) From 3a0567670225b993aea7e029d6b436ae94405faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 28 Jul 2024 16:06:22 +0200 Subject: [PATCH 84/97] refactor macros --- Modules/_fnmatch/macros.h | 77 ++++++++++++------------------------ Modules/_fnmatch/translate.c | 66 ++++++++++++++++--------------- 2 files changed, 61 insertions(+), 82 deletions(-) diff --git a/Modules/_fnmatch/macros.h b/Modules/_fnmatch/macros.h index 14ef4f142e9b98..d78a75b52d1dc0 100644 --- a/Modules/_fnmatch/macros.h +++ b/Modules/_fnmatch/macros.h @@ -9,15 +9,16 @@ // ==== Macro definitions ===================================================== /* - * Check that INTVAL is >= 0 or execute 'goto abort'. + * Check that STATUS is >= 0 or execute 'goto abort'. * * This macro is provided for convenience and should be * carefully used if more resources should be released * before jumping to the 'abort' label. */ -#define CHECK_INTVAL_OR_ABORT(INTVAL) \ +#define CHECK_RET_CODE_OR_ABORT(STATUS) \ do { \ - if ((INTVAL) < 0) { \ + if ((STATUS) < 0) { \ + assert(PyErr_Occurred()); \ goto abort; \ } \ } while (0) @@ -29,11 +30,11 @@ * carefully used if more resources should be released * before jumping to the 'abort' label. */ -#define CHECK_NON_NULL_OR_ABORT(OBJ) \ - do { \ - if ((OBJ) == NULL) { \ - goto abort; \ - } \ +#define CHECK_NOT_NULL_OR_ABORT(OBJ) \ + do { \ + if ((OBJ) == NULL) { \ + goto abort; \ + } \ } while (0) // The following _WRITE_* and _WRITE_*_OR macros do NOT check their inputs @@ -54,67 +55,41 @@ _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)(WRITER), (STRING)) /* Cast WRITER and call _PyUnicodeWriter_WriteSubstring(). 
*/ -#define _WRITE_BLOCK(WRITER, STRING, START, STOP) \ +#define _WRITE_SUBSTRING(WRITER, STRING, START, STOP) \ _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter *)(WRITER), \ (STRING), (START), (STOP)) // ---------------------------------------------------------------------------- -/* Write a character CHAR or execute the ON_ERROR statements if it fails. */ -#define _WRITE_CHAR_OR(WRITER, CHAR, ON_ERROR) \ - do { \ - if (_WRITE_CHAR((WRITER), (CHAR)) < 0) { \ - ON_ERROR; \ - } \ - } while (0) +/* Write the character CHAR or jump to the 'abort' label on failure. */ +#define WRITE_CHAR_OR_ABORT(WRITER, CHAR) \ + CHECK_RET_CODE_OR_ABORT(_WRITE_CHAR((WRITER), (CHAR))) /* * Write an ASCII string STRING of given length LENGTH, - * or execute the ON_ERROR statements if it fails. + * or jump to the 'abort' label on failure. */ -#define _WRITE_ASCII_OR(WRITER, ASCII, LENGTH, ON_ERROR) \ - do { \ - if (_WRITE_ASCII((WRITER), (ASCII), (LENGTH)) < 0) { \ - ON_ERROR; \ - } \ - } while (0) +#define WRITE_ASCII_OR_ABORT(WRITER, ASCII, LENGTH) \ + CHECK_RET_CODE_OR_ABORT(_WRITE_ASCII((WRITER), (ASCII), (LENGTH))) -/* Write the string STRING or execute the ON_ERROR statements if it fails. */ -#define _WRITE_STRING_OR(WRITER, STRING, ON_ERROR) \ - do { \ - if (_WRITE_STRING((WRITER), (STRING)) < 0) { \ - ON_ERROR; \ - } \ - } while (0) +/* Write the string STRING or jump to the 'abort' label on failure. */ +#define WRITE_STRING_OR_ABORT(WRITER, STRING) \ + CHECK_RET_CODE_OR_ABORT(_WRITE_STRING((WRITER), (STRING))) /* - * Write the substring STRING[START:STOP] (no-op if the substring is empty) - * or execute the ON_ERROR statements if it fails. + * Write the substring STRING[START:STOP] (no-op if empty) + * or jump to the 'abort' label on failure. 
*/ -#define _WRITE_SUBSTRING_OR(WRITER, STRING, START, STOP, ON_ERROR) \ +#define WRITE_SUBSTRING_OR_ABORT(WRITER, STRING, START, STOP) \ do { \ - /* intermediate variables to allow in-place operations */ \ - Py_ssize_t _i = (START), _j = (STOP); \ - if (_i < _j && _WRITE_BLOCK((WRITER), (STRING), _i, _j) < 0) { \ - ON_ERROR; \ - } \ + const Py_ssize_t _START = (START); \ + const Py_ssize_t _STOP = (STOP); \ + int _RC = _WRITE_SUBSTRING((WRITER), (STRING), _START, _STOP); \ + CHECK_RET_CODE_OR_ABORT(_RC); \ } while (0) // ---------------------------------------------------------------------------- -// Macros which execute "goto abort" if an error occurs. - -#define WRITE_CHAR_OR_ABORT(WRITER, CHAR) \ - _WRITE_CHAR_OR((WRITER), (CHAR), goto abort) -#define WRITE_ASCII_OR_ABORT(WRITER, STRING, LENGTH) \ - _WRITE_ASCII_OR((WRITER), (STRING), (LENGTH), goto abort) -#define WRITE_STRING_OR_ABORT(WRITER, STRING) \ - _WRITE_STRING_OR((WRITER), (STRING), goto abort) -#define WRITE_BLOCK_OR_ABORT(WRITER, STRING, START, STOP) \ - _WRITE_SUBSTRING_OR((WRITER), (STRING), (START), (STOP), goto abort) - -// ---------------------------------------------------------------------------- - /* Replace backslashes in STRING by escaped backslashes. 
*/ #define BACKSLASH_REPLACE(STATE, STRING) \ PyObject_CallMethodObjArgs( \ diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 175b04eef5f1c6..e0ddce101338ad 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -116,16 +116,16 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) PyObject *pattern_str_find_meth = NULL; // pattern.find() // ---- def local objects ------------------------------------------------- wildcard_indices = PyList_New(0); - CHECK_NON_NULL_OR_ABORT(wildcard_indices); + CHECK_NOT_NULL_OR_ABORT(wildcard_indices); // The Python implementation always takes queries re.escape() and re.sub() // inside translate() and thus we should at least allow external users to // mock those functions (thus, we cannot cache them in the module's state). re_escape_func = PyObject_GetAttr(state->re_module, &_Py_ID(escape)); - CHECK_NON_NULL_OR_ABORT(re_escape_func); + CHECK_NOT_NULL_OR_ABORT(re_escape_func); setops_re_subfn = get_setops_re_sub_method(state); - CHECK_NON_NULL_OR_ABORT(setops_re_subfn); + CHECK_NOT_NULL_OR_ABORT(setops_re_subfn); pattern_str_find_meth = PyObject_GetAttr(pattern, &_Py_ID(find)); - CHECK_NON_NULL_OR_ABORT(pattern_str_find_meth); + CHECK_NOT_NULL_OR_ABORT(pattern_str_find_meth); // ------------------------------------------------------------------------ const int pattern_kind = PyUnicode_KIND(pattern); const void *const pattern_data = PyUnicode_DATA(pattern); @@ -144,7 +144,9 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) Py_ssize_t t = escape_block(writer, pattern, \ escstart, (ESCSTOP), \ re_escape_func); \ - CHECK_INTVAL_OR_ABORT(t); \ + if (t < 0) { \ + goto abort; \ + } \ written += t; \ escstart = -1; \ } \ @@ -164,10 +166,10 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) for (; i < maxind && READ_CHAR(i) == '*'; ++i); // store the position of the wildcard PyObject *wildcard_index = PyLong_FromSsize_t(written++); - 
CHECK_NON_NULL_OR_ABORT(wildcard_index); + CHECK_NOT_NULL_OR_ABORT(wildcard_index); int rc = PyList_Append(wildcard_indices, wildcard_index); Py_DECREF(wildcard_index); - CHECK_INTVAL_OR_ABORT(rc); + CHECK_RET_CODE_OR_ABORT(rc); break; } case '?': { @@ -198,7 +200,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) PyObject *expr = NULL; if (pos == -1) { PyObject *tmp = PyUnicode_Substring(pattern, i, j); - CHECK_NON_NULL_OR_ABORT(tmp); + CHECK_NOT_NULL_OR_ABORT(tmp); expr = BACKSLASH_REPLACE(state, tmp); Py_DECREF(tmp); } @@ -206,11 +208,13 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) expr = translate_expression(state, pattern, i, j, pattern_str_find_meth); } - CHECK_NON_NULL_OR_ABORT(expr); + CHECK_NOT_NULL_OR_ABORT(expr); Py_ssize_t expr_len = write_expression(state, writer, expr, setops_re_subfn); Py_DECREF(expr); - CHECK_INTVAL_OR_ABORT(expr_len); + if (expr_len < 0) { + goto abort; + } written += expr_len; i = j + 1; // jump to the character after ']' break; // explicit early break for clarity @@ -264,14 +268,14 @@ escape_block(PyUnicodeWriter *writer, } #endif PyObject *str = PyUnicode_Substring(pattern, start, stop); - CHECK_NON_NULL_OR_ABORT(str); + CHECK_NOT_NULL_OR_ABORT(str); PyObject *escaped = PyObject_CallOneArg(re_escape_func, str); Py_DECREF(str); - CHECK_NON_NULL_OR_ABORT(escaped); + CHECK_NOT_NULL_OR_ABORT(escaped); Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); int rc = _WRITE_STRING(writer, escaped); Py_DECREF(escaped); - CHECK_INTVAL_OR_ABORT(rc); + CHECK_RET_CODE_OR_ABORT(rc); return written; abort: return -1; @@ -294,18 +298,18 @@ split_expression(fnmatchmodule_state *state, PyObject *hyphen = state->hyphen_str; // ---- def local objects ------------------------------------------------- chunks = PyList_New(0); - CHECK_NON_NULL_OR_ABORT(chunks); + CHECK_NOT_NULL_OR_ABORT(chunks); maxind = PyLong_FromSsize_t(stop); - CHECK_NON_NULL_OR_ABORT(maxind); + CHECK_NOT_NULL_OR_ABORT(maxind); // ---- def local macros 
-------------------------------------------------- /* add pattern[START:STOP] to the list of chunks */ #define ADD_CHUNK(START, STOP) \ do { \ PyObject *chunk = PyUnicode_Substring(pattern, (START), (STOP)); \ - CHECK_NON_NULL_OR_ABORT(chunk); \ + CHECK_NOT_NULL_OR_ABORT(chunk); \ int rc = PyList_Append(chunks, chunk); \ Py_DECREF(chunk); \ - CHECK_INTVAL_OR_ABORT(rc); \ + CHECK_RET_CODE_OR_ABORT(rc); \ } while (0) // ------------------------------------------------------------------------ Py_ssize_t chunk_start = start; @@ -315,7 +319,7 @@ split_expression(fnmatchmodule_state *state, while (ind < stop) { PyObject *p_chunk_stop = PyObject_CallFunction(str_find_func, "OnO", hyphen, ind, maxind); - CHECK_NON_NULL_OR_ABORT(p_chunk_stop); + CHECK_NOT_NULL_OR_ABORT(p_chunk_stop); Py_ssize_t chunk_stop = PyLong_AsSsize_t(p_chunk_stop); Py_DECREF(p_chunk_stop); if (chunk_stop < 0) { @@ -396,14 +400,14 @@ simplify_expression(PyObject *chunks) assert(c1len > 1); assert(c2len > 1); PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen); - CHECK_NON_NULL_OR_ABORT(writer); + CHECK_NOT_NULL_OR_ABORT(writer); // all but the last character in the first chunk - if (_WRITE_BLOCK(writer, c1, 0, c1len - 1) < 0) { + if (_WRITE_SUBSTRING(writer, c1, 0, c1len - 1) < 0) { PyUnicodeWriter_Discard(writer); goto abort; } // all but the first character in the second chunk - if (_WRITE_BLOCK(writer, c2, 1, c2len) < 0) { + if (_WRITE_SUBSTRING(writer, c2, 1, c2len) < 0) { PyUnicodeWriter_Discard(writer); goto abort; } @@ -416,7 +420,7 @@ simplify_expression(PyObject *chunks) Py_XDECREF(str); goto abort; } - CHECK_INTVAL_OR_ABORT(PySequence_DelItem(chunks, k)); + CHECK_RET_CODE_OR_ABORT(PySequence_DelItem(chunks, k)); } } return 0; @@ -437,7 +441,7 @@ escape_expression(fnmatchmodule_state *state, PyObject *chunks) PyObject *s0 = PyList_GET_ITEM(chunks, c); assert(s0 != NULL); PyObject *s1 = BACKSLASH_REPLACE(state, s0); - CHECK_NON_NULL_OR_ABORT(s1); + CHECK_NOT_NULL_OR_ABORT(s1); PyObject *s2 
= HYPHEN_REPLACE(state, s1); Py_DECREF(s1); // PyList_SetItem() does not create a new reference on 's2' @@ -460,11 +464,11 @@ translate_expression(fnmatchmodule_state *state, { PyObject *chunks = split_expression(state, pattern, start, stop, pattern_str_find_meth); - CHECK_NON_NULL_OR_ABORT(chunks); + CHECK_NOT_NULL_OR_ABORT(chunks); // remove empty ranges - CHECK_INTVAL_OR_ABORT(simplify_expression(chunks)); + CHECK_RET_CODE_OR_ABORT(simplify_expression(chunks)); // escape backslashes and set differences - CHECK_INTVAL_OR_ABORT(escape_expression(state, chunks)); + CHECK_RET_CODE_OR_ABORT(escape_expression(state, chunks)); PyObject *res = PyUnicode_Join(state->hyphen_str, chunks); Py_DECREF(chunks); return res; @@ -495,11 +499,11 @@ write_expression(fnmatchmodule_state *state, WRITE_CHAR_OR_ABORT(writer, '['); // escape set operations as late as possible safe_expression = SETOPS_REPLACE(state, expression, setops_re_sub_meth); - CHECK_NON_NULL_OR_ABORT(safe_expression); + CHECK_NOT_NULL_OR_ABORT(safe_expression); switch (token) { case '!': { WRITE_CHAR_OR_ABORT(writer, '^'); // replace '!' by '^' - WRITE_BLOCK_OR_ABORT(writer, safe_expression, 1, grouplen); + WRITE_SUBSTRING_OR_ABORT(writer, safe_expression, 1, grouplen); break; } case '^': @@ -568,7 +572,7 @@ process_wildcards(PyObject *pattern, PyObject *indices) Py_ssize_t i = 0, j = -1; // process the optional PREFIX LOAD_WILDCARD_INDEX(j, 0); - WRITE_BLOCK_OR_ABORT(writer, pattern, 0, j); + WRITE_SUBSTRING_OR_ABORT(writer, pattern, i, j); i = j + 1; for (Py_ssize_t k = 1; k < m; ++k) { // process the (* INNER) groups @@ -576,13 +580,13 @@ process_wildcards(PyObject *pattern, PyObject *indices) assert(i < j); // write the atomic RE group '(?>.*?' 
+ INNER + ')' WRITE_ASCII_OR_ABORT(writer, "(?>.*?", 6); - WRITE_BLOCK_OR_ABORT(writer, pattern, i, j); + WRITE_SUBSTRING_OR_ABORT(writer, pattern, i, j); WRITE_CHAR_OR_ABORT(writer, ')'); i = j + 1; } // handle the (*) [OUTER] part WRITE_ASCII_OR_ABORT(writer, ".*", 2); - WRITE_BLOCK_OR_ABORT(writer, pattern, i, n); + WRITE_SUBSTRING_OR_ABORT(writer, pattern, i, n); } WRITE_ASCII_OR_ABORT(writer, ")\\Z", 3); PyObject *res = PyUnicodeWriter_Finish(writer); From db756262f75e0b7a4ab0d853850f6078677f442f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 18 Aug 2024 12:42:01 +0200 Subject: [PATCH 85/97] rename `COMPILED_CACHE_SIZE` to `LRU_CACHE_SIZE` --- Modules/_fnmatch/_fnmatchmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index a050616217c71e..7aa4cae265e77f 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -33,7 +33,7 @@ #include "clinic/_fnmatchmodule.c.h" -#define COMPILED_CACHE_SIZE 32768 +#define LRU_CACHE_SIZE 32768 #define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" // ==== Cached translation unit =============================================== @@ -84,7 +84,7 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) { // make sure that this function is called once assert(st->translator == NULL); - PyObject *maxsize = PyLong_FromLong(COMPILED_CACHE_SIZE); + PyObject *maxsize = PyLong_FromLong(LRU_CACHE_SIZE); if (maxsize == NULL) { return -1; } From df76ba3dcd5cb6d84ba4f06f45fb663ea1e33bf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 18 Aug 2024 12:58:18 +0200 Subject: [PATCH 86/97] add LRU cache for `re.escape` --- Modules/_fnmatch/_fnmatchmodule.c | 41 +++++++++++++++++++++++++++++++ Modules/_fnmatch/util.h | 1 + 2 files 
changed, 42 insertions(+) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 7aa4cae265e77f..b6170c8719c277 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -113,6 +113,42 @@ fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) return 0; } +// ==== Cached re.escape() unit =============================================== + +/* Create an LRU-cached function for re.escape(). */ +static int +fnmatchmodule_load_escapefunc(PyObject *Py_UNUSED(module), + fnmatchmodule_state *st) +{ + // make sure that this function is called once + assert(st->re_escape == NULL); + PyObject *maxsize = PyLong_FromLong(LRU_CACHE_SIZE); + if (maxsize == NULL) { + return -1; + } + PyObject *cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); + if (cache == NULL) { + Py_DECREF(maxsize); + return -1; + } + PyObject *wrapper = PyObject_CallOneArg(cache, maxsize); + Py_DECREF(maxsize); + Py_DECREF(cache); + if (wrapper == NULL) { + return -1; + } + assert(st->re_module != NULL); + PyObject *wrapped = PyObject_GetAttr(st->re_module, &_Py_ID(escape)); + // reference on 'escapechar' will be removed upon module cleanup + st->re_escape = PyObject_CallOneArg(wrapper, wrapped); + Py_DECREF(wrapped); + Py_DECREF(wrapper); + if (st->re_escape == NULL) { + return -1; + } + return 0; +} + // ==== Module data getters =================================================== static inline PyObject * /* reference to re.compile(pattern).match() */ @@ -173,6 +209,9 @@ fnmatchmodule_exec(PyObject *module) if (fnmatchmodule_load_translator(module, st) < 0) { return -1; } + if (fnmatchmodule_load_escapefunc(module, st) < 0) { + return -1; + } INTERN_STRING(st, hyphen_str, "-"); INTERN_STRING(st, hyphen_esc_str, "\\-"); INTERN_STRING(st, backslash_str, "\\"); @@ -195,6 +234,7 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) Py_VISIT(st->backslash_str); Py_VISIT(st->hyphen_esc_str); 
Py_VISIT(st->hyphen_str); + Py_VISIT(st->re_escape); Py_VISIT(st->translator); Py_VISIT(st->re_module); Py_VISIT(st->posixpath_module); @@ -212,6 +252,7 @@ fnmatchmodule_clear(PyObject *m) Py_CLEAR(st->backslash_str); Py_CLEAR(st->hyphen_esc_str); Py_CLEAR(st->hyphen_str); + Py_CLEAR(st->re_escape); Py_CLEAR(st->translator); Py_CLEAR(st->re_module); Py_CLEAR(st->posixpath_module); diff --git a/Modules/_fnmatch/util.h b/Modules/_fnmatch/util.h index 379fed066de3d8..36e21bc6f5f09e 100644 --- a/Modules/_fnmatch/util.h +++ b/Modules/_fnmatch/util.h @@ -13,6 +13,7 @@ typedef struct { PyObject *re_module; // import re PyObject *translator; // LRU-cached translation unit + PyObject *re_escape; // LRU-cached re.escape() function // strings used by translate.c PyObject *hyphen_str; // hyphen '-' From ac46e2cbb656440c31461da4d063c05bd041856a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:36:28 +0200 Subject: [PATCH 87/97] cache `re.compile(...).sub` for set operations tokens --- Modules/_fnmatch/_fnmatchmodule.c | 36 ++++++++++++++++++++++++++++--- Modules/_fnmatch/util.h | 2 +- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index b6170c8719c277..bceca5e73cff6d 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -149,6 +149,34 @@ fnmatchmodule_load_escapefunc(PyObject *Py_UNUSED(module), return 0; } +// ==== Cached re.sub() unit for set operation tokens ========================= + +/* Create an LRU-cached function for re.compile('([&~|])').sub(). 
*/ +static int +fnmatchmodule_load_setops_re_sub(PyObject *Py_UNUSED(module), + fnmatchmodule_state *st) +{ + // make sure that this function is called once + assert(st->setops_re_subfn == NULL); + PyObject *pattern = PyUnicode_FromString("([&~|])"); + if (pattern == NULL) { + return -1; + } + PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, + &_Py_ID(compile), + pattern); + Py_DECREF(pattern); + if (compiled == NULL) { + return -1; + } + st->setops_re_subfn = PyObject_GetAttr(compiled, &_Py_ID(sub)); + Py_DECREF(compiled); + if (st->setops_re_subfn == NULL) { + return -1; + } + return 0; +} + // ==== Module data getters =================================================== static inline PyObject * /* reference to re.compile(pattern).match() */ @@ -216,7 +244,9 @@ fnmatchmodule_exec(PyObject *module) INTERN_STRING(st, hyphen_esc_str, "\\-"); INTERN_STRING(st, backslash_str, "\\"); INTERN_STRING(st, backslash_esc_str, "\\\\"); - INTERN_STRING(st, setops_str, "([&~|])"); + if (fnmatchmodule_load_setops_re_sub(module, st) < 0) { + return -1; + } INTERN_STRING(st, setops_repl_str, "\\\\\\1"); return 0; } @@ -229,7 +259,7 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); Py_VISIT(st->setops_repl_str); - Py_VISIT(st->setops_str); + Py_VISIT(st->setops_re_subfn); Py_VISIT(st->backslash_esc_str); Py_VISIT(st->backslash_str); Py_VISIT(st->hyphen_esc_str); @@ -247,7 +277,7 @@ fnmatchmodule_clear(PyObject *m) { fnmatchmodule_state *st = get_fnmatchmodule_state(m); Py_CLEAR(st->setops_repl_str); - Py_CLEAR(st->setops_str); + Py_CLEAR(st->setops_re_subfn); Py_CLEAR(st->backslash_esc_str); Py_CLEAR(st->backslash_str); Py_CLEAR(st->hyphen_esc_str); diff --git a/Modules/_fnmatch/util.h b/Modules/_fnmatch/util.h index 36e21bc6f5f09e..ac5c4362d78a4e 100644 --- a/Modules/_fnmatch/util.h +++ b/Modules/_fnmatch/util.h @@ -23,7 +23,7 @@ typedef struct { PyObject *backslash_esc_str; // escaped backslash 
'\\\\' /* set operation tokens (&&, ~~ and ||) are not supported in regex */ - PyObject *setops_str; // set operation tokens '([&~|])' + PyObject *setops_re_subfn; // cached re.compile('([&~|])').sub() PyObject *setops_repl_str; // replacement pattern '\\\\\\1' } fnmatchmodule_state; From fa04ea1be6127e57877da84ae38b1f803dd69c43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 18 Aug 2024 13:30:27 +0200 Subject: [PATCH 88/97] update implementation --- Modules/_fnmatch/translate.c | 117 +++++++++++++---------------------- 1 file changed, 42 insertions(+), 75 deletions(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index e0ddce101338ad..b6f0304664c610 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -15,16 +15,12 @@ // ==== Helper declarations =================================================== /* - * Write re.escape(pattern[start:stop]). + * Write re.escape(ch). * * This returns the number of written characters, or -1 if an error occurred. - * - * @pre 0 <= start < stop <= len(pattern) */ -static inline Py_ssize_t -escape_block(PyUnicodeWriter *writer, - PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, - PyObject *re_escape_func); +static Py_ssize_t +escape_char(fnmatchmodule_state *state, PyUnicodeWriter *writer, Py_UCS4 ch); /* * Construct a regular expression out of a UNIX-style expression. @@ -56,8 +52,7 @@ translate_expression(fnmatchmodule_state *state, */ static Py_ssize_t write_expression(fnmatchmodule_state *state, - PyUnicodeWriter *writer, PyObject *expression, - PyObject *setops_re_sub_meth); + PyUnicodeWriter *writer, PyObject *expression); /* * Build the final regular expression by processing the wildcards. 
@@ -69,17 +64,6 @@ process_wildcards(PyObject *pattern, PyObject *indices); // ==== API implementation ==================================================== -static inline PyObject * -get_setops_re_sub_method(fnmatchmodule_state *state) -{ - PyObject *compiled = PyObject_CallMethodOneArg(state->re_module, - &_Py_ID(compile), - state->setops_str); - PyObject *method = PyObject_GetAttr(compiled, &_Py_ID(sub)); - Py_DECREF(compiled); - return method; -} - PyObject * _Py_fnmatch_translate(PyObject *module, PyObject *pattern) { @@ -110,20 +94,11 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // ---- decl local objects ------------------------------------------------ // list containing the indices where '*' has a special meaning PyObject *wildcard_indices = NULL; - // cached functions (cache is local to the call) - PyObject *re_escape_func = NULL; // re.escape() - PyObject *setops_re_subfn = NULL; // re.compile('([&~|])').sub() + // call-level cached functions PyObject *pattern_str_find_meth = NULL; // pattern.find() // ---- def local objects ------------------------------------------------- wildcard_indices = PyList_New(0); CHECK_NOT_NULL_OR_ABORT(wildcard_indices); - // The Python implementation always takes queries re.escape() and re.sub() - // inside translate() and thus we should at least allow external users to - // mock those functions (thus, we cannot cache them in the module's state). 
- re_escape_func = PyObject_GetAttr(state->re_module, &_Py_ID(escape)); - CHECK_NOT_NULL_OR_ABORT(re_escape_func); - setops_re_subfn = get_setops_re_sub_method(state); - CHECK_NOT_NULL_OR_ABORT(setops_re_subfn); pattern_str_find_meth = PyObject_GetAttr(pattern, &_Py_ID(find)); CHECK_NOT_NULL_OR_ABORT(pattern_str_find_meth); // ------------------------------------------------------------------------ @@ -138,28 +113,13 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) ++IND; \ } \ } while (0) -#define WRITE_PENDING(ESCSTOP) \ - do { \ - if (escstart != -1) { \ - Py_ssize_t t = escape_block(writer, pattern, \ - escstart, (ESCSTOP), \ - re_escape_func); \ - if (t < 0) { \ - goto abort; \ - } \ - written += t; \ - escstart = -1; \ - } \ - } while (0) // ------------------------------------------------------------------------ Py_ssize_t i = 0; // current index Py_ssize_t written = 0; // number of characters written - Py_ssize_t escstart = -1, escstop = -1; // start/stop escaping indices - while ((escstop = i) < maxind) { + while (i < maxind) { Py_UCS4 chr = READ_CHAR(i++); switch (chr) { case '*': { - WRITE_PENDING(escstop); // translate wildcard '*' (fnmatch) into optional '.' (regex) WRITE_CHAR_OR_ABORT(writer, '*'); // skip duplicated '*' @@ -173,15 +133,13 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) break; } case '?': { - WRITE_PENDING(escstop); // translate optional '?' (fnmatch) into optional '.' (regex) WRITE_CHAR_OR_ABORT(writer, '.'); ++written; // increase the expected result's length break; } case '[': { - WRITE_PENDING(escstop); - assert(READ_CHAR(escstop) == '['); + assert(READ_CHAR(i - 1) == '['); Py_ssize_t j = i; ADVANCE_IF_CHAR_IS('!', j, maxind); // [! ADVANCE_IF_CHAR_IS(']', j, maxind); // [!] 
or [] @@ -209,8 +167,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) pattern_str_find_meth); } CHECK_NOT_NULL_OR_ABORT(expr); - Py_ssize_t expr_len = write_expression(state, writer, expr, - setops_re_subfn); + Py_ssize_t expr_len = write_expression(state, writer, expr); Py_DECREF(expr); if (expr_len < 0) { goto abort; @@ -221,21 +178,16 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } } default: { - if (escstart == -1) { - assert(i >= 1); - escstart = i - 1; - } + Py_ssize_t t = escape_char(state, writer, chr); + CHECK_RET_CODE_OR_ABORT(t); + written += t; break; } } } - WRITE_PENDING(maxind); -#undef WRITE_PENDING #undef ADVANCE_IF_CHAR_IS #undef READ_CHAR Py_DECREF(pattern_str_find_meth); - Py_DECREF(setops_re_subfn); - Py_DECREF(re_escape_func); PyObject *translated = PyUnicodeWriter_Finish(writer); if (translated == NULL) { Py_DECREF(wildcard_indices); @@ -247,8 +199,6 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) return res; abort: Py_XDECREF(pattern_str_find_meth); - Py_XDECREF(setops_re_subfn); - Py_XDECREF(re_escape_func); Py_XDECREF(wildcard_indices); PyUnicodeWriter_Discard(writer); return NULL; @@ -256,20 +206,38 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) // ==== Helper implementations ================================================ -static inline Py_ssize_t -escape_block(PyUnicodeWriter *writer, - PyObject *pattern, Py_ssize_t start, Py_ssize_t stop, - PyObject *re_escape_func) +/* taken from unicodeobject.c */ +static inline PyObject * +unicode_char(Py_UCS4 ch) { -#ifdef Py_DEBUG - if (start < 0 || start >= stop || stop > PyUnicode_GET_LENGTH(pattern)) { - PyErr_BadInternalCall(); - return -1; +#define MAX_UNICODE 0x10ffff + assert(ch <= MAX_UNICODE); +#undef MAX_UNICODE + if (ch < 256) { + return _Py_LATIN1_CHR(ch); } -#endif - PyObject *str = PyUnicode_Substring(pattern, start, stop); + PyObject *unicode = PyUnicode_New(1, ch); + if (unicode == NULL) { + return NULL; + } + 
assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); + if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { + PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; + } + else { + assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); + PyUnicode_4BYTE_DATA(unicode)[0] = ch; + } + assert(_PyUnicode_CheckConsistency(unicode, 1)); + return unicode; +} + +static Py_ssize_t +escape_char(fnmatchmodule_state *state, PyUnicodeWriter *writer, Py_UCS4 ch) +{ + PyObject *str = unicode_char(ch); CHECK_NOT_NULL_OR_ABORT(str); - PyObject *escaped = PyObject_CallOneArg(re_escape_func, str); + PyObject *escaped = PyObject_CallOneArg(state->re_escape, str); Py_DECREF(str); CHECK_NOT_NULL_OR_ABORT(escaped); Py_ssize_t written = PyUnicode_GET_LENGTH(escaped); @@ -479,8 +447,7 @@ translate_expression(fnmatchmodule_state *state, static Py_ssize_t write_expression(fnmatchmodule_state *state, - PyUnicodeWriter *writer, PyObject *expression, - PyObject *setops_re_sub_meth) + PyUnicodeWriter *writer, PyObject *expression) { PyObject *safe_expression = NULL; // for the 'goto abort' statements Py_ssize_t grouplen = PyUnicode_GET_LENGTH(expression); @@ -498,7 +465,7 @@ write_expression(fnmatchmodule_state *state, Py_ssize_t extra = 2; // '[' and ']' WRITE_CHAR_OR_ABORT(writer, '['); // escape set operations as late as possible - safe_expression = SETOPS_REPLACE(state, expression, setops_re_sub_meth); + safe_expression = SETOPS_REPLACE(state, expression, state->setops_re_subfn); CHECK_NOT_NULL_OR_ABORT(safe_expression); switch (token) { case '!': { From 0b4ccede1f7b9ddf5328090113785cd9a98507b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:20:19 +0200 Subject: [PATCH 89/97] use macros for abort-flow --- Modules/_fnmatch/_fnmatchmodule.c | 87 ++++++++++++------------------- 1 file changed, 34 insertions(+), 53 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c 
b/Modules/_fnmatch/_fnmatchmodule.c index bceca5e73cff6d..48330d78249ea2 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -27,6 +27,7 @@ # define Py_BUILD_CORE_MODULE 1 #endif +#include "macros.h" #include "util.h" // prototypes #include "pycore_runtime.h" // for _Py_ID() @@ -159,22 +160,18 @@ fnmatchmodule_load_setops_re_sub(PyObject *Py_UNUSED(module), // make sure that this function is called once assert(st->setops_re_subfn == NULL); PyObject *pattern = PyUnicode_FromString("([&~|])"); - if (pattern == NULL) { - return -1; - } + CHECK_NOT_NULL_OR_ABORT(pattern); PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, &_Py_ID(compile), pattern); Py_DECREF(pattern); - if (compiled == NULL) { - return -1; - } + CHECK_NOT_NULL_OR_ABORT(compiled); st->setops_re_subfn = PyObject_GetAttr(compiled, &_Py_ID(sub)); Py_DECREF(compiled); - if (st->setops_re_subfn == NULL) { - return -1; - } + CHECK_NOT_NULL_OR_ABORT(st->setops_re_subfn); return 0; +abort: + return -1; } // ==== Module data getters =================================================== @@ -205,54 +202,45 @@ get_platform_normcase_function(PyObject *module, bool *isposix) // ==== Module state functions ================================================ -/* Import a named module and store it in 'STATE->ATTRIBUTE'. */ +static int +fnmatchmodule_exec(PyObject *module) +{ + // ---- def local macros -------------------------------------------------- + /* Import a named module and store it in 'STATE->ATTRIBUTE'. */ #define IMPORT_MODULE(STATE, ATTRIBUTE, MODULE_NAME) \ do { \ /* make sure that the attribute is initialized once */ \ assert(STATE->ATTRIBUTE == NULL); \ STATE->ATTRIBUTE = PyImport_ImportModule((MODULE_NAME)); \ - if (STATE->ATTRIBUTE == NULL) { \ - return -1; \ - } \ + CHECK_NOT_NULL_OR_ABORT(STATE->ATTRIBUTE); \ } while (0) - -/* Intern a literal STRING and store it in 'STATE->ATTRIBUTE'. */ + /* Intern a literal STRING and store it in 'STATE->ATTRIBUTE'. 
*/ #define INTERN_STRING(STATE, ATTRIBUTE, STRING) \ do { \ /* make sure that the attribute is initialized once */ \ assert(STATE->ATTRIBUTE == NULL); \ STATE->ATTRIBUTE = PyUnicode_InternFromString((STRING)); \ - if (STATE->ATTRIBUTE == NULL) { \ - return -1; \ - } \ + CHECK_NOT_NULL_OR_ABORT(STATE->ATTRIBUTE); \ } while (0) - -static int -fnmatchmodule_exec(PyObject *module) -{ + // ------------------------------------------------------------------------ fnmatchmodule_state *st = get_fnmatchmodule_state(module); IMPORT_MODULE(st, os_module, "os"); IMPORT_MODULE(st, posixpath_module, "posixpath"); IMPORT_MODULE(st, re_module, "re"); - if (fnmatchmodule_load_translator(module, st) < 0) { - return -1; - } - if (fnmatchmodule_load_escapefunc(module, st) < 0) { - return -1; - } + CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_translator(module, st)); + CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_escapefunc(module, st)); INTERN_STRING(st, hyphen_str, "-"); INTERN_STRING(st, hyphen_esc_str, "\\-"); INTERN_STRING(st, backslash_str, "\\"); INTERN_STRING(st, backslash_esc_str, "\\\\"); - if (fnmatchmodule_load_setops_re_sub(module, st) < 0) { - return -1; - } + CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_setops_re_sub(module, st)); INTERN_STRING(st, setops_repl_str, "\\\\\\1"); return 0; -} - +abort: + return -1; #undef INTERN_STRING #undef IMPORT_MODULE +} static int fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) @@ -316,27 +304,22 @@ fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern) /*[clinic end generated code: output=1a68530a2e3cf7d0 input=7ac729daad3b1404]*/ { bool isposix = 0; - PyObject *normcase = get_platform_normcase_function(module, &isposix); - if (normcase == NULL) { - return NULL; - } + PyObject *normcase = NULL; // for the 'goto abort' statements + normcase = get_platform_normcase_function(module, &isposix); + CHECK_NOT_NULL_OR_ABORT(normcase); PyObject *normalized_pattern = PyObject_CallOneArg(normcase, pattern); - if 
(normalized_pattern == NULL) { - Py_DECREF(normcase); - return NULL; - } + CHECK_NOT_NULL_OR_ABORT(normalized_pattern); // the matcher is cached with respect to the *normalized* pattern PyObject *matcher = get_matcher_function(module, normalized_pattern); Py_DECREF(normalized_pattern); - if (matcher == NULL) { - Py_DECREF(normcase); - return NULL; - } - PyObject *normalizer = isposix ? NULL : normcase; - PyObject *filtered = _Py_fnmatch_filter(matcher, names, normalizer); + CHECK_NOT_NULL_OR_ABORT(matcher); + PyObject *filtered = _Py_fnmatch_filter(matcher, names, normcase); Py_DECREF(matcher); Py_DECREF(normcase); return filtered; +abort: + Py_XDECREF(normcase); + return NULL; } /*[clinic input] @@ -437,14 +420,10 @@ fnmatch_translate_impl(PyObject *module, PyObject *pattern) PyObject *decoded = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(pattern), PyBytes_GET_SIZE(pattern), "strict"); - if (decoded == NULL) { - return NULL; - } + CHECK_NOT_NULL_OR_ABORT(decoded); PyObject *translated = _Py_fnmatch_translate(module, decoded); Py_DECREF(decoded); - if (translated == NULL) { - return NULL; - } + CHECK_NOT_NULL_OR_ABORT(translated); PyObject *res = PyUnicode_AsLatin1String(translated); Py_DECREF(translated); return res; @@ -456,6 +435,8 @@ fnmatch_translate_impl(PyObject *module, PyObject *pattern) PyErr_SetString(PyExc_TypeError, INVALID_PATTERN_TYPE); return NULL; } +abort: + return NULL; } // ==== Module specs ========================================================== From 5039bcedb37dd504e8ecde58f671a08ee29b3810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:37:51 +0200 Subject: [PATCH 90/97] allow path-like objects in `fnmatch.filter` See gh-123122 for the rationale. 
--- Modules/_fnmatch/_fnmatchmodule.c | 14 +++----------- Modules/_fnmatch/filter.c | 19 +++++++------------ Modules/_fnmatch/util.h | 7 +++---- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 48330d78249ea2..cd963103f05723 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -185,7 +185,7 @@ get_matcher_function(PyObject *module, PyObject *pattern) } static inline PyObject * /* reference to os.path.normcase() */ -get_platform_normcase_function(PyObject *module, bool *isposix) +get_platform_normcase_function(PyObject *module) { fnmatchmodule_state *st = get_fnmatchmodule_state(module); PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); @@ -193,9 +193,6 @@ get_platform_normcase_function(PyObject *module, bool *isposix) return NULL; } PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); - if (isposix != NULL) { - *isposix = Py_Is(os_path, st->posixpath_module); - } Py_DECREF(os_path); return normcase; } @@ -225,7 +222,6 @@ fnmatchmodule_exec(PyObject *module) // ------------------------------------------------------------------------ fnmatchmodule_state *st = get_fnmatchmodule_state(module); IMPORT_MODULE(st, os_module, "os"); - IMPORT_MODULE(st, posixpath_module, "posixpath"); IMPORT_MODULE(st, re_module, "re"); CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_translator(module, st)); CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_escapefunc(module, st)); @@ -255,7 +251,6 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) Py_VISIT(st->re_escape); Py_VISIT(st->translator); Py_VISIT(st->re_module); - Py_VISIT(st->posixpath_module); Py_VISIT(st->os_module); return 0; } @@ -273,7 +268,6 @@ fnmatchmodule_clear(PyObject *m) Py_CLEAR(st->re_escape); Py_CLEAR(st->translator); Py_CLEAR(st->re_module); - Py_CLEAR(st->posixpath_module); Py_CLEAR(st->os_module); return 0; } @@ -303,9 +297,8 @@ static PyObject * 
fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern) /*[clinic end generated code: output=1a68530a2e3cf7d0 input=7ac729daad3b1404]*/ { - bool isposix = 0; PyObject *normcase = NULL; // for the 'goto abort' statements - normcase = get_platform_normcase_function(module, &isposix); + normcase = get_platform_normcase_function(module); CHECK_NOT_NULL_OR_ABORT(normcase); PyObject *normalized_pattern = PyObject_CallOneArg(normcase, pattern); CHECK_NOT_NULL_OR_ABORT(normalized_pattern); @@ -349,8 +342,7 @@ static int fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pattern) /*[clinic end generated code: output=c9dc542e8d6933b6 input=279a4a4f2ddea6a2]*/ { - // use the runtime 'os.path' value and not a cached one - PyObject *normcase = get_platform_normcase_function(module, NULL); + PyObject *normcase = get_platform_normcase_function(module); if (normcase == NULL) { return -1; } diff --git a/Modules/_fnmatch/filter.c b/Modules/_fnmatch/filter.c index d3611b7f5f883e..bd1d6c8ec85073 100644 --- a/Modules/_fnmatch/filter.c +++ b/Modules/_fnmatch/filter.c @@ -5,8 +5,9 @@ #include "Python.h" PyObject * -_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer) +_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normcase) { + assert(normcase != NULL); PyObject *iter = PyObject_GetIter(names); if (iter == NULL) { return NULL; @@ -18,18 +19,12 @@ _Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer) } PyObject *name = NULL; while ((name = PyIter_Next(iter))) { - PyObject *match; - if (normalizer == NULL) { - match = PyObject_CallOneArg(matcher, name); - } - else { - PyObject *normalized = PyObject_CallOneArg(normalizer, name); - if (normalized == NULL) { - goto abort; - } - match = PyObject_CallOneArg(matcher, normalized); - Py_DECREF(normalized); + PyObject *normalized = PyObject_CallOneArg(normcase, name); + if (normalized == NULL) { + goto abort; } + PyObject *match = 
PyObject_CallOneArg(matcher, normalized); + Py_DECREF(normalized); if (match == NULL) { goto abort; } diff --git a/Modules/_fnmatch/util.h b/Modules/_fnmatch/util.h index ac5c4362d78a4e..276921328dd868 100644 --- a/Modules/_fnmatch/util.h +++ b/Modules/_fnmatch/util.h @@ -9,7 +9,6 @@ typedef struct { PyObject *os_module; // import os - PyObject *posixpath_module; // import posixpath PyObject *re_module; // import re PyObject *translator; // LRU-cached translation unit @@ -44,14 +43,14 @@ get_fnmatchmodule_state(PyObject *module) * * matcher A reference to the 'match()' method of a compiled pattern. * names An iterable of strings (str or bytes objects) to match. - * normalizer Optional normalization function. + * normcase A reference to os.path.normcase(). * * This is equivalent to: * - * [name for name in names if matcher(normalizer(name))] + * [name for name in names if matcher(normcase(name))] */ extern PyObject * -_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normalizer); +_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normcase); /* * C accelerator for translating UNIX shell patterns into RE patterns. From afc22b2101cac0b66a41dee595f48152cf1552a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:43:22 +0200 Subject: [PATCH 91/97] macros bike-shedding --- Modules/_fnmatch/macros.h | 8 +++++++- Modules/_fnmatch/translate.c | 11 ++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Modules/_fnmatch/macros.h b/Modules/_fnmatch/macros.h index d78a75b52d1dc0..a39586338ea62a 100644 --- a/Modules/_fnmatch/macros.h +++ b/Modules/_fnmatch/macros.h @@ -1,6 +1,6 @@ /* * This file contains various macro definitions in order to reduce the - * number of lines in translate.c. Do not use them for something else. + * number of lines in '_fnmatch'. Do not use them for something else. 
*/ #ifndef _FNMATCH_MACROS_H @@ -23,6 +23,12 @@ } \ } while (0) +/* + * Identical to CHECK_RET_CODE_OR_ABORT but where the + * argument is semantically used as a positive integer. + */ +#define CHECK_UNSIGNED_INT_OR_ABORT CHECK_RET_CODE_OR_ABORT + /* * Check that OBJ is not NULL or execute 'goto abort'. * diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index b6f0304664c610..cc22fd5e7ef7af 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -169,9 +169,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) CHECK_NOT_NULL_OR_ABORT(expr); Py_ssize_t expr_len = write_expression(state, writer, expr); Py_DECREF(expr); - if (expr_len < 0) { - goto abort; - } + CHECK_UNSIGNED_INT_OR_ABORT(expr_len); written += expr_len; i = j + 1; // jump to the character after ']' break; // explicit early break for clarity @@ -179,7 +177,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) } default: { Py_ssize_t t = escape_char(state, writer, chr); - CHECK_RET_CODE_OR_ABORT(t); + CHECK_UNSIGNED_INT_OR_ABORT(t); written += t; break; } @@ -526,9 +524,8 @@ process_wildcards(PyObject *pattern, PyObject *indices) #define LOAD_WILDCARD_INDEX(VAR, IND) \ do { \ VAR = PyLong_AsSsize_t(PyList_GET_ITEM(indices, (IND))); \ - if ((VAR) < 0 && PyErr_Occurred()) { \ - goto abort; \ - } \ + /* wildcard indices must be >= 0 */ \ + CHECK_UNSIGNED_INT_OR_ABORT(VAR); \ } while (0) // ------------------------------------------------------------------------ WRITE_ASCII_OR_ABORT(writer, "(?s:", 4); From e66a602843251cd4eb6936770c2dfffcb7369b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:43:35 +0200 Subject: [PATCH 92/97] type bike-shedding --- Modules/_fnmatch/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index cc22fd5e7ef7af..3dbd0d59d094d2 100644 --- 
a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -102,7 +102,7 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) pattern_str_find_meth = PyObject_GetAttr(pattern, &_Py_ID(find)); CHECK_NOT_NULL_OR_ABORT(pattern_str_find_meth); // ------------------------------------------------------------------------ - const int pattern_kind = PyUnicode_KIND(pattern); + const unsigned int pattern_kind = PyUnicode_KIND(pattern); const void *const pattern_data = PyUnicode_DATA(pattern); // ---- def local macros -------------------------------------------------- #define READ_CHAR(IND) PyUnicode_READ(pattern_kind, pattern_data, IND) From 2b59064103393da0d3a30fe564ed13190005c628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:58:58 +0200 Subject: [PATCH 93/97] update outdated comment --- Modules/_fnmatch/_fnmatchmodule.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index cd963103f05723..4c05fa2d5d1b68 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,17 +1,6 @@ /* * C accelerator for the 'fnmatch' module. * - * Currently, the following inconsistencies in the Python implementation exist: - * - * - fnmatch.filter(NAMES, PATTERN) works with pathlib.Path() instances - * in NAMES on Windows but raises a TypeError on POSIX platforms. - * - * The reason is that os.path.normcase() is called on each NAME in NAMES - * but not on POSIX platforms. In particular, os.fspath() is never called: - * - * POSIX fnmatch.filter([Path("a")], "*") -> TypeError - * Windows fnmatch.filter([Path("a")], "*") -> [Path("a")] - * * - Case normalization uses the runtime value of os.path.normcase(), * forcing us to query the attribute each time. 
* From 8efbe9a358d9675b336a6a3767cb906ab2f2774b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:40:53 +0200 Subject: [PATCH 94/97] only keep `fnmatch.translate` in C --- .../pycore_global_objects_fini_generated.h | 1 - Include/internal/pycore_global_strings.h | 1 - .../internal/pycore_runtime_init_generated.h | 1 - .../internal/pycore_unicodeobject_generated.h | 4 - Lib/fnmatch.py | 103 +++---- Lib/test/test_fnmatch.py | 7 - Modules/Setup.stdlib.in | 2 +- Modules/_fnmatch/_fnmatchmodule.c | 291 ++---------------- Modules/_fnmatch/clinic/_fnmatchmodule.c.h | 199 +----------- Modules/_fnmatch/filter.c | 48 --- Modules/_fnmatch/macros.h | 20 +- Modules/_fnmatch/translate.c | 87 +----- Modules/_fnmatch/util.h | 20 -- PCbuild/pythoncore.vcxproj | 1 - PCbuild/pythoncore.vcxproj.filters | 3 - 15 files changed, 89 insertions(+), 699 deletions(-) delete mode 100644 Modules/_fnmatch/filter.c diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 661490cd73c00d..209410d716a07d 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1103,7 +1103,6 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nlocals)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_depth)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_offset)); - _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(normcase)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ns)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nstype)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nt)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 2882390f9780c2..5431ba18bf4b24 100644 --- a/Include/internal/pycore_global_strings.h +++ 
b/Include/internal/pycore_global_strings.h @@ -592,7 +592,6 @@ struct _Py_global_strings { STRUCT_FOR_ID(nlocals) STRUCT_FOR_ID(node_depth) STRUCT_FOR_ID(node_offset) - STRUCT_FOR_ID(normcase) STRUCT_FOR_ID(ns) STRUCT_FOR_ID(nstype) STRUCT_FOR_ID(nt) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index ecd624e4bca02a..f3e8d4c5fab26d 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1101,7 +1101,6 @@ extern "C" { INIT_ID(nlocals), \ INIT_ID(node_depth), \ INIT_ID(node_offset), \ - INIT_ID(normcase), \ INIT_ID(ns), \ INIT_ID(nstype), \ INIT_ID(nt), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index e114fca09aefe8..2a494149e6143a 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -2168,10 +2168,6 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); - string = &_Py_ID(normcase); - _PyUnicode_InternStatic(interp, &string); - assert(_PyUnicode_CheckConsistency(string, 1)); - assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(ns); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 1dc52f2575ae6c..0a1dc7c5196597 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -16,65 +16,65 @@ __all__ = ["filter", "fnmatch", "fnmatchcase", "translate"] -try: - from _fnmatch import filter -except ImportError: - def filter(names, pat): - """Construct a list from the names in *names* matching *pat*.""" - result = [] - pat = os.path.normcase(pat) - match = _compile_pattern(pat) - if os.path is posixpath: - # normcase on posix is NOP. Optimize it away from the loop. 
- for name in names: - if match(name): - result.append(name) - else: - for name in names: - if match(os.path.normcase(name)): - result.append(name) - return result +def fnmatch(name, pat): + """Test whether FILENAME matches PATTERN. -try: - from _fnmatch import fnmatch -except ImportError: - def fnmatch(name, pat): - """Test whether *name* matches *pat*. + Patterns are Unix shell style: - Patterns are Unix shell style: + * matches everything + ? matches any single character + [seq] matches any character in seq + [!seq] matches any char not in seq - * matches everything - ? matches any single character - [seq] matches any character in seq - [!seq] matches any char not in seq + An initial period in FILENAME is not special. + Both FILENAME and PATTERN are first case-normalized + if the operating system requires it. + If you don't want this, use fnmatchcase(FILENAME, PATTERN). + """ + name = os.path.normcase(name) + pat = os.path.normcase(pat) + return fnmatchcase(name, pat) - An initial period in *name* is not special. - Both *name* and *pat* are first case-normalized - if the operating system requires it. +@functools.lru_cache(maxsize=32768, typed=True) +def _compile_pattern(pat): + if isinstance(pat, bytes): + pat_str = str(pat, 'ISO-8859-1') + res_str = translate(pat_str) + res = bytes(res_str, 'ISO-8859-1') + else: + res = translate(pat) + return re.compile(res).match - If you don't want this, use fnmatchcase(name, pat). - """ - name = os.path.normcase(name) - pat = os.path.normcase(pat) - return fnmatchcase(name, pat) +def filter(names, pat): + """Construct a list from those elements of the iterable NAMES that match PAT.""" + result = [] + pat = os.path.normcase(pat) + match = _compile_pattern(pat) + if os.path is posixpath: + # normcase on posix is NOP. Optimize it away from the loop. 
+ for name in names: + if match(name): + result.append(name) + else: + for name in names: + if match(os.path.normcase(name)): + result.append(name) + return result -try: - from _fnmatch import fnmatchcase -except ImportError: - def fnmatchcase(name, pat): - """Test whether *name* matches *pat*, including case. +def fnmatchcase(name, pat): + """Test whether FILENAME matches PATTERN, including case. - This is a version of fnmatch() which doesn't case-normalize - its arguments. - """ - match = _compile_pattern(pat) - return match(name) is not None + This is a version of fnmatch() which doesn't case-normalize + its arguments. + """ + match = _compile_pattern(pat) + return match(name) is not None try: from _fnmatch import translate except ImportError: def translate(pat): - """Translate a shell pattern *pat* to a regular expression. + """Translate a shell PATTERN to a regular expression. There is no way to quote meta-characters. """ @@ -83,15 +83,6 @@ def translate(pat): parts = _translate(pat, STAR, '.') return _join_translated_parts(parts, STAR) -@functools.lru_cache(maxsize=32768, typed=True) -def _compile_pattern(pat): - if isinstance(pat, bytes): - pat_str = str(pat, 'ISO-8859-1') - res_str = translate(pat_str) - res = bytes(res_str, 'ISO-8859-1') - else: - res = translate(pat) - return re.compile(res).match def _translate(pat, STAR, QUESTION_MARK): res = [] diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 6ab244021ea20d..034324139511bb 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -229,9 +229,6 @@ def test_warnings(self): class PurePythonFnmatchTestCase(FnmatchTestCaseMixin, unittest.TestCase): fnmatch = py_fnmatch -class CPythonFnmatchTestCase(FnmatchTestCaseMixin, unittest.TestCase): - fnmatch = c_fnmatch - class TranslateTestCaseMixin: fnmatch = None @@ -382,7 +379,6 @@ def __iter__(self): with self.assertRaisesRegex(ValueError, r'^nope$'): self.fnmatch.filter(BadList(), '*') - def test_mix_bytes_str(self): filter 
= self.fnmatch.filter self.assertRaises(TypeError, filter, ['test'], b'*') @@ -407,8 +403,5 @@ def test_sep(self): class PurePythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): fnmatch = py_fnmatch -class CPythonFilterTestCase(FilterTestCaseMixin, unittest.TestCase): - fnmatch = c_fnmatch - if __name__ == "__main__": unittest.main() diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index f33af67aa26499..8195b7c75c2aa8 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -33,7 +33,7 @@ @MODULE__BISECT_TRUE@_bisect _bisectmodule.c @MODULE__CONTEXTVARS_TRUE@_contextvars _contextvarsmodule.c @MODULE__CSV_TRUE@_csv _csv.c -@MODULE__FNMATCH_TRUE@_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/filter.c _fnmatch/translate.c +@MODULE__FNMATCH_TRUE@_fnmatch _fnmatch/_fnmatchmodule.c _fnmatch/translate.c @MODULE__HEAPQ_TRUE@_heapq _heapqmodule.c @MODULE__JSON_TRUE@_json _json.c @MODULE__LSPROF_TRUE@_lsprof _lsprof.c rotatingtree.c diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 4c05fa2d5d1b68..1ae44424a7ffcd 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -1,15 +1,5 @@ /* * C accelerator for the 'fnmatch' module. - * - * - Case normalization uses the runtime value of os.path.normcase(), - * forcing us to query the attribute each time. - * - * The C implementation of fnmatch.filter() uses the same os.path.normcase() - * when iterating over NAMES, ignoring side-effects on os.path.normcase() - * that may occur when processing a NAME in NAMES. - * - * More generally, os.path.normcase() is retrieved at most once per call - * to fnmatch.filter() or fnmatch.fnmatch(). 
*/ #ifndef Py_BUILD_CORE_BUILTIN @@ -26,83 +16,6 @@ #define LRU_CACHE_SIZE 32768 #define INVALID_PATTERN_TYPE "pattern must be a string or a bytes object" -// ==== Cached translation unit =============================================== - -/* - * Compile a UNIX shell pattern into a RE pattern - * and returns the corresponding 'match()' method. - * - * This function is LRU-cached by the module itself. - */ -static PyObject * -get_matcher_function_impl(PyObject *module, PyObject *pattern) -{ - // translate the pattern into a RE pattern - assert(module != NULL); - PyObject *translated = fnmatch_translate_impl(module, pattern); - if (translated == NULL) { - return NULL; - } - fnmatchmodule_state *st = get_fnmatchmodule_state(module); - // compile the pattern - PyObject *compile_func = PyObject_GetAttr(st->re_module, &_Py_ID(compile)); - if (compile_func == NULL) { - Py_DECREF(translated); - return NULL; - } - PyObject *compiled = PyObject_CallOneArg(compile_func, translated); - Py_DECREF(compile_func); - Py_DECREF(translated); - if (compiled == NULL) { - return NULL; - } - // get the compiled pattern matcher function - PyObject *matcher = PyObject_GetAttr(compiled, &_Py_ID(match)); - Py_DECREF(compiled); - return matcher; -} - -static PyMethodDef get_matcher_function_def = { - "get_matcher_function", - get_matcher_function_impl, - METH_O, - NULL -}; - -static int -fnmatchmodule_load_translator(PyObject *module, fnmatchmodule_state *st) -{ - // make sure that this function is called once - assert(st->translator == NULL); - PyObject *maxsize = PyLong_FromLong(LRU_CACHE_SIZE); - if (maxsize == NULL) { - return -1; - } - PyObject *cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); - if (cache == NULL) { - Py_DECREF(maxsize); - return -1; - } - PyObject *args[3] = {NULL, maxsize, Py_True}; - size_t nargsf = 2 | PY_VECTORCALL_ARGUMENTS_OFFSET; - PyObject *wrapper = PyObject_Vectorcall(cache, &args[1], nargsf, NULL); - Py_DECREF(maxsize); - Py_DECREF(cache); - if 
(wrapper == NULL) { - return -1; - } - assert(module != NULL); - PyObject *wrapped = PyCFunction_New(&get_matcher_function_def, module); - // reference on 'translator' will be removed upon module cleanup - st->translator = PyObject_CallOneArg(wrapper, wrapped); - Py_DECREF(wrapped); - Py_DECREF(wrapper); - if (st->translator == NULL) { - return -1; - } - return 0; -} - // ==== Cached re.escape() unit =============================================== /* Create an LRU-cached function for re.escape(). */ @@ -113,9 +26,7 @@ fnmatchmodule_load_escapefunc(PyObject *Py_UNUSED(module), // make sure that this function is called once assert(st->re_escape == NULL); PyObject *maxsize = PyLong_FromLong(LRU_CACHE_SIZE); - if (maxsize == NULL) { - return -1; - } + CHECK_NOT_NULL_OR_ABORT(maxsize); PyObject *cache = _PyImport_GetModuleAttrString("functools", "lru_cache"); if (cache == NULL) { Py_DECREF(maxsize); @@ -124,35 +35,39 @@ fnmatchmodule_load_escapefunc(PyObject *Py_UNUSED(module), PyObject *wrapper = PyObject_CallOneArg(cache, maxsize); Py_DECREF(maxsize); Py_DECREF(cache); - if (wrapper == NULL) { + CHECK_NOT_NULL_OR_ABORT(wrapper); + PyObject *wrapped = _PyImport_GetModuleAttrString("re", "escape"); + if (wrapped == NULL) { + Py_DECREF(wrapper); return -1; } - assert(st->re_module != NULL); - PyObject *wrapped = PyObject_GetAttr(st->re_module, &_Py_ID(escape)); - // reference on 'escapechar' will be removed upon module cleanup st->re_escape = PyObject_CallOneArg(wrapper, wrapped); Py_DECREF(wrapped); Py_DECREF(wrapper); - if (st->re_escape == NULL) { - return -1; - } + CHECK_NOT_NULL_OR_ABORT(st->re_escape); return 0; +abort: + return -1; } // ==== Cached re.sub() unit for set operation tokens ========================= -/* Create an LRU-cached function for re.compile('([&~|])').sub(). */ +/* Store a reference to re.compile('([&~|])').sub(). 
*/ static int fnmatchmodule_load_setops_re_sub(PyObject *Py_UNUSED(module), fnmatchmodule_state *st) { // make sure that this function is called once assert(st->setops_re_subfn == NULL); - PyObject *pattern = PyUnicode_FromString("([&~|])"); + PyObject *pattern = PyUnicode_FromStringAndSize("([&~|])", 7); CHECK_NOT_NULL_OR_ABORT(pattern); - PyObject *compiled = PyObject_CallMethodOneArg(st->re_module, - &_Py_ID(compile), - pattern); + PyObject *re_compile = _PyImport_GetModuleAttrString("re", "compile"); + if (re_compile == NULL) { + Py_DECREF(pattern); + return -1; + } + PyObject *compiled = PyObject_CallOneArg(re_compile, pattern); + Py_DECREF(re_compile); Py_DECREF(pattern); CHECK_NOT_NULL_OR_ABORT(compiled); st->setops_re_subfn = PyObject_GetAttr(compiled, &_Py_ID(sub)); @@ -163,56 +78,20 @@ fnmatchmodule_load_setops_re_sub(PyObject *Py_UNUSED(module), return -1; } -// ==== Module data getters =================================================== - -static inline PyObject * /* reference to re.compile(pattern).match() */ -get_matcher_function(PyObject *module, PyObject *pattern) -{ - fnmatchmodule_state *st = get_fnmatchmodule_state(module); - assert(st->translator != NULL); - return PyObject_CallOneArg(st->translator, pattern); -} - -static inline PyObject * /* reference to os.path.normcase() */ -get_platform_normcase_function(PyObject *module) -{ - fnmatchmodule_state *st = get_fnmatchmodule_state(module); - PyObject *os_path = PyObject_GetAttr(st->os_module, &_Py_ID(path)); - if (os_path == NULL) { - return NULL; - } - PyObject *normcase = PyObject_GetAttr(os_path, &_Py_ID(normcase)); - Py_DECREF(os_path); - return normcase; -} - // ==== Module state functions ================================================ static int fnmatchmodule_exec(PyObject *module) { // ---- def local macros -------------------------------------------------- - /* Import a named module and store it in 'STATE->ATTRIBUTE'. 
*/ -#define IMPORT_MODULE(STATE, ATTRIBUTE, MODULE_NAME) \ - do { \ - /* make sure that the attribute is initialized once */ \ - assert(STATE->ATTRIBUTE == NULL); \ - STATE->ATTRIBUTE = PyImport_ImportModule((MODULE_NAME)); \ - CHECK_NOT_NULL_OR_ABORT(STATE->ATTRIBUTE); \ - } while (0) /* Intern a literal STRING and store it in 'STATE->ATTRIBUTE'. */ #define INTERN_STRING(STATE, ATTRIBUTE, STRING) \ do { \ - /* make sure that the attribute is initialized once */ \ - assert(STATE->ATTRIBUTE == NULL); \ STATE->ATTRIBUTE = PyUnicode_InternFromString((STRING)); \ CHECK_NOT_NULL_OR_ABORT(STATE->ATTRIBUTE); \ } while (0) // ------------------------------------------------------------------------ fnmatchmodule_state *st = get_fnmatchmodule_state(module); - IMPORT_MODULE(st, os_module, "os"); - IMPORT_MODULE(st, re_module, "re"); - CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_translator(module, st)); CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_escapefunc(module, st)); INTERN_STRING(st, hyphen_str, "-"); INTERN_STRING(st, hyphen_esc_str, "\\-"); @@ -224,7 +103,6 @@ fnmatchmodule_exec(PyObject *module) abort: return -1; #undef INTERN_STRING -#undef IMPORT_MODULE } static int @@ -238,9 +116,6 @@ fnmatchmodule_traverse(PyObject *m, visitproc visit, void *arg) Py_VISIT(st->hyphen_esc_str); Py_VISIT(st->hyphen_str); Py_VISIT(st->re_escape); - Py_VISIT(st->translator); - Py_VISIT(st->re_module); - Py_VISIT(st->os_module); return 0; } @@ -255,9 +130,6 @@ fnmatchmodule_clear(PyObject *m) Py_CLEAR(st->hyphen_esc_str); Py_CLEAR(st->hyphen_str); Py_CLEAR(st->re_escape); - Py_CLEAR(st->translator); - Py_CLEAR(st->re_module); - Py_CLEAR(st->os_module); return 0; } @@ -272,117 +144,6 @@ module fnmatch [clinic start generated code]*/ /*[clinic end generated code: output=da39a3ee5e6b4b0d input=797aa965370a9ef2]*/ -/*[clinic input] -fnmatch.filter -> object - - names: object - pat as pattern: object - -Construct a list from the names in *names* matching *pat*. 
- -[clinic start generated code]*/ - -static PyObject * -fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern) -/*[clinic end generated code: output=1a68530a2e3cf7d0 input=7ac729daad3b1404]*/ -{ - PyObject *normcase = NULL; // for the 'goto abort' statements - normcase = get_platform_normcase_function(module); - CHECK_NOT_NULL_OR_ABORT(normcase); - PyObject *normalized_pattern = PyObject_CallOneArg(normcase, pattern); - CHECK_NOT_NULL_OR_ABORT(normalized_pattern); - // the matcher is cached with respect to the *normalized* pattern - PyObject *matcher = get_matcher_function(module, normalized_pattern); - Py_DECREF(normalized_pattern); - CHECK_NOT_NULL_OR_ABORT(matcher); - PyObject *filtered = _Py_fnmatch_filter(matcher, names, normcase); - Py_DECREF(matcher); - Py_DECREF(normcase); - return filtered; -abort: - Py_XDECREF(normcase); - return NULL; -} - -/*[clinic input] -fnmatch.fnmatch -> bool - - name: object - pat as pattern: object - -Test whether *name* matches *pat*. - -Patterns are Unix shell style: - -* matches everything -? matches any single character -[seq] matches any character in seq -[!seq] matches any char not in seq - -An initial period in *name* is not special. -Both *name* and *pat* are first case-normalized -if the operating system requires it. - -If you don't want this, use fnmatchcase(name, pat). 
- -[clinic start generated code]*/ - -static int -fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pattern) -/*[clinic end generated code: output=c9dc542e8d6933b6 input=279a4a4f2ddea6a2]*/ -{ - PyObject *normcase = get_platform_normcase_function(module); - if (normcase == NULL) { - return -1; - } - // apply case normalization on both arguments - PyObject *norm_name = PyObject_CallOneArg(normcase, name); - if (norm_name == NULL) { - Py_DECREF(normcase); - return -1; - } - PyObject *norm_pattern = PyObject_CallOneArg(normcase, pattern); - Py_DECREF(normcase); - if (norm_pattern == NULL) { - Py_DECREF(norm_name); - return -1; - } - int matching = fnmatch_fnmatchcase_impl(module, norm_name, norm_pattern); - Py_DECREF(norm_pattern); - Py_DECREF(norm_name); - return matching; -} - -/*[clinic input] -fnmatch.fnmatchcase -> bool - - name: object - pat as pattern: object - -Test whether *name* matches *pat*, including case. - -This is a version of fnmatch() which doesn't case-normalize -its arguments. -[clinic start generated code]*/ - -static int -fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pattern) -/*[clinic end generated code: output=4d6b268169001876 input=91d62999c08fd55e]*/ -{ - // fnmatchcase() does not apply any case normalization on the inputs - PyObject *matcher = get_matcher_function(module, pattern); - if (matcher == NULL) { - return -1; - } - // If 'name' is of incorrect type, it will be detected when calling - // the matcher function (we check 're.compile(pattern).match(name)'). - PyObject *match = PyObject_CallOneArg(matcher, name); - Py_DECREF(matcher); - int matching = match == NULL ? 
-1 : !Py_IsNone(match); - Py_XDECREF(match); - return matching; -} - /*[clinic input] fnmatch.translate -> object @@ -422,21 +183,7 @@ fnmatch_translate_impl(PyObject *module, PyObject *pattern) // ==== Module specs ========================================================== -// fmt: off -PyDoc_STRVAR(fnmatchmodule_doc, -"Filename matching with shell patterns.\n" -"fnmatch(FILENAME, PATTERN) matches according to the local convention.\n" -"fnmatchcase(FILENAME, PATTERN) always takes case in account.\n\n" -"The functions operate by translating the pattern into a regular\n" -"expression. They cache the compiled regular expressions for speed.\n\n" -"The function translate(PATTERN) returns a regular expression\n" -"corresponding to PATTERN. (It does not compile it.)"); -// fmt: on - static PyMethodDef fnmatchmodule_methods[] = { - FNMATCH_FILTER_METHODDEF - FNMATCH_FNMATCH_METHODDEF - FNMATCH_FNMATCHCASE_METHODDEF FNMATCH_TRANSLATE_METHODDEF {NULL, NULL} }; @@ -451,7 +198,7 @@ static struct PyModuleDef_Slot fnmatchmodule_slots[] = { static struct PyModuleDef _fnmatchmodule = { PyModuleDef_HEAD_INIT, .m_name = "_fnmatch", - .m_doc = fnmatchmodule_doc, + .m_doc = NULL, .m_size = sizeof(fnmatchmodule_state), .m_methods = fnmatchmodule_methods, .m_slots = fnmatchmodule_slots, @@ -467,4 +214,4 @@ PyInit__fnmatch(void) } #undef INVALID_PATTERN_TYPE -#undef COMPILED_CACHE_SIZE +#undef LRU_CACHE_SIZE diff --git a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h index c611f01673b326..38129540d37433 100644 --- a/Modules/_fnmatch/clinic/_fnmatchmodule.c.h +++ b/Modules/_fnmatch/clinic/_fnmatchmodule.c.h @@ -8,203 +8,6 @@ preserve #endif #include "pycore_modsupport.h" // _PyArg_UnpackKeywords() -PyDoc_STRVAR(fnmatch_filter__doc__, -"filter($module, /, names, pat)\n" -"--\n" -"\n" -"Construct a list from the names in *names* matching *pat*."); - -#define FNMATCH_FILTER_METHODDEF \ - {"filter", _PyCFunction_CAST(fnmatch_filter), 
METH_FASTCALL|METH_KEYWORDS, fnmatch_filter__doc__}, - -static PyObject * -fnmatch_filter_impl(PyObject *module, PyObject *names, PyObject *pattern); - -static PyObject * -fnmatch_filter(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *return_value = NULL; - #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - - #define NUM_KEYWORDS 2 - static struct { - PyGC_Head _this_is_not_used; - PyObject_VAR_HEAD - PyObject *ob_item[NUM_KEYWORDS]; - } _kwtuple = { - .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(names), &_Py_ID(pat), }, - }; - #undef NUM_KEYWORDS - #define KWTUPLE (&_kwtuple.ob_base.ob_base) - - #else // !Py_BUILD_CORE - # define KWTUPLE NULL - #endif // !Py_BUILD_CORE - - static const char * const _keywords[] = {"names", "pat", NULL}; - static _PyArg_Parser _parser = { - .keywords = _keywords, - .fname = "filter", - .kwtuple = KWTUPLE, - }; - #undef KWTUPLE - PyObject *argsbuf[2]; - PyObject *names; - PyObject *pattern; - - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); - if (!args) { - goto exit; - } - names = args[0]; - pattern = args[1]; - return_value = fnmatch_filter_impl(module, names, pattern); - -exit: - return return_value; -} - -PyDoc_STRVAR(fnmatch_fnmatch__doc__, -"fnmatch($module, /, name, pat)\n" -"--\n" -"\n" -"Test whether *name* matches *pat*.\n" -"\n" -"Patterns are Unix shell style:\n" -"\n" -"* matches everything\n" -"? 
matches any single character\n" -"[seq] matches any character in seq\n" -"[!seq] matches any char not in seq\n" -"\n" -"An initial period in *name* is not special.\n" -"Both *name* and *pat* are first case-normalized\n" -"if the operating system requires it.\n" -"\n" -"If you don\'t want this, use fnmatchcase(name, pat)."); - -#define FNMATCH_FNMATCH_METHODDEF \ - {"fnmatch", _PyCFunction_CAST(fnmatch_fnmatch), METH_FASTCALL|METH_KEYWORDS, fnmatch_fnmatch__doc__}, - -static int -fnmatch_fnmatch_impl(PyObject *module, PyObject *name, PyObject *pattern); - -static PyObject * -fnmatch_fnmatch(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *return_value = NULL; - #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - - #define NUM_KEYWORDS 2 - static struct { - PyGC_Head _this_is_not_used; - PyObject_VAR_HEAD - PyObject *ob_item[NUM_KEYWORDS]; - } _kwtuple = { - .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, - }; - #undef NUM_KEYWORDS - #define KWTUPLE (&_kwtuple.ob_base.ob_base) - - #else // !Py_BUILD_CORE - # define KWTUPLE NULL - #endif // !Py_BUILD_CORE - - static const char * const _keywords[] = {"name", "pat", NULL}; - static _PyArg_Parser _parser = { - .keywords = _keywords, - .fname = "fnmatch", - .kwtuple = KWTUPLE, - }; - #undef KWTUPLE - PyObject *argsbuf[2]; - PyObject *name; - PyObject *pattern; - int _return_value; - - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); - if (!args) { - goto exit; - } - name = args[0]; - pattern = args[1]; - _return_value = fnmatch_fnmatch_impl(module, name, pattern); - if ((_return_value == -1) && PyErr_Occurred()) { - goto exit; - } - return_value = PyBool_FromLong((long)_return_value); - -exit: - return return_value; -} - -PyDoc_STRVAR(fnmatch_fnmatchcase__doc__, -"fnmatchcase($module, /, name, pat)\n" -"--\n" -"\n" -"Test whether *name* matches *pat*, including case.\n" 
-"\n" -"This is a version of fnmatch() which doesn\'t case-normalize\n" -"its arguments."); - -#define FNMATCH_FNMATCHCASE_METHODDEF \ - {"fnmatchcase", _PyCFunction_CAST(fnmatch_fnmatchcase), METH_FASTCALL|METH_KEYWORDS, fnmatch_fnmatchcase__doc__}, - -static int -fnmatch_fnmatchcase_impl(PyObject *module, PyObject *name, PyObject *pattern); - -static PyObject * -fnmatch_fnmatchcase(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *return_value = NULL; - #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - - #define NUM_KEYWORDS 2 - static struct { - PyGC_Head _this_is_not_used; - PyObject_VAR_HEAD - PyObject *ob_item[NUM_KEYWORDS]; - } _kwtuple = { - .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(name), &_Py_ID(pat), }, - }; - #undef NUM_KEYWORDS - #define KWTUPLE (&_kwtuple.ob_base.ob_base) - - #else // !Py_BUILD_CORE - # define KWTUPLE NULL - #endif // !Py_BUILD_CORE - - static const char * const _keywords[] = {"name", "pat", NULL}; - static _PyArg_Parser _parser = { - .keywords = _keywords, - .fname = "fnmatchcase", - .kwtuple = KWTUPLE, - }; - #undef KWTUPLE - PyObject *argsbuf[2]; - PyObject *name; - PyObject *pattern; - int _return_value; - - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 2, 2, 0, argsbuf); - if (!args) { - goto exit; - } - name = args[0]; - pattern = args[1]; - _return_value = fnmatch_fnmatchcase_impl(module, name, pattern); - if ((_return_value == -1) && PyErr_Occurred()) { - goto exit; - } - return_value = PyBool_FromLong((long)_return_value); - -exit: - return return_value; -} - PyDoc_STRVAR(fnmatch_translate__doc__, "translate($module, /, pat)\n" "--\n" @@ -261,4 +64,4 @@ fnmatch_translate(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyO exit: return return_value; } -/*[clinic end generated code: output=50f858ef4bfb569a input=a9049054013a1b77]*/ +/*[clinic end generated code: output=eab39d3bb9f3a13d 
input=a9049054013a1b77]*/ diff --git a/Modules/_fnmatch/filter.c b/Modules/_fnmatch/filter.c deleted file mode 100644 index bd1d6c8ec85073..00000000000000 --- a/Modules/_fnmatch/filter.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Provide the implementation of the high-level matcher-based functions. - */ - -#include "Python.h" - -PyObject * -_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normcase) -{ - assert(normcase != NULL); - PyObject *iter = PyObject_GetIter(names); - if (iter == NULL) { - return NULL; - } - PyObject *res = PyList_New(0); - if (res == NULL) { - Py_DECREF(iter); - return NULL; - } - PyObject *name = NULL; - while ((name = PyIter_Next(iter))) { - PyObject *normalized = PyObject_CallOneArg(normcase, name); - if (normalized == NULL) { - goto abort; - } - PyObject *match = PyObject_CallOneArg(matcher, normalized); - Py_DECREF(normalized); - if (match == NULL) { - goto abort; - } - int matching = Py_IsNone(match) == 0; - Py_DECREF(match); - if (matching && PyList_Append(res, name) < 0) { - goto abort; - } - Py_DECREF(name); - } - Py_DECREF(iter); - if (PyErr_Occurred()) { - Py_CLEAR(res); - } - return res; -abort: - Py_DECREF(name); - Py_DECREF(iter); - Py_DECREF(res); - return NULL; -} diff --git a/Modules/_fnmatch/macros.h b/Modules/_fnmatch/macros.h index a39586338ea62a..2363e1b8051ff8 100644 --- a/Modules/_fnmatch/macros.h +++ b/Modules/_fnmatch/macros.h @@ -6,8 +6,6 @@ #ifndef _FNMATCH_MACROS_H #define _FNMATCH_MACROS_H -// ==== Macro definitions ===================================================== - /* * Check that STATUS is >= 0 or execute 'goto abort'. * @@ -116,17 +114,13 @@ NULL \ ) -/* - * Escape set operations in STRING using re.sub(). - * - * SETOPS_RE_SUB_METH is a reference to re.compile('([&~|])').sub(). 
- */ -#define SETOPS_REPLACE(STATE, STRING, SETOPS_RE_SUB_METH) \ - PyObject_CallFunctionObjArgs( \ - (SETOPS_RE_SUB_METH), \ - (STATE)->setops_repl_str, \ - (STRING), \ - NULL \ +/* Escape set operations in STRING using re.sub(). */ +#define SETOPS_REPLACE(STATE, STRING) \ + PyObject_CallFunctionObjArgs( \ + (STATE)->setops_re_subfn, \ + (STATE)->setops_repl_str, \ + (STRING), \ + NULL \ ) #endif // _FNMATCH_MACROS_H diff --git a/Modules/_fnmatch/translate.c b/Modules/_fnmatch/translate.c index 3dbd0d59d094d2..ef2d2e43f4b3b1 100644 --- a/Modules/_fnmatch/translate.c +++ b/Modules/_fnmatch/translate.c @@ -70,32 +70,13 @@ _Py_fnmatch_translate(PyObject *module, PyObject *pattern) assert(PyUnicode_Check(pattern)); fnmatchmodule_state *state = get_fnmatchmodule_state(module); const Py_ssize_t maxind = PyUnicode_GET_LENGTH(pattern); - - // We would write less data if there are successive '*', - // which usually happens once or twice in the pattern. - // Otherwise, we write >= maxind characters since escaping - // them always add more characters. - // - // Note that only '()[]{}?*+-|^$\\.&~# \t\n\r\v\f' need to - // be escaped when translated to RE patterns and '*' and '?' - // are already handled without being escaped. - // - // In general, UNIX style patterns are more likely to contain - // wildcards than characters to be escaped, with the exception - // of '-', '\' and '~' (we usually want to match filenmaes), - // and there is a sparse number of them. Therefore, we only - // estimate the number of characters to be written to be the - // same as the number of characters in the pattern. 
PyUnicodeWriter *writer = PyUnicodeWriter_Create(maxind); if (writer == NULL) { return NULL; } - // ---- decl local objects ------------------------------------------------ - // list containing the indices where '*' has a special meaning - PyObject *wildcard_indices = NULL; - // call-level cached functions - PyObject *pattern_str_find_meth = NULL; // pattern.find() + PyObject *wildcard_indices = NULL; // positions of stars + PyObject *pattern_str_find_meth = NULL; // cached pattern.find() // ---- def local objects ------------------------------------------------- wildcard_indices = PyList_New(0); CHECK_NOT_NULL_OR_ABORT(wildcard_indices); @@ -251,8 +232,6 @@ escape_char(fnmatchmodule_state *state, PyUnicodeWriter *writer, Py_UCS4 ch) * Extract a list of chunks from the pattern group described by start and stop. * * For instance, the chunks for [a-z0-9] or [!a-z0-9] are ['a', 'z0', '9']. - * - * See translate_expression() for its usage. */ static PyObject * split_expression(fnmatchmodule_state *state, @@ -307,11 +286,7 @@ split_expression(fnmatchmodule_state *state, Py_ssize_t chunkscount = PyList_GET_SIZE(chunks); assert(chunkscount > 0); PyObject *chunk = PyList_GET_ITEM(chunks, chunkscount - 1); - assert(chunk != NULL); PyObject *str = PyUnicode_Concat(chunk, hyphen); - // PyList_SetItem() does not create a new reference on 'str' - // so we should not decref 'str' after the call, unless there - // is an issue while setting the item. if (str == NULL || PyList_SetItem(chunks, chunkscount - 1, str) < 0) { Py_XDECREF(str); goto abort; @@ -326,25 +301,17 @@ split_expression(fnmatchmodule_state *state, return NULL; } -/* - * Remove empty ranges (they are invalid in RE). - * - * See translate_expression() for its usage. - */ +/* Remove empty ranges (they are invalid in RE). 
*/ static int simplify_expression(PyObject *chunks) { // for k in range(len(chunks) - 1, 0, -1): for (Py_ssize_t k = PyList_GET_SIZE(chunks) - 1; k > 0; --k) { PyObject *c1 = PyList_GET_ITEM(chunks, k - 1); - assert(c1 != NULL); Py_ssize_t c1len = PyUnicode_GET_LENGTH(c1); - assert(c1len > 0); PyObject *c2 = PyList_GET_ITEM(chunks, k); - assert(c2 != NULL); Py_ssize_t c2len = PyUnicode_GET_LENGTH(c2); - assert(c2len > 0); if (PyUnicode_READ_CHAR(c1, c1len - 1) > PyUnicode_READ_CHAR(c2, 0)) { Py_ssize_t olen = c1len + c2len - 2; @@ -352,19 +319,14 @@ simplify_expression(PyObject *chunks) PyObject *str = NULL; if (olen == 0) { // c1[:1] + c2[1:] == '' str = Py_GetConstant(Py_CONSTANT_EMPTY_STR); - assert(_Py_IsImmortal(str)); } else if (c1len == 1) { // c1[:1] + c2[1:] == c2[1:] - assert(c2len > 1); str = PyUnicode_Substring(c2, 1, c2len); } else if (c2len == 1) { // c1[:1] + c2[1:] == c1[:1] - assert(c1len > 1); str = PyUnicode_Substring(c1, 0, c1len - 1); } else { - assert(c1len > 1); - assert(c2len > 1); PyUnicodeWriter *writer = PyUnicodeWriter_Create(olen); CHECK_NOT_NULL_OR_ABORT(writer); // all but the last character in the first chunk @@ -379,9 +341,6 @@ simplify_expression(PyObject *chunks) } str = PyUnicodeWriter_Finish(writer); } - // PyList_SetItem() does not create a new reference on 'str' - // so we should not decref 'str' after the call, unless there - // is an issue while setting the item. if (str == NULL || PyList_SetItem(chunks, k - 1, str) < 0) { Py_XDECREF(str); goto abort; @@ -394,26 +353,17 @@ simplify_expression(PyObject *chunks) return -1; } -/* - * Escape backslashes and hyphens for set difference (--), - * but hyphens that create ranges should not be escaped. - * - * See translate_expression() for its usage. - */ +/* Escape backslashes and hyphens for set difference (--). 
*/ static int escape_expression(fnmatchmodule_state *state, PyObject *chunks) { - for (Py_ssize_t c = 0; c < PyList_GET_SIZE(chunks); ++c) { - PyObject *s0 = PyList_GET_ITEM(chunks, c); - assert(s0 != NULL); - PyObject *s1 = BACKSLASH_REPLACE(state, s0); + for (Py_ssize_t i = 0; i < PyList_GET_SIZE(chunks); ++i) { + PyObject *chunk = PyList_GET_ITEM(chunks, i); + PyObject *s1 = BACKSLASH_REPLACE(state, chunk); CHECK_NOT_NULL_OR_ABORT(s1); PyObject *s2 = HYPHEN_REPLACE(state, s1); Py_DECREF(s1); - // PyList_SetItem() does not create a new reference on 's2' - // so we should not decref 's2' after the call, unless there - // is an issue while setting the item. - if (s2 == NULL || PyList_SetItem(chunks, c, s2) < 0) { + if (s2 == NULL || PyList_SetItem(chunks, i, s2) < 0) { Py_XDECREF(s2); goto abort; } @@ -431,9 +381,7 @@ translate_expression(fnmatchmodule_state *state, PyObject *chunks = split_expression(state, pattern, start, stop, pattern_str_find_meth); CHECK_NOT_NULL_OR_ABORT(chunks); - // remove empty ranges CHECK_RET_CODE_OR_ABORT(simplify_expression(chunks)); - // escape backslashes and set differences CHECK_RET_CODE_OR_ABORT(escape_expression(state, chunks)); PyObject *res = PyUnicode_Join(state->hyphen_str, chunks); Py_DECREF(chunks); @@ -463,7 +411,7 @@ write_expression(fnmatchmodule_state *state, Py_ssize_t extra = 2; // '[' and ']' WRITE_CHAR_OR_ABORT(writer, '['); // escape set operations as late as possible - safe_expression = SETOPS_REPLACE(state, expression, state->setops_re_subfn); + safe_expression = SETOPS_REPLACE(state, expression); CHECK_NOT_NULL_OR_ABORT(safe_expression); switch (token) { case '!': { @@ -520,27 +468,21 @@ process_wildcards(PyObject *pattern, PyObject *indices) if (writer == NULL) { return NULL; } - // ---- def local macros -------------------------------------------------- -#define LOAD_WILDCARD_INDEX(VAR, IND) \ - do { \ - VAR = PyLong_AsSsize_t(PyList_GET_ITEM(indices, (IND))); \ - /* wildcard indices must be >= 0 */ \ - 
CHECK_UNSIGNED_INT_OR_ABORT(VAR); \ - } while (0) - // ------------------------------------------------------------------------ WRITE_ASCII_OR_ABORT(writer, "(?s:", 4); if (m == 0) { WRITE_STRING_OR_ABORT(writer, pattern); } else { - Py_ssize_t i = 0, j = -1; + Py_ssize_t i = 0; // process the optional PREFIX - LOAD_WILDCARD_INDEX(j, 0); + Py_ssize_t j = PyLong_AsSsize_t(PyList_GET_ITEM(indices, 0)); + CHECK_UNSIGNED_INT_OR_ABORT(j); WRITE_SUBSTRING_OR_ABORT(writer, pattern, i, j); i = j + 1; for (Py_ssize_t k = 1; k < m; ++k) { // process the (* INNER) groups - LOAD_WILDCARD_INDEX(j, k); + j = PyLong_AsSsize_t(PyList_GET_ITEM(indices, k)); + CHECK_UNSIGNED_INT_OR_ABORT(j); assert(i < j); // write the atomic RE group '(?>.*?' + INNER + ')' WRITE_ASCII_OR_ABORT(writer, "(?>.*?", 6); @@ -559,5 +501,4 @@ process_wildcards(PyObject *pattern, PyObject *indices) abort: PyUnicodeWriter_Discard(writer); return NULL; -#undef LOAD_WILDCARD_INDEX } diff --git a/Modules/_fnmatch/util.h b/Modules/_fnmatch/util.h index 276921328dd868..8f598fa66f7c1d 100644 --- a/Modules/_fnmatch/util.h +++ b/Modules/_fnmatch/util.h @@ -8,10 +8,6 @@ #include "Python.h" typedef struct { - PyObject *os_module; // import os - PyObject *re_module; // import re - - PyObject *translator; // LRU-cached translation unit PyObject *re_escape; // LRU-cached re.escape() function // strings used by translate.c @@ -36,22 +32,6 @@ get_fnmatchmodule_state(PyObject *module) // ==== Helper prototypes ===================================================== -/* - * Returns a list of matched names, or NULL if an error occurred. - * - * Parameters - * - * matcher A reference to the 'match()' method of a compiled pattern. - * names An iterable of strings (str or bytes objects) to match. - * normcase A reference to os.path.normcase(). 
- * - * This is equivalent to: - * - * [name for name in names if matcher(normcase(name))] - */ -extern PyObject * -_Py_fnmatch_filter(PyObject *matcher, PyObject *names, PyObject *normcase); - /* * C accelerator for translating UNIX shell patterns into RE patterns. * diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 2083072f6cf8cf..20141f370bc7a4 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -477,7 +477,6 @@ - diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 301030d50b5733..94de5f38778401 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -1070,9 +1070,6 @@ Modules\_fnmatch - - Modules\_fnmatch - Modules\_fnmatch From 5c37da720f06ea70974223437e42a0657b7fb533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:28:30 +0200 Subject: [PATCH 95/97] remove legacy tests --- Lib/test/test_fnmatch.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 034324139511bb..186f4eb81dee83 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -370,15 +370,6 @@ def test_filter(self): self.assertEqual(filter([b'Python', b'Ruby', b'Perl', b'Tcl'], b'P*'), [b'Python', b'Perl']) - def test_filter_iter_errors(self): - class BadList: - def __iter__(self): - yield 'abc' - raise ValueError("nope") - - with self.assertRaisesRegex(ValueError, r'^nope$'): - self.fnmatch.filter(BadList(), '*') - def test_mix_bytes_str(self): filter = self.fnmatch.filter self.assertRaises(TypeError, filter, ['test'], b'*') From 79fb2f2df098bd11f44186f7bd60db38e73fabee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:28:33 +0200 Subject: [PATCH 96/97] update NEWS --- 
.../Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst b/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst index f374f28456d65d..e310ca0a76bc0d 100644 --- a/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst +++ b/Misc/NEWS.d/next/Library/2024-07-12-09-24-38.gh-issue-121445.KYtNOZ.rst @@ -1,2 +1,2 @@ -Improve the performances of :func:`fnmatch.translate` by 2x and of -:func:`fnmatch.filter` by 1.1x. Patch by Bénédikt Tran. +Improve the performances of :func:`fnmatch.translate` by a factor 7. +Patch by Bénédikt Tran. From 6e9879f5bce6718138a067dcffada80022919aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:41:17 +0200 Subject: [PATCH 97/97] cleanup --- Modules/_fnmatch/_fnmatchmodule.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Modules/_fnmatch/_fnmatchmodule.c b/Modules/_fnmatch/_fnmatchmodule.c index 1ae44424a7ffcd..9b3413cf3f233a 100644 --- a/Modules/_fnmatch/_fnmatchmodule.c +++ b/Modules/_fnmatch/_fnmatchmodule.c @@ -20,8 +20,7 @@ /* Create an LRU-cached function for re.escape(). */ static int -fnmatchmodule_load_escapefunc(PyObject *Py_UNUSED(module), - fnmatchmodule_state *st) +fnmatchmodule_load_escapefunc(fnmatchmodule_state *st) { // make sure that this function is called once assert(st->re_escape == NULL); @@ -54,8 +53,7 @@ fnmatchmodule_load_escapefunc(PyObject *Py_UNUSED(module), /* Store a reference to re.compile('([&~|])').sub(). 
*/ static int -fnmatchmodule_load_setops_re_sub(PyObject *Py_UNUSED(module), - fnmatchmodule_state *st) +fnmatchmodule_load_setops_re_sub(fnmatchmodule_state *st) { // make sure that this function is called once assert(st->setops_re_subfn == NULL); @@ -92,17 +90,17 @@ fnmatchmodule_exec(PyObject *module) } while (0) // ------------------------------------------------------------------------ fnmatchmodule_state *st = get_fnmatchmodule_state(module); - CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_escapefunc(module, st)); + CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_escapefunc(st)); INTERN_STRING(st, hyphen_str, "-"); INTERN_STRING(st, hyphen_esc_str, "\\-"); INTERN_STRING(st, backslash_str, "\\"); INTERN_STRING(st, backslash_esc_str, "\\\\"); - CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_setops_re_sub(module, st)); + CHECK_RET_CODE_OR_ABORT(fnmatchmodule_load_setops_re_sub(st)); INTERN_STRING(st, setops_repl_str, "\\\\\\1"); +#undef INTERN_STRING return 0; abort: return -1; -#undef INTERN_STRING } static int