diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 8be16c8da926bf1..8792df123b49793 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -17,6 +17,37 @@ from test import support, string_tests from test.support.script_helper import assert_python_failure +# First invalid code point past the valid range +INVALID_CODEPOINT = sys.maxunicode + 1 + +# Characters in the ASCII range (U+0000-U+007f) +ASCII_CHAR = "a" +ASCII_LAST_CHAR = "\x7f" +# Characters in the UCS1 ("latin1") range (U+0000-U+00ff), +# but not in the ASCII range +UCS1_CHAR = "\xe9" +UCS1_LAST_CHAR = "\xff" +# Characters in UCS2 ("BMP") range (U+0000-U+FFFF), +# but not in the UCS1 range +UCS2_CHAR = "\u20ac" # euro sign +UCS2_LAST_CHAR = "\uffff" +# Characters in UCS4 ("Astral") range (U+0000-U+10FFFF), +# but not in the UCS2 range +UCS4_CHAR = '\U0001F355' # slice of pizza +UCS4_LAST_CHAR = chr(sys.maxunicode) + +# Test characters of the ASCII range (U+0000-U+007f) +ASCII_CHARS = ("\x00", ASCII_CHAR, ASCII_LAST_CHAR) +# Test characters of the UCS1 range (U+0000-U+00ff), but not in ASCII range +UCS1_ONLY_CHARS = ("\x80", UCS1_CHAR, UCS1_LAST_CHAR) +# Test characters of the UCS1 range (U+0000-U+00ff) +UCS1_CHARS = ASCII_CHARS + UCS1_ONLY_CHARS +# Test characters of the Basic Multilingual Plane (U+0000-U+ffff) +BMP_CHARS = UCS1_CHARS + ("\u0100", UCS2_CHAR, UCS2_LAST_CHAR) +# Test characters of the full Unicode Character Set (U+0000-U+10ffff) +FULL_UCS_CHARS = BMP_CHARS + ('\U00010000', UCS4_CHAR, UCS4_LAST_CHAR) + + # Error handling (bad decoder return) def search_function(encoding): def decode1(input, errors="strict"): @@ -78,7 +109,7 @@ def test_literals(self): self.assertEqual('\uffff', '\U0000ffff') self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'') self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'') - self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000) + self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % INVALID_CODEPOINT) # raw strings should 
not have unicode escapes self.assertNotEqual(r"\u0020", " ") @@ -341,12 +372,13 @@ def test_maketrans_translate(self): "[]") self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})), "[XXX]") - self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})), - "[\xe9]") self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})), "x123") - self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})), - "x\xe9") + for ch in UCS1_ONLY_CHARS: + self.assertEqual("[a]".translate(str.maketrans({'a': ch})), + f"[{ch}]") + self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': ch})), + f"x{ch}") # test non-ASCII (don't take the fast-path) self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})), @@ -361,9 +393,8 @@ def test_maketrans_translate(self): "[<\u20ac>\xe9]") # invalid Unicode characters - invalid_char = 0x10ffff+1 - for before in "a\xe9\u20ac\U0010ffff": - mapping = str.maketrans({before: invalid_char}) + for before in FULL_UCS_CHARS: + mapping = str.maketrans({before: INVALID_CODEPOINT}) text = "[%s]" % before self.assertRaises(ValueError, text.translate, mapping) @@ -642,8 +673,9 @@ def test_isalpha(self): def test_isascii(self): super().test_isascii() - self.assertFalse("\u20ac".isascii()) - self.assertFalse("\U0010ffff".isascii()) + for ch in FULL_UCS_CHARS: + self.assertEqual(ch.isascii(), ord(ch) < 0x80, + hex(ord(ch))) def test_isdecimal(self): self.checkequalnofix(False, '', 'isdecimal') @@ -861,12 +893,15 @@ def test_swapcase(self): def test_center(self): string_tests.CommonTest.test_center(self) - self.assertEqual('x'.center(2, '\U0010FFFF'), - 'x\U0010FFFF') - self.assertEqual('x'.center(3, '\U0010FFFF'), - '\U0010FFFFx\U0010FFFF') - self.assertEqual('x'.center(4, '\U0010FFFF'), - '\U0010FFFFx\U0010FFFF\U0010FFFF') + for ch1 in FULL_UCS_CHARS: + for ch2 in FULL_UCS_CHARS: + with self.subTest(ch1=ch1, ch2=ch2): + self.assertEqual(ch1.center(2, ch2), + ch1 + ch2) + self.assertEqual(ch1.center(3, ch2), + ch2 
+ ch1 + ch2) + self.assertEqual(ch1.center(4, ch2), + ch2 + ch1 + ch2 + ch2) @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system") @support.cpython_only @@ -1355,7 +1390,7 @@ def test_formatting(self): self.assertEqual('%c' % 0x1234, '\u1234') self.assertEqual('%c' % 0x21483, '\U00021483') - self.assertRaises(OverflowError, "%c".__mod__, (0x110000,)) + self.assertRaises(OverflowError, "%c".__mod__, (INVALID_CODEPOINT,)) self.assertEqual('%c' % '\U00021483', '\U00021483') self.assertRaises(TypeError, "%c".__mod__, "aa") self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3)) @@ -1392,8 +1427,10 @@ def __str__(self): self.assertEqual('%F' % INF, 'INF') # PEP 393 - self.assertEqual('%.1s' % "a\xe9\u20ac", 'a') - self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9') + text = ''.join(FULL_UCS_CHARS) + for length in range(len(text)): + fmt = f'%.{length}s' + self.assertEqual(fmt % text, text[:length]) #issue 19995 class PseudoInt: @@ -1503,7 +1540,7 @@ def test_constructor(self): 'unicode remains unicode' ) - for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'): + for text in FULL_UCS_CHARS: subclass = StrSubclass(text) self.assertEqual(str(subclass), text) self.assertEqual(len(subclass), len(text)) @@ -2131,7 +2168,7 @@ def test_codecs(self): # UTF-8 must be roundtrip safe for all code points # (except surrogates, which are forbidden). 
u = ''.join(map(chr, list(range(0, 0xd800)) + - list(range(0xe000, 0x110000)))) + list(range(0xe000, sys.maxunicode + 1)))) for encoding in ('utf-8',): self.assertEqual(str(u.encode(encoding),encoding), u) @@ -2221,6 +2258,7 @@ def test_ucs4(self): y = br'\U00100000' x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") self.assertEqual(x, y) + y = br'\U00010000' x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") self.assertEqual(x, y) @@ -2297,7 +2335,7 @@ def test_raiseMemError(self): ascii_struct_size = 24 compact_struct_size = 36 - for char in ('a', '\xe9', '\u20ac', '\U0010ffff'): + for char in FULL_UCS_CHARS: code = ord(char) if code < 0x100: char_size = 1 # sizeof(Py_UCS1) @@ -2363,19 +2401,7 @@ def test_resize(self): def test_compare(self): # Issue #17615 N = 10 - ascii = 'a' * N - ascii2 = 'z' * N - latin = '\x80' * N - latin2 = '\xff' * N - bmp = '\u0100' * N - bmp2 = '\uffff' * N - astral = '\U00100000' * N - astral2 = '\U0010ffff' * N - strings = ( - ascii, ascii2, - latin, latin2, - bmp, bmp2, - astral, astral2) + strings = tuple(ch * N for ch in FULL_UCS_CHARS) for text1, text2 in itertools.combinations(strings, 2): equal = (text1 is text2) self.assertEqual(text1 == text2, equal) @@ -2398,6 +2424,15 @@ def test_compare(self): self.assertTrue(copy1 <= copy2) self.assertTrue(copy2 >= copy2) + ascii = ASCII_CHAR * N + ascii2 = ASCII_LAST_CHAR * N + latin = UCS1_CHAR * N + latin2 = UCS1_LAST_CHAR * N + bmp = UCS2_CHAR * N + bmp2 = UCS2_LAST_CHAR * N + astral = UCS4_CHAR * N + astral2 = UCS4_LAST_CHAR * N + self.assertTrue(ascii < ascii2) self.assertTrue(ascii < latin) self.assertTrue(ascii < bmp) @@ -2536,10 +2571,10 @@ def check_format(expected, format, *args): # test "%c" check_format('\uabcd', b'%c', c_int(0xabcd)) - check_format('\U0010ffff', - b'%c', c_int(0x10ffff)) + check_format(chr(sys.maxunicode), + b'%c', c_int(sys.maxunicode)) with self.assertRaises(OverflowError): - PyUnicode_FromFormat(b'%c', c_int(0x110000)) + 
PyUnicode_FromFormat(b'%c', c_int(INVALID_CODEPOINT)) # Issue #18183 check_format('\U00010000\U00100000', b'%c%c', c_int(0x10000), c_int(0x100000)) @@ -2706,8 +2741,9 @@ def check_format(expected, format, *args): b'%100.80x', c_int(0x123)) # test %A - check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", - b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') + check_format(r"%%A:'abc\x%02x\u%04x\U%08x'" + % (ord(UCS1_CHAR), ord(UCS2_CHAR), ord(UCS4_CHAR)), + b'%%A:%A', 'abc' + UCS1_CHAR + UCS2_CHAR + UCS4_CHAR) # test %V check_format('repr=abc', @@ -2767,7 +2803,7 @@ def test_aswidechar(self): self.assertEqual(size, 7) self.assertEqual(wchar, 'abc\0def\0') - nonbmp = chr(0x10ffff) + nonbmp = UCS4_CHAR if sizeof(c_wchar) == 2: buflen = 3 nchar = 2 @@ -2793,7 +2829,7 @@ def test_aswidecharstring(self): self.assertEqual(size, 7) self.assertEqual(wchar, 'abc\0def\0') - nonbmp = chr(0x10ffff) + nonbmp = UCS4_CHAR if sizeof(c_wchar) == 2: nchar = 2 else: # sizeof(c_wchar) == 4 @@ -2858,8 +2894,8 @@ def test_findchar(self): self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i) str = "!>_ 0xFFFF) && (ch <= 0x10FFFF)); + assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE)); s += 4; if (STRINGLIB_MAX_CHAR <= 0xFFFF || - (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) + (STRINGLIB_MAX_CHAR < MAX_UNICODE && ch > STRINGLIB_MAX_CHAR)) /* Out-of-range */ goto Return; *p++ = ch; diff --git a/Objects/stringlib/find_max_char.h b/Objects/stringlib/find_max_char.h index 8ccbc3094463df3..fc8c405623bd90a 100644 --- a/Objects/stringlib/find_max_char.h +++ b/Objects/stringlib/find_max_char.h @@ -54,7 +54,7 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) #define MAX_CHAR_ASCII 0x7f #define MAX_CHAR_UCS1 0xff #define MAX_CHAR_UCS2 0xffff -#define MAX_CHAR_UCS4 0x10ffff +#define MAX_CHAR_UCS4 MAX_UNICODE Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) diff --git a/Objects/unicodeobject.c 
b/Objects/unicodeobject.c index 5545eae79505a38..59f4f438952d819 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -90,8 +90,9 @@ NOTE: In the interpreter's initialization phase, some globals are currently extern "C" { #endif -/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ +/* Maximum code point of Unicode 12.0: 0x10ffff (1,114,111) */ #define MAX_UNICODE 0x10ffff +#define MAX_UNICODE_RANGE "range(0x110000)" #ifdef Py_DEBUG # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) @@ -469,13 +470,13 @@ unicode_check_encoding_errors(const char *encoding, const char *errors) } -/* The max unicode value is always 0x10FFFF while using the PEP-393 API. +/* The max unicode value is always MAX_UNICODE while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE PyUnicode_GetMax(void) { #ifdef Py_UNICODE_WIDE - return 0x10FFFF; + return MAX_UNICODE; #else /* This is actually an illegal character, so it should not be passed to unichr. 
*/ @@ -2771,7 +2772,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, int ordinal = va_arg(*vargs, int); if (ordinal < 0 || ordinal > MAX_UNICODE) { PyErr_SetString(PyExc_OverflowError, - "character argument not in range(0x110000)"); + "character argument not in " MAX_UNICODE_RANGE); return NULL; } if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) @@ -3209,7 +3210,7 @@ PyUnicode_FromOrdinal(int ordinal) { if (ordinal < 0 || ordinal > MAX_UNICODE) { PyErr_SetString(PyExc_ValueError, - "chr() arg not in range(0x110000)"); + "chr() arg not in " MAX_UNICODE_RANGE); return NULL; } @@ -5562,13 +5563,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s, endinpos = ((const char *)e) - starts; } else { - if (ch < 0x110000) { + if (ch <= MAX_UNICODE) { if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) goto onError; q += 4; continue; } - errmsg = "code point not in range(0x110000)"; + errmsg = "code point not in " MAX_UNICODE_RANGE; startinpos = ((const char *)q) - starts; endinpos = startinpos + 4; } @@ -13677,7 +13678,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, { case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; - case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break; default: Py_UNREACHABLE(); } @@ -14496,7 +14497,7 @@ formatchar(PyObject *v) if (x < 0 || x > MAX_UNICODE) { PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x110000)"); + "%c arg not in " MAX_UNICODE_RANGE); return (Py_UCS4) -1; }