From 317215995037d5d11cecb8e06e6baeab8c16df3b Mon Sep 17 00:00:00 2001 From: Greg Price Date: Tue, 30 Jul 2019 00:21:23 -0700 Subject: [PATCH 1/4] Correct documentation of `str.isspace`. The documented definition was much broader than the real one: there are tons of characters with general category "Other", and we don't (and shouldn't) treat most of them as whitespace. Rewrite the definition to agree with the comment on _PyUnicode_IsWhitespace, and with the logic in makeunicodedata.py, which is what generates that function and so ultimately governs. Add suitable breadcrumbs so that a reader who wants to pin down exactly what this definition means (what's a "bidirectional class" of "B"?) can do so. The `unicodedata` module documentation is an appropriate central place for our references to Unicode's own copious documentation, so point there. Also add to the `isspace` test a thorough check that the implementation agrees with the intended definition. --- Doc/library/stdtypes.rst | 10 +++++++--- Lib/test/test_unicode.py | 10 +++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 9dd557fabaae2e4..08c5ae876c1b9d4 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1763,9 +1763,13 @@ expression support in the :mod:`re` module). .. method:: str.isspace() Return true if there are only whitespace characters in the string and there is - at least one character, false otherwise. Whitespace characters are those - characters defined in the Unicode character database as "Other" or "Separator" - and those with bidirectional property being one of "WS", "B", or "S". + at least one character, false otherwise. + + A character is *whitespace* if in the Unicode character database + (see :mod:`unicodedata`), either its general category is ``Zs`` + ("Separator, space"), or its bidirectional class is one of ``WS``, + ``B``, or ``S``. + .. method:: str.istitle() diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 8be16c8da926bf1..f09fe12ec5c4f23 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -12,6 +12,7 @@ import struct import sys import textwrap +import unicodedata import unittest import warnings from test import support, string_tests @@ -617,7 +618,14 @@ def test_isspace(self): self.checkequalnofix(True, '\u2000', 'isspace') self.checkequalnofix(True, '\u200a', 'isspace') self.checkequalnofix(False, '\u2014', 'isspace') - # apparently there are no non-BMP spaces chars in Unicode 6 + for i in range(0x10000): + char = chr(i) + bidirectional = unicodedata.bidirectional(char) + category = unicodedata.category(char) + self.assertEqual(char.isspace(), + (bidirectional in ('WS', 'B', 'S') + or category == 'Zs')) + # There are no non-BMP whitespace chars as of Unicode 12. for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']: self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) From feeb2c62998ca9b62ab36128a6c461ea35e0f16b Mon Sep 17 00:00:00 2001 From: Greg Price Date: Tue, 30 Jul 2019 21:57:36 -0700 Subject: [PATCH 2/4] Cover all characters, using a name; move to test_unicodedata. --- Lib/test/test_unicode.py | 8 -------- Lib/test/test_unicodedata.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index f09fe12ec5c4f23..35f3d17bb7e1a34 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -12,7 +12,6 @@ import struct import sys import textwrap -import unicodedata import unittest import warnings from test import support, string_tests @@ -618,13 +617,6 @@ def test_isspace(self): self.checkequalnofix(True, '\u2000', 'isspace') self.checkequalnofix(True, '\u200a', 'isspace') self.checkequalnofix(False, '\u2014', 'isspace') - for i in range(0x10000): - char = chr(i) - bidirectional = unicodedata.bidirectional(char) - category = unicodedata.category(char) - self.assertEqual(char.isspace(), - (bidirectional in ('WS', 'B', 'S') - or category == 'Zs')) # There are no non-BMP whitespace chars as of Unicode 12. for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']: diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index a52b6de547fbc90..b85fbe658d3a339 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -14,6 +14,10 @@ encoding = 'utf-8' errors = 'surrogatepass' +def all_chars(): + '''Each Unicode codepoint, as a one-character string.''' + for codepoint in range(0x110000): + yield chr(codepoint) ### Run tests @@ -102,6 +106,14 @@ def test_function_checksum(self): result = h.hexdigest() self.assertEqual(result, self.expectedchecksum) + def test_isspace_invariant(self): + for char in all_chars(): + bidirectional = self.db.bidirectional(char) + category = self.db.category(char) + self.assertEqual(char.isspace(), + (bidirectional in ('WS', 'B', 'S') + or category == 'Zs')) + def test_digit(self): self.assertEqual(self.db.digit('A', None), None) self.assertEqual(self.db.digit('9'), 9) From 578335f4477945fb33f775029d04504ed246b812 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Wed, 31 Jul 2019 21:55:45 -0700 Subject: [PATCH 3/4] Move back to test_unicode; open-code loop; mark as uses-CPU. --- Lib/test/test_unicode.py | 11 +++++++++++ Lib/test/test_unicodedata.py | 12 ------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 35f3d17bb7e1a34..80fe0355d4047e9 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -12,6 +12,7 @@ import struct import sys import textwrap +import unicodedata import unittest import warnings from test import support, string_tests @@ -622,6 +623,16 @@ def test_isspace(self): '\U0001F40D', '\U0001F46F']: self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) + @support.requires_resource('cpu') + def test_isspace_invariant(self): + for codepoint in range(0x110000): + char = chr(codepoint) + bidirectional = unicodedata.bidirectional(char) + category = unicodedata.category(char) + self.assertEqual(char.isspace(), + (bidirectional in ('WS', 'B', 'S') + or category == 'Zs')) + def test_isalnum(self): super().test_isalnum() for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index b85fbe658d3a339..a52b6de547fbc90 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -14,10 +14,6 @@ encoding = 'utf-8' errors = 'surrogatepass' -def all_chars(): - '''Each Unicode codepoint, as a one-character string.''' - for codepoint in range(0x110000): - yield chr(codepoint) ### Run tests @@ -106,14 +102,6 @@ def test_function_checksum(self): result = h.hexdigest() self.assertEqual(result, self.expectedchecksum) - def test_isspace_invariant(self): - for char in all_chars(): - bidirectional = self.db.bidirectional(char) - category = self.db.category(char) - self.assertEqual(char.isspace(), - (bidirectional in ('WS', 'B', 'S') - or category == 'Zs')) - def test_digit(self): self.assertEqual(self.db.digit('A', None), None) self.assertEqual(self.db.digit('9'), 9) From d8c347d296e0182ba0c0d3a7b887b435178d8c9a Mon Sep 17 00:00:00 2001 From: Greg Price Date: Thu, 1 Aug 2019 11:30:20 -0700 Subject: [PATCH 4/4] Use `sys.maxunicode + 1` rather than explicit 0x110000. --- Lib/test/test_unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 80fe0355d4047e9..7bd7f51b592b347 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -625,7 +625,7 @@ def test_isspace(self): @support.requires_resource('cpu') def test_isspace_invariant(self): - for codepoint in range(0x110000): + for codepoint in range(sys.maxunicode + 1): char = chr(codepoint) bidirectional = unicodedata.bidirectional(char) category = unicodedata.category(char)