From 317215995037d5d11cecb8e06e6baeab8c16df3b Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Tue, 30 Jul 2019 00:21:23 -0700
Subject: [PATCH 1/4] Correct documentation of `str.isspace`.

The documented definition was much broader than the real one:
there are tons of characters with general category "Other",
and we don't (and shouldn't) treat most of them as whitespace.

Rewrite the definition to agree with the comment on
_PyUnicode_IsWhitespace, and with the logic in makeunicodedata.py,
which is what generates that function and so ultimately governs.

Add suitable breadcrumbs so that a reader who wants to pin down
exactly what this definition means (what's a "bidirectional class"
of "B"?) can do so.  The `unicodedata` module documentation is an
appropriate central place for our references to Unicode's own copious
documentation, so point there.

Also add to the `isspace` test a thorough check that the
implementation agrees with the intended definition.
---
 Doc/library/stdtypes.rst | 10 +++++++---
 Lib/test/test_unicode.py | 10 +++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
index 9dd557fabaae2e4..08c5ae876c1b9d4 100644
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1763,9 +1763,13 @@ expression support in the :mod:`re` module).
 .. method:: str.isspace()
 
    Return true if there are only whitespace characters in the string and there is
-   at least one character, false otherwise.  Whitespace characters  are those
-   characters defined in the Unicode character database as "Other" or "Separator"
-   and those with bidirectional property being one of "WS", "B", or "S".
+   at least one character, false otherwise.
+
+   A character is *whitespace* if in the Unicode character database
+   (see :mod:`unicodedata`), either its general category is ``Zs``
+   ("Separator, space"), or its bidirectional class is one of ``WS``,
+   ``B``, or ``S``.
+
 
 .. method:: str.istitle()
 
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 8be16c8da926bf1..f09fe12ec5c4f23 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -12,6 +12,7 @@
 import struct
 import sys
 import textwrap
+import unicodedata
 import unittest
 import warnings
 from test import support, string_tests
@@ -617,7 +618,14 @@ def test_isspace(self):
         self.checkequalnofix(True, '\u2000', 'isspace')
         self.checkequalnofix(True, '\u200a', 'isspace')
         self.checkequalnofix(False, '\u2014', 'isspace')
-        # apparently there are no non-BMP spaces chars in Unicode 6
+        for i in range(0x10000):
+            char = chr(i)
+            bidirectional = unicodedata.bidirectional(char)
+            category = unicodedata.category(char)
+            self.assertEqual(char.isspace(),
+                             (bidirectional in ('WS', 'B', 'S')
+                              or category == 'Zs'))
+        # There are no non-BMP whitespace chars as of Unicode 12.
         for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                    '\U0001F40D', '\U0001F46F']:
             self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))

From feeb2c62998ca9b62ab36128a6c461ea35e0f16b Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Tue, 30 Jul 2019 21:57:36 -0700
Subject: [PATCH 2/4] Cover all code points via an `all_chars` helper; move to
 test_unicodedata.

---
 Lib/test/test_unicode.py     |  8 --------
 Lib/test/test_unicodedata.py | 12 ++++++++++++
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index f09fe12ec5c4f23..35f3d17bb7e1a34 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -12,7 +12,6 @@
 import struct
 import sys
 import textwrap
-import unicodedata
 import unittest
 import warnings
 from test import support, string_tests
@@ -618,13 +617,6 @@ def test_isspace(self):
         self.checkequalnofix(True, '\u2000', 'isspace')
         self.checkequalnofix(True, '\u200a', 'isspace')
         self.checkequalnofix(False, '\u2014', 'isspace')
-        for i in range(0x10000):
-            char = chr(i)
-            bidirectional = unicodedata.bidirectional(char)
-            category = unicodedata.category(char)
-            self.assertEqual(char.isspace(),
-                             (bidirectional in ('WS', 'B', 'S')
-                              or category == 'Zs'))
         # There are no non-BMP whitespace chars as of Unicode 12.
         for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
                    '\U0001F40D', '\U0001F46F']:
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index a52b6de547fbc90..b85fbe658d3a339 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -14,6 +14,10 @@
 encoding = 'utf-8'
 errors = 'surrogatepass'
 
+def all_chars():
+    '''Each Unicode codepoint, as a one-character string.'''
+    for codepoint in range(0x110000):
+        yield chr(codepoint)
 
 ### Run tests
 
@@ -102,6 +106,14 @@ def test_function_checksum(self):
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
+    def test_isspace_invariant(self):
+        for char in all_chars():
+            bidirectional = self.db.bidirectional(char)
+            category = self.db.category(char)
+            self.assertEqual(char.isspace(),
+                             (bidirectional in ('WS', 'B', 'S')
+                              or category == 'Zs'))
+
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)

From 578335f4477945fb33f775029d04504ed246b812 Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Wed, 31 Jul 2019 21:55:45 -0700
Subject: [PATCH 3/4] Move back to test_unicode; inline the loop; require the
 'cpu' resource.

---
 Lib/test/test_unicode.py     | 11 +++++++++++
 Lib/test/test_unicodedata.py | 12 ------------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 35f3d17bb7e1a34..80fe0355d4047e9 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -12,6 +12,7 @@
 import struct
 import sys
 import textwrap
+import unicodedata
 import unittest
 import warnings
 from test import support, string_tests
@@ -622,6 +623,16 @@ def test_isspace(self):
                    '\U0001F40D', '\U0001F46F']:
             self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
 
+    @support.requires_resource('cpu')
+    def test_isspace_invariant(self):
+        for codepoint in range(0x110000):
+            char = chr(codepoint)
+            bidirectional = unicodedata.bidirectional(char)
+            category = unicodedata.category(char)
+            self.assertEqual(char.isspace(),
+                             (bidirectional in ('WS', 'B', 'S')
+                              or category == 'Zs'))
+
     def test_isalnum(self):
         super().test_isalnum()
         for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index b85fbe658d3a339..a52b6de547fbc90 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -14,10 +14,6 @@
 encoding = 'utf-8'
 errors = 'surrogatepass'
 
-def all_chars():
-    '''Each Unicode codepoint, as a one-character string.'''
-    for codepoint in range(0x110000):
-        yield chr(codepoint)
 
 ### Run tests
 
@@ -106,14 +102,6 @@ def test_function_checksum(self):
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
-    def test_isspace_invariant(self):
-        for char in all_chars():
-            bidirectional = self.db.bidirectional(char)
-            category = self.db.category(char)
-            self.assertEqual(char.isspace(),
-                             (bidirectional in ('WS', 'B', 'S')
-                              or category == 'Zs'))
-
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)

From d8c347d296e0182ba0c0d3a7b887b435178d8c9a Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Thu, 1 Aug 2019 11:30:20 -0700
Subject: [PATCH 4/4] Use `sys.maxunicode + 1` rather than explicit 0x110000.

---
 Lib/test/test_unicode.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 80fe0355d4047e9..7bd7f51b592b347 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -625,7 +625,7 @@ def test_isspace(self):
 
     @support.requires_resource('cpu')
     def test_isspace_invariant(self):
-        for codepoint in range(0x110000):
+        for codepoint in range(sys.maxunicode + 1):
             char = chr(codepoint)
             bidirectional = unicodedata.bidirectional(char)
             category = unicodedata.category(char)