python · gnprice · Aug 4, 2019 · Numerlor · May 3, 2022
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
@@ -1764,10 +1764,13 @@ expression support in the :mod:`re` module).
   Return true if there are only whitespace characters in the string and there is
   at least one character, false otherwise.

-   A character is *whitespace* if in the Unicode character database
-   (see :mod:`unicodedata`), either its general category is ``Zs``
-   ("Separator, space"), or its bidirectional class is one of ``WS``,
-   ``B``, or ``S``.
+   A character is *whitespace* if it has the Unicode ``White_Space`` property
+   (see `UAX #44 <https://www.unicode.org/reports/tr44/#White_Space>`_).
+
+   .. versionchanged:: 3.9
+      Previously, a character was whitespace if its general category was
+      ``Zs`` or its bidirectional class was ``WS``, ``B``, or ``S``; that
+      definition also included ``\x1c``, ``\x1d``, ``\x1e``, and ``\x1f``.


 .. method:: str.istitle()

diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
@@ -884,6 +884,7 @@ def test_isspace(self):
        self.checkequal(True, '\t', 'isspace')
        self.checkequal(True, '\r', 'isspace')
        self.checkequal(True, '\n', 'isspace')
+        self.checkequal(False, '\x1c', 'isspace')
        self.checkequal(True, ' \t\r\n', 'isspace')
        self.checkequal(False, ' \t\r\na', 'isspace')
        self.checkraises(TypeError, 'abc', 'isspace', 42)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
@@ -625,13 +625,21 @@ def test_isspace(self):

    @support.requires_resource('cpu')
    def test_isspace_invariant(self):
+        '''
+        Before Python 3.9, `str.isspace` used a certain formula
+        combining the General Category and the Bidirectional Class.
+
+        Since Python 3.9, we use the White_Space property that Unicode
+        now provides.  Check that these differ only on U+001C..U+001F.
+        '''
        for codepoint in range(sys.maxunicode + 1):
            char = chr(codepoint)
            bidirectional = unicodedata.bidirectional(char)
            category = unicodedata.category(char)
            self.assertEqual(char.isspace(),
-                             (bidirectional in ('WS', 'B', 'S')
-                              or category == 'Zs'))
+                             ((bidirectional in ('WS', 'B', 'S')
+                               or category == 'Zs')
+                              and codepoint not in range(0x1c, 0x20)))

    def test_isalnum(self):
        super().test_isalnum()
@@ -705,6 +713,21 @@ def test_isnumeric(self):
                   '\U000104A0', '\U0001F107']:
            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))

+    def test_bytes_consistency(self):
+        '''Check str and bytes agree on character properties within ASCII.'''
+        for codepoint in range(0x80):
+            char = chr(codepoint)
+            byte = bytes([codepoint])
+            # Including the codepoint in the tuple gets it printed on failure.
+            self.assertEqual(
+                (codepoint, char.islower(), char.isupper(), char.isalpha(),
+                 char.isdigit(), char.isalnum(), char.isspace()),
+                (codepoint, byte.islower(), byte.isupper(), byte.isalpha(),
+                 byte.isdigit(), byte.isalnum(), byte.isspace()))
+            self.assertEqual(
+                (codepoint, ord(char.lower()), ord(char.upper())),
+                (codepoint, ord(byte.lower()), ord(byte.upper())))
+
    def test_isidentifier(self):
        self.assertTrue("a".isidentifier())
        self.assertTrue("Z".isidentifier())

diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -17,7 +17,7 @@
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = 'e728278035eb76cf92d86f07852266b0433f16a5'
+    expectedchecksum = 'd0652da04440a4ea690fbd6fd3bf79af39090547'

    @requires_resource('cpu')
    def test_method_checksum(self):

diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-07-21-30-13.bpo-18236.YtYsEp.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-07-21-30-13.bpo-18236.YtYsEp.rst
@@ -0,0 +1,4 @@
+The definition of "whitespace" provided by :meth:`str.isspace` and
+:c:func:`Py_UNICODE_ISSPACE` now follows Unicode's ``White_Space`` property.
+This has the effect that the four characters ``\x1c``, ``\x1d``, ``\x1e``,
+and ``\x1f`` are no longer treated as whitespace.  Patch by Greg Price.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -292,11 +292,7 @@ const unsigned char _Py_ascii_whitespace[] = {
 /*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
-/*     case 0x001C: * FILE SEPARATOR */
-/*     case 0x001D: * GROUP SEPARATOR */
-/*     case 0x001E: * RECORD SEPARATOR */
-/*     case 0x001F: * UNIT SEPARATOR */
-    0, 0, 0, 0, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0,
 /*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
@@ -27,6 +27,7 @@
 #

 import dataclasses
+import itertools
 import os
 import sys
 import zipfile
@@ -46,6 +47,7 @@
 #   * Doc/reference/lexical_analysis.rst (two occurrences)
 UNIDATA_VERSION = "12.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
+PROP_LIST = "PropList%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.zip"
@@ -416,7 +418,7 @@ def makeunicodetype(unicode, trace):
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
-            if category == "Zs" or bidirectional in ("WS", "B", "S"):
+            if 'White_Space' in properties:
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
@@ -561,8 +563,7 @@ def makeunicodetype(unicode, trace):
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
-        fprint("/* Returns 1 for Unicode characters having the bidirectional")
-        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
+        fprint("/* Returns 1 for Unicode characters with property White_Space.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
@@ -974,6 +975,7 @@ class UcdRecord:

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
+    #   https://www.unicode.org/reports/tr44/#PropList.txt
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]
@@ -1081,7 +1083,10 @@ def __init__(self, version, cjk_check=True):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

-        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
+        for char, (p,) in itertools.chain(
+                UcdFile(PROP_LIST, version).expanded(),
+                UcdFile(DERIVED_CORE_PROPERTIES, version).expanded(),
+        ):
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them