Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions 11 Doc/library/stdtypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1764,10 +1764,13 @@ expression support in the :mod:`re` module).
Return true if there are only whitespace characters in the string and there is
at least one character, false otherwise.

A character is *whitespace* if in the Unicode character database
(see :mod:`unicodedata`), either its general category is ``Zs``
("Separator, space"), or its bidirectional class is one of ``WS``,
``B``, or ``S``.
The whitespace characters are `as defined by Unicode
<https://www.unicode.org/reports/tr44/#White_Space>`_.

.. versionchanged:: 3.9
Previously used a different definition of whitespace characters,
which included the additional characters ``\x1c``, ``\x1d``,
``\x1e``, and ``\x1f``.


.. method:: str.istitle()
Expand Down
1 change: 1 addition & 0 deletions 1 Lib/test/string_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,7 @@ def test_isspace(self):
self.checkequal(True, '\t', 'isspace')
self.checkequal(True, '\r', 'isspace')
self.checkequal(True, '\n', 'isspace')
self.checkequal(False, '\x1c', 'isspace')
self.checkequal(True, ' \t\r\n', 'isspace')
self.checkequal(False, ' \t\r\na', 'isspace')
self.checkraises(TypeError, 'abc', 'isspace', 42)
Expand Down
27 changes: 25 additions & 2 deletions 27 Lib/test/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,13 +625,21 @@ def test_isspace(self):

@support.requires_resource('cpu')
def test_isspace_invariant(self):
'''
Before Python 3.9, `str.isspace` treated a character as whitespace
if its General Category was Zs or its Bidirectional Class was one
of WS, B, or S.

Since Python 3.9, we use the White_Space property that Unicode
now provides. Check that these differ only on U+001C..U+001F.
'''
for codepoint in range(sys.maxunicode + 1):
char = chr(codepoint)
bidirectional = unicodedata.bidirectional(char)
category = unicodedata.category(char)
self.assertEqual(char.isspace(),
(bidirectional in ('WS', 'B', 'S')
or category == 'Zs'))
((bidirectional in ('WS', 'B', 'S')
or category == 'Zs')
and codepoint not in range(0x1c, 0x20)))
Comment on lines +640 to +642

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this guaranteed to hold in future Unicode versions? That is, should it be tested on the string method if it's only testing the Unicode database parsing and generation?


def test_isalnum(self):
super().test_isalnum()
Expand Down Expand Up @@ -705,6 +713,21 @@ def test_isnumeric(self):
'\U000104A0', '\U0001F107']:
self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))

def test_bytes_consistency(self):
'''Check str and bytes agree on character properties within ASCII.

For every code point below 0x80, the one-character str and the
one-byte bytes object must give the same answers for the is*()
predicates and the same lower()/upper() mappings.
'''
for codepoint in range(0x80):
char = chr(codepoint)
byte = bytes([codepoint])
# Including the codepoint in the tuple gets it printed on failure.
self.assertEqual(
(codepoint, char.islower(), char.isupper(), char.isalpha(),
char.isdigit(), char.isalnum(), char.isspace()),
(codepoint, byte.islower(), byte.isupper(), byte.isalpha(),
byte.isdigit(), byte.isalnum(), byte.isspace()))
# ord() turns both the str and the bytes results into ints, so the
# case mappings can be compared across the two types.
self.assertEqual(
(codepoint, ord(char.lower()), ord(char.upper())),
(codepoint, ord(byte.lower()), ord(byte.upper())))

def test_isidentifier(self):
self.assertTrue("a".isidentifier())
self.assertTrue("Z".isidentifier())
Expand Down
2 changes: 1 addition & 1 deletion 2 Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class UnicodeMethodsTest(unittest.TestCase):

# update this, if the database changes
expectedchecksum = 'e728278035eb76cf92d86f07852266b0433f16a5'
expectedchecksum = 'd0652da04440a4ea690fbd6fd3bf79af39090547'

@requires_resource('cpu')
def test_method_checksum(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
The definition of "whitespace" provided by :meth:`str.isspace` and
:c:func:`Py_UNICODE_ISSPACE` now follows Unicode's ``White_Space`` property.
This has the effect that the four characters ``\x1c``, ``\x1d``, ``\x1e``,
``\x1f`` are no longer treated as whitespace. Patch by Greg Price.
6 changes: 1 addition & 5 deletions 6 Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -292,11 +292,7 @@ const unsigned char _Py_ascii_whitespace[] = {
/* case 0x000D: * CARRIAGE RETURN */
0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* case 0x001C: * FILE SEPARATOR */
/* case 0x001D: * GROUP SEPARATOR */
/* case 0x001E: * RECORD SEPARATOR */
/* case 0x001F: * UNIT SEPARATOR */
0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0,
/* case 0x0020: * SPACE */
1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
Expand Down
3,319 changes: 1,658 additions & 1,661 deletions 3,319 Objects/unicodetype_db.h

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions 13 Tools/unicode/makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#

import dataclasses
import itertools
import os
import sys
import zipfile
Expand All @@ -46,6 +47,7 @@
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
PROP_LIST = "PropList%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
Expand Down Expand Up @@ -416,7 +418,7 @@ def makeunicodetype(unicode, trace):
if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK
linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"):
if 'White_Space' in properties:
flags |= SPACE_MASK
spaces.append(char)
if category == "Lt":
Expand Down Expand Up @@ -561,8 +563,7 @@ def makeunicodetype(unicode, trace):
fprint()

# Generate code for _PyUnicode_IsWhitespace()
fprint("/* Returns 1 for Unicode characters having the bidirectional")
fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
fprint("/* Returns 1 for Unicode characters with property White_Space.")
fprint(" */")
fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
fprint('{')
Expand Down Expand Up @@ -974,6 +975,7 @@ class UcdRecord:

# Binary properties, as a set of those that are true.
# Taken from multiple files:
# https://www.unicode.org/reports/tr44/#PropList.txt
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
# https://www.unicode.org/reports/tr44/#LineBreak.txt
binary_properties: Set[str]
Expand Down Expand Up @@ -1081,7 +1083,10 @@ def __init__(self, version, cjk_check=True):
if table[i] is not None:
table[i].east_asian_width = widths[i]

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
for char, (p,) in itertools.chain(
UcdFile(PROP_LIST, version).expanded(),
UcdFile(DERIVED_CORE_PROPERTIES, version).expanded(),
):
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.