From 57b28823c7df75d8957fbc2d5562578382f543e7 Mon Sep 17 00:00:00 2001 From: Julien Palard Date: Tue, 13 Feb 2018 02:05:07 +0100 Subject: [PATCH] bpo-24665: Add CJK support in textwrap by default. Co-authored-by: Florent Gallaire --- Lib/test/test_textwrap.py | 17 ++++- Lib/textwrap.py | 67 ++++++++++++++----- Misc/ACKS | 1 + .../2018-02-13-02-06-24.bpo-24665.re7KqM.rst | 2 + 4 files changed, 70 insertions(+), 17 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index ed97f70ba1fa40..e18ba72c718afe 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -435,6 +435,9 @@ def test_bad_width(self): text = "Whatever, it doesn't matter." self.assertRaises(ValueError, wrap, text, 0) self.assertRaises(ValueError, wrap, text, -1) + # Ensure that we raise while trying to split wide characters. + text = 'Did you say "いろはにほへとちりぬるをいろはにほ?"' + self.assertRaises(ValueError, wrap, text, 1) def test_no_split_at_umlaut(self): text = "Die Empf\xe4nger-Auswahl" @@ -578,7 +581,10 @@ def setUp(self): Did you say "supercalifragilisticexpialidocious?" How *do* you spell that odd word, anyways? ''' - + self.text_cjk = '''\ +Did you say "いろはにほへとちりぬるをいろはにほ?" +How りぬ るをいろはにほり ぬるは, anyways? +''' def test_break_long(self): # Wrap text with long words and lots of punctuation @@ -590,7 +596,14 @@ def test_break_long(self): self.check_wrap(self.text, 50, ['Did you say "supercalifragilisticexpialidocious?"', 'How *do* you spell that odd word, anyways?']) - + self.check_wrap(self.text_cjk, 30, + ['Did you say "いろはにほへとち', + 'りぬるをいろはにほ?" How りぬ', + 'るをいろはにほり ぬるは,', + 'anyways?']) + self.check_wrap(self.text_cjk, 50, + ['Did you say "いろはにほへとちりぬるをいろはにほ?"', + 'How りぬ るをいろはにほり ぬるは, anyways?']) # SF bug 797650. Prevent an infinite loop by making sure that at # least one character gets split off on every pass. 
self.check_wrap('-'*10+'hello', 10, diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 8103f347452d35..9df180191a1ffe 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -14,6 +14,40 @@ # some Unicode spaces (like \u00a0) are non-breaking whitespaces. _whitespace = '\t\n\x0b\x0c\r ' +try: + from unicodedata import east_asian_width + + def _width(text): + """Return the display width of the text in columns, according to + unicodedata.east_asian_width only. + """ + return sum(2 if east_asian_width(char) in {'F', 'W'} else 1 + for char in text) + + def _slice(text, index): + """Return the two slices of text cut to index. + """ + width = 0 + pos = 0 + for char in text: + width += 2 if east_asian_width(char) in {'F', 'W'} else 1 + if width > index: + break + pos += 1 + return text[:pos], text[pos:] + +except ImportError: + + def _width(text): + """Fallback in case unicodedata is not available: The display width of + a text is just its number of characters. + """ + return len(text) + + def _slice(text, index): + return text[:index], text[index:] + + class TextWrapper: """ Object for wrapping/filling text. The public interface consists of @@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # If we're allowed to break long words, then do so: put as much # of the next chunk onto the current line as will fit. if self.break_long_words: - cur_line.append(reversed_chunks[-1][:space_left]) - reversed_chunks[-1] = reversed_chunks[-1][space_left:] + left, right = _slice(reversed_chunks[-1], space_left) + cur_line.append(left) + reversed_chunks[-1] = right # Otherwise, we have to preserve the long word intact. Only add # it to the current line if there's nothing already there -- @@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks): lines, but apart from that whitespace is preserved. 
""" lines = [] - if self.width <= 0: - raise ValueError("invalid width %r (must be > 0)" % self.width) if self.max_lines is not None: if self.max_lines > 1: indent = self.subsequent_indent else: indent = self.initial_indent - if len(indent) + len(self.placeholder.lstrip()) > self.width: + if (_width(indent) + + _width(self.placeholder.lstrip()) > self.width): raise ValueError("placeholder too large for max width") # Arrange in reverse order so items can be efficiently popped @@ -272,7 +306,7 @@ def _wrap_chunks(self, chunks): indent = self.initial_indent # Maximum width for this line. - width = self.width - len(indent) + width = self.width - _width(indent) # First chunk on line is whitespace -- drop it, unless this # is the very beginning of the text (ie. no lines started yet). @@ -280,7 +314,7 @@ def _wrap_chunks(self, chunks): del chunks[-1] while chunks: - l = len(chunks[-1]) + l = _width(chunks[-1]) # Can at least squeeze this chunk onto the current line. if cur_len + l <= width: @@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks): # Nope, this line is full. else: break - # The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). - if chunks and len(chunks[-1]) > width: + if chunks and _width(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) - cur_len = sum(map(len, cur_line)) + cur_len = sum(map(_width, cur_line)) # If the last chunk on this line is all whitespace, drop it. 
if self.drop_whitespace and cur_line and cur_line[-1].strip() == '': - cur_len -= len(cur_line[-1]) + cur_len -= _width(cur_line[-1]) del cur_line[-1] if cur_line: @@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks): else: while cur_line: if (cur_line[-1].strip() and - cur_len + len(self.placeholder) <= width): + cur_len + _width(self.placeholder) <= width): cur_line.append(self.placeholder) lines.append(indent + ''.join(cur_line)) break - cur_len -= len(cur_line[-1]) + cur_len -= _width(cur_line[-1]) del cur_line[-1] else: if lines: prev_line = lines[-1].rstrip() - if (len(prev_line) + len(self.placeholder) <= - self.width): + if (_width(prev_line) + + _width(self.placeholder) <= self.width): lines[-1] = prev_line + self.placeholder break lines.append(indent + self.placeholder.lstrip()) @@ -348,6 +381,10 @@ def wrap(self, text): and all other whitespace characters (including newline) are converted to space. """ + if self.width <= 0: + raise ValueError("invalid width %r (must be > 0)" % self.width) + elif self.width == 1 and _width(text) > len(text): + raise ValueError("invalid width 1 (must be > 1 when CJK chars)") chunks = self._split_chunks(text) if self.fix_sentence_endings: self._fix_sentence_endings(chunks) diff --git a/Misc/ACKS b/Misc/ACKS index ea1d9418870aa9..2ef6fb7db59f3c 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -514,6 +514,7 @@ Lele Gaifax Santiago Gala Yitzchak Gale Matthew Gallagher +Florent Gallaire Quentin Gallet-Gilles Riccardo Attilio Galli Raymund Galvin diff --git a/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst b/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst new file mode 100644 index 00000000000000..ba9a6a69640cb8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst @@ -0,0 +1,2 @@ +:mod:`textwrap` now takes into account CJK double-width characters while +measuring line width.