Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 57b2882

Browse files
bpo-24665: Add CJK support in textwrap by default.
Co-authored-by: Florent Gallaire <fgallaire@gmail.com>
1 parent f34e03e commit 57b2882
Copy full SHA for 57b2882

File tree

Expand file tree / Collapse file tree

4 files changed

+70
-17
lines changed
Filter options
Expand file tree / Collapse file tree

4 files changed

+70
-17
lines changed

‎Lib/test/test_textwrap.py

Copy file name to clipboardExpand all lines: Lib/test/test_textwrap.py
+15-2Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,9 @@ def test_bad_width(self):
435435
text = "Whatever, it doesn't matter."
436436
self.assertRaises(ValueError, wrap, text, 0)
437437
self.assertRaises(ValueError, wrap, text, -1)
438+
# Ensure that we raise while trying to split wide characters.
439+
text = 'Did you say "いろはにほへとちりぬるをいろはにほ?"'
440+
self.assertRaises(ValueError, wrap, text, 1)
438441

439442
def test_no_split_at_umlaut(self):
440443
text = "Die Empf\xe4nger-Auswahl"
@@ -578,7 +581,10 @@ def setUp(self):
578581
Did you say "supercalifragilisticexpialidocious?"
579582
How *do* you spell that odd word, anyways?
580583
'''
581-
584+
self.text_cjk = '''\
585+
Did you say "いろはにほへとちりぬるをいろはにほ?"
586+
How りぬ るをいろはにほり ぬるは, anyways?
587+
'''
582588
def test_break_long(self):
583589
# Wrap text with long words and lots of punctuation
584590

@@ -590,7 +596,14 @@ def test_break_long(self):
590596
self.check_wrap(self.text, 50,
591597
['Did you say "supercalifragilisticexpialidocious?"',
592598
'How *do* you spell that odd word, anyways?'])
593-
599+
self.check_wrap(self.text_cjk, 30,
600+
['Did you say "いろはにほへとち',
601+
'りぬるをいろはにほ?" How りぬ',
602+
'るをいろはにほり ぬるは,',
603+
'anyways?'])
604+
self.check_wrap(self.text_cjk, 50,
605+
['Did you say "いろはにほへとちりぬるをいろはにほ?"',
606+
'How りぬ るをいろはにほり ぬるは, anyways?'])
594607
# SF bug 797650. Prevent an infinite loop by making sure that at
595608
# least one character gets split off on every pass.
596609
self.check_wrap('-'*10+'hello', 10,

‎Lib/textwrap.py

Copy file name to clipboardExpand all lines: Lib/textwrap.py
+52-15Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,40 @@
1414
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
1515
_whitespace = '\t\n\x0b\x0c\r '
1616

17+
try:
18+
from unicodedata import east_asian_width
19+
20+
def _width(text):
21+
"""Return the display width of the text in columns, according to
22+
unicodedata.east_asian_width only.
23+
"""
24+
return sum(2 if east_asian_width(char) in {'F', 'W'} else 1
25+
for char in text)
26+
27+
def _slice(text, index):
28+
"""Return the two slices of text cut to index.
29+
"""
30+
width = 0
31+
pos = 0
32+
for char in text:
33+
width += 2 if east_asian_width(char) in {'F', 'W'} else 1
34+
if width > index:
35+
break
36+
pos += 1
37+
return text[:pos], text[pos:]
38+
39+
except ImportError:
40+
41+
def _width(text):
42+
"""Fallback in case unicodedata is not available: The display width of
43+
a text is just its number of characters.
44+
"""
45+
return len(text)
46+
47+
def _slice(text, index):
48+
return text[:index], text[index:]
49+
50+
1751
class TextWrapper:
1852
"""
1953
Object for wrapping/filling text. The public interface consists of
@@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
215249
# If we're allowed to break long words, then do so: put as much
216250
# of the next chunk onto the current line as will fit.
217251
if self.break_long_words:
218-
cur_line.append(reversed_chunks[-1][:space_left])
219-
reversed_chunks[-1] = reversed_chunks[-1][space_left:]
252+
left, right = _slice(reversed_chunks[-1], space_left)
253+
cur_line.append(left)
254+
reversed_chunks[-1] = right
220255

221256
# Otherwise, we have to preserve the long word intact. Only add
222257
# it to the current line if there's nothing already there --
@@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks):
244279
lines, but apart from that whitespace is preserved.
245280
"""
246281
lines = []
247-
if self.width <= 0:
248-
raise ValueError("invalid width %r (must be > 0)" % self.width)
249282
if self.max_lines is not None:
250283
if self.max_lines > 1:
251284
indent = self.subsequent_indent
252285
else:
253286
indent = self.initial_indent
254-
if len(indent) + len(self.placeholder.lstrip()) > self.width:
287+
if (_width(indent) +
288+
_width(self.placeholder.lstrip()) > self.width):
255289
raise ValueError("placeholder too large for max width")
256290

257291
# Arrange in reverse order so items can be efficiently popped
@@ -272,15 +306,15 @@ def _wrap_chunks(self, chunks):
272306
indent = self.initial_indent
273307

274308
# Maximum width for this line.
275-
width = self.width - len(indent)
309+
width = self.width - _width(indent)
276310

277311
# First chunk on line is whitespace -- drop it, unless this
278312
# is the very beginning of the text (ie. no lines started yet).
279313
if self.drop_whitespace and chunks[-1].strip() == '' and lines:
280314
del chunks[-1]
281315

282316
while chunks:
283-
l = len(chunks[-1])
317+
l = _width(chunks[-1])
284318

285319
# Can at least squeeze this chunk onto the current line.
286320
if cur_len + l <= width:
@@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks):
290324
# Nope, this line is full.
291325
else:
292326
break
293-
294327
# The current line is full, and the next chunk is too big to
295328
# fit on *any* line (not just this one).
296-
if chunks and len(chunks[-1]) > width:
329+
if chunks and _width(chunks[-1]) > width:
297330
self._handle_long_word(chunks, cur_line, cur_len, width)
298-
cur_len = sum(map(len, cur_line))
331+
cur_len = sum(map(_width, cur_line))
299332

300333
# If the last chunk on this line is all whitespace, drop it.
301334
if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
302-
cur_len -= len(cur_line[-1])
335+
cur_len -= _width(cur_line[-1])
303336
del cur_line[-1]
304337

305338
if cur_line:
@@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks):
315348
else:
316349
while cur_line:
317350
if (cur_line[-1].strip() and
318-
cur_len + len(self.placeholder) <= width):
351+
cur_len + _width(self.placeholder) <= width):
319352
cur_line.append(self.placeholder)
320353
lines.append(indent + ''.join(cur_line))
321354
break
322-
cur_len -= len(cur_line[-1])
355+
cur_len -= _width(cur_line[-1])
323356
del cur_line[-1]
324357
else:
325358
if lines:
326359
prev_line = lines[-1].rstrip()
327-
if (len(prev_line) + len(self.placeholder) <=
328-
self.width):
360+
if (_width(prev_line) +
361+
_width(self.placeholder) <= self.width):
329362
lines[-1] = prev_line + self.placeholder
330363
break
331364
lines.append(indent + self.placeholder.lstrip())
@@ -348,6 +381,10 @@ def wrap(self, text):
348381
and all other whitespace characters (including newline) are
349382
converted to space.
350383
"""
384+
if self.width <= 0:
385+
raise ValueError("invalid width %r (must be > 0)" % self.width)
386+
elif self.width == 1 and _width(text) > len(text):
387+
raise ValueError("invalid width 1 (must be > 1 when CJK chars)")
351388
chunks = self._split_chunks(text)
352389
if self.fix_sentence_endings:
353390
self._fix_sentence_endings(chunks)

‎Misc/ACKS

Copy file name to clipboardExpand all lines: Misc/ACKS
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ Lele Gaifax
514514
Santiago Gala
515515
Yitzchak Gale
516516
Matthew Gallagher
517+
Florent Gallaire
517518
Quentin Gallet-Gilles
518519
Riccardo Attilio Galli
519520
Raymund Galvin
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The textwrap module now takes into account CJK double-width characters while measuring line
2+
width.

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.