-
-
Notifications
You must be signed in to change notification settings - Fork 32.1k
bpo-24665: Add CJK support in textwrap by default. #5649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
Co-authored-by: Florent Gallaire <fgallaire@gmail.com>
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,40 @@ | |
# some Unicode spaces (like \u00a0) are non-breaking whitespaces. | ||
_whitespace = '\t\n\x0b\x0c\r ' | ||
|
||
try: | ||
from unicodedata import east_asian_width | ||
|
||
def _width(text): | ||
"""Return the display width of the text in columns, according to | ||
unicodedata.east_asian_width only. | ||
""" | ||
return sum(2 if east_asian_width(char) in {'F', 'W'} else 1 | ||
for char in text) | ||
|
||
def _slice(text, index): | ||
"""Return the two slices of text cut to index. | ||
""" | ||
width = 0 | ||
pos = 0 | ||
for char in text: | ||
width += 2 if east_asian_width(char) in {'F', 'W'} else 1 | ||
if width > index: | ||
break | ||
pos += 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why note use enumerate(), it's less readable (less pythonic) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because it does not works with enumerate as the last incrementation were not done. I do not remember which case exactly but if you run the unit test you'll spot it easily, it was failing, I'll do if needed but can't right now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interested in that, the code was strongly tested for txt2tags and don't catch this problem. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your initial implementation was working thanks to your There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Explicit is better than implicit." but the more important is that both solutions are correct. |
||
return text[:pos], text[pos:] | ||
|
||
except ImportError: | ||
|
||
def _width(text): | ||
"""Fallback in case unicodedata is not available: The display width of | ||
a text is just its number of characters. | ||
""" | ||
return len(text) | ||
|
||
def _slice(text, index): | ||
return text[:index], text[index:] | ||
|
||
|
||
class TextWrapper: | ||
""" | ||
Object for wrapping/filling text. The public interface consists of | ||
|
@@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): | |
# If we're allowed to break long words, then do so: put as much | ||
# of the next chunk onto the current line as will fit. | ||
if self.break_long_words: | ||
cur_line.append(reversed_chunks[-1][:space_left]) | ||
reversed_chunks[-1] = reversed_chunks[-1][space_left:] | ||
left, right = _slice(reversed_chunks[-1], space_left) | ||
cur_line.append(left) | ||
reversed_chunks[-1] = right | ||
|
||
# Otherwise, we have to preserve the long word intact. Only add | ||
# it to the current line if there's nothing already there -- | ||
|
@@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks): | |
lines, but apart from that whitespace is preserved. | ||
""" | ||
lines = [] | ||
if self.width <= 0: | ||
raise ValueError("invalid width %r (must be > 0)" % self.width) | ||
if self.max_lines is not None: | ||
if self.max_lines > 1: | ||
indent = self.subsequent_indent | ||
else: | ||
indent = self.initial_indent | ||
if len(indent) + len(self.placeholder.lstrip()) > self.width: | ||
if (_width(indent) + | ||
_width(self.placeholder.lstrip()) > self.width): | ||
raise ValueError("placeholder too large for max width") | ||
|
||
# Arrange in reverse order so items can be efficiently popped | ||
|
@@ -272,15 +306,15 @@ def _wrap_chunks(self, chunks): | |
indent = self.initial_indent | ||
|
||
# Maximum width for this line. | ||
width = self.width - len(indent) | ||
width = self.width - _width(indent) | ||
|
||
# First chunk on line is whitespace -- drop it, unless this | ||
# is the very beginning of the text (ie. no lines started yet). | ||
if self.drop_whitespace and chunks[-1].strip() == '' and lines: | ||
del chunks[-1] | ||
|
||
while chunks: | ||
l = len(chunks[-1]) | ||
l = _width(chunks[-1]) | ||
|
||
# Can at least squeeze this chunk onto the current line. | ||
if cur_len + l <= width: | ||
|
@@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks): | |
# Nope, this line is full. | ||
else: | ||
break | ||
|
||
# The current line is full, and the next chunk is too big to | ||
# fit on *any* line (not just this one). | ||
if chunks and len(chunks[-1]) > width: | ||
if chunks and _width(chunks[-1]) > width: | ||
self._handle_long_word(chunks, cur_line, cur_len, width) | ||
cur_len = sum(map(len, cur_line)) | ||
cur_len = sum(map(_width, cur_line)) | ||
|
||
# If the last chunk on this line is all whitespace, drop it. | ||
if self.drop_whitespace and cur_line and cur_line[-1].strip() == '': | ||
cur_len -= len(cur_line[-1]) | ||
cur_len -= _width(cur_line[-1]) | ||
del cur_line[-1] | ||
|
||
if cur_line: | ||
|
@@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks): | |
else: | ||
while cur_line: | ||
if (cur_line[-1].strip() and | ||
cur_len + len(self.placeholder) <= width): | ||
cur_len + _width(self.placeholder) <= width): | ||
cur_line.append(self.placeholder) | ||
lines.append(indent + ''.join(cur_line)) | ||
break | ||
cur_len -= len(cur_line[-1]) | ||
cur_len -= _width(cur_line[-1]) | ||
del cur_line[-1] | ||
else: | ||
if lines: | ||
prev_line = lines[-1].rstrip() | ||
if (len(prev_line) + len(self.placeholder) <= | ||
self.width): | ||
if (_width(prev_line) + | ||
_width(self.placeholder) <= self.width): | ||
lines[-1] = prev_line + self.placeholder | ||
break | ||
lines.append(indent + self.placeholder.lstrip()) | ||
|
@@ -348,6 +381,10 @@ def wrap(self, text): | |
and all other whitespace characters (including newline) are | ||
converted to space. | ||
""" | ||
if self.width <= 0: | ||
raise ValueError("invalid width %r (must be > 0)" % self.width) | ||
elif self.width == 1 and _width(text) > len(text): | ||
raise ValueError("invalid width 1 (must be > 1 when CJK chars)") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have done a more complex solution:
It throws the exception earlier, but it's probably not absolutely necessary. |
||
chunks = self._split_chunks(text) | ||
if self.fix_sentence_endings: | ||
self._fix_sentence_endings(chunks) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Textwrap now take into account CJK double characters while measuring line | ||
width. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why inlining _len(), I don't have seen performance issues and it's less readable (less pythonic)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do you know where to break once you have the whole value?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hello I was reading too fast. In my version there's the
_wide
boolean function.So here
width += _wide(char) + 1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And
_len
is justreturn sum(2 if _wide(char) else 1 for char in text)
with no performance issuesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
More pythonic, DRY.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I won't bet on the performances, calling _len from _slice adds two functions calls per character (one to _len and one to sum). In one case I'm doing it on a character, and in the other case in a whole string. Yes I could also factorize this ternary to a third function, but I don't find it more readable.