bpo-24665: Add CJK support in textwrap by default.

Co-authored-by: Florent Gallaire <fgallaire@gmail.com>
python · JulienPalard · Feb 13, 2018 · Mar 6, 2018 · 57b28823c7df75d8957fbc2d5562578382f543e7 · fgallaire
commit 57b28823c7df75d8957fbc2d5562578382f543e7
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
@@ -435,6 +435,9 @@ def test_bad_width(self):
        text = "Whatever, it doesn't matter."
        self.assertRaises(ValueError, wrap, text, 0)
        self.assertRaises(ValueError, wrap, text, -1)
+        # Ensure that we raise while trying to split wide characters.
+        text = 'Did you say "いろはにほへとちりぬるをいろはにほ?"'
+        self.assertRaises(ValueError, wrap, text, 1)

    def test_no_split_at_umlaut(self):
        text = "Die Empf\xe4nger-Auswahl"
@@ -578,7 +581,10 @@ def setUp(self):
 Did you say "supercalifragilisticexpialidocious?"
 How *do* you spell that odd word, anyways?
 '''
-
+        self.text_cjk = '''\
+Did you say "いろはにほへとちりぬるをいろはにほ?"
+How りぬ るをいろはにほり ぬるは, anyways?
+'''
    def test_break_long(self):
        # Wrap text with long words and lots of punctuation

@@ -590,7 +596,14 @@ def test_break_long(self):
        self.check_wrap(self.text, 50,
                        ['Did you say "supercalifragilisticexpialidocious?"',
                         'How *do* you spell that odd word, anyways?'])
-
+        self.check_wrap(self.text_cjk, 30,
+                        ['Did you say "いろはにほへとち',
+                         'りぬるをいろはにほ?" How りぬ',
+                         'るをいろはにほり ぬるは,',
+                         'anyways?'])
+        self.check_wrap(self.text_cjk, 50,
+                        ['Did you say "いろはにほへとちりぬるをいろはにほ?"',
+                         'How りぬ るをいろはにほり ぬるは, anyways?'])
        # SF bug 797650.  Prevent an infinite loop by making sure that at
        # least one character gets split off on every pass.
        self.check_wrap('-'*10+'hello', 10,

diff --git a/Lib/textwrap.py b/Lib/textwrap.py
@@ -14,6 +14,40 @@
 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
 _whitespace = '\t\n\x0b\x0c\r '

+try:
+    from unicodedata import east_asian_width
+
+    def _width(text):
+        """Return the display width of the text in columns, according to
+        unicodedata.east_asian_width only.
+        """
+        return sum(2 if east_asian_width(char) in {'F', 'W'} else 1
+                 for char in text)
+
+    def _slice(text, index):
+        """Return the two slices of text cut to index.
+        """
+        width = 0
+        pos = 0
+        for char in text:
+            width += 2 if east_asian_width(char) in {'F', 'W'} else 1
+            if width > index:
+                break
+            pos += 1
+        return text[:pos], text[pos:]
+
+except ImportError:
+
+    def _width(text):
+        """Fallback in case unicodedata is not available: The display width of
+        a text is just its number of characters.
+        """
+        return len(text)
+
+    def _slice(text, index):
+        return text[:index], text[index:]
+
+
 class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
@@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
-            cur_line.append(reversed_chunks[-1][:space_left])
-            reversed_chunks[-1] = reversed_chunks[-1][space_left:]
+            left, right = _slice(reversed_chunks[-1], space_left)
+            cur_line.append(left)
+            reversed_chunks[-1] = right

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
@@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks):
        lines, but apart from that whitespace is preserved.
        """
        lines = []
-        if self.width <= 0:
-            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
-            if len(indent) + len(self.placeholder.lstrip()) > self.width:
+            if (_width(indent) +
+                _width(self.placeholder.lstrip()) > self.width):
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
@@ -272,15 +306,15 @@ def _wrap_chunks(self, chunks):
                indent = self.initial_indent

            # Maximum width for this line.
-            width = self.width - len(indent)
+            width = self.width - _width(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
-                l = len(chunks[-1])
+                l = _width(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
@@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks):
                # Nope, this line is full.
                else:
                    break
-
            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
-            if chunks and len(chunks[-1]) > width:
+            if chunks and _width(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
-                cur_len = sum(map(len, cur_line))
+                cur_len = sum(map(_width, cur_line))

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
-                cur_len -= len(cur_line[-1])
+                cur_len -= _width(cur_line[-1])
                del cur_line[-1]

            if cur_line:
@@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks):
                else:
                    while cur_line:
                        if (cur_line[-1].strip() and
-                            cur_len + len(self.placeholder) <= width):
+                            cur_len + _width(self.placeholder) <= width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
-                        cur_len -= len(cur_line[-1])
+                        cur_len -= _width(cur_line[-1])
                        del cur_line[-1]
                    else:
                        if lines:
                            prev_line = lines[-1].rstrip()
-                            if (len(prev_line) + len(self.placeholder) <=
-                                    self.width):
+                            if (_width(prev_line) +
+                                _width(self.placeholder) <= self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
@@ -348,6 +381,10 @@ def wrap(self, text):
        and all other whitespace characters (including newline) are
        converted to space.
        """
+        if self.width <= 0:
+            raise ValueError("invalid width %r (must be > 0)" % self.width)
+        elif self.width == 1 and _width(text) > len(text):
+            raise ValueError("invalid width 1 (must be > 1 when CJK chars)")
        chunks = self._split_chunks(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)

diff --git a/Misc/ACKS b/Misc/ACKS
@@ -514,6 +514,7 @@ Lele Gaifax
 Santiago Gala
 Yitzchak Gale
 Matthew Gallagher
+Florent Gallaire
 Quentin Gallet-Gilles
 Riccardo Attilio Galli
 Raymund Galvin

diff --git a/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst b/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst
@@ -0,0 +1,2 @@
+Textwrap now take into account CJK double characters while measuring line
+width.
-Original file line number
+Diff line change
@@ -14,6 +14,40 @@
     # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
     _whitespace = '\t\n\x0b\x0c\r '
+    try:
+        from unicodedata import east_asian_width
+        def _width(text):
+            """Return the display width of the text in columns, according to
+            unicodedata.east_asian_width only.
+            """
+            return sum(2 if east_asian_width(char) in {'F', 'W'} else 1
+                     for char in text)
+        def _slice(text, index):
+            """Return the two slices of text cut to index.
+            """
+            width = 0
+            pos = 0
+            for char in text:
+                width += 2 if east_asian_width(char) in {'F', 'W'} else 1
             Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Feb 14, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Why inlining _len(), I don't have seen performance issues and it's less readable (less pythonic)
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
  Member


      

  Author


  

  
    
      

      
            JulienPalard
  

      

      

      


        Feb 15, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      How do you know where to break once you have the whole value?
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Feb 15, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Hello I was reading too fast. In my version there's the _wide boolean function.

So here width += _wide(char) + 1
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Feb 15, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      And _len is just return sum(2 if _wide(char) else 1 for char in text) with no performance issues
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Feb 15, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      More pythonic, DRY.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    





    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
  Member


      

  Author


  

  
    
      

      
            JulienPalard
  

      

      

      


        Mar 6, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      I won't bet on the performances, calling _len from _slice adds two functions calls per character (one to _len and one to sum). In one case I'm doing it on a character, and in the other case in a whole string. Yes I could also factorize this ternary to a third function, but I don't find it more readable.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
+                if width > index:
+                    break
+                pos += 1
             Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Feb 14, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Why note use enumerate(), it's less readable (less pythonic)
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
  Member


      

  Author


  

  
    
      

      
            JulienPalard
  

      

      

      


        Feb 15, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Because it does not works with enumerate as the last incrementation were not done. I do not remember which case exactly but if you run the unit test you'll spot it easily, it was failing, I'll do if needed but can't right now.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Feb 15, 2018


      
  •

  
    
      
        
          edited
          
        
        
      
    
    
      
  
        
      Loading


  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  
    
  


    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Interested in that, the code was strongly tested for txt2tags and don't catch this problem.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
  Member


      

  Author


  

  
    
      

      
            JulienPalard
  

      

      

      


        Mar 6, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Your initial implementation was working thanks to your if cjk_len(text) <= index: return text, '' fixing the special case explicitly, I may have tried to avoid it.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



    
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Mar 6, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      "Explicit is better than implicit." but the more important is that both solutions are correct.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
+            return text[:pos], text[pos:]
+    except ImportError:
+        def _width(text):
+            """Fallback in case unicodedata is not available: The display width of
+            a text is just its number of characters.
+            """
+            return len(text)
+        def _slice(text, index):
+            return text[:index], text[index:]
     class TextWrapper:
         """
         Object for wrapping/filling text.  The public interface consists of
@@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
             # If we're allowed to break long words, then do so: put as much
             # of the next chunk onto the current line as will fit.
             if self.break_long_words:
-                cur_line.append(reversed_chunks[-1][:space_left])
-                reversed_chunks[-1] = reversed_chunks[-1][space_left:]
+                left, right = _slice(reversed_chunks[-1], space_left)
+                cur_line.append(left)
+                reversed_chunks[-1] = right
             # Otherwise, we have to preserve the long word intact.  Only add
             # it to the current line if there's nothing already there --
@@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks):
             lines, but apart from that whitespace is preserved.
             """
             lines = []
-            if self.width <= 0:
-                raise ValueError("invalid width %r (must be > 0)" % self.width)
             if self.max_lines is not None:
                 if self.max_lines > 1:
                     indent = self.subsequent_indent
                 else:
                     indent = self.initial_indent
-                if len(indent) + len(self.placeholder.lstrip()) > self.width:
+                if (_width(indent) +
+                    _width(self.placeholder.lstrip()) > self.width):
                     raise ValueError("placeholder too large for max width")
             # Arrange in reverse order so items can be efficiently popped
@@ -272,15 +306,15 @@ def _wrap_chunks(self, chunks):
                     indent = self.initial_indent
                 # Maximum width for this line.
-                width = self.width - len(indent)
+                width = self.width - _width(indent)
                 # First chunk on line is whitespace -- drop it, unless this
                 # is the very beginning of the text (ie. no lines started yet).
                 if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                     del chunks[-1]
                 while chunks:
-                    l = len(chunks[-1])
+                    l = _width(chunks[-1])
                     # Can at least squeeze this chunk onto the current line.
                     if cur_len + l <= width:
@@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks):
                     # Nope, this line is full.
                     else:
                         break
                 # The current line is full, and the next chunk is too big to
                 # fit on *any* line (not just this one).
-                if chunks and len(chunks[-1]) > width:
+                if chunks and _width(chunks[-1]) > width:
                     self._handle_long_word(chunks, cur_line, cur_len, width)
-                    cur_len = sum(map(len, cur_line))
+                    cur_len = sum(map(_width, cur_line))
                 # If the last chunk on this line is all whitespace, drop it.
                 if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
-                    cur_len -= len(cur_line[-1])
+                    cur_len -= _width(cur_line[-1])
                     del cur_line[-1]
                 if cur_line:
@@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks):
                     else:
                         while cur_line:
                             if (cur_line[-1].strip() and
-                                cur_len + len(self.placeholder) <= width):
+                                cur_len + _width(self.placeholder) <= width):
                                 cur_line.append(self.placeholder)
                                 lines.append(indent + ''.join(cur_line))
                                 break
-                            cur_len -= len(cur_line[-1])
+                            cur_len -= _width(cur_line[-1])
                             del cur_line[-1]
                         else:
                             if lines:
                                 prev_line = lines[-1].rstrip()
-                                if (len(prev_line) + len(self.placeholder) <=
-                                        self.width):
+                                if (_width(prev_line) +
+                                    _width(self.placeholder) <= self.width):
                                     lines[-1] = prev_line + self.placeholder
                                     break
                             lines.append(indent + self.placeholder.lstrip())
@@ -348,6 +381,10 @@ def wrap(self, text):
             and all other whitespace characters (including newline) are
             converted to space.
             """
+            if self.width <= 0:
+                raise ValueError("invalid width %r (must be > 0)" % self.width)
+            elif self.width == 1 and _width(text) > len(text):
+                raise ValueError("invalid width 1 (must be > 1 when CJK chars)")
             Copy link

  
      
    
  

  
      



      

  

  
    
      

      
            fgallaire
  

      

      

      


        Mar 6, 2018


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      I have done a more complex solution:
elif self.width == 1 and (sum(self._width(chunk) for chunk in chunks) >
                              sum(len(chunk) for chunk in chunks)):

It throws the exception earlier, but it's probably not absolutely necessary.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
             chunks = self._split_chunks(text)
             if self.fix_sentence_endings:
                 self._fix_sentence_endings(chunks)
-          Expand Down
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Textwrap now take into account CJK double characters while measuring line
		width.