Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 11312ea

Browse files
authored
gh-110913: Fix WindowsConsoleIO chunking of UTF-8 text (GH-111007)
1 parent b60f058 commit 11312ea
Copy full SHA for 11312ea

File tree

Expand file tree / Collapse file tree

2 files changed

+21
-16
lines changed
Filter options
Expand file tree / Collapse file tree

2 files changed

+21
-16
lines changed
+1 | Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences.

‎Modules/_io/winconsoleio.c

Copy file name to clipboard | Expand all lines: Modules/_io/winconsoleio.c
+20-16Lines changed: 20 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,23 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
134134
return m;
135135
}
136136

137+
static DWORD
138+
_find_last_utf8_boundary(const char *buf, DWORD len)
139+
{
140+
/* This function never returns 0, returns the original len instead */
141+
DWORD count = 1;
142+
if (len == 0 || (buf[len - 1] & 0x80) == 0) {
143+
return len;
144+
}
145+
for (;; count++) {
146+
if (count > 3 || count >= len) {
147+
return len;
148+
}
149+
if ((buf[len - count] & 0xc0) != 0x80) {
150+
return len - count;
151+
}
152+
}
153+
}
137154

138155
/*[clinic input]
139156
module _io
@@ -975,7 +992,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
975992
{
976993
BOOL res = TRUE;
977994
wchar_t *wbuf;
978-
DWORD len, wlen, orig_len, n = 0;
995+
DWORD len, wlen, n = 0;
979996
HANDLE handle;
980997

981998
if (self->fd == -1)
@@ -1007,21 +1024,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10071024
have to reduce and recalculate. */
10081025
while (wlen > 32766 / sizeof(wchar_t)) {
10091026
len /= 2;
1010-
orig_len = len;
1011-
/* Reduce the length until we hit the final byte of a UTF-8 sequence
1012-
* (top bit is unset). Fix for github issue 82052.
1013-
*/
1014-
while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
1015-
--len;
1016-
/* If we hit a length of 0, something has gone wrong. This shouldn't
1017-
* be possible, as valid UTF-8 can have at most 3 non-final bytes
1018-
* before a final one, and our buffer is way longer than that.
1019-
* But to be on the safe side, if we hit this issue we just restore
1020-
* the original length and let the console API sort it out.
1021-
*/
1022-
if (len == 0) {
1023-
len = orig_len;
1024-
}
1027+
/* Fix for github issues gh-110913 and gh-82052. */
1028+
len = _find_last_utf8_boundary(b->buf, len);
10251029
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
10261030
}
10271031
Py_END_ALLOW_THREADS

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.