Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 22cde39

Browse files
authored
[3.11] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111373)
1 parent 762aba7 commit 22cde39
Copy full SHA for 22cde39

File tree

6 files changed

+189
-15
lines changed
Filter options

6 files changed

+189
-15
lines changed

‎Lib/test/test_traceback.py

Copy file name to clipboardExpand all lines: Lib/test/test_traceback.py
+56-1Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,62 @@ def f():
893893
f" callable()",
894894
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
895895
f" print(1, www(",
896-
f" ^^^^",
896+
f" ^^^^^^^",
897+
]
898+
self.assertEqual(actual, expected)
899+
900+
def test_byte_offset_with_wide_characters_term_highlight(self):
901+
def f():
902+
说明说明 = 1
903+
şçöğıĤellö = 0 # not wide but still non-ascii
904+
return 说明说明 / şçöğıĤellö
905+
906+
actual = self.get_exception(f)
907+
expected = [
908+
f"Traceback (most recent call last):",
909+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
910+
f" callable()",
911+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
912+
f" return 说明说明 / şçöğıĤellö",
913+
f" ~~~~~~~~~^~~~~~~~~~~~",
914+
]
915+
self.assertEqual(actual, expected)
916+
917+
def test_byte_offset_with_emojis_term_highlight(self):
918+
def f():
919+
return "✨🐍" + func_说明说明("📗🚛",
920+
"📗🚛") + "🐍"
921+
922+
actual = self.get_exception(f)
923+
expected = [
924+
f"Traceback (most recent call last):",
925+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
926+
f" callable()",
927+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
928+
f' return "✨🐍" + func_说明说明("📗🚛",',
929+
f" ^^^^^^^^^^^^^",
930+
]
931+
self.assertEqual(actual, expected)
932+
933+
def test_byte_offset_wide_chars_subscript(self):
934+
def f():
935+
my_dct = {
936+
"✨🚛✨": {
937+
"说明": {
938+
"🐍🐍🐍": None
939+
}
940+
}
941+
}
942+
return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
943+
944+
actual = self.get_exception(f)
945+
expected = [
946+
f"Traceback (most recent call last):",
947+
f" File \"{__file__}\", line {self.callable_line}, in get_exception",
948+
f" callable()",
949+
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
950+
f' return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
951+
f" ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
897952
]
898953
self.assertEqual(actual, expected)
899954

‎Lib/traceback.py

Copy file name to clipboardExpand all lines: Lib/traceback.py
+41-12Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -465,39 +465,49 @@ def format_frame_summary(self, frame_summary):
465465
stripped_line = frame_summary.line.strip()
466466
row.append(' {}\n'.format(stripped_line))
467467

468-
orig_line_len = len(frame_summary._original_line)
468+
line = frame_summary._original_line
469+
orig_line_len = len(line)
469470
frame_line_len = len(frame_summary.line.lstrip())
470471
stripped_characters = orig_line_len - frame_line_len
471472
if (
472473
frame_summary.colno is not None
473474
and frame_summary.end_colno is not None
474475
):
475476
start_offset = _byte_offset_to_character_offset(
476-
frame_summary._original_line, frame_summary.colno) + 1
477+
line, frame_summary.colno)
477478
end_offset = _byte_offset_to_character_offset(
478-
frame_summary._original_line, frame_summary.end_colno) + 1
479+
line, frame_summary.end_colno)
480+
code_segment = line[start_offset:end_offset]
479481

480482
anchors = None
481483
if frame_summary.lineno == frame_summary.end_lineno:
482484
with suppress(Exception):
483-
anchors = _extract_caret_anchors_from_line_segment(
484-
frame_summary._original_line[start_offset - 1:end_offset - 1]
485-
)
485+
anchors = _extract_caret_anchors_from_line_segment(code_segment)
486486
else:
487-
end_offset = stripped_characters + len(stripped_line)
487+
# Don't count the newline since the anchors only need to
488+
# go up until the last character of the line.
489+
end_offset = len(line.rstrip())
488490

489491
# show indicators if primary char doesn't span the frame line
490492
if end_offset - start_offset < len(stripped_line) or (
491493
anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
494+
# When showing this on a terminal, some of the non-ASCII characters
495+
# might be rendered as double-width characters, so we need to take
496+
# that into account when calculating the length of the line.
497+
dp_start_offset = _display_width(line, start_offset) + 1
498+
dp_end_offset = _display_width(line, end_offset) + 1
499+
492500
row.append(' ')
493-
row.append(' ' * (start_offset - stripped_characters))
501+
row.append(' ' * (dp_start_offset - stripped_characters))
494502

495503
if anchors:
496-
row.append(anchors.primary_char * (anchors.left_end_offset))
497-
row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
498-
row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
504+
dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
505+
dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
506+
row.append(anchors.primary_char * dp_left_end_offset)
507+
row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
508+
row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
499509
else:
500-
row.append('^' * (end_offset - start_offset))
510+
row.append('^' * (dp_end_offset - dp_start_offset))
501511

502512
row.append('\n')
503513

@@ -618,6 +628,25 @@ def _extract_caret_anchors_from_line_segment(segment):
618628

619629
return None
620630

631+
_WIDE_CHAR_SPECIFIERS = "WF"
632+
633+
def _display_width(line, offset):
634+
"""Calculate the extra amount of width space the given source
635+
code segment might take if it were to be displayed on a fixed
636+
width output device. Supports wide unicode characters and emojis."""
637+
638+
# Fast track for ASCII-only strings
639+
if line.isascii():
640+
return offset
641+
642+
import unicodedata
643+
644+
return sum(
645+
2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
646+
for char in line[:offset]
647+
)
648+
649+
621650

622651
class _ExceptionPrintContext:
623652
def __init__(self):
+3Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Traceback location ranges involving wide Unicode characters (like emoji and
2+
Asian characters) are now properly highlighted. Patch by Batuhan Taskaya and
3+
Pablo Galindo.

‎Parser/pegen.c

Copy file name to clipboardExpand all lines: Parser/pegen.c
+55Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
3838
return size;
3939
}
4040

41+
// Calculate the extra amount of width space the given source
42+
// code segment might take if it were to be displayed on a fixed
43+
// width output device. Supports wide unicode characters and emojis.
44+
Py_ssize_t
45+
_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
46+
{
47+
PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
48+
if (!segment) {
49+
return -1;
50+
}
51+
52+
// Fast track for ascii strings
53+
if (PyUnicode_IS_ASCII(segment)) {
54+
Py_DECREF(segment);
55+
return character_offset;
56+
}
57+
58+
PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
59+
if (!width_fn) {
60+
return -1;
61+
}
62+
63+
Py_ssize_t width = 0;
64+
Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
65+
for (Py_ssize_t i = 0; i < len; i++) {
66+
PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
67+
if (!chr) {
68+
Py_DECREF(segment);
69+
Py_DECREF(width_fn);
70+
return -1;
71+
}
72+
73+
PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
74+
Py_DECREF(chr);
75+
if (!width_specifier) {
76+
Py_DECREF(segment);
77+
Py_DECREF(width_fn);
78+
return -1;
79+
}
80+
81+
if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
82+
_PyUnicode_EqualToASCIIString(width_specifier, "F")) {
83+
width += 2;
84+
}
85+
else {
86+
width += 1;
87+
}
88+
Py_DECREF(width_specifier);
89+
}
90+
91+
Py_DECREF(segment);
92+
Py_DECREF(width_fn);
93+
return width;
94+
}
95+
4196
// Here, mark is the start of the node, while p->mark is the end.
4297
// If node==NULL, they should be the same.
4398
int

‎Parser/pegen.h

Copy file name to clipboardExpand all lines: Parser/pegen.h
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ expr_ty _PyPegen_name_token(Parser *p);
143143
expr_ty _PyPegen_number_token(Parser *p);
144144
void *_PyPegen_string_token(Parser *p);
145145
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
146+
Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
146147

147148
// Error handling functions and APIs
148149
typedef enum {

‎Python/traceback.c

Copy file name to clipboardExpand all lines: Python/traceback.c
+33-2Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -907,8 +907,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
907907
goto done;
908908
}
909909

910-
if (print_error_location_carets(f, truncation, start_offset, end_offset,
911-
right_start_offset, left_end_offset,
910+
// Convert all offsets to display offsets (e.g. the space they would take up if printed
911+
// on the screen).
912+
Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset);
913+
if (dp_start < 0) {
914+
err = ignore_source_errors() < 0;
915+
goto done;
916+
}
917+
918+
Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset);
919+
if (dp_end < 0) {
920+
err = ignore_source_errors() < 0;
921+
goto done;
922+
}
923+
924+
Py_ssize_t dp_left_end = -1;
925+
Py_ssize_t dp_right_start = -1;
926+
if (has_secondary_ranges) {
927+
dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset);
928+
if (dp_left_end < 0) {
929+
err = ignore_source_errors() < 0;
930+
goto done;
931+
}
932+
933+
dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset);
934+
if (dp_right_start < 0) {
935+
err = ignore_source_errors() < 0;
936+
goto done;
937+
}
938+
}
939+
940+
941+
if (print_error_location_carets(f, truncation, dp_start, dp_end,
942+
dp_right_start, dp_left_end,
912943
primary_error_char, secondary_error_char) < 0) {
913944
err = -1;
914945
goto done;

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.