Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit d87b015

Browse files
gh-119118: Fix performance regression in tokenize module (#119615)
* gh-119118: Fix performance regression in tokenize module - Cache line object to avoid creating a Unicode object for all of the tokens in the same line. - Speed up byte offset to column offset conversion by using the smallest buffer possible to measure the difference. Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
1 parent ae9140f commit d87b015
Copy full SHA for d87b015

File tree

4 files changed

+68
-4
lines changed
Filter options

4 files changed

+68
-4
lines changed
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
Fix performance regression in the :mod:`tokenize` module by caching the ``line``
token attribute and calculating the column offset more efficiently.

‎Parser/pegen.c

Copy file name to clipboardExpand all lines: Parser/pegen.c
+25Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,31 @@ _PyPegen_interactive_exit(Parser *p)
1818
return NULL;
1919
}
2020

21+
Py_ssize_t
22+
_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
23+
{
24+
const char *data = PyUnicode_AsUTF8(line);
25+
26+
Py_ssize_t len = 0;
27+
while (col_offset < end_col_offset) {
28+
Py_UCS4 ch = data[col_offset];
29+
if (ch < 0x80) {
30+
col_offset += 1;
31+
} else if ((ch & 0xe0) == 0xc0) {
32+
col_offset += 2;
33+
} else if ((ch & 0xf0) == 0xe0) {
34+
col_offset += 3;
35+
} else if ((ch & 0xf8) == 0xf0) {
36+
col_offset += 4;
37+
} else {
38+
PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
39+
return -1;
40+
}
41+
len++;
42+
}
43+
return len;
44+
}
45+
2146
Py_ssize_t
2247
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
2348
{

‎Parser/pegen.h

Copy file name to clipboardExpand all lines: Parser/pegen.h
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ int _PyPegen_fill_token(Parser *p);
148148
expr_ty _PyPegen_name_token(Parser *p);
149149
expr_ty _PyPegen_number_token(Parser *p);
150150
void *_PyPegen_string_token(Parser *p);
151+
Py_ssize_t _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset);
151152
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
152153
Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
153154

‎Python/Python-tokenize.c

Copy file name to clipboardExpand all lines: Python/Python-tokenize.c
+40-4Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ typedef struct
3232
{
3333
PyObject_HEAD struct tok_state *tok;
3434
int done;
35+
36+
/* Needed to cache line for performance */
37+
PyObject *last_line;
38+
Py_ssize_t last_lineno;
39+
Py_ssize_t byte_col_offset_diff;
3540
} tokenizeriterobject;
3641

3742
/*[clinic input]
@@ -68,6 +73,11 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
6873
self->tok->tok_extra_tokens = 1;
6974
}
7075
self->done = 0;
76+
77+
self->last_line = NULL;
78+
self->byte_col_offset_diff = 0;
79+
self->last_lineno = 0;
80+
7181
return (PyObject *)self;
7282
}
7383

@@ -210,7 +220,18 @@ tokenizeriter_next(tokenizeriterobject *it)
210220
if (size >= 1 && it->tok->implicit_newline) {
211221
size -= 1;
212222
}
213-
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
223+
224+
if (it->tok->lineno != it->last_lineno) {
225+
// Line has changed since last token, so we fetch the new line and cache it
226+
// in the iter object.
227+
Py_XDECREF(it->last_line);
228+
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
229+
it->last_line = line;
230+
it->byte_col_offset_diff = 0;
231+
} else {
232+
// Line hasn't changed so we reuse the cached one.
233+
line = it->last_line;
234+
}
214235
}
215236
if (line == NULL) {
216237
Py_DECREF(str);
@@ -219,13 +240,28 @@ tokenizeriter_next(tokenizeriterobject *it)
219240

220241
Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
221242
Py_ssize_t end_lineno = it->tok->lineno;
243+
it->last_lineno = lineno;
244+
222245
Py_ssize_t col_offset = -1;
223246
Py_ssize_t end_col_offset = -1;
247+
Py_ssize_t byte_offset = -1;
224248
if (token.start != NULL && token.start >= line_start) {
225-
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
249+
byte_offset = token.start - line_start;
250+
col_offset = byte_offset - it->byte_col_offset_diff;
226251
}
227252
if (token.end != NULL && token.end >= it->tok->line_start) {
228-
end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
253+
Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
254+
if (lineno == end_lineno) {
255+
// If the whole token is at the same line, we can just use the token.start
256+
// buffer for figuring out the new column offset, since using line is not
257+
// performant for very long lines.
258+
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
259+
end_col_offset = col_offset + token_col_offset;
260+
it->byte_col_offset_diff += token.end - token.start - token_col_offset;
261+
} else {
262+
end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
263+
it->byte_col_offset_diff += end_byte_offset - end_col_offset;
264+
}
229265
}
230266

231267
if (it->tok->tok_extra_tokens) {
@@ -262,7 +298,7 @@ tokenizeriter_next(tokenizeriterobject *it)
262298
}
263299
}
264300

265-
result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
301+
result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
266302
exit:
267303
_PyToken_Free(&token);
268304
if (type == ENDMARKER) {

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.