gh-133672: Allow LOAD_FAST to be optimized to LOAD_FAST_BORROW #133721

Open · wants to merge 10 commits into base: main
16 changes: 14 additions & 2 deletions — Lib/test/test_peepholer.py
@@ -2537,7 +2537,19 @@ def test_for_iter(self):
("LOAD_CONST", 0, 7),
("RETURN_VALUE", None, 8),
]
self.cfg_optimization_test(insts, insts, consts=[None])
expected = [
("LOAD_FAST_BORROW", 0, 1),
top := self.Label(),
("FOR_ITER", end := self.Label(), 2),
("STORE_FAST", 2, 3),
("JUMP", top, 4),
end,
("END_FOR", None, 5),
("POP_TOP", None, 6),
("LOAD_CONST", 0, 7),
("RETURN_VALUE", None, 8),
]
self.cfg_optimization_test(insts, expected, consts=[None])

def test_load_attr(self):
insts = [
@@ -2603,7 +2615,7 @@ def test_send(self):
("RETURN_VALUE", None, 7)
]
expected = [
("LOAD_FAST", 0, 1),
("LOAD_FAST_BORROW", 0, 1),
("LOAD_FAST_BORROW", 1, 2),
("SEND", end := self.Label(), 3),
("LOAD_CONST", 0, 4),
@@ -0,0 +1 @@
The peephole optimizer now converts LOAD_FAST to LOAD_FAST_BORROW more aggressively. The optimization is applied even if the loaded value is still on the operand stack at the end of a basic block, as long as the borrow can otherwise be shown to be safe. This can reduce reference counting overhead in certain code patterns, such as with iterables in loops.
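A rough illustration of the pattern this enables (a hypothetical example, not part of the patch): in the function below, the value loaded for `x` stays on the operand stack across the basic blocks of the conditional expression, so the previous block-local analysis could not borrow it, while the CFG-wide check can, because `x` is neither stored to nor deleted before the final addition consumes it. The `test_for_iter` change above exercises the analogous loop case, where the loaded value stays on the stack while FOR_ITER runs.

def pick(x, flag):
    # The reference pushed for `x` remains on the operand stack while the
    # branches of the conditional expression execute in separate blocks;
    # it is consumed only by the addition, and `x` is never reassigned or
    # deleted along either path, so a borrowed reference suffices.
    return x + (1 if flag else 2)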
321 changes: 312 additions & 9 deletions — Python/flowgraph.c
@@ -71,6 +71,8 @@ typedef struct _PyCfgBasicblock {
unsigned b_cold : 1;
/* b_warm is used by the cold-detection algorithm to mark blocks which are definitely not cold */
unsigned b_warm : 1;
/* b_dfs_visited_gen records the DFS generation in which the borrow-safety analysis last visited this block */
int b_dfs_visited_gen;
} basicblock;


@@ -95,6 +97,8 @@ typedef struct _PyCfgBuilder cfg_builder;
#define LOCATION(LNO, END_LNO, COL, END_COL) \
((const _Py_SourceLocation){(LNO), (END_LNO), (COL), (END_COL)})

static int cfg_dfs_generation_counter = 0;

static inline int
is_block_push(cfg_instr *i)
{
@@ -2701,6 +2705,239 @@ load_fast_push_block(basicblock ***sp, basicblock *target,
}
}

/*
* Recursively determine if a borrowed reference remains safe along all CFG paths.
*
* This function performs a DFS starting from 'current_block'.
* It tracks a specific value that was notionally loaded by a LOAD_FAST_BORROW
* instruction and is currently at 'depth_of_value_on_entry' on the operand stack
* (0 being TOS, -1 if already known to be consumed). The 'original_local_idx'
* is the index of the local variable from which this value was borrowed.
*
* A borrow is considered UNSAFE if, along any execution path before the
* borrowed value is consumed from the stack:
* 1. The 'original_local_idx' (the backing store for the borrow) is overwritten
* (e.g., by STORE_FAST, DELETE_FAST on that local).
* 2. The borrowed value itself, when at the top of the stack (depth 0), is
* consumed by an instruction that stores it into any local variable
* (e.g., STORE_FAST).
* 3. The CFG path ends (e.g., the end of the function is reached) or enters
*    a cycle without the value ever being consumed.
*
* The function uses 'cfg_dfs_generation_counter' in conjunction with
* 'current_block->b_dfs_visited_gen' to detect cycles within the current
* specific DFS traversal, preventing infinite loops and deeming such cyclic
* paths unsafe if the value isn't consumed within the cycle.
*
* Stack depths are tracked by:
* - 'depth_of_value_on_entry': The depth of the tracked item upon entering 'current_block'.
* - 'current_target_depth': The depth of the item as instructions in 'current_block' are processed.
* - 'depth_before_jump_op_effect': When a jump instruction is encountered, this
* is calculated by simulating all prior instructions in 'current_block' to find
* the item's depth *just before* the jump instruction itself has any stack effect.
* This precise depth is then used to calculate the 'target_entry_depth' for the
* recursive call to the jump's target block.
*
*/
static bool
is_borrow_safe(
basicblock *current_block,
Py_ssize_t depth_of_value_on_entry,
int original_local_idx,
cfg_builder *g)
{
if (depth_of_value_on_entry == -1) {
return true;
}

if (current_block->b_dfs_visited_gen == cfg_dfs_generation_counter) {
return false;
}
current_block->b_dfs_visited_gen = cfg_dfs_generation_counter;

Py_ssize_t current_target_depth = depth_of_value_on_entry;

for (int i = 0; i < current_block->b_iused; i++) {
cfg_instr *instr = &current_block->b_instr[i];
int opcode = instr->i_opcode;
int oparg = instr->i_oparg;

if ((opcode == STORE_FAST || opcode == DELETE_FAST || opcode == LOAD_FAST_AND_CLEAR) &&
oparg == original_local_idx) {
return false;
}

stack_effects effects_noj;
if (get_stack_effects(opcode, oparg, 0, &effects_noj) < 0) {
return false;
}
int num_popped = _PyOpcode_num_popped(opcode, oparg);
int num_pushed = _PyOpcode_num_pushed(opcode, oparg);

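// The tracked value is popped by this instruction. If it is at TOS and the
// instruction is STORE_FAST, the borrowed reference would be written into a
// local and could outlive its backing variable, which is unsafe; any other
// form of consumption ends the tracking safely.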
if (current_target_depth < num_popped) {
if (opcode == STORE_FAST && current_target_depth == 0) {
return false;
}
return true;
}
current_target_depth = current_target_depth - num_popped + num_pushed;

if (HAS_TARGET(opcode)) {
if (i != current_block->b_iused - 1) {
continue; // Skip jumps that are not the last instruction in the block.
}
bool safe_on_all_branches = true;

Py_ssize_t depth_before_jump_op_effect = depth_of_value_on_entry;
for (int k = 0; k < i; k++) {
cfg_instr *prev_instr_in_block = &current_block->b_instr[k];
// Kill check for intermediate instructions
if ((prev_instr_in_block->i_opcode == STORE_FAST ||
prev_instr_in_block->i_opcode == DELETE_FAST ||
prev_instr_in_block->i_opcode == LOAD_FAST_AND_CLEAR) &&
prev_instr_in_block->i_oparg == original_local_idx) {
return false;
}
stack_effects prev_effects;
if (get_stack_effects(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg, 0, &prev_effects) < 0) {
return false;
}
int prev_popped_val = _PyOpcode_num_popped(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
if (depth_before_jump_op_effect < prev_popped_val) {
if (prev_instr_in_block->i_opcode == STORE_FAST && depth_before_jump_op_effect == 0) {
return false;
}
return true;
}
depth_before_jump_op_effect = depth_before_jump_op_effect - prev_popped_val +
_PyOpcode_num_pushed(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
}

// Analyze jump target path
stack_effects effects_jump;
if (get_stack_effects(opcode, oparg, 1, &effects_jump) < 0) {
    return false;
}
int jump_popped_val = _PyOpcode_num_popped(opcode, oparg);
Py_ssize_t target_entry_depth = depth_before_jump_op_effect;

if (target_entry_depth < jump_popped_val) {
if (opcode == STORE_FAST && target_entry_depth == 0) {
safe_on_all_branches = false;
}
} else { // Not consumed by jump op, recurse on target.
target_entry_depth = target_entry_depth - jump_popped_val + _PyOpcode_num_pushed(opcode, oparg);
if (!is_borrow_safe(instr->i_target, target_entry_depth, original_local_idx, g)) {
safe_on_all_branches = false;
}
}

// Analyze fallthrough path for conditional jumps, if the jump path was safe.
if (safe_on_all_branches && IS_CONDITIONAL_JUMP_OPCODE(opcode) && current_block->b_next) {
if (!is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g)) {
safe_on_all_branches = false;
}
}
return safe_on_all_branches;
}
}

// When the instructions finish and the value isn't consumed, check fallthrough.
if (current_block->b_next && BB_HAS_FALLTHROUGH(current_block)) {
return is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g);
}

/*
No further path (or no fallthrough) and value is still on stack (current_target_depth is valid).
This means it's unconsumed at the end of a CFG path, so it's unsafe.
*/
return false;
}


/*
* Initiates a CFG-wide safety check for a borrowed reference.
*
* This function is called when a LOAD_FAST instruction is a candidate for
* LOAD_FAST_BORROW, but its produced value ('REF_UNCONSUMED' flag is set)
* might live beyond the current basic block ('producer_block').
*
* It determines the immediate successor(s) of the 'producer_block' and the
* stack depth at which the borrowed value (identified by 'original_local_idx'
* and initially at 'depth_of_value_at_producer_end' from TOS) would enter
* these successors.
*
* It then calls 'is_borrow_safe()' for each successor path. The borrow is
* considered globally safe only if 'is_borrow_safe()' returns true for ALL
* possible successor paths.
*
* A new DFS traversal is started by incrementing 'cfg_dfs_generation_counter'
* to ensure that 'is_borrow_safe()' uses a fresh set of visited markers
* ('b_dfs_visited_gen') for its analysis.
*
*/
static bool
check_borrow_safety_globally(
basicblock *producer_block,
Py_ssize_t depth_of_value_at_producer_end,
int original_local_idx,
cfg_builder *g)
{
cfg_dfs_generation_counter++;
bool overall_safety = true;

cfg_instr *last_instr = basicblock_last_instr(producer_block);

// A depth of -1 means the value was already consumed or cannot be tracked.
if (depth_of_value_at_producer_end == -1) {
return false;
}

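// The block ends in a jump: compute the value's depth on entry to the jump
// target (and, for conditional jumps, the fallthrough block) and recurse.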
if (last_instr && HAS_TARGET(last_instr->i_opcode)) {
stack_effects effects_jump;
if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 1, &effects_jump) < 0) {
    return false;
}
int jump_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
Py_ssize_t entry_depth_for_target = depth_of_value_at_producer_end;

if (entry_depth_for_target < jump_popped) {
if (last_instr->i_opcode == STORE_FAST && entry_depth_for_target == 0) {
overall_safety = false;
}
} else {
entry_depth_for_target = entry_depth_for_target - jump_popped +
_PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
if (!is_borrow_safe(last_instr->i_target, entry_depth_for_target, original_local_idx, g)) {
overall_safety = false;
}
}

// Analyze fallthrough path for conditional jumps, if the jump path was safe.
if (overall_safety && IS_CONDITIONAL_JUMP_OPCODE(last_instr->i_opcode) && producer_block->b_next) {
stack_effects effects_fall;
if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 0, &effects_fall) < 0) {
    return false;
}
int fall_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
Py_ssize_t entry_depth_for_fallthrough = depth_of_value_at_producer_end;

if (entry_depth_for_fallthrough < fall_popped) {
if (last_instr->i_opcode == STORE_FAST && entry_depth_for_fallthrough == 0) {
overall_safety = false;
}
} else { // Recurse on fallthrough.
entry_depth_for_fallthrough = entry_depth_for_fallthrough - fall_popped +
_PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
if (!is_borrow_safe(producer_block->b_next, entry_depth_for_fallthrough, original_local_idx, g)) {
overall_safety = false;
}
}
}
} else if (producer_block->b_next && BB_HAS_FALLTHROUGH(producer_block)) { // Standard fallthrough, no jump.
if (!is_borrow_safe(producer_block->b_next, depth_of_value_at_producer_end, original_local_idx, g)) {
overall_safety = false;
}
} else {
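// No jump target and no fallthrough: the value would remain unconsumed at
// the end of this path, so the borrow is unsafe.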
overall_safety = false;
}
return overall_safety;
}

/*
* Strength reduce LOAD_FAST{_LOAD_FAST} instructions into faster variants that
* load borrowed references onto the operand stack.
@@ -2747,6 +2984,7 @@ optimize_load_fast(cfg_builder *g)
basicblock *entryblock = g->g_entryblock;
for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
max_instrs = Py_MAX(max_instrs, b->b_iused);
b->b_dfs_visited_gen = 0;
}
size_t instr_flags_size = max_instrs * sizeof(uint8_t);
uint8_t *instr_flags = PyMem_Malloc(instr_flags_size);
@@ -2976,17 +3214,82 @@

// Optimize instructions
for (int i = 0; i < block->b_iused; i++) {
if (!instr_flags[i]) {
cfg_instr *instr = &block->b_instr[i];
switch (instr->i_opcode) {
case LOAD_FAST:
cfg_instr *instr = &block->b_instr[i];
bool is_load_fast_type = (instr->i_opcode == LOAD_FAST || instr->i_opcode == LOAD_FAST_LOAD_FAST);

if (is_load_fast_type) {
bool killed_or_stored_locally = (instr_flags[i] & (SUPPORT_KILLED | STORED_AS_LOCAL));
if (killed_or_stored_locally) {
continue; // Cannot borrow: the backing local is killed or the value is stored into a local within this block.
}

bool unconsumed_in_block = (instr_flags[i] & REF_UNCONSUMED);
bool can_borrow = false;

if (!unconsumed_in_block) {
can_borrow = true; // Safe by local analysis, consumed in current block
} else {
if (instr->i_opcode == LOAD_FAST) {
int local_idx = instr->i_oparg;
Py_ssize_t depth_from_tos_at_block_end = -1;
// Find the specific item on the `refs` stack (at end of current block simulation)
for (Py_ssize_t k = 0; k < refs.size; k++) {
ref r = ref_stack_at(&refs, k);
if (r.instr == i && r.local == local_idx) { // Match instruction and local
depth_from_tos_at_block_end = refs.size - 1 - k;
break;
}
}

if (depth_from_tos_at_block_end != -1) {
can_borrow = check_borrow_safety_globally(block, depth_from_tos_at_block_end, local_idx, g);
} else {
// If REF_UNCONSUMED is set but we couldn't find its depth, assume unsafe.
// This can happen if refs.size is 0, yet flag is set.
can_borrow = false;
}

} else { // LOAD_FAST_LOAD_FAST
can_borrow = true; // Assume true, prove false if any part is unsafe
int local_idx1 = instr->i_oparg >> 4;
int local_idx2 = instr->i_oparg & 15;
Py_ssize_t depth1 = -1, depth2 = -1;

// Find the depths on the `refs` stack of both values pushed by this LOAD_FAST_LOAD_FAST instruction `i`
for (Py_ssize_t k = 0; k < refs.size; k++) {
ref r = ref_stack_at(&refs, k);
if (r.instr == i) {
Py_ssize_t current_depth = refs.size - 1 - k;
if (r.local == local_idx1 && depth1 == -1) {
    depth1 = current_depth;
}
else if (r.local == local_idx2 && depth2 == -1) {
    depth2 = current_depth;
}
}
}

bool found_any_on_stack = (depth1 != -1 || depth2 != -1);

if (depth1 != -1) {
if (!check_borrow_safety_globally(block, depth1, local_idx1, g)) {
can_borrow = false;
}
}
if (can_borrow && depth2 != -1) {
if (!check_borrow_safety_globally(block, depth2, local_idx2, g)) {
can_borrow = false;
}
}

if (unconsumed_in_block && !found_any_on_stack) {
can_borrow = false;
}
}
}

if (can_borrow) {
if (instr->i_opcode == LOAD_FAST) {
instr->i_opcode = LOAD_FAST_BORROW;
break;
case LOAD_FAST_LOAD_FAST:
} else if (instr->i_opcode == LOAD_FAST_LOAD_FAST) {
instr->i_opcode = LOAD_FAST_BORROW_LOAD_FAST_BORROW;
break;
default:
break;
}
}
}
}