Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 07df93d

Browse filesBrowse files
authored
gh-119105: difflib.py Differ.compare is too slow [for degenerate cases] (#119376)
Track all pairs achieving the best ratio in Differ(). This repairs the "very deep recursion and cubic time" bad cases in a way that preserves previous output.
1 parent e3bf538 commit 07df93d
Copy full SHA for 07df93d

File tree

Expand file treeCollapse file tree

2 files changed

+89
-59
lines changed
Filter options
Expand file treeCollapse file tree

2 files changed

+89
-59
lines changed

‎Lib/difflib.py

Copy file name to clipboardExpand all lines: Lib/difflib.py
+69-59Lines changed: 69 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -908,95 +908,105 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
908908
+ abcdefGhijkl
909909
? ^ ^ ^
910910
"""
911-
912-
# don't synch up unless the lines have a similarity score of at
913-
# least cutoff; best_ratio tracks the best score seen so far
914-
# best_ratio is a tuple storing the best .ratio() seen so far, and
915-
# a measure of how far the indices are from their index range
916-
# midpoints. The latter is used to resolve ratio ties. Favoring
917-
# indices near the midpoints tends to cut the ranges in half. Else,
918-
# if there are many pairs with the best ratio, recursion can grow
919-
# very deep, and runtime becomes cubic. See:
911+
from operator import ge, gt
912+
# Don't synch up unless the lines have a similarity score of at
913+
# least cutoff; best_ratio tracks the best score seen so far.
914+
# Keep track of all index pairs achieving the best ratio and
915+
# deal with them here. Previously only the smallest pair was
916+
# handled here, and if there are many pairs with the best ratio,
917+
# recursion could grow very deep, and runtime cubic. See:
920918
# https://github.com/python/cpython/issues/119105
921-
best_ratio, cutoff = (0.74, 0), 0.75
919+
best_ratio, cutoff = 0.74, 0.75
922920
cruncher = SequenceMatcher(self.charjunk)
923921
eqi, eqj = None, None # 1st indices of equal lines (if any)
922+
# List of index pairs achieving best_ratio. Strictly increasing
923+
# in both index positions.
924+
max_pairs = []
925+
maxi = -1 # `i` index of last pair in max_pairs
924926

925927
# search for the pair that matches best without being identical
926-
# (identical lines must be junk lines, & we don't want to synch up
927-
# on junk -- unless we have to)
928-
amid = (alo + ahi - 1) / 2
929-
bmid = (blo + bhi - 1) / 2
928+
# (identical lines must be junk lines, & we don't want to synch
929+
# up on junk -- unless we have to)
930+
crqr = cruncher.real_quick_ratio
931+
cqr = cruncher.quick_ratio
932+
cr = cruncher.ratio
930933
for j in range(blo, bhi):
931934
bj = b[j]
932935
cruncher.set_seq2(bj)
933-
weight_j = - abs(j - bmid)
936+
# Find new best, if possible. Else search for the smallest i
937+
# (if any) > maxi that equals the best ratio
938+
search_equal = True
934939
for i in range(alo, ahi):
935940
ai = a[i]
936941
if ai == bj:
937942
if eqi is None:
938943
eqi, eqj = i, j
939944
continue
940945
cruncher.set_seq1(ai)
941-
# weight is used to balance the recursion by prioritizing
942-
# i and j in the middle of their ranges
943-
weight = weight_j - abs(i - amid)
944946
# computing similarity is expensive, so use the quick
945947
# upper bounds first -- have seen this speed up messy
946948
# compares by a factor of 3.
947-
# note that ratio() is only expensive to compute the first
948-
# time it's called on a sequence pair; the expensive part
949-
# of the computation is cached by cruncher
950-
if (cruncher.real_quick_ratio(), weight) > best_ratio and \
951-
(cruncher.quick_ratio(), weight) > best_ratio and \
952-
(cruncher.ratio(), weight) > best_ratio:
953-
best_ratio, best_i, best_j = (cruncher.ratio(), weight), i, j
954-
best_ratio, _ = best_ratio
949+
cmp = ge if search_equal and i > maxi else gt
950+
if (cmp(crqr(), best_ratio)
951+
and cmp(cqr(), best_ratio)
952+
and cmp((ratio := cr()), best_ratio)):
953+
if ratio > best_ratio:
954+
best_ratio = ratio
955+
max_pairs.clear()
956+
else:
957+
assert best_ratio == ratio and search_equal
958+
assert i > maxi
959+
max_pairs.append((i, j))
960+
maxi = i
961+
search_equal = False
955962
if best_ratio < cutoff:
963+
assert not max_pairs
956964
# no non-identical "pretty close" pair
957965
if eqi is None:
958966
# no identical pair either -- treat it as a straight replace
959967
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
960968
return
961969
# no close pair, but an identical pair -- synch up on that
962-
best_i, best_j, best_ratio = eqi, eqj, 1.0
970+
max_pairs = [(eqi, eqj)]
963971
else:
964972
# there's a close pair, so forget the identical pair (if any)
973+
assert max_pairs
965974
eqi = None
966975

967-
# a[best_i] very similar to b[best_j]; eqi is None iff they're not
968-
# identical
969-
970-
# pump out diffs from before the synch point
971-
yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
972-
973-
# do intraline marking on the synch pair
974-
aelt, belt = a[best_i], b[best_j]
975-
if eqi is None:
976-
# pump out a '-', '?', '+', '?' quad for the synched lines
977-
atags = btags = ""
978-
cruncher.set_seqs(aelt, belt)
979-
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
980-
la, lb = ai2 - ai1, bj2 - bj1
981-
if tag == 'replace':
982-
atags += '^' * la
983-
btags += '^' * lb
984-
elif tag == 'delete':
985-
atags += '-' * la
986-
elif tag == 'insert':
987-
btags += '+' * lb
988-
elif tag == 'equal':
989-
atags += ' ' * la
990-
btags += ' ' * lb
991-
else:
992-
raise ValueError('unknown tag %r' % (tag,))
993-
yield from self._qformat(aelt, belt, atags, btags)
994-
else:
995-
# the synch pair is identical
996-
yield ' ' + aelt
976+
last_i, last_j = alo, blo
977+
for this_i, this_j in max_pairs:
978+
# pump out diffs from before the synch point
979+
yield from self._fancy_helper(a, last_i, this_i,
980+
b, last_j, this_j)
981+
# do intraline marking on the synch pair
982+
aelt, belt = a[this_i], b[this_j]
983+
if eqi is None:
984+
# pump out a '-', '?', '+', '?' quad for the synched lines
985+
atags = btags = ""
986+
cruncher.set_seqs(aelt, belt)
987+
for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
988+
la, lb = ai2 - ai1, bj2 - bj1
989+
if tag == 'replace':
990+
atags += '^' * la
991+
btags += '^' * lb
992+
elif tag == 'delete':
993+
atags += '-' * la
994+
elif tag == 'insert':
995+
btags += '+' * lb
996+
elif tag == 'equal':
997+
atags += ' ' * la
998+
btags += ' ' * lb
999+
else:
1000+
raise ValueError('unknown tag %r' % (tag,))
1001+
yield from self._qformat(aelt, belt, atags, btags)
1002+
else:
1003+
# the synch pair is identical
1004+
yield ' ' + aelt
1005+
last_i, last_j = this_i + 1, this_j + 1
9971006

998-
# pump out diffs from after the synch point
999-
yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
1007+
# pump out diffs from after the last synch point
1008+
yield from self._fancy_helper(a, last_i, ahi,
1009+
b, last_j, bhi)
10001010

10011011
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
10021012
g = []

‎Lib/test/test_difflib.py

Copy file name to clipboardExpand all lines: Lib/test/test_difflib.py
+20Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,26 @@ def test_make_file_usascii_charset_with_nonascii_input(self):
272272
self.assertIn('content="text/html; charset=us-ascii"', output)
273273
self.assertIn('&#305;mpl&#305;c&#305;t', output)
274274

275+
class TestDiffer(unittest.TestCase):
276+
def test_close_matches_aligned(self):
277+
# Of the 4 closely matching pairs, we want 1 to match with 3,
278+
# and 2 with 4, to align with a "top to bottom" mental model.
279+
a = ["cat\n", "dog\n", "close match 1\n", "close match 2\n"]
280+
b = ["close match 3\n", "close match 4\n", "kitten\n", "puppy\n"]
281+
m = difflib.Differ().compare(a, b)
282+
self.assertEqual(list(m),
283+
['- cat\n',
284+
'- dog\n',
285+
'- close match 1\n',
286+
'? ^\n',
287+
'+ close match 3\n',
288+
'? ^\n',
289+
'- close match 2\n',
290+
'? ^\n',
291+
'+ close match 4\n',
292+
'? ^\n',
293+
'+ kitten\n',
294+
'+ puppy\n'])
275295

276296
class TestOutputFormat(unittest.TestCase):
277297
def test_tab_delimiter(self):

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.