Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 649857a

Browse filesBrowse files
jjslobodamethane
andauthored
gh-85287: Change codecs to raise precise UnicodeEncodeError and UnicodeDecodeError (#113674)
Co-authored-by: Inada Naoki <songofacandy@gmail.com>
1 parent c514a97 commit 649857a
Copy full SHA for 649857a

File tree

Expand file treeCollapse file tree

9 files changed

+306
-81
lines changed
Filter options
Expand file treeCollapse file tree

9 files changed

+306
-81
lines changed

‎Lib/encodings/idna.py

Copy file name to clipboardExpand all lines: Lib/encodings/idna.py
+117-47Lines changed: 117 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
sace_prefix = "xn--"
1212

1313
# This assumes query strings, so AllowUnassigned is true
14-
def nameprep(label):
14+
def nameprep(label): # type: (str) -> str
1515
# Map
1616
newlabel = []
1717
for c in label:
@@ -25,7 +25,7 @@ def nameprep(label):
2525
label = unicodedata.normalize("NFKC", label)
2626

2727
# Prohibit
28-
for c in label:
28+
for i, c in enumerate(label):
2929
if stringprep.in_table_c12(c) or \
3030
stringprep.in_table_c22(c) or \
3131
stringprep.in_table_c3(c) or \
@@ -35,7 +35,7 @@ def nameprep(label):
3535
stringprep.in_table_c7(c) or \
3636
stringprep.in_table_c8(c) or \
3737
stringprep.in_table_c9(c):
38-
raise UnicodeError("Invalid character %r" % c)
38+
raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")
3939

4040
# Check bidi
4141
RandAL = [stringprep.in_table_d1(x) for x in label]
@@ -46,59 +46,73 @@ def nameprep(label):
4646
# This is table C.8, which was already checked
4747
# 2) If a string contains any RandALCat character, the string
4848
# MUST NOT contain any LCat character.
49-
if any(stringprep.in_table_d2(x) for x in label):
50-
raise UnicodeError("Violation of BIDI requirement 2")
49+
for i, x in enumerate(label):
50+
if stringprep.in_table_d2(x):
51+
raise UnicodeEncodeError("idna", label, i, i+1,
52+
"Violation of BIDI requirement 2")
5153
# 3) If a string contains any RandALCat character, a
5254
# RandALCat character MUST be the first character of the
5355
# string, and a RandALCat character MUST be the last
5456
# character of the string.
55-
if not RandAL[0] or not RandAL[-1]:
56-
raise UnicodeError("Violation of BIDI requirement 3")
57+
if not RandAL[0]:
58+
raise UnicodeEncodeError("idna", label, 0, 1,
59+
"Violation of BIDI requirement 3")
60+
if not RandAL[-1]:
61+
raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
62+
"Violation of BIDI requirement 3")
5763

5864
return label
5965

60-
def ToASCII(label):
66+
def ToASCII(label): # type: (str) -> bytes
6167
try:
6268
# Step 1: try ASCII
63-
label = label.encode("ascii")
64-
except UnicodeError:
69+
label_ascii = label.encode("ascii")
70+
except UnicodeEncodeError:
6571
pass
6672
else:
6773
# Skip to step 3: UseSTD3ASCIIRules is false, so
6874
# Skip to step 8.
69-
if 0 < len(label) < 64:
70-
return label
71-
raise UnicodeError("label empty or too long")
75+
if 0 < len(label_ascii) < 64:
76+
return label_ascii
77+
if len(label) == 0:
78+
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
79+
else:
80+
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
7281

7382
# Step 2: nameprep
7483
label = nameprep(label)
7584

7685
# Step 3: UseSTD3ASCIIRules is false
7786
# Step 4: try ASCII
7887
try:
79-
label = label.encode("ascii")
80-
except UnicodeError:
88+
label_ascii = label.encode("ascii")
89+
except UnicodeEncodeError:
8190
pass
8291
else:
8392
# Skip to step 8.
8493
if 0 < len(label) < 64:
85-
return label
86-
raise UnicodeError("label empty or too long")
94+
return label_ascii
95+
if len(label) == 0:
96+
raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
97+
else:
98+
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
8799

88100
# Step 5: Check ACE prefix
89-
if label[:4].lower() == sace_prefix:
90-
raise UnicodeError("Label starts with ACE prefix")
101+
if label.lower().startswith(sace_prefix):
102+
raise UnicodeEncodeError(
103+
"idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")
91104

92105
# Step 6: Encode with PUNYCODE
93-
label = label.encode("punycode")
106+
label_ascii = label.encode("punycode")
94107

95108
# Step 7: Prepend ACE prefix
96-
label = ace_prefix + label
109+
label_ascii = ace_prefix + label_ascii
97110

98111
# Step 8: Check size
99-
if 0 < len(label) < 64:
100-
return label
101-
raise UnicodeError("label empty or too long")
112+
# do not check for empty as we prepend ace_prefix.
113+
if len(label_ascii) < 64:
114+
return label_ascii
115+
raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
102116

103117
def ToUnicode(label):
104118
if len(label) > 1024:
@@ -110,41 +124,51 @@ def ToUnicode(label):
110124
# per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
111125
# preventing us from wasting time decoding a big thing that'll just
112126
# hit the actual <= 63 length limit in Step 6.
113-
raise UnicodeError("label way too long")
127+
if isinstance(label, str):
128+
label = label.encode("utf-8", errors="backslashreplace")
129+
raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
114130
# Step 1: Check for ASCII
115131
if isinstance(label, bytes):
116132
pure_ascii = True
117133
else:
118134
try:
119135
label = label.encode("ascii")
120136
pure_ascii = True
121-
except UnicodeError:
137+
except UnicodeEncodeError:
122138
pure_ascii = False
123139
if not pure_ascii:
140+
assert isinstance(label, str)
124141
# Step 2: Perform nameprep
125142
label = nameprep(label)
126143
# It doesn't say this, but apparently, it should be ASCII now
127144
try:
128145
label = label.encode("ascii")
129-
except UnicodeError:
130-
raise UnicodeError("Invalid character in IDN label")
146+
except UnicodeEncodeError as exc:
147+
raise UnicodeEncodeError("idna", label, exc.start, exc.end,
148+
"Invalid character in IDN label")
131149
# Step 3: Check for ACE prefix
132-
if not label[:4].lower() == ace_prefix:
150+
assert isinstance(label, bytes)
151+
if not label.lower().startswith(ace_prefix):
133152
return str(label, "ascii")
134153

135154
# Step 4: Remove ACE prefix
136155
label1 = label[len(ace_prefix):]
137156

138157
# Step 5: Decode using PUNYCODE
139-
result = label1.decode("punycode")
158+
try:
159+
result = label1.decode("punycode")
160+
except UnicodeDecodeError as exc:
161+
offset = len(ace_prefix)
162+
raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)
140163

141164
# Step 6: Apply ToASCII
142165
label2 = ToASCII(result)
143166

144167
# Step 7: Compare the result of step 6 with the one of step 3
145168
# label2 will already be in lower case.
146169
if str(label, "ascii").lower() != str(label2, "ascii"):
147-
raise UnicodeError("IDNA does not round-trip", label, label2)
170+
raise UnicodeDecodeError("idna", label, 0, len(label),
171+
f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")
148172

149173
# Step 8: return the result of step 5
150174
return result
@@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):
156180

157181
if errors != 'strict':
158182
# IDNA is quite clear that implementations must be strict
159-
raise UnicodeError("unsupported error handling "+errors)
183+
raise UnicodeError(f"Unsupported error handling: {errors}")
160184

161185
if not input:
162186
return b'', 0
@@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
168192
else:
169193
# ASCII name: fast path
170194
labels = result.split(b'.')
171-
for label in labels[:-1]:
172-
if not (0 < len(label) < 64):
173-
raise UnicodeError("label empty or too long")
174-
if len(labels[-1]) >= 64:
175-
raise UnicodeError("label too long")
195+
for i, label in enumerate(labels[:-1]):
196+
if len(label) == 0:
197+
offset = sum(len(l) for l in labels[:i]) + i
198+
raise UnicodeEncodeError("idna", input, offset, offset+1,
199+
"label empty")
200+
for i, label in enumerate(labels):
201+
if len(label) >= 64:
202+
offset = sum(len(l) for l in labels[:i]) + i
203+
raise UnicodeEncodeError("idna", input, offset, offset+len(label),
204+
"label too long")
176205
return result, len(input)
177206

178207
result = bytearray()
@@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
182211
del labels[-1]
183212
else:
184213
trailing_dot = b''
185-
for label in labels:
214+
for i, label in enumerate(labels):
186215
if result:
187216
# Join with U+002E
188217
result.extend(b'.')
189-
result.extend(ToASCII(label))
218+
try:
219+
result.extend(ToASCII(label))
220+
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
221+
offset = sum(len(l) for l in labels[:i]) + i
222+
raise UnicodeEncodeError(
223+
"idna",
224+
input,
225+
offset + exc.start,
226+
offset + exc.end,
227+
exc.reason,
228+
)
190229
return bytes(result+trailing_dot), len(input)
191230

192231
def decode(self, input, errors='strict'):
193232

194233
if errors != 'strict':
195-
raise UnicodeError("Unsupported error handling "+errors)
234+
raise UnicodeError(f"Unsupported error handling: {errors}")
196235

197236
if not input:
198237
return "", 0
@@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
218257
trailing_dot = ''
219258

220259
result = []
221-
for label in labels:
222-
result.append(ToUnicode(label))
260+
for i, label in enumerate(labels):
261+
try:
262+
u_label = ToUnicode(label)
263+
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
264+
offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
265+
raise UnicodeDecodeError(
266+
"idna", input, offset+exc.start, offset+exc.end, exc.reason)
267+
else:
268+
result.append(u_label)
223269

224270
return ".".join(result)+trailing_dot, len(input)
225271

226272
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
227273
def _buffer_encode(self, input, errors, final):
228274
if errors != 'strict':
229275
# IDNA is quite clear that implementations must be strict
230-
raise UnicodeError("unsupported error handling "+errors)
276+
raise UnicodeError(f"Unsupported error handling: {errors}")
231277

232278
if not input:
233279
return (b'', 0)
@@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
251297
# Join with U+002E
252298
result.extend(b'.')
253299
size += 1
254-
result.extend(ToASCII(label))
300+
try:
301+
result.extend(ToASCII(label))
302+
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
303+
raise UnicodeEncodeError(
304+
"idna",
305+
input,
306+
size + exc.start,
307+
size + exc.end,
308+
exc.reason,
309+
)
255310
size += len(label)
256311

257312
result += trailing_dot
@@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
261316
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
262317
def _buffer_decode(self, input, errors, final):
263318
if errors != 'strict':
264-
raise UnicodeError("Unsupported error handling "+errors)
319+
raise UnicodeError("Unsupported error handling: {errors}")
265320

266321
if not input:
267322
return ("", 0)
@@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
271326
labels = dots.split(input)
272327
else:
273328
# Must be ASCII string
274-
input = str(input, "ascii")
329+
try:
330+
input = str(input, "ascii")
331+
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
332+
raise UnicodeDecodeError("idna", input,
333+
exc.start, exc.end, exc.reason)
275334
labels = input.split(".")
276335

277336
trailing_dot = ''
@@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
288347
result = []
289348
size = 0
290349
for label in labels:
291-
result.append(ToUnicode(label))
350+
try:
351+
u_label = ToUnicode(label)
352+
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
353+
raise UnicodeDecodeError(
354+
"idna",
355+
input.encode("ascii", errors="backslashreplace"),
356+
size + exc.start,
357+
size + exc.end,
358+
exc.reason,
359+
)
360+
else:
361+
result.append(u_label)
292362
if size:
293363
size += 1
294364
size += len(label)

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.