Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit a32fb28

Browse files
committed
RFC: Support full Unicode in lexer
Replicates graphql/graphql-js@8ca3d89
1 parent 0fa2c49 commit a32fb28
Copy full SHA for a32fb28

File tree

6 files changed

+463
-36
lines changed
Filter options

6 files changed

+463
-36
lines changed

‎src/graphql/language/lexer.py

Copy file name to clipboardExpand all lines: src/graphql/language/lexer.py
+93-15Lines changed: 93 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,16 @@ def print_code_point_at(self, location: int) -> str:
6767
if location >= len(body):
6868
return TokenKind.EOF.value
6969
char = body[location]
70-
code = ord(char)
7170
# Printable ASCII
72-
if 0x20 <= code <= 0x7E:
71+
if "\x20" <= char <= "\x7E":
7372
return "'\"'" if char == '"' else f"'{char}'"
7473
# Unicode code point
75-
return f"U+{code:04X}"
74+
point = (
75+
decode_surrogate_pair(ord(char), ord(body[location + 1]))
76+
if is_supplementary_code_point(body, location)
77+
else ord(char)
78+
)
79+
return f"U+{point:04X}"
7680

7781
def create_token(
7882
self, kind: TokenKind, start: int, end: int, value: Optional[str] = None
@@ -141,7 +145,8 @@ def read_next_token(self, start: int) -> Token:
141145
if char == "'"
142146
else (
143147
f"Unexpected character: {self.print_code_point_at(position)}."
144-
if is_source_character(char)
148+
if is_unicode_scalar_value(char)
149+
or is_supplementary_code_point(body, position)
145150
else f"Invalid character: {self.print_code_point_at(position)}."
146151
)
147152
)
@@ -158,10 +163,14 @@ def read_comment(self, start: int) -> Token:
158163
position = start + 1
159164
while position < body_length:
160165
char = body[position]
161-
162-
if char in "\r\n" or not is_source_character(char):
166+
if char in "\r\n":
163167
break
164-
position += 1
168+
if is_unicode_scalar_value(char):
169+
position += 1
170+
elif is_supplementary_code_point(body, position):
171+
position += 2
172+
else:
173+
break # pragma: no cover
165174

166175
return self.create_token(
167176
TokenKind.COMMENT,
@@ -270,7 +279,11 @@ def read_string(self, start: int) -> Token:
270279
if char == "\\":
271280
append(body[chunk_start:position])
272281
escape = (
273-
self.read_escaped_unicode(position)
282+
(
283+
self.read_escaped_unicode_variable_width(position)
284+
if body[position + 2 : position + 3] == "{"
285+
else self.read_escaped_unicode_fixed_width(position)
286+
)
274287
if body[position + 1 : position + 2] == "u"
275288
else self.read_escaped_character(position)
276289
)
@@ -282,8 +295,10 @@ def read_string(self, start: int) -> Token:
282295
if char in "\r\n":
283296
break
284297

285-
if is_source_character(char):
298+
if is_unicode_scalar_value(char):
286299
position += 1
300+
elif is_supplementary_code_point(body, position):
301+
position += 2
287302
else:
288303
raise GraphQLSyntaxError(
289304
self.source,
@@ -294,11 +309,50 @@ def read_string(self, start: int) -> Token:
294309

295310
raise GraphQLSyntaxError(self.source, position, "Unterminated string.")
296311

297-
def read_escaped_unicode(self, position: int) -> EscapeSequence:
312+
def read_escaped_unicode_variable_width(self, position: int) -> EscapeSequence:
    """Read a variable-width Unicode escape sequence (``\\u{XXXX}``).

    ``position`` is the index of the backslash that starts the escape. Scanning
    begins three characters later, i.e. just past the ``\\u{`` prefix.

    Returns an ``EscapeSequence`` built from the decoded character and the
    number of source characters the escape occupies (including the prefix and
    the closing brace).

    Raises ``GraphQLSyntaxError`` when the sequence is empty, is not closed
    within 12 characters, contains a character for which ``read_hex_digit``
    yields a negative value, or encodes something other than a Unicode scalar
    value (a surrogate, or a value above U+10FFFF).
    """
    body = self.source.body
    point = 0
    # Skip the three characters of the `\u{` prefix.
    size = 3
    # Cannot be larger than 12 chars (\u{00000000}); also never read past the
    # end of the body.
    max_size = min(12, len(body) - position)
    while size < max_size:
        char = body[position + size]
        size += 1
        if char == "}":
            # Must be at least 5 chars (\u{0}) and encode a Unicode scalar value.
            if size < 5 or not (
                0 <= point <= 0xD7FF or 0xE000 <= point <= 0x10FFFF
            ):
                break
            return EscapeSequence(chr(point), size)
        # Append this hex digit to the code point. OR-ing in a negative digit
        # value turns the whole accumulator negative, which is detected below.
        point = (point << 4) | read_hex_digit(char)
        if point < 0:
            break

    # Every failure path lands here; the message quotes what was consumed so far.
    raise GraphQLSyntaxError(
        self.source,
        position,
        f"Invalid Unicode escape sequence: '{body[position: position + size]}'.",
    )
338+
339+
def read_escaped_unicode_fixed_width(self, position: int) -> EscapeSequence:
298340
body = self.source.body
299341
code = read_16_bit_hex_code(body, position + 2)
300-
if code >= 0:
342+
343+
if 0 <= code <= 0xD7FF or 0xE000 <= code <= 0x10FFFF:
301344
return EscapeSequence(chr(code), 6)
345+
346+
# GraphQL allows JSON-style surrogate pair escape sequences, but only when
347+
# a valid pair is formed.
348+
if 0xD800 <= code <= 0xDBFF:
349+
if body[position + 6 : position + 8] == "\\u":
350+
trailing_code = read_16_bit_hex_code(body, position + 8)
351+
if 0xDC00 <= trailing_code <= 0xDFFF:
352+
return EscapeSequence(
353+
chr(decode_surrogate_pair(code, trailing_code)), 12
354+
)
355+
302356
raise GraphQLSyntaxError(
303357
self.source,
304358
position,
@@ -351,8 +405,10 @@ def read_block_string(self, start: int) -> Token:
351405
self.line_start = position
352406
continue
353407

354-
if is_source_character(char):
408+
if is_unicode_scalar_value(char):
355409
position += 1
410+
elif is_supplementary_code_point(body, position):
411+
position += 2
356412
else:
357413
raise GraphQLSyntaxError(
358414
self.source,
@@ -477,9 +533,31 @@ def read_hex_digit(char: str) -> int:
477533
return -1
478534

479535

480-
def is_source_character(char: str) -> bool:
481-
"""Check whether this is a SourceCharacter"""
482-
return char >= " " or char in "\t\r\n"
536+
def is_unicode_scalar_value(char: str) -> bool:
    """Check whether this is a Unicode scalar value.

    A Unicode scalar value is any Unicode code point except surrogate code
    points. In other words, the inclusive ranges of values 0x0000 to 0xD7FF and
    0xE000 to 0x10FFFF.
    """
    # Plain string comparison is enough here: the two ranges sit on either side
    # of the surrogate block U+D800-U+DFFF.
    return "\x00" <= char <= "\ud7ff" or "\ue000" <= char <= "\U0010ffff"
544+
545+
546+
def is_supplementary_code_point(body: str, location: int) -> bool:
    """Check whether the current location is a supplementary code point.

    The GraphQL specification defines source text as a sequence of unicode scalar
    values (which Unicode defines to exclude surrogate code points).

    Returns True only when ``body[location]`` is a leading surrogate
    (U+D800-U+DBFF) immediately followed by a trailing surrogate
    (U+DC00-U+DFFF). Returns False when either character is missing, for
    example a lone leading surrogate at the very end of ``body``.
    """
    try:
        return (
            "\ud800" <= body[location] <= "\udbff"
            and "\udc00" <= body[location + 1] <= "\udfff"
        )
    except IndexError:
        # Ran off the end of the body: there is no complete pair here.
        return False
557+
558+
559+
def decode_surrogate_pair(leading: int, trailing: int) -> int:
    """Combine the two code units of a surrogate pair into one code point.

    The low ten bits of ``leading`` become the high ten bits of the offset and
    the low ten bits of ``trailing`` become the low ten bits; the offset is then
    added to 0x10000. The arguments are not validated: any bits above the low
    ten are simply masked away.
    """
    return 0x10000 + (((leading & 0x03FF) << 10) | (trailing & 0x03FF))
483561

484562

485563
def is_name_start(char: str) -> bool:

‎src/graphql/language/print_string.py

Copy file name to clipboard
+81Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
__all__ = ["print_string"]


def print_string(s: str) -> str:
    """Print a string as a GraphQL StringValue literal.

    Replaces control characters and excluded characters (" U+0022 and \\ U+005C)
    with escape sequences.
    """
    return f'"{s.translate(escape_sequences)}"'


# Translation table for ``str.translate``: code point -> replacement text.
escape_sequences = {
    # U+0000-U+001F and U+007F-U+009F default to the fixed-width
    # ``\uXXXX`` form with uppercase hex digits ...
    **{code: f"\\u{code:04X}" for code in (*range(0x20), *range(0x7F, 0xA0))},
    # ... except for the characters that have a dedicated short escape.
    0x08: "\\b",
    0x09: "\\t",
    0x0A: "\\n",
    0x0C: "\\f",
    0x0D: "\\r",
    # The two printable characters that cannot appear raw inside a quoted string.
    0x22: '\\"',
    0x5C: "\\\\",
}

‎src/graphql/language/printer.py

Copy file name to clipboardExpand all lines: src/graphql/language/printer.py
+3-3Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from json import dumps
21
from typing import Any, Collection, Optional
32

43
from ..language.ast import Node, OperationType
5-
from .visitor import visit, Visitor
64
from .block_string import print_block_string
5+
from .print_string import print_string
6+
from .visitor import visit, Visitor
77

88
__all__ = ["print_ast"]
99

@@ -148,7 +148,7 @@ def leave_float_value(node: PrintedNode, *_args: Any) -> str:
148148
def leave_string_value(node: PrintedNode, *_args: Any) -> str:
149149
if node.block:
150150
return print_block_string(node.value)
151-
return dumps(node.value)
151+
return print_string(node.value)
152152

153153
@staticmethod
154154
def leave_boolean_value(node: PrintedNode, *_args: Any) -> str:

0 commit comments

Comments
0 (0)
Morty Proxy: This is a proxified and sanitized view of the page; visit the original site.