@@ -67,12 +67,16 @@ def print_code_point_at(self, location: int) -> str:
67
67
if location >= len (body ):
68
68
return TokenKind .EOF .value
69
69
char = body [location ]
70
- code = ord (char )
71
70
# Printable ASCII
72
- if 0x20 <= code <= 0x7E :
71
+ if " \x20 " <= char <= " \x7E " :
73
72
return "'\" '" if char == '"' else f"'{ char } '"
74
73
# Unicode code point
75
- return f"U+{ code :04X} "
74
+ point = (
75
+ decode_surrogate_pair (ord (char ), ord (body [location + 1 ]))
76
+ if is_supplementary_code_point (body , location )
77
+ else ord (char )
78
+ )
79
+ return f"U+{ point :04X} "
76
80
77
81
def create_token (
78
82
self , kind : TokenKind , start : int , end : int , value : Optional [str ] = None
@@ -141,7 +145,8 @@ def read_next_token(self, start: int) -> Token:
141
145
if char == "'"
142
146
else (
143
147
f"Unexpected character: { self .print_code_point_at (position )} ."
144
- if is_source_character (char )
148
+ if is_unicode_scalar_value (char )
149
+ or is_supplementary_code_point (body , position )
145
150
else f"Invalid character: { self .print_code_point_at (position )} ."
146
151
)
147
152
)
@@ -158,10 +163,14 @@ def read_comment(self, start: int) -> Token:
158
163
position = start + 1
159
164
while position < body_length :
160
165
char = body [position ]
161
-
162
- if char in "\r \n " or not is_source_character (char ):
166
+ if char in "\r \n " :
163
167
break
164
- position += 1
168
+ if is_unicode_scalar_value (char ):
169
+ position += 1
170
+ elif is_supplementary_code_point (body , position ):
171
+ position += 2
172
+ else :
173
+ break # pragma: no cover
165
174
166
175
return self .create_token (
167
176
TokenKind .COMMENT ,
@@ -270,7 +279,11 @@ def read_string(self, start: int) -> Token:
270
279
if char == "\\ " :
271
280
append (body [chunk_start :position ])
272
281
escape = (
273
- self .read_escaped_unicode (position )
282
+ (
283
+ self .read_escaped_unicode_variable_width (position )
284
+ if body [position + 2 : position + 3 ] == "{"
285
+ else self .read_escaped_unicode_fixed_width (position )
286
+ )
274
287
if body [position + 1 : position + 2 ] == "u"
275
288
else self .read_escaped_character (position )
276
289
)
@@ -282,8 +295,10 @@ def read_string(self, start: int) -> Token:
282
295
if char in "\r \n " :
283
296
break
284
297
285
- if is_source_character (char ):
298
+ if is_unicode_scalar_value (char ):
286
299
position += 1
300
+ elif is_supplementary_code_point (body , position ):
301
+ position += 2
287
302
else :
288
303
raise GraphQLSyntaxError (
289
304
self .source ,
@@ -294,11 +309,50 @@ def read_string(self, start: int) -> Token:
294
309
295
310
raise GraphQLSyntaxError (self .source , position , "Unterminated string." )
296
311
297
- def read_escaped_unicode (self , position : int ) -> EscapeSequence :
312
def read_escaped_unicode_variable_width(self, position: int) -> EscapeSequence:
    """Read a braced, variable-width Unicode escape from the source body.

    ``position`` is the index of the backslash that starts the escape. The
    hex digits between the braces are accumulated into a code point, which
    must be a Unicode scalar value (0x0000-0xD7FF or 0xE000-0x10FFFF).

    Returns an ``EscapeSequence`` holding the decoded character and the
    number of source characters consumed. Raises ``GraphQLSyntaxError`` if
    the sequence is unterminated, too long, empty, contains a character
    rejected by ``read_hex_digit``, or encodes a non-scalar value.
    """
    body = self.source.body
    code_point = 0
    # Start just past the three-character prefix (backslash, "u", "{").
    consumed = 3
    # Cannot be larger than 12 chars (\u{00000000}), nor run past the body.
    limit = min(12, len(body) - position)
    while consumed < limit:
        char = body[position + consumed]
        consumed += 1
        if char != "}":
            # Shift the next hex digit into the code point. A negative
            # result means read_hex_digit returned a negative value for
            # this character, so the sequence is invalid.
            code_point = (code_point << 4) | read_hex_digit(char)
            if code_point < 0:
                break
            continue
        # Closing brace reached: the sequence must be at least 5 chars
        # (\u{0}) and must encode a Unicode scalar value.
        is_scalar_value = (
            0 <= code_point <= 0xD7FF or 0xE000 <= code_point <= 0x10FFFF
        )
        if consumed >= 5 and is_scalar_value:
            return EscapeSequence(chr(code_point), consumed)
        break

    raise GraphQLSyntaxError(
        self.source,
        position,
        f"Invalid Unicode escape sequence: '{body[position : position + consumed]}'.",
    )
338
+
339
+ def read_escaped_unicode_fixed_width (self , position : int ) -> EscapeSequence :
298
340
body = self .source .body
299
341
code = read_16_bit_hex_code (body , position + 2 )
300
- if code >= 0 :
342
+
343
+ if 0 <= code <= 0xD7FF or 0xE000 <= code <= 0x10FFFF :
301
344
return EscapeSequence (chr (code ), 6 )
345
+
346
+ # GraphQL allows JSON-style surrogate pair escape sequences, but only when
347
+ # a valid pair is formed.
348
+ if 0xD800 <= code <= 0xDBFF :
349
+ if body [position + 6 : position + 8 ] == "\\ u" :
350
+ trailing_code = read_16_bit_hex_code (body , position + 8 )
351
+ if 0xDC00 <= trailing_code <= 0xDFFF :
352
+ return EscapeSequence (
353
+ chr (decode_surrogate_pair (code , trailing_code )), 12
354
+ )
355
+
302
356
raise GraphQLSyntaxError (
303
357
self .source ,
304
358
position ,
@@ -351,8 +405,10 @@ def read_block_string(self, start: int) -> Token:
351
405
self .line_start = position
352
406
continue
353
407
354
- if is_source_character (char ):
408
+ if is_unicode_scalar_value (char ):
355
409
position += 1
410
+ elif is_supplementary_code_point (body , position ):
411
+ position += 2
356
412
else :
357
413
raise GraphQLSyntaxError (
358
414
self .source ,
@@ -477,9 +533,31 @@ def read_hex_digit(char: str) -> int:
477
533
return - 1
478
534
479
535
480
- def is_source_character (char : str ) -> bool :
481
- """Check whether this is a SourceCharacter"""
482
- return char >= " " or char in "\t \r \n "
536
def is_unicode_scalar_value(char: str) -> bool:
    """Tell whether ``char`` is a Unicode scalar value.

    Scalar values are all Unicode code points other than the surrogate code
    points, i.e. the inclusive ranges 0x0000 to 0xD7FF and 0xE000 to
    0x10FFFF.
    """
    # Range below the surrogate block.
    if "\x00" <= char <= "\ud7ff":
        return True
    # Range above the surrogate block.
    return "\ue000" <= char <= "\U0010ffff"
544
+
545
+
546
def is_supplementary_code_point(body: str, location: int) -> bool:
    """Check whether a surrogate pair starts at ``location`` in ``body``.

    The GraphQL specification defines source text as a sequence of Unicode
    scalar values (which Unicode defines to exclude surrogate code points).
    A Python string can nevertheless hold surrogate code points; a leading
    surrogate (0xD800-0xDBFF) immediately followed by a trailing surrogate
    (0xDC00-0xDFFF) together encode one supplementary code point.

    Returns False, rather than raising ``IndexError``, when ``location`` is
    at or past the end of ``body`` or when a leading surrogate is the last
    character, since no pair can exist there.
    """
    try:
        return (
            "\ud800" <= body[location] <= "\udbff"
            and "\udc00" <= body[location + 1] <= "\udfff"
        )
    except IndexError:
        # Ran off the end of the body: there is no complete pair here.
        return False
557
+
558
+
559
def decode_surrogate_pair(leading: int, trailing: int) -> int:
    """Combine a surrogate pair into the code point it encodes.

    The low ten bits of ``leading`` form the upper half and the low ten bits
    of ``trailing`` the lower half of a 20-bit offset, which is added to
    0x10000. The arguments are masked, not validated as surrogates.
    """
    upper_bits = (leading & 0x03FF) << 10
    lower_bits = trailing & 0x03FF
    return 0x10000 + (upper_bits | lower_bits)
483
561
484
562
485
563
def is_name_start (char : str ) -> bool :
0 commit comments