1
- """ Test script for the unicodedata module.
1
+ """ Tests for the unicodedata module.
2
2
3
3
Written by Marc-Andre Lemburg (mal@lemburg.com).
4
4
5
5
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
6
7
7
"""
8
8
9
+ import hashlib
10
+ from http .client import HTTPException
9
11
import sys
12
+ import unicodedata
10
13
import unittest
11
- import hashlib
12
- from test .support import script_helper
13
-
14
- encoding = 'utf-8'
15
- errors = 'surrogatepass'
14
+ from test .support import (open_urlresource , requires_resource , script_helper ,
15
+ cpython_only , check_disallow_instantiation ,
16
+ ResourceDenied )
16
17
17
18
18
- ### Run tests
19
-
20
19
class UnicodeMethodsTest (unittest .TestCase ):
21
20
22
21
# update this, if the database changes
23
- expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1 '
22
+ expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326 '
24
23
25
24
# TODO: RUSTPYTHON
26
25
@unittest .expectedFailure
26
+ @requires_resource ('cpu' )
27
27
def test_method_checksum (self ):
28
28
h = hashlib .sha1 ()
29
- for i in range (0x10000 ):
29
+ for i in range (sys . maxunicode + 1 ):
30
30
char = chr (i )
31
31
data = [
32
32
# Predicates (single char)
@@ -63,33 +63,26 @@ def test_method_checksum(self):
63
63
(char + 'ABC' ).title (),
64
64
65
65
]
66
- h .update ('' .join (data ).encode (encoding , errors ))
66
+ h .update ('' .join (data ).encode ('utf-8' , 'surrogatepass' ))
67
67
result = h .hexdigest ()
68
68
self .assertEqual (result , self .expectedchecksum )
69
69
70
70
class UnicodeDatabaseTest(unittest.TestCase):
    """Common base class for the test cases that query the Unicode database.

    Subclasses reach the database through ``self.db``.
    """

    # The module is bound once as a class attribute instead of being
    # imported in setUp() and deleted again in tearDown() for every test.
    db = unicodedata
80
72
81
73
class UnicodeFunctionsTest (UnicodeDatabaseTest ):
82
74
83
75
# Update this if the database changes. Make sure to do a full rebuild
84
76
# (e.g. 'make distclean && make') to get the correct checksum.
85
- expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652 '
77
+ expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370 '
86
78
# TODO: RUSTPYTHON
87
79
@unittest .expectedFailure
80
+ @requires_resource ('cpu' )
88
81
def test_function_checksum (self ):
89
82
data = []
90
83
h = hashlib .sha1 ()
91
84
92
- for i in range (0x10000 ):
85
+ for i in range (sys . maxunicode + 1 ):
93
86
char = chr (i )
94
87
data = [
95
88
# Properties
@@ -106,6 +99,15 @@ def test_function_checksum(self):
106
99
result = h .hexdigest ()
107
100
self .assertEqual (result , self .expectedchecksum )
108
101
102
# TODO: RUSTPYTHON
@unittest.expectedFailure
@requires_resource('cpu')
def test_name_inverse_lookup(self):
    """Check that ``lookup()`` maps every character name back to its character.

    Walks every code point up to ``sys.maxunicode``; code points for which
    ``name()`` returns the ``None`` default are skipped.
    """
    for codepoint in range(sys.maxunicode + 1):
        char = chr(codepoint)
        looked_name = self.db.name(char, None)
        if looked_name:
            self.assertEqual(self.db.lookup(looked_name), char)
110
+
109
111
# TODO: RUSTPYTHON
110
112
@unittest .expectedFailure
111
113
def test_digit (self ):
@@ -201,15 +203,8 @@ def test_combining(self):
201
203
self .assertRaises (TypeError , self .db .combining )
202
204
self .assertRaises (TypeError , self .db .combining , 'xx' )
203
205
204
- def test_normalize (self ):
205
- self .assertRaises (TypeError , self .db .normalize )
206
- self .assertRaises (ValueError , self .db .normalize , 'unknown' , 'xx' )
207
- self .assertEqual (self .db .normalize ('NFKC' , '' ), '' )
208
- # The rest can be found in test_normalization.py
209
- # which requires an external file.
210
-
211
206
def test_pr29 (self ):
212
- # http ://www.unicode.org/review/pr-29.html
207
+ # https ://www.unicode.org/review/pr-29.html
213
208
# See issues #1054943 and #10254.
214
209
composed = ("\u0b47 \u0300 \u0b3e " , "\u1100 \u0300 \u1161 " ,
215
210
'Li\u030d t-s\u1e73 \u0301 ' ,
@@ -240,9 +235,6 @@ def test_issue29456(self):
240
235
self .assertEqual (self .db .normalize ('NFC' , u11a7_str_a ), u11a7_str_b )
241
236
self .assertEqual (self .db .normalize ('NFC' , u11c3_str_a ), u11c3_str_b )
242
237
243
- # For tests of unicodedata.is_normalized / self.db.is_normalized ,
244
- # see test_normalization.py .
245
-
246
238
def test_east_asian_width (self ):
247
239
eaw = self .db .east_asian_width
248
240
self .assertRaises (TypeError , eaw , b'a' )
@@ -265,6 +257,11 @@ def test_east_asian_width_9_0_changes(self):
265
257
266
258
class UnicodeMiscTest (UnicodeDatabaseTest ):
267
259
260
@cpython_only
def test_disallow_instantiation(self):
    """Ensure that the ``unicodedata.UCD`` type disallows instantiation.

    See bpo-43916.
    """
    check_disallow_instantiation(self, unicodedata.UCD)
264
+
268
265
# TODO: RUSTPYTHON
269
266
@unittest .expectedFailure
270
267
def test_failed_import_during_compiling (self ):
@@ -363,5 +360,103 @@ def test_linebreak_7643(self):
363
360
self .assertEqual (len (lines ), 1 ,
364
361
r"\u%.4x should not be a linebreak" % i )
365
362
363
class NormalizationTest(unittest.TestCase):
    """Tests for ``unicodedata.normalize()`` and ``unicodedata.is_normalized()``.

    The data-driven part reads ``NormalizationTest.txt``, which is downloaded
    on demand for the Unicode version reported by
    ``unicodedata.unidata_version``.
    """

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* names our Unicode version.

        Used as the ``check`` callback of ``open_urlresource()``.
        """
        header = testfile.readline()
        return unicodedata.unidata_version in header

    @staticmethod
    def unistr(data):
        """Turn space-separated hexadecimal code points into a string."""
        return "".join(chr(int(code, 16)) for code in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        """Fetch the data file and run the data-driven checks on it.

        The test is skipped if the file cannot be downloaded.
        """
        TESTDATAFILE = "NormalizationTest.txt"
        # The URL must not contain any whitespace, otherwise the download
        # can never succeed and the test is always skipped.
        TESTDATAURL = (f"http://www.pythontest.net/unicode/"
                       f"{unicodedata.unidata_version}/{TESTDATAFILE}")

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL} : {exc} ")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every record in *testdata* against all four normal forms.

        *testdata* is iterated line by line.  Text after ``#`` is ignored,
        lines starting with ``@Part`` select the current part, and every
        other non-empty line holds five ``;``-terminated columns of
        space-separated hexadecimal code points (c1..c5).

        After the records are processed, every code point that did not
        appear as c1 in part 1 must be unchanged by all four forms.
        """
        part = None
        # Source strings (column c1) seen while reading part 1.
        part1_chars = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        for line in testdata:
            # Drop the trailing comment, if any, and surrounding whitespace.
            line = line.split('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # The trailing ';' yields an empty last field, hence [:-1].
            c1, c2, c3, c4, c5 = [self.unistr(x)
                                  for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2))
            self.assertTrue(unicodedata.is_normalized("NFC", c4))

            self.assertTrue(unicodedata.is_normalized("NFD", c3))
            self.assertTrue(unicodedata.is_normalized("NFD", c5))

            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
            self.assertTrue(unicodedata.is_normalized("NFKD", c5))

            # Record part 1 data
            if part == "@Part1":
                part1_chars.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_chars:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        """Check argument validation and the empty string."""
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        """Normalizing Hangul syllables to NFC must not raise."""
        # Check for bug 834676
        # NOTE(review): the literal has a space after each syllable; confirm
        # against upstream whether the spaces are intended.
        unicodedata.normalize('NFC', '\ud55c \uae00 ')
459
+
460
+
366
461
if __name__ == "__main__" :
367
462
unittest .main ()
0 commit comments