Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit f1253c5

Browse filesBrowse files
committed
🎉 学习第4章Unicode文本和字节序列
1 parent ce910b4 commit f1253c5
Copy full SHA for f1253c5

File tree

Expand file treeCollapse file tree

6 files changed

+1067
-0
lines changed
Filter options
Expand file treeCollapse file tree

6 files changed

+1067
-0
lines changed

‎README.md

Copy file name to clipboard
+3Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 《流畅的Python》阅读笔记
2+
3+
原书的项目地址:https://github.com/fluentpython/example-code-2e

‎codes/ch04/default_encoding.py

Copy file name to clipboard
+33Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: default_encoding.py
6+
@time: 2023/9/18 13:09
7+
@project: fluent-python
8+
@desc: P100 探索默认编码
9+
"""
10+
11+
import locale
import sys

# One probe expression per line; `split()` below iterates over them, so the
# exact whitespace inside this literal does not matter.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# P100: explore the platform's default encodings.
# FIX: the original opened 'dummy' and never closed it (resource leak).
# A `with` block closes the handle; the eval loop must stay inside it
# because `my_file.encoding` reads attributes of the still-open file.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() is safe here: the inputs are the literal constants above,
        # never external data.
        value = eval(expression)
        print(f'{expression:>30} -> {value!r}')
33+

‎codes/ch04/normeq.py

Copy file name to clipboard
+27Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: normeq.py
6+
@time: 2023/9/18 14:11
7+
@project: fluent-python
8+
@desc: P108 规范化Unicode字符串,准确比较
9+
"""
10+
from unicodedata import normalize
11+
12+
13+
def nfc_equal(str1, str2):
    """Return True if the two strings are equal after NFC normalization.

    P108: canonical composition makes 'café' and 'cafe\\u0301' compare equal.
    """
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right
15+
16+
17+
def fold_equal(str1, str2):
    """Case-insensitive comparison of NFC-normalized strings (P108)."""
    def canonical(s):
        # Normalize first, then casefold for aggressive case-insensitive match.
        return normalize('NFC', s).casefold()

    return canonical(str1) == canonical(str2)
20+
21+
22+
if __name__ == '__main__':
    # Composed 'café' vs. 'cafe' + U+0301 combining acute: different code
    # points, same rendered text.
    composed = 'café'
    decomposed = 'cafe\u0301'
    print(composed == decomposed)          # False: raw code points differ
    print(nfc_equal(composed, decomposed)) # True after NFC normalization
    print(nfc_equal('A', 'a'))             # False: NFC does not fold case

‎codes/ch04/ramanujan.py

Copy file name to clipboard
+35Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: ramanujan.py
6+
@time: 2023/9/18 15:56
7+
@project: fluent-python
8+
@desc: P117 比较简单的str和bytes正则表达式的行为
9+
"""
10+
import re
11+
12+
# str类型
13+
re_numbers_str = re.compile(r'\d+')
14+
re_words_str = re.compile(r'\w+')
15+
# bytes类型
16+
re_numbers_bytes = re.compile(rb'\d+')
17+
re_words_bytes = re.compile(rb'\w+')
18+
19+
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
20+
" as 1729 = 1³ + 12³ = 9³ + 10³.")
21+
22+
# bytes正则表达式只能搜索bytes字符串
23+
text_bytes = text_str.encode('utf_8')
24+
25+
print(f'Text\n {text_str!r}')
26+
print('Numbers')
27+
# str模式r'\d+'只能匹配泰米尔数值和ASCII数字
28+
print(' str :', re_numbers_str.findall(text_str))
29+
# bytes模式rb'\d+'只能匹配ASCII字节中的数字
30+
print(' bytes:', re_numbers_bytes.findall(text_bytes))
31+
print('Words')
32+
# str模式r'\w+'能匹配字母、上标、泰米尔数字和ASCII数字
33+
print(' str :', re_words_str.findall(text_str))
34+
# bytes模式rb'\w+'只能匹配ASCII字节中的字母和数字
35+
print(' bytes:', re_words_bytes.findall(text_bytes))

‎codes/ch04/simplify.py

Copy file name to clipboard
+81Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: simplify.py
6+
@time: 2023/9/18 14:14
7+
@project: fluent-python
8+
@desc: P109 去掉全部组合记号的函数
9+
"""
10+
import string
11+
12+
import unicodedata
13+
14+
15+
def shave_marks(txt):
    """Remove ALL diacritic marks from *txt* (P109).

    Decomposes to NFD, drops every combining code point (regardless of the
    base character's script), then recomposes with NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return unicodedata.normalize('NFC', ''.join(base_chars))
24+
25+
26+
def shave_marks_latin(txt):
    """Remove diacritic marks only from Latin base characters (P109).

    Unlike shave_marks(), marks on non-Latin bases (e.g. Greek) survive.
    """
    kept = []
    after_latin = False  # was the most recent base character ASCII Latin?
    for ch in unicodedata.normalize('NFD', txt):
        if unicodedata.combining(ch):
            if after_latin:
                continue  # drop a mark sitting on a Latin base
            kept.append(ch)  # keep marks attached to non-Latin bases
        else:
            # A non-combining character starts a new base.
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))
40+
41+
42+
# cp1252 symbols that map to a single ASCII character each. <1>
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",
                           """'f"^<''""---~>""")

# cp1252 symbols whose replacement is longer than one character. <2>
multi_map = str.maketrans({
    '†': '**',
    '‡': '***',
    '…': '...',
    '€': 'EUR',
    '‰': '<per mille>',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
})

# Merge the one-to-one table into the multi-char table. <3>
multi_map.update(single_map)


def dewinize(txt):
    """Replace Windows-1252 symbols with ASCII characters or sequences."""
    return txt.translate(multi_map)
64+
65+
66+
def asciize(txt):
    """Best-effort ASCII transliteration.

    Pipeline: replace cp1252 symbols, strip diacritics from Latin bases,
    expand 'ß' to 'ss', then apply NFKC compatibility composition.
    """
    text = dewinize(txt)
    text = shave_marks_latin(text)
    text = text.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', text)
72+
73+
74+
if __name__ == '__main__':
    # Mixed cp1252 punctuation, diacritics, and ligatures.
    sample_order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
    sample_greek = 'Ζέφυρος, Zéfiro'
    print(shave_marks(sample_order))
    print(shave_marks(sample_greek))

    print(dewinize(sample_order))
    print(asciize(sample_order))

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.