Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit c6cab4c

Browse files
committed
Parse surrogates in string literals properly
1 parent 2ab8716 commit c6cab4c
Copy full SHA for c6cab4c

File tree

16 files changed

+506
-80
lines changed
Filter options

16 files changed

+506
-80
lines changed

‎Cargo.lock

Copy file name to clipboardExpand all lines: Cargo.lock
+3Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎Lib/test/test_codeccallbacks.py

Copy file name to clipboardExpand all lines: Lib/test/test_codeccallbacks.py
-6Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -536,8 +536,6 @@ def test_badandgoodxmlcharrefreplaceexceptions(self):
536536
("".join("&#%d;" % c for c in cs), 1 + len(s))
537537
)
538538

539-
# TODO: RUSTPYTHON
540-
@unittest.expectedFailure
541539
def test_badandgoodbackslashreplaceexceptions(self):
542540
# "backslashreplace" complains about a non-exception passed in
543541
self.assertRaises(
@@ -596,8 +594,6 @@ def test_badandgoodbackslashreplaceexceptions(self):
596594
(r, 2)
597595
)
598596

599-
# TODO: RUSTPYTHON
600-
@unittest.expectedFailure
601597
def test_badandgoodnamereplaceexceptions(self):
602598
# "namereplace" complains about a non-exception passed in
603599
self.assertRaises(
@@ -644,8 +640,6 @@ def test_badandgoodnamereplaceexceptions(self):
644640
(r, 1 + len(s))
645641
)
646642

647-
# TODO: RUSTPYTHON
648-
@unittest.expectedFailure
649643
def test_badandgoodsurrogateescapeexceptions(self):
650644
surrogateescape_errors = codecs.lookup_error('surrogateescape')
651645
# "surrogateescape" complains about a non-exception passed in

‎common/src/encodings.rs

Copy file name to clipboardExpand all lines: common/src/encodings.rs
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ pub mod errors {
401401
let mut out = String::with_capacity(num_chars * 4);
402402
for c in err_str.code_points() {
403403
let c_u32 = c.to_u32();
404-
if let Some(c_name) = unicode_names2::name(c.to_char_lossy()) {
404+
if let Some(c_name) = c.to_char().and_then(unicode_names2::name) {
405405
write!(out, "\\N{{{c_name}}}").unwrap();
406406
} else if c_u32 >= 0x10000 {
407407
write!(out, "\\U{c_u32:08x}").unwrap();

‎common/src/wtf8/mod.rs

Copy file name to clipboardExpand all lines: common/src/wtf8/mod.rs
+53Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,12 @@ impl<W: AsRef<Wtf8>> FromIterator<W> for Wtf8Buf {
574574
}
575575
}
576576

577+
/// Hashes an owned WTF-8 buffer by delegating to the `Hash` implementation
/// of the borrowed `Wtf8` slice type.
impl Hash for Wtf8Buf {
578+
fn hash<H: Hasher>(&self, state: &mut H) {
579+
// Forward to `Wtf8::hash` so the owned and borrowed forms share one
// hashing routine (presumably `self` reaches `&Wtf8` via deref — confirm).
Wtf8::hash(self, state)
580+
}
581+
}
582+
577583
impl AsRef<Wtf8> for Wtf8Buf {
578584
fn as_ref(&self) -> &Wtf8 {
579585
self
@@ -692,6 +698,13 @@ impl Default for &Wtf8 {
692698
}
693699
}
694700

701+
/// Hashes a WTF-8 slice by feeding its raw bytes to the hasher, followed by
/// a single `0xff` byte.
// NOTE(review): the trailing `0xff` looks like a terminator byte in the same
// spirit as the standard library's `str` hashing — confirm that matching
// `str` hashes is the intent here.
impl Hash for Wtf8 {
702+
fn hash<H: Hasher>(&self, state: &mut H) {
703+
// The encoded bytes themselves.
state.write(self.as_bytes());
704+
// One extra byte appended after the contents.
state.write_u8(0xff);
705+
}
706+
}
707+
695708
impl Wtf8 {
696709
/// Creates a WTF-8 slice from a UTF-8 `&str` slice.
697710
///
@@ -722,6 +735,32 @@ impl Wtf8 {
722735
unsafe { &mut *(value as *mut [u8] as *mut Wtf8) }
723736
}
724737

738+
/// Create a WTF-8 slice from a WTF-8 byte slice.
///
/// Returns `None` when `b` contains a byte sequence that is neither valid
/// UTF-8 nor a three-byte encoding of a surrogate code point
/// (`U+D800..=U+DFFF`), including a truncated sequence at the end of `b`.
///
/// Validation walks `b` in chunks: each time UTF-8 validation fails, the
/// bytes at the failure position must decode as a surrogate; those three
/// bytes are skipped and validation resumes on the remainder.
739+
//
740+
// whooops! using WTF-8 for interchange!
//
// NOTE(review): a lead surrogate immediately followed by a trail surrogate
// (each as its own three-byte sequence) appears to be accepted by this loop.
// The WTF-8 spec requires such pairs to be encoded as one four-byte
// sequence — confirm whether callers rely on that invariant.
741+
#[inline]
742+
pub fn from_bytes(b: &[u8]) -> Option<&Self> {
743+
let mut rest = b;
744+
// Loop ends as soon as the remaining bytes are entirely valid UTF-8.
while let Err(e) = std::str::from_utf8(rest) {
745+
// Skip the prefix that was already validated as UTF-8.
rest = &rest[e.valid_up_to()..];
746+
// The offending bytes must be an encoded surrogate; otherwise bail out
// with `None`. `decode_surrogate` also fails if fewer than three bytes
// remain, which keeps the slicing below in bounds.
Self::decode_surrogate(rest)?;
747+
// Step over the three-byte surrogate just verified.
rest = &rest[3..];
748+
}
749+
// SAFETY: every part of `b` was checked above to be either valid UTF-8 or
// a three-byte surrogate encoding. (Assumes that is the full contract of
// `from_bytes_unchecked`, which is not visible here — TODO confirm.)
Some(unsafe { Wtf8::from_bytes_unchecked(b) })
750+
}
751+
752+
/// Decodes the first three bytes of `b` as a surrogate code point.
///
/// Returns `Some` only when `b` has at least three bytes, those bytes have
/// the shape of a three-byte UTF-8 sequence (`1110xxxx 10xxxxxx 10xxxxxx`),
/// and the decoded value lies in `0xD800..=0xDFFF`. Any bytes after the
/// first three are ignored.
fn decode_surrogate(b: &[u8]) -> Option<CodePoint> {
753+
// Fewer than three bytes cannot hold a surrogate. Note that `b` is rebound
// here from the input slice to the second byte.
let [a, b, c, ..] = *b else { return None };
754+
// Lead byte must start with `1110`, both continuation bytes with `10`.
if (a & 0xf0) == 0xe0 && (b & 0xc0) == 0x80 && (c & 0xc0) == 0x80 {
755+
// it's a three-byte code
756+
// Assemble the 4 + 6 + 6 payload bits; `c` is rebound to the code point.
let c = ((a as u32 & 0x0f) << 12) + ((b as u32 & 0x3f) << 6) + (c as u32 & 0x3f);
757+
// Reject any three-byte sequence that is not in the surrogate range.
let 0xD800..=0xDFFF = c else { return None };
758+
Some(CodePoint { value: c })
759+
} else {
760+
None
761+
}
762+
}
763+
725764
/// Returns the length, in WTF-8 bytes.
726765
#[inline]
727766
pub fn len(&self) -> usize {
@@ -875,6 +914,14 @@ impl Wtf8 {
875914
}
876915
}
877916

917+
/// Returns the surrogate encoded by the last three bytes of the slice when
/// they match `0xED, 0xA0..=0xAF, _`, and `None` otherwise.
///
/// A lead byte of `0xED` with a second byte in `0xA0..=0xAF` corresponds to
/// code points `U+D800..=U+DBFF`, i.e. the lead (high) surrogate range.
#[inline]
918+
fn final_lead_surrogate(&self) -> Option<u16> {
919+
match self.bytes {
920+
// NOTE(review): this calls a two-argument free function `decode_surrogate`
// that is not visible here (distinct from `Wtf8::decode_surrogate`);
// presumably it rebuilds the 16-bit value from the last two bytes — confirm.
[.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
921+
// Too short, or does not end with a lead-surrogate byte pattern.
_ => None,
922+
}
923+
}
924+
878925
pub fn is_code_point_boundary(&self, index: usize) -> bool {
879926
is_code_point_boundary(self, index)
880927
}
@@ -1481,6 +1528,12 @@ impl From<Wtf8Buf> for Box<Wtf8> {
14811528
}
14821529
}
14831530

1531+
/// Converts a boxed WTF-8 slice into an owned `Wtf8Buf`.
impl From<Box<Wtf8>> for Wtf8Buf {
1532+
fn from(w: Box<Wtf8>) -> Self {
1533+
// Thin wrapper: the conversion itself lives in `Wtf8Buf::from_box`.
Wtf8Buf::from_box(w)
1534+
}
1535+
}
1536+
14841537
impl From<String> for Box<Wtf8> {
14851538
fn from(s: String) -> Self {
14861539
s.into_boxed_str().into()

‎compiler/codegen/Cargo.toml

Copy file name to clipboardExpand all lines: compiler/codegen/Cargo.toml
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ num-complex = { workspace = true }
3030
num-traits = { workspace = true }
3131
thiserror = { workspace = true }
3232
malachite-bigint = { workspace = true }
33+
memchr = { workspace = true }
34+
unicode_names2 = { workspace = true }
3335

3436
[dev-dependencies]
3537
# rustpython-parser = { workspace = true }

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.