RustPython · coolreader18 · Mar 26, 2025 · Mar 7, 2025 · Mar 21, 2025 · Mar 22, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
@@ -1066,8 +1066,6 @@ def test_hash(self):
            hash(b)
        self.assertEqual(hash(a), hash(b))

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_capitalize_nonascii(self):
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char

diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py
@@ -574,6 +574,7 @@ def test_pep_409_verbiage(self):
            self.assertTrue(text[1].startswith('  File '))
            self.assertTrue(text[3].startswith('NameError'))

+    @unittest.expectedFailureIf(sys.platform == "linux", "TODO: RUSTPYTHON")
    def test_non_ascii(self):
        # Mac OS X denies the creation of a file with an invalid UTF-8 name.
        # Windows allows creating a name with an arbitrary bytes name, but

diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -1698,8 +1698,6 @@ def test_decode_invalid(self):


 class NameprepTest(unittest.TestCase):
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):

diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py
@@ -373,8 +373,6 @@ def test_byte_content(self):
        check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
        check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_byte_filenames(self):
        # somebody renamed a file from ISO-8859-2 to UTF-8
        fna = b'\xb3odz.txt'    # "łodz.txt"

diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py
@@ -1305,6 +1305,8 @@ def exec_module(*args):
            else:
                importlib.SourceLoader.exec_module = old_exec_module

+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
    @unittest.skipUnless(TESTFN_UNENCODABLE, 'need TESTFN_UNENCODABLE')
    def test_unencodable_filename(self):
        # Issue #11619: The Python parser and the import machinery must not

diff --git a/Lib/test/test_json/test_scanstring.py b/Lib/test/test_json/test_scanstring.py
@@ -143,10 +143,4 @@ def test_overflow(self):


 class TestPyScanstring(TestScanstring, PyTest): pass
-# TODO: RUSTPYTHON
-class TestPyScanstring(TestScanstring, PyTest):
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
-    def test_bad_escapes(self):
-        super().test_bad_escapes()
 class TestCScanstring(TestScanstring, CTest): pass
diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py
@@ -1032,12 +1032,6 @@ class NtCommonTest(test_genericpath.CommonTest, unittest.TestCase):
    pathmodule = ntpath
    attributes = ['relpath']

-    # TODO: RUSTPYTHON
-    if sys.platform == "linux":
-        @unittest.expectedFailure
-        def test_nonascii_abspath(self):
-            super().test_nonascii_abspath()
-
    # TODO: RUSTPYTHON
    if sys.platform == "win32":
        # TODO: RUSTPYTHON, ValueError: illegal environment variable name

diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
@@ -854,8 +854,6 @@ def test_string_boundaries(self):
        # Can match around the whitespace.
        self.assertEqual(len(re.findall(r"\B", " ")), 2)

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_bigcharset(self):
        self.assertEqual(re.match("([\u2222\u2223])",
                                  "\u2222").group(1), "\u2222")
@@ -2233,6 +2231,7 @@ def test_bug_40736(self):
        with self.assertRaisesRegex(TypeError, "got 'type'"):
            re.search("x*", type)

+    @unittest.skip("TODO: RUSTPYTHON: flaky, improve perf")
    @requires_resource('cpu')
    def test_search_anchor_at_beginning(self):
        s = 'x'*10**7

diff --git a/Lib/test/test_smtplib.py b/Lib/test/test_smtplib.py
@@ -1459,8 +1459,6 @@ def test_send_unicode_with_SMTPUTF8_via_low_level_API(self):
        self.assertIn('SMTPUTF8', self.serv.last_mail_options)
        self.assertEqual(self.serv.last_rcpt_options, [])

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_send_message_uses_smtputf8_if_addrs_non_ascii(self):
        msg = EmailMessage()
        msg['From'] = "Páolo <főo@bar.com>"

diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py
@@ -1578,7 +1578,7 @@ def test_getnameinfo(self):
        # only IP addresses are allowed
        self.assertRaises(OSError, socket.getnameinfo, ('mail.python.org',0), 0)

-    @unittest.expectedFailureIf(sys.platform != "darwin", "TODO: RUSTPYTHON; socket.gethostbyname_ex")
+    @unittest.skip("TODO: RUSTPYTHON: flaky on CI?")
    @unittest.skipUnless(support.is_resource_enabled('network'),
                         'network is not enabled')
    def test_idna(self):
@@ -5519,8 +5519,6 @@ def testBytesAddr(self):
        self.addCleanup(os_helper.unlink, path)
        self.assertEqual(self.sock.getsockname(), path)

-    # TODO: RUSTPYTHON, surrogateescape
-    @unittest.expectedFailure
    def testSurrogateescapeBind(self):
        # Test binding to a valid non-ASCII pathname, with the
        # non-ASCII bytes supplied using surrogateescape encoding.

diff --git a/Lib/test/test_sqlite3/test_types.py b/Lib/test/test_sqlite3/test_types.py
@@ -95,8 +95,6 @@ def test_too_large_int(self):
        row = self.cur.fetchone()
        self.assertIsNone(row)

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_string_with_surrogates(self):
        for value in 0xd8ff, 0xdcff:
            with self.assertRaises(UnicodeEncodeError):

diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
@@ -102,8 +102,6 @@ def test_cjk_unified_ideographs(self):
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
        self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_bmp_characters(self):
        for code in range(0x10000):
            char = chr(code)

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
@@ -721,8 +721,6 @@ def test_isspace(self):
                   '\U0001F40D', '\U0001F46F']:
            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    @support.requires_resource('cpu')
    def test_isspace_invariant(self):
        for codepoint in range(sys.maxunicode + 1):

diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -99,8 +99,6 @@ def test_function_checksum(self):
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    @requires_resource('cpu')
    def test_name_inverse_lookup(self):
        for i in range(sys.maxunicode + 1):
@@ -326,8 +324,6 @@ def test_ucd_510(self):
        self.assertTrue("\u1d79".upper()=='\ua77d')
        self.assertTrue(".".upper()=='.')

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_bug_5828(self):
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
@@ -347,8 +343,6 @@ def test_bug_4971(self):
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
    def test_linebreak_7643(self):
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()

diff --git a/common/Cargo.toml b/common/Cargo.toml
@@ -16,12 +16,14 @@ rustpython-literal = { workspace = true }

 ascii = { workspace = true }
 bitflags = { workspace = true }
+bstr = { workspace = true }
 cfg-if = { workspace = true }
 itertools = { workspace = true }
 libc = { workspace = true }
 malachite-bigint = { workspace = true }
 malachite-q = { workspace = true }
 malachite-base = { workspace = true }
+memchr = { workspace = true }
 num-complex = { workspace = true }
 num-traits = { workspace = true }
 once_cell = { workspace = true }

diff --git a/common/src/cformat.rs b/common/src/cformat.rs
@@ -11,11 +11,13 @@ use std::{
    str::FromStr,
 };

+use crate::wtf8::{CodePoint, Wtf8, Wtf8Buf};
+
 #[derive(Debug, PartialEq)]
 pub enum CFormatErrorType {
    UnmatchedKeyParentheses,
    MissingModuloSign,
-    UnsupportedFormatChar(char),
+    UnsupportedFormatChar(CodePoint),
    IncompleteFormat,
    IntTooBig,
    // Unimplemented,
@@ -39,7 +41,9 @@ impl fmt::Display for CFormatError {
            UnsupportedFormatChar(c) => write!(
                f,
                "unsupported format character '{}' ({:#x}) at index {}",
-                c, c as u32, self.index
+                c,
+                c.to_u32(),
+                self.index
            ),
            IntTooBig => write!(f, "width/precision too big"),
            _ => write!(f, "unexpected error parsing format string"),
@@ -160,7 +164,7 @@ pub trait FormatBuf:
    fn concat(self, other: Self) -> Self;
 }

-pub trait FormatChar: Copy + Into<char> + From<u8> {
+pub trait FormatChar: Copy + Into<CodePoint> + From<u8> {
    fn to_char_lossy(self) -> char;
    fn eq_char(self, c: char) -> bool;
 }
@@ -188,6 +192,29 @@ impl FormatChar for char {
    }
 }

+impl FormatBuf for Wtf8Buf { // WTF-8 buffers as a `FormatBuf`
+    type Char = CodePoint; // item type produced by `chars()`
+    fn chars(&self) -> impl Iterator<Item = Self::Char> {
+        self.code_points() // yields `CodePoint`s, not `char`s
+    }
+    fn len(&self) -> usize {
+        (**self).len() // explicit double deref; presumably reaches the underlying `Wtf8` length instead of re-entering this method -- TODO confirm
+    }
+    fn concat(mut self, other: Self) -> Self {
+        self.extend([other]); // extend `self` with `other`, passed as a one-element array
+        self // return the buffer that was extended
+    }
+}
+
+impl FormatChar for CodePoint { // `CodePoint` as a `FormatChar`
+    fn to_char_lossy(self) -> char {
+        self.to_char_lossy() // NOTE(review): must resolve to an inherent `CodePoint::to_char_lossy`; if none exists this call recurses -- confirm
+    }
+    fn eq_char(self, c: char) -> bool {
+        self == c // relies on `CodePoint` being comparable with `char` via `==`
+    }
+}
+
 impl FormatBuf for Vec<u8> {
    type Char = u8;
    fn chars(&self) -> impl Iterator<Item = Self::Char> {
@@ -801,6 +828,15 @@ impl FromStr for CFormatString {
    }
 }

+pub type CFormatWtf8 = CFormatStrOrBytes<Wtf8Buf>; // `CFormatStrOrBytes` specialised to WTF-8 buffers
+
+impl CFormatWtf8 {
+    pub fn parse_from_wtf8(s: &Wtf8) -> Result<Self, CFormatError> { // parse `s`, returning a `CFormatError` on failure
+        let mut iter = s.code_points().enumerate().peekable(); // (index, code point) pairs; the index counts code points, not bytes
+        Self::parse(&mut iter) // delegate to `parse`, which is defined outside this hunk
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;