Allow surrogates in str #5587

Merged: 10 commits, Mar 26, 2025
Changes from 1 commit
Update encoding to use wtf8
coolreader18 committed Mar 26, 2025
commit ba1b5811ee5f151d435d042836da99dcb816144e
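
Background for this commit: RustPython's `str` storage moves from strict UTF-8 to WTF-8 so it can hold the unpaired surrogates that Python-level strings allow. A minimal, std-only sketch of the distinction (illustrative only, not code from this repository): a lone surrogate such as U+D800 has a well-defined three-byte WTF-8 form, but strict UTF-8 validation rejects those same bytes.

```rust
// Std-only illustration; not part of the RustPython codebase.
fn encode_wtf8_code_point(cp: u32) -> Vec<u8> {
    // Same byte layout as UTF-8, but surrogates (U+D800..=U+DFFF) are allowed.
    assert!(cp <= 0x10FFFF);
    match cp {
        0..=0x7F => vec![cp as u8],
        0x80..=0x7FF => vec![0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8],
        0x800..=0xFFFF => vec![
            0xE0 | (cp >> 12) as u8,
            0x80 | ((cp >> 6) & 0x3F) as u8,
            0x80 | (cp & 0x3F) as u8,
        ],
        _ => vec![
            0xF0 | (cp >> 18) as u8,
            0x80 | ((cp >> 12) & 0x3F) as u8,
            0x80 | ((cp >> 6) & 0x3F) as u8,
            0x80 | (cp & 0x3F) as u8,
        ],
    }
}

fn main() {
    let lone_surrogate = 0xD800; // valid in WTF-8, forbidden in UTF-8
    let bytes = encode_wtf8_code_point(lone_surrogate);
    assert_eq!(bytes, [0xEDu8, 0xA0, 0x80]);
    // Strict UTF-8 validation rejects the same byte sequence.
    assert!(std::str::from_utf8(&bytes).is_err());
}
```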
2 changes: 2 additions & 0 deletions in Lib/test/test_cmd_line_script.py
@@ -574,6 +574,8 @@ def test_pep_409_verbiage(self):
self.assertTrue(text[1].startswith(' File '))
self.assertTrue(text[3].startswith('NameError'))

# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_non_ascii(self):
# Mac OS X denies the creation of a file with an invalid UTF-8 name.
# Windows allows creating a name with an arbitrary bytes name, but
136 changes: 74 additions & 62 deletions in common/src/encodings.rs
@@ -1,14 +1,22 @@
use std::ops::Range;

use num_traits::ToPrimitive;

use crate::str::StrKind;
use crate::wtf8::{Wtf8, Wtf8Buf};

pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;

pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;

pub trait StrBuffer: AsRef<Wtf8> {
fn is_ascii(&self) -> bool {
self.as_ref().is_ascii()
fn is_compatible_with(&self, kind: StrKind) -> bool {
let s = self.as_ref();
match kind {
StrKind::Ascii => s.is_ascii(),
StrKind::Utf8 => s.is_utf8(),
StrKind::Wtf8 => true,
}
}
}

@@ -18,7 +26,7 @@ pub trait ErrorHandler {
type BytesBuf: AsRef<[u8]>;
fn handle_encode_error(
&self,
data: &str,
data: &Wtf8,
char_range: Range<usize>,
reason: &str,
) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
@@ -29,7 +37,7 @@
reason: &str,
) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
fn error_oob_restart(&self, i: usize) -> Self::Error;
fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
fn error_encoding(&self, data: &Wtf8, char_range: Range<usize>, reason: &str) -> Self::Error;
}
pub enum EncodeReplace<S, B> {
Str(S),
@@ -118,14 +126,61 @@ where
Ok((out, remaining_index))
}

#[inline]
fn encode_utf8_compatible<E: ErrorHandler>(
s: &Wtf8,
errors: &E,
err_reason: &str,
target_kind: StrKind,
) -> Result<Vec<u8>, E::Error> {
let full_data = s;
let mut data = s;
let mut char_data_index = 0;
let mut out = Vec::<u8>::new();
while let Some((char_i, (byte_i, _))) = data
.code_point_indices()
.enumerate()
.find(|(_, (_, c))| !target_kind.can_encode(*c))
{
out.extend_from_slice(&data.as_bytes()[..byte_i]);
let char_start = char_data_index + char_i;

// number of non-compatible chars between the first non-compatible char and the next compatible char
let non_compat_run_length = data[byte_i..]
.code_points()
.take_while(|c| !target_kind.can_encode(*c))
.count();
let char_range = char_start..char_start + non_compat_run_length;
let (replace, char_restart) =
errors.handle_encode_error(full_data, char_range.clone(), err_reason)?;
match replace {
EncodeReplace::Str(s) => {
if s.is_compatible_with(target_kind) {
out.extend_from_slice(s.as_ref().as_bytes());
} else {
return Err(errors.error_encoding(full_data, char_range, err_reason));
}
}
EncodeReplace::Bytes(b) => {
out.extend_from_slice(b.as_ref());
}
}
data = crate::str::try_get_codepoints(full_data, char_restart..)
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
char_data_index = char_restart;
}
out.extend_from_slice(data.as_bytes());
Ok(out)
}

pub mod utf8 {
use super::*;

pub const ENCODING_NAME: &str = "utf-8";

#[inline]
pub fn encode<E: ErrorHandler>(s: &str, _errors: &E) -> Result<Vec<u8>, E::Error> {
Ok(s.as_bytes().to_vec())
pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
encode_utf8_compatible(s, errors, "surrogates not allowed", StrKind::Utf8)
}

pub fn decode<E: ErrorHandler>(
@@ -175,21 +230,22 @@ pub mod utf8 {
}

pub mod latin_1 {

use super::*;

pub const ENCODING_NAME: &str = "latin-1";

const ERR_REASON: &str = "ordinal not in range(256)";

#[inline]
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
let full_data = s;
let mut data = s;
let mut char_data_index = 0;
let mut out = Vec::<u8>::new();
loop {
match data
.char_indices()
.code_point_indices()
.enumerate()
.find(|(_, (_, c))| !c.is_ascii())
{
@@ -200,17 +256,16 @@ pub mod latin_1 {
Some((char_i, (byte_i, ch))) => {
out.extend_from_slice(&data.as_bytes()[..byte_i]);
let char_start = char_data_index + char_i;
if (ch as u32) <= 255 {
out.push(ch as u8);
let char_restart = char_start + 1;
data = crate::str::try_get_chars(full_data, char_restart..)
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
char_data_index = char_restart;
if let Some(byte) = ch.to_u32().to_u8() {
out.push(byte);
// if the codepoint is in 128..=255, its UTF-8 length is 2
data = &data[byte_i + 2..];
char_data_index = char_start + 1;
} else {
// number of non-latin_1 chars between the first non-latin_1 char and the next latin_1 char
let non_latin_1_run_length = data[byte_i..]
.chars()
.take_while(|c| (*c as u32) > 255)
.code_points()
.take_while(|c| c.to_u32() > 255)
.count();
let char_range = char_start..char_start + non_latin_1_run_length;
let (replace, char_restart) = errors.handle_encode_error(
@@ -231,7 +286,7 @@ pub mod latin_1 {
out.extend_from_slice(b.as_ref());
}
}
data = crate::str::try_get_chars(full_data, char_restart..)
data = crate::str::try_get_codepoints(full_data, char_restart..)
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
char_data_index = char_restart;
}
@@ -258,51 +313,8 @@ pub mod ascii {
const ERR_REASON: &str = "ordinal not in range(128)";

#[inline]
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
let full_data = s;
let mut data = s;
let mut char_data_index = 0;
let mut out = Vec::<u8>::new();
loop {
match data
.char_indices()
.enumerate()
.find(|(_, (_, c))| !c.is_ascii())
{
None => {
out.extend_from_slice(data.as_bytes());
break;
}
Some((char_i, (byte_i, _))) => {
out.extend_from_slice(&data.as_bytes()[..byte_i]);
let char_start = char_data_index + char_i;
// number of non-ascii chars between the first non-ascii char and the next ascii char
let non_ascii_run_length =
data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
let char_range = char_start..char_start + non_ascii_run_length;
let (replace, char_restart) =
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
match replace {
EncodeReplace::Str(s) => {
if !s.is_ascii() {
return Err(
errors.error_encoding(full_data, char_range, ERR_REASON)
);
}
out.extend_from_slice(s.as_ref().as_bytes());
}
EncodeReplace::Bytes(b) => {
out.extend_from_slice(b.as_ref());
}
}
data = crate::str::try_get_chars(full_data, char_restart..)
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
char_data_index = char_restart;
continue;
}
}
}
Ok(out)
pub fn encode<E: ErrorHandler>(s: &Wtf8, errors: &E) -> Result<Vec<u8>, E::Error> {
encode_utf8_compatible(s, errors, ERR_REASON, StrKind::Ascii)
}

pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(Wtf8Buf, usize), E::Error> {
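
The ascii and utf-8 encoders above now delegate to the shared `encode_utf8_compatible` helper: find the first code point the target `StrKind` cannot encode, measure the run of such code points, hand that range to the error handler, and resume from the returned index. A simplified, std-only sketch of that run-based scan (using `char` and a fixed replacement instead of `CodePoint` and an `ErrorHandler`):

```rust
// Simplified illustration of the run-based scan; the real helper works on
// WTF-8 code points and delegates replacements to an ErrorHandler.
fn encode_with_replacement(
    s: &str,
    can_encode: impl Fn(char) -> bool,
    replacement: &[u8],
) -> Vec<u8> {
    let mut data = s;
    let mut out = Vec::new();
    while let Some((byte_i, _)) = data.char_indices().find(|(_, c)| !can_encode(*c)) {
        // Copy the encodable prefix verbatim.
        out.extend_from_slice(&data.as_bytes()[..byte_i]);
        // Length in bytes of the run of non-encodable chars.
        let run_bytes: usize = data[byte_i..]
            .chars()
            .take_while(|c| !can_encode(*c))
            .map(|c| c.len_utf8())
            .sum();
        // One replacement per offending run (the real code may also error out).
        out.extend_from_slice(replacement);
        data = &data[byte_i + run_bytes..];
    }
    out.extend_from_slice(data.as_bytes());
    out
}

fn main() {
    let encoded = encode_with_replacement("héllo wörld", |c| c.is_ascii(), b"?");
    assert_eq!(encoded, b"h?llo w?rld");
}
```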
11 changes: 10 additions & 1 deletion in common/src/str.rs
@@ -14,7 +14,7 @@ pub type wchar_t = libc::wchar_t;
pub type wchar_t = u32;

/// Utf8 + state.ascii (+ PyUnicode_Kind in future)
#[derive(Debug, Copy, Clone, PartialEq)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum StrKind {
Ascii,
Utf8,
@@ -41,6 +41,15 @@ impl StrKind {
pub fn is_utf8(&self) -> bool {
matches!(self, Self::Ascii | Self::Utf8)
}

#[inline(always)]
pub fn can_encode(&self, code: CodePoint) -> bool {
match self {
StrKind::Ascii => code.is_ascii(),
StrKind::Utf8 => code.to_char().is_some(),
StrKind::Wtf8 => true,
}
}
}

pub trait DeduceStrKind {
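
The `Utf8` arm of `can_encode` works because Rust's `char` type excludes surrogate code points, so `code.to_char().is_some()` is exactly the test for a valid Unicode scalar value. A quick std-only check of that property:

```rust
fn main() {
    // Surrogates (U+D800..=U+DFFF) are not Rust chars, so a Utf8 StrKind
    // rejects them, while Wtf8 accepts any code point up to U+10FFFF.
    assert!(char::from_u32(0x0041).is_some()); // 'A'
    assert!(char::from_u32(0xD800).is_none()); // lone surrogate
    assert!(char::from_u32(0x10FFFF).is_some()); // highest code point
}
```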
71 changes: 70 additions & 1 deletion in common/src/wtf8/mod.rs
@@ -49,7 +49,7 @@ use std::collections::TryReserveError;
use std::string::String;
use std::vec::Vec;

use bstr::ByteSlice;
use bstr::{ByteSlice, ByteVec};

mod core_char;
mod core_str;
@@ -168,6 +168,10 @@ impl CodePoint {
pub fn len_wtf8(&self) -> usize {
len_utf8(self.value)
}

pub fn is_ascii(&self) -> bool {
self.is_char_and(|c| c.is_ascii())
}
}

impl From<u16> for CodePoint {
@@ -436,6 +440,13 @@ impl Wtf8Buf {
self.push_wtf8(code_point.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
}

pub fn pop(&mut self) -> Option<CodePoint> {
let ch = self.code_points().next_back()?;
let newlen = self.len() - ch.len_wtf8();
self.bytes.truncate(newlen);
Some(ch)
}

/// Shortens a string to the specified length.
///
/// # Panics
@@ -448,6 +459,20 @@
self.bytes.truncate(new_len)
}

/// Inserts a codepoint into this `Wtf8Buf` at a byte position.
#[inline]
pub fn insert(&mut self, idx: usize, c: CodePoint) {
self.insert_wtf8(idx, c.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
}

/// Inserts a WTF-8 slice into this `Wtf8Buf` at a byte position.
#[inline]
pub fn insert_wtf8(&mut self, idx: usize, w: &Wtf8) {
assert!(is_code_point_boundary(self, idx));

self.bytes.insert_str(idx, w)
}

/// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
#[inline]
pub fn into_bytes(self) -> Vec<u8> {
@@ -914,6 +939,21 @@ impl Wtf8 {
.map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
}

pub fn trim(&self) -> &Self {
let w = self.bytes.trim();
unsafe { Wtf8::from_bytes_unchecked(w) }
}

pub fn trim_start(&self) -> &Self {
let w = self.bytes.trim_start();
unsafe { Wtf8::from_bytes_unchecked(w) }
}

pub fn trim_end(&self) -> &Self {
let w = self.bytes.trim_end();
unsafe { Wtf8::from_bytes_unchecked(w) }
}

pub fn trim_start_matches(&self, f: impl Fn(CodePoint) -> bool) -> &Self {
let mut iter = self.code_points();
loop {
@@ -958,6 +998,15 @@ impl Wtf8 {
memchr::memmem::rfind(self.as_bytes(), pat.as_bytes())
}

pub fn contains(&self, pat: &Wtf8) -> bool {
self.bytes.contains_str(pat)
}

pub fn contains_code_point(&self, pat: CodePoint) -> bool {
self.bytes
.contains_str(pat.encode_wtf8(&mut [0; MAX_LEN_UTF8]))
}

pub fn get(&self, range: impl ops::RangeBounds<usize>) -> Option<&Self> {
let start = match range.start_bound() {
ops::Bound::Included(&i) => i,
@@ -977,6 +1026,26 @@
None
}
}

pub fn ends_with(&self, w: &Wtf8) -> bool {
self.bytes.ends_with_str(w)
}

pub fn starts_with(&self, w: &Wtf8) -> bool {
self.bytes.starts_with_str(w)
}

pub fn strip_prefix(&self, w: &Wtf8) -> Option<&Self> {
self.bytes
.strip_prefix(w.as_bytes())
.map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
}

pub fn strip_suffix(&self, w: &Wtf8) -> Option<&Self> {
self.bytes
.strip_suffix(w.as_bytes())
.map(|w| unsafe { Wtf8::from_bytes_unchecked(w) })
}
}

impl AsRef<Wtf8> for str {
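
A hedged usage sketch of the `Wtf8Buf` and `Wtf8` helpers added above. The crate path and `Wtf8Buf::new()` are assumptions for illustration; `insert`, `pop`, `trim`, `starts_with`, and `strip_prefix` are the methods introduced in this commit.

```rust
// Sketch only: assumes the crate is used as `rustpython_common` and that
// Wtf8Buf::new() exists; the methods exercised are the ones added here.
use rustpython_common::wtf8::{CodePoint, Wtf8, Wtf8Buf};

fn main() {
    // A buffer can hold an unpaired surrogate, which a std String cannot.
    let mut buf = Wtf8Buf::new(); // assumed constructor
    buf.insert(0, CodePoint::from(0xD800u16)); // insert() added in this commit
    assert_eq!(buf.pop().map(|c| c.to_u32()), Some(0xD800)); // pop() added too

    // &Wtf8 gains str-like helpers; AsRef<Wtf8> for str gives cheap conversion.
    let s: &Wtf8 = "  hello  ".as_ref();
    let trimmed = s.trim();
    assert!(trimmed.starts_with("he".as_ref()));
    assert!(trimmed.strip_prefix("he".as_ref()).is_some());
}
```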