Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit ec7ae7b

Browse files
committed
Implement ascii codec in Rust
1 parent 29c90a6 commit ec7ae7b
Copy full SHA for ec7ae7b

File tree

4 files changed

+292
-76
lines changed
Filter options

4 files changed

+292
-76
lines changed

‎Cargo.lock

Copy file name to clipboard | Expand all lines: Cargo.lock
+1 | Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎common/Cargo.toml

Copy file name to clipboard | Expand all lines: common/Cargo.toml
+1 | Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ rand = "0.8"
2222
volatile = "0.3"
2323
radium = "0.6"
2424
libc = "0.2.101"
25+
ascii = "1.0"

‎common/src/encodings.rs

Copy file name to clipboard | Expand all lines: common/src/encodings.rs
+197 -62 | Lines changed: 197 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
44

55
pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
66

/// A string buffer that can be borrowed as `&str` and asked whether its
/// contents are ASCII-only. Used as the bound on `ErrorHandler::StrBuf`.
7+
pub trait StrBuffer: AsRef<str> {
/// Returns `true` when the underlying string consists solely of ASCII
/// characters. The default implementation defers to `str::is_ascii` on
/// `self.as_ref()`.
8+
fn is_ascii(&self) -> bool {
9+
self.as_ref().is_ascii()
10+
}
11+
}
12+
713
pub trait ErrorHandler {
814
type Error;
9-
type StrBuf: AsRef<str>;
15+
type StrBuf: StrBuffer;
1016
type BytesBuf: AsRef<[u8]>;
1117
fn handle_encode_error(
1218
&self,
13-
byte_range: Range<usize>,
19+
data: &str,
20+
char_range: Range<usize>,
1421
reason: &str,
1522
) -> EncodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
1623
fn handle_decode_error(
@@ -20,12 +27,95 @@ pub trait ErrorHandler {
2027
reason: &str,
2128
) -> DecodeErrorResult<Self::StrBuf, Self::BytesBuf, Self::Error>;
2229
fn error_oob_restart(&self, i: usize) -> Self::Error;
30+
fn error_encoding(&self, data: &str, char_range: Range<usize>, reason: &str) -> Self::Error;
2331
}
/// Replacement data handed back by an encode error handler (it is the first
/// element of the `Ok` tuple in `EncodeErrorResult`).
2432
pub enum EncodeReplace<S, B> {
/// Replacement supplied as a string buffer.
2533
Str(S),
/// Replacement supplied as a bytes buffer.
2634
Bytes(B),
2735
}
2836

/// Describes a failed decode attempt over a byte slice, split at the point
/// where decoding stopped. Built by `make_decode_err`.
37+
struct DecodeError<'a> {
/// The leading part of the input that decoded successfully.
38+
valid_prefix: &'a str,
/// The remaining bytes, starting at the first byte that failed to decode.
39+
rest: &'a [u8],
/// Length in bytes of the invalid sequence. `decode_utf8_compatible` treats
/// `None` as "the error extends to the end of the data".
40+
err_len: Option<usize>,
41+
}
/// Splits `v` at `valid_up_to` into an already-valid string prefix and the
/// undecoded remainder, and packages both with `err_len` into a `DecodeError`.
///
42+
/// # Safety
43+
/// `v[..valid_up_to]` must be valid utf8
/// (this also means `valid_up_to <= v.len()`, since both slices below are
/// taken with `get_unchecked` and perform no bounds check).
44+
unsafe fn make_decode_err(v: &[u8], valid_up_to: usize, err_len: Option<usize>) -> DecodeError<'_> {
// Relies on the caller's guarantee: no utf8 validation and no bounds check here.
45+
let valid_prefix = core::str::from_utf8_unchecked(v.get_unchecked(..valid_up_to));
// Everything from the first undecodable byte onwards.
46+
let rest = v.get_unchecked(valid_up_to..);
47+
DecodeError {
48+
valid_prefix,
49+
rest,
50+
err_len,
51+
}
52+
}
53+
/// Decision returned by the per-codec error classifier that is passed to
/// `decode_utf8_compatible`.
54+
enum HandleResult<'a> {
/// Stop decoding without reporting an error; the undecoded remainder is
/// left unconsumed (the decode loop breaks and returns what it has so far).
55+
Done,
/// Report the failure to the `ErrorHandler`.
56+
Error {
/// Length in bytes of the offending sequence; `None` makes the reported
/// range run to the end of the data.
57+
err_len: Option<usize>,
/// Human-readable reason passed on to `handle_decode_error`.
58+
reason: &'a str,
59+
},
60+
}
/// Shared decode loop: repeatedly runs `decode` over the not-yet-consumed tail
/// of `data`, appending decoded text to the output and routing failures
/// through `handle_error` and the `errors` handler.
///
/// * `data` - bytes to decode; empty input returns `(String::new(), 0)` at once.
/// * `errors` - handler consulted for each reported error. It returns a
///   replacement string, optionally a new data buffer that replaces `data`,
///   and the index at which decoding restarts.
/// * `decode` - tries to decode a whole slice, returning the decoded `&str` or
///   a `DecodeError` describing the valid prefix and the failing remainder.
/// * `handle_error` - classifies a failure from the remaining bytes and the
///   error length: `HandleResult::Done` stops decoding quietly,
///   `HandleResult::Error` reports it to `errors`.
///
/// Returns the decoded string together with the index into the (possibly
/// replaced) data at which decoding stopped.
///
/// # Errors
/// Propagates any error from `errors.handle_decode_error`, and returns
/// `errors.error_oob_restart(restart)` when the restart index is out of
/// bounds for the data.
61+
fn decode_utf8_compatible<E: ErrorHandler, DecodeF, ErrF>(
62+
data: &[u8],
63+
errors: &E,
64+
decode: DecodeF,
65+
handle_error: ErrF,
66+
) -> Result<(String, usize), E::Error>
67+
where
68+
DecodeF: Fn(&[u8]) -> Result<&str, DecodeError<'_>>,
69+
ErrF: Fn(&[u8], Option<usize>) -> HandleResult<'_>,
70+
{
71+
if data.is_empty() {
72+
return Ok((String::new(), 0));
73+
}
74+
// we need to coerce the lifetime to that of the function body rather than the
75+
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
76+
let mut data = &*data;
// Owns a replacement buffer returned by the error handler so `data` can borrow from it.
77+
let mut data_from_err: E::BytesBuf;
78+
let mut out = String::with_capacity(data.len());
// Index into `data` at which `remaining_data` starts.
79+
let mut remaining_index = 0;
80+
let mut remaining_data = data;
81+
loop {
82+
match decode(remaining_data) {
83+
Ok(decoded) => {
// The whole remaining tail decoded cleanly: append it and finish.
84+
out.push_str(decoded);
85+
remaining_index += decoded.len();
86+
break;
87+
}
88+
Err(e) => {
// Keep what decoded successfully before deciding what to do with the failure.
89+
out.push_str(e.valid_prefix);
90+
match handle_error(e.rest, e.err_len) {
91+
HandleResult::Done => {
// Stop just past the valid prefix; the failing bytes stay unconsumed.
92+
remaining_index += e.valid_prefix.len();
93+
break;
94+
}
95+
HandleResult::Error { err_len, reason } => {
// Position of the failing byte within `data`.
96+
let err_idx = remaining_index + e.valid_prefix.len();
// With no known length, the error range runs to the end of `data`.
97+
let err_range =
98+
err_idx..err_len.map_or_else(|| data.len(), |len| err_idx + len);
99+
let (replace, new_data, restart) =
100+
errors.handle_decode_error(data, err_range, reason)?;
101+
out.push_str(replace.as_ref());
// The handler may substitute a different input buffer to continue from.
102+
if let Some(new_data) = new_data {
103+
data_from_err = new_data;
104+
data = data_from_err.as_ref();
105+
}
// Resume at the handler-chosen index; an out-of-bounds index is an error.
106+
remaining_data = data
107+
.get(restart..)
108+
.ok_or_else(|| errors.error_oob_restart(restart))?;
109+
remaining_index = restart;
110+
continue;
111+
}
112+
}
113+
}
114+
}
115+
}
116+
Ok((out, remaining_index))
117+
}
118+
29119
pub mod utf8 {
30120
use super::*;
31121

@@ -41,75 +131,120 @@ pub mod utf8 {
41131
errors: &E,
42132
final_decode: bool,
43133
) -> Result<(String, usize), E::Error> {
44-
if data.is_empty() {
45-
return Ok((String::new(), 0));
46-
}
47-
// we need to coerce the lifetime to that of the function body rather than the
48-
// anonymous input lifetime, so that we can assign it data borrowed from data_from_err
49-
let mut data = &*data;
50-
let mut data_from_err: E::BytesBuf;
51-
let mut out = String::with_capacity(data.len());
52-
let mut remaining_index = 0;
53-
let mut remaining_data = data;
54-
macro_rules! handle_error {
55-
($range:expr, $reason:expr) => {{
56-
let (replace, new_data, restart) =
57-
errors.handle_decode_error(data, $range, $reason)?;
58-
out.push_str(replace.as_ref());
59-
if let Some(new_data) = new_data {
60-
data_from_err = new_data;
61-
data = data_from_err.as_ref();
134+
decode_utf8_compatible(
135+
data,
136+
errors,
137+
|v| {
138+
core::str::from_utf8(v).map_err(|e| {
139+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
140+
// is valid utf8
141+
unsafe { make_decode_err(v, e.valid_up_to(), e.error_len()) }
142+
})
143+
},
144+
|rest, err_len| {
145+
let first_err = rest[0];
146+
if matches!(first_err, 0x80..=0xc1 | 0xf5..=0xff) {
147+
return HandleResult::Error {
148+
err_len: Some(1),
149+
reason: "invalid start byte",
150+
};
62151
}
63-
remaining_data = data
64-
.get(restart..)
65-
.ok_or_else(|| errors.error_oob_restart(restart))?;
66-
remaining_index = restart;
67-
continue;
68-
}};
69-
}
152+
if err_len.is_none() {
153+
// error_len() == None means unexpected eof
154+
let res = if final_decode {
155+
HandleResult::Error {
156+
err_len,
157+
reason: "unexpected end of data",
158+
}
159+
} else {
160+
HandleResult::Done
161+
};
162+
return res;
163+
}
164+
if !final_decode && matches!(rest, [0xed, 0xa0..=0xbf]) {
165+
// truncated surrogate
166+
return HandleResult::Done;
167+
}
168+
return HandleResult::Error {
169+
err_len,
170+
reason: "invalid continuation byte",
171+
};
172+
},
173+
)
174+
}
175+
}
176+
177+
pub mod ascii {
178+
use super::*;
179+
use ::ascii::AsciiStr;
180+
181+
pub const ENCODING_NAME: &str = "ascii";
182+
183+
const ERR_REASON: &str = "ordinal not in range(128)";
184+
/// Encodes `s` as ASCII bytes, consulting `errors` for every run of
/// non-ASCII characters.
///
/// ASCII text is copied through unchanged. For each run of consecutive
/// non-ASCII characters, `errors.handle_encode_error` is called with the full
/// input, the run's range expressed in character (not byte) indices, and the
/// reason "ordinal not in range(128)". A string replacement must itself be
/// ASCII; a bytes replacement is appended as-is. Encoding then resumes at the
/// character index returned by the handler.
///
/// # Errors
/// Propagates errors from `errors.handle_encode_error`; returns
/// `errors.error_encoding(..)` when a string replacement is not ASCII, and
/// `errors.error_oob_restart(..)` when `crate::str::try_get_chars` yields
/// `None` for the restart index (presumably because it is out of range;
/// that helper is not visible here).
///
/// NOTE(review): in this rendered diff, lines whose gutter marker ends in `-`
/// are removed lines of the previous `utf8::decode` body and are not part of
/// this function.
185+
#[inline]
186+
pub fn encode<E: ErrorHandler>(s: &str, errors: &E) -> Result<Vec<u8>, E::Error> {
// Kept intact so error ranges and restarts always refer to the whole input.
187+
let full_data = s;
// The not-yet-encoded tail of the input.
188+
let mut data = s;
// Character index within `full_data` at which `data` starts.
189+
let mut char_data_index = 0;
190+
let mut out = Vec::<u8>::new();
70191
loop {
71-
match core::str::from_utf8(remaining_data) {
72-
Ok(decoded) => {
73-
out.push_str(decoded);
74-
remaining_index += decoded.len();
// Find the first non-ASCII char, tracking both its char index and its byte offset.
192+
match data
193+
.char_indices()
194+
.enumerate()
195+
.find(|(_, (_, c))| !c.is_ascii())
196+
{
197+
None => {
// The rest is pure ASCII: copy it and finish.
198+
out.extend_from_slice(data.as_bytes());
75199
break;
76200
}
77-
Err(e) => {
78-
let (valid_prefix, rest, first_err) = unsafe {
79-
let index = e.valid_up_to();
80-
// SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
81-
let valid =
82-
std::str::from_utf8_unchecked(remaining_data.get_unchecked(..index));
83-
let rest = remaining_data.get_unchecked(index..);
84-
// SAFETY: if index didn't have something at it, this wouldn't be an error
85-
let first_err = *remaining_data.get_unchecked(index);
86-
(valid, rest, first_err)
87-
};
88-
out.push_str(valid_prefix);
89-
let err_idx = remaining_index + e.valid_up_to();
90-
remaining_data = rest;
91-
remaining_index += valid_prefix.len();
92-
if (0x80..0xc2).contains(&first_err) || (0xf5..=0xff).contains(&first_err) {
93-
handle_error!(err_idx..err_idx + 1, "invalid start byte");
94-
}
95-
let err_len = match e.error_len() {
96-
Some(l) => l,
97-
// error_len() == None means unexpected eof
98-
None => {
99-
if !final_decode {
100-
break;
201+
Some((char_i, (byte_i, _))) => {
// Copy the ASCII prefix that precedes the offending character.
202+
out.extend_from_slice(&data.as_bytes()[..byte_i]);
// Translate the position into a char index relative to the full input.
203+
let char_start = char_data_index + char_i;
204+
// number of non-ascii chars between the first non-ascii char and the next ascii char
205+
let non_ascii_run_length =
206+
data[byte_i..].chars().take_while(|c| !c.is_ascii()).count();
207+
let char_range = char_start..char_start + non_ascii_run_length;
208+
let (replace, char_restart) =
209+
errors.handle_encode_error(full_data, char_range.clone(), ERR_REASON)?;
210+
match replace {
211+
EncodeReplace::Str(s) => {
// A string replacement that is itself non-ASCII cannot be encoded either.
212+
if !s.is_ascii() {
213+
return Err(
214+
errors.error_encoding(full_data, char_range, ERR_REASON)
215+
);
101216
}
102-
handle_error!(err_idx..data.len(), "unexpected end of data");
217+
out.extend_from_slice(s.as_ref().as_bytes());
218+
}
219+
EncodeReplace::Bytes(b) => {
// Bytes replacements are appended without any ASCII check.
220+
out.extend_from_slice(b.as_ref());
103221
}
104-
};
105-
if !final_decode && matches!(remaining_data, [0xed, 0xa0..=0xbf]) {
106-
// truncated surrogate
107-
break;
108222
}
109-
handle_error!(err_idx..err_idx + err_len, "invalid continuation byte");
// Resume from the handler-chosen character index of the full input.
223+
data = crate::str::try_get_chars(full_data, char_restart..)
224+
.ok_or_else(|| errors.error_oob_restart(char_restart))?;
225+
char_data_index = char_restart;
226+
continue;
110227
}
111228
}
112229
}
113-
Ok((out, remaining_index))
230+
Ok(out)
231+
}
232+
/// Decodes `data` as ASCII via the shared `decode_utf8_compatible` loop.
///
/// Each byte that `AsciiStr::from_ascii` rejects is reported to `errors` as a
/// one-byte error with the reason "ordinal not in range(128)". The classifier
/// never returns `HandleResult::Done`, so every failure goes to the handler.
///
/// Returns whatever `decode_utf8_compatible` returns: the decoded string and
/// the index at which decoding stopped.
233+
pub fn decode<E: ErrorHandler>(data: &[u8], errors: &E) -> Result<(String, usize), E::Error> {
234+
decode_utf8_compatible(
235+
data,
236+
errors,
237+
|v| {
238+
AsciiStr::from_ascii(v).map(|s| s.as_str()).map_err(|e| {
239+
// SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
240+
// is valid ascii & therefore valid utf8
241+
unsafe { make_decode_err(v, e.valid_up_to(), Some(1)) }
242+
})
243+
},
// Every failure is an error; the length is the `Some(1)` set above.
244+
|_rest, err_len| HandleResult::Error {
245+
err_len,
246+
reason: ERR_REASON,
247+
},
248+
)
114249
}
115250
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.