@@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
44
/// Result returned by a decode error handler.
///
/// On success the tuple holds, in order:
/// 1. the replacement text to append to the decoded output,
/// 2. an optional new input buffer that replaces the data being decoded,
/// 3. the byte index in the (possibly replaced) input at which decoding restarts.
pub type DecodeErrorResult<S, B, E> = Result<(S, Option<B>, usize), E>;
66
/// A string-like buffer that can be viewed as `&str`.
///
/// Implementors get [`StrBuffer::is_ascii`] for free; they may override it when
/// they can answer the question without scanning the whole string.
pub trait StrBuffer: AsRef<str> {
    /// Returns `true` when every character of the buffer is ASCII.
    ///
    /// The default implementation delegates to [`str::is_ascii`].
    fn is_ascii(&self) -> bool {
        self.as_ref().is_ascii()
    }
}
12+
713pub trait ErrorHandler {
814 type Error ;
9- type StrBuf : AsRef < str > ;
15+ type StrBuf : StrBuffer ;
1016 type BytesBuf : AsRef < [ u8 ] > ;
1117 fn handle_encode_error (
1218 & self ,
13- byte_range : Range < usize > ,
19+ data : & str ,
20+ char_range : Range < usize > ,
1421 reason : & str ,
1522 ) -> EncodeErrorResult < Self :: StrBuf , Self :: BytesBuf , Self :: Error > ;
1623 fn handle_decode_error (
@@ -20,12 +27,95 @@ pub trait ErrorHandler {
2027 reason : & str ,
2128 ) -> DecodeErrorResult < Self :: StrBuf , Self :: BytesBuf , Self :: Error > ;
2229 fn error_oob_restart ( & self , i : usize ) -> Self :: Error ;
30+ fn error_encoding ( & self , data : & str , char_range : Range < usize > , reason : & str ) -> Self :: Error ;
2331}
/// Replacement produced by an encode error handler for a range that could not
/// be encoded.
pub enum EncodeReplace<S, B> {
    /// Replacement given as a string buffer.
    Str(S),
    /// Replacement given as a byte buffer.
    Bytes(B),
}
2836
/// Description of a failed decode attempt, split at the first offending byte.
struct DecodeError<'a> {
    /// Leading part of the input that decoded successfully.
    valid_prefix: &'a str,
    /// Remaining input, starting at the first byte that failed to decode.
    rest: &'a [u8],
    /// Length in bytes of the invalid sequence at the start of `rest`, or
    /// `None` when the decoder could not determine one.
    err_len: Option<usize>,
}

/// Splits `v` at `valid_up_to` into a [`DecodeError`] without re-validating it.
///
/// # Safety
/// `valid_up_to` must not exceed `v.len()`, and `v[..valid_up_to]` must be valid utf8
unsafe fn make_decode_err(v: &[u8], valid_up_to: usize, err_len: Option<usize>) -> DecodeError<'_> {
    debug_assert!(valid_up_to <= v.len());
    // SAFETY: the caller guarantees `valid_up_to <= v.len()`, so both ranges
    // are in bounds.
    let (head, rest) = unsafe { (v.get_unchecked(..valid_up_to), v.get_unchecked(valid_up_to..)) };
    // SAFETY: the caller guarantees that `v[..valid_up_to]` is valid utf8.
    let valid_prefix = unsafe { core::str::from_utf8_unchecked(head) };
    DecodeError {
        valid_prefix,
        rest,
        err_len,
    }
}
53+
/// Verdict of a codec-specific classifier about a decode failure.
enum HandleResult<'a> {
    /// Stop decoding here without reporting an error.
    Done,
    /// Report the failure to the error handler.
    Error {
        /// Length in bytes of the offending sequence; `None` makes the
        /// reported range extend to the end of the data.
        err_len: Option<usize>,
        /// Human-readable reason passed on to the error handler.
        reason: &'a str,
    },
}
61+ fn decode_utf8_compatible < E : ErrorHandler , DecodeF , ErrF > (
62+ data : & [ u8 ] ,
63+ errors : & E ,
64+ decode : DecodeF ,
65+ handle_error : ErrF ,
66+ ) -> Result < ( String , usize ) , E :: Error >
67+ where
68+ DecodeF : Fn ( & [ u8 ] ) -> Result < & str , DecodeError < ' _ > > ,
69+ ErrF : Fn ( & [ u8 ] , Option < usize > ) -> HandleResult < ' _ > ,
70+ {
71+ if data. is_empty ( ) {
72+ return Ok ( ( String :: new ( ) , 0 ) ) ;
73+ }
74+ // we need to coerce the lifetime to that of the function body rather than the
75+ // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
76+ let mut data = & * data;
77+ let mut data_from_err: E :: BytesBuf ;
78+ let mut out = String :: with_capacity ( data. len ( ) ) ;
79+ let mut remaining_index = 0 ;
80+ let mut remaining_data = data;
81+ loop {
82+ match decode ( remaining_data) {
83+ Ok ( decoded) => {
84+ out. push_str ( decoded) ;
85+ remaining_index += decoded. len ( ) ;
86+ break ;
87+ }
88+ Err ( e) => {
89+ out. push_str ( e. valid_prefix ) ;
90+ match handle_error ( e. rest , e. err_len ) {
91+ HandleResult :: Done => {
92+ remaining_index += e. valid_prefix . len ( ) ;
93+ break ;
94+ }
95+ HandleResult :: Error { err_len, reason } => {
96+ let err_idx = remaining_index + e. valid_prefix . len ( ) ;
97+ let err_range =
98+ err_idx..err_len. map_or_else ( || data. len ( ) , |len| err_idx + len) ;
99+ let ( replace, new_data, restart) =
100+ errors. handle_decode_error ( data, err_range, reason) ?;
101+ out. push_str ( replace. as_ref ( ) ) ;
102+ if let Some ( new_data) = new_data {
103+ data_from_err = new_data;
104+ data = data_from_err. as_ref ( ) ;
105+ }
106+ remaining_data = data
107+ . get ( restart..)
108+ . ok_or_else ( || errors. error_oob_restart ( restart) ) ?;
109+ remaining_index = restart;
110+ continue ;
111+ }
112+ }
113+ }
114+ }
115+ }
116+ Ok ( ( out, remaining_index) )
117+ }
118+
29119pub mod utf8 {
30120 use super :: * ;
31121
@@ -41,75 +131,120 @@ pub mod utf8 {
41131 errors : & E ,
42132 final_decode : bool ,
43133 ) -> Result < ( String , usize ) , E :: Error > {
44- if data. is_empty ( ) {
45- return Ok ( ( String :: new ( ) , 0 ) ) ;
46- }
47- // we need to coerce the lifetime to that of the function body rather than the
48- // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
49- let mut data = & * data;
50- let mut data_from_err: E :: BytesBuf ;
51- let mut out = String :: with_capacity ( data. len ( ) ) ;
52- let mut remaining_index = 0 ;
53- let mut remaining_data = data;
54- macro_rules! handle_error {
55- ( $range: expr, $reason: expr) => { {
56- let ( replace, new_data, restart) =
57- errors. handle_decode_error( data, $range, $reason) ?;
58- out. push_str( replace. as_ref( ) ) ;
59- if let Some ( new_data) = new_data {
60- data_from_err = new_data;
61- data = data_from_err. as_ref( ) ;
134+ decode_utf8_compatible (
135+ data,
136+ errors,
137+ |v| {
138+ core:: str:: from_utf8 ( v) . map_err ( |e| {
139+ // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
140+ // is valid utf8
141+ unsafe { make_decode_err ( v, e. valid_up_to ( ) , e. error_len ( ) ) }
142+ } )
143+ } ,
144+ |rest, err_len| {
145+ let first_err = rest[ 0 ] ;
146+ if matches ! ( first_err, 0x80 ..=0xc1 | 0xf5 ..=0xff ) {
147+ return HandleResult :: Error {
148+ err_len : Some ( 1 ) ,
149+ reason : "invalid start byte" ,
150+ } ;
62151 }
63- remaining_data = data
64- . get( restart..)
65- . ok_or_else( || errors. error_oob_restart( restart) ) ?;
66- remaining_index = restart;
67- continue ;
68- } } ;
69- }
152+ if err_len. is_none ( ) {
153+ // error_len() == None means unexpected eof
154+ let res = if final_decode {
155+ HandleResult :: Error {
156+ err_len,
157+ reason : "unexpected end of data" ,
158+ }
159+ } else {
160+ HandleResult :: Done
161+ } ;
162+ return res;
163+ }
164+ if !final_decode && matches ! ( rest, [ 0xed , 0xa0 ..=0xbf ] ) {
165+ // truncated surrogate
166+ return HandleResult :: Done ;
167+ }
168+ return HandleResult :: Error {
169+ err_len,
170+ reason : "invalid continuation byte" ,
171+ } ;
172+ } ,
173+ )
174+ }
175+ }
176+
177+ pub mod ascii {
178+ use super :: * ;
179+ use :: ascii:: AsciiStr ;
180+
181+ pub const ENCODING_NAME : & str = "ascii" ;
182+
183+ const ERR_REASON : & str = "ordinal not in range(128)" ;
184+
185+ #[ inline]
186+ pub fn encode < E : ErrorHandler > ( s : & str , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
187+ let full_data = s;
188+ let mut data = s;
189+ let mut char_data_index = 0 ;
190+ let mut out = Vec :: < u8 > :: new ( ) ;
70191 loop {
71- match core:: str:: from_utf8 ( remaining_data) {
72- Ok ( decoded) => {
73- out. push_str ( decoded) ;
74- remaining_index += decoded. len ( ) ;
192+ match data
193+ . char_indices ( )
194+ . enumerate ( )
195+ . find ( |( _, ( _, c) ) | !c. is_ascii ( ) )
196+ {
197+ None => {
198+ out. extend_from_slice ( data. as_bytes ( ) ) ;
75199 break ;
76200 }
77- Err ( e) => {
78- let ( valid_prefix, rest, first_err) = unsafe {
79- let index = e. valid_up_to ( ) ;
80- // SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
81- let valid =
82- std:: str:: from_utf8_unchecked ( remaining_data. get_unchecked ( ..index) ) ;
83- let rest = remaining_data. get_unchecked ( index..) ;
84- // SAFETY: if index didn't have something at it, this wouldn't be an error
85- let first_err = * remaining_data. get_unchecked ( index) ;
86- ( valid, rest, first_err)
87- } ;
88- out. push_str ( valid_prefix) ;
89- let err_idx = remaining_index + e. valid_up_to ( ) ;
90- remaining_data = rest;
91- remaining_index += valid_prefix. len ( ) ;
92- if ( 0x80 ..0xc2 ) . contains ( & first_err) || ( 0xf5 ..=0xff ) . contains ( & first_err) {
93- handle_error ! ( err_idx..err_idx + 1 , "invalid start byte" ) ;
94- }
95- let err_len = match e. error_len ( ) {
96- Some ( l) => l,
97- // error_len() == None means unexpected eof
98- None => {
99- if !final_decode {
100- break ;
201+ Some ( ( char_i, ( byte_i, _) ) ) => {
202+ out. extend_from_slice ( & data. as_bytes ( ) [ ..byte_i] ) ;
203+ let char_start = char_data_index + char_i;
204+ // number of non-ascii chars between the first non-ascii char and the next ascii char
205+ let non_ascii_run_length =
206+ data[ byte_i..] . chars ( ) . take_while ( |c| !c. is_ascii ( ) ) . count ( ) ;
207+ let char_range = char_start..char_start + non_ascii_run_length;
208+ let ( replace, char_restart) =
209+ errors. handle_encode_error ( full_data, char_range. clone ( ) , ERR_REASON ) ?;
210+ match replace {
211+ EncodeReplace :: Str ( s) => {
212+ if !s. is_ascii ( ) {
213+ return Err (
214+ errors. error_encoding ( full_data, char_range, ERR_REASON )
215+ ) ;
101216 }
102- handle_error ! ( err_idx..data. len( ) , "unexpected end of data" ) ;
217+ out. extend_from_slice ( s. as_ref ( ) . as_bytes ( ) ) ;
218+ }
219+ EncodeReplace :: Bytes ( b) => {
220+ out. extend_from_slice ( b. as_ref ( ) ) ;
103221 }
104- } ;
105- if !final_decode && matches ! ( remaining_data, [ 0xed , 0xa0 ..=0xbf ] ) {
106- // truncated surrogate
107- break ;
108222 }
109- handle_error ! ( err_idx..err_idx + err_len, "invalid continuation byte" ) ;
223+ data = crate :: str:: try_get_chars ( full_data, char_restart..)
224+ . ok_or_else ( || errors. error_oob_restart ( char_restart) ) ?;
225+ char_data_index = char_restart;
226+ continue ;
110227 }
111228 }
112229 }
113- Ok ( ( out, remaining_index) )
230+ Ok ( out)
231+ }
232+
233+ pub fn decode < E : ErrorHandler > ( data : & [ u8 ] , errors : & E ) -> Result < ( String , usize ) , E :: Error > {
234+ decode_utf8_compatible (
235+ data,
236+ errors,
237+ |v| {
238+ AsciiStr :: from_ascii ( v) . map ( |s| s. as_str ( ) ) . map_err ( |e| {
239+ // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
240+ // is valid ascii & therefore valid utf8
241+ unsafe { make_decode_err ( v, e. valid_up_to ( ) , Some ( 1 ) ) }
242+ } )
243+ } ,
244+ |_rest, err_len| HandleResult :: Error {
245+ err_len,
246+ reason : ERR_REASON ,
247+ } ,
248+ )
114249 }
115250}
0 commit comments