@@ -4,13 +4,20 @@ pub type EncodeErrorResult<S, B, E> = Result<(EncodeReplace<S, B>, usize), E>;
4
4
5
5
/// Result returned by a decode error handler.
///
/// On success the tuple is `(replacement, new_data, restart)`: the replacement text is
/// appended to the decoded output, `new_data` (when `Some`) replaces the input being
/// decoded, and `restart` is the index into the input at which decoding resumes.
pub type DecodeErrorResult < S , B , E > = Result < ( S , Option < B > , usize ) , E > ;
6
6
7
/// A string-like buffer, usable as the `StrBuf` type of an `ErrorHandler`.
///
/// Any `AsRef<str>` type can opt in with an empty `impl`. Implementors may override
/// `is_ascii` if they have a cheaper way to answer than scanning the text.
pub trait StrBuffer: AsRef<str> {
    /// Returns `true` if every character in the buffer is ASCII.
    ///
    /// The default implementation checks the borrowed `str`.
    fn is_ascii(&self) -> bool {
        self.as_ref().is_ascii()
    }
}
12
+
7
13
pub trait ErrorHandler {
8
14
type Error ;
9
- type StrBuf : AsRef < str > ;
15
+ type StrBuf : StrBuffer ;
10
16
type BytesBuf : AsRef < [ u8 ] > ;
11
17
fn handle_encode_error (
12
18
& self ,
13
- byte_range : Range < usize > ,
19
+ data : & str ,
20
+ char_range : Range < usize > ,
14
21
reason : & str ,
15
22
) -> EncodeErrorResult < Self :: StrBuf , Self :: BytesBuf , Self :: Error > ;
16
23
fn handle_decode_error (
@@ -20,12 +27,95 @@ pub trait ErrorHandler {
20
27
reason : & str ,
21
28
) -> DecodeErrorResult < Self :: StrBuf , Self :: BytesBuf , Self :: Error > ;
22
29
fn error_oob_restart ( & self , i : usize ) -> Self :: Error ;
30
+ fn error_encoding ( & self , data : & str , char_range : Range < usize > , reason : & str ) -> Self :: Error ;
23
31
}
24
32
/// Replacement supplied by an encode error handler for a range that could not be encoded.
pub enum EncodeReplace<S, B> {
    /// Replacement given as text; the encoder still has to turn it into bytes.
    Str(S),
    /// Replacement given as raw bytes.
    Bytes(B),
}
28
36
37
/// A failed decode attempt, split at the first byte that could not be decoded.
struct DecodeError<'a> {
    /// Leading part of the input that decoded successfully.
    valid_prefix: &'a str,
    /// Undecoded remainder of the input, starting at the first offending byte.
    rest: &'a [u8],
    /// Length in bytes of the invalid sequence at the start of `rest`; `None` means the
    /// error extends to the end of the data (unexpected end of input).
    err_len: Option<usize>,
}
42
+ /// # Safety
43
+ /// `v[..valid_up_to]` must be valid utf8
44
+ unsafe fn make_decode_err ( v : & [ u8 ] , valid_up_to : usize , err_len : Option < usize > ) -> DecodeError < ' _ > {
45
+ let valid_prefix = core:: str:: from_utf8_unchecked ( v. get_unchecked ( ..valid_up_to) ) ;
46
+ let rest = v. get_unchecked ( valid_up_to..) ;
47
+ DecodeError {
48
+ valid_prefix,
49
+ rest,
50
+ err_len,
51
+ }
52
+ }
53
+
54
/// Verdict of a codec-specific classifier on a decode failure.
enum HandleResult<'a> {
    /// Stop decoding without reporting an error; the undecoded rest is left unconsumed.
    Done,
    /// Report the failure to the error handler.
    Error {
        /// Length in bytes of the invalid sequence; `None` means it runs to the end of the data.
        err_len: Option<usize>,
        /// Human-readable description of the failure.
        reason: &'a str,
    },
}
61
+ fn decode_utf8_compatible < E : ErrorHandler , DecodeF , ErrF > (
62
+ data : & [ u8 ] ,
63
+ errors : & E ,
64
+ decode : DecodeF ,
65
+ handle_error : ErrF ,
66
+ ) -> Result < ( String , usize ) , E :: Error >
67
+ where
68
+ DecodeF : Fn ( & [ u8 ] ) -> Result < & str , DecodeError < ' _ > > ,
69
+ ErrF : Fn ( & [ u8 ] , Option < usize > ) -> HandleResult < ' _ > ,
70
+ {
71
+ if data. is_empty ( ) {
72
+ return Ok ( ( String :: new ( ) , 0 ) ) ;
73
+ }
74
+ // we need to coerce the lifetime to that of the function body rather than the
75
+ // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
76
+ let mut data = & * data;
77
+ let mut data_from_err: E :: BytesBuf ;
78
+ let mut out = String :: with_capacity ( data. len ( ) ) ;
79
+ let mut remaining_index = 0 ;
80
+ let mut remaining_data = data;
81
+ loop {
82
+ match decode ( remaining_data) {
83
+ Ok ( decoded) => {
84
+ out. push_str ( decoded) ;
85
+ remaining_index += decoded. len ( ) ;
86
+ break ;
87
+ }
88
+ Err ( e) => {
89
+ out. push_str ( e. valid_prefix ) ;
90
+ match handle_error ( e. rest , e. err_len ) {
91
+ HandleResult :: Done => {
92
+ remaining_index += e. valid_prefix . len ( ) ;
93
+ break ;
94
+ }
95
+ HandleResult :: Error { err_len, reason } => {
96
+ let err_idx = remaining_index + e. valid_prefix . len ( ) ;
97
+ let err_range =
98
+ err_idx..err_len. map_or_else ( || data. len ( ) , |len| err_idx + len) ;
99
+ let ( replace, new_data, restart) =
100
+ errors. handle_decode_error ( data, err_range, reason) ?;
101
+ out. push_str ( replace. as_ref ( ) ) ;
102
+ if let Some ( new_data) = new_data {
103
+ data_from_err = new_data;
104
+ data = data_from_err. as_ref ( ) ;
105
+ }
106
+ remaining_data = data
107
+ . get ( restart..)
108
+ . ok_or_else ( || errors. error_oob_restart ( restart) ) ?;
109
+ remaining_index = restart;
110
+ continue ;
111
+ }
112
+ }
113
+ }
114
+ }
115
+ }
116
+ Ok ( ( out, remaining_index) )
117
+ }
118
+
29
119
pub mod utf8 {
30
120
use super :: * ;
31
121
@@ -41,75 +131,120 @@ pub mod utf8 {
41
131
errors : & E ,
42
132
final_decode : bool ,
43
133
) -> Result < ( String , usize ) , E :: Error > {
44
- if data. is_empty ( ) {
45
- return Ok ( ( String :: new ( ) , 0 ) ) ;
46
- }
47
- // we need to coerce the lifetime to that of the function body rather than the
48
- // anonymous input lifetime, so that we can assign it data borrowed from data_from_err
49
- let mut data = & * data;
50
- let mut data_from_err: E :: BytesBuf ;
51
- let mut out = String :: with_capacity ( data. len ( ) ) ;
52
- let mut remaining_index = 0 ;
53
- let mut remaining_data = data;
54
- macro_rules! handle_error {
55
- ( $range: expr, $reason: expr) => { {
56
- let ( replace, new_data, restart) =
57
- errors. handle_decode_error( data, $range, $reason) ?;
58
- out. push_str( replace. as_ref( ) ) ;
59
- if let Some ( new_data) = new_data {
60
- data_from_err = new_data;
61
- data = data_from_err. as_ref( ) ;
134
+ decode_utf8_compatible (
135
+ data,
136
+ errors,
137
+ |v| {
138
+ core:: str:: from_utf8 ( v) . map_err ( |e| {
139
+ // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
140
+ // is valid utf8
141
+ unsafe { make_decode_err ( v, e. valid_up_to ( ) , e. error_len ( ) ) }
142
+ } )
143
+ } ,
144
+ |rest, err_len| {
145
+ let first_err = rest[ 0 ] ;
146
+ if matches ! ( first_err, 0x80 ..=0xc1 | 0xf5 ..=0xff ) {
147
+ return HandleResult :: Error {
148
+ err_len : Some ( 1 ) ,
149
+ reason : "invalid start byte" ,
150
+ } ;
62
151
}
63
- remaining_data = data
64
- . get( restart..)
65
- . ok_or_else( || errors. error_oob_restart( restart) ) ?;
66
- remaining_index = restart;
67
- continue ;
68
- } } ;
69
- }
152
+ if err_len. is_none ( ) {
153
+ // error_len() == None means unexpected eof
154
+ let res = if final_decode {
155
+ HandleResult :: Error {
156
+ err_len,
157
+ reason : "unexpected end of data" ,
158
+ }
159
+ } else {
160
+ HandleResult :: Done
161
+ } ;
162
+ return res;
163
+ }
164
+ if !final_decode && matches ! ( rest, [ 0xed , 0xa0 ..=0xbf ] ) {
165
+ // truncated surrogate
166
+ return HandleResult :: Done ;
167
+ }
168
+ return HandleResult :: Error {
169
+ err_len,
170
+ reason : "invalid continuation byte" ,
171
+ } ;
172
+ } ,
173
+ )
174
+ }
175
+ }
176
+
177
+ pub mod ascii {
178
+ use super :: * ;
179
+ use :: ascii:: AsciiStr ;
180
+
181
+ pub const ENCODING_NAME : & str = "ascii" ;
182
+
183
+ const ERR_REASON : & str = "ordinal not in range(128)" ;
184
+
185
+ #[ inline]
186
+ pub fn encode < E : ErrorHandler > ( s : & str , errors : & E ) -> Result < Vec < u8 > , E :: Error > {
187
+ let full_data = s;
188
+ let mut data = s;
189
+ let mut char_data_index = 0 ;
190
+ let mut out = Vec :: < u8 > :: new ( ) ;
70
191
loop {
71
- match core:: str:: from_utf8 ( remaining_data) {
72
- Ok ( decoded) => {
73
- out. push_str ( decoded) ;
74
- remaining_index += decoded. len ( ) ;
192
+ match data
193
+ . char_indices ( )
194
+ . enumerate ( )
195
+ . find ( |( _, ( _, c) ) | !c. is_ascii ( ) )
196
+ {
197
+ None => {
198
+ out. extend_from_slice ( data. as_bytes ( ) ) ;
75
199
break ;
76
200
}
77
- Err ( e) => {
78
- let ( valid_prefix, rest, first_err) = unsafe {
79
- let index = e. valid_up_to ( ) ;
80
- // SAFETY: as specified in valid_up_to's documentation, from_utf8(&input[..index]) will return Ok(_)
81
- let valid =
82
- std:: str:: from_utf8_unchecked ( remaining_data. get_unchecked ( ..index) ) ;
83
- let rest = remaining_data. get_unchecked ( index..) ;
84
- // SAFETY: if index didn't have something at it, this wouldn't be an error
85
- let first_err = * remaining_data. get_unchecked ( index) ;
86
- ( valid, rest, first_err)
87
- } ;
88
- out. push_str ( valid_prefix) ;
89
- let err_idx = remaining_index + e. valid_up_to ( ) ;
90
- remaining_data = rest;
91
- remaining_index += valid_prefix. len ( ) ;
92
- if ( 0x80 ..0xc2 ) . contains ( & first_err) || ( 0xf5 ..=0xff ) . contains ( & first_err) {
93
- handle_error ! ( err_idx..err_idx + 1 , "invalid start byte" ) ;
94
- }
95
- let err_len = match e. error_len ( ) {
96
- Some ( l) => l,
97
- // error_len() == None means unexpected eof
98
- None => {
99
- if !final_decode {
100
- break ;
201
+ Some ( ( char_i, ( byte_i, _) ) ) => {
202
+ out. extend_from_slice ( & data. as_bytes ( ) [ ..byte_i] ) ;
203
+ let char_start = char_data_index + char_i;
204
+ // number of non-ascii chars between the first non-ascii char and the next ascii char
205
+ let non_ascii_run_length =
206
+ data[ byte_i..] . chars ( ) . take_while ( |c| !c. is_ascii ( ) ) . count ( ) ;
207
+ let char_range = char_start..char_start + non_ascii_run_length;
208
+ let ( replace, char_restart) =
209
+ errors. handle_encode_error ( full_data, char_range. clone ( ) , ERR_REASON ) ?;
210
+ match replace {
211
+ EncodeReplace :: Str ( s) => {
212
+ if !s. is_ascii ( ) {
213
+ return Err (
214
+ errors. error_encoding ( full_data, char_range, ERR_REASON )
215
+ ) ;
101
216
}
102
- handle_error ! ( err_idx..data. len( ) , "unexpected end of data" ) ;
217
+ out. extend_from_slice ( s. as_ref ( ) . as_bytes ( ) ) ;
218
+ }
219
+ EncodeReplace :: Bytes ( b) => {
220
+ out. extend_from_slice ( b. as_ref ( ) ) ;
103
221
}
104
- } ;
105
- if !final_decode && matches ! ( remaining_data, [ 0xed , 0xa0 ..=0xbf ] ) {
106
- // truncated surrogate
107
- break ;
108
222
}
109
- handle_error ! ( err_idx..err_idx + err_len, "invalid continuation byte" ) ;
223
+ data = crate :: str:: try_get_chars ( full_data, char_restart..)
224
+ . ok_or_else ( || errors. error_oob_restart ( char_restart) ) ?;
225
+ char_data_index = char_restart;
226
+ continue ;
110
227
}
111
228
}
112
229
}
113
- Ok ( ( out, remaining_index) )
230
+ Ok ( out)
231
+ }
232
+
233
+ pub fn decode < E : ErrorHandler > ( data : & [ u8 ] , errors : & E ) -> Result < ( String , usize ) , E :: Error > {
234
+ decode_utf8_compatible (
235
+ data,
236
+ errors,
237
+ |v| {
238
+ AsciiStr :: from_ascii ( v) . map ( |s| s. as_str ( ) ) . map_err ( |e| {
239
+ // SAFETY: as specified in valid_up_to's documentation, input[..e.valid_up_to()]
240
+ // is valid ascii & therefore valid utf8
241
+ unsafe { make_decode_err ( v, e. valid_up_to ( ) , Some ( 1 ) ) }
242
+ } )
243
+ } ,
244
+ |_rest, err_len| HandleResult :: Error {
245
+ err_len,
246
+ reason : ERR_REASON ,
247
+ } ,
248
+ )
114
249
}
115
250
}
0 commit comments