Skip to content

Commit fceae4f

Browse files
committed
Integrate vendored "utf-8" crate with tendril types
Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
1 parent c5d5da2 commit fceae4f

4 files changed

Lines changed: 217 additions & 214 deletions

File tree

tendril/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@ pub mod stream;
2525

2626
mod buf32;
2727
mod tendril;
28-
mod utf8;
2928
mod utf8_decode;
3029
mod util;
3130

tendril/src/stream.rs

Lines changed: 45 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,8 @@
66

77
//! Streams of tendrils.
88
9-
use crate::fmt;
9+
use crate::utf8_decode::{decode_utf8, DecodeError, REPLACEMENT_CHARACTER};
10+
use crate::{fmt, IncompleteUtf8};
1011
use crate::{Atomicity, NonAtomic, Tendril};
1112

1213
use std::borrow::Cow;
@@ -15,7 +16,6 @@ use std::io;
1516
use std::marker::PhantomData;
1617
use std::path::Path;
1718

18-
use crate::utf8;
1919
#[cfg(feature = "encoding_rs")]
2020
use encoding_rs::{self, DecoderResult};
2121

@@ -124,7 +124,7 @@ where
124124
A: Atomicity,
125125
{
126126
pub inner_sink: Sink,
127-
incomplete: Option<utf8::Incomplete>,
127+
incomplete: Option<IncompleteUtf8>,
128128
marker: PhantomData<A>,
129129
}
130130

@@ -150,64 +150,71 @@ where
150150
A: Atomicity,
151151
{
152152
#[inline]
153-
fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
153+
fn process(&mut self, mut bytes: Tendril<fmt::Bytes, A>) {
154154
// FIXME: remove take() and map() when non-lexical borrows are stable.
155155
if let Some(mut incomplete) = self.incomplete.take() {
156-
let resume_at = incomplete.try_complete(&t).map(|(result, rest)| {
157-
match result {
158-
Ok(s) => self.inner_sink.process(Tendril::from_slice(s)),
159-
Err(_) => {
160-
self.inner_sink.error("invalid byte sequence".into());
161-
self.inner_sink
162-
.process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
163-
},
164-
}
165-
t.len() - rest.len()
166-
});
156+
let resume_at = incomplete
157+
.try_to_complete_codepoint(&bytes)
158+
.map(|(result, rest)| {
159+
match result {
160+
Ok(decoded_string) => {
161+
self.inner_sink.process(Tendril::from_slice(decoded_string))
162+
},
163+
Err(_) => {
164+
self.inner_sink.error("invalid byte sequence".into());
165+
self.inner_sink
166+
.process(Tendril::from_slice(REPLACEMENT_CHARACTER));
167+
},
168+
}
169+
bytes.len() - rest.len()
170+
});
167171
match resume_at {
168172
None => {
169173
self.incomplete = Some(incomplete);
170174
return;
171175
},
172-
Some(resume_at) => t.pop_front(resume_at as u32),
176+
Some(resume_at) => bytes.pop_front(resume_at as u32),
173177
}
174178
}
175-
while !t.is_empty() {
176-
let unborrowed_result = match utf8::decode(&t) {
179+
while !bytes.is_empty() {
180+
let unborrowed_result = match decode_utf8(&bytes) {
177181
Ok(s) => {
178-
debug_assert!(s.as_ptr() == t.as_ptr());
179-
debug_assert!(s.len() == t.len());
182+
debug_assert!(s.as_ptr() == bytes.as_ptr());
183+
debug_assert!(s.len() == bytes.len());
180184
Ok(())
181185
},
182-
Err(utf8::DecodeError::Invalid {
186+
Err(DecodeError::Invalid {
183187
valid_prefix,
184188
invalid_sequence,
185189
..
186190
}) => {
187-
debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
188-
debug_assert!(valid_prefix.len() <= t.len());
191+
debug_assert!(valid_prefix.as_ptr() == bytes.as_ptr());
192+
debug_assert!(valid_prefix.len() <= bytes.len());
189193
Err((
190194
valid_prefix.len(),
191195
Err(valid_prefix.len() + invalid_sequence.len()),
192196
))
193197
},
194-
Err(utf8::DecodeError::Incomplete {
198+
Err(DecodeError::Incomplete {
195199
valid_prefix,
196200
incomplete_suffix,
197201
}) => {
198-
debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
199-
debug_assert!(valid_prefix.len() <= t.len());
202+
debug_assert!(valid_prefix.as_ptr() == bytes.as_ptr());
203+
debug_assert!(valid_prefix.len() <= bytes.len());
200204
Err((valid_prefix.len(), Ok(incomplete_suffix)))
201205
},
202206
};
203207
match unborrowed_result {
204208
Ok(()) => {
205-
unsafe { self.inner_sink.process(t.reinterpret_without_validating()) }
209+
unsafe {
210+
self.inner_sink
211+
.process(bytes.reinterpret_without_validating())
212+
}
206213
return;
207214
},
208215
Err((valid_len, and_then)) => {
209216
if valid_len > 0 {
210-
let subtendril = t.subtendril(0, valid_len as u32);
217+
let subtendril = bytes.subtendril(0, valid_len as u32);
211218
unsafe {
212219
self.inner_sink
213220
.process(subtendril.reinterpret_without_validating())
@@ -221,8 +228,8 @@ where
221228
Err(offset) => {
222229
self.inner_sink.error("invalid byte sequence".into());
223230
self.inner_sink
224-
.process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
225-
t.pop_front(offset as u32);
231+
.process(Tendril::from_slice(REPLACEMENT_CHARACTER));
232+
bytes.pop_front(offset as u32);
226233
},
227234
}
228235
},
@@ -243,7 +250,7 @@ where
243250
self.inner_sink
244251
.error("incomplete byte sequence at end of stream".into());
245252
self.inner_sink
246-
.process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
253+
.process(Tendril::from_slice(REPLACEMENT_CHARACTER));
247254
}
248255
self.inner_sink.finish()
249256
}
@@ -380,7 +387,7 @@ where
380387

381388
#[cfg(feature = "encoding_rs")]
382389
fn decode_to_sink<Sink, A>(
383-
mut t: Tendril<fmt::Bytes, A>,
390+
mut input: Tendril<fmt::Bytes, A>,
384391
decoder: &mut encoding_rs::Decoder,
385392
sink: &mut Sink,
386393
last: bool,
@@ -391,13 +398,13 @@ fn decode_to_sink<Sink, A>(
391398
loop {
392399
let mut out = <Tendril<fmt::Bytes, A>>::new();
393400
let max_len = decoder
394-
.max_utf8_buffer_length_without_replacement(t.len())
401+
.max_utf8_buffer_length_without_replacement(input.len())
395402
.unwrap_or(8192);
396403
unsafe {
397-
out.push_uninitialized(std::cmp::min(max_len as u32, 8192));
404+
out.push_uninitialized(max_len.min(8192) as u32);
398405
}
399406
let (result, bytes_read, bytes_written) =
400-
decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
407+
decoder.decode_to_utf8_without_replacement(&input, &mut out, last);
401408
if bytes_written > 0 {
402409
sink.process(unsafe {
403410
out.subtendril(0, bytes_written as u32)
@@ -409,11 +416,11 @@ fn decode_to_sink<Sink, A>(
409416
DecoderResult::OutputFull => {},
410417
DecoderResult::Malformed(_, _) => {
411418
sink.error(Cow::Borrowed("invalid sequence"));
412-
sink.process("\u{FFFD}".into());
419+
sink.process(Tendril::from_slice(REPLACEMENT_CHARACTER));
413420
},
414421
}
415-
t.pop_front(bytes_read as u32);
416-
if t.is_empty() {
422+
input.pop_front(bytes_read as u32);
423+
if input.is_empty() {
417424
return;
418425
}
419426
}

tendril/src/utf8.rs

Lines changed: 0 additions & 141 deletions
This file was deleted.

0 commit comments

Comments
 (0)