// utf8/lossy.rs

use super::*;

/// A lossy UTF-8 decoder driven by pushing bytes into it.
/// Invalid input is reported as the U+FFFD replacement character.
///
/// Bytes are handed to the decoder chunk by chunk;
/// the decoder forwards the decoded text as `&str` slices to a user-supplied callback.
///
/// As an illustration, here is `String::from_utf8_lossy`
/// (returning `String` rather than `Cow`) expressed with this type:
///
/// ```rust
/// fn string_from_utf8_lossy(input: &[u8]) -> String {
///     let mut string = String::new();
///     utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input);
///     string
/// }
/// ```
///
/// **Note:** The end of the input is signalled by dropping the decoder.
/// If the final chunk stopped in the middle of the byte sequence for a code point,
/// that is treated as an error and one replacement character is emitted on drop.
/// Call `std::mem::forget` on the decoder to suppress that final emission.
pub struct LossyDecoder<F>
where
    F: FnMut(&str),
{
    // Callback that receives every decoded (or replacement) string slice.
    push_str: F,
    // Buffered bytes carried over from a chunk that ended mid-sequence;
    // `buffer_len == 0` is treated as "nothing pending" by `feed` and `drop`.
    incomplete: Incomplete,
}

impl<F: FnMut(&str)> LossyDecoder<F> {
    /// Build a decoder that forwards decoded text to `push_str`.
    ///
    /// The decoder starts out with no buffered bytes.
    #[inline]
    pub fn new(push_str: F) -> Self {
        let incomplete = Incomplete {
            buffer: [0; 4],
            buffer_len: 0,
        };
        LossyDecoder { push_str, incomplete }
    }

    /// Decode one chunk of bytes.
    ///
    /// Decoded text is passed to the callback as `&str` slices,
    /// and each invalid sequence is reported as the replacement character.
    ///
    /// A code point whose bytes straddle the boundary between this chunk
    /// and earlier ones is reassembled before being passed on.
    /// If this chunk ends in the middle of a sequence, the partial bytes are kept
    /// until the next call (or until the decoder is dropped).
    pub fn feed(&mut self, mut input: &[u8]) {
        // Bytes buffered by an earlier call have to be dealt with before
        // anything else in this chunk can be decoded.
        if self.incomplete.buffer_len > 0 {
            let (outcome, rest) = match self.incomplete.try_complete(input) {
                Some(resolved) => resolved,
                // No result yet (presumably the sequence is still incomplete):
                // nothing is emitted for this chunk.
                None => return,
            };
            match outcome {
                Ok(text) => (self.push_str)(text),
                Err(_) => (self.push_str)(REPLACEMENT_CHARACTER),
            }
            input = rest;
        }

        // Decode the rest of the chunk, resuming after every invalid sequence.
        loop {
            let failure = match decode(input) {
                Ok(text) => {
                    (self.push_str)(text);
                    return;
                }
                Err(failure) => failure,
            };
            match failure {
                DecodeError::Incomplete { valid_prefix, incomplete_suffix } => {
                    // The chunk ended mid-sequence: emit what decoded and
                    // keep the tail for the next `feed` call.
                    (self.push_str)(valid_prefix);
                    self.incomplete = incomplete_suffix;
                    return;
                }
                DecodeError::Invalid { valid_prefix, remaining_input, .. } => {
                    // Emit what decoded, substitute the bad sequence,
                    // then continue with the bytes that follow it.
                    (self.push_str)(valid_prefix);
                    (self.push_str)(REPLACEMENT_CHARACTER);
                    input = remaining_input;
                }
            }
        }
    }
}

impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
    /// Signals the end of the input: if bytes are still buffered from an
    /// unfinished sequence, one replacement character is sent to the callback.
    #[inline]
    fn drop(&mut self) {
        let has_pending_bytes = self.incomplete.buffer_len > 0;
        if has_pending_bytes {
            (self.push_str)(REPLACEMENT_CHARACTER);
        }
    }
}