tendril/
fmt.rs

// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Marker types for formats.
//!
//! This module defines the types and traits used to mark a `Tendril`
//! with the format of data it contains. It includes those formats
//! for which `Tendril` supports at least some operations without
//! conversion.
//!
//! To convert a string tendril to/from a byte tendril in an arbitrary
//! character encoding, see the `encode` and `decode` methods on
//! `Tendril`.
//!
//! `Tendril` operations may become memory-unsafe if data invalid for
//! the format sneaks in. For that reason, these traits require
//! `unsafe impl`.

use std::default::Default;
use std::{char, mem, str};

use futf::{self, Codepoint, Meaning};

/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// Amount to remove from the end of the left-hand buffer
        /// (presumably a byte count; confirm against the consumer).
        pub drop_left: u32,
        /// Amount to remove from the start of the right-hand buffer
        /// (same unit as `drop_left`).
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` that are to be inserted.
        pub insert_len: u32,
        /// Storage for the bytes inserted at the splice point.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// The "do nothing" fixup: drop nothing and insert nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over `(byte index, char)` pairs of a buffer in which every
    /// byte stands for the code point with the same numeric value.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // A `u8` always converts to a valid `char` (U+0000..=U+00FF),
            // so the plain cast is sufficient and no `unsafe` is required.
            self.inner.next().map(|(i, &b)| (i, b as char))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // Exactly one item per remaining byte; lets `collect` preallocate.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}

/// Trait for format marker types.
///
/// The type implementing this trait is usually not instantiated.
/// It's used with a phantom type parameter of `Tendril`.
pub unsafe trait Format {
    /// Check whether the buffer is valid for this format.
    fn validate(buf: &[u8]) -> bool;

    /// Check whether the buffer is valid for this format.
    ///
    /// You may assume the buffer is a prefix of a valid buffer.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        <Self as Format>::validate(buf)
    }

    /// Check whether the buffer is valid for this format.
    ///
    /// You may assume the buffer is a suffix of a valid buffer.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        <Self as Format>::validate(buf)
    }

    /// Check whether the buffer is valid for this format.
    ///
    /// You may assume the buffer is a contiguous subsequence
    /// of a valid buffer, but not necessarily a prefix or
    /// a suffix.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        <Self as Format>::validate(buf)
    }

    /// Compute any fixup needed when concatenating buffers.
    ///
    /// The default is to do nothing.
    ///
    /// The function is `unsafe` because it may assume the input
    /// buffers are already valid for the format. Also, no
    /// bounds-checking is performed on the return value!
    #[inline(always)]
    unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
        Default::default()
    }
}

/// Indicates that one format is a subset of another.
///
/// A buffer in the subset format can be converted to the superset
/// format for free.
pub unsafe trait SubsetOf<Super: Format>: Format {
    /// Validate the *other* direction of conversion: check whether a
    /// buffer taken from the superset format also conforms to the
    /// subset format.
    ///
    /// The default runs the subset format's full `validate`; an
    /// implementation may substitute a check that is cheaper than
    /// validating from scratch.
    fn revalidate_subset(x: &[u8]) -> bool {
        <Self as Format>::validate(x)
    }
}

/// Indicates a format which corresponds to a Rust slice type,
/// representing exactly the same invariants.
pub unsafe trait SliceFormat: Sized + Format {
    /// The (possibly unsized) slice type sharing this format's invariants.
    type Slice: Slice + ?Sized;
}

/// Indicates a format which contains characters from Unicode
/// (all of it, or some proper subset).
pub unsafe trait CharFormat<'a>: Format {
    /// Iterator yielding each character together with its byte index.
    type Iter: Iterator<Item = (usize, char)>;

    /// Iterate over the characters of the string and their byte
    /// indices.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`.
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;

    /// Encode the character as bytes and pass them to a continuation.
    ///
    /// Returns `Err(())` iff the character cannot be represented.
    fn encode_char<F: FnOnce(&[u8])>(ch: char, cont: F) -> Result<(), ()>;
}

/// Indicates a Rust slice type that is represented in memory as bytes.
///
/// Paired with a format through `SliceFormat::Slice`, which requires
/// the slice type to represent exactly the format's invariants.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`, so the caller has to guarantee that it is.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a mutable byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`, so the caller has to guarantee that it is.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}

/// Marker type for uninterpreted bytes.
///
/// Every buffer is acceptable: validation never fails for this format.
#[derive(Clone, Copy, Debug, Default)]
pub struct Bytes;

unsafe impl Format for Bytes {
    /// Uninterpreted bytes carry no invariant, so any buffer passes.
    #[inline(always)]
    fn validate(_buf: &[u8]) -> bool {
        true
    }
}

// `Bytes` validation never fails, so its natural slice view is the
// plain byte slice.
unsafe impl SliceFormat for Bytes {
    type Slice = [u8];
}

// A byte slice is its own byte representation, so every conversion
// below simply hands the argument back.
unsafe impl Slice for [u8] {
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        self
    }

    // No precondition is actually relied upon here.
    #[inline(always)]
    unsafe fn from_bytes(x: &[u8]) -> &Self {
        x
    }

    // No precondition is actually relied upon here.
    #[inline(always)]
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self {
        x
    }
}

/// Marker type for ASCII text (every byte is at most 127).
#[derive(Clone, Copy, Debug, Default)]
pub struct ASCII;

unsafe impl Format for ASCII {
    /// A buffer is ASCII iff every byte is at most 127.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        // The standard library check is equivalent to testing each byte
        // against 127.
        buf.is_ascii()
    }

    // Any contiguous piece of an ASCII buffer consists solely of ASCII
    // bytes, so the three "part of a valid buffer" checks need no work.

    #[inline(always)]
    fn validate_prefix(_: &[u8]) -> bool {
        true
    }

    #[inline(always)]
    fn validate_suffix(_: &[u8]) -> bool {
        true
    }

    #[inline(always)]
    fn validate_subseq(_: &[u8]) -> bool {
        true
    }
}

// ASCII text is also well-formed UTF-8, so it converts to `UTF8` for free.
// Converting back uses the default `revalidate_subset` (`ASCII::validate`).
unsafe impl SubsetOf<UTF8> for ASCII {}
// `Latin1` accepts every byte sequence, so ASCII converts to it for free too.
unsafe impl SubsetOf<Latin1> for ASCII {}

unsafe impl<'a> CharFormat<'a> for ASCII {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Each byte is one character whose code point equals the byte value.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Emit `ch` as a single byte, or fail without calling `cont` when
    /// the character lies above U+007F.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        let code = ch as u32;
        if code <= 0x7F {
            cont(&[code as u8]);
            Ok(())
        } else {
            Err(())
        }
    }
}

/// Marker type for UTF-8 text; corresponds to Rust's `str`.
#[derive(Clone, Copy, Debug, Default)]
pub struct UTF8;

unsafe impl Format for UTF8 {
    /// Full validation, delegated to the standard library.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        str::from_utf8(buf).is_ok()
    }

    /// For a prefix of valid UTF-8 only the final character can be
    /// damaged, so just the character covering the last byte is checked.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        // An empty buffer has no last byte and is trivially fine.
        let last = match buf.len().checked_sub(1) {
            Some(idx) => idx,
            None => return true,
        };
        if let Some(Codepoint {
            meaning: Meaning::Whole(_),
            ..
        }) = futf::classify(buf, last)
        {
            return true;
        }
        false
    }

    /// For a suffix of valid UTF-8 only the first character can be
    /// damaged, so just the character covering the first byte is checked.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        if buf.is_empty() {
            return true;
        }
        if let Some(Codepoint {
            meaning: Meaning::Whole(_),
            ..
        }) = futf::classify(buf, 0)
        {
            return true;
        }
        false
    }

    /// A subsequence may be cut at either end, so both ends are checked.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        Self::validate_prefix(buf) && Self::validate_suffix(buf)
    }
}

// Declares UTF-8 a subset of WTF-8, so a `UTF8` buffer may be treated as
// `WTF8` for free. Going back uses the default `revalidate_subset`,
// i.e. a full `UTF8::validate`.
unsafe impl SubsetOf<WTF8> for UTF8 {}

// `UTF8` buffers are exposed as `str`, the slice type declared to carry
// the same invariants as this format.
unsafe impl SliceFormat for UTF8 {
    type Slice = str;
}

unsafe impl Slice for str {
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        str::as_bytes(self)
    }

    #[inline(always)]
    unsafe fn from_bytes(x: &[u8]) -> &str {
        str::from_utf8_unchecked(x)
    }

    #[inline(always)]
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
        mem::transmute(x)
    }
}

unsafe impl<'a> CharFormat<'a> for UTF8 {
    type Iter = str::CharIndices<'a>;

    /// Reinterprets the (already validated) buffer as `str` without
    /// re-checking it and reuses the standard iterator.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter {
        let text: &'a str = str::from_utf8_unchecked(buf);
        text.char_indices()
    }

    /// Every `char` is representable in UTF-8, so this never fails.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        // Four bytes is the longest UTF-8 encoding of a single `char`.
        let mut scratch = [0_u8; 4];
        let encoded: &str = ch.encode_utf8(&mut scratch);
        cont(encoded.as_bytes());
        Ok(())
    }
}

/// Marker type for WTF-8 text.
///
/// The format is described by the
/// [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
#[derive(Clone, Copy, Debug, Default)]
pub struct WTF8;

/// Whether a classified sequence may appear as a unit in WTF-8:
/// a whole code point or a lone lead/trail surrogate. Every other
/// classification is rejected.
#[inline]
fn wtf8_meaningful(m: Meaning) -> bool {
    match m {
        Meaning::Whole(_) => true,
        Meaning::LeadSurrogate(_) => true,
        Meaning::TrailSurrogate(_) => true,
        _ => false,
    }
}

unsafe impl Format for WTF8 {
    /// Walk the buffer one encoded unit at a time. Every unit must be a
    /// whole code point or a lone surrogate, and a trail surrogate must
    /// not directly follow a lead surrogate.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        let mut pos = 0;
        let mut after_lead = false;
        while pos < buf.len() {
            let cp = unwrap_or_return!(futf::classify(buf, pos), false);
            after_lead = match cp.meaning {
                Meaning::Whole(_) => false,
                Meaning::LeadSurrogate(_) => true,
                // A lead directly followed by a trail is rejected.
                Meaning::TrailSurrogate(_) if after_lead => return false,
                Meaning::TrailSurrogate(_) => false,
                // Anything else is not a meaningful WTF-8 unit.
                _ => return false,
            };
            pos += cp.bytes.len();
        }

        true
    }

    /// Only the unit covering the last byte of a prefix is checked.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        match buf.len().checked_sub(1) {
            // Nothing to check in an empty buffer.
            None => true,
            Some(last) => {
                futf::classify(buf, last).map_or(false, |c| wtf8_meaningful(c.meaning))
            }
        }
    }

    /// Only the unit covering the first byte of a suffix is checked.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        if buf.is_empty() {
            return true;
        }
        futf::classify(buf, 0).map_or(false, |c| wtf8_meaningful(c.meaning))
    }

    /// A subsequence may be cut at either end, so both ends are checked.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        Self::validate_prefix(buf) && Self::validate_suffix(buf)
    }

    /// When `lhs` ends in a lead surrogate and `rhs` starts with a trail
    /// surrogate, replace the pair by the single code point they encode;
    /// otherwise request no change.
    #[inline]
    unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
        // Buffers shorter than three bytes are never inspected.
        if lhs.len() < 3 || rhs.len() < 3 {
            return Default::default();
        }

        let ends = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0));
        let (hi, lo) = match ends {
            (
                Some(Codepoint {
                    meaning: Meaning::LeadSurrogate(hi),
                    ..
                }),
                Some(Codepoint {
                    meaning: Meaning::TrailSurrogate(lo),
                    ..
                }),
            ) => (hi, lo),
            _ => return Default::default(),
        };

        // Combine the two surrogate payloads into one scalar value.
        let scalar = 0x10000 + ((hi as u32) << 10) + (lo as u32);
        let ch = char::from_u32(scalar).expect("WTF8: internal error");

        let mut insert_bytes = [0_u8; 4];
        let insert_len = ch.encode_utf8(&mut insert_bytes).len() as u32;

        // Drop three bytes on each side of the splice and insert the
        // encoded character in their place.
        imp::Fixup {
            drop_left: 3,
            drop_right: 3,
            insert_len: insert_len,
            insert_bytes: insert_bytes,
        }
    }
}

/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
///
/// This is IANA's "ISO-8859-1", that is, ISO's "ISO 8859-1" plus the
/// C0 and C1 control characters from ECMA-48 / ISO 6429.
///
/// It must not be confused with WHATWG's "latin1" or "iso8859-1" labels
/// (or their many aliases), which actually stand for Windows-1252.
#[derive(Clone, Copy, Debug, Default)]
pub struct Latin1;

// Every byte value is a valid character in this encoding, so all four
// checks succeed unconditionally.
unsafe impl Format for Latin1 {
    #[inline(always)]
    fn validate(_buf: &[u8]) -> bool {
        true
    }

    #[inline(always)]
    fn validate_prefix(_buf: &[u8]) -> bool {
        true
    }

    #[inline(always)]
    fn validate_suffix(_buf: &[u8]) -> bool {
        true
    }

    #[inline(always)]
    fn validate_subseq(_buf: &[u8]) -> bool {
        true
    }
}

unsafe impl<'a> CharFormat<'a> for Latin1 {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Each byte is one character whose code point equals the byte value.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Emit `ch` as a single byte, or fail without calling `cont` when
    /// the character lies above U+00FF.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        let code = ch as u32;
        if code <= 0xFF {
            cont(&[code as u8]);
            Ok(())
        } else {
            Err(())
        }
    }
}
tendril/fmt.rs

tendril/
fmt.rs