// decancer/options.rs
use crate::{codepoints::Codepoint, Translation};
use paste::paste;
use std::cmp::Ordering;
/// A configuration struct where you can customize decancer's behavior.
///
/// By default, decancer cures as many characters as possible and turns all the output characters to lowercase.
///
/// If you don't plan on using this struct and only use decancer's defaults, it's recommended to disable the default `options` feature flag to optimize away unnecessary option checks.
///
/// ```rust
/// use decancer::Options;
///
/// // by default, all options are disabled
/// let _options = Options::default();
/// ```
// The wrapped `u32` is a bit mask; every option method on this type sets one bit of it.
// `Debug` is derived so the type can be used with `{:?}`, `assert_eq!` and friends.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Default, Hash)]
pub struct Options(pub(crate) u32);
// Generates one chainable `const fn` setter per `index: name,` entry.
//
// Must be invoked inside an `impl` block of a tuple struct whose first field is the
// integer bit mask (the expansion uses `Self(..)` and `self.0`). Any attributes or doc
// comments written before an entry are forwarded onto the generated method.
//
// With the `options` feature enabled, the generated method returns a copy of `self`
// with bit `index` set. Without the feature it returns `self` unchanged and is marked
// `#[cold]`.
macro_rules! options {
    ($(
        $(#[$extra_meta:meta])*
        $idx:literal: $name:ident,
    )*) => {
        $(
            $(#[$extra_meta])*
            // The setter consumes `self` by value, so dropping the result silently
            // discards the change; make the compiler warn about that.
            #[must_use = "this returns the updated configuration instead of modifying the original"]
            #[cfg_attr(not(feature = "options"), cold)]
            pub const fn $name(self) -> Self {
                #[cfg(feature = "options")]
                return Self(self.0 | (1 << $idx));

                #[cfg(not(feature = "options"))]
                return self;
            }
        )*
    };
}
// Declares the per-script `retain_*` options in bulk.
//
// Every `index: script,` entry is handed to `options!` as `index: retain_<script>,`
// (the identifier is glued together by `paste!`), together with a generated doc line
// that mentions the script name.
macro_rules! retain {
    ($($bit:literal: $script:ident,)*) => {
        paste! {
            options! {
                $(
                    #[doc = concat!("Prevents decancer from curing all ", stringify!($script), " characters.")]
                    $bit: [<retain_ $script>],
                )*
            }
        }
    };
}
impl Options {
    /// Creates a new configuration where every option is enabled.
    #[cfg_attr(not(feature = "options"), cold)]
    pub const fn all() -> Self {
        // 0x1ffffff sets bits 0 through 24, one for every option declared below.
        #[cfg(feature = "options")]
        return Self(0x1ffffff);

        #[cfg(not(feature = "options"))]
        return Self(0);
    }

    /// Creates a new configuration that prevents decancer from curing characters from major foreign writing systems, including diacritics.
    #[cfg_attr(not(feature = "options"), cold)]
    pub const fn pure_homoglyph() -> Self {
        // 0x3ffffc sets bits 2 through 21, i.e. `retain_diacritics` up to and
        // including `retain_emojis`.
        #[cfg(feature = "options")]
        return Self(0x3ffffc);

        #[cfg(not(feature = "options"))]
        return Self(0);
    }

    options! {
        /// Prevents decancer from changing all characters to lowercase. Therefore, if the input character is in uppercase, the output character will be in uppercase as well.
        ///
        /// **NOTE:** Many confusables are neither an uppercase nor a lowercase character. Therefore, decancer defaults to displaying the translation **in lowercase**:
        ///
        /// ```rust
        /// use decancer::{Translation, Options};
        /// use std::borrow::Cow;
        ///
        /// let options = Options::default()
        ///   .retain_capitalization();
        ///
        /// assert_eq!('🆐'.to_lowercase().collect::<String>(), '🆐'.to_uppercase().collect::<String>());
        /// assert_eq!(decancer::cure_char('🆐', options), Translation::String(Cow::Borrowed("dj")));
        /// ```
        0: retain_capitalization,

        /// Prevents decancer from applying the [Unicode Bidirectional Algorithm](https://en.wikipedia.org/wiki/Bidirectional_text). Use this **only** when you don't expect any right-to-left characters. Enabling this option has no effect if it's called on [`cure_char`][crate::cure_char()].
        ///
        /// **NOTE:** This speeds up the function call, but **can break [right-to-left characters](https://en.wikipedia.org/wiki/Bidirectional_text)**. It's highly recommended to also use [`retain_arabic`][Options::retain_arabic] and [`retain_hebrew`][Options::retain_hebrew].
        1: disable_bidi,

        /// Prevents decancer from curing characters *with* diacritics or accents.
        ///
        /// **NOTE:** Decancer can still cure standalone diacritic characters, which are used in [Zalgo texts](https://en.wikipedia.org/wiki/Zalgo_text).
        2: retain_diacritics,

        /// Prevents decancer from curing all katakana and hiragana characters.
        ///
        /// **NOTE:** To also prevent decancer from curing kanji characters, use [`retain_chinese`][Options::retain_chinese].
        18: retain_japanese,

        /// Prevents decancer from curing all emojis.
        21: retain_emojis,

        /// Prevents decancer from curing all Turkish characters.
        ///
        /// **NOTE:** To also prevent decancer from curing [the uppercase dotted i character](https://en.wikipedia.org/wiki/İ) (`İ`), use [`retain_capitalization`][Options::retain_capitalization].
        22: retain_turkish,

        /// Removes all non-ASCII characters from the result.
        23: ascii_only,

        /// Removes all non-alphanumeric characters from the result.
        24: alphanumeric_only,
    }

    retain! {
        3: greek,
        4: cyrillic,
        5: hebrew,
        6: arabic,
        7: devanagari,
        8: bengali,
        9: armenian,
        10: gujarati,
        11: tamil,
        12: thai,
        13: lao,
        14: burmese,
        15: khmer,
        16: mongolian,
        17: chinese,
        19: korean,
        20: braille,
    }

    /// Returns whether the option bit at `attribute_idx` is set.
    ///
    /// Indices that do not fit in the 32-bit mask are reported as unset.
    #[cfg(feature = "options")]
    pub(crate) const fn is(self, attribute_idx: u8) -> bool {
        // `refuse_cure` can pass indices up to 63. A plain `1 << idx` would panic in
        // debug builds (and test the wrong bit in release builds) once `idx >= 32`,
        // so let `checked_shl` reject those shift amounts.
        match 1u32.checked_shl(attribute_idx as u32) {
            Some(mask) => (self.0 & mask) != 0,
            None => false,
        }
    }

    /// Decides whether a codepoint carrying `attributes` must be left uncured under this configuration.
    ///
    /// As read here, bit 0 of `attributes` is checked against option 2 ([`retain_diacritics`][Options::retain_diacritics]), bit 1 against option 22 ([`retain_turkish`][Options::retain_turkish]), and the remaining upper bits form a number that, when greater than 2, is looked up directly as an option index.
    #[cfg(feature = "options")]
    pub(crate) const fn refuse_cure(self, attributes: u8) -> bool {
        let locale = attributes >> 2;

        ((attributes & 1) != 0 && self.is(2))
            || ((attributes & 2) != 0 && self.is(22))
            || (locale > 2 && self.is(locale))
    }

    /// Binary-searches a table of codepoints for `code` and returns its translation.
    ///
    /// The entry with index `i` is read through `Codepoint::at(offset + i * 6)`, and indices `0..=end` are searched. Returns `None` when the range is exhausted without an exact match, or as soon as `Codepoint::matches` returns `None` for a probed entry.
    pub(crate) fn translate(self, code: u32, offset: i32, mut end: i32) -> Option<Translation> {
        let mut start = 0;

        while start <= end {
            // Written as `start + (end - start) / 2` so the midpoint cannot overflow.
            let mid = start + (end - start) / 2;
            let codepoint = Codepoint::at(offset + (mid * 6));

            // `None` from `matches` aborts the whole lookup, hence the `?`.
            #[cfg(feature = "options")]
            let ordering = codepoint.matches(code, self)?;

            #[cfg(not(feature = "options"))]
            let ordering = codepoint.matches(code)?;

            match ordering {
                Ordering::Equal => return Some(codepoint.translation(code)),
                Ordering::Greater => start = mid + 1,
                Ordering::Less => end = mid - 1,
            }
        }

        None
    }
}
// Wraps a raw bit mask in `Options` as-is; the value is stored without any
// validation or masking. Only compiled when the `options` feature is enabled and
// hidden from the generated documentation.
#[doc(hidden)]
#[cfg(feature = "options")]
impl From<u32> for Options {
    #[inline(always)]
    fn from(bits: u32) -> Self {
        Options(bits)
    }
}