mime2ext/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#![no_std]
//! A simple compact crate to look up a file extension for a mime type.
//!
//! This crate embeds part of the [`mime-db`](https://github.com/jshttp/mime-db)
//! database.
//! Its version number tracks that of `mime-db`.
//! `mime2ext` version 0.1.49 corresponds to `mime-db` version 1.49.0.

// This database contains around a thousand entries. At 16 bytes per string
// slice and two strings per entry (mimetype and extension) a naive approach
// would have quite a lot of overhead.
//
// All the strings are instead packed into a single string, without
// delimiters, and we work with offsets into the string.
//
// The total string length is below u16::MAX, and individual string lengths
// are below u8::MAX. Each extension is packed directly after its subtype
// (the part of the mimetype after the slash), so a pair of strings requires
// one u16 offset and two u8 lengths, for 4 bytes in total (instead of 32).
//
// The entries are sorted by key to support a lookup with `binary_search`.
// They are subdivided by type (the part of the mimetype before the slash)
// to avoid unnecessary repetition. There are only around 10 unique types.
//
// This is likely overengineered, but it was fun to design and seems solid.

// The MSRV is 1.6 (for no_std), so some code looks a little archaic.

// Every subtype and extension string, concatenated without delimiters.
// `Entry` values address slices of it by byte offset and length.
// See build.py (presumably the script that generates the `raw_data` file;
// it is not visible from here).
static RAW_DATA: &'static str = include_str!("raw_data");

#[derive(Copy, Clone, PartialEq, Debug)]
// One database record: addresses a subtype/extension pair inside `RAW_DATA`.
// Fields are (location, subtype_len, extension_len):
//   - location: byte offset in `RAW_DATA` at which the subtype starts
//   - subtype_len: length of the subtype in bytes
//   - extension_len: length in bytes of the extension, which starts
//     immediately after the subtype
struct Entry(u16, u8, u8);

impl Entry {
    // Returns bytes to skip expensive UTF-8 slicing.
    fn subtype(self) -> &'static [u8] {
        let loc = self.0 as usize;
        let len = self.1 as usize;
        &RAW_DATA.as_bytes()[loc..loc + len]
    }

    fn extension(self) -> &'static str {
        let loc = self.0 as usize + self.1 as usize;
        let len = self.2 as usize;
        &RAW_DATA[loc..loc + len]
    }
}

// All entries that share one type (the part of the mimetype before the
// slash). `find_entry` binary-searches a table by subtype bytes, so the
// entries must be ordered that way.
type Table = &'static [Entry];
// A list of (type name, table of entries for that type) pairs.
type Tables = &'static [(&'static str, Table)];

// The embedded database: one (type, table) pair per type.
// See build.py (presumably the script that generates the `lookup` file;
// it is not visible from here).
static LOOKUP: Tables = include!("lookup");

// Binary-search `table` for the entry whose subtype equals `subtype`
// byte-for-byte. Returns `None` when there is no such entry.
fn find_entry(table: Table, subtype: &str) -> Option<Entry> {
    let needle = subtype.as_bytes();
    table
        .binary_search_by(|candidate| candidate.subtype().cmp(needle))
        .ok()
        .map(|position| table[position])
}

// Linear scan over `LOOKUP` for the table belonging to `type_` (the part of
// the mimetype before the slash). Returns the first exact match, if any.
fn find_table(type_: &str) -> Option<Table> {
    for &(name, table) in LOOKUP.iter() {
        if name == type_ {
            return Some(table);
        }
    }
    None
}

// Split a mimetype into `(type, subtype)`.
//
// The type is everything before the first `/`. The subtype is everything
// after that slash, cut off at the first `;` (if any) so that parameters
// such as `; charset=UTF-8` are dropped. Nothing is trimmed or validated.
// Returns `None` when the input contains no `/` at all.
fn parse_mimetype(mimetype: &str) -> Option<(&str, &str)> {
    // At most two pieces: the text before the first slash and the rest.
    let mut halves = mimetype.splitn(2, '/');
    match (halves.next(), halves.next()) {
        (Some(type_), Some(rest)) => {
            // `split` always yields at least one item, so the fallback is
            // never actually taken; it just avoids an `unwrap`.
            let subtype = rest.split(';').next().unwrap_or(rest);
            Some((type_, subtype))
        }
        // No second piece means there was no slash.
        _ => None,
    }
}

/// Look up a file extension for the given mimetype.
///
/// The text before the first `/` is taken as the type; the text after it, up
/// to the first `;`, is taken as the subtype, so parameters such as
/// `; charset=UTF-8` are ignored. Both parts are compared exactly against the
/// embedded database, with no trimming or case folding.
///
/// Returns `None` if the input contains no `/` or if the type/subtype pair
/// is not in the database.
///
/// # Example
///
/// ```
/// use mime2ext::mime2ext;
///
/// assert_eq!(mime2ext("image/png"), Some("png"));
/// assert_eq!(mime2ext("application/octet-stream"), Some("bin"));
/// assert_eq!(mime2ext("text/html; charset=UTF-8"), Some("html"));
/// assert_eq!(mime2ext("notareal/mimetype"), None);
/// assert_eq!(mime2ext("invalid-mimetype"), None);
/// ```
pub fn mime2ext<S: AsRef<str>>(mimetype: S) -> Option<&'static str> {
    parse_mimetype(mimetype.as_ref())
        .and_then(|(type_, subtype)| {
            find_table(type_).and_then(|table| find_entry(table, subtype))
        })
        .map(Entry::extension)
}

#[cfg(test)]
mod tests {
    use super::*;

    // The crate is `#![no_std]`, so the tests name `core` and `std`
    // explicitly. They are used below for `from_utf8`, `String` and
    // `mem::size_of`.
    extern crate core;
    extern crate std;

    // Inputs for which `mime2ext` must return `None`: malformed strings,
    // unknown types or subtypes, and mimetypes that have no extension.
    static NOT_FOUND: &'static [&'static str] = &[
        "notareal/mimetype",
        "noslash",
        "application/",
        "application/jpeg",
        "application////",
        "application/octet-stream/",
        "/application/octet-stream",
        "application/aaaaaaa",
        "application/zzzzzzz",
        "aaaaaaaa/jpeg",
        "zzzzzzzz/jpeg",
        "",
        "/",
        "//",
        "/;",
        "a/;",
        "/a;",
        "a/a;",
        ";;",
        ";",
        "\0",
        "\u{00B5}",
        "\u{00B5}\u{00B5}/\u{00B5}\u{00B5}",
        "\u{00B5}\u{00B5}/\u{00B5}\u{00B5}",
        "a\u{00B5}\u{00B5}//\u{00B5}\u{00B5}",
        "application/clr", // Exists in db.json, but without extensions
        "x-conference/nonexistent",
        "text/html ;",                // Bad semicolon position
        "application/xcap-error+xml", // Removed v1.47.0
    ];

    /// Every input in `NOT_FOUND` yields `None` (and does not panic).
    #[test]
    fn not_found() {
        for mimetype in NOT_FOUND {
            assert_eq!(mime2ext(*mimetype), None);
        }
    }

    // (mimetype, expected extension) pairs. The version notes record the
    // mime-db release in which the entry was added or changed.
    static FOUND: &'static [(&'static str, &'static str)] = &[
        ("application/octet-stream", "bin"),
        ("image/png", "png"),
        ("application/davmount+xml", "davmount"),
        ("application/andrew-inset", "ez"),
        ("x-conference/x-cooltalk", "ice"),
        ("text/html; charset=UTF-8", "html"),
        ("text/xml;", "xml"),
        ("audio/amr", "amr"),                          // Added v1.46.0
        ("model/vnd.sap.vds", "vds"),                  // Added v1.47.0
        ("application/ecmascript", "ecma"),            // Changed v1.47.0, changed again v1.53.0
        ("application/vnd.mapbox-vector-tile", "mvt"), // Added v1.48.0
        ("model/step-xml+zip", "stpxz"),               // Added v1.49.0
        ("application/express", "exp"),                // Added v1.50.0
        ("text/vnd.familysearch.gedcom", "ged"),       // Added v1.51.0
        ("image/avci", "avci"),                        // Added v1.52.0
        ("image/jxl", "jxl"),                          // Added v1.53.0
        ("text/markdown", "md"),                       // Changed v1.53.0
    ];

    /// Every pair in `FOUND` resolves to the expected extension.
    #[test]
    fn found() {
        for &(mimetype, ext) in FOUND {
            assert_eq!(mime2ext(mimetype), Some(ext));
        }
    }

    /// Make sure every entry can be retrieved (doesn't panic) and that its
    /// contents are unsurprising.
    #[test]
    fn check_entries() {
        for &(type_, entries) in super::LOOKUP {
            assert!(!type_.is_empty());
            assert!(!type_.contains('/'));

            // Required for binary search
            // (sort a copy by subtype and check that nothing moved).
            let mut sorted = entries.to_vec();
            sorted.sort_by(|a, b| a.subtype().cmp(b.subtype()));
            let sorted: &[super::Entry] = &sorted;
            assert_eq!(entries, sorted);

            for entry in entries {
                // The subtype is stored as bytes; it must still be valid UTF-8.
                let subtype = core::str::from_utf8(entry.subtype()).unwrap();
                let ext = entry.extension();
                assert!(!subtype.is_empty());
                assert!(!subtype.contains('/'));
                assert!(!ext.is_empty());
                assert!(!ext.contains('.'));
                assert!(!ext.contains('/'));

                // Round trip: rebuilding "type/subtype" and looking it up
                // through the public API must give back this entry's extension.
                let mimetype = std::string::String::from(type_) + "/" + subtype;
                assert_eq!(mime2ext(&mimetype), Some(ext));
            }
        }
    }

    /// Guard the compact layout: an `Entry` must stay 4 bytes, and the packed
    /// string must stay short enough for its offsets to fit in the `u16`
    /// location field.
    #[test]
    fn check_sizes() {
        assert_eq!(std::mem::size_of::<super::Entry>(), 4);
        assert!(super::RAW_DATA.len() < std::u16::MAX as usize);
    }
}