sanitize_filename_reader_friendly/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
//! Converts strings into a file system friendly and human readable form.
//!
//! The algorithm replaces or deletes characters from the input stream using
//! various filters that are applied in the following sequential order:
//!
//! 1. Replace all whitespace with space.
//! 2. Filter all control characters.
//! 3. `REPLACE_ORIG_WITH_UNDERSCORE`
//! 4. `REPLACE_ORIG_WITH_SPACE`
//! 5. `FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE`
//! 6. `FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE`
//! 7. `FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE`
//! 8. `FILTER_ORIG_NON_PRINTING_CHARS`
//! 9. `TRIM_LINE_CHARS`
//! 10. `INSERT_LINE_SEPARATOR`
//! 11. `TRIM_END_LINES`
//!
//! For details see the definition and documentation of the above (private) constants.
//!
//! # Rationale
//!
//! Exclude NTFS critical characters:       `<>:"\/|?*` \
//! <https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx>
//!
//! These are considered unsafe in URLs:    `` <>#%{}|\^~[]` `` \
//! <https://perishablepress.com/stop-using-unsafe-characters-in-urls/>
//!
//! New in version 2.0.0:
//! Do **not** exclude the characters restricted in FAT32:    `+,;=[]`  \
//! <https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words>
//!
//! ```
//! use sanitize_filename_reader_friendly::sanitize;
//! let output = sanitize("Read: http://blog.getreu.net/projects/tp-note/");
//! assert_eq!(output, "Read_ http_blog.getreu.net_projects_tp-note");
//! ```
//! The output string's length is guaranteed to be shorter than or equal to the
//! input string's length.

use const_format::concatcp;

/// Start value for the algorithm. We pretend the last processed character was
/// just a regular letter to which no `LAST_PROCESSED_WAS` rule applies.
const LAST_PROCESSED_START_CHAR: char = 'A';

/// Characters that are replaced with an underscore.
///
/// The per-character pipeline in `sanitize()` starts with these steps:
///
/// * Replace any whitespace by a space.
/// * Filter any control character.
/// * Replace any of the quoted characters with underscore.
const REPLACE_ORIG_WITH_UNDERSCORE: &str = r#":\/|?~"#;

/// * Replace any of the quoted characters with space.
///
/// Only consulted for characters that are not already matched by
/// `REPLACE_ORIG_WITH_UNDERSCORE`.
const REPLACE_ORIG_WITH_SPACE: &str = "<>\"*#%{}^`";

/// * Filter the resulting character if it is in the quoted set and the last
///   processed (i.e. last kept) character was a space. This collapses runs of
///   spaces into a single space.
const FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE: &str = " ";

/// * Filter the resulting character if it is in the quoted set and the last
///   processed (i.e. last kept) character was an underscore. This collapses
///   runs of underscores into a single underscore.
const FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE: &str = "_";

/// * Filter if the current character is in the quoted set and the last
///   processed character was a whitespace. Ignore all former replacements of
///   the current character, i.e. the test looks at the character as it was
///   before the underscore/space replacement.
const FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE: &str = "_.\\/,;";

/// * Filter if the current character is in the set of the quoted non-printing
///   characters. Ignore all former replacements of the current character.
///
/// The set holds the zero width space (U+200B) and the bidirectional
/// formatting characters U+202A..=U+202E and U+2066..=U+2069.
///
/// End of character loop.
const FILTER_ORIG_NON_PRINTING_CHARS: &str = "\u{200B}\u{202A}\u{202B}\u{202C}\
\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";

/// Group characters into lines (separated by newlines) and trim both sides of
/// all lines by the set of the quoted characters. In addition to the listed
/// characters, whitespace is trimmed too. As the filter operates line by line,
/// no processed line begins or ends with one of the listed characters. Note
/// that the separator inserted between lines afterwards
/// (`INSERT_LINE_SEPARATOR`) is itself a member of this set.
pub const TRIM_LINE_CHARS: &str = "_-.,;";

/// The character that takes the place of the newline when the processed lines
/// are concatenated.
const INSERT_LINE_SEPARATOR: char = '-';

/// Character trimmed from the end of the concatenated output. It equals
/// `INSERT_LINE_SEPARATOR`, so the separator appended after the last line is
/// removed again.
const TRIM_END_LINES: char = INSERT_LINE_SEPARATOR;

// Not referenced by `sanitize()` itself; exported for users of the crate.
#[allow(dead_code)]
/// A set of characters that is always replaced or filtered and will never
/// appear in the output stream: the concatenation of
/// `REPLACE_ORIG_WITH_UNDERSCORE` and `REPLACE_ORIG_WITH_SPACE`.
///
/// In addition to the characters listed here, all `is_whitespace()` characters
/// are always replaced by space and all `is_control()` characters are always
/// filtered.
pub const ALWAYS_REPLACED_OR_FILTERED_CHARS: &str =
    concatcp!(REPLACE_ORIG_WITH_UNDERSCORE, REPLACE_ORIG_WITH_SPACE);

// Not referenced by `sanitize()` itself; exported for users of the crate.
#[allow(dead_code)]
/// An unordered list of all characters that are potentially replaced or
/// filtered under certain conditions. It is the plain concatenation of all
/// filter sets, so a character may be listed more than once.
///
/// In addition to the characters listed here, all `is_whitespace()` characters
/// are always replaced by space and all `is_control()` characters are always
/// filtered.
pub const POTENTIALLY_REPLACED_CHARS: &str = concatcp!(
    REPLACE_ORIG_WITH_UNDERSCORE,
    REPLACE_ORIG_WITH_SPACE,
    FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE,
    FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE,
    FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE,
    FILTER_ORIG_NON_PRINTING_CHARS,
    TRIM_LINE_CHARS,
    TRIM_END_LINES
);

/// Converts a string into a file system friendly and human readable form.
///
/// The input is split into lines (see [`str::lines`]). Within each line every
/// character runs through the following steps, in this order:
///
/// 1. Whitespace is replaced by a space, then control characters are dropped.
/// 2. Characters in `REPLACE_ORIG_WITH_UNDERSCORE` become `_` and characters
///    in `REPLACE_ORIG_WITH_SPACE` become a space.
/// 3. The resulting character is dropped when it would repeat a space or an
///    underscore, when the character before step 2 is listed in
///    `FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE` and follows
///    whitespace, or when it is listed in `FILTER_ORIG_NON_PRINTING_CHARS`.
///
/// Each line is then trimmed of whitespace and `TRIM_LINE_CHARS` on both
/// sides. Lines that are empty after trimming are skipped, so that the output
/// never starts with a separator and never contains two separators in a row.
/// The remaining lines are joined with `INSERT_LINE_SEPARATOR`.
///
/// Returns the sanitized string; it is empty when nothing of the input
/// survives the filters.
///
/// # Examples
///
/// ```
/// use sanitize_filename_reader_friendly::sanitize;
/// assert_eq!(sanitize("abc\n\nefg"), "abc-efg");
/// assert_eq!(sanitize("\nabc"), "abc");
/// ```
pub fn sanitize(s: &str) -> String {
    // The filter state is shared by all lines: it always holds the character
    // that was kept most recently.
    let mut last_processed_chr = LAST_PROCESSED_START_CHAR;

    // Reserve room for the input plus the separator that is temporarily
    // appended after the last line.
    let mut output = String::with_capacity(s.len() + INSERT_LINE_SEPARATOR.len_utf8());

    // Proceed line by line.
    for line in s.lines() {
        let processed: String = line
            .chars()
            // Replace tabs and all other whitespace with space. This must
            // happen before the control character filter, because a tab is
            // both whitespace and a control character.
            .map(|c| if c.is_whitespace() { ' ' } else { c })
            // Delete control characters.
            .filter(|c| !c.is_control())
            // Keep the character as it was before replacement alongside the
            // replaced one: some filters below test the former.
            .map(|c_orig| {
                let c = if REPLACE_ORIG_WITH_UNDERSCORE.contains(c_orig) {
                    '_'
                } else if REPLACE_ORIG_WITH_SPACE.contains(c_orig) {
                    ' '
                } else {
                    c_orig
                };
                (c_orig, c)
            })
            .filter(|&(c_orig, c)| {
                let discard = (FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE.contains(c)
                    && last_processed_chr == ' ')
                    || (FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE.contains(c)
                        && last_processed_chr == '_')
                    || (FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE.contains(c_orig)
                        && last_processed_chr.is_whitespace())
                    || FILTER_ORIG_NON_PRINTING_CHARS.contains(c_orig);
                // Only characters that are kept update the filter state.
                if !discard {
                    last_processed_chr = c;
                }
                !discard
            })
            .map(|(_, c)| c)
            .collect();

        // Trim whitespace and `TRIM_LINE_CHARS` at the beginning and the end
        // of the line.
        let trimmed =
            processed.trim_matches(|c: char| c.is_whitespace() || TRIM_LINE_CHARS.contains(c));

        // An empty line contributes nothing. Emitting a separator for it would
        // produce a leading or a doubled separator.
        if !trimmed.is_empty() {
            output.push_str(trimmed);
            // The line separator takes the place of the newline.
            output.push(INSERT_LINE_SEPARATOR);
        }
    }

    // Remove the separator appended after the last line.
    let kept_len = output.trim_end_matches(TRIM_END_LINES).len();
    output.truncate(kept_len);
    output
}
// TODO
// Should these be handled?
// RegexBuilder::new(r#"(?i)^(con|prn|aux|nul|com[0-9]|lpt[0-9])(\..*)?$"#)

#[cfg(test)]
mod tests {
    use super::sanitize;

    /// Spot checks for the individual filter rules.
    #[test]
    fn test_sanitize() {
        // Tabs are replaced by a space and trimmed at both ends.
        assert_eq!(sanitize("\tabc\tefg\t"), "abc efg");
        // Control characters are filtered.
        assert_eq!(sanitize("abc\u{0019}efg"), "abcefg");
        // Special characters replaced with `_`; the run collapses to one `_`.
        assert_eq!(sanitize("abc:\\/|?~=efg"), "abc_=efg");
        // Special characters replaced with space; `[]+` are kept.
        assert_eq!(sanitize("abc<>\"*<>#%{}^[]+[]`efg"), "abc []+[] efg");
        // Trim before and after newline.
        assert_eq!(sanitize("-_ \tabc \t >_-\n   efg \t_-"), "abc-efg");
        // Replace Unix newline.
        assert_eq!(sanitize("abc\nefg"), "abc-efg");
        // Replace Windows newline.
        assert_eq!(sanitize("abc\r\nefg"), "abc-efg");
        // Collapse double '_' or ' '.
        assert_eq!(sanitize("abc_ __  efg __hij"), "abc_ efg hij");
        // Hyperlink.
        assert_eq!(
            sanitize("https://blog.getreu.net/projects/"),
            "https_blog.getreu.net_projects"
        );
    }

    // Table of `(input, expected output)` pairs.
    //
    // The file stem examples are taken from:
    // https://github.com/parshap/node-sanitize-filename/blob/master/test.js
    // (the extension is usually added after sanitizing the file stem.)
    //
    // The expected output is optimized for reading and for keeping as much
    // information as possible. Compare with the expectations in the link above.
    //
    // Keeping input and expectation side by side makes it impossible for the
    // two lists to get out of step.
    const CASES: &[(&str, &str)] = &[
        (
            "the quick brown fox jumped over the lazy dog",
            "the quick brown fox jumped over the lazy dog",
        ),
        ("résumé", "résumé"),
        ("hello\u{0000}world", "helloworld"),
        ("hello\nworld", "hello-world"),
        (";-_hello.,\n,.world_-;", "hello-world"),
        ("semi;colon", "semi;colon"),
        (";leading-semi", "leading-semi"),
        ("com,ma", "com,ma"),
        ("equals=", "equals="),
        ("slash\\", "slash"),
        ("slash/", "slash"),
        ("col:on", "col_on"),
        ("star*", "star"),
        ("question?", "question"),
        ("quote\"", "quote"),
        ("singlequote'", "singlequote'"),
        ("brack<e>ts", "brack e ts"),
        ("p|pes", "p_pes"),
        ("plus+", "plus+"),
        ("'five and six<seven'", "'five and six seven'"),
        (" space at front", "space at front"),
        ("space at end ", "space at end"),
        (".period", "period"),
        ("period.", "period"),
        ("relative/path/to/some/dir", "relative_path_to_some_dir"),
        ("/abs/path/to/some/dir", "abs_path_to_some_dir"),
        ("~/.\u{0000}notssh/authorized_keys", "notssh_authorized_keys"),
        ("", ""),
        ("h?w", "h_w"),
        ("h/w", "h_w"),
        ("h*w", "h w"),
        (".", ""),
        ("..", ""),
        ("./", ""),
        ("../", ""),
        ("/..", ""),
        ("/../", ""),
        ("*.|.", ""),
        ("./", ""),
        ("./foobar", "foobar"),
        ("../foobar", "foobar"),
        ("../../foobar", "foobar"),
        ("./././foobar", "foobar"),
        ("|*.what", "what"),
        ("LPT9.asdf", "LPT9.asdf"),
        ("author| title", "author_ title"),
        ("author | title", "author _ title"),
        ("author: title", "author_ title"),
        ("auteur : titre", "auteur _ titre"),
        ("author, title", "author, title"),
        ("no , enumeration", "no enumeration"),
        ("Any questions? Or not?", "Any questions_ Or not"),
        ("Des questions ? Ou pas ?", "Des questions _ Ou pas"),
        ("Hello!", "Hello!"),
        ("filename(1).ext", "filename(1).ext"),
        ("1,23", "1,23"),
        ("1.23", "1.23"),
        ("foo\u{200b}bar", "foobar"),
    ];

    /// Runs every table entry through `sanitize()`; a failure names the input.
    #[test]
    fn test_string_list() {
        for &(input, expected) in CASES {
            assert_eq!(sanitize(input), expected, "input: {:?}", input);
        }
    }
}