string_cache/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// Copyright 2014 The Servo Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//!
//! A library for interning things that are `AsRef<str>`.
//!
//! Some strings may be interned at compile time using the `string-cache-codegen` crate, or the
//! `EmptyStaticAtomSet` may be used that has no compile-time interned strings. An `Atom` is an
//! interned string for a given set (either `EmptyStaticAtomSet` or a generated `StaticAtomSet`).
//!
//! Generated `Atom`s will have assocated macros to intern static strings at compile-time.
//!
//! # Examples
//!
//! Here are two examples, one with compile-time `Atom`s, and one without.
//!
//! ## With compile-time atoms
//!
//! In `Cargo.toml`:
//! ```toml
//! [dependencies]
//! string_cache = "0.8"
//!
//! [dev-dependencies]
//! string_cache_codegen = "0.5"
//! ```
//!
//! In `build.rs`:
//!
//! ```ignore
//! extern crate string_cache_codegen;
//!
//! use std::env;
//! use std::path::Path;
//!
//! fn main() {
//!     string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!")
//!         .atoms(&["foo", "bar"])
//!         .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs"))
//!         .unwrap()
//! }
//! ```
//!
//! In `lib.rs`:
//!
//! ```ignore
//! extern crate string_cache;
//!
//! mod foo {
//!     include!(concat!(env!("OUT_DIR"), "/foo_atom.rs"));
//! }
//!
//! fn use_the_atom(t: &str) {
//!     match *t {
//!         foo_atom!("foo") => println!("Found foo!"),
//!         foo_atom!("bar") => println!("Found bar!"),
//!         // foo_atom!("baz") => println!("Found baz!"), - would be a compile time error
//!         _ => {
//!             println!("String not interned");
//!             // We can intern strings at runtime as well
//!             foo::FooAtom::from(t)
//!         }
//!     }
//! }
//! ```
//!
//! ## No compile-time atoms
//!
//! ```
//! # extern crate string_cache;
//! use string_cache::DefaultAtom;
//!
//! # fn main() {
//! let mut interned_stuff = Vec::new();
//! let text = "here is a sentence of text that will be tokenised and
//!             interned and some repeated tokens is of text and";
//! for word in text.split_whitespace() {
//!     let seen_before = interned_stuff.iter()
//!         // We can use impl PartialEq<T> where T is anything string-like
//!         // to compare to interned strings to either other interned strings,
//!         // or actual strings  Comparing two interned strings is very fast
//!         // (normally a single cpu operation).
//!         .filter(|interned_word| interned_word == &word)
//!         .count();
//!     if seen_before > 0 {
//!         println!(r#"Seen the word "{}" {} times"#, word, seen_before);
//!     } else {
//!         println!(r#"Not seen the word "{}" before"#, word);
//!     }
//!     // We use the impl From<(Cow<'a, str>, or &'a str, or String)> for
//!     // Atom<Static> to intern a new string.
//!     interned_stuff.push(DefaultAtom::from(word));
//! }
//! # }
//! ```
//!

#![cfg_attr(test, deny(warnings))]

// Types, such as Atom, that impl Hash must follow the hash invariant: if two objects match
// with PartialEq, they must also have the same Hash. Clippy warns on types that derive one while
// manually impl-ing the other, because it seems easy for the two to drift apart, causing the
// invariant to be violated.
//
// But Atom is a newtype over NonZeroU64, and probably always will be, since cheap comparisons and
// copying are this library's purpose. So we know what the PartialEq comparison is going to do.
//
// The `get_hash` function, seen in `atom.rs`, consults that number, plus the global string interner
// tables. The only way for the resulting hash for two Atoms with the same inner 64-bit number to
// differ would be if the table entry changed between invocations, and that would be really bad.
#![allow(clippy::derive_hash_xor_eq)]

mod atom;
mod dynamic_set;
mod static_sets;
mod trivial_impls;

pub use atom::Atom;
pub use static_sets::{EmptyStaticAtomSet, PhfStrSet, StaticAtomSet};

/// Use this if you don’t care about static atoms.
pub type DefaultAtom = Atom<EmptyStaticAtomSet>;

// Some minor tests of internal layout here.
// See ../integration-tests for much more.

/// Guard against accidental changes to the sizes of things.
#[test]
fn assert_sizes() {
    use std::mem::size_of;
    assert_eq!(size_of::<DefaultAtom>(), 8);
    assert_eq!(size_of::<Option<DefaultAtom>>(), size_of::<DefaultAtom>(),);
}