use std::{
cell::RefCell,
collections::BTreeSet,
fmt, io,
iter::FusedIterator,
rc::{Rc, Weak},
};
use as_variant::as_variant;
use html5ever::{
local_name, namespace_url, ns, parse_fragment,
serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
tendril::{StrTendril, TendrilSink},
tree_builder::{NodeOrText, TreeSink},
Attribute, LocalName, ParseOpts, QualName,
};
use tracing::debug;
#[cfg(feature = "matrix")]
pub mod matrix;
use crate::SanitizerConfig;
/// A parsed HTML tree.
#[derive(Debug)]
pub struct Html {
/// Handle to the `NodeData::Document` node that the tree hangs from.
document: NodeRef,
}
impl Html {
    /// Parses `string` as an HTML fragment, using a `div` element as the
    /// fragment context, and returns the resulting tree.
    pub fn parse(string: &str) -> Self {
        let context = QualName::new(None, ns!(html), local_name!("div"));
        let parser = parse_fragment(Self::default(), ParseOpts::default(), context, Vec::new());

        // Feed the whole input in one go and finish the parse.
        parser.one(string)
    }

    /// Sanitizes this tree with the
    /// `SanitizerConfig::compat().remove_reply_fallback()` configuration.
    pub fn sanitize(&self) {
        self.sanitize_with(&SanitizerConfig::compat().remove_reply_fallback());
    }

    /// Sanitizes this tree by running `config.clean` on it.
    pub fn sanitize_with(&self, config: &SanitizerConfig) {
        config.clean(self);
    }

    /// Returns the first child of the document node.
    ///
    /// # Panics
    ///
    /// Panics if the document node has no children.
    fn root(&self) -> NodeRef {
        self.document.first_child().expect("html should always have a root node")
    }

    /// Whether the root node has at least one child.
    pub fn has_children(&self) -> bool {
        self.root().has_children()
    }

    /// The first child of the root node, if any.
    pub fn first_child(&self) -> Option<NodeRef> {
        self.root().first_child()
    }

    /// The last child of the root node, if any.
    pub fn last_child(&self) -> Option<NodeRef> {
        self.root().last_child()
    }

    /// Iterates over the children of the root node, in order.
    pub fn children(&self) -> Children {
        self.root().children()
    }
}
impl Default for Html {
    /// Creates a tree holding only an empty document node.
    fn default() -> Self {
        let document = NodeRef::new(NodeData::Document);
        Self { document }
    }
}
impl TreeSink for Html {
    type Handle = NodeRef;
    type Output = Self;
    type ElemName<'a> = html5ever::ExpandedName<'a>;

    /// Returns the sink itself as the result of parsing.
    fn finish(self) -> Self::Output {
        self
    }

    /// Logs the parse error at debug level and otherwise ignores it.
    fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
        debug!("HTML parse error: {msg}");
    }

    /// Returns a handle to the document node.
    fn get_document(&self) -> Self::Handle {
        self.document.clone()
    }

    /// Returns the expanded name of `target`.
    ///
    /// # Panics
    ///
    /// Panics if `target` is not an element.
    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
        target.as_element().expect("not an element").name.expanded()
    }

    /// Creates a detached element node with the given name and attributes.
    fn create_element(
        &self,
        name: QualName,
        attrs: Vec<Attribute>,
        _flags: html5ever::tree_builder::ElementFlags,
    ) -> Self::Handle {
        NodeRef::new(NodeData::Element(ElementData {
            name,
            attrs: RefCell::new(attrs.into_iter().collect()),
        }))
    }

    /// Creates a `NodeData::Other` placeholder; the comment text is discarded.
    fn create_comment(&self, _text: StrTendril) -> Self::Handle {
        NodeRef::new(NodeData::Other)
    }

    /// Creates a `NodeData::Other` placeholder; the processing instruction's
    /// target and data are discarded.
    fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
        NodeRef::new(NodeData::Other)
    }

    /// Appends `child` as the last child of `parent`.
    ///
    /// Text is merged into the current last child when that child is already a
    /// text node, instead of creating a second adjacent text node.
    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
        match child {
            NodeOrText::AppendNode(node) => parent.append_child(node),
            NodeOrText::AppendText(text) => {
                if let Some(prev_text) =
                    parent.last_child().as_ref().and_then(|sibling| sibling.as_text())
                {
                    prev_text.borrow_mut().push_tendril(&text);
                } else {
                    let node = NodeRef::new(NodeData::Text(text.into()));
                    parent.append_child(node);
                }
            }
        }
    }

    /// Inserts `child` before `element` when `element` has a parent; otherwise
    /// appends it to `prev_element`.
    fn append_based_on_parent_node(
        &self,
        element: &Self::Handle,
        prev_element: &Self::Handle,
        child: NodeOrText<Self::Handle>,
    ) {
        if element.0.parent.borrow().is_some() {
            self.append_before_sibling(element, child);
        } else {
            self.append(prev_element, child);
        }
    }

    /// Does nothing: the doctype is not stored in the tree.
    fn append_doctype_to_document(
        &self,
        _name: StrTendril,
        _public_id: StrTendril,
        _system_id: StrTendril,
    ) {
    }

    /// Returns `target` itself; no separate template contents node is kept.
    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
        target.clone()
    }

    /// Whether both handles point to the same node (pointer identity).
    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
        Rc::ptr_eq(&x.0, &y.0)
    }

    /// Does nothing: the quirks mode is not recorded.
    fn set_quirks_mode(&self, _mode: html5ever::tree_builder::QuirksMode) {}

    /// Inserts `new_node` immediately before `sibling`.
    ///
    /// Text is merged into the node preceding `sibling` when that node is
    /// already a text node.
    ///
    /// # Panics
    ///
    /// Panics if a node has to be inserted and `sibling` has no parent.
    fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
        match new_node {
            NodeOrText::AppendNode(node) => node.insert_before_sibling(sibling),
            NodeOrText::AppendText(text) => {
                if let Some(prev_text) =
                    sibling.prev_sibling().as_ref().and_then(|prev_sibling| prev_sibling.as_text())
                {
                    prev_text.borrow_mut().push_tendril(&text);
                } else {
                    let node = NodeRef::new(NodeData::Text(text.into()));
                    node.insert_before_sibling(sibling);
                }
            }
        }
    }

    /// Adds each attribute of `attrs` to `target` unless the element already
    /// has an attribute with the same name.
    ///
    /// # Panics
    ///
    /// Panics if `target` is not an element.
    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
        let element = target.as_element().expect("target should be an element");
        let mut existing = element.attrs.borrow_mut();

        for attr in attrs {
            // The set compares whole attributes (name and value), so a plain
            // insert would keep a second attribute that shares its name with an
            // existing one but has a different value. Compare by name instead.
            let already_present = existing.iter().any(|present| present.name == attr.name);
            if !already_present {
                existing.insert(attr);
            }
        }
    }

    /// Detaches `target` from its parent, if it has one.
    fn remove_from_parent(&self, target: &Self::Handle) {
        target.detach();
    }

    /// Moves every child of `node` to the end of `new_parent`'s children,
    /// keeping their relative order.
    fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
        for child in node.0.children.take() {
            // The child list was already emptied above, so clear the stale
            // parent link before `append_child` tries to detach the child.
            child.0.parent.take();
            new_parent.append_child(child);
        }
    }
}
impl Serialize for Html {
    /// Serializes every child of the root node for
    /// `TraversalScope::IncludeNode`; writes nothing for
    /// `TraversalScope::ChildrenOnly`.
    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
    where
        S: Serializer,
    {
        match traversal_scope {
            TraversalScope::ChildrenOnly(_) => Ok(()),
            TraversalScope::IncludeNode => {
                // Stops at the first child that fails to serialize.
                self.children().try_for_each(|child| child.serialize(&mut *serializer))
            }
        }
    }
}
impl fmt::Display for Html {
    /// Writes the serialized HTML of the tree to the formatter.
    ///
    /// # Panics
    ///
    /// Panics if serialization fails or if the serialized bytes are not valid
    /// UTF-8.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let opts =
            SerializeOpts { traversal_scope: TraversalScope::IncludeNode, ..Default::default() };

        let mut buffer = Vec::new();
        serialize(&mut buffer, self, opts).unwrap();

        let rendered = String::from_utf8(buffer).unwrap();
        f.write_str(&rendered)
    }
}
/// A node of the tree, shared through `NodeRef` handles.
#[derive(Debug)]
#[non_exhaustive]
struct Node {
/// Weak link to the parent node, or `None` when the node is detached.
parent: RefCell<Option<Weak<Node>>>,
/// The child nodes, in document order.
children: RefCell<Vec<NodeRef>>,
/// The payload of this node.
data: NodeData,
}
impl Node {
fn new(data: NodeData) -> Self {
Self { parent: Default::default(), children: Default::default(), data }
}
fn as_element(&self) -> Option<&ElementData> {
as_variant!(&self.data, NodeData::Element)
}
fn as_text(&self) -> Option<&RefCell<StrTendril>> {
as_variant!(&self.data, NodeData::Text)
}
fn is_root(&self) -> bool {
matches!(&self.data, NodeData::Element(element_data) if element_data.name.local.as_bytes() == b"html")
}
fn parent(&self) -> Option<NodeRef> {
self.parent.borrow().as_ref()?.upgrade().map(NodeRef)
}
}
/// The payload of a node.
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_enums)]
pub enum NodeData {
/// The document node that the tree hangs from.
Document,
/// A text node; the content sits in a `RefCell` so it can be modified in place.
Text(RefCell<StrTendril>),
/// An element, with its name and attributes.
Element(ElementData),
/// Any other kind of node; it carries no data.
Other,
}
/// The name and attributes of an element node.
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_structs)]
pub struct ElementData {
/// The qualified name of the element.
pub name: QualName,
/// The attributes of the element, modifiable in place through the `RefCell`.
pub attrs: RefCell<BTreeSet<Attribute>>,
}
impl ElementData {
    /// Builds a `matrix::MatrixElementData` from this element's name and its
    /// current attributes.
    #[cfg(feature = "matrix")]
    pub fn to_matrix(&self) -> matrix::MatrixElementData {
        let attrs = self.attrs.borrow();
        matrix::MatrixElementData::parse(&self.name, &attrs)
    }
}
/// A reference-counted handle to a node of the tree.
///
/// Cloning the handle yields another handle to the same node.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct NodeRef(Rc<Node>);
impl NodeRef {
    /// Wraps `data` in a fresh node that has no parent and no children.
    fn new(data: NodeData) -> Self {
        Self(Rc::new(Node::new(data)))
    }

    /// Removes this node from its parent's children and clears its parent
    /// link. Does nothing when the node has no live parent.
    pub(crate) fn detach(&self) {
        let Some((parent, index)) = self.parent_and_index() else {
            return;
        };

        parent.0.children.borrow_mut().remove(index);
        self.0.parent.take();
    }

    /// Moves `child` to the end of this node's children, detaching it from
    /// any previous parent first.
    fn append_child(&self, child: NodeRef) {
        child.detach();
        *child.0.parent.borrow_mut() = Some(Rc::downgrade(&self.0));
        self.0.children.borrow_mut().push(child);
    }

    /// The parent of this node together with this node's position in the
    /// parent's children, or `None` when there is no live parent.
    ///
    /// # Panics
    ///
    /// Panics if the parent does not list this node among its children.
    fn parent_and_index(&self) -> Option<(NodeRef, usize)> {
        let parent = self.0.parent()?;
        let position = parent
            .0
            .children
            .borrow()
            .iter()
            .position(|candidate| Rc::ptr_eq(&candidate.0, &self.0));
        let index = position.expect("child should be in parent's children");

        Some((parent, index))
    }

    /// Moves this node so that it sits immediately before `sibling`.
    ///
    /// # Panics
    ///
    /// Panics if `sibling` has no parent.
    pub(crate) fn insert_before_sibling(&self, sibling: &NodeRef) {
        // Detach first: if both nodes share a parent, removing this node can
        // shift the sibling's index.
        self.detach();

        let (new_parent, position) =
            sibling.parent_and_index().expect("sibling should have parent");
        *self.0.parent.borrow_mut() = Some(Rc::downgrade(&new_parent.0));
        new_parent.0.children.borrow_mut().insert(position, self.clone());
    }

    /// Replaces this element with a new element named `name` that has the
    /// same attributes and takes over the children, and returns the new
    /// element.
    ///
    /// # Panics
    ///
    /// Panics if this node is not an element or has no parent.
    pub(crate) fn replace_with_element_name(self, name: LocalName) -> NodeRef {
        let mut renamed = self.as_element().unwrap().clone();
        renamed.name.local = name;
        let replacement = NodeRef::new(NodeData::Element(renamed));

        // Each child is detached from `self` as it is appended to the new node.
        self.children().for_each(|child| replacement.append_child(child));

        replacement.insert_before_sibling(&self);
        self.detach();

        replacement
    }

    /// The payload of this node.
    pub fn data(&self) -> &NodeData {
        &self.0.data
    }

    /// The element data, if this node is an element.
    pub fn as_element(&self) -> Option<&ElementData> {
        self.0.as_element()
    }

    /// The text content, if this node is a text node.
    pub fn as_text(&self) -> Option<&RefCell<StrTendril>> {
        self.0.as_text()
    }

    /// The parent of this node, or `None` when there is no parent or the
    /// parent is the `html` root element.
    pub fn parent(&self) -> Option<NodeRef> {
        self.0.parent().filter(|parent| !parent.0.is_root())
    }

    /// The sibling right after this node, if any.
    pub fn next_sibling(&self) -> Option<NodeRef> {
        let (parent, index) = self.parent_and_index()?;
        let following = index.checked_add(1)?;

        // Named guard so the borrow is released before `parent` is dropped.
        let siblings = parent.0.children.borrow();
        siblings.get(following).cloned()
    }

    /// The sibling right before this node, if any.
    pub fn prev_sibling(&self) -> Option<NodeRef> {
        let (parent, index) = self.parent_and_index()?;
        let preceding = index.checked_sub(1)?;

        // Named guard so the borrow is released before `parent` is dropped.
        let siblings = parent.0.children.borrow();
        siblings.get(preceding).cloned()
    }

    /// Whether this node has at least one child.
    pub fn has_children(&self) -> bool {
        let children = self.0.children.borrow();
        !children.is_empty()
    }

    /// The first child of this node, if any.
    pub fn first_child(&self) -> Option<NodeRef> {
        let children = self.0.children.borrow();
        children.first().cloned()
    }

    /// The last child of this node, if any.
    pub fn last_child(&self) -> Option<NodeRef> {
        let children = self.0.children.borrow();
        children.last().cloned()
    }

    /// Iterates over the children of this node, in order.
    pub fn children(&self) -> Children {
        Children::new(self.first_child())
    }

    /// Serializes this node and its descendants.
    ///
    /// Elements are written as start tag, children, end tag. A document node
    /// only writes its children, a text node writes its text, and
    /// `NodeData::Other` writes nothing.
    pub(crate) fn serialize<S>(&self, serializer: &mut S) -> io::Result<()>
    where
        S: Serializer,
    {
        match self.data() {
            NodeData::Text(text) => serializer.write_text(&text.borrow()),
            NodeData::Document => {
                self.children().try_for_each(|child| child.serialize(&mut *serializer))
            }
            NodeData::Element(data) => {
                serializer.start_elem(
                    data.name.clone(),
                    data.attrs.borrow().iter().map(|attr| (&attr.name, &*attr.value)),
                )?;
                self.children().try_for_each(|child| child.serialize(&mut *serializer))?;
                serializer.end_elem(data.name.clone())
            }
            NodeData::Other => Ok(()),
        }
    }
}
/// An iterator over a node and the siblings that follow it.
#[derive(Debug, Clone)]
pub struct Children {
/// The node that the next call to `next()` yields, if any.
next: Option<NodeRef>,
}
impl Children {
fn new(start_node: Option<NodeRef>) -> Self {
Self { next: start_node }
}
}
impl Iterator for Children {
    type Item = NodeRef;

    /// Yields the pending node, after looking up the sibling that follows it
    /// so that it is ready for the next call.
    fn next(&mut self) -> Option<Self::Item> {
        self.next.take().map(|current| {
            // The following sibling is resolved before the node is handed out.
            self.next = current.next_sibling();
            current
        })
    }
}
// Once `next()` has returned `None`, the pending node stays `None`, so every
// later call returns `None` as well.
impl FusedIterator for Children {}
#[cfg(test)]
mod tests {
    use super::Html;

    /// Parsing and then serializing gives back the input for this sample
    /// markup and for the empty string.
    #[test]
    fn sanity() {
        let html = "\
            <h1>Title</h1>\
            <div>\
                <p>This is some <em>text</em></p>\
            </div>\
        ";
        let round_tripped = Html::parse(html).to_string();
        assert_eq!(round_tripped, html);

        let empty = Html::parse("");
        assert_eq!(empty.to_string(), "");
    }
}