From 1a62a39879a1def200dcb87b900265993e6c1c83 Mon Sep 17 00:00:00 2001 From: Martin Robinson Date: Fri, 22 Mar 2024 09:13:10 +0100 Subject: [PATCH 01/16] Rename `master` branch to `main` (#522) --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f2fa4305..ab96e2a9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,7 +2,7 @@ name: CI on: push: - branches: [master] + branches: [main] pull_request: merge_group: types: [checks_requested] From 3820b051ae2dc9afd187b6e01730500e8613b81e Mon Sep 17 00:00:00 2001 From: nickelc Date: Fri, 29 Mar 2024 11:54:45 +0100 Subject: [PATCH 02/16] Bump `actions/checkout` to v4 (#524) --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ab96e2a9..4fe29324 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,7 +15,7 @@ jobs: matrix: version: [stable, beta, nightly] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set toolchain run: | @@ -48,7 +48,7 @@ jobs: name: MSRV runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install stable toolchain run: | @@ -75,7 +75,7 @@ jobs: name: Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install stable toolchain run: | From 96674153b20606766b5465f66993eddaf424e3df Mon Sep 17 00:00:00 2001 From: Ralph Giles Date: Tue, 2 Apr 2024 12:25:42 -0700 Subject: [PATCH 03/16] Update README for 0.27 (#527) Suggest the latest release in the code example. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c78b18ed..4ed781d3 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Add html5ever as a dependency in your [`Cargo.toml`](https://crates.io/) file: ```toml [dependencies] -html5ever = "0.26" +html5ever = "0.27" ``` You should also take a look at [`examples/html2html.rs`], [`examples/print-rcdom.rs`], and the [API documentation][]. From 3d0eb15631589160090b95d34f8c745a99aedc2e Mon Sep 17 00:00:00 2001 From: nickelc Date: Tue, 2 Apr 2024 21:35:03 +0200 Subject: [PATCH 04/16] Remove unnecessary version check from the benchmark build step (#525) The check is from the time when the MSRV was included in the build matrix. --- .github/workflows/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4fe29324..56e412ad 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,6 @@ jobs: run: git submodule update --init - name: Cargo bench - if: matrix.version != '1.41.0' run: cargo bench --all env: RUSTFLAGS: --cfg bench From 6570663a2d81a5dddc07e7eac94533b12cc62fbc Mon Sep 17 00:00:00 2001 From: nickelc Date: Tue, 2 Apr 2024 21:38:07 +0200 Subject: [PATCH 05/16] docs: fix bare url (#523) Bare URLs are not automatically turned into clickable links. --- html5ever/src/tokenizer/interface.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs index b3b8a1cf..bb9f89dc 100644 --- a/html5ever/src/tokenizer/interface.rs +++ b/html5ever/src/tokenizer/interface.rs @@ -92,7 +92,7 @@ pub trait TokenSink { /// Used in the markup declaration open state. By default, this always /// returns false and thus all CDATA sections are tokenized as bogus /// comments. 
- /// https://html.spec.whatwg.org/multipage/#markup-declaration-open-state + /// fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { false } From 99c70110ab8db0c3c1b58503d066d41c59917239 Mon Sep 17 00:00:00 2001 From: Keith Yeung Date: Wed, 3 Apr 2024 03:39:54 +0800 Subject: [PATCH 06/16] Add loading and referrerpolicy to local names (#515) --- markup5ever/local_names.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/markup5ever/local_names.txt b/markup5ever/local_names.txt index 47c635c8..14469cdf 100644 --- a/markup5ever/local_names.txt +++ b/markup5ever/local_names.txt @@ -303,6 +303,7 @@ feTile feTurbulence fence fetch +fetchpriority fieldset figcaption figure @@ -465,6 +466,7 @@ list listener listing ln +loading local log logbase From a3338efc8b1aceffaf60b64fa20fa40a5bfbab0d Mon Sep 17 00:00:00 2001 From: Ralph Giles Date: Tue, 2 Apr 2024 14:21:13 -0700 Subject: [PATCH 07/16] Remove rustc-test dependency (#528) * xml5ever: remove unused rust-test dev-dependency This isn't actually used by any code in the crate. * rcdom: port tests/xml-tree-builder to a custom runner This doesn't distribute the individual tests across the build-in harness, hurting parallelism and process isolation. On the other hand it's a minimal change to port off the rustc-test dependency which hasn't been actively maintained and currently doesn't compile. * rcdom: Port remaining tests to a custom runner Move the custom test runner into its own `util::runner` module and use it instead of rustc_test for the other test files. This allows `cargo test` to complete under rust 1.77.1. * github actions: Remove rustc-test/capture feature check This is no longer available since the depdendency has been removed. 
--- .github/workflows/main.yml | 5 ----- rcdom/Cargo.toml | 1 - rcdom/tests/html-tokenizer.rs | 27 +++++++++++++++---------- rcdom/tests/html-tree-builder.rs | 34 +++++++++++++++++--------------- rcdom/tests/util/runner.rs | 32 ++++++++++++++++++++++++++++++ rcdom/tests/xml-tokenizer.rs | 28 ++++++++++++-------------- rcdom/tests/xml-tree-builder.rs | 26 ++++++++++++------------ xml5ever/Cargo.toml | 1 - 8 files changed, 93 insertions(+), 61 deletions(-) create mode 100644 rcdom/tests/util/runner.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 56e412ad..4d29eba2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,11 +30,6 @@ jobs: env: RUSTFLAGS: --cfg bench - - name: Test "rustc-test/capture" feature - if: matrix.version == 'nightly' - working-directory: rcdom - run: cargo test --features "rustc-test/capture" - - name: Cargo test if: matrix.version != 'nightly' run: cargo test --all diff --git a/rcdom/Cargo.toml b/rcdom/Cargo.toml index 2c42d7b7..d59b839b 100644 --- a/rcdom/Cargo.toml +++ b/rcdom/Cargo.toml @@ -22,7 +22,6 @@ xml5ever = { version = "0.18", path = "../xml5ever" } [dev-dependencies] serde_json = "1.0" -rustc-test = "0.3" [[test]] name = "html-tokenizer" diff --git a/rcdom/tests/html-tokenizer.rs b/rcdom/tests/html-tokenizer.rs index f67caf8f..061effcb 100644 --- a/rcdom/tests/html-tokenizer.rs +++ b/rcdom/tests/html-tokenizer.rs @@ -20,7 +20,6 @@ use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token}; use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag}; use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; use html5ever::{namespace_url, ns, Attribute, LocalName, QualName}; -use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn}; use serde_json::{Map, Value}; use std::ffi::OsStr; use std::fs::File; @@ -28,6 +27,12 @@ use std::io::Read; use std::path::Path; use std::{char, env, mem}; +use util::runner::Test; + +mod util { + pub mod 
runner; +} + #[derive(Debug)] struct TestError; @@ -334,10 +339,11 @@ fn mk_test( expect: Value, expect_errors: Vec, opts: TokenizerOpts, -) -> TestDescAndFn { - TestDescAndFn { - desc: TestDesc::new(DynTestName(desc)), - testfn: DynTestFn(Box::new(move || { +) -> Test { + Test { + name: desc, + skip: false, + test: Box::new(move || { // Split up the input at different points to test incremental tokenization. let insplits = splits(&input, 3); for input in insplits.into_iter() { @@ -354,11 +360,11 @@ fn mk_test( ); } } - })), + }), } } -fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { +fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { let obj = js.get_obj(); let mut input = js.find("input").get_str(); let mut expect = js.find("output").clone(); @@ -437,7 +443,7 @@ fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { } } -fn tests(src_dir: &Path) -> Vec { +fn tests(src_dir: &Path) -> Vec { let mut tests = vec![]; let mut add_test = |path: &Path, mut file: File| { @@ -474,6 +480,7 @@ fn tests(src_dir: &Path) -> Vec { } fn main() { - let args: Vec<_> = env::args().collect(); - rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR")))); + for test in tests(Path::new(env!("CARGO_MANIFEST_DIR"))) { + test.run(); + } } diff --git a/rcdom/tests/html-tree-builder.rs b/rcdom/tests/html-tree-builder.rs index d22207d3..1038e58e 100644 --- a/rcdom/tests/html-tree-builder.rs +++ b/rcdom/tests/html-tree-builder.rs @@ -8,7 +8,6 @@ // except according to those terms. 
extern crate markup5ever_rcdom as rcdom; -extern crate rustc_test as test; #[macro_use] extern crate html5ever; @@ -20,12 +19,16 @@ use std::ffi::OsStr; use std::io::BufRead; use std::path::Path; use std::{env, fs, io, iter, mem}; -use test::{DynTestName, TestDesc, TestDescAndFn, TestFn}; use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::{parse_document, parse_fragment, ParseOpts}; use html5ever::{LocalName, QualName}; use rcdom::{Handle, NodeData, RcDom}; +use util::runner::Test; + +mod util { + pub mod runner; +} fn parse_tests>(mut lines: It) -> Vec> { let mut tests = vec![]; @@ -159,7 +162,7 @@ fn serialize(buf: &mut String, indent: usize, handle: Handle) { } fn make_test( - tests: &mut Vec, + tests: &mut Vec, ignores: &HashSet, filename: &str, idx: usize, @@ -185,7 +188,7 @@ fn make_test_desc_with_scripting_flag( name: &str, fields: &HashMap, scripting_enabled: bool, -) -> TestDescAndFn { +) -> Test { let get_field = |key| { let field = fields.get(key).expect("missing field"); field.trim_end_matches('\n').to_string() @@ -197,24 +200,22 @@ fn make_test_desc_with_scripting_flag( let context = fields .get("document-fragment") .map(|field| context_name(field.trim_end_matches('\n'))); - let ignore = ignores.contains(name); + let skip = ignores.contains(name); let mut name = name.to_owned(); if scripting_enabled { name.push_str(" (scripting enabled)"); } else { name.push_str(" (scripting disabled)"); }; - let mut opts: ParseOpts = Default::default(); - opts.tree_builder.scripting_enabled = scripting_enabled; - TestDescAndFn { - desc: TestDesc { - ignore, - ..TestDesc::new(DynTestName(name)) - }, - testfn: TestFn::dyn_test_fn(move || { + Test { + name, + skip, + test: Box::new(move || { // Do this here because Tendril isn't Send. 
let data = StrTendril::from_slice(&data); + let mut opts: ParseOpts = Default::default(); + opts.tree_builder.scripting_enabled = scripting_enabled; let mut result = String::new(); match context { None => { @@ -258,7 +259,7 @@ fn context_name(context: &str) -> QualName { } } -fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { +fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { let mut tests = vec![]; foreach_html5lib_test( @@ -286,7 +287,6 @@ fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { } fn main() { - let args: Vec<_> = env::args().collect(); let src_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let mut ignores = HashSet::new(); { @@ -297,5 +297,7 @@ fn main() { } } - test::test_main(&args, tests(src_dir, &ignores)); + for test in tests(src_dir, &ignores) { + test.run(); + } } diff --git a/rcdom/tests/util/runner.rs b/rcdom/tests/util/runner.rs new file mode 100644 index 00000000..3f50a7a2 --- /dev/null +++ b/rcdom/tests/util/runner.rs @@ -0,0 +1,32 @@ +// Copyright 2024 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// Simple container for storing tests for later execution +pub struct Test { + pub name: String, + pub skip: bool, + pub test: Box, +} + +impl Test { + /// Invoke the stored test function + /// + /// A status message is printed if the wrapped closure completes + /// or is marked as skipped. The test should panic to report + /// failure. 
+ pub fn run(&self) { + print!("test {} ...", self.name); + if self.skip { + println!(" SKIPPED"); + } else { + (self.test)(); + println!(" ok"); + } + } +} diff --git a/rcdom/tests/xml-tokenizer.rs b/rcdom/tests/xml-tokenizer.rs index cbdf10c3..ddb5d2ec 100644 --- a/rcdom/tests/xml-tokenizer.rs +++ b/rcdom/tests/xml-tokenizer.rs @@ -14,8 +14,8 @@ use std::io::Read; use std::path::Path; use std::{env, mem}; -use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn}; use util::find_tests::foreach_xml5lib_test; +use util::runner::Test; use markup5ever::buffer_queue::BufferQueue; use xml5ever::tendril::{SliceExt, StrTendril}; @@ -28,6 +28,7 @@ use xml5ever::{namespace_url, ns, Attribute, LocalName, QualName}; mod util { pub mod find_tests; + pub mod runner; } // Return all ways of splitting the string into at most n @@ -279,15 +280,11 @@ fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec { sink.get_tokens() } -fn mk_xml_test( - desc: String, - input: String, - expect: Value, - opts: XmlTokenizerOpts, -) -> TestDescAndFn { - TestDescAndFn { - desc: TestDesc::new(DynTestName(desc)), - testfn: DynTestFn(Box::new(move || { +fn mk_xml_test(name: String, input: String, expect: Value, opts: XmlTokenizerOpts) -> Test { + Test { + name, + skip: false, + test: Box::new(move || { // Split up the input at different points to test incremental tokenization. 
let insplits = splits(&input, 3); for input in insplits.into_iter() { @@ -304,11 +301,11 @@ fn mk_xml_test( ); } } - })), + }), } } -fn mk_xml_tests(tests: &mut Vec, filename: &str, js: &Value) { +fn mk_xml_tests(tests: &mut Vec, filename: &str, js: &Value) { let input: &str = &js.find("input").get_str(); let expect = js.find("output"); let desc = format!("tok: {}: {}", filename, js.find("description").get_str()); @@ -346,7 +343,7 @@ fn mk_xml_tests(tests: &mut Vec, filename: &str, js: &Value) { } } -fn tests(src_dir: &Path) -> Vec { +fn tests(src_dir: &Path) -> Vec { let mut tests = vec![]; foreach_xml5lib_test( src_dir, @@ -373,6 +370,7 @@ fn tests(src_dir: &Path) -> Vec { } fn main() { - let args: Vec<_> = env::args().collect(); - rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR")))); + for test in tests(Path::new(env!("CARGO_MANIFEST_DIR"))) { + test.run(); + } } diff --git a/rcdom/tests/xml-tree-builder.rs b/rcdom/tests/xml-tree-builder.rs index 98365c75..a28040ee 100644 --- a/rcdom/tests/xml-tree-builder.rs +++ b/rcdom/tests/xml-tree-builder.rs @@ -9,18 +9,19 @@ use markup5ever::{namespace_url, ns}; use markup5ever_rcdom::*; -use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn}; use std::collections::{HashMap, HashSet}; use std::ffi::OsStr; use std::io::BufRead; use std::path::Path; use std::{env, fs, io, iter, mem}; use util::find_tests::foreach_xml5lib_test; +use util::runner::Test; use xml5ever::driver::parse_document; use xml5ever::tendril::TendrilSink; mod util { pub mod find_tests; + pub mod runner; } fn parse_tests>(mut lines: It) -> Vec> { @@ -158,7 +159,7 @@ fn serialize(buf: &mut String, indent: usize, handle: Handle) { static IGNORE_SUBSTRS: &[&str] = &[", + tests: &mut Vec, ignores: &HashSet, filename: &str, idx: usize, @@ -172,14 +173,12 @@ fn make_xml_test( let data = get_field("data"); let expected = get_field("document"); let name = format!("tb: {}-{}", filename, idx); - let ignore = ignores.contains(&name) || 
IGNORE_SUBSTRS.iter().any(|&ig| data.contains(ig)); + let skip = ignores.contains(&name) || IGNORE_SUBSTRS.iter().any(|&ig| data.contains(ig)); - tests.push(TestDescAndFn { - desc: TestDesc { - ignore, - ..TestDesc::new(DynTestName(name)) - }, - testfn: DynTestFn(Box::new(move || { + tests.push(Test { + name, + skip, + test: Box::new(move || { let mut result = String::new(); let dom = parse_document(RcDom::default(), Default::default()).one(data.clone()); @@ -196,11 +195,11 @@ fn make_xml_test( data, result, expected ); } - })), + }), }); } -fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { +fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { let mut tests = vec![]; foreach_xml5lib_test( @@ -228,7 +227,6 @@ fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { } fn main() { - let args: Vec<_> = env::args().collect(); let src_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let mut ignores = HashSet::new(); if let Ok(f) = fs::File::open(src_dir.join("data/test/ignore")) { @@ -238,5 +236,7 @@ fn main() { } } - rustc_test::test_main(&args, tests(src_dir, &ignores)); + for test in tests(src_dir, &ignores) { + test.run(); + } } diff --git a/xml5ever/Cargo.toml b/xml5ever/Cargo.toml index 41a27fc3..cdae3de4 100644 --- a/xml5ever/Cargo.toml +++ b/xml5ever/Cargo.toml @@ -22,7 +22,6 @@ markup5ever = {version = "0.12", path = "../markup5ever" } [dev-dependencies] criterion = "0.3" -rustc-test = "0.3" [[bench]] name = "xml5ever" From 9b94335eccb2d0fab755c8e9637d019d8c3d0279 Mon Sep 17 00:00:00 2001 From: Alex Touchet Date: Wed, 3 Apr 2024 01:04:06 -0700 Subject: [PATCH 08/16] Update URLs and some formatting (#529) --- README.md | 13 +++++++------ xml5ever/Cargo.toml | 12 +++++------- xml5ever/examples/README.md | 16 ++++++++-------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 4ed781d3..ce9bc19d 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,11 @@ html5ever is an HTML parser developed as part of the [Servo][] project. 
-It can parse and serialize HTML according to the [WHATWG](https://whatwg.org/) specs (aka "HTML5"). However, there are some differences in the actual behavior currently, most of which are documented [in the bug tracker][]. html5ever passes all tokenizer tests from [html5lib-tests][], with most tree builder tests outside of the unimplemented features. The goal is to pass all html5lib tests, while also providing all hooks needed by a production web browser, e.g. `document.write`. +It can parse and serialize HTML according to the [WHATWG](https://whatwg.org/) specs (aka "HTML5"). However, there are some differences in the actual behavior currently, most of which are documented [in the bug tracker][]. html5ever passes all tokenizer tests from [html5lib-tests][], with most tree builder tests outside of the unimplemented features. The goal is to pass all html5lib tests, while also providing all hooks needed by a production web browser, e.g. `document.write`. -Note that the HTML syntax is very similar to XML. For correct parsing of XHTML, use an XML parser (That said, many XHTML documents in the wild are serialized in an HTML-compatible form). +Note that the HTML syntax is very similar to XML. For correct parsing of XHTML, use an XML parser (that said, many XHTML documents in the wild are serialized in an HTML-compatible form). -html5ever is written in [Rust][], therefore it avoids the notorious security problems that come along with using C. Being built with Rust also makes the library come with the high-grade performance you would expect from an HTML parser written in C. html5ever is basically a C HTML parser, but without needing a garbage collector or other heavy runtime processes. +html5ever is written in [Rust][], therefore it avoids the notorious security problems that come along with using C. Being built with Rust also makes the library come with the high-grade performance you would expect from an HTML parser written in C. 
html5ever is basically a C HTML parser, but without needing a garbage collector or other heavy runtime processes. ## Getting started in Rust @@ -25,6 +25,7 @@ html5ever = "0.27" You should also take a look at [`examples/html2html.rs`], [`examples/print-rcdom.rs`], and the [API documentation][]. + ## Getting started in other languages Bindings for Python and other languages are much desired. @@ -45,7 +46,7 @@ Run `cargo doc` in the repository root to build local documentation under `targe html5ever uses callbacks to manipulate the DOM, therefore it does not provide any DOM tree representation. -html5ever exclusively uses UTF-8 to represent strings. In the future it will support other document encodings (and UCS-2 `document.write`) by converting input. +html5ever exclusively uses UTF-8 to represent strings. In the future it will support other document encodings (and UCS-2 `document.write`) by converting input. The code is cross-referenced with the WHATWG syntax spec, and eventually we will have a way to present code and spec side-by-side. 
@@ -56,5 +57,5 @@ html5ever builds against the official stable releases of Rust, though some optim [Rust]: https://www.rust-lang.org/ [in the bug tracker]: https://github.com/servo/html5ever/issues?q=is%3Aopen+is%3Aissue+label%3Aweb-compat [html5lib-tests]: https://github.com/html5lib/html5lib-tests -[`examples/html2html.rs`]: https://github.com/servo/html5ever/blob/master/rcdom/examples/html2html.rs -[`examples/print-rcdom.rs`]: https://github.com/servo/html5ever/blob/master/rcdom/examples/print-rcdom.rs +[`examples/html2html.rs`]: https://github.com/servo/html5ever/blob/main/rcdom/examples/html2html.rs +[`examples/print-rcdom.rs`]: https://github.com/servo/html5ever/blob/main/rcdom/examples/print-rcdom.rs diff --git a/xml5ever/Cargo.toml b/xml5ever/Cargo.toml index cdae3de4..90f9a305 100644 --- a/xml5ever/Cargo.toml +++ b/xml5ever/Cargo.toml @@ -1,24 +1,22 @@ [package] - name = "xml5ever" version = "0.18.0" authors = ["The xml5ever project developers"] license = "MIT OR Apache-2.0" repository = "https://github.com/servo/html5ever" -description = "Push based streaming parser for xml" -documentation = "https://docs.rs/xml5ever/" - -homepage = "https://github.com/servo/html5ever/blob/master/xml5ever/README.md" +description = "Push based streaming parser for XML." 
+documentation = "https://docs.rs/xml5ever" +homepage = "https://github.com/servo/html5ever/blob/main/xml5ever/README.md" readme = "README.md" keywords = ["xml", "xml5", "parser", "parsing"] exclude = ["xml5lib-tests/*"] -categories = [ "parser-implementations", "web-programming" ] +categories = ["parser-implementations", "web-programming"] edition = "2021" [dependencies] log = "0.4" mac = "0.1" -markup5ever = {version = "0.12", path = "../markup5ever" } +markup5ever = { version = "0.12", path = "../markup5ever" } [dev-dependencies] criterion = "0.3" diff --git a/xml5ever/examples/README.md b/xml5ever/examples/README.md index 280bbe4e..b1763fc7 100644 --- a/xml5ever/examples/README.md +++ b/xml5ever/examples/README.md @@ -22,7 +22,7 @@ First let's define our dependencies: ``` With dependencies declared, we can now make a simple tokenizer sink. First step is to -define a [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html). [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html) are traits that received stream of [`Tokens`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/enum.Token.html). +define a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html). [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html) are traits that received stream of [`Tokens`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/enum.Token.html). In our case we'll define a unit struct (i.e. a struct without any fields). @@ -30,7 +30,7 @@ In our case we'll define a unit struct (i.e. a struct without any fields). struct SimpleTokenPrinter; ``` -To make `SimpleTokenPrinter` a [`TokenSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html), we need to implement [process_token](https://ygg01.github.io/docs/xml5ever/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method. 
+To make `SimpleTokenPrinter` a [`TokenSink`](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html), we need to implement [process_token](https://docs.rs/xml5ever/latest/xml5ever/tokenizer/trait.TokenSink.html#tymethod.process_token) method. ```rust impl TokenSink for SimpleTokenPrinter { @@ -64,7 +64,7 @@ To make `SimpleTokenPrinter` a [`TokenSink`](https://ygg01.github.io/docs/xml5ev ``` Now, we need some input to process. For input we'll use `stdin`. However, xml5ever `tokenize_to` method only takes `StrTendril`. So we need to construct a -[`ByteTendril`](https://doc.servo.org/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://doc.servo.org/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension. +[`ByteTendril`](https://docs.rs/tendril/latest/tendril/type.ByteTendril.html) using `ByteTendril::new()`, then read the `stdin` using [`read_to_tendril`](https://docs.rs/tendril/latest/tendril/trait.ReadExt.html#tymethod.read_to_tendril) extension. Once that is set, to make `SimpleTokenPrinter` parse the input, call, `tokenize_to` with it as the first parameter, input wrapped in Option for second parameter and XmlToke. @@ -96,7 +96,7 @@ Once that is set, to make `SimpleTokenPrinter` parse the input, call, NOTE: `unwrap` causes panic, it's only OK to use in simple examples. -For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/Ygg01/xml5ever/blob/master/examples/simple_xml_tokenizer.rs) +For full source code check out: [`examples/simple_xml_tokenizer.rs`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/simple_xml_tokenizer.rs) Once we have successfully compiled the example we run the example with inline xml @@ -105,7 +105,7 @@ xml cargo script simple_xml_tokenizer.rs <<< "Text with bold words!" 
``` -or by sending an [`examples/example.xml`](https://github.com/Ygg01/xml5ever/blob/master/examples/simple_xml_tokenizer.rs) located in same folder as examples. +or by sending an [`examples/example.xml`](https://github.com/servo/html5ever/blob/main/xml5ever/examples/example.xml) located in same folder as examples. ```bash cargo script simple_xml_tokenizer.rs < example.xml @@ -153,8 +153,8 @@ First part is similar to making SimpleTokenPrinter: let input = input.try_reinterpret().unwrap(); ``` -This time, we need an implementation of [`TreeSink`](https://ygg01.github.io/docs/xml5ever/xml5ever/tree_builder/interface/trait.TreeSink.html). xml5ever comes with a -built-in `TreeSink` implementation called [`RcDom`](https://ygg01.github.io/docs/xml5ever/xml5ever/rcdom/struct.RcDom.html). To process input into +This time, we need an implementation of [`TreeSink`](https://docs.rs/xml5ever/latest/xml5ever/tree_builder/trait.TreeSink.html). xml5ever comes with a +built-in `TreeSink` implementation called [`RcDom`](https://docs.rs/markup5ever_rcdom/latest/markup5ever_rcdom/struct.RcDom.html). To process input into a `TreeSink` we use the following line: ```rust @@ -220,4 +220,4 @@ kind of function that will help us traverse it. 
We shall call that function `wal } ``` -For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/Ygg01/xml5ever/blob/master/examples/xml_tree_printer.rs) +For full source code check out: [`examples/xml_tree_printer.rs`](https://github.com/servo/html5ever/blob/main/rcdom/examples/xml_tree_printer.rs) From 775ee9c364416d34c7af5e4131fdf7fcba782d52 Mon Sep 17 00:00:00 2001 From: Alex Touchet Date: Wed, 3 Apr 2024 10:17:18 -0700 Subject: [PATCH 09/16] Switch doc link to docs.rs (#530) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ce9bc19d..43d232ce 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ The code is cross-referenced with the WHATWG syntax spec, and eventually we will html5ever builds against the official stable releases of Rust, though some optimizations are only supported on nightly releases. -[API documentation]: https://doc.servo.org/html5ever/index.html +[API documentation]: https://docs.rs/html5ever [Servo]: https://github.com/servo/servo [Rust]: https://www.rust-lang.org/ [in the bug tracker]: https://github.com/servo/html5ever/issues?q=is%3Aopen+is%3Aissue+label%3Aweb-compat From 030bfeb26d4ab8f09082bb6cc33a6cb283afaf56 Mon Sep 17 00:00:00 2001 From: Ralph Giles Date: Wed, 3 Apr 2024 10:18:06 -0700 Subject: [PATCH 10/16] Update to criterion 0.5 (#531) Specify the current version of the benchmarking framework to address cargo audit warnings about the dependencies of earlier releases. Note that this requires rust 1.74.1 or later to run `cargo bench` but MSRV (1.60.0) is only gated against the library target itself and not dev-dependencies, so this is no change in the requirements on downstream users. 
--- html5ever/Cargo.toml | 2 +- xml5ever/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index 0a1c6077..5a8e257e 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -17,7 +17,7 @@ mac = "0.1" markup5ever = { version = "0.12", path = "../markup5ever" } [dev-dependencies] -criterion = "0.3" +criterion = "0.5" typed-arena = "2.0.2" [build-dependencies] diff --git a/xml5ever/Cargo.toml b/xml5ever/Cargo.toml index 90f9a305..e0309654 100644 --- a/xml5ever/Cargo.toml +++ b/xml5ever/Cargo.toml @@ -19,7 +19,7 @@ mac = "0.1" markup5ever = { version = "0.12", path = "../markup5ever" } [dev-dependencies] -criterion = "0.3" +criterion = "0.5" [[bench]] name = "xml5ever" From 5cc0951d2c22008d08db828244f8b6a7e54c4eaa Mon Sep 17 00:00:00 2001 From: Or Gany Date: Wed, 3 Apr 2024 20:18:23 +0300 Subject: [PATCH 11/16] Add comments for html5ever/examples (#506) * Adding comments for arena * Adding comments for noop-tokenize * Adding comments for noop-tree-builder * Adding comments for print-tree-actions * Add comments for tokenize * Removing redundant comments --------- Co-authored-by: Or Gany Co-authored-by: Martin Robinson --- html5ever/examples/arena.rs | 35 ++++++++++++++++++------ html5ever/examples/noop-tokenize.rs | 9 ++++-- html5ever/examples/noop-tree-builder.rs | 8 ++++++ html5ever/examples/print-tree-actions.rs | 3 ++ html5ever/examples/tokenize.rs | 6 ++++ 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/html5ever/examples/arena.rs b/html5ever/examples/arena.rs index d084e011..7a2b4980 100644 --- a/html5ever/examples/arena.rs +++ b/html5ever/examples/arena.rs @@ -19,36 +19,32 @@ use std::collections::HashSet; use std::io::{self, Read}; use std::ptr; -fn main() { - let mut bytes = Vec::new(); - io::stdin().read_to_end(&mut bytes).unwrap(); - let arena = typed_arena::Arena::new(); - html5ever_parse_slice_into_arena(&bytes, &arena); -} - +/// By using our Sink type, the 
arena is filled with parsed HTML. fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> { let sink = Sink { arena, document: arena.alloc(Node::new(NodeData::Document)), quirks_mode: QuirksMode::NoQuirks, }; + parse_document(sink, Default::default()) .from_utf8() .one(bytes) } type Arena<'arena> = &'arena typed_arena::Arena>; - type Ref<'arena> = &'arena Node<'arena>; - type Link<'arena> = Cell>>; +/// Sink struct is responsible for handling how the data that comes out of the HTML parsing +/// unit (TreeBuilder in our case) is handled. struct Sink<'arena> { arena: Arena<'arena>, document: Ref<'arena>, quirks_mode: QuirksMode, } +/// DOM node which contains links to other nodes in the tree. pub struct Node<'arena> { parent: Link<'arena>, next_sibling: Link<'arena>, @@ -58,6 +54,7 @@ pub struct Node<'arena> { data: NodeData<'arena>, } +/// HTML node data which can be an element, a comment, a string, a DOCTYPE, etc... pub enum NodeData<'arena> { Document, Doctype { @@ -178,6 +175,11 @@ impl<'arena> Sink<'arena> { } } +/// By implementing the TreeSink trait we determine how the data from the tree building step +/// is processed. In our case, our data is allocated in the arena and added to the Node data +/// structure. +/// +/// For deeper understating of each function go to the TreeSink declaration. impl<'arena> TreeSink for Sink<'arena> { type Handle = Ref<'arena>; type Output = Ref<'arena>; @@ -333,3 +335,18 @@ impl<'arena> TreeSink for Sink<'arena> { } } } + +/// In this example an "arena" is created and filled with the DOM nodes. +/// "Arena" is a type of allocation in which a block of memory is allocated +/// and later filled with data, DOM nodes in this case. When the arena is deallocated +/// it is destroyed with all of its items. 
+/// +/// Further info about arena: https://docs.rs/typed-arena/latest/typed_arena/ +fn main() { + // Read HTML from the standard input + let mut bytes = Vec::new(); + io::stdin().read_to_end(&mut bytes).unwrap(); + + let arena = typed_arena::Arena::new(); + html5ever_parse_slice_into_arena(&bytes, &arena); +} \ No newline at end of file diff --git a/html5ever/examples/noop-tokenize.rs b/html5ever/examples/noop-tokenize.rs index 68b1c8c9..f354a7b8 100644 --- a/html5ever/examples/noop-tokenize.rs +++ b/html5ever/examples/noop-tokenize.rs @@ -16,22 +16,27 @@ use std::io; use html5ever::tendril::*; use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; + +/// In our case, our sink only contains a tokens vector struct Sink(Vec); impl TokenSink for Sink { type Handle = (); + /// Each processed token will be handled by this method fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { - // Don't use the token, but make sure we don't get - // optimized out entirely. self.0.push(token); TokenSinkResult::Continue } } +/// In this example we implement the TokenSink trait which lets us implement how each +/// parsed token is treated. In our example we take each token and insert it into a vector. fn main() { + // Read HTML from standard input let mut chunk = ByteTendril::new(); io::stdin().read_to_tendril(&mut chunk).unwrap(); + let mut input = BufferQueue::default(); input.push_back(chunk.try_reinterpret().unwrap()); diff --git a/html5ever/examples/noop-tree-builder.rs b/html5ever/examples/noop-tree-builder.rs index 5e516df6..1baebf99 100644 --- a/html5ever/examples/noop-tree-builder.rs +++ b/html5ever/examples/noop-tree-builder.rs @@ -32,6 +32,10 @@ impl Sink { } } +/// By implementing the TreeSink trait we determine how the data from the tree building step +/// is processed. In this case the DOM elements are written into the "names" hashmap. 
+/// +/// For deeper understating of each function go to the TreeSink declaration. impl TreeSink for Sink { type Handle = usize; type Output = Self; @@ -98,11 +102,15 @@ impl TreeSink for Sink { fn mark_script_already_started(&mut self, _node: &usize) {} } +/// In this example we implement the TreeSink trait which takes each parsed elements and insert +/// it to a hashmap, while each element is given a numeric id. fn main() { let sink = Sink { next_id: 1, names: HashMap::new(), }; + + // Read HTML from the standard input and parse it let stdin = io::stdin(); parse_document(sink, Default::default()) .from_utf8() diff --git a/html5ever/examples/print-tree-actions.rs b/html5ever/examples/print-tree-actions.rs index b95368df..2fcf0ad8 100644 --- a/html5ever/examples/print-tree-actions.rs +++ b/html5ever/examples/print-tree-actions.rs @@ -158,6 +158,9 @@ impl TreeSink for Sink { } } +/// Same example as the "noop-tree-builder", but this time every function implemented in our +/// Sink object prints a log, so it's easier to get an understating of when each function is +/// called. fn main() { let sink = Sink { next_id: 1, diff --git a/html5ever/examples/tokenize.rs b/html5ever/examples/tokenize.rs index 8d4d9e7d..04ade72e 100644 --- a/html5ever/examples/tokenize.rs +++ b/html5ever/examples/tokenize.rs @@ -81,10 +81,15 @@ impl TokenSink for TokenPrinter { } } +/// In this example we implement the TokenSink trait in such a way that each token is printed. +/// If a there's an error while processing a token it is printed as well. 
fn main() { let mut sink = TokenPrinter { in_char_run: false }; + + // Read HTML from standard input let mut chunk = ByteTendril::new(); io::stdin().read_to_tendril(&mut chunk).unwrap(); + let mut input = BufferQueue::default(); input.push_back(chunk.try_reinterpret().unwrap()); @@ -96,6 +101,7 @@ fn main() { }, ); let _ = tok.feed(&mut input); + assert!(input.is_empty()); tok.end(); sink.is_char(false); From 38d11adcc7308decf8d468b652f1d0bb1b626f4c Mon Sep 17 00:00:00 2001 From: Alex Touchet Date: Wed, 3 Apr 2024 10:42:58 -0700 Subject: [PATCH 12/16] Update xml5ever doc link (#532) --- xml5ever/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xml5ever/README.md b/xml5ever/README.md index f40045b7..6c30c02d 100644 --- a/xml5ever/README.md +++ b/xml5ever/README.md @@ -4,7 +4,7 @@ [![Docs.rs](https://docs.rs/xml5ever/badge.svg)](https://docs.rs/xml5ever) [![](https://img.shields.io/crates/v/xml5ever.svg)](https://crates.io/crates/xml5ever) -[API documentation](https://Ygg01.github.io/docs/xml5ever/xml5ever/index.html) +[API documentation](https://docs.rs/xml5ever) **Warning:** This library is alpha quality, so no guarantees are given. @@ -75,7 +75,7 @@ To build examples and tests you need to do something along the lines of: ``` This will fetch tests from outside repository and it will invoke cargo to -build and test the crate. If you need docs checkout either [API docs](https://ygg01.github.io/docs/xml5ever/xml5ever/index.html) or run `cargo docs` +build and test the crate. If you need docs checkout either [API docs](https://docs.rs/xml5ever) or run `cargo docs` to generate documentation. ## Easy first tasks From fdbd3bfd1d54db3ef4fdaad0eae6a9011771fd6d Mon Sep 17 00:00:00 2001 From: Martin Robinson Date: Thu, 4 Apr 2024 18:01:13 +0200 Subject: [PATCH 13/16] ci: All jobs should pass for CI to pass (#533) This change makes it so that CI does not pass unless all jobs defined in the workflow file pass. 
This is necessary in order to prevent landing changes that break CI jobs. `success()` and `failure()` which were used before do not take into account dependent jobs. This also fixes the "lint" job which was failing, by formatting the code. --- .github/workflows/main.yml | 31 ++++++++++++++++------------- html5ever/examples/arena.rs | 2 +- html5ever/examples/noop-tokenize.rs | 3 +-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4d29eba2..37aaa8b2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -51,20 +51,6 @@ jobs: - run: cargo check --lib --all-features - build_result: - name: Result - runs-on: ubuntu-latest - needs: - - "ci" - - steps: - - name: Mark the job as successful - run: exit 0 - if: success() - - name: Mark the job as unsuccessful - run: exit 1 - if: ${{ !success() }} - lint: name: Lint runs-on: ubuntu-latest @@ -86,3 +72,20 @@ jobs: - name: Run clippy run: cargo clippy --all-features --all-targets -- -D warnings + + build_result: + name: Result + runs-on: ubuntu-latest + needs: + - ci + - lint + - msrv + + steps: + - name: Success + if: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }} + run: exit 0 + - name: Failure + if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + run: exit 1 + diff --git a/html5ever/examples/arena.rs b/html5ever/examples/arena.rs index 7a2b4980..acc05705 100644 --- a/html5ever/examples/arena.rs +++ b/html5ever/examples/arena.rs @@ -349,4 +349,4 @@ fn main() { let arena = typed_arena::Arena::new(); html5ever_parse_slice_into_arena(&bytes, &arena); -} \ No newline at end of file +} diff --git a/html5ever/examples/noop-tokenize.rs b/html5ever/examples/noop-tokenize.rs index f354a7b8..ade4571b 100644 --- a/html5ever/examples/noop-tokenize.rs +++ b/html5ever/examples/noop-tokenize.rs @@ -16,7 +16,6 @@ use std::io; use html5ever::tendril::*; use 
html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; - /// In our case, our sink only contains a tokens vector struct Sink(Vec); @@ -36,7 +35,7 @@ fn main() { // Read HTML from standard input let mut chunk = ByteTendril::new(); io::stdin().read_to_tendril(&mut chunk).unwrap(); - + let mut input = BufferQueue::default(); input.push_back(chunk.try_reinterpret().unwrap()); From 5ea5bd50a29259c1f74a7cf93de66724d6fe4b35 Mon Sep 17 00:00:00 2001 From: Alex Touchet Date: Thu, 4 Apr 2024 11:45:20 -0700 Subject: [PATCH 14/16] Update xml5ever Readme (#534) --- xml5ever/README.md | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/xml5ever/README.md b/xml5ever/README.md index 6c30c02d..ee9a13ad 100644 --- a/xml5ever/README.md +++ b/xml5ever/README.md @@ -23,7 +23,7 @@ templates. XML5 tries to handle most common errors, in a manner similar to HTML5 - You aren't interested in well-formed documents. - You need to get some info from your data even if it has errors (although not all possible errors are handled). - - You want to features like character references or xml namespaces. + - You want features like character references or XML namespaces. ## When you shouldn't use it @@ -34,17 +34,11 @@ templates. XML5 tries to handle most common errors, in a manner similar to HTML5 # Installation -Add xml5ever as a dependency in your project manifest. +Add xml5ever as a dependency in your project manifest: ```toml [dependencies] - xml5ever = "0.1.3" -``` - -And add crate declaration in your lib.rs - -```rust - extern crate xml5ever + xml5ever = "0.18" ``` # Getting started @@ -77,9 +71,3 @@ To build examples and tests you need to do something along the lines of: This will fetch tests from outside repository and it will invoke cargo to build and test the crate. If you need docs checkout either [API docs](https://docs.rs/xml5ever) or run `cargo docs` to generate documentation.
- -## Easy first tasks - -What I generally recommend is to look at Clippy Linting badge results and create -a PR for fixing the said lints. Other than that try to look for any tasks labeled -easy or just update docs/examples. From 1ae2de3a1796a9b52a804a02039c6c1499e2f461 Mon Sep 17 00:00:00 2001 From: Nolan Lawson Date: Tue, 16 Apr 2024 07:46:45 -0700 Subject: [PATCH 15/16] fix(local_names): add missing ARIA attributes (#536) --- markup5ever/local_names.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/markup5ever/local_names.txt b/markup5ever/local_names.txt index 14469cdf..2cdd07f3 100644 --- a/markup5ever/local_names.txt +++ b/markup5ever/local_names.txt @@ -60,31 +60,51 @@ arg aria-activedescendant aria-atomic aria-autocomplete +aria-braillelabel +aria-brailleroledescription aria-busy aria-channel aria-checked +aria-colcount +aria-colindex +aria-colindextext +aria-colspan aria-controls +aria-current aria-datatype aria-describedby +aria-description +aria-details aria-disabled aria-dropeffect +aria-errormessage aria-expanded aria-flowto aria-grab aria-haspopup aria-hidden aria-invalid +aria-keyshortcuts +aria-label aria-labelledby aria-level aria-live +aria-modal aria-multiline aria-multiselectable +aria-orientation aria-owns +aria-placeholder aria-posinset aria-pressed aria-readonly aria-relevant aria-required +aria-roledescription +aria-rowcount +aria-rowindex +aria-rowindextext +aria-rowspan aria-secret aria-selected aria-setsize @@ -93,6 +113,7 @@ aria-templateid aria-valuemax aria-valuemin aria-valuenow +aria-valuetext article ascent aside From 9089fc7ad88aa2021a9e1222769a80d0976736bc Mon Sep 17 00:00:00 2001 From: Martin Robinson Date: Tue, 23 Apr 2024 15:58:24 +0200 Subject: [PATCH 16/16] Release version 0.12.1 of `markup5ever` (#537) --- markup5ever/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml index 3bd9da53..cb43d31c 100644 --- 
a/markup5ever/Cargo.toml +++ b/markup5ever/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "markup5ever" -version = "0.12.0" +version = "0.12.1" authors = [ "The html5ever Project Developers" ] license = "MIT OR Apache-2.0" repository = "https://github.com/servo/html5ever"