From b7755e119c37b77b04147ad623c9cbc7265b9167 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Thu, 14 Mar 2024 19:20:56 +0100 Subject: [PATCH] Remove tools from examples. Add documentation for those tools, and make it so that `gen_large_yaml` generates a predetermined set of files instead of outputting to its standard output. --- saphyr/.cargo/config.toml | 2 + saphyr/Cargo.toml | 10 +- saphyr/tools/README.md | 188 ++++++++++++++++++ saphyr/{examples => tools}/dump_events.rs | 0 saphyr/tools/gen_large_yaml/Cargo.toml | 20 ++ .../gen_large_yaml/src}/gen.rs | 0 .../gen_large_yaml/src}/main.rs | 66 ++++-- .../gen_large_yaml/src}/nested.rs | 21 +- saphyr/{examples => tools}/time_parse.rs | 0 9 files changed, 274 insertions(+), 33 deletions(-) create mode 100644 saphyr/.cargo/config.toml create mode 100644 saphyr/tools/README.md rename saphyr/{examples => tools}/dump_events.rs (100%) create mode 100644 saphyr/tools/gen_large_yaml/Cargo.toml rename saphyr/{examples/gen_large_yaml => tools/gen_large_yaml/src}/gen.rs (100%) rename saphyr/{examples/gen_large_yaml => tools/gen_large_yaml/src}/main.rs (74%) rename saphyr/{examples/gen_large_yaml => tools/gen_large_yaml/src}/nested.rs (79%) rename saphyr/{examples => tools}/time_parse.rs (100%) diff --git a/saphyr/.cargo/config.toml b/saphyr/.cargo/config.toml new file mode 100644 index 0000000..7bc65f1 --- /dev/null +++ b/saphyr/.cargo/config.toml @@ -0,0 +1,2 @@ +[alias] +gen_large_yaml = "run --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml --" diff --git a/saphyr/Cargo.toml b/saphyr/Cargo.toml index a24c258..15425df 100644 --- a/saphyr/Cargo.toml +++ b/saphyr/Cargo.toml @@ -19,8 +19,6 @@ linked-hash-map = "0.5.3" [dev-dependencies] libtest-mimic = "0.3.0" quickcheck = "0.9" -rand = "0.8.5" -lipsum = "0.9.0" [profile.release-lto] inherits = "release" @@ -29,3 +27,11 @@ lto = true [[test]] name = "yaml-test-suite" harness = false + +[[bin]] +name = 
"dump_events" +path = "tools/dump_events.rs" + +[[bin]] +name = "time_parse" +path = "tools/time_parse.rs" diff --git a/saphyr/tools/README.md b/saphyr/tools/README.md new file mode 100644 index 0000000..de71873 --- /dev/null +++ b/saphyr/tools/README.md @@ -0,0 +1,188 @@ +# `yaml-rust2` tools +This directory contains tools that are used to develop the crate. +Due to dependency management, only some of them are available as binaries from the `yaml-rust2` crate. + +| Tool | Invocation | +|------|------------| +| `dump_events` | `cargo run --bin dump_events -- [...]` | +| `gen_large_yaml` | `cargo gen_large_yaml` | +| `time_parse` | `cargo run --bin time_parse -- [...]` | + +## `dump_events` +This is a debugging helper for the parser. It outputs events emitted by the parser for a given file. This can be paired with the `YAMLRUST2_DEBUG` environment variable to have an in-depth overview of which steps the scanner and the parser are taking. + +### Example +Consider the following `input.yaml` YAML file: +```yaml +- foo: bar +- baz: + c: [3, 4, 5] +``` + +Running `cargo run --bin dump_events -- input.yaml` outputs: +``` + ↳ StreamStart + ↳ DocumentStart + ↳ SequenceStart(0, None) + ↳ MappingStart(0, None) + ↳ Scalar("foo", Plain, 0, None) + ↳ Scalar("bar", Plain, 0, None) + ↳ MappingEnd + ↳ MappingStart(0, None) + ↳ Scalar("baz", Plain, 0, None) + ↳ Scalar("~", Plain, 0, None) + ↳ Scalar("c", Plain, 0, None) + ↳ SequenceStart(0, None) + ↳ Scalar("3", Plain, 0, None) + ↳ Scalar("4", Plain, 0, None) + ↳ Scalar("5", Plain, 0, None) + ↳ SequenceEnd + ↳ MappingEnd + ↳ SequenceEnd + ↳ DocumentEnd + ↳ StreamEnd +``` + +Running `YAMLRUST2_DEBUG=1 cargo run --bin dump_events -- input.yaml` outputs much more details: +
+ Full output + +``` +Parser state: StreamStart + ↳ StreamStart(Utf8) Marker { index: 0, line: 1, col: 0 } + ↳ StreamStart + +Parser state: ImplicitDocumentStart + → fetch_next_token after whitespace Marker { index: 0, line: 1, col: 0 } '-' + ↳ BlockSequenceStart Marker { index: 0, line: 1, col: 0 } + ↳ DocumentStart + +Parser state: BlockNode + ↳ SequenceStart(0, None) + +Parser state: BlockSequenceFirstEntry + ↳ BlockEntry Marker { index: 2, line: 1, col: 2 } + → fetch_next_token after whitespace Marker { index: 2, line: 1, col: 2 } 'f' + → fetch_next_token after whitespace Marker { index: 5, line: 1, col: 5 } ':' + ↳ BlockMappingStart Marker { index: 5, line: 1, col: 5 } + ↳ MappingStart(0, None) + +Parser state: BlockMappingFirstKey + ↳ Key Marker { index: 2, line: 1, col: 2 } + ↳ Scalar(Plain, "foo") Marker { index: 2, line: 1, col: 2 } + ↳ Scalar("foo", Plain, 0, None) + +Parser state: BlockMappingValue + ↳ Value Marker { index: 5, line: 1, col: 5 } + → fetch_next_token after whitespace Marker { index: 7, line: 1, col: 7 } 'b' + ↳ Scalar(Plain, "bar") Marker { index: 7, line: 1, col: 7 } + ↳ Scalar("bar", Plain, 0, None) + +Parser state: BlockMappingKey + → fetch_next_token after whitespace Marker { index: 11, line: 2, col: 0 } '-' + ↳ BlockEnd Marker { index: 11, line: 2, col: 0 } + ↳ MappingEnd + +Parser state: BlockSequenceEntry + ↳ BlockEntry Marker { index: 13, line: 2, col: 2 } + → fetch_next_token after whitespace Marker { index: 13, line: 2, col: 2 } 'b' + → fetch_next_token after whitespace Marker { index: 16, line: 2, col: 5 } ':' + ↳ BlockMappingStart Marker { index: 16, line: 2, col: 5 } + ↳ MappingStart(0, None) + +Parser state: BlockMappingFirstKey + ↳ Key Marker { index: 13, line: 2, col: 2 } + ↳ Scalar(Plain, "baz") Marker { index: 13, line: 2, col: 2 } + ↳ Scalar("baz", Plain, 0, None) + +Parser state: BlockMappingValue + ↳ Value Marker { index: 16, line: 2, col: 5 } + → fetch_next_token after whitespace Marker { index: 20, line: 3, col: 2 } 
'c' + → fetch_next_token after whitespace Marker { index: 21, line: 3, col: 3 } ':' + ↳ Key Marker { index: 20, line: 3, col: 2 } + ↳ Scalar("~", Plain, 0, None) + +Parser state: BlockMappingKey + ↳ Scalar(Plain, "c") Marker { index: 20, line: 3, col: 2 } + ↳ Scalar("c", Plain, 0, None) + +Parser state: BlockMappingValue + ↳ Value Marker { index: 21, line: 3, col: 3 } + → fetch_next_token after whitespace Marker { index: 23, line: 3, col: 5 } '[' + ↳ FlowSequenceStart Marker { index: 23, line: 3, col: 5 } + ↳ SequenceStart(0, None) + +Parser state: FlowSequenceFirstEntry + → fetch_next_token after whitespace Marker { index: 24, line: 3, col: 6 } '3' + → fetch_next_token after whitespace Marker { index: 25, line: 3, col: 7 } ',' + ↳ Scalar(Plain, "3") Marker { index: 24, line: 3, col: 6 } + ↳ Scalar("3", Plain, 0, None) + +Parser state: FlowSequenceEntry + ↳ FlowEntry Marker { index: 25, line: 3, col: 7 } + → fetch_next_token after whitespace Marker { index: 27, line: 3, col: 9 } '4' + → fetch_next_token after whitespace Marker { index: 28, line: 3, col: 10 } ',' + ↳ Scalar(Plain, "4") Marker { index: 27, line: 3, col: 9 } + ↳ Scalar("4", Plain, 0, None) + +Parser state: FlowSequenceEntry + ↳ FlowEntry Marker { index: 28, line: 3, col: 10 } + → fetch_next_token after whitespace Marker { index: 30, line: 3, col: 12 } '5' + → fetch_next_token after whitespace Marker { index: 31, line: 3, col: 13 } ']' + ↳ Scalar(Plain, "5") Marker { index: 30, line: 3, col: 12 } + ↳ Scalar("5", Plain, 0, None) + +Parser state: FlowSequenceEntry + ↳ FlowSequenceEnd Marker { index: 31, line: 3, col: 13 } + ↳ SequenceEnd + +Parser state: BlockMappingKey + → fetch_next_token after whitespace Marker { index: 33, line: 4, col: 0 } '\0' + ↳ BlockEnd Marker { index: 33, line: 4, col: 0 } + ↳ MappingEnd + +Parser state: BlockSequenceEntry + ↳ BlockEnd Marker { index: 33, line: 4, col: 0 } + ↳ SequenceEnd + +Parser state: DocumentEnd + ↳ StreamEnd Marker { index: 33, line: 4, col: 0 } + ↳ 
DocumentEnd + +Parser state: DocumentStart + ↳ StreamEnd +``` + +
+ +While this cannot be shown in Markdown, the output is colored so that it is a bit easier to read. + +## `gen_large_yaml` +It is hard to find large (100+MiB) real-world YAML files that could be used to benchmark a parser. This utility generates multiple large files that are meant to stress the parser with different layouts of YAML files. The resulting files do not look like anything that would be encountered in production, but can serve as a base to test several features of a YAML parser. + +The following files are generated in the `bench_yaml` directory (created in the current working directory if it does not exist): + + - `big.yaml`: A large array of records with few fields. One of the fields is a description, a large text block scalar spanning multiple lines. Most of the scanning happens in block scalars. + - `nested.yaml`: Very short key-value pairs that nest deeply. + - `small_objects.yaml`: A large array of small mappings, each holding 2 key-value pairs. + - `strings_array.yaml`: A large array of lipsum one-liners (~150-175 characters in length). + +All generated files are meant to be between 200 and 250 MiB in size. + +This tool depends on external crates that are not part of `yaml-rust2`'s `dependencies` or `dev-dependencies`, and as such it can't be called through `cargo run` directly. A dedicated `cargo gen_large_yaml` alias can be used to generate the benchmark files. + +## `time_parse` +This is a benchmarking helper that times how long it takes for the parser to emit all events. It calls the parser on the given input file, receives parsing events and then immediately discards them. It is advised to run this tool with `--release`. 
+ +### Examples +Loading a small file could output the following: +```sh +$> cargo run --release --bin time_parse -- input.yaml +Loaded 0MiB in 14.189µs +``` + +While loading a larger file could output the following: +```sh +$> cargo run --release --bin time_parse -- bench_yaml/big.yaml +Loaded 220MiB in 1.612677853s +``` diff --git a/saphyr/examples/dump_events.rs b/saphyr/tools/dump_events.rs similarity index 100% rename from saphyr/examples/dump_events.rs rename to saphyr/tools/dump_events.rs diff --git a/saphyr/tools/gen_large_yaml/Cargo.toml b/saphyr/tools/gen_large_yaml/Cargo.toml new file mode 100644 index 0000000..54b6b3c --- /dev/null +++ b/saphyr/tools/gen_large_yaml/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "gen_large_yaml" +version = "0.5.0" +authors = [ + "Ethiraric " +] +license = "MIT OR Apache-2.0" +description = "A helper to generate large YAML files" +repository = "https://github.com/Ethiraric/yaml-rust2" +readme = "README.md" +edition = "2018" + +[dependencies] +yaml-rust2 = { version = "0.5.0", path = "../../" } +rand = "0.8.5" +lipsum = "0.9.0" + +[profile.release-lto] +inherits = "release" +lto = true diff --git a/saphyr/examples/gen_large_yaml/gen.rs b/saphyr/tools/gen_large_yaml/src/gen.rs similarity index 100% rename from saphyr/examples/gen_large_yaml/gen.rs rename to saphyr/tools/gen_large_yaml/src/gen.rs diff --git a/saphyr/examples/gen_large_yaml/main.rs b/saphyr/tools/gen_large_yaml/src/main.rs similarity index 74% rename from saphyr/examples/gen_large_yaml/main.rs rename to saphyr/tools/gen_large_yaml/src/main.rs index cfc8d70..d478e8b 100644 --- a/saphyr/examples/gen_large_yaml/main.rs +++ b/saphyr/tools/gen_large_yaml/src/main.rs @@ -4,15 +4,37 @@ mod gen; mod nested; use std::collections::HashMap; +use std::fs::File; +use std::io::BufWriter; +use std::path::Path; use rand::{rngs::ThreadRng, Rng}; -fn main() -> std::fmt::Result { - let mut s = String::new(); - // let mut g = Generator::new(); - // g.gen_strings_array(&mut s, 
1_300_000, 1_300_001, 10, 40)?; - nested::create_deep_object(&mut s, 5_000_000)?; - println!("{s}"); +/// The path into which the generated YAML files will be written. +const OUTPUT_DIR: &str = "bench_yaml"; + +fn main() -> std::io::Result<()> { + let mut generator = Generator::new(); + let output_path = Path::new(OUTPUT_DIR); + if !output_path.is_dir() { + std::fs::create_dir(output_path).unwrap(); + } + + println!("Generating big.yaml"); + let mut out = BufWriter::new(File::create(output_path.join("big.yaml")).unwrap()); + generator.gen_record_array(&mut out, 100_000, 100_001)?; + + println!("Generating nested.yaml"); + let mut out = BufWriter::new(File::create(output_path.join("nested.yaml")).unwrap()); + nested::create_deep_object(&mut out, 5_000_000)?; + + println!("Generating small_objects.yaml"); + let mut out = BufWriter::new(File::create(output_path.join("small_objects.yaml")).unwrap()); + generator.gen_authors_array(&mut out, 4_000_000, 4_000_001)?; + + println!("Generating strings_array.yaml"); + let mut out = BufWriter::new(File::create(output_path.join("strings_array.yaml")).unwrap()); + generator.gen_strings_array(&mut out, 1_300_000, 1_300_001, 10, 40)?; Ok(()) } @@ -24,7 +46,7 @@ struct Generator { indents: Vec, } -type GenFn = dyn FnOnce(&mut Generator, &mut W) -> std::fmt::Result; +type GenFn = dyn FnOnce(&mut Generator, &mut W) -> std::io::Result<()>; impl Generator { /// Create a new generator. @@ -36,24 +58,24 @@ impl Generator { } /// Generate an array of records as per [`Self::gen_record_object`]. - fn gen_record_array( + fn gen_record_array( &mut self, writer: &mut W, items_lo: usize, items_hi: usize, - ) -> std::fmt::Result { + ) -> std::io::Result<()> { self.gen_array(writer, items_lo, items_hi, Generator::gen_record_object) } /// Generate an array of lipsum one-liners. 
- fn gen_strings_array( + fn gen_strings_array( &mut self, writer: &mut W, items_lo: usize, items_hi: usize, words_lo: usize, words_hi: usize, - ) -> std::fmt::Result { + ) -> std::io::Result<()> { self.gen_array(writer, items_lo, items_hi, |gen, writer| { write!(writer, "{}", gen::words(&mut gen.rng, words_lo, words_hi)) }) @@ -64,7 +86,7 @@ impl Generator { /// Fields are description, hash, version, home, repository and pdf. /// The `description` field is a long string and puts a lot of weight in plain scalar / block /// scalar parsing. - fn gen_record_object(&mut self, writer: &mut W) -> std::fmt::Result { + fn gen_record_object(&mut self, writer: &mut W) -> std::io::Result<()> { let mut fields = HashMap::>>::new(); fields.insert( "description".to_string(), @@ -121,17 +143,17 @@ impl Generator { } /// Generate an array of authors as per [`Self::gen_author_object`]. - fn gen_authors_array( + fn gen_authors_array( &mut self, writer: &mut W, items_lo: usize, items_hi: usize, - ) -> std::fmt::Result { + ) -> std::io::Result<()> { self.gen_array(writer, items_lo, items_hi, Generator::gen_author_object) } /// Generate a small object with 2 string fields. - fn gen_author_object(&mut self, writer: &mut W) -> std::fmt::Result { + fn gen_author_object(&mut self, writer: &mut W) -> std::io::Result<()> { let mut fields = HashMap::>>::new(); fields.insert( "name".to_string(), @@ -145,13 +167,13 @@ impl Generator { } /// Generate a YAML array/sequence containing nodes generated by the given function. - fn gen_array std::fmt::Result>( + fn gen_array std::io::Result<()>>( &mut self, writer: &mut W, len_lo: usize, len_hi: usize, mut obj_creator: F, - ) -> std::fmt::Result { + ) -> std::io::Result<()> { let mut first = true; for _ in 0..self.rng.gen_range(len_lo..len_hi) { if first { @@ -168,11 +190,11 @@ impl Generator { } /// Create a Yaml object with some fields in it. 
- fn gen_object( + fn gen_object( &mut self, writer: &mut W, fields: HashMap>>, - ) -> std::fmt::Result { + ) -> std::io::Result<()> { let mut first = true; for (key, f) in fields { if first { @@ -187,11 +209,11 @@ impl Generator { } /// Write the given lines at the right indentation. - fn write_lines( + fn write_lines( &mut self, writer: &mut W, lines: &[String], - ) -> std::fmt::Result { + ) -> std::io::Result<()> { let mut first = true; for line in lines { @@ -207,7 +229,7 @@ impl Generator { } /// Write a new line to the writer and indent. - fn nl(&mut self, writer: &mut W) -> std::fmt::Result { + fn nl(&mut self, writer: &mut W) -> std::io::Result<()> { writeln!(writer)?; for _ in 0..self.indent() { write!(writer, " ")?; diff --git a/saphyr/examples/gen_large_yaml/nested.rs b/saphyr/tools/gen_large_yaml/src/nested.rs similarity index 79% rename from saphyr/examples/gen_large_yaml/nested.rs rename to saphyr/tools/gen_large_yaml/src/nested.rs index 3977901..92dc21a 100644 --- a/saphyr/examples/gen_large_yaml/nested.rs +++ b/saphyr/tools/gen_large_yaml/src/nested.rs @@ -3,7 +3,10 @@ use std::{cell::RefCell, rc::Rc}; use rand::{rngs::ThreadRng, Rng}; /// Create a deep object with the given amount of nodes. -pub fn create_deep_object(writer: &mut W, n_nodes: usize) -> std::fmt::Result { +pub fn create_deep_object( + writer: &mut W, + n_nodes: usize, +) -> std::io::Result<()> { let mut tree = Tree::new(); for _ in 0..n_nodes { tree.push_node(); @@ -51,7 +54,7 @@ impl Tree { } /// Write the YAML representation of the tree to `writer`. - fn write_to(&self, writer: &mut W) -> std::fmt::Result { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { (*self.root).borrow().write_to(writer, 0) } } @@ -72,15 +75,15 @@ impl Node { } /// Write the YAML representation of the node to `writer`. 
- fn write_to(&self, writer: &mut W, indent: usize) -> std::fmt::Result { + fn write_to(&self, writer: &mut W, indent: usize) -> std::io::Result<()> { if self.children.is_empty() { write_n(writer, ' ', indent)?; - writer.write_str("a: 1\n")?; + writer.write_all(b"a: 1\n")?; } else { for (n, child) in self.children.iter().enumerate() { write_n(writer, ' ', indent)?; write_id_for_number(writer, n)?; - writer.write_str(":\n")?; + writer.write_all(b":\n")?; (**child).borrow().write_to(writer, indent + 2)?; } } @@ -89,19 +92,19 @@ impl Node { } /// Write `n` times `c` to `out`. -fn write_n(out: &mut W, c: char, n: usize) -> std::fmt::Result { +fn write_n(out: &mut W, c: char, n: usize) -> std::io::Result<()> { for _ in 0..n { - out.write_char(c)?; + write!(out, "{c}")?; } Ok(()) } /// Create a valid identifier for the given number. -fn write_id_for_number(out: &mut W, mut n: usize) -> std::fmt::Result { +fn write_id_for_number(out: &mut W, mut n: usize) -> std::io::Result<()> { const DIGITS: &[u8] = b"_abcdefghijklmnopqrstuvwxyz"; n += 1; while n > 0 { - out.write_char(DIGITS[n % DIGITS.len()] as char)?; + write!(out, "{}", DIGITS[n % DIGITS.len()] as char)?; n /= DIGITS.len(); } Ok(()) diff --git a/saphyr/examples/time_parse.rs b/saphyr/tools/time_parse.rs similarity index 100% rename from saphyr/examples/time_parse.rs rename to saphyr/tools/time_parse.rs