Remove tools from examples.

Add documentation for those tools, and make it so that `gen_large_yaml`
generates a predetermined set of files instead of outputting to its
standard output.
This commit is contained in:
Ethiraric 2024-03-14 19:20:56 +01:00
parent 6919d6fd02
commit e390c88077
9 changed files with 274 additions and 33 deletions

View file

@ -0,0 +1,2 @@
[alias]
gen_large_yaml = "run --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml --"

View file

@ -19,8 +19,6 @@ linked-hash-map = "0.5.3"
[dev-dependencies]
libtest-mimic = "0.3.0"
quickcheck = "0.9"
rand = "0.8.5"
lipsum = "0.9.0"
[profile.release-lto]
inherits = "release"
@ -29,3 +27,11 @@ lto = true
[[test]]
name = "yaml-test-suite"
harness = false
[[bin]]
name = "dump_events"
path = "tools/dump_events.rs"
[[bin]]
name = "time_parse"
path = "tools/time_parse.rs"

188
parser/tools/README.md Normal file
View file

@ -0,0 +1,188 @@
# `yaml-rust2` tools
This directory contains tools that are used to develop the crate.
Due to dependency management, only some of them are available as binaries from the `yaml-rust2` crate.
| Tool | Invocation |
|------|------------|
| `dump_events` | `cargo run --bin dump_events -- [...]` |
| `gen_large_yaml` | `cargo gen_large_yaml` |
| `time_parse` | `cargo run --bin time_parse -- [...]` |
## `dump_events`
This is a debugging helper for the parser. It outputs events emitted by the parser for a given file. This can be paired with the `YAMLRUST2_DEBUG` environment variable to have an in-depth overview of which steps the scanner and the parser are taking.
### Example
Consider the following `input.yaml` YAML file:
```yaml
- foo: bar
- baz:
  c: [3, 4, 5]
```
Running `cargo run --bin dump_events -- input.yaml` outputs:
```
↳ StreamStart
↳ DocumentStart
↳ SequenceStart(0, None)
↳ MappingStart(0, None)
↳ Scalar("foo", Plain, 0, None)
↳ Scalar("bar", Plain, 0, None)
↳ MappingEnd
↳ MappingStart(0, None)
↳ Scalar("baz", Plain, 0, None)
↳ Scalar("~", Plain, 0, None)
↳ Scalar("c", Plain, 0, None)
↳ SequenceStart(0, None)
↳ Scalar("3", Plain, 0, None)
↳ Scalar("4", Plain, 0, None)
↳ Scalar("5", Plain, 0, None)
↳ SequenceEnd
↳ MappingEnd
↳ SequenceEnd
↳ DocumentEnd
↳ StreamEnd
```
Running `YAMLRUST2_DEBUG=1 cargo run --bin dump_events -- input.yaml` outputs much more detail:
<details>
<summary> Full output </summary>
```
Parser state: StreamStart
↳ StreamStart(Utf8) Marker { index: 0, line: 1, col: 0 }
↳ StreamStart
Parser state: ImplicitDocumentStart
→ fetch_next_token after whitespace Marker { index: 0, line: 1, col: 0 } '-'
↳ BlockSequenceStart Marker { index: 0, line: 1, col: 0 }
↳ DocumentStart
Parser state: BlockNode
↳ SequenceStart(0, None)
Parser state: BlockSequenceFirstEntry
↳ BlockEntry Marker { index: 2, line: 1, col: 2 }
→ fetch_next_token after whitespace Marker { index: 2, line: 1, col: 2 } 'f'
→ fetch_next_token after whitespace Marker { index: 5, line: 1, col: 5 } ':'
↳ BlockMappingStart Marker { index: 5, line: 1, col: 5 }
↳ MappingStart(0, None)
Parser state: BlockMappingFirstKey
↳ Key Marker { index: 2, line: 1, col: 2 }
↳ Scalar(Plain, "foo") Marker { index: 2, line: 1, col: 2 }
↳ Scalar("foo", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 5, line: 1, col: 5 }
→ fetch_next_token after whitespace Marker { index: 7, line: 1, col: 7 } 'b'
↳ Scalar(Plain, "bar") Marker { index: 7, line: 1, col: 7 }
↳ Scalar("bar", Plain, 0, None)
Parser state: BlockMappingKey
→ fetch_next_token after whitespace Marker { index: 11, line: 2, col: 0 } '-'
↳ BlockEnd Marker { index: 11, line: 2, col: 0 }
↳ MappingEnd
Parser state: BlockSequenceEntry
↳ BlockEntry Marker { index: 13, line: 2, col: 2 }
→ fetch_next_token after whitespace Marker { index: 13, line: 2, col: 2 } 'b'
→ fetch_next_token after whitespace Marker { index: 16, line: 2, col: 5 } ':'
↳ BlockMappingStart Marker { index: 16, line: 2, col: 5 }
↳ MappingStart(0, None)
Parser state: BlockMappingFirstKey
↳ Key Marker { index: 13, line: 2, col: 2 }
↳ Scalar(Plain, "baz") Marker { index: 13, line: 2, col: 2 }
↳ Scalar("baz", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 16, line: 2, col: 5 }
→ fetch_next_token after whitespace Marker { index: 20, line: 3, col: 2 } 'c'
→ fetch_next_token after whitespace Marker { index: 21, line: 3, col: 3 } ':'
↳ Key Marker { index: 20, line: 3, col: 2 }
↳ Scalar("~", Plain, 0, None)
Parser state: BlockMappingKey
↳ Scalar(Plain, "c") Marker { index: 20, line: 3, col: 2 }
↳ Scalar("c", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 21, line: 3, col: 3 }
→ fetch_next_token after whitespace Marker { index: 23, line: 3, col: 5 } '['
↳ FlowSequenceStart Marker { index: 23, line: 3, col: 5 }
↳ SequenceStart(0, None)
Parser state: FlowSequenceFirstEntry
→ fetch_next_token after whitespace Marker { index: 24, line: 3, col: 6 } '3'
→ fetch_next_token after whitespace Marker { index: 25, line: 3, col: 7 } ','
↳ Scalar(Plain, "3") Marker { index: 24, line: 3, col: 6 }
↳ Scalar("3", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowEntry Marker { index: 25, line: 3, col: 7 }
→ fetch_next_token after whitespace Marker { index: 27, line: 3, col: 9 } '4'
→ fetch_next_token after whitespace Marker { index: 28, line: 3, col: 10 } ','
↳ Scalar(Plain, "4") Marker { index: 27, line: 3, col: 9 }
↳ Scalar("4", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowEntry Marker { index: 28, line: 3, col: 10 }
→ fetch_next_token after whitespace Marker { index: 30, line: 3, col: 12 } '5'
→ fetch_next_token after whitespace Marker { index: 31, line: 3, col: 13 } ']'
↳ Scalar(Plain, "5") Marker { index: 30, line: 3, col: 12 }
↳ Scalar("5", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowSequenceEnd Marker { index: 31, line: 3, col: 13 }
↳ SequenceEnd
Parser state: BlockMappingKey
→ fetch_next_token after whitespace Marker { index: 33, line: 4, col: 0 } '\0'
↳ BlockEnd Marker { index: 33, line: 4, col: 0 }
↳ MappingEnd
Parser state: BlockSequenceEntry
↳ BlockEnd Marker { index: 33, line: 4, col: 0 }
↳ SequenceEnd
Parser state: DocumentEnd
↳ StreamEnd Marker { index: 33, line: 4, col: 0 }
↳ DocumentEnd
Parser state: DocumentStart
↳ StreamEnd
```
</details>
While this cannot be shown in Markdown, the output is colored so that it is a bit easier to read.
## `gen_large_yaml`
It is hard to find large (100+MiB) real-world YAML files that could be used to benchmark a parser. This utility generates multiple large files that are meant to stress the parser with different layouts of YAML files. The resulting files do not look like anything that would be encountered in production, but can serve as a base to test several features of a YAML parser.
The following files are generated in the `bench_yaml` directory (created if it does not exist):
- `big.yaml`: A large array of records with few fields. One of the fields is a description, a large text block scalar spanning multiple lines. Most of the scanning happens in block scalars.
- `nested.yaml`: Very short key-value pairs that nest deeply.
- `small_objects.yaml`: A large array of small mappings, each holding 2 key-value pairs.
- `strings_array.yaml`: A large array of lipsum one-liners (~150-175 characters in length).
All generated files are meant to be between 200 and 250 MiB in size.
This tool depends on external dependencies that are not part of `yaml-rust2`'s dependencies or `dev-dependencies` and as such can't be called through `cargo run` directly. A dedicated `cargo gen_large_yaml` alias can be used to generate the benchmark files.
## `time_parse`
This is a benchmarking helper that times how long it takes for the parser to emit all events. It calls the parser on the given input file, receives parsing events and then immediately discards them. It is advised to run this tool with `--release`.
### Examples
Loading a small file could output the following:
```sh
$> cargo run --release --bin time_parse -- input.yaml
Loaded 0MiB in 14.189µs
```
While loading a larger file could output the following:
```sh
$> cargo run --release --bin time_parse -- bench_yaml/big.yaml
Loaded 220MiB in 1.612677853s
```

View file

@ -0,0 +1,20 @@
[package]
name = "gen_large_yaml"
version = "0.5.0"
authors = [
"Ethiraric <ethiraric@gmail.com>"
]
license = "MIT OR Apache-2.0"
description = "A helper to generate large YAML files"
repository = "https://github.com/Ethiraric/yaml-rust2"
readme = "README.md"
edition = "2018"
[dependencies]
yaml-rust2 = { version = "0.5.0", path = "../../" }
rand = "0.8.5"
lipsum = "0.9.0"
[profile.release-lto]
inherits = "release"
lto = true

View file

@ -4,15 +4,37 @@ mod gen;
mod nested;
use std::collections::HashMap;
use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
use rand::{rngs::ThreadRng, Rng};
fn main() -> std::fmt::Result {
let mut s = String::new();
// let mut g = Generator::new();
// g.gen_strings_array(&mut s, 1_300_000, 1_300_001, 10, 40)?;
nested::create_deep_object(&mut s, 5_000_000)?;
println!("{s}");
/// The path into which the generated YAML files will be written.
const OUTPUT_DIR: &str = "bench_yaml";
/// Generate every benchmark YAML file into [`OUTPUT_DIR`].
///
/// The output directory is created if it does not exist yet. The files `big.yaml`,
/// `nested.yaml`, `small_objects.yaml` and `strings_array.yaml` are then written one after the
/// other, each through its own buffered writer.
///
/// # Errors
/// Returns the first I/O error encountered while creating the output directory, creating a file,
/// writing to it or flushing its buffer.
fn main() -> std::io::Result<()> {
    let mut generator = Generator::new();
    let output_path = Path::new(OUTPUT_DIR);

    // `create_dir_all` succeeds when the directory already exists, so no prior `is_dir` check is
    // needed and failures are reported through the returned `Result` instead of a panic.
    std::fs::create_dir_all(output_path)?;

    println!("Generating big.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("big.yaml"))?);
    generator.gen_record_array(&mut out, 100_000, 100_001)?;
    // Flush explicitly: dropping a `BufWriter` ignores errors from its final flush, which could
    // leave a truncated file unnoticed.
    std::io::Write::flush(&mut out)?;

    println!("Generating nested.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("nested.yaml"))?);
    nested::create_deep_object(&mut out, 5_000_000)?;
    std::io::Write::flush(&mut out)?;

    println!("Generating small_objects.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("small_objects.yaml"))?);
    generator.gen_authors_array(&mut out, 4_000_000, 4_000_001)?;
    std::io::Write::flush(&mut out)?;

    println!("Generating strings_array.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("strings_array.yaml"))?);
    generator.gen_strings_array(&mut out, 1_300_000, 1_300_001, 10, 40)?;
    std::io::Write::flush(&mut out)?;

    Ok(())
}
@ -24,7 +46,7 @@ struct Generator {
indents: Vec<usize>,
}
type GenFn<W> = dyn FnOnce(&mut Generator, &mut W) -> std::fmt::Result;
type GenFn<W> = dyn FnOnce(&mut Generator, &mut W) -> std::io::Result<()>;
impl Generator {
/// Create a new generator.
@ -36,24 +58,24 @@ impl Generator {
}
/// Generate an array of records as per [`Self::gen_record_object`].
fn gen_record_array<W: std::fmt::Write>(
fn gen_record_array<W: std::io::Write>(
&mut self,
writer: &mut W,
items_lo: usize,
items_hi: usize,
) -> std::fmt::Result {
) -> std::io::Result<()> {
self.gen_array(writer, items_lo, items_hi, Generator::gen_record_object)
}
/// Generate an array of lipsum one-liners.
fn gen_strings_array<W: std::fmt::Write>(
fn gen_strings_array<W: std::io::Write>(
&mut self,
writer: &mut W,
items_lo: usize,
items_hi: usize,
words_lo: usize,
words_hi: usize,
) -> std::fmt::Result {
) -> std::io::Result<()> {
self.gen_array(writer, items_lo, items_hi, |gen, writer| {
write!(writer, "{}", gen::words(&mut gen.rng, words_lo, words_hi))
})
@ -64,7 +86,7 @@ impl Generator {
/// Fields are description, hash, version, home, repository and pdf.
/// The `description` field is a long string and puts a lot of weight in plain scalar / block
/// scalar parsing.
fn gen_record_object<W: std::fmt::Write>(&mut self, writer: &mut W) -> std::fmt::Result {
fn gen_record_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
let mut fields = HashMap::<String, Box<GenFn<W>>>::new();
fields.insert(
"description".to_string(),
@ -121,17 +143,17 @@ impl Generator {
}
/// Generate an array of authors as per [`Self::gen_author_object`].
fn gen_authors_array<W: std::fmt::Write>(
fn gen_authors_array<W: std::io::Write>(
&mut self,
writer: &mut W,
items_lo: usize,
items_hi: usize,
) -> std::fmt::Result {
) -> std::io::Result<()> {
self.gen_array(writer, items_lo, items_hi, Generator::gen_author_object)
}
/// Generate a small object with 2 string fields.
fn gen_author_object<W: std::fmt::Write>(&mut self, writer: &mut W) -> std::fmt::Result {
fn gen_author_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
let mut fields = HashMap::<String, Box<GenFn<W>>>::new();
fields.insert(
"name".to_string(),
@ -145,13 +167,13 @@ impl Generator {
}
/// Generate a YAML array/sequence containing nodes generated by the given function.
fn gen_array<W: std::fmt::Write, F: FnMut(&mut Generator, &mut W) -> std::fmt::Result>(
fn gen_array<W: std::io::Write, F: FnMut(&mut Generator, &mut W) -> std::io::Result<()>>(
&mut self,
writer: &mut W,
len_lo: usize,
len_hi: usize,
mut obj_creator: F,
) -> std::fmt::Result {
) -> std::io::Result<()> {
let mut first = true;
for _ in 0..self.rng.gen_range(len_lo..len_hi) {
if first {
@ -168,11 +190,11 @@ impl Generator {
}
/// Create a Yaml object with some fields in it.
fn gen_object<W: std::fmt::Write>(
fn gen_object<W: std::io::Write>(
&mut self,
writer: &mut W,
fields: HashMap<String, Box<GenFn<W>>>,
) -> std::fmt::Result {
) -> std::io::Result<()> {
let mut first = true;
for (key, f) in fields {
if first {
@ -187,11 +209,11 @@ impl Generator {
}
/// Write the given lines at the right indentation.
fn write_lines<W: std::fmt::Write>(
fn write_lines<W: std::io::Write>(
&mut self,
writer: &mut W,
lines: &[String],
) -> std::fmt::Result {
) -> std::io::Result<()> {
let mut first = true;
for line in lines {
@ -207,7 +229,7 @@ impl Generator {
}
/// Write a new line to the writer and indent.
fn nl<W: std::fmt::Write>(&mut self, writer: &mut W) -> std::fmt::Result {
fn nl<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
writeln!(writer)?;
for _ in 0..self.indent() {
write!(writer, " ")?;

View file

@ -3,7 +3,10 @@ use std::{cell::RefCell, rc::Rc};
use rand::{rngs::ThreadRng, Rng};
/// Create a deep object with the given amount of nodes.
pub fn create_deep_object<W: std::fmt::Write>(writer: &mut W, n_nodes: usize) -> std::fmt::Result {
pub fn create_deep_object<W: std::io::Write>(
writer: &mut W,
n_nodes: usize,
) -> std::io::Result<()> {
let mut tree = Tree::new();
for _ in 0..n_nodes {
tree.push_node();
@ -51,7 +54,7 @@ impl Tree {
}
/// Write the YAML representation of the tree to `writer`.
fn write_to<W: std::fmt::Write>(&self, writer: &mut W) -> std::fmt::Result {
fn write_to<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
(*self.root).borrow().write_to(writer, 0)
}
}
@ -72,15 +75,15 @@ impl Node {
}
/// Write the YAML representation of the node to `writer`.
fn write_to<W: std::fmt::Write>(&self, writer: &mut W, indent: usize) -> std::fmt::Result {
fn write_to<W: std::io::Write>(&self, writer: &mut W, indent: usize) -> std::io::Result<()> {
if self.children.is_empty() {
write_n(writer, ' ', indent)?;
writer.write_str("a: 1\n")?;
writer.write_all(b"a: 1\n")?;
} else {
for (n, child) in self.children.iter().enumerate() {
write_n(writer, ' ', indent)?;
write_id_for_number(writer, n)?;
writer.write_str(":\n")?;
writer.write_all(b":\n")?;
(**child).borrow().write_to(writer, indent + 2)?;
}
}
@ -89,19 +92,19 @@ impl Node {
}
/// Write `n` times `c` to `out`.
fn write_n<W: std::fmt::Write>(out: &mut W, c: char, n: usize) -> std::fmt::Result {
fn write_n<W: std::io::Write>(out: &mut W, c: char, n: usize) -> std::io::Result<()> {
for _ in 0..n {
out.write_char(c)?;
write!(out, "{c}")?;
}
Ok(())
}
/// Create a valid identifier for the given number.
fn write_id_for_number<W: std::fmt::Write>(out: &mut W, mut n: usize) -> std::fmt::Result {
fn write_id_for_number<W: std::io::Write>(out: &mut W, mut n: usize) -> std::io::Result<()> {
const DIGITS: &[u8] = b"_abcdefghijklmnopqrstuvwxyz";
n += 1;
while n > 0 {
out.write_char(DIGITS[n % DIGITS.len()] as char)?;
write!(out, "{}", DIGITS[n % DIGITS.len()] as char)?;
n /= DIGITS.len();
}
Ok(())