From dc429b7ef718b1526ec71890f8326627ce4939d3 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Sun, 13 Oct 2024 16:18:44 +0200 Subject: [PATCH] Deduplicate tools. --- parser/tools/README.md | 19 -- parser/tools/bench_compare/Cargo.toml | 21 -- parser/tools/bench_compare/README.md | 120 ---------- parser/tools/bench_compare/src/main.rs | 175 --------------- parser/tools/gen_large_yaml/Cargo.toml | 19 -- parser/tools/gen_large_yaml/src/gen.rs | 156 ------------- parser/tools/gen_large_yaml/src/main.rs | 261 ---------------------- parser/tools/gen_large_yaml/src/nested.rs | 115 ---------- 8 files changed, 886 deletions(-) delete mode 100644 parser/tools/bench_compare/Cargo.toml delete mode 100644 parser/tools/bench_compare/README.md delete mode 100644 parser/tools/bench_compare/src/main.rs delete mode 100644 parser/tools/gen_large_yaml/Cargo.toml delete mode 100644 parser/tools/gen_large_yaml/src/gen.rs delete mode 100644 parser/tools/gen_large_yaml/src/main.rs delete mode 100644 parser/tools/gen_large_yaml/src/nested.rs diff --git a/parser/tools/README.md b/parser/tools/README.md index 48d6232..017c493 100644 --- a/parser/tools/README.md +++ b/parser/tools/README.md @@ -4,15 +4,10 @@ Due to dependency management, only some of them are available as binaries from t | Tool | Invocation | |------|------------| -| `bench_compare` | `cargo bench_compare` | | `dump_events` | `cargo run --bin dump_events -- [...]` | -| `gen_large_yaml` | `cargo gen_large_yaml` | | `run_bench` | `cargo run --bin run_bench -- [...]` | | `time_parse` | `cargo run --bin time_parse -- [...]` | -## `bench_compare` -See the [dedicated README file](./bench_compare/README.md). - ## `dump_events` This is a debugging helper for the parser. It outputs events emitted by the parser for a given file. This can be paired with the `SAPHYR_DEBUG` environment variable to have an in-depth overview of which steps the scanner and the parser are taking. @@ -162,20 +157,6 @@ Parser state: DocumentStart While this cannot be shown in Markdown, the output is colored so that it is a bit easier to read. -## `gen_large_yaml` -It is hard to find large (100+MiB) real-world YAML files that could be used to benchmark a parser. This utility generates multiple large files that are meant to stress the parser with different layouts of YAML files. The resulting files do not look like anything that would be encountered in production, but can serve as a base to test several features of a YAML parser. - -The generated files are the following: - - - `big.yaml`: A large array of records with few fields. One of the fields is a description, a large text block scalar spanning multiple lines. Most of the scanning happens in block scalars. - - `nested.yaml`: Very short key-value pairs that nest deeply. - - `small_objects.yaml`: A large array of 2 key-value mappings. - - `strings_array.yaml`: A large array of lipsum one-liners (~150-175 characters in length). - -All generated files are meant to be between 200 and 250 MiB in size. - -This tool depends on external dependencies that are not part of `saphyr-parser`'s dependencies or `dev-dependencies` and as such can't be called through `cargo run` directly. A dedicated `cargo gen_large_yaml` alias can be used to generate the benchmark files. - ## `run_bench` This is a benchmarking helper that runs the parser on the given file a given number of times and is able to extract simple metrics out of the results. The `--output-yaml` flag can be specified to make the output a YAML file that can be fed into other tools. diff --git a/parser/tools/bench_compare/Cargo.toml b/parser/tools/bench_compare/Cargo.toml deleted file mode 100644 index 4ca9b33..0000000 --- a/parser/tools/bench_compare/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "bench_compare" -version = "0.6.0" -authors = [ - "Ethiraric " -] -license = "MIT OR Apache-2.0" -description = "Run multiple YAML parsers and compare their times" -repository = "https://github.com/Ethiraric/yaml-rust2" -readme = "README.md" -edition = "2018" - -[dependencies] -anyhow = { version = "1.0.81", features = ["backtrace"] } -serde = { version = "1.0.197", features = ["derive"] } -serde_yaml = "0.9.32" -toml = "0.8.11" - -[profile.release-lto] -inherits = "release" -lto = true diff --git a/parser/tools/bench_compare/README.md b/parser/tools/bench_compare/README.md deleted file mode 100644 index 8e2c1c5..0000000 --- a/parser/tools/bench_compare/README.md +++ /dev/null @@ -1,120 +0,0 @@ -# `bench_compare` -This tool helps with comparing times different YAML parsers take to parse the same input. - -## Synopsis -``` -bench_compare time_parse -bench_compare run_bench -``` - -This will run either `time_parse` or `run_bench` (described below) with the given set of parsers from the configuration file. - -## Parsers requirements -Parsers are expected to be event-based. In order to be fair to this crate's benchmark implementation, parsers should: - -* Load the file into memory (a string, `mmap`, ...) **prior** to starting the clock -* Initialize the parser, if needed -* **Start the clock** -* Read events from the parser while the parser has not finished parsing -* Discard events as they are received (dropping them, `free`ing them or anything similar) so as to not grow their memory consumption too high, and allowing the parser to reuse event structures -* **Stop the clock** -* Destroy the resources, if needed/wanted (parser, file buffer, ...). The kernel will reap after the process exits. - - -## Parsers required binaries -This tool recognizes 2 binaries: `time_parse` and `run_bench`. - -### `time_parse` -Synopsis: -``` -time_parse file.yaml [--short] -``` - -The binary must run the aforementioned steps and display on its output the time the parser took to parse the given file. -With the `--short` option, the binary must only output the benchmark time in nanoseconds. - -```sh -# This is meant to be human-readable. -# The example below is what this crate implements. -$> time_parse file.yaml -Loaded 200MiB in 1.74389s. - -# This will be read by this tool. -# This must output ONLY the time, in nanoseconds. -$> time_parse file.yaml --short -1743892394 -``` - -This tool will always provide the `--short` option. - -### `run_bench` -Synopsis: -``` -run_bench file.yaml [--output-yaml] -``` - -The binary is expected to run `` runs of the aforementioned steps and display on its output relevant information. -The `--output-yaml` instructs the binary to output details about its runs in YAML on its standard output. -The binary may optionally perform some warmup runs prior to running the benchmark. The time it took the binary to run will not be evaluated. - -```sh -# This is meant to be human-readable. -# The example below is what this crate implements. -$> run_bench file.yaml 100 -Average: 1.589485s -Min : 1.583078s -Max : 1.597028s -95% : 1.593219s - -# This will be read by this tool. -# This must output a YAML as described below. -$> run_bench ../file.yaml 10 --output-yaml -parser: saphyr -input: ../file.yaml -average: 1620303590 -min: 1611632108 -max: 1636401896 -percentile95: 1636401896 -iterations: 10 -times: - - 1636401896 - - 1623914538 - - 1611632108 - - 1612973608 - - 1617748930 - - 1615419514 - - 1612172250 - - 1620791346 - - 1629339306 - - 1622642412 -``` - -The expected fields are (all times in nanoseconds): - -* `parser`: The name of the parser (in case of a mistake renaming files) -* `input`: The path to the input file as given to the binary arguments -* `average`: The average time it took to run the parser -* `min`: The shortest time it took to run the parser -* `max`: The longest time it took to run the parser -* `percentile95`: The 95th percentile time of the runs -* `iterations`: The number of times the parser was run (``) -* `times`: An array of `iterations` times, one for each run, in the order they were run (first run first) - -## Configuration -`bench_compare` is configured through a `bench_compare.toml` file. This file must be located in the current directory. -As of now, default values are unsupported and all fields must be set. The following fields are required: -```toml -yaml_input_dir = "bench_yaml" # The path to the directory containing the input yaml files -iterations = 10 # The number of iterations, if using `run_bench` -yaml_output_dir = "yaml_output" # The directory in which `run_bench`'s yamls are saved -csv_output = "benchmark.csv" # The CSV output aggregating times for each parser and file - -[[parsers]] # A parser, can be repeated as many times as there are parsers -name = "saphyr" # The name of the parser (used for logging) -path = "target/release/" # The path in which the parsers' `run_bench` and `time_parse` are - -# If there is another parser, another block can be added -# [[parsers]] -# name = "libfyaml" -# path = "../libfyaml/build" -``` diff --git a/parser/tools/bench_compare/src/main.rs b/parser/tools/bench_compare/src/main.rs deleted file mode 100644 index c3a8c9e..0000000 --- a/parser/tools/bench_compare/src/main.rs +++ /dev/null @@ -1,175 +0,0 @@ -use std::{fs::File, io::BufWriter, io::Write, path::Path}; - -use anyhow::{Context, Error}; -use serde::{Deserialize, Serialize}; - -fn main() { - if let Err(e) = entrypoint() { - eprintln!("{e:?}"); - std::process::exit(1); - } -} - -fn entrypoint() -> Result<(), Error> { - let config: Config = - toml::from_str(&std::fs::read_to_string("bench_compare.toml").unwrap()).unwrap(); - if config.parsers.is_empty() { - println!("Please add at least one parser. Refer to the README for instructions."); - return Ok(()); - } - let args: Vec<_> = std::env::args().collect(); - if args.len() != 2 - || (args.len() == 2 && !["time_parse", "run_bench"].contains(&args[1].as_str())) - { - println!("Usage: bench_compare "); - return Ok(()); - } - match args[1].as_str() { - "run_bench" => run_bench(&config)?, - "time_parse" => unimplemented!(), - _ => unreachable!(), - } - Ok(()) -} - -/// Run the `run_bench` binary on the given parsers. -fn run_bench(config: &Config) -> Result<(), Error> { - // Create output directory - std::fs::create_dir_all(&config.yaml_output_dir)?; - - let inputs = list_input_files(config)?; - let iterations = format!("{}", config.iterations); - let mut averages = vec![]; - - // Inputs are ordered, so are parsers. - for input in &inputs { - let input_basename = Path::new(&input).file_name().unwrap().to_string_lossy(); - let mut input_times = vec![]; - - // Run each input for each parser. - for parser in &config.parsers { - println!("Running {input_basename} against {}", parser.name); - // Run benchmark - let path = Path::new(&parser.path).join("run_bench"); - let output = std::process::Command::new(&path) - .arg(input) - .arg(&iterations) - .arg("--output-yaml") - .output() - .with_context(|| format!("While running {path:?} against {input}"))?; - // Check exit status. - if output.status.code().unwrap_or(1) == 0 { - let s = String::from_utf8_lossy(&output.stdout); - // Get output as yaml. - match serde_yaml::from_str::(&s) { - Ok(output) => { - // Push average into our CSV-to-be. - input_times.push(output.average); - // Save the YAML for later. - serde_yaml::to_writer( - BufWriter::new(File::create(format!( - "{}/{}-{}", - config.yaml_output_dir, parser.name, input_basename - ))?), - &output, - )?; - } - Err(e) => { - // Yaml is invalid, use 0 as "didn't run properly". - println!("Errored: Invalid YAML output: {e}"); - input_times.push(0); - } - } - } else { - // An error happened, use 0 as "didn't run properly". - println!("Errored: process did exit non-zero"); - input_times.push(0); - } - } - averages.push(input_times); - } - - // Finally, save a CSV. - save_run_bench_csv(config, &inputs, &averages) -} - -/// General configuration structure. -#[derive(Serialize, Deserialize)] -struct Config { - /// The path to the directory containing the input yaml files. - yaml_input_dir: String, - /// Number of iterations to run, if using `run_bench`. - iterations: u32, - /// The parsers to run. - parsers: Vec, - /// The path to the directory in which `run_bench`'s yamls are saved. - yaml_output_dir: String, - /// The path to the CSV output aggregating times for each parser and file. - csv_output: String, -} - -/// A parser configuration. -#[derive(Serialize, Deserialize)] -struct Parser { - /// The name of the parser. - name: String, - /// The path in which the parser's `run_bench` and `time_parse` are located. - path: String, -} - -/// Ourput of running `run_bench` on a given parser. -#[derive(Serialize, Deserialize)] -struct BenchYamlOutput { - /// The name of the parser. - parser: String, - /// The file taken as input. - input: String, - /// Average parsing time (ns). - average: u64, - /// Shortest parsing time (ns). - min: u64, - /// Longest parsing time (ns). - max: u64, - /// 95th percentile of parsing times (ns). - percentile95: u64, - /// Number of iterations. - iterations: u64, - /// Parsing times for each run. - times: Vec, -} - -/// Save a CSV file with all averages from `run_bench`. -fn save_run_bench_csv( - config: &Config, - inputs: &[String], - averages: &[Vec], -) -> Result<(), Error> { - let mut csv = BufWriter::new(File::create(&config.csv_output)?); - for parser in &config.parsers { - write!(csv, ",{}", parser.name,)?; - } - writeln!(csv)?; - for (path, averages) in inputs.iter().zip(averages.iter()) { - let filename = Path::new(path).file_name().unwrap().to_string_lossy(); - write!(csv, "{}", filename)?; - for avg in averages { - write!(csv, ",{avg}")?; - } - writeln!(csv)?; - } - - Ok(()) -} - -/// Returns the paths to the input yaml files. -fn list_input_files(config: &Config) -> Result, Error> { - Ok(std::fs::read_dir(&config.yaml_input_dir)? - .filter_map(Result::ok) - .map(|entry| entry.path().to_string_lossy().to_string()) - .filter(|path| { - Path::new(path) - .extension() - .map_or(false, |ext| ext.eq_ignore_ascii_case("yaml")) - }) - .collect()) -} diff --git a/parser/tools/gen_large_yaml/Cargo.toml b/parser/tools/gen_large_yaml/Cargo.toml deleted file mode 100644 index d8526fb..0000000 --- a/parser/tools/gen_large_yaml/Cargo.toml +++ /dev/null @@ -1,19 +0,0 @@ -[package] -name = "gen_large_yaml" -version = "0.6.0" -authors = [ - "Ethiraric " -] -license = "MIT OR Apache-2.0" -description = "A helper to generate large YAML files" -repository = "https://github.com/Ethiraric/yaml-rust2" -readme = "README.md" -edition = "2018" - -[dependencies] -rand = { version = "0.8.5", features = [ "small_rng" ] } -lipsum = "0.9.0" - -[profile.release-lto] -inherits = "release" -lto = true diff --git a/parser/tools/gen_large_yaml/src/gen.rs b/parser/tools/gen_large_yaml/src/gen.rs deleted file mode 100644 index 78d16ba..0000000 --- a/parser/tools/gen_large_yaml/src/gen.rs +++ /dev/null @@ -1,156 +0,0 @@ -#![allow(clippy::too_many_arguments)] - -use rand::{distributions::Alphanumeric, rngs::SmallRng, Rng}; - -/// Generate a string with hexadecimal digits of the specified length. -pub fn hex_string(rng: &mut SmallRng, len: usize) -> String { - const DIGITS: &[u8] = b"0123456789abcdef"; - string_from_set(rng, len, len + 1, DIGITS) -} - -/// Generate an e-mail address. -pub fn email(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String { - const CHARSET: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_.0123456789"; - format!( - "{}@example.com", - string_from_set(rng, len_lo, len_hi, CHARSET) - ) -} - -/// Generate a random URL. -pub fn url( - rng: &mut SmallRng, - scheme: &str, - n_paths_lo: usize, - n_paths_hi: usize, - path_len_lo: usize, - path_len_hi: usize, - extension: Option<&str>, -) -> String { - let mut string = format!("{scheme}://example.com"); - for _ in 0..rng.gen_range(n_paths_lo..n_paths_hi) { - string.push('/'); - string.push_str(&alnum_string(rng, path_len_lo, path_len_hi)); - } - if let Some(extension) = extension { - string.push('.'); - string.push_str(extension); - } - string -} - -/// Generate a random integer. -pub fn integer(rng: &mut SmallRng, lo: i64, hi: i64) -> i64 { - rng.gen_range(lo..hi) -} - -/// Generate an alphanumeric string with a length between `lo_len` and `hi_len`. -pub fn alnum_string(rng: &mut SmallRng, lo_len: usize, hi_len: usize) -> String { - let len = rng.gen_range(lo_len..hi_len); - rng.sample_iter(&Alphanumeric) - .take(len) - .map(char::from) - .collect() -} - -/// Generate a string with hexadecimal digits of the specified length. -pub fn string_from_set(rng: &mut SmallRng, len_lo: usize, len_hi: usize, set: &[u8]) -> String { - (0..rng.gen_range(len_lo..len_hi)) - .map(|_| set[rng.gen_range(0..set.len())] as char) - .collect() -} - -/// Generate a lipsum paragraph. -pub fn paragraph( - rng: &mut SmallRng, - lines_lo: usize, - lines_hi: usize, - wps_lo: usize, - wps_hi: usize, - line_maxcol: usize, -) -> Vec { - let mut ret = Vec::new(); - let nlines = rng.gen_range(lines_lo..lines_hi); - - while ret.len() < nlines { - let words_in_sentence = rng.gen_range(wps_lo..wps_hi); - let mut sentence = lipsum::lipsum_words_with_rng(rng.clone(), words_in_sentence); - - if let Some(last_line) = ret.pop() { - sentence = format!("{last_line} {sentence}"); - } - - while sentence.len() > line_maxcol { - let last_space_idx = line_maxcol - - sentence[0..line_maxcol] - .chars() - .rev() - .position(char::is_whitespace) - .unwrap(); - ret.push(sentence[0..last_space_idx].to_string()); - sentence = sentence[last_space_idx + 1..].to_string(); - } - if !sentence.is_empty() { - ret.push(sentence); - } - } - - ret -} - -/// Generate a full name. -pub fn full_name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String { - format!( - "{} {}", - name(rng, len_lo, len_hi), - name(rng, len_lo, len_hi) - ) -} - -/// Generate a name. -pub fn name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String { - const UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - const LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz"; - - let len = rng.gen_range(len_lo..len_hi); - let mut ret = String::new(); - ret.push(UPPER[rng.gen_range(0..UPPER.len())] as char); - ret.push_str(string_from_set(rng, len, len + 1, LOWER).as_str()); - - ret -} - -/// Generate a set of words. -pub fn words(rng: &mut SmallRng, words_lo: usize, words_hi: usize) -> String { - let nwords = rng.gen_range(words_lo..words_hi); - lipsum::lipsum_words_with_rng(rng.clone(), nwords).replace(|c| "-\'\",*:".contains(c), "") -} - -/// Generate a lipsum text. -/// -/// Texts are composed of some paragraphs and empty lines between them. -pub fn text( - rng: &mut SmallRng, - paragraphs_lo: usize, - paragraphs_hi: usize, - lines_lo: usize, - lines_hi: usize, - wps_lo: usize, - wps_hi: usize, - line_maxcol: usize, -) -> Vec { - let mut ret = Vec::new(); - let mut first = true; - - for _ in 0..rng.gen_range(paragraphs_lo..paragraphs_hi) { - if first { - first = false; - } else { - ret.push(String::new()); - } - - ret.extend(paragraph(rng, lines_lo, lines_hi, wps_lo, wps_hi, line_maxcol).into_iter()); - } - - ret -} diff --git a/parser/tools/gen_large_yaml/src/main.rs b/parser/tools/gen_large_yaml/src/main.rs deleted file mode 100644 index b585c59..0000000 --- a/parser/tools/gen_large_yaml/src/main.rs +++ /dev/null @@ -1,261 +0,0 @@ -#![allow(dead_code)] - -mod gen; -mod nested; - -use std::fs::File; -use std::io::BufWriter; -use std::path::Path; - -use rand::{rngs::SmallRng, Rng, SeedableRng}; - -/// The path into which the generated YAML files will be written. -const OUTPUT_DIR: &str = "bench_yaml"; - -fn main() -> std::io::Result<()> { - let mut generator = Generator::new(); - let output_path = Path::new(OUTPUT_DIR); - if !output_path.is_dir() { - std::fs::create_dir(output_path).unwrap(); - } - - println!("Generating big.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("big.yaml")).unwrap()); - generator.gen_record_array(&mut out, 100_000, 100_001)?; - - println!("Generating nested.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("nested.yaml")).unwrap()); - nested::create_deep_object(&mut out, 1_100_000)?; - - println!("Generating small_objects.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("small_objects.yaml")).unwrap()); - generator.gen_authors_array(&mut out, 4_000_000, 4_000_001)?; - - println!("Generating strings_array.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("strings_array.yaml")).unwrap()); - generator.gen_strings_array(&mut out, 1_300_000, 1_300_001, 10, 40)?; - Ok(()) -} - -/// YAML Generator. -struct Generator { - /// The RNG state. - /// - /// We don't need to be cryptographically secure. [`SmallRng`] also implements the - /// [`SeedableRng`] trait, allowing runs to be predictable. - rng: SmallRng, - /// The stack of indentations. - indents: Vec, -} - -type GenFn = dyn FnOnce(&mut Generator, &mut W) -> std::io::Result<()>; - -impl Generator { - /// Create a new generator. - fn new() -> Self { - Generator { - rng: SmallRng::seed_from_u64(42), - indents: vec![0], - } - } - - /// Generate an array of records as per [`Self::gen_record_object`]. - fn gen_record_array( - &mut self, - writer: &mut W, - items_lo: usize, - items_hi: usize, - ) -> std::io::Result<()> { - self.gen_array(writer, items_lo, items_hi, Generator::gen_record_object) - } - - /// Generate an array of lipsum one-liners. - fn gen_strings_array( - &mut self, - writer: &mut W, - items_lo: usize, - items_hi: usize, - words_lo: usize, - words_hi: usize, - ) -> std::io::Result<()> { - self.gen_array(writer, items_lo, items_hi, |gen, writer| { - write!(writer, "{}", gen::words(&mut gen.rng, words_lo, words_hi)) - }) - } - - /// Generate a YAML object/mapping containing a record. - /// - /// Fields are description, hash, version, home, repository and pdf. - /// The `description` field is a long string and puts a lot of weight in plain scalar / block - /// scalar parsing. - fn gen_record_object(&mut self, writer: &mut W) -> std::io::Result<()> { - let fields: Vec<(String, Box>)> = vec![ - ( - "description".to_string(), - Box::new(|gen, w| { - write!(w, "|")?; - gen.push_indent(2); - gen.nl(w)?; - let indent = gen.indent(); - let text = gen::text(&mut gen.rng, 1, 9, 3, 8, 10, 20, 80 - indent); - gen.write_lines(w, &text)?; - gen.pop_indent(); - Ok(()) - }), - ), - ( - "authors".to_string(), - Box::new(|gen, w| { - gen.push_indent(2); - gen.nl(w)?; - gen.gen_authors_array(w, 1, 10)?; - gen.pop_indent(); - Ok(()) - }), - ), - ( - "hash".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::hex_string(&mut gen.rng, 64))), - ), - ( - "version".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::integer(&mut gen.rng, 1, 9))), - ), - ( - "home".to_string(), - Box::new(|gen, w| { - write!(w, "{}", gen::url(&mut gen.rng, "https", 0, 1, 0, 0, None)) - }), - ), - ( - "repository".to_string(), - Box::new(|gen, w| { - write!(w, "{}", gen::url(&mut gen.rng, "git", 1, 4, 10, 20, None)) - }), - ), - ( - "pdf".to_string(), - Box::new(|gen, w| { - write!( - w, - "{}", - gen::url(&mut gen.rng, "https", 1, 4, 10, 30, Some("pdf")) - ) - }), - ), - ]; - self.gen_object(writer, fields) - } - - /// Generate an array of authors as per [`Self::gen_author_object`]. - fn gen_authors_array( - &mut self, - writer: &mut W, - items_lo: usize, - items_hi: usize, - ) -> std::io::Result<()> { - self.gen_array(writer, items_lo, items_hi, Generator::gen_author_object) - } - - /// Generate a small object with 2 string fields. - fn gen_author_object(&mut self, writer: &mut W) -> std::io::Result<()> { - let fields: Vec<(String, Box>)> = vec![ - ( - "name".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::full_name(&mut gen.rng, 10, 15))), - ), - ( - "email".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::email(&mut gen.rng, 1, 9))), - ), - ]; - self.gen_object(writer, fields) - } - - /// Generate a YAML array/sequence containing nodes generated by the given function. - fn gen_array std::io::Result<()>>( - &mut self, - writer: &mut W, - len_lo: usize, - len_hi: usize, - mut obj_creator: F, - ) -> std::io::Result<()> { - let mut first = true; - for _ in 0..self.rng.gen_range(len_lo..len_hi) { - if first { - first = false; - } else { - self.nl(writer)?; - } - write!(writer, "- ")?; - self.push_indent(2); - (obj_creator)(self, writer)?; - self.pop_indent(); - } - Ok(()) - } - - /// Create a Yaml object with some fields in it. - fn gen_object( - &mut self, - writer: &mut W, - fields: Vec<(String, Box>)>, - ) -> std::io::Result<()> { - let mut first = true; - for (key, f) in fields { - if first { - first = false; - } else { - self.nl(writer)?; - } - write!(writer, "{key}: ")?; - f(self, writer)?; - } - Ok(()) - } - - /// Write the given lines at the right indentation. - fn write_lines( - &mut self, - writer: &mut W, - lines: &[String], - ) -> std::io::Result<()> { - let mut first = true; - - for line in lines { - if first { - first = false; - } else { - self.nl(writer)?; - } - write!(writer, "{line}")?; - } - - Ok(()) - } - - /// Write a new line to the writer and indent. - fn nl(&mut self, writer: &mut W) -> std::io::Result<()> { - writeln!(writer)?; - for _ in 0..self.indent() { - write!(writer, " ")?; - } - Ok(()) - } - - /// Return the given indent. - fn indent(&self) -> usize { - *self.indents.last().unwrap() - } - - /// Push a new indent with the given relative offset. - fn push_indent(&mut self, offset: usize) { - self.indents.push(self.indent() + offset); - } - - /// Pops the last indent. - fn pop_indent(&mut self) { - self.indents.pop(); - assert!(!self.indents.is_empty()); - } -} diff --git a/parser/tools/gen_large_yaml/src/nested.rs b/parser/tools/gen_large_yaml/src/nested.rs deleted file mode 100644 index 0f182a9..0000000 --- a/parser/tools/gen_large_yaml/src/nested.rs +++ /dev/null @@ -1,115 +0,0 @@ -use std::{cell::RefCell, rc::Rc}; - -use rand::{rngs::SmallRng, Rng, SeedableRng}; - -/// Create a deep object with the given amount of nodes. -pub fn create_deep_object( - writer: &mut W, - n_nodes: usize, -) -> std::io::Result<()> { - let mut tree = Tree::new(); - for _ in 0..n_nodes { - tree.push_node(); - } - tree.write_to(writer) -} - -/// An n-tree. -/// -/// The algorithm used to generate a potentially deep object is to create a tree, one node at a -/// time, where each node is put as a child of a random existing node in the tree. -struct Tree { - /// The tree-view of the tree. - root: Rc>, - /// Array of all the nodes in the tree, including the root node. - nodes: Vec>>, - /// The RNG state. - /// - /// We don't need to be cryptographically secure. [`SmallRng`] also implements the - /// [`SeedableRng`] trait, allowing runs to be predictable. - rng: SmallRng, -} - -/// A node in a tree. -struct Node { - /// All the children of the node. - children: Vec>>, -} - -impl Tree { - /// Create a new tree. - fn new() -> Self { - let root = Node::new_rc_refcell(); - Tree { - root: root.clone(), - nodes: vec![root], - rng: SmallRng::seed_from_u64(42), - } - } - - /// Add a new node as a child of a random node in the tree. - fn push_node(&mut self) { - let new_node = Node::new_rc_refcell(); - let n_nodes = self.nodes.len(); - // Bias the nodes towards the end so that there is more nesting. - let parent = &mut self.nodes[self.rng.gen_range((3 * n_nodes / 4)..n_nodes)]; - (**parent).borrow_mut().push_child(new_node.clone()); - self.nodes.push(new_node); - } - - /// Write the YAML representation of the tree to `writer`. - fn write_to(&self, writer: &mut W) -> std::io::Result<()> { - (*self.root).borrow().write_to(writer, 0) - } -} - -impl Node { - /// Create a new node. - fn new() -> Self { - Node { children: vec![] } - } - - fn new_rc_refcell() -> Rc> { - Rc::new(RefCell::new(Self::new())) - } - - /// Append a child to the node. - fn push_child(&mut self, child: Rc>) { - self.children.push(child); - } - - /// Write the YAML representation of the node to `writer`. - fn write_to(&self, writer: &mut W, indent: usize) -> std::io::Result<()> { - if self.children.is_empty() { - write_n(writer, ' ', indent)?; - writer.write_all(b"a: 1\n")?; - } else { - for (n, child) in self.children.iter().enumerate() { - write_n(writer, ' ', indent)?; - write_id_for_number(writer, n)?; - writer.write_all(b":\n")?; - (**child).borrow().write_to(writer, indent + 2)?; - } - } - Ok(()) - } -} - -/// Write `n` times `c` to `out`. -fn write_n(out: &mut W, c: char, n: usize) -> std::io::Result<()> { - for _ in 0..n { - write!(out, "{c}")?; - } - Ok(()) -} - -/// Create a valid identifier for the given number. -fn write_id_for_number(out: &mut W, mut n: usize) -> std::io::Result<()> { - const DIGITS: &[u8] = b"_abcdefghijklmnopqrstuvwxyz"; - n += 1; - while n > 0 { - write!(out, "{}", DIGITS[n % DIGITS.len()] as char)?; - n /= DIGITS.len(); - } - Ok(()) -}