Make gen_large_yaml reproducible.

* Use a seedable RNG so that we can have the same number sequence.
* Replace `HashMap`s with `Vec`s to avoid non-deterministic iteration.
This commit is contained in:
Ethiraric 2024-03-20 23:07:08 +01:00
parent e8415713ab
commit 869a2d1a15
4 changed files with 91 additions and 82 deletions

View file

@ -12,7 +12,7 @@ edition = "2018"
[dependencies]
yaml-rust2 = { version = "0.7.0", path = "../../" }
rand = "0.8.5"
rand = { version = "0.8.5", features = [ "small_rng" ] }
lipsum = "0.9.0"
[profile.release-lto]

View file

@ -1,15 +1,15 @@
#![allow(clippy::too_many_arguments)]
use rand::{distributions::Alphanumeric, rngs::ThreadRng, Rng};
use rand::{distributions::Alphanumeric, rngs::SmallRng, Rng};
/// Generate a string with hexadecimal digits of the specified length.
pub fn hex_string(rng: &mut ThreadRng, len: usize) -> String {
pub fn hex_string(rng: &mut SmallRng, len: usize) -> String {
    // Lowercase hexadecimal alphabet the output characters are drawn from.
    const HEX_ALPHABET: &[u8] = b"0123456789abcdef";
    // `string_from_set` picks its length from the half-open range `len_lo..len_hi`, so
    // `len..len + 1` pins the result to exactly `len` characters.
    let (len_lo, len_hi) = (len, len + 1);
    string_from_set(rng, len_lo, len_hi, HEX_ALPHABET)
}
/// Generate an e-mail address.
pub fn email(rng: &mut ThreadRng, len_lo: usize, len_hi: usize) -> String {
pub fn email(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
const CHARSET: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_.0123456789";
format!(
"{}@example.com",
@ -19,7 +19,7 @@ pub fn email(rng: &mut ThreadRng, len_lo: usize, len_hi: usize) -> String {
/// Generate a random URL.
pub fn url(
rng: &mut ThreadRng,
rng: &mut SmallRng,
scheme: &str,
n_paths_lo: usize,
n_paths_hi: usize,
@ -40,12 +40,12 @@ pub fn url(
}
/// Generate a random integer.
pub fn integer(rng: &mut ThreadRng, lo: i64, hi: i64) -> i64 {
pub fn integer(rng: &mut SmallRng, lo: i64, hi: i64) -> i64 {
    // Half-open range: `lo` is a possible result, `hi` is not. `gen_range` panics on an empty
    // range, so callers must pass `lo < hi`.
    let bounds = lo..hi;
    rng.gen_range(bounds)
}
/// Generate an alphanumeric string with a length between `lo_len` and `hi_len`.
pub fn alnum_string(rng: &mut ThreadRng, lo_len: usize, hi_len: usize) -> String {
pub fn alnum_string(rng: &mut SmallRng, lo_len: usize, hi_len: usize) -> String {
let len = rng.gen_range(lo_len..hi_len);
rng.sample_iter(&Alphanumeric)
.take(len)
@ -54,7 +54,7 @@ pub fn alnum_string(rng: &mut ThreadRng, lo_len: usize, hi_len: usize) -> String
}
/// Generate a string of characters drawn from `set`, with a length in `len_lo..len_hi`.
pub fn string_from_set(rng: &mut ThreadRng, len_lo: usize, len_hi: usize, set: &[u8]) -> String {
pub fn string_from_set(rng: &mut SmallRng, len_lo: usize, len_hi: usize, set: &[u8]) -> String {
(0..rng.gen_range(len_lo..len_hi))
.map(|_| set[rng.gen_range(0..set.len())] as char)
.collect()
@ -62,7 +62,7 @@ pub fn string_from_set(rng: &mut ThreadRng, len_lo: usize, len_hi: usize, set: &
/// Generate a lipsum paragraph.
pub fn paragraph(
rng: &mut ThreadRng,
rng: &mut SmallRng,
lines_lo: usize,
lines_hi: usize,
wps_lo: usize,
@ -99,7 +99,7 @@ pub fn paragraph(
}
/// Generate a full name.
pub fn full_name(rng: &mut ThreadRng, len_lo: usize, len_hi: usize) -> String {
pub fn full_name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
format!(
"{} {}",
name(rng, len_lo, len_hi),
@ -108,7 +108,7 @@ pub fn full_name(rng: &mut ThreadRng, len_lo: usize, len_hi: usize) -> String {
}
/// Generate a name.
pub fn name(rng: &mut ThreadRng, len_lo: usize, len_hi: usize) -> String {
pub fn name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
const UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
@ -121,7 +121,7 @@ pub fn name(rng: &mut ThreadRng, len_lo: usize, len_hi: usize) -> String {
}
/// Generate a set of words.
pub fn words(rng: &mut ThreadRng, words_lo: usize, words_hi: usize) -> String {
pub fn words(rng: &mut SmallRng, words_lo: usize, words_hi: usize) -> String {
    // Number of words, drawn from the half-open range `words_lo..words_hi`.
    let nwords = rng.gen_range(words_lo..words_hi);
    // Lend the generator to lipsum through a reborrow rather than `rng.clone()`: cloning a
    // `SmallRng` copies its state, which would leave the caller's generator un-advanced by the
    // text generation and make successive calls draw from overlapping random streams.
    let text = lipsum::lipsum_words_with_rng(&mut *rng, nwords);
    // Drop the punctuation characters `-'",*:` from the generated text (presumably because they
    // could be significant in the emitted YAML -- confirm against the callers).
    text.replace(|c| "-\'\",*:".contains(c), "")
}
@ -130,7 +130,7 @@ pub fn words(rng: &mut ThreadRng, words_lo: usize, words_hi: usize) -> String {
///
/// Texts are composed of some paragraphs and empty lines between them.
pub fn text(
rng: &mut ThreadRng,
rng: &mut SmallRng,
paragraphs_lo: usize,
paragraphs_hi: usize,
lines_lo: usize,

View file

@ -3,12 +3,11 @@
mod gen;
mod nested;
use std::collections::HashMap;
use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
use rand::{rngs::ThreadRng, Rng};
use rand::{rngs::SmallRng, Rng, SeedableRng};
/// The path into which the generated YAML files will be written.
const OUTPUT_DIR: &str = "bench_yaml";
@ -41,7 +40,10 @@ fn main() -> std::io::Result<()> {
/// YAML Generator.
struct Generator {
/// The RNG state.
rng: ThreadRng,
///
/// We don't need to be cryptographically secure. [`SmallRng`] also implements the
/// [`SeedableRng`] trait, allowing runs to be predictable.
rng: SmallRng,
/// The stack of indentations.
indents: Vec<usize>,
}
@ -52,7 +54,7 @@ impl Generator {
/// Create a new generator.
fn new() -> Self {
Generator {
rng: rand::thread_rng(),
rng: SmallRng::seed_from_u64(42),
indents: vec![0],
}
}
@ -87,58 +89,61 @@ impl Generator {
/// The `description` field is a long string and puts a lot of weight in plain scalar / block
/// scalar parsing.
fn gen_record_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
let mut fields = HashMap::<String, Box<GenFn<W>>>::new();
fields.insert(
"description".to_string(),
Box::new(|gen, w| {
write!(w, "|")?;
gen.push_indent(2);
gen.nl(w)?;
let indent = gen.indent();
let text = gen::text(&mut gen.rng, 1, 9, 3, 8, 10, 20, 80 - indent);
gen.write_lines(w, &text)?;
gen.pop_indent();
Ok(())
}),
);
fields.insert(
"authors".to_string(),
Box::new(|gen, w| {
gen.push_indent(2);
gen.nl(w)?;
gen.gen_authors_array(w, 1, 10)?;
gen.pop_indent();
Ok(())
}),
);
fields.insert(
"hash".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::hex_string(&mut gen.rng, 64))),
);
fields.insert(
"version".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::integer(&mut gen.rng, 1, 9))),
);
fields.insert(
"home".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::url(&mut gen.rng, "https", 0, 1, 0, 0, None))),
);
fields.insert(
"repository".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::url(&mut gen.rng, "git", 1, 4, 10, 20, None))),
);
fields.insert(
"pdf".to_string(),
Box::new(|gen, w| {
write!(
w,
"{}",
gen::url(&mut gen.rng, "https", 1, 4, 10, 30, Some("pdf"))
)
}),
);
let fields: Vec<(String, Box<GenFn<W>>)> = vec![
(
"description".to_string(),
Box::new(|gen, w| {
write!(w, "|")?;
gen.push_indent(2);
gen.nl(w)?;
let indent = gen.indent();
let text = gen::text(&mut gen.rng, 1, 9, 3, 8, 10, 20, 80 - indent);
gen.write_lines(w, &text)?;
gen.pop_indent();
Ok(())
}),
),
(
"authors".to_string(),
Box::new(|gen, w| {
gen.push_indent(2);
gen.nl(w)?;
gen.gen_authors_array(w, 1, 10)?;
gen.pop_indent();
Ok(())
}),
),
(
"hash".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::hex_string(&mut gen.rng, 64))),
),
(
"version".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::integer(&mut gen.rng, 1, 9))),
),
(
"home".to_string(),
Box::new(|gen, w| {
write!(w, "{}", gen::url(&mut gen.rng, "https", 0, 1, 0, 0, None))
}),
),
(
"repository".to_string(),
Box::new(|gen, w| {
write!(w, "{}", gen::url(&mut gen.rng, "git", 1, 4, 10, 20, None))
}),
),
(
"pdf".to_string(),
Box::new(|gen, w| {
write!(
w,
"{}",
gen::url(&mut gen.rng, "https", 1, 4, 10, 30, Some("pdf"))
)
}),
),
];
self.gen_object(writer, fields)
}
@ -154,15 +159,16 @@ impl Generator {
/// Generate a small object with 2 string fields.
fn gen_author_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
let mut fields = HashMap::<String, Box<GenFn<W>>>::new();
fields.insert(
"name".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::full_name(&mut gen.rng, 10, 15))),
);
fields.insert(
"email".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::email(&mut gen.rng, 1, 9))),
);
let fields: Vec<(String, Box<GenFn<W>>)> = vec![
(
"name".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::full_name(&mut gen.rng, 10, 15))),
),
(
"email".to_string(),
Box::new(|gen, w| write!(w, "{}", gen::email(&mut gen.rng, 1, 9))),
),
];
self.gen_object(writer, fields)
}
@ -193,7 +199,7 @@ impl Generator {
fn gen_object<W: std::io::Write>(
&mut self,
writer: &mut W,
fields: HashMap<String, Box<GenFn<W>>>,
fields: Vec<(String, Box<GenFn<W>>)>,
) -> std::io::Result<()> {
let mut first = true;
for (key, f) in fields {

View file

@ -1,6 +1,6 @@
use std::{cell::RefCell, rc::Rc};
use rand::{rngs::ThreadRng, Rng};
use rand::{rngs::SmallRng, Rng, SeedableRng};
/// Create a deep object with the given amount of nodes.
pub fn create_deep_object<W: std::io::Write>(
@ -24,7 +24,10 @@ struct Tree {
/// Array of all the nodes in the tree, including the root node.
nodes: Vec<Rc<RefCell<Node>>>,
/// The RNG state.
rng: ThreadRng,
///
/// We don't need to be cryptographically secure. [`SmallRng`] also implements the
/// [`SeedableRng`] trait, allowing runs to be predictable.
rng: SmallRng,
}
/// A node in a tree.
@ -40,7 +43,7 @@ impl Tree {
Tree {
root: root.clone(),
nodes: vec![root],
rng: rand::thread_rng(),
rng: SmallRng::seed_from_u64(42),
}
}