yaml-rust2 -> saphyr

This commit is contained in:
Ethiraric 2024-04-02 18:49:52 +02:00
parent d618d06061
commit 30b713d7a7
32 changed files with 69 additions and 6258 deletions

View file

@ -31,7 +31,6 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v3
- run: git submodule update --init
- run: rustup toolchain install ${{ matrix.rust }} --profile minimal --no-self-update
- uses: Swatinem/rust-cache@v2
- name: Run build

3
saphyr/.gitmodules vendored
View file

@ -1,3 +0,0 @@
[submodule "tests/yaml-test-suite"]
path = tests/yaml-test-suite
url = https://github.com/yaml/yaml-test-suite/

View file

@ -1,49 +1,34 @@
[package]
name = "yaml-rust2"
version = "0.8.0"
name = "saphyr"
version = "0.0.1"
authors = [
"Yuheng Chen <yuhengchen@sensetime.com>",
"Ethiraric <ethiraric@gmail.com>",
"David Aguilar <davvid@gmail.com>"
]
documentation = "https://docs.rs/yaml-rust2"
documentation = "https://docs.rs/saphyr"
keywords = [ "yaml", "parser" ]
categories = [ "encoding", "parser-implementations" ]
license = "MIT OR Apache-2.0"
description = "A fully YAML 1.2 compliant YAML parser"
repository = "https://github.com/Ethiraric/yaml-rust2"
description = "A fully YAML 1.2 compliant YAML library"
repository = "https://github.com/saphyr-rs/saphyr"
readme = "README.md"
edition = "2021"
rust-version = "1.70.0"
[features]
default = [ "encoding" ]
debug_prints = []
encoding = [ "dep:encoding_rs" ]
[dependencies]
arraydeque = "0.5.1"
saphyr-parser = "0.0.1"
encoding_rs = { version = "0.8.33", optional = true }
hashlink = "0.8"
[dev-dependencies]
libtest-mimic = "0.3.0"
quickcheck = "1.0"
[profile.release-lto]
inherits = "release"
lto = true
[[test]]
name = "yaml-test-suite"
harness = false
[[bin]]
name = "dump_events"
path = "tools/dump_events.rs"
[[bin]]
name = "time_parse"
path = "tools/time_parse.rs"
[[bin]]
name = "run_bench"
path = "tools/run_bench.rs"

View file

@ -1,29 +1,34 @@
# yaml-rust2
# saphyr
[yaml-rust2](https://github.com/Ethiraric/yaml-rust2) is a fully compliant YAML 1.2
implementation written in pure Rust.
[saphyr](https://github.com/saphyr-rs/saphyr) is a fully compliant YAML 1.2
library written in pure Rust.
This work is based on [`yaml-rust`](https://github.com/chyh1990/yaml-rust) with
fixes towards being compliant to the [YAML test
suite](https://github.com/yaml/yaml-test-suite/). `yaml-rust`'s parser is
heavily influenced by `libyaml` and `yaml-cpp`.
`yaml-rust2` is a pure Rust YAML 1.2 implementation that benefits from the
`saphyr` is a pure Rust YAML 1.2 implementation that benefits from the
memory safety and other benefits from the Rust language.
## Quick Start
Add the following to the Cargo.toml of your project:
### Installing
Add the following to your Cargo.toml:
```toml
[dependencies]
yaml-rust2 = "0.8"
saphyr = "0.0.1"
```
or use `cargo add` to get the latest version automatically:
```sh
cargo add saphyr
```
Use `yaml_rust2::YamlLoader` to load YAML documents and access them as `Yaml` objects:
### Example
Use `saphyr::YamlLoader` to load YAML documents and access them as `Yaml` objects:
```rust
use yaml_rust2::{YamlLoader, YamlEmitter};
use saphyr::{YamlLoader, YamlEmitter};
fn main() {
let s =
@ -61,7 +66,7 @@ bar:
}
```
Note that `yaml_rust2::Yaml` implements `Index<&'a str>` and `Index<usize>`:
Note that `saphyr::Yaml` implements `Index<&'a str>` and `Index<usize>`:
* `Index<usize>` assumes the container is an array
* `Index<&'a str>` assumes the container is a string to value map
@ -75,7 +80,6 @@ your objects.
* Pure Rust
* `Vec`/`HashMap` access API
* Low-level YAML events emission
## Security
@ -85,24 +89,10 @@ communicating with the outside world just by parsing a YAML document.
## Specification Compliance
This implementation is fully compatible with the YAML 1.2 specification. In
order to help with compliance, `yaml-rust2` tests against (and passes) the [YAML
test suite](https://github.com/yaml/yaml-test-suite/).
## Upgrading from yaml-rust
You can use `yaml-rust2` as a drop-in replacement for the original `yaml-rust` crate.
```toml
[dependencies]
yaml-rust = { version = "#.#", package = "yaml-rust2" }
```
This `Cargo.toml` declaration allows you to refer to this crate as `yaml_rust` in your code.
```rust
use yaml_rust::{YamlLoader, YamlEmitter};
```
This implementation is fully compatible with the YAML 1.2 specification. The
parser behind this library
([`saphyr-parser`](https://github.com/saphyr-rs/saphyr-parser)) tests against
(and passes) the [YAML test suite](https://github.com/yaml/yaml-test-suite/).
## License
@ -122,10 +112,9 @@ You can find licences in the [`.licenses`](.licenses) subfolder.
## Contribution
[Fork this repository](https://github.com/Ethiraric/yaml-rust2/fork) and
[Create a Pull Request on Github](https://github.com/Ethiraric/yaml-rust2/compare/master...Ethiraric:yaml-rust2:master).
[Fork this repository](https://github.com/saphyr-rs/saphyr/fork) and
[Create a Pull Request on Github](https://github.com/saphyr-rs/saphyr/compare/master...saphyr-rs:saphyr:master).
You may need to click on "compare across forks" and select your fork's branch.
Make sure that `Ethiraric` is selected as the base repository, not `chyh1990`.
Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall
@ -133,10 +122,12 @@ be dual licensed as above, without any additional terms or conditions.
## Links
* [yaml-rust2 source code repository](https://github.com/Ethiraric/yaml-rust2)
* [saphyr source code repository](https://github.com/saphyr-rs/saphyr)
* [yaml-rust2 releases on crates.io](https://crates.io/crates/yaml-rust2)
* [saphyr releases on crates.io](https://crates.io/crates/saphyr)
* [yaml-rust2 documentation on docs.rs](https://docs.rs/yaml-rust2/latest/yaml_rust2/)
* [saphyr documentation on docs.rs](https://docs.rs/saphyr/latest/saphyr/)
* [saphyr-parser releases on crates.io](https://crates.io/crates/saphyr-parser)
* [yaml-test-suite](https://github.com/yaml/yaml-test-suite)

View file

@ -1,7 +1,7 @@
use saphyr::yaml;
use std::env;
use std::fs::File;
use std::io::prelude::*;
use yaml_rust2::yaml;
fn print_indent(indent: usize) {
for _ in 0..indent {

View file

@ -1,4 +1,5 @@
before_commit:
cargo fmt --check
cargo clippy --release --all-targets -- -D warnings
cargo clippy --all-targets -- -D warnings
cargo build --release --all-targets
@ -6,12 +7,4 @@ before_commit:
cargo test
cargo test --release
cargo test --doc
cargo build --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml
RUSTDOCFLAGS="-D warnings" cargo doc --all-features
ethi_bench:
cargo build --release --all-targets
cd ../Yaml-rust && cargo build --release --all-targets
cd ../serde-yaml/ && cargo build --release --all-targets
cd ../libfyaml/build && ninja
cargo bench_compare run_bench

View file

@ -1,115 +1,5 @@
//! Holds functions to determine if a character belongs to a specific character set.
/// Return `true` if `c` is the NUL character (`\0`).
#[inline]
pub(crate) fn is_z(c: char) -> bool {
    matches!(c, '\0')
}
/// Return `true` if `c` is a line break (LF `\n` or CR `\r`).
#[inline]
pub(crate) fn is_break(c: char) -> bool {
    matches!(c, '\n' | '\r')
}
/// Return `true` if `c` ends a line or the input: `\n`, `\r` or `\0`.
#[inline]
pub(crate) fn is_breakz(c: char) -> bool {
    // Inlined union of `is_break` and `is_z`.
    matches!(c, '\n' | '\r' | '\0')
}
/// Return `true` if `c` is YAML whitespace: a space or a tab.
#[inline]
pub(crate) fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
/// Return `true` if `c` is whitespace, a line break or NUL.
///
/// Covers `\0`, ` `, `\t`, `\n`, `\r`.
#[inline]
pub(crate) fn is_blank_or_breakz(c: char) -> bool {
    // Inlined union of `is_blank` and `is_breakz`.
    matches!(c, ' ' | '\t' | '\n' | '\r' | '\0')
}
/// Return `true` if `c` is an ASCII decimal digit (`0-9`).
#[inline]
pub(crate) fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
/// Return `true` if `c` is an ASCII letter, digit, `_` or `-`.
#[inline]
pub(crate) fn is_alpha(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '-'
}
/// Check whether the character is a hexadecimal character (case insensitive).
///
/// Equivalent to `[0-9a-fA-F]`; delegates to the standard library instead of
/// hand-rolling the three range checks.
#[inline]
pub(crate) fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
/// Convert the hexadecimal digit to an integer.
///
/// # Panics
/// Panics if `c` is not a hexadecimal digit (`[0-9a-fA-F]`), matching the
/// original `unreachable!()` behavior.
#[inline]
pub(crate) fn as_hex(c: char) -> u32 {
    // `to_digit(16)` accepts exactly the characters `is_hex` accepts and
    // performs the same case-insensitive conversion as the manual match did.
    c.to_digit(16).expect("as_hex called on a non-hex character")
}
/// Return `true` if `c` is a YAML flow indicator (one of `,[]{}`).
#[inline]
pub(crate) fn is_flow(c: char) -> bool {
    ",[]{}".contains(c)
}
/// Return `true` if `c` is the byte-order-mark character (`U+FEFF`).
#[inline]
pub(crate) fn is_bom(c: char) -> bool {
    matches!(c, '\u{FEFF}')
}
/// Check whether the character is a YAML non-breaking character.
#[inline]
pub(crate) fn is_yaml_non_break(c: char) -> bool {
    // TODO(ethiraric, 28/12/2023): is_printable
    // Inlined: neither a line break nor the BOM.
    !matches!(c, '\n' | '\r' | '\u{FEFF}')
}
/// Check whether the character is NOT a YAML whitespace (` ` / `\t`),
/// line break or BOM.
#[inline]
pub(crate) fn is_yaml_non_space(c: char) -> bool {
    // Inlined: `is_yaml_non_break` minus blanks.
    !matches!(c, '\n' | '\r' | '\u{FEFF}' | ' ' | '\t')
}
/// Check whether the character is a valid YAML anchor name character.
///
/// Anchor names exclude whitespace, breaks, the BOM, flow indicators and NUL.
#[inline]
pub(crate) fn is_anchor_char(c: char) -> bool {
    !matches!(
        c,
        '\n' | '\r' | '\u{FEFF}' | ' ' | '\t' | ',' | '[' | ']' | '{' | '}' | '\0'
    )
}
/// Check whether the character is a valid word character.
///
/// Word characters are ASCII alphanumerics plus `-` (the `_` accepted by
/// `is_alpha` is deliberately excluded).
#[inline]
pub(crate) fn is_word_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '-'
}
/// Check whether the character is a valid URI character.
#[inline]
pub(crate) fn is_uri_char(c: char) -> bool {
    // Inlined: word characters (alphanumeric or `-`) plus the URI special set.
    c.is_ascii_alphanumeric() || "-#;/?:@&=+$,_.!~*'()[]%".contains(c)
}
/// Check whether the character is a valid tag character.
#[inline]
pub(crate) fn is_tag_char(c: char) -> bool {
    // Inlined: URI characters minus the flow indicators (`,[]{}`) and `!`.
    c.is_ascii_alphanumeric() || "-#;/?:@&=+$_.~*'()%".contains(c)
}
/// Check if the string can be expressed a valid literal block scalar.
/// The YAML spec supports all of the following in block literals except `#xFEFF`:
/// ```no_compile

View file

@ -1,41 +0,0 @@
//! Debugging helpers.
//!
//! Debugging is governed by two conditions:
//! 1. The build mode. Debugging code is not emitted in release builds and thus not available.
//! 2. The `YAMLRUST2_DEBUG` environment variable. If built in debug mode, the program must be fed
//! the `YAMLRUST2_DEBUG` variable in its environment. While debugging code is present in debug
//! build, debug helpers will only trigger if that variable is set when running the program.
// If a debug build, use stuff in the debug submodule.
#[cfg(feature = "debug_prints")]
pub use debug::enabled;
// Otherwise, just export dummies for publicly visible functions.
/// Evaluates to nothing.
///
/// No-op stand-in emitted when the `debug_prints` feature is disabled, so call
/// sites can use `debug_print!` unconditionally.
#[cfg(not(feature = "debug_prints"))]
macro_rules! debug_print {
    ($($arg:tt)*) => {{}};
}
// Real debugging helpers, only compiled in when the `debug_prints` feature is on.
#[cfg(feature = "debug_prints")]
#[macro_use]
#[allow(clippy::module_inception)]
mod debug {
    use std::sync::OnceLock;
    /// If debugging is [`enabled`], print the format string on the error output.
    macro_rules! debug_print {
        ($($arg:tt)*) => {{
            if $crate::debug::enabled() {
                eprintln!($($arg)*)
            }
        }};
    }
    /// Return whether debugging features are enabled in this execution.
    ///
    /// Lazily reads the `YAMLRUST2_DEBUG` environment variable once (any value
    /// enables debugging) and caches the result for the process lifetime.
    #[cfg(debug_assertions)]
    pub fn enabled() -> bool {
        static ENABLED: OnceLock<bool> = OnceLock::new();
        *ENABLED.get_or_init(|| std::env::var("YAMLRUST2_DEBUG").is_ok())
    }
}

View file

@ -36,7 +36,7 @@ impl From<fmt::Error> for EmitError {
/// The YAML serializer.
///
/// ```
/// # use yaml_rust2::{YamlLoader, YamlEmitter};
/// # use saphyr::{YamlLoader, YamlEmitter};
/// let input_string = "a: b\nc: d";
/// let yaml = YamlLoader::load_from_str(input_string).unwrap();
///
@ -142,6 +142,8 @@ impl<'a> YamlEmitter<'a> {
/// In this form, blocks cannot have any properties (such as anchors
/// or tags), which should be OK, because this emitter doesn't
/// (currently) emit those anyways.
///
/// TODO(ethiraric, 2024/04/02): We can support those now.
pub fn compact(&mut self, compact: bool) {
self.compact = compact;
}
@ -157,7 +159,7 @@ impl<'a> YamlEmitter<'a> {
/// # Examples
///
/// ```rust
/// use yaml_rust2::{Yaml, YamlEmitter, YamlLoader};
/// use saphyr::{Yaml, YamlEmitter, YamlLoader};
///
/// let input = r#"{foo: "bar!\nbar!", baz: 42}"#;
/// let parsed = YamlLoader::load_from_str(input).unwrap();

View file

@ -6,19 +6,22 @@
//!
//! # Usage
//!
//! This crate is [on github](https://github.com/Ethiraric/yaml-rust2) and can be used by adding
//! `yaml-rust2` to the dependencies in your project's `Cargo.toml`.
//!
//! This crate is [on github](https://github.com/saphyr-rs/saphyr) and can be used by adding
//! `saphyr` to the dependencies in your project's `Cargo.toml`.
//! ```toml
//! [dependencies]
//! yaml-rust2 = "0.8.0"
//! saphyr = "0.0.1"
//! ```
//! or by using `cargo add` to get the latest version:
//! ```sh
//! cargo add saphyr
//! ```
//!
//! # Examples
//! Parse a string into `Vec<Yaml>` and then serialize it as a YAML string.
//!
//! ```
//! use yaml_rust2::{YamlLoader, YamlEmitter};
//! use saphyr::{YamlLoader, YamlEmitter};
//!
//! let docs = YamlLoader::load_from_str("[1, 2, 3]").unwrap();
//! let doc = &docs[0]; // select the first YAML document
@ -37,28 +40,20 @@
//! Enables encoding-aware decoding of Yaml documents.
//!
//! The MSRV for this feature is `1.70.0`.
//!
//! #### `debug_prints`
//! Enables the `debug` module and usage of debug prints in the scanner and the parser. Do not
//! enable if you are consuming the crate rather than working on it as this can significantly
//! decrease performance.
//!
//! The MSRV for this feature is `1.70.0`.
#![warn(missing_docs, clippy::pedantic)]
extern crate hashlink;
pub(crate) mod char_traits;
#[macro_use]
pub(crate) mod debug;
pub mod emitter;
pub mod parser;
pub mod scanner;
pub mod yaml;
// reexport key APIs
pub use crate::emitter::{EmitError, YamlEmitter};
pub use crate::parser::Event;
pub use crate::scanner::ScanError;
pub use crate::yaml::{Yaml, YamlLoader};
// Re-export main components.
pub use crate::emitter::YamlEmitter;
pub use crate::yaml::{Array, Hash, Yaml, YamlLoader};
#[cfg(feature = "encoding")]
pub use crate::yaml::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder};
// Re-export `ScanError` as it is used as part of our public API and we want consumers to be able
// to inspect it (e.g. perform a `match`). They wouldn't be able without it.
pub use saphyr_parser::ScanError;

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -10,8 +10,7 @@ use std::{collections::BTreeMap, convert::TryFrom, mem, ops::Index, ops::IndexMu
use encoding_rs::{Decoder, DecoderResult, Encoding};
use hashlink::LinkedHashMap;
use crate::parser::{Event, MarkedEventReceiver, Parser, Tag};
use crate::scanner::{Marker, ScanError, TScalarStyle};
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser, ScanError, TScalarStyle, Tag};
/// A YAML node is stored as this `Yaml` enumeration, which provides an easy way to
/// access your YAML document.
@ -19,7 +18,7 @@ use crate::scanner::{Marker, ScanError, TScalarStyle};
/// # Examples
///
/// ```
/// use yaml_rust2::Yaml;
/// use saphyr::Yaml;
/// let foo = Yaml::from_str("-123"); // convert the string to the appropriate YAML type
/// assert_eq!(foo.as_i64().unwrap(), -123);
///
@ -306,7 +305,7 @@ pub enum YAMLDecodingTrap {
/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the
/// `encoding_trap` to `encoding::DecoderTrap::Ignore`.
/// ```rust
/// use yaml_rust2::yaml::{YamlDecoder, YAMLDecodingTrap};
/// use saphyr::{YamlDecoder, YAMLDecodingTrap};
///
/// let string = b"---
/// a\xa9: 1
@ -580,7 +579,7 @@ impl Yaml {
/// replace it with a given value `other`. Otherwise, return self unchanged.
///
/// ```
/// use yaml_rust2::yaml::Yaml;
/// use saphyr::Yaml;
///
/// assert_eq!(Yaml::BadValue.or(Yaml::Integer(3)), Yaml::Integer(3));
/// assert_eq!(Yaml::Integer(3).or(Yaml::BadValue), Yaml::Integer(3));
@ -613,7 +612,7 @@ impl Yaml {
///
/// # Examples
/// ```
/// # use yaml_rust2::yaml::Yaml;
/// # use saphyr::Yaml;
/// assert!(matches!(Yaml::from_str("42"), Yaml::Integer(42)));
/// assert!(matches!(Yaml::from_str("0x2A"), Yaml::Integer(42)));
/// assert!(matches!(Yaml::from_str("0o52"), Yaml::Integer(42)));

View file

@ -1,8 +1,7 @@
#![allow(clippy::bool_assert_comparison)]
#![allow(clippy::float_cmp)]
use std::vec;
use yaml_rust2::{Yaml, YamlEmitter, YamlLoader};
use saphyr::{Yaml, YamlEmitter, YamlLoader};
#[test]
fn test_api() {
@ -44,27 +43,6 @@ fn test_api() {
assert!(!writer.is_empty());
}
#[test]
fn test_fail() {
    // The loader must reject this document and report the scanner error,
    // including its exact position when rendered with `to_string`.
    let s = "
# syntax error
scalar
key: [1, 2]]
key1:a2
";
    let Err(error) = YamlLoader::load_from_str(s) else {
        panic!()
    };
    assert_eq!(
        error.info(),
        "mapping values are not allowed in this context"
    );
    assert_eq!(
        error.to_string(),
        "mapping values are not allowed in this context at byte 26 line 4 column 4"
    );
}
#[test]
fn test_coerce() {
let s = "---
@ -80,51 +58,6 @@ c: [1, 2]
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_empty_doc() {
    // An empty input must load without error; a lone `---` yields a null document.
    let s: String = String::new();
    YamlLoader::load_from_str(&s).unwrap();
    let s: String = "---".to_owned();
    assert_eq!(YamlLoader::load_from_str(&s).unwrap()[0], Yaml::Null);
}
#[test]
fn test_parser() {
    // Smoke test: a document exercising comments, nested mappings, flow and
    // block sequences, quoting styles and a non-ASCII value.
    //
    // NOTE(review): in this copy `a7:` had lost its value, yet the assertion
    // below requires `a7` to hold "你好" — the value is restored here. The
    // indentation inside this literal may also have been flattened by
    // extraction; verify against upstream.
    let s: String = "
# comment
a0 bb: val
a1:
b1: 4
b2: d
a2: 4 # i'm comment
a3: [1, 2, 3]
a4:
- - a1
- a2
- 2
a5: 'single_quoted'
a6: \"double_quoted\"
a7: 你好
"
    .to_owned();
    let out = YamlLoader::load_from_str(&s).unwrap();
    let doc = &out[0];
    assert_eq!(doc["a7"].as_str().unwrap(), "你好");
}
#[test]
fn test_multi_doc() {
    // Three documents separated by `---` must load as three `Yaml` values.
    let s = "
'a scalar'
---
'a scalar'
---
'a scalar'
";
    let out = YamlLoader::load_from_str(s).unwrap();
    assert_eq!(out.len(), 3);
}
#[test]
fn test_anchor() {
let s = "
@ -150,15 +83,6 @@ a1: &DEFAULT
assert_eq!(doc["a1"]["b2"], Yaml::BadValue);
}
#[test]
fn test_github_27() {
    // https://github.com/chyh1990/yaml-rust/issues/27
    // A lone anchor (`&a`) must parse as an anchored empty scalar, not an error.
    let s = "&a";
    let out = YamlLoader::load_from_str(s).unwrap();
    let doc = &out[0];
    assert_eq!(doc.as_str().unwrap(), "");
}
#[test]
fn test_plain_datatype() {
let s = "
@ -223,45 +147,6 @@ fn test_plain_datatype() {
assert!(!doc[25][1].as_bool().unwrap());
}
#[test]
fn test_bad_hyphen() {
    // See: https://github.com/chyh1990/yaml-rust/issues/23
    // An unterminated flow mapping followed by `-` must be rejected.
    let s = "{-";
    assert!(YamlLoader::load_from_str(s).is_err());
}
#[test]
fn test_issue_65() {
    // See: https://github.com/chyh1990/yaml-rust/issues/65
    let b = "\n\"ll\\\"ll\\\r\n\"ll\\\"ll\\\r\r\r\rU\r\r\rU";
    assert!(YamlLoader::load_from_str(b).is_err());
}
#[test]
fn test_issue_65_mwe() {
    // A MWE for `test_issue_65`. The error over there is that there is invalid trailing content
    // after a double quoted string.
    let b = r#""foo" l"#;
    assert!(YamlLoader::load_from_str(b).is_err());
}
#[test]
fn test_bad_docstart() {
    // `---` immediately followed by content must still load successfully, and
    // `----` variants must parse as plain string scalars.
    assert!(YamlLoader::load_from_str("---This used to cause an infinite loop").is_ok());
    assert_eq!(
        YamlLoader::load_from_str("----"),
        Ok(vec![Yaml::String(String::from("----"))])
    );
    assert_eq!(
        YamlLoader::load_from_str("--- #here goes a comment"),
        Ok(vec![Yaml::Null])
    );
    assert_eq!(
        YamlLoader::load_from_str("---- #here goes a comment"),
        Ok(vec![Yaml::String(String::from("----"))])
    );
}
#[test]
fn test_plain_datatype_with_into_methods() {
let s = "
@ -348,95 +233,3 @@ fn test_integer_key() {
let first = out.into_iter().next().unwrap();
assert_eq!(first[0]["important"].as_bool().unwrap(), true);
}
#[test]
fn test_indentation_equality() {
    // The same document indented with different widths must parse identically.
    //
    // NOTE(review): the indentation inside these raw strings appears to have
    // been lost during extraction (all four literals were identical, making
    // the test vacuous). It is reconstructed here from the variable names
    // (four/two/one/mixed spaces) — verify against upstream.
    let four_spaces = YamlLoader::load_from_str(
        r"
hash:
    with:
        indentations
",
    )
    .unwrap()
    .into_iter()
    .next()
    .unwrap();
    let two_spaces = YamlLoader::load_from_str(
        r"
hash:
  with:
    indentations
",
    )
    .unwrap()
    .into_iter()
    .next()
    .unwrap();
    let one_space = YamlLoader::load_from_str(
        r"
hash:
 with:
  indentations
",
    )
    .unwrap()
    .into_iter()
    .next()
    .unwrap();
    let mixed_spaces = YamlLoader::load_from_str(
        r"
hash:
     with:
               indentations
",
    )
    .unwrap()
    .into_iter()
    .next()
    .unwrap();
    assert_eq!(four_spaces, two_spaces);
    assert_eq!(two_spaces, one_space);
    assert_eq!(four_spaces, mixed_spaces);
}
#[test]
fn test_two_space_indentations() {
    // https://github.com/kbknapp/clap-rs/issues/965
    //
    // NOTE(review): the indentation inside this literal appears to have been
    // lost during extraction; it is reconstructed here so that `about` is a
    // sibling of `server` in the first entry (hence `server` is null) and
    // nested under `server` in the other two (hence a hash) — matching the
    // assertions below. Verify against upstream.
    let s = r"
subcommands:
  - server:
    about: server related commands
subcommands2:
  - server:
      about: server related commands
subcommands3:
 - server:
     about: server related commands
";
    let out = YamlLoader::load_from_str(s).unwrap();
    let doc = &out.into_iter().next().unwrap();
    println!("{doc:#?}");
    assert_eq!(doc["subcommands"][0]["server"], Yaml::Null);
    assert!(doc["subcommands2"][0]["server"].as_hash().is_some());
    assert!(doc["subcommands3"][0]["server"].as_hash().is_some());
}
#[test]
fn test_recursion_depth_check_objects() {
    // Deeply nested mappings must be rejected (recursion-depth check) instead
    // of being accepted or crashing.
    let s = "{a:".repeat(10_000) + &"}".repeat(10_000);
    assert!(YamlLoader::load_from_str(&s).is_err());
}
#[test]
fn test_recursion_depth_check_arrays() {
    // Same check for deeply nested sequences.
    let s = "[".repeat(10_000) + &"]".repeat(10_000);
    assert!(YamlLoader::load_from_str(&s).is_err());
}

View file

@ -1,4 +1,4 @@
use yaml_rust2::{YamlEmitter, YamlLoader};
use saphyr::{YamlEmitter, YamlLoader};
#[allow(clippy::similar_names)]
#[test]

View file

@ -1,9 +1,9 @@
extern crate yaml_rust2;
#[macro_use]
extern crate quickcheck;
use quickcheck::TestResult;
use yaml_rust2::{Yaml, YamlEmitter, YamlLoader};
use saphyr::{Yaml, YamlEmitter, YamlLoader};
quickcheck! {
fn test_check_weird_keys(xs: Vec<String>) -> TestResult {

View file

@ -1,440 +0,0 @@
#![allow(clippy::enum_glob_use)]
use yaml_rust2::{scanner::TokenType::*, scanner::*};
/// Assert that the next token produced by scanner `$p` matches pattern `$tk`.
macro_rules! next {
    ($p:ident, $tk:pat) => {{
        let tok = $p.next().unwrap();
        match tok.1 {
            $tk => {}
            _ => panic!("unexpected token: {:?}", tok),
        }
    }};
}
/// Assert that the next token is a scalar with style `$tk` and value `$v`.
macro_rules! next_scalar {
    ($p:ident, $tk:expr, $v:expr) => {{
        let tok = $p.next().unwrap();
        match tok.1 {
            Scalar(style, ref v) => {
                assert_eq!(style, $tk);
                assert_eq!(v, $v);
            }
            _ => panic!("unexpected token: {:?}", tok),
        }
    }};
}
/// Assert that scanner `$p` has no more tokens.
macro_rules! end {
    ($p:ident) => {{
        assert_eq!($p.next(), None);
    }};
}
/// test cases in libyaml scanner.c
#[test]
fn test_empty() {
    // An empty input produces only the stream delimiters.
    let s = "";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_scalar() {
    // A bare phrase scans as a single plain scalar.
    let s = "a scalar";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, Scalar(TScalarStyle::Plain, _));
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_explicit_scalar() {
    // Explicit `---` / `...` markers emit DocumentStart and DocumentEnd.
    let s = "---
'a scalar'
...
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, DocumentStart);
    next!(p, Scalar(TScalarStyle::SingleQuoted, _));
    next!(p, DocumentEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_multiple_documents() {
    // `---` between documents emits DocumentStart for each following document.
    let s = "
'a scalar'
---
'a scalar'
---
'a scalar'
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, Scalar(TScalarStyle::SingleQuoted, _));
    next!(p, DocumentStart);
    next!(p, Scalar(TScalarStyle::SingleQuoted, _));
    next!(p, DocumentStart);
    next!(p, Scalar(TScalarStyle::SingleQuoted, _));
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_a_flow_sequence() {
    // Flow sequences emit FlowSequenceStart/End with FlowEntry separators.
    let s = "[item 1, item 2, item 3]";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, FlowSequenceStart);
    next_scalar!(p, TScalarStyle::Plain, "item 1");
    next!(p, FlowEntry);
    next!(p, Scalar(TScalarStyle::Plain, _));
    next!(p, FlowEntry);
    next!(p, Scalar(TScalarStyle::Plain, _));
    next!(p, FlowSequenceEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_a_flow_mapping() {
    let s = "
{
a simple key: a value, # Note that the KEY token is produced.
? a complex key: another value,
}
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, FlowMappingStart);
    next!(p, Key);
    next!(p, Scalar(TScalarStyle::Plain, _));
    next!(p, Value);
    next!(p, Scalar(TScalarStyle::Plain, _));
    next!(p, FlowEntry);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "a complex key");
    next!(p, Value);
    next!(p, Scalar(TScalarStyle::Plain, _));
    next!(p, FlowEntry);
    next!(p, FlowMappingEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_block_sequences() {
    // NOTE(review): the indentation inside this literal appears to have been
    // lost during extraction; it is restored here to match the asserted token
    // sequence (a nested sequence, then a nested mapping). Verify upstream.
    let s = "
- item 1
- item 2
-
  - item 3.1
  - item 3.2
-
  key 1: value 1
  key 2: value 2
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 1");
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 2");
    next!(p, BlockEntry);
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 3.1");
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 3.2");
    next!(p, BlockEnd);
    next!(p, BlockEntry);
    next!(p, BlockMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key 1");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "value 1");
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key 2");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "value 2");
    next!(p, BlockEnd);
    next!(p, BlockEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_block_mappings() {
    // NOTE(review): the indentation inside this literal appears to have been
    // lost during extraction; it is restored here so the nested mapping and
    // nested sequence match the asserted token stream. Verify upstream.
    let s = "
a simple key: a value # The KEY token is produced here.
? a complex key
: another value
a mapping:
  key 1: value 1
  key 2: value 2
a sequence:
  - item 1
  - item 2
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, BlockMappingStart);
    next!(p, Key);
    next!(p, Scalar(_, _));
    next!(p, Value);
    next!(p, Scalar(_, _));
    next!(p, Key);
    next!(p, Scalar(_, _));
    next!(p, Value);
    next!(p, Scalar(_, _));
    next!(p, Key);
    next!(p, Scalar(_, _));
    next!(p, Value); // libyaml comment seems to be wrong
    next!(p, BlockMappingStart);
    next!(p, Key);
    next!(p, Scalar(_, _));
    next!(p, Value);
    next!(p, Scalar(_, _));
    next!(p, Key);
    next!(p, Scalar(_, _));
    next!(p, Value);
    next!(p, Scalar(_, _));
    next!(p, BlockEnd);
    next!(p, Key);
    next!(p, Scalar(_, _));
    next!(p, Value);
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next!(p, Scalar(_, _));
    next!(p, BlockEntry);
    next!(p, Scalar(_, _));
    next!(p, BlockEnd);
    next!(p, BlockEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_no_block_sequence_start() {
    // Sequence entries at the same indent as the key do NOT emit
    // BlockSequenceStart — only BlockEntry tokens.
    let s = "
key:
- item 1
- item 2
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, BlockMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key");
    next!(p, Value);
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 1");
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 2");
    next!(p, BlockEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_collections_in_sequence() {
    // NOTE(review): the continuation-line indentation inside this literal
    // appears to have been lost during extraction; restored to match the
    // asserted token stream. Verify upstream.
    let s = "
- - item 1
  - item 2
- key 1: value 1
  key 2: value 2
- ? complex key
  : complex value
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 1");
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 2");
    next!(p, BlockEnd);
    next!(p, BlockEntry);
    next!(p, BlockMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key 1");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "value 1");
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key 2");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "value 2");
    next!(p, BlockEnd);
    next!(p, BlockEntry);
    next!(p, BlockMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "complex key");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "complex value");
    next!(p, BlockEnd);
    next!(p, BlockEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_collections_in_mapping() {
    // NOTE(review): the continuation-line indentation inside this literal
    // appears to have been lost during extraction; restored to match the
    // asserted token stream. Verify upstream.
    let s = "
? a sequence
: - item 1
  - item 2
? a mapping
: key 1: value 1
  key 2: value 2
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, BlockMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "a sequence");
    next!(p, Value);
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 1");
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "item 2");
    next!(p, BlockEnd);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "a mapping");
    next!(p, Value);
    next!(p, BlockMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key 1");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "value 1");
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "key 2");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "value 2");
    next!(p, BlockEnd);
    next!(p, BlockEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_spec_ex7_3() {
    // YAML spec example 7.3: omitted values and omitted keys in a flow mapping.
    let s = "
{
? foo :,
: bar,
}
";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, FlowMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "foo");
    next!(p, Value);
    next!(p, FlowEntry);
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "bar");
    next!(p, FlowEntry);
    next!(p, FlowMappingEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_plain_scalar_starting_with_indicators_in_flow() {
    // "Plain scalars must not begin with most indicators, as this would cause ambiguity with
    // other YAML constructs. However, the “:”, “?” and “-” indicators may be used as the first
    // character if followed by a non-space “safe” character, as this causes no ambiguity."
    let s = "{a: :b}";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, FlowMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "a");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, ":b");
    next!(p, FlowMappingEnd);
    next!(p, StreamEnd);
    end!(p);
    let s = "{a: ?b}";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, FlowMappingStart);
    next!(p, Key);
    next_scalar!(p, TScalarStyle::Plain, "a");
    next!(p, Value);
    next_scalar!(p, TScalarStyle::Plain, "?b");
    next!(p, FlowMappingEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_plain_scalar_starting_with_indicators_in_block() {
    // Leading `:` / `?` followed by a safe character is a plain scalar in block
    // context too.
    let s = ":a";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next_scalar!(p, TScalarStyle::Plain, ":a");
    next!(p, StreamEnd);
    end!(p);
    let s = "?a";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next_scalar!(p, TScalarStyle::Plain, "?a");
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_plain_scalar_containing_indicators_in_block() {
    // `:` and `,` inside a plain scalar do not terminate it in block context.
    let s = "a:,b";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next_scalar!(p, TScalarStyle::Plain, "a:,b");
    next!(p, StreamEnd);
    end!(p);
    let s = ":,b";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next_scalar!(p, TScalarStyle::Plain, ":,b");
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_scanner_cr() {
    // CRLF line endings must scan exactly like LF.
    let s = "---\r\n- tok1\r\n- tok2";
    let mut p = Scanner::new(s.chars());
    next!(p, StreamStart(..));
    next!(p, DocumentStart);
    next!(p, BlockSequenceStart);
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "tok1");
    next!(p, BlockEntry);
    next_scalar!(p, TScalarStyle::Plain, "tok2");
    next!(p, BlockEnd);
    next!(p, StreamEnd);
    end!(p);
}
#[test]
fn test_uri() {
    // TODO
}
#[test]
fn test_uri_escapes() {
    // TODO
}

View file

@ -1,84 +1,7 @@
#![allow(dead_code)]
#![allow(non_upper_case_globals)]
extern crate yaml_rust2;
use yaml_rust2::parser::{Event, EventReceiver, Parser};
use yaml_rust2::scanner::TScalarStyle;
// These names match the names used in the C++ test suite.
// Simplified event vocabulary used to compare parser output against the C++
// suite's expectations. NOTE: `PartialOrd` is derived, so the variant order is
// significant — do not reorder.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
#[derive(Clone, PartialEq, PartialOrd, Debug)]
enum TestEvent {
    OnDocumentStart,
    OnDocumentEnd,
    OnSequenceStart,
    OnSequenceEnd,
    OnMapStart,
    OnMapEnd,
    OnScalar,
    OnAlias,
    OnNull,
}
/// Event receiver that records the translated [`TestEvent`]s it observes.
struct YamlChecker {
    // Events observed so far, in parse order.
    pub evs: Vec<TestEvent>,
}
impl EventReceiver for YamlChecker {
    /// Translate a parser event into the C++-suite vocabulary and record it.
    /// Events with no counterpart in the suite are silently dropped.
    fn on_event(&mut self, ev: Event) {
        let translated = match ev {
            Event::DocumentStart => Some(TestEvent::OnDocumentStart),
            Event::DocumentEnd => Some(TestEvent::OnDocumentEnd),
            Event::SequenceStart(..) => Some(TestEvent::OnSequenceStart),
            Event::SequenceEnd => Some(TestEvent::OnSequenceEnd),
            Event::MappingStart(..) => Some(TestEvent::OnMapStart),
            Event::MappingEnd => Some(TestEvent::OnMapEnd),
            Event::Scalar(ref v, style, _, _) => {
                // A plain `~` is YAML's null; anything else is a regular scalar.
                Some(if v == "~" && style == TScalarStyle::Plain {
                    TestEvent::OnNull
                } else {
                    TestEvent::OnScalar
                })
            }
            Event::Alias(_) => Some(TestEvent::OnAlias),
            _ => None, // ignore other events
        };
        if let Some(tev) = translated {
            self.evs.push(tev);
        }
    }
}
/// Parse `docs` and return the sequence of [`TestEvent`]s it produces.
///
/// Panics if parsing fails — these inputs are expected to be valid.
fn str_to_test_events(docs: &str) -> Vec<TestEvent> {
    let mut checker = YamlChecker { evs: Vec::new() };
    Parser::new_from_str(docs)
        .load(&mut checker, true)
        .unwrap();
    checker.evs
}
// Assert that the next item yielded by iterator `$v` matches pattern `$p`,
// panicking with both the actual event and the expected pattern otherwise.
macro_rules! assert_next {
    ($v:expr, $p:pat) => {
        match $v.next().unwrap() {
            $p => {}
            e => {
                panic!("unexpected event: {:?} (expected {:?})", e, stringify!($p));
            }
        }
    };
}
// auto generated from handler_spec_test.cpp
include!("specexamples.rs.inc");
include!("spec_test.rs.inc");
// hand-crafted tests
//#[test]
//fn test_hc_alias() {
//}
use saphyr::{Hash, Yaml, YamlEmitter, YamlLoader};
#[test]
fn test_mapvec_legal() {
use yaml_rust2::yaml::{Hash, Yaml};
use yaml_rust2::{YamlEmitter, YamlLoader};
// Emitting a `map<map<seq<_>>, _>` should result in legal yaml that
// we can parse.

View file

@ -1,6 +1,4 @@
extern crate yaml_rust2;
use yaml_rust2::{Yaml, YamlEmitter, YamlLoader};
use saphyr::{Yaml, YamlEmitter, YamlLoader};
fn roundtrip(original: &Yaml) {
let mut emitted = String::new();

@ -1 +0,0 @@
Subproject commit 45db50aecf9b1520f8258938c88f396e96f30831

View file

@ -1,295 +0,0 @@
use std::fs::{self, DirEntry};
use libtest_mimic::{run_tests, Arguments, Outcome, Test};
use yaml_rust2::{
parser::{Event, EventReceiver, Parser, Tag},
scanner::TScalarStyle,
yaml, ScanError, Yaml, YamlLoader,
};
/// Shorthand result type defaulting the error to a boxed `dyn Error`.
type Result<T, E = Box<dyn std::error::Error>> = std::result::Result<T, E>;

/// One test case loaded from the yaml-test-suite.
struct YamlTest {
    /// The YAML input with the suite's visual marker glyphs still present.
    yaml_visual: String,
    /// The raw YAML input, after visual markers have been substituted.
    yaml: String,
    /// The expected event tree, one event per line.
    expected_events: String,
    /// Whether parsing this input is expected to fail.
    expected_error: bool,
}
/// Discover every yaml-test-suite case and hand them to libtest-mimic.
fn main() -> Result<()> {
    let mut arguments = Arguments::from_args();
    // NOTE(review): defaults to one thread when the user did not ask for a
    // count — presumably for deterministic ordering/output; confirm before
    // changing.
    if arguments.num_threads.is_none() {
        arguments.num_threads = Some(1);
    }
    // One suite file may contain several test cases; load all, then flatten.
    let tests: Vec<Vec<_>> = std::fs::read_dir("tests/yaml-test-suite/src")?
        .map(|entry| -> Result<_> {
            let entry = entry?;
            let tests = load_tests_from_file(&entry)?;
            Ok(tests)
        })
        .collect::<Result<_>>()?;
    let mut tests: Vec<_> = tests.into_iter().flatten().collect();
    tests.sort_by_key(|t| t.name.clone());
    run_tests(&arguments, tests, run_yaml_test).exit();
}
/// Run a single yaml-test-suite case and compare the produced events (or
/// the error) against the expectation; returns the libtest-mimic outcome.
fn run_yaml_test(test: &Test<YamlTest>) -> Outcome {
    let desc = &test.data;
    let actual_events = parse_to_events(&desc.yaml);
    let events_diff = actual_events.map(|events| events_differ(&events, &desc.expected_events));
    // Pass/fail matrix: a case expected to fail must error out; a case
    // expected to pass must parse and produce exactly the expected events.
    let mut error_text = match (&events_diff, desc.expected_error) {
        (Ok(x), true) => Some(format!("no error when expected: {x:#?}")),
        (Err(_), true) | (Ok(None), false) => None,
        (Err(e), false) => Some(format!("unexpected error {e:?}")),
        (Ok(Some(diff)), false) => Some(format!("events differ: {diff}")),
    };
    // Show a caret on error.
    if let Some(text) = &mut error_text {
        use std::fmt::Write;
        let _ = writeln!(text, "\n### Input:\n{}\n### End", desc.yaml_visual);
        if let Err(err) = &events_diff {
            // Replay the input, highlighting the offending line (ANSI red)
            // and placing a caret under the error column.
            writeln!(text, "### Error position").unwrap();
            let mut lines = desc.yaml.lines();
            for _ in 0..(err.marker().line() - 1) {
                let l = lines.next().unwrap();
                writeln!(text, "{l}").unwrap();
            }
            writeln!(text, "\x1B[91;1m{}", lines.next().unwrap()).unwrap();
            for _ in 0..err.marker().col() {
                write!(text, " ").unwrap();
            }
            writeln!(text, "^\x1b[m").unwrap();
            for l in lines {
                writeln!(text, "{l}").unwrap();
            }
            writeln!(text, "### End error position").unwrap();
        }
    }
    match error_text {
        None => Outcome::Passed,
        Some(txt) => Outcome::Failed { msg: Some(txt) },
    }
}
/// Load every test case contained in one yaml-test-suite file.
///
/// A file holds a YAML list of test definitions. All fields except `fail`
/// carry over ("inherit") from one entry to the next within the same file.
fn load_tests_from_file(entry: &DirEntry) -> Result<Vec<Test<YamlTest>>> {
    let file_name = entry.file_name().to_string_lossy().to_string();
    let test_name = file_name
        .strip_suffix(".yaml")
        .ok_or("unexpected filename")?;
    let tests = YamlLoader::load_from_str(&fs::read_to_string(entry.path())?)?;
    let tests = tests[0].as_vec().ok_or("no test list found in file")?;
    let mut result = vec![];
    let mut current_test = yaml::Hash::new();
    for (idx, test_data) in tests.iter().enumerate() {
        // Disambiguate names only when a file holds several cases.
        let name = if tests.len() > 1 {
            format!("{test_name}-{idx:02}")
        } else {
            test_name.to_string()
        };
        // Test fields except `fail` are "inherited": reset `fail`, then
        // overlay this entry's fields onto the accumulated state.
        let test_data = test_data.as_hash().unwrap();
        current_test.remove(&Yaml::String("fail".into()));
        for (key, value) in test_data.clone() {
            current_test.insert(key, value);
        }
        let current_test = Yaml::Hash(current_test.clone()); // Much better indexing
        // The mere presence of a `skip` key excludes the case.
        if current_test["skip"] != Yaml::BadValue {
            continue;
        }
        result.push(Test {
            name,
            kind: String::new(),
            is_ignored: false,
            is_bench: false,
            data: YamlTest {
                yaml_visual: current_test["yaml"].as_str().unwrap().to_string(),
                yaml: visual_to_raw(current_test["yaml"].as_str().unwrap()),
                expected_events: visual_to_raw(current_test["tree"].as_str().unwrap()),
                expected_error: current_test["fail"].as_bool() == Some(true),
            },
        });
    }
    Ok(result)
}
/// Run the parser over `source`, collecting the textual form of each event.
fn parse_to_events(source: &str) -> Result<Vec<String>, ScanError> {
    let mut collector = EventReporter::new();
    let mut parser = Parser::new_from_str(source);
    parser.load(&mut collector, true)?;
    Ok(collector.events)
}
/// Receiver that renders parser events in the yaml-test-suite text format.
struct EventReporter {
    /// One formatted line per received event.
    events: Vec<String>,
}

impl EventReporter {
    /// Create a reporter with an empty event log.
    fn new() -> Self {
        Self { events: vec![] }
    }
}
impl EventReceiver for EventReporter {
    /// Render `ev` as one line of the yaml-test-suite event language
    /// (`+STR`, `-DOC`, `=VAL`, ...) and append it to `self.events`.
    fn on_event(&mut self, ev: Event) {
        let line: String = match ev {
            Event::StreamStart => "+STR".into(),
            Event::StreamEnd => "-STR".into(),
            Event::DocumentStart => "+DOC".into(),
            Event::DocumentEnd => "-DOC".into(),
            Event::SequenceStart(idx, tag) => {
                format!("+SEQ{}{}", format_index(idx), format_tag(&tag))
            }
            Event::SequenceEnd => "-SEQ".into(),
            Event::MappingStart(idx, tag) => {
                format!("+MAP{}{}", format_index(idx), format_tag(&tag))
            }
            Event::MappingEnd => "-MAP".into(),
            Event::Scalar(ref text, style, idx, ref tag) => {
                // The sigil before the value encodes the scalar style.
                let kind = match style {
                    TScalarStyle::Plain => ":",
                    TScalarStyle::SingleQuoted => "'",
                    TScalarStyle::DoubleQuoted => r#"""#,
                    TScalarStyle::Literal => "|",
                    TScalarStyle::Folded => ">",
                };
                format!(
                    "=VAL{}{} {}{}",
                    format_index(idx),
                    format_tag(tag),
                    kind,
                    escape_text(text)
                )
            }
            Event::Alias(idx) => format!("=ALI *{idx}"),
            // `Nothing` produces no output line.
            Event::Nothing => return,
        };
        self.events.push(line);
    }
}
/// Render an anchor id as ` &N`; id 0 means "no anchor" and yields "".
fn format_index(idx: usize) -> String {
    match idx {
        0 => String::new(),
        n => format!(" &{n}"),
    }
}
/// Escape backslashes and control characters in `text` the way the
/// yaml-test-suite event format expects (`\\`, `\n`, `\r`, `\b`, `\t`).
///
/// Builds the result in a single pass instead of one full-string
/// `String::replace` (and allocation) per escaped character.
fn escape_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\\' => escaped.push_str(r"\\"),
            '\n' => escaped.push_str(r"\n"),
            '\r' => escaped.push_str(r"\r"),
            '\x08' => escaped.push_str(r"\b"),
            '\t' => escaped.push_str(r"\t"),
            _ => escaped.push(ch),
        }
    }
    escaped
}
/// Render a tag as ` <handle+suffix>`, or "" when there is no tag.
fn format_tag(tag: &Option<Tag>) -> String {
    match tag {
        Some(t) => format!(" <{}{}>", t.handle, t.suffix),
        None => String::new(),
    }
}
/// Compare actual against expected events, returning a description of the
/// first difference, or `None` when the streams match.
fn events_differ(actual: &[String], expected: &str) -> Option<String> {
    // Pad both sides with trailing `None`s so a length mismatch surfaces as
    // an "extra line" on one side instead of being silently truncated by zip.
    let actual = actual.iter().map(Some).chain(std::iter::repeat(None));
    let expected = expected_events(expected);
    let expected = expected.iter().map(Some).chain(std::iter::repeat(None));
    for (idx, (act, exp)) in actual.zip(expected).enumerate() {
        return match (act, exp) {
            (Some(act), Some(exp)) => {
                if act == exp {
                    continue;
                } else {
                    Some(format!(
                        "line {idx} differs: \n=> expected `{exp}`\n=> found `{act}`",
                    ))
                }
            }
            (Some(a), None) => Some(format!("extra actual line: {a:?}")),
            (None, Some(e)) => Some(format!("extra expected line: {e:?}")),
            // Both streams exhausted: no difference found.
            (None, None) => None,
        };
    }
    // The zipped iterators are infinite, so the loop always returns.
    unreachable!()
}
/// Convert the snippets from "visual" to "actual" representation
///
/// yaml-test-suite inputs use visible marker glyphs for otherwise invisible
/// characters (space, tab, carriage return, BOM, trailing newline); this
/// substitutes each marker with the character it stands for.
/// NOTE(review): the marker glyphs in the patterns below appear mojibake'd
/// in this view — verify them against the upstream yaml-test-suite legend.
fn visual_to_raw(yaml: &str) -> String {
    let mut yaml = yaml.to_owned();
    for (pat, replacement) in [
        ("", " "),
        ("»", "\t"),
        ("", ""), // Tab line continuation ——»
        ("", "\r"),
        ("", "\u{FEFF}"),
        ("", ""), // Trailing newline marker
        ("\n", ""),
    ] {
        yaml = yaml.replace(pat, replacement);
    }
    yaml
}
/// Adapt the expected event strings to this crate's reasonable limitations.
///
/// Drop information on node styles (flow/block) and anchor names.
/// Both are things that can be omitted according to spec.
fn expected_events(expected_tree: &str) -> Vec<String> {
    let mut anchors = vec![];
    expected_tree
        .split('\n')
        .map(|s| s.trim_start().to_owned())
        .filter(|s| !s.is_empty())
        .map(|mut s| {
            // Anchor name-to-number conversion.
            // An `&` appearing after a `:` would be scalar content, not an anchor.
            if let Some(start) = s.find('&') {
                if s[..start].find(':').is_none() {
                    let len = s[start..].find(' ').unwrap_or(s[start..].len());
                    anchors.push(s[start + 1..start + len].to_owned());
                    s = s.replace(&s[start..start + len], &format!("&{}", anchors.len()));
                }
            }
            // Alias nodes name-to-number
            if s.starts_with("=ALI") {
                let start = s.find('*').unwrap();
                let name = &s[start + 1..];
                // Use the most recent anchor with that name (names may repeat).
                let idx = anchors
                    .iter()
                    .enumerate()
                    .filter(|(_, v)| v == &name)
                    .last()
                    .unwrap()
                    .0;
                s = s.replace(&s[start..], &format!("*{}", idx + 1));
            }
            // Dropping style information
            match &*s {
                "+DOC ---" => "+DOC".into(),
                "-DOC ..." => "-DOC".into(),
                s if s.starts_with("+SEQ []") => s.replacen("+SEQ []", "+SEQ", 1),
                s if s.starts_with("+MAP {}") => s.replacen("+MAP {}", "+MAP", 1),
                "=VAL :" => "=VAL :~".into(), // FIXME: known bug
                s => s.into(),
            }
        })
        .collect()
}

View file

@ -1,229 +0,0 @@
# `yaml-rust2` tools
This directory contains tools that are used to develop the crate.
Due to dependency management, only some of them are available as binaries from the `yaml-rust2` crate.
| Tool | Invocation |
|------|------------|
| `bench_compare` | `cargo bench_compare` |
| `dump_events` | `cargo run --bin dump_events -- [...]` |
| `gen_large_yaml` | `cargo gen_large_yaml` |
| `run_bench` | `cargo run --bin run_bench -- [...]` |
| `time_parse` | `cargo run --bin time_parse -- [...]` |
## `bench_compare`
See the [dedicated README file](./bench_compare/README.md).
## `dump_events`
This is a debugging helper for the parser. It outputs events emitted by the parser for a given file. This can be paired with the `YAMLRUST2_DEBUG` environment variable to have an in-depth overview of which steps the scanner and the parser are taking.
### Example
Consider the following `input.yaml` YAML file:
```yaml
- foo: bar
- baz:
c: [3, 4, 5]
```
Running `cargo run --bin dump_events -- input.yaml` outputs:
```
↳ StreamStart
↳ DocumentStart
↳ SequenceStart(0, None)
↳ MappingStart(0, None)
↳ Scalar("foo", Plain, 0, None)
↳ Scalar("bar", Plain, 0, None)
↳ MappingEnd
↳ MappingStart(0, None)
↳ Scalar("baz", Plain, 0, None)
↳ Scalar("~", Plain, 0, None)
↳ Scalar("c", Plain, 0, None)
↳ SequenceStart(0, None)
↳ Scalar("3", Plain, 0, None)
↳ Scalar("4", Plain, 0, None)
↳ Scalar("5", Plain, 0, None)
↳ SequenceEnd
↳ MappingEnd
↳ SequenceEnd
↳ DocumentEnd
↳ StreamEnd
```
Running `YAMLRUST2_DEBUG=1 cargo run --bin dump_events -- input.yaml` outputs much more details:
<details>
<summary> Full output </summary>
```
Parser state: StreamStart
↳ StreamStart(Utf8) Marker { index: 0, line: 1, col: 0 }
↳ StreamStart
Parser state: ImplicitDocumentStart
→ fetch_next_token after whitespace Marker { index: 0, line: 1, col: 0 } '-'
↳ BlockSequenceStart Marker { index: 0, line: 1, col: 0 }
↳ DocumentStart
Parser state: BlockNode
↳ SequenceStart(0, None)
Parser state: BlockSequenceFirstEntry
↳ BlockEntry Marker { index: 2, line: 1, col: 2 }
→ fetch_next_token after whitespace Marker { index: 2, line: 1, col: 2 } 'f'
→ fetch_next_token after whitespace Marker { index: 5, line: 1, col: 5 } ':'
↳ BlockMappingStart Marker { index: 5, line: 1, col: 5 }
↳ MappingStart(0, None)
Parser state: BlockMappingFirstKey
↳ Key Marker { index: 2, line: 1, col: 2 }
↳ Scalar(Plain, "foo") Marker { index: 2, line: 1, col: 2 }
↳ Scalar("foo", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 5, line: 1, col: 5 }
→ fetch_next_token after whitespace Marker { index: 7, line: 1, col: 7 } 'b'
↳ Scalar(Plain, "bar") Marker { index: 7, line: 1, col: 7 }
↳ Scalar("bar", Plain, 0, None)
Parser state: BlockMappingKey
→ fetch_next_token after whitespace Marker { index: 11, line: 2, col: 0 } '-'
↳ BlockEnd Marker { index: 11, line: 2, col: 0 }
↳ MappingEnd
Parser state: BlockSequenceEntry
↳ BlockEntry Marker { index: 13, line: 2, col: 2 }
→ fetch_next_token after whitespace Marker { index: 13, line: 2, col: 2 } 'b'
→ fetch_next_token after whitespace Marker { index: 16, line: 2, col: 5 } ':'
↳ BlockMappingStart Marker { index: 16, line: 2, col: 5 }
↳ MappingStart(0, None)
Parser state: BlockMappingFirstKey
↳ Key Marker { index: 13, line: 2, col: 2 }
↳ Scalar(Plain, "baz") Marker { index: 13, line: 2, col: 2 }
↳ Scalar("baz", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 16, line: 2, col: 5 }
→ fetch_next_token after whitespace Marker { index: 20, line: 3, col: 2 } 'c'
→ fetch_next_token after whitespace Marker { index: 21, line: 3, col: 3 } ':'
↳ Key Marker { index: 20, line: 3, col: 2 }
↳ Scalar("~", Plain, 0, None)
Parser state: BlockMappingKey
↳ Scalar(Plain, "c") Marker { index: 20, line: 3, col: 2 }
↳ Scalar("c", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 21, line: 3, col: 3 }
→ fetch_next_token after whitespace Marker { index: 23, line: 3, col: 5 } '['
↳ FlowSequenceStart Marker { index: 23, line: 3, col: 5 }
↳ SequenceStart(0, None)
Parser state: FlowSequenceFirstEntry
→ fetch_next_token after whitespace Marker { index: 24, line: 3, col: 6 } '3'
→ fetch_next_token after whitespace Marker { index: 25, line: 3, col: 7 } ','
↳ Scalar(Plain, "3") Marker { index: 24, line: 3, col: 6 }
↳ Scalar("3", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowEntry Marker { index: 25, line: 3, col: 7 }
→ fetch_next_token after whitespace Marker { index: 27, line: 3, col: 9 } '4'
→ fetch_next_token after whitespace Marker { index: 28, line: 3, col: 10 } ','
↳ Scalar(Plain, "4") Marker { index: 27, line: 3, col: 9 }
↳ Scalar("4", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowEntry Marker { index: 28, line: 3, col: 10 }
→ fetch_next_token after whitespace Marker { index: 30, line: 3, col: 12 } '5'
→ fetch_next_token after whitespace Marker { index: 31, line: 3, col: 13 } ']'
↳ Scalar(Plain, "5") Marker { index: 30, line: 3, col: 12 }
↳ Scalar("5", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowSequenceEnd Marker { index: 31, line: 3, col: 13 }
↳ SequenceEnd
Parser state: BlockMappingKey
→ fetch_next_token after whitespace Marker { index: 33, line: 4, col: 0 } '\0'
↳ BlockEnd Marker { index: 33, line: 4, col: 0 }
↳ MappingEnd
Parser state: BlockSequenceEntry
↳ BlockEnd Marker { index: 33, line: 4, col: 0 }
↳ SequenceEnd
Parser state: DocumentEnd
↳ StreamEnd Marker { index: 33, line: 4, col: 0 }
↳ DocumentEnd
Parser state: DocumentStart
↳ StreamEnd
```
</details>
While this cannot be shown in Markdown, the output is colored so that it is a bit easier to read.
## `gen_large_yaml`
It is hard to find large (100+MiB) real-world YAML files that could be used to benchmark a parser. This utility generates multiple large files that are meant to stress the parser with different layouts of YAML files. The resulting files do not look like anything that would be encountered in production, but can serve as a base to test several features of a YAML parser.
The generated files are the following:
- `big.yaml`: A large array of records with few fields. One of the fields is a description, a large text block scalar spanning multiple lines. Most of the scanning happens in block scalars.
- `nested.yaml`: Very short key-value pairs that nest deeply.
- `small_objects.yaml`: A large array of 2 key-value mappings.
- `strings_array.yaml`: A large array of lipsum one-liners (~150-175 characters in length).
All generated files are meant to be between 200 and 250 MiB in size.
This tool depends on external dependencies that are not part of `yaml-rust2`'s dependencies or `dev-dependencies` and as such can't be called through `cargo run` directly. A dedicated `cargo gen_large_yaml` alias can be used to generate the benchmark files.
## `run_bench`
This is a benchmarking helper that runs the parser on the given file a given number of times and is able to extract simple metrics out of the results. The `--output-yaml` flag can be specified to make the output a YAML file that can be fed into other tools.
This binary is made to be used by `bench_compare`.
Synopsis: `run_bench input.yaml <iterations> [--output-yaml]`
### Examples
```sh
$> cargo run --release --bin run_bench -- bench_yaml/big.yaml 10
Average: 1.631936191s
Min: 1.629654651s
Max: 1.633045284s
95%: 1.633045284s
$> cargo run --release --bin run_bench -- bench_yaml/big.yaml 10 --output-yaml
parser: yaml-rust2
input: bench_yaml/big.yaml
average: 1649847674
min: 1648277149
max: 1651936305
percentile95: 1651936305
iterations: 10
times:
- 1650216129
- 1649349978
- 1649507018
- 1648277149
- 1649036548
- 1650323982
- 1650917692
- 1648702081
- 1650209860
- 1651936305
```
## `time_parse`
This is a benchmarking helper that times how long it takes for the parser to emit all events. It calls the parser on the given input file, receives parsing events and then immediately discards them. It is advised to run this tool with `--release`.
### Examples
Loading a small file could output the following:
```sh
$> cargo run --release --bin time_parse -- input.yaml
Loaded 0MiB in 14.189µs
```
While loading a larger file could output the following:
```sh
$> cargo run --release --bin time_parse -- bench_yaml/big.yaml
Loaded 220MiB in 1.612677853s
```

View file

@ -1,21 +0,0 @@
[package]
name = "bench_compare"
version = "0.6.0"
authors = [
"Ethiraric <ethiraric@gmail.com>"
]
license = "MIT OR Apache-2.0"
description = "Run multiple YAML parsers and compare their times"
repository = "https://github.com/Ethiraric/yaml-rust2"
readme = "README.md"
edition = "2018"
[dependencies]
anyhow = { version = "1.0.81", features = ["backtrace"] }
serde = { version = "1.0.197", features = ["derive"] }
serde_yaml = "0.9.32"
toml = "0.8.11"
[profile.release-lto]
inherits = "release"
lto = true

View file

@ -1,120 +0,0 @@
# `bench_compare`
This tool helps with comparing times different YAML parsers take to parse the same input.
## Synopsis
```
bench_compare time_parse
bench_compare run_bench
```
This will run either `time_parse` or `run_bench` (described below) with the given set of parsers from the configuration file.
## Parsers requirements
Parsers are expected to be event-based. In order to be fair to this crate's benchmark implementation, parsers should:
* Load the file into memory (a string, `mmap`, ...) **prior** to starting the clock
* Initialize the parser, if needed
* **Start the clock**
* Read events from the parser while the parser has not finished parsing
* Discard events as they are received (dropping them, `free`ing them or anything similar) so as to not grow their memory consumption too high, and allowing the parser to reuse event structures
* **Stop the clock**
* Destroy the resources, if needed/wanted (parser, file buffer, ...). The kernel will reap after the process exits.
## Parsers required binaries
This tool recognizes 2 binaries: `time_parse` and `run_bench`.
### `time_parse`
Synopsis:
```
time_parse file.yaml [--short]
```
The binary must run the aforementioned steps and display on its output the time the parser took to parse the given file.
With the `--short` option, the binary must only output the benchmark time in nanoseconds.
```sh
# This is meant to be human-readable.
# The example below is what this crate implements.
$> time_parse file.yaml
Loaded 200MiB in 1.74389s.
# This will be read by this tool.
# This must output ONLY the time, in nanoseconds.
$> time_parse file.yaml --short
1743892394
```
This tool will always provide the `--short` option.
### `run_bench`
Synopsis:
```
run_bench file.yaml <iterations> [--output-yaml]
```
The binary is expected to run `<iterations>` runs of the aforementioned steps and display relevant information on its output.
The `--output-yaml` instructs the binary to output details about its runs in YAML on its standard output.
The binary may optionally perform some warmup runs prior to running the benchmark. The time it took the binary to run will not be evaluated.
```sh
# This is meant to be human-readable.
# The example below is what this crate implements.
$> run_bench file.yaml 100
Average: 1.589485s
Min : 1.583078s
Max : 1.597028s
95% : 1.593219s
# This will be read by this tool.
# This must output a YAML as described below.
$> run_bench ../file.yaml 10 --output-yaml
parser: yaml-rust2
input: ../file.yaml
average: 1620303590
min: 1611632108
max: 1636401896
percentile95: 1636401896
iterations: 10
times:
- 1636401896
- 1623914538
- 1611632108
- 1612973608
- 1617748930
- 1615419514
- 1612172250
- 1620791346
- 1629339306
- 1622642412
```
The expected fields are (all times in nanoseconds):
* `parser`: The name of the parser (in case of a mistake renaming files)
* `input`: The path to the input file as given to the binary arguments
* `average`: The average time it took to run the parser
* `min`: The shortest time it took to run the parser
* `max`: The longest time it took to run the parser
* `percentile95`: The 95th percentile time of the runs
* `iterations`: The number of times the parser was run (`<iterations>`)
* `times`: An array of `iterations` times, one for each run, in the order they were run (first run first)
## Configuration
`bench_compare` is configured through a `bench_compare.toml` file. This file must be located in the current directory.
As of now, default values are unsupported and all fields must be set. The following fields are required:
```toml
yaml_input_dir = "bench_yaml" # The path to the directory containing the input yaml files
iterations = 10 # The number of iterations, if using `run_bench`
yaml_output_dir = "yaml_output" # The directory in which `run_bench`'s yamls are saved
csv_output = "benchmark.csv" # The CSV output aggregating times for each parser and file
[[parsers]] # A parser, can be repeated as many times as there are parsers
name = "yaml-rust2" # The name of the parser (used for logging)
path = "target/release/" # The path in which the parsers' `run_bench` and `time_parse` are
# If there is another parser, another block can be added
# [[parsers]]
# name = "libfyaml"
# path = "../libfyaml/build"
```

View file

@ -1,174 +0,0 @@
use std::{fs::File, io::BufWriter, io::Write, path::Path};
use anyhow::Error;
use serde::{Deserialize, Serialize};
/// Entry point: run [`entrypoint`] and exit with status 1 on error.
fn main() {
    match entrypoint() {
        Ok(()) => {}
        Err(e) => {
            eprintln!("{e:?}");
            std::process::exit(1);
        }
    }
}
/// Parse `bench_compare.toml` and dispatch to the requested benchmark mode.
///
/// Usage problems (missing parsers, bad arguments) print a message and
/// return `Ok(())`; configuration read/parse failures are propagated to the
/// caller via `?` instead of panicking as before.
fn entrypoint() -> Result<(), Error> {
    let config: Config = toml::from_str(&std::fs::read_to_string("bench_compare.toml")?)?;

    if config.parsers.is_empty() {
        println!("Please add at least one parser. Refer to the README for instructions.");
        return Ok(());
    }

    // Exactly one argument is expected, and it must name a known subcommand.
    // (The short-circuiting `||` guarantees `args[1]` exists when checked.)
    let args: Vec<_> = std::env::args().collect();
    if args.len() != 2 || !["time_parse", "run_bench"].contains(&args[1].as_str()) {
        println!("Usage: bench_compare <time_parse|run_bench>");
        return Ok(());
    }

    match args[1].as_str() {
        "run_bench" => run_bench(&config)?,
        "time_parse" => unimplemented!(),
        _ => unreachable!(),
    }
    Ok(())
}
/// Run the `run_bench` binary on the given parsers.
///
/// For every input file and every configured parser, invokes that parser's
/// `run_bench` binary, saves its YAML report under `yaml_output_dir`, and
/// finally aggregates the per-run averages into the CSV output.
fn run_bench(config: &Config) -> Result<(), Error> {
    // Create output directory
    std::fs::create_dir_all(&config.yaml_output_dir)?;
    let inputs = list_input_files(config)?;
    let iterations = format!("{}", config.iterations);
    let mut averages = vec![];
    // Inputs are ordered, so are parsers.
    for input in &inputs {
        let input_basename = Path::new(&input).file_name().unwrap().to_string_lossy();
        let mut input_times = vec![];
        // Run each input for each parser.
        for parser in &config.parsers {
            println!("Running {input_basename} against {}", parser.name);
            // Run benchmark
            let path = Path::new(&parser.path).join("run_bench");
            let output = std::process::Command::new(path)
                .arg(input)
                .arg(&iterations)
                .arg("--output-yaml")
                .output()?;
            // Check exit status. A missing code (killed by signal) counts as failure.
            if output.status.code().unwrap_or(1) == 0 {
                let s = String::from_utf8_lossy(&output.stdout);
                // Get output as yaml.
                match serde_yaml::from_str::<BenchYamlOutput>(&s) {
                    Ok(output) => {
                        // Push average into our CSV-to-be.
                        input_times.push(output.average);
                        // Save the YAML for later.
                        serde_yaml::to_writer(
                            BufWriter::new(File::create(format!(
                                "{}/{}-{}",
                                config.yaml_output_dir, parser.name, input_basename
                            ))?),
                            &output,
                        )?;
                    }
                    Err(e) => {
                        // Yaml is invalid, use 0 as "didn't run properly".
                        println!("Errored: Invalid YAML output: {e}");
                        input_times.push(0);
                    }
                }
            } else {
                // An error happened, use 0 as "didn't run properly".
                println!("Errored: process did exit non-zero");
                input_times.push(0);
            }
        }
        averages.push(input_times);
    }
    // Finally, save a CSV.
    save_run_bench_csv(config, &inputs, &averages)
}
/// General configuration structure, deserialized from `bench_compare.toml`.
#[derive(Serialize, Deserialize)]
struct Config {
    /// The path to the directory containing the input yaml files.
    yaml_input_dir: String,
    /// Number of iterations to run, if using `run_bench`.
    iterations: u32,
    /// The parsers to run.
    parsers: Vec<Parser>,
    /// The path to the directory in which `run_bench`'s yamls are saved.
    yaml_output_dir: String,
    /// The path to the CSV output aggregating times for each parser and file.
    csv_output: String,
}

/// A parser configuration.
#[derive(Serialize, Deserialize)]
struct Parser {
    /// The name of the parser.
    name: String,
    /// The path in which the parser's `run_bench` and `time_parse` are located.
    path: String,
}

/// Output of running `run_bench` on a given parser.
#[derive(Serialize, Deserialize)]
struct BenchYamlOutput {
    /// The name of the parser.
    parser: String,
    /// The file taken as input.
    input: String,
    /// Average parsing time (ns).
    average: u64,
    /// Shortest parsing time (ns).
    min: u64,
    /// Longest parsing time (ns).
    max: u64,
    /// 95th percentile of parsing times (ns).
    percentile95: u64,
    /// Number of iterations.
    iterations: u64,
    /// Parsing times for each run.
    times: Vec<u64>,
}
/// Write the CSV aggregating `run_bench` averages to `config.csv_output`:
/// one column per parser, one row per input file.
fn save_run_bench_csv(
    config: &Config,
    inputs: &[String],
    averages: &[Vec<u64>],
) -> Result<(), Error> {
    let mut csv = BufWriter::new(File::create(&config.csv_output)?);

    // Header row: empty leading cell, then one column per parser.
    for parser in &config.parsers {
        write!(csv, ",{}", parser.name)?;
    }
    writeln!(csv)?;

    // One row per input file, keyed by its base name.
    for (path, row) in inputs.iter().zip(averages.iter()) {
        let filename = Path::new(path).file_name().unwrap().to_string_lossy();
        write!(csv, "{filename}")?;
        for avg in row {
            write!(csv, ",{avg}")?;
        }
        writeln!(csv)?;
    }
    Ok(())
}
/// Return the paths of all `.yaml` files (case-insensitive extension) in the
/// configured input directory. Unreadable directory entries are skipped.
fn list_input_files(config: &Config) -> Result<Vec<String>, Error> {
    let mut files = Vec::new();
    for entry in std::fs::read_dir(&config.yaml_input_dir)?.flatten() {
        let path = entry.path().to_string_lossy().to_string();
        let is_yaml = Path::new(&path)
            .extension()
            .map_or(false, |ext| ext.eq_ignore_ascii_case("yaml"));
        if is_yaml {
            files.push(path);
        }
    }
    Ok(files)
}

View file

@ -1,38 +0,0 @@
use std::env;
use std::fs::File;
use std::io::prelude::*;
use yaml_rust2::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
/// Collects every (event, marker) pair emitted by the parser, echoing each
/// event to stderr as it arrives.
#[derive(Debug)]
struct EventSink {
    /// Events received so far, with their source positions.
    events: Vec<(Event, Marker)>,
}

impl MarkedEventReceiver for EventSink {
    fn on_event(&mut self, ev: Event, mark: Marker) {
        // Print the event in blue with an arrow glyph, then record it.
        eprintln!(" \x1B[;34m\u{21B3} {:?}\x1B[;m", &ev);
        self.events.push((ev, mark));
    }
}
/// Parse `yaml` and return every event together with its source marker.
fn str_to_events(yaml: &str) -> Vec<(Event, Marker)> {
    let mut sink = EventSink { events: Vec::new() };
    // Load events using our sink as the receiver.
    Parser::new_from_str(yaml)
        .load(&mut sink, true)
        .unwrap();
    sink.events
}
/// Read the file named by the first CLI argument and dump its parse events
/// (printed to stderr by the sink as a side effect).
fn main() {
    let args: Vec<_> = env::args().collect();
    let mut contents = String::new();
    File::open(&args[1])
        .unwrap()
        .read_to_string(&mut contents)
        .unwrap();
    str_to_events(&contents);
}

View file

@ -1,20 +0,0 @@
[package]
name = "gen_large_yaml"
version = "0.6.0"
authors = [
"Ethiraric <ethiraric@gmail.com>"
]
license = "MIT OR Apache-2.0"
description = "A helper to generate large YAML files"
repository = "https://github.com/Ethiraric/yaml-rust2"
readme = "README.md"
edition = "2018"
[dependencies]
yaml-rust2 = { path = "../.." }
rand = { version = "0.8.5", features = [ "small_rng" ] }
lipsum = "0.9.0"
[profile.release-lto]
inherits = "release"
lto = true

View file

@ -1,156 +0,0 @@
#![allow(clippy::too_many_arguments)]
use rand::{distributions::Alphanumeric, rngs::SmallRng, Rng};
/// Generate a string of exactly `len` hexadecimal digits.
pub fn hex_string(rng: &mut SmallRng, len: usize) -> String {
    // `len..len + 1` pins the length to exactly `len`.
    string_from_set(rng, len, len + 1, b"0123456789abcdef")
}
/// Generate an e-mail address of the form `<local>@example.com`, where the
/// local part's length is drawn from `len_lo..len_hi`.
pub fn email(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
    const CHARSET: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_.0123456789";
    let local_part = string_from_set(rng, len_lo, len_hi, CHARSET);
    format!("{local_part}@example.com")
}
/// Generate a random URL.
///
/// The URL has the form `<scheme>://example.com/<seg>/...[.<extension>]`,
/// with the number of path segments drawn from `n_paths_lo..n_paths_hi` and
/// each segment's length drawn from `path_len_lo..path_len_hi`.
pub fn url(
    rng: &mut SmallRng,
    scheme: &str,
    n_paths_lo: usize,
    n_paths_hi: usize,
    path_len_lo: usize,
    path_len_hi: usize,
    extension: Option<&str>,
) -> String {
    let n_segments = rng.gen_range(n_paths_lo..n_paths_hi);
    let mut url = format!("{scheme}://example.com");
    for _ in 0..n_segments {
        url.push('/');
        url.push_str(&alnum_string(rng, path_len_lo, path_len_hi));
    }
    if let Some(ext) = extension {
        url.push('.');
        url.push_str(ext);
    }
    url
}
/// Generate a random integer in the half-open range `lo..hi`.
pub fn integer(rng: &mut SmallRng, lo: i64, hi: i64) -> i64 {
    rng.gen_range(lo..hi)
}
/// Generate an alphanumeric string whose length is drawn from `lo_len..hi_len`.
pub fn alnum_string(rng: &mut SmallRng, lo_len: usize, hi_len: usize) -> String {
    let len = rng.gen_range(lo_len..hi_len);
    let mut out = String::with_capacity(len);
    out.extend(rng.sample_iter(&Alphanumeric).take(len).map(char::from));
    out
}
/// Generate a string of characters drawn uniformly from `set`, with a length
/// drawn from `len_lo..len_hi`.
/// (Previous doc comment was a stale copy-paste from `hex_string`.)
pub fn string_from_set(rng: &mut SmallRng, len_lo: usize, len_hi: usize, set: &[u8]) -> String {
    (0..rng.gen_range(len_lo..len_hi))
        .map(|_| set[rng.gen_range(0..set.len())] as char)
        .collect()
}
/// Generate a lipsum paragraph.
///
/// Produces between `lines_lo` and `lines_hi` lines built from sentences of
/// `wps_lo..wps_hi` words, wrapped so no line exceeds `line_maxcol` columns.
pub fn paragraph(
    rng: &mut SmallRng,
    lines_lo: usize,
    lines_hi: usize,
    wps_lo: usize,
    wps_hi: usize,
    line_maxcol: usize,
) -> Vec<String> {
    let mut ret = Vec::new();
    let nlines = rng.gen_range(lines_lo..lines_hi);
    while ret.len() < nlines {
        let words_in_sentence = rng.gen_range(wps_lo..wps_hi);
        let mut sentence = lipsum::lipsum_words_with_rng(rng.clone(), words_in_sentence);
        // Glue the new sentence onto the previous (possibly short) line so
        // lines fill up to `line_maxcol` before wrapping.
        if let Some(last_line) = ret.pop() {
            sentence = format!("{last_line} {sentence}");
        }
        // Wrap on the last whitespace within the first `line_maxcol` columns.
        // NOTE(review): mixes a char count (from `position` on a reversed
        // char iterator) with byte indexing, and `line_maxcol - position`
        // looks off by one (the pushed line seems to keep the space and drop
        // the following character). Harmless for ASCII lipsum filler, but
        // verify before reusing; also panics if a chunk has no whitespace.
        while sentence.len() > line_maxcol {
            let last_space_idx = line_maxcol
                - sentence[0..line_maxcol]
                    .chars()
                    .rev()
                    .position(char::is_whitespace)
                    .unwrap();
            ret.push(sentence[0..last_space_idx].to_string());
            sentence = sentence[last_space_idx + 1..].to_string();
        }
        if !sentence.is_empty() {
            ret.push(sentence);
        }
    }
    ret
}
/// Generate a full name: two space-separated [`name`]s.
pub fn full_name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
    let first = name(rng, len_lo, len_hi);
    let last = name(rng, len_lo, len_hi);
    format!("{first} {last}")
}
/// Generate a capitalized name: one uppercase letter followed by `len`
/// lowercase letters, with `len` drawn from `len_lo..len_hi`.
pub fn name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
    const UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
    // RNG draws happen in the same order as before: length, initial, rest.
    let len = rng.gen_range(len_lo..len_hi);
    let initial = UPPER[rng.gen_range(0..UPPER.len())] as char;
    let rest = string_from_set(rng, len, len + 1, LOWER);
    format!("{initial}{rest}")
}
/// Generate `words_lo..words_hi` lipsum words, stripped of punctuation that
/// would need quoting in YAML.
pub fn words(rng: &mut SmallRng, words_lo: usize, words_hi: usize) -> String {
    let nwords = rng.gen_range(words_lo..words_hi);
    let text = lipsum::lipsum_words_with_rng(rng.clone(), nwords);
    text.replace(|c| "-\'\",*:".contains(c), "")
}
/// Generate a lipsum text.
///
/// Texts are composed of `paragraphs_lo..paragraphs_hi` paragraphs separated
/// by single empty lines; remaining parameters are forwarded to [`paragraph`].
pub fn text(
    rng: &mut SmallRng,
    paragraphs_lo: usize,
    paragraphs_hi: usize,
    lines_lo: usize,
    lines_hi: usize,
    wps_lo: usize,
    wps_hi: usize,
    line_maxcol: usize,
) -> Vec<String> {
    let n_paragraphs = rng.gen_range(paragraphs_lo..paragraphs_hi);
    let mut lines = Vec::new();
    for i in 0..n_paragraphs {
        // An empty line separates consecutive paragraphs.
        if i > 0 {
            lines.push(String::new());
        }
        lines.extend(paragraph(rng, lines_lo, lines_hi, wps_lo, wps_hi, line_maxcol));
    }
    lines
}

View file

@ -1,261 +0,0 @@
#![allow(dead_code)]
mod gen;
mod nested;
use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
use rand::{rngs::SmallRng, Rng, SeedableRng};
/// The path into which the generated YAML files will be written.
const OUTPUT_DIR: &str = "bench_yaml";
/// Generate all benchmark YAML files into [`OUTPUT_DIR`].
///
/// Each file stresses a different aspect of the parser (block scalars, deep
/// nesting, many small mappings, long plain scalars); see the tools README.
fn main() -> std::io::Result<()> {
    let mut generator = Generator::new();
    let output_path = Path::new(OUTPUT_DIR);
    if !output_path.is_dir() {
        std::fs::create_dir(output_path).unwrap();
    }
    println!("Generating big.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("big.yaml")).unwrap());
    generator.gen_record_array(&mut out, 100_000, 100_001)?;
    println!("Generating nested.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("nested.yaml")).unwrap());
    nested::create_deep_object(&mut out, 1_100_000)?;
    println!("Generating small_objects.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("small_objects.yaml")).unwrap());
    generator.gen_authors_array(&mut out, 4_000_000, 4_000_001)?;
    println!("Generating strings_array.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("strings_array.yaml")).unwrap());
    generator.gen_strings_array(&mut out, 1_300_000, 1_300_001, 10, 40)?;
    Ok(())
}
/// YAML Generator.
///
/// Produces pseudo-random YAML documents for benchmarking. All output is
/// deterministic for a fixed seed.
struct Generator {
    /// The RNG state.
    ///
    /// We don't need to be cryptographically secure. [`SmallRng`] also implements the
    /// [`SeedableRng`] trait, allowing runs to be predictable.
    rng: SmallRng,
    /// The stack of indentations.
    ///
    /// The top entry is the current indentation in spaces; the stack always
    /// holds at least one entry (the base column, 0).
    indents: Vec<usize>,
}
/// A boxed one-shot value generator: writes a single YAML node to `W`.
type GenFn<W> = dyn FnOnce(&mut Generator, &mut W) -> std::io::Result<()>;
impl Generator {
    /// Create a new generator.
    ///
    /// The RNG is seeded with a constant so every run produces the same
    /// documents; the indent stack starts with a single 0 (column 0).
    fn new() -> Self {
        Generator {
            rng: SmallRng::seed_from_u64(42),
            indents: vec![0],
        }
    }
    /// Generate an array of records as per [`Self::gen_record_object`].
    ///
    /// The item count is drawn uniformly from `items_lo..items_hi`.
    fn gen_record_array<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        items_lo: usize,
        items_hi: usize,
    ) -> std::io::Result<()> {
        self.gen_array(writer, items_lo, items_hi, Generator::gen_record_object)
    }
    /// Generate an array of lipsum one-liners.
    ///
    /// Each item holds between `words_lo` and `words_hi` (exclusive) words.
    fn gen_strings_array<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        items_lo: usize,
        items_hi: usize,
        words_lo: usize,
        words_hi: usize,
    ) -> std::io::Result<()> {
        self.gen_array(writer, items_lo, items_hi, |gen, writer| {
            write!(writer, "{}", gen::words(&mut gen.rng, words_lo, words_hi))
        })
    }
    /// Generate a YAML object/mapping containing a record.
    ///
    /// Fields are description, hash, version, home, repository and pdf.
    /// The `description` field is a long string and puts a lot of weight in plain scalar / block
    /// scalar parsing.
    fn gen_record_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
        // Pair each key with a boxed one-shot generator for its value; boxing
        // is required since every closure has a distinct anonymous type.
        let fields: Vec<(String, Box<GenFn<W>>)> = vec![
            (
                "description".to_string(),
                Box::new(|gen, w| {
                    // Literal block scalar ("|"), indented 2 columns past the key.
                    write!(w, "|")?;
                    gen.push_indent(2);
                    gen.nl(w)?;
                    let indent = gen.indent();
                    // Cap line width so the text stays within 80 columns overall.
                    // NOTE(review): `80 - indent` would underflow for indents
                    // beyond 80; current nesting keeps it far below that.
                    let text = gen::text(&mut gen.rng, 1, 9, 3, 8, 10, 20, 80 - indent);
                    gen.write_lines(w, &text)?;
                    gen.pop_indent();
                    Ok(())
                }),
            ),
            (
                "authors".to_string(),
                Box::new(|gen, w| {
                    // Nested sequence of small author objects.
                    gen.push_indent(2);
                    gen.nl(w)?;
                    gen.gen_authors_array(w, 1, 10)?;
                    gen.pop_indent();
                    Ok(())
                }),
            ),
            (
                "hash".to_string(),
                Box::new(|gen, w| write!(w, "{}", gen::hex_string(&mut gen.rng, 64))),
            ),
            (
                "version".to_string(),
                Box::new(|gen, w| write!(w, "{}", gen::integer(&mut gen.rng, 1, 9))),
            ),
            (
                "home".to_string(),
                Box::new(|gen, w| {
                    write!(w, "{}", gen::url(&mut gen.rng, "https", 0, 1, 0, 0, None))
                }),
            ),
            (
                "repository".to_string(),
                Box::new(|gen, w| {
                    write!(w, "{}", gen::url(&mut gen.rng, "git", 1, 4, 10, 20, None))
                }),
            ),
            (
                "pdf".to_string(),
                Box::new(|gen, w| {
                    write!(
                        w,
                        "{}",
                        gen::url(&mut gen.rng, "https", 1, 4, 10, 30, Some("pdf"))
                    )
                }),
            ),
        ];
        self.gen_object(writer, fields)
    }
    /// Generate an array of authors as per [`Self::gen_author_object`].
    fn gen_authors_array<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        items_lo: usize,
        items_hi: usize,
    ) -> std::io::Result<()> {
        self.gen_array(writer, items_lo, items_hi, Generator::gen_author_object)
    }
    /// Generate a small object with 2 string fields.
    fn gen_author_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
        let fields: Vec<(String, Box<GenFn<W>>)> = vec![
            (
                "name".to_string(),
                Box::new(|gen, w| write!(w, "{}", gen::full_name(&mut gen.rng, 10, 15))),
            ),
            (
                "email".to_string(),
                Box::new(|gen, w| write!(w, "{}", gen::email(&mut gen.rng, 1, 9))),
            ),
        ];
        self.gen_object(writer, fields)
    }
    /// Generate a YAML array/sequence containing nodes generated by the given function.
    ///
    /// The length is drawn from `len_lo..len_hi`. Items are separated (not
    /// terminated) by newlines, so no trailing newline is emitted.
    fn gen_array<W: std::io::Write, F: FnMut(&mut Generator, &mut W) -> std::io::Result<()>>(
        &mut self,
        writer: &mut W,
        len_lo: usize,
        len_hi: usize,
        mut obj_creator: F,
    ) -> std::io::Result<()> {
        let mut first = true;
        for _ in 0..self.rng.gen_range(len_lo..len_hi) {
            if first {
                first = false;
            } else {
                self.nl(writer)?;
            }
            write!(writer, "- ")?;
            // Align continuation lines under the item, past the "- " marker.
            self.push_indent(2);
            (obj_creator)(self, writer)?;
            self.pop_indent();
        }
        Ok(())
    }
    /// Create a Yaml object with some fields in it.
    ///
    /// Keys are emitted in the given order; values are produced by the
    /// associated one-shot generator functions.
    fn gen_object<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        fields: Vec<(String, Box<GenFn<W>>)>,
    ) -> std::io::Result<()> {
        let mut first = true;
        for (key, f) in fields {
            if first {
                first = false;
            } else {
                self.nl(writer)?;
            }
            write!(writer, "{key}: ")?;
            f(self, writer)?;
        }
        Ok(())
    }
    /// Write the given lines at the right indentation.
    ///
    /// Lines are separated (not terminated) by indented newlines.
    fn write_lines<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        lines: &[String],
    ) -> std::io::Result<()> {
        let mut first = true;
        for line in lines {
            if first {
                first = false;
            } else {
                self.nl(writer)?;
            }
            write!(writer, "{line}")?;
        }
        Ok(())
    }
    /// Write a new line to the writer and indent.
    ///
    /// Emits `\n` followed by as many spaces as the current indent.
    fn nl<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
        writeln!(writer)?;
        for _ in 0..self.indent() {
            write!(writer, " ")?;
        }
        Ok(())
    }
    /// Return the given indent.
    ///
    /// The stack always holds at least one entry, so the `unwrap` cannot fail.
    fn indent(&self) -> usize {
        *self.indents.last().unwrap()
    }
    /// Push a new indent with the given relative offset.
    fn push_indent(&mut self, offset: usize) {
        self.indents.push(self.indent() + offset);
    }
    /// Pops the last indent.
    ///
    /// Panics if this would empty the stack: the base indent must remain.
    fn pop_indent(&mut self) {
        self.indents.pop();
        assert!(!self.indents.is_empty());
    }
}

View file

@ -1,115 +0,0 @@
use std::{cell::RefCell, rc::Rc};
use rand::{rngs::SmallRng, Rng, SeedableRng};
/// Create a deep object with the given amount of nodes.
///
/// Grows a random tree node by node, then serializes it as YAML into `writer`.
pub fn create_deep_object<W: std::io::Write>(
    writer: &mut W,
    n_nodes: usize,
) -> std::io::Result<()> {
    let mut tree = Tree::new();
    (0..n_nodes).for_each(|_| tree.push_node());
    tree.write_to(writer)
}
/// An n-tree.
///
/// The algorithm used to generate a potentially deep object is to create a tree, one node at a
/// time, where each node is put as a child of a random existing node in the tree.
struct Tree {
    /// The tree-view of the tree.
    root: Rc<RefCell<Node>>,
    /// Array of all the nodes in the tree, including the root node.
    ///
    /// Shares ownership (`Rc`) with the tree structure; this flat view lets
    /// `push_node` pick a random insertion parent in O(1).
    nodes: Vec<Rc<RefCell<Node>>>,
    /// The RNG state.
    ///
    /// We don't need to be cryptographically secure. [`SmallRng`] also implements the
    /// [`SeedableRng`] trait, allowing runs to be predictable.
    rng: SmallRng,
}
/// A node in a tree.
struct Node {
    /// All the children of the node.
    ///
    /// Empty for leaves, which serialize as the mapping `a: 1`.
    children: Vec<Rc<RefCell<Node>>>,
}
impl Tree {
/// Create a new tree.
fn new() -> Self {
let root = Node::new_rc_refcell();
Tree {
root: root.clone(),
nodes: vec![root],
rng: SmallRng::seed_from_u64(42),
}
}
/// Add a new node as a child of a random node in the tree.
fn push_node(&mut self) {
let new_node = Node::new_rc_refcell();
let n_nodes = self.nodes.len();
// Bias the nodes towards the end so that there is more nesting.
let parent = &mut self.nodes[self.rng.gen_range((3 * n_nodes / 4)..n_nodes)];
(**parent).borrow_mut().push_child(new_node.clone());
self.nodes.push(new_node);
}
/// Write the YAML representation of the tree to `writer`.
fn write_to<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
(*self.root).borrow().write_to(writer, 0)
}
}
impl Node {
    /// Create a new node with no children.
    fn new() -> Self {
        Self {
            children: Vec::new(),
        }
    }
    /// Create a new node, already wrapped in `Rc<RefCell<..>>`.
    fn new_rc_refcell() -> Rc<RefCell<Self>> {
        Rc::new(RefCell::new(Self::new()))
    }
    /// Append a child to the node.
    fn push_child(&mut self, child: Rc<RefCell<Self>>) {
        self.children.push(child);
    }
    /// Write the YAML representation of the node to `writer`.
    ///
    /// A leaf becomes the mapping `a: 1`; an inner node becomes one nested
    /// mapping per child, keyed by a generated identifier.
    fn write_to<W: std::io::Write>(&self, writer: &mut W, indent: usize) -> std::io::Result<()> {
        if self.children.is_empty() {
            write_n(writer, ' ', indent)?;
            return writer.write_all(b"a: 1\n");
        }
        for (idx, child) in self.children.iter().enumerate() {
            write_n(writer, ' ', indent)?;
            write_id_for_number(writer, idx)?;
            writer.write_all(b":\n")?;
            child.borrow().write_to(writer, indent + 2)?;
        }
        Ok(())
    }
}
/// Write `n` times `c` to `out`.
fn write_n<W: std::io::Write>(out: &mut W, c: char, n: usize) -> std::io::Result<()> {
    // Short-circuits on the first I/O error.
    (0..n).try_for_each(|_| write!(out, "{c}"))
}
/// Create a valid identifier for the given number.
///
/// The identifier is `n + 1` rendered in base 27 over the digit set `_a-z`,
/// least-significant digit first (0 -> "a", 25 -> "z", 26 -> "_a", ...).
fn write_id_for_number<W: std::io::Write>(out: &mut W, n: usize) -> std::io::Result<()> {
    const DIGITS: &[u8] = b"_abcdefghijklmnopqrstuvwxyz";
    let base = DIGITS.len();
    // Offset by one so that 0 maps to a non-empty identifier.
    let mut value = n + 1;
    let mut id = String::new();
    while value > 0 {
        id.push(DIGITS[value % base] as char);
        value /= base;
    }
    write!(out, "{id}")
}

View file

@ -1,71 +0,0 @@
#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
use std::{env, fs::File, io::prelude::*};
use yaml_rust2::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
/// A sink which discards any event sent.
///
/// Lets the benchmark measure parsing alone: events are produced and
/// immediately dropped, so no document tree is ever built.
struct NullSink {}
impl MarkedEventReceiver for NullSink {
    /// Discard the event and its position marker.
    fn on_event(&mut self, _: Event, _: Marker) {}
}
/// Parse the given input, returning elapsed time in nanoseconds.
///
/// Events are fed to a [`NullSink`] so only parsing work is timed.
fn do_parse(input: &str) -> u64 {
    let mut sink = NullSink {};
    let mut parser = Parser::new_from_str(input);
    let start = std::time::Instant::now();
    parser.load(&mut sink, true).unwrap();
    start.elapsed().as_nanos() as u64
}
/// Benchmark the parser over `args[1]`, running `args[2]` iterations.
///
/// With `--output-yaml` as the third argument, results are printed as YAML;
/// otherwise a short human-readable summary is printed.
fn main() {
    let args: Vec<_> = env::args().collect();
    let iterations: u64 = args[2].parse().unwrap();
    let output_yaml = args.len() == 4 && args[3] == "--output-yaml";

    let contents = {
        let mut f = File::open(&args[1]).unwrap();
        let mut s = String::new();
        f.read_to_string(&mut s).unwrap();
        s
    };

    // Warm up caches before taking measurements.
    for _ in 0..3 {
        do_parse(&contents);
    }

    // One sample per iteration; a sorted copy provides the order statistics.
    let times: Vec<u64> = (0..iterations).map(|_| do_parse(&contents)).collect();
    let mut sorted_times = times.clone();
    sorted_times.sort_unstable();

    let sum: u64 = times.iter().sum();
    let avg = sum / iterations;
    let min = sorted_times[0];
    let max = sorted_times[(iterations - 1) as usize];
    let percentile95 = sorted_times[((95 * iterations) / 100) as usize];

    if output_yaml {
        println!("parser: yaml-rust2");
        println!("input: {}", args[1]);
        println!("average: {avg}");
        println!("min: {min}");
        println!("max: {max}");
        println!("percentile95: {percentile95}");
        println!("iterations: {iterations}");
        println!("times:");
        for time in &times {
            println!("  - {time}");
        }
    } else {
        println!("Average: {}s", (avg as f64) / 1_000_000_000.0);
        println!("Min: {}s", (min as f64) / 1_000_000_000.0);
        println!("Max: {}s", (max as f64) / 1_000_000_000.0);
        println!("95%: {}s", (percentile95 as f64) / 1_000_000_000.0);
    }
}

View file

@ -1,36 +0,0 @@
use std::env;
use std::fs::File;
use std::io::prelude::*;
use yaml_rust2::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
/// A sink which discards any event sent.
///
/// Used so that only the parser's own work is timed; no tree is built.
struct NullSink {}
impl MarkedEventReceiver for NullSink {
    /// Discard the event and its position marker.
    fn on_event(&mut self, _: Event, _: Marker) {}
}
/// Time a single parse of the file named by `args[1]`.
///
/// With `--short` as the second argument, prints only the elapsed nanoseconds;
/// otherwise prints a human-readable summary.
fn main() {
    let args: Vec<_> = env::args().collect();
    let contents = {
        let mut file = File::open(&args[1]).unwrap();
        let mut buf = String::new();
        file.read_to_string(&mut buf).unwrap();
        buf
    };
    let mut sink = NullSink {};
    let mut parser = Parser::new_from_str(&contents);
    // Load events using our sink as the receiver.
    let start = std::time::Instant::now();
    parser.load(&mut sink, true).unwrap();
    let elapsed = start.elapsed();
    if args.len() == 3 && args[2] == "--short" {
        println!("{}", elapsed.as_nanos());
    } else {
        println!("Loaded {}MiB in {:?}", contents.len() / 1024 / 1024, elapsed);
    }
}