diff --git a/saphyr/.github/workflows/ci.yml b/saphyr/.github/workflows/ci.yml index aa3a0fc..57aa15e 100644 --- a/saphyr/.github/workflows/ci.yml +++ b/saphyr/.github/workflows/ci.yml @@ -31,7 +31,6 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 - - run: git submodule update --init - run: rustup toolchain install ${{ matrix.rust }} --profile minimal --no-self-update - uses: Swatinem/rust-cache@v2 - name: Run build diff --git a/saphyr/.gitmodules b/saphyr/.gitmodules deleted file mode 100644 index cbc1e88..0000000 --- a/saphyr/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "tests/yaml-test-suite"] - path = tests/yaml-test-suite - url = https://github.com/yaml/yaml-test-suite/ diff --git a/saphyr/Cargo.toml b/saphyr/Cargo.toml index 6eb3390..4b62419 100644 --- a/saphyr/Cargo.toml +++ b/saphyr/Cargo.toml @@ -1,49 +1,34 @@ [package] -name = "yaml-rust2" -version = "0.8.0" +name = "saphyr" +version = "0.0.1" authors = [ "Yuheng Chen ", "Ethiraric ", "David Aguilar " ] -documentation = "https://docs.rs/yaml-rust2" +documentation = "https://docs.rs/saphyr" +keywords = [ "yaml", "parser" ] +categories = [ "encoding", "parser-implementations" ] license = "MIT OR Apache-2.0" -description = "A fully YAML 1.2 compliant YAML parser" -repository = "https://github.com/Ethiraric/yaml-rust2" +description = "A fully YAML 1.2 compliant YAML library" +repository = "https://github.com/saphyr-rs/saphyr" readme = "README.md" edition = "2021" rust-version = "1.70.0" [features] default = [ "encoding" ] -debug_prints = [] encoding = [ "dep:encoding_rs" ] [dependencies] arraydeque = "0.5.1" +saphyr-parser = "0.0.1" encoding_rs = { version = "0.8.33", optional = true } hashlink = "0.8" [dev-dependencies] -libtest-mimic = "0.3.0" quickcheck = "1.0" [profile.release-lto] inherits = "release" lto = true - -[[test]] -name = "yaml-test-suite" -harness = false - -[[bin]] -name = "dump_events" -path = "tools/dump_events.rs" - -[[bin]] -name = "time_parse" -path = "tools/time_parse.rs" - -[[bin]] -name = "run_bench" -path = "tools/run_bench.rs" diff --git a/saphyr/README.md b/saphyr/README.md index d9f12ad..1b417b9 100644 --- a/saphyr/README.md +++ b/saphyr/README.md @@ -1,29 +1,34 @@ -# yaml-rust2 +# saphyr -[yaml-rust2](https://github.com/Ethiraric/yaml-rust2) is a fully compliant YAML 1.2 -implementation written in pure Rust. +[saphyr](https://github.com/saphyr-rs/saphyr) is a fully compliant YAML 1.2 +library written in pure Rust. This work is based on [`yaml-rust`](https://github.com/chyh1990/yaml-rust) with fixes towards being compliant to the [YAML test suite](https://github.com/yaml/yaml-test-suite/). `yaml-rust`'s parser is heavily influenced by `libyaml` and `yaml-cpp`. -`yaml-rust2` is a pure Rust YAML 1.2 implementation that benefits from the +`saphyr` is a pure Rust YAML 1.2 implementation that benefits from the memory safety and other benefits from the Rust language. ## Quick Start - -Add the following to the Cargo.toml of your project: +### Installing +Add the following to your Cargo.toml: ```toml [dependencies] -yaml-rust2 = "0.8" +saphyr = "0.0.1" +``` +or use `cargo add` to get the latest version automatically: +```sh +cargo add saphyr ``` -Use `yaml_rust2::YamlLoader` to load YAML documents and access them as `Yaml` objects: +### Example +Use `saphyr::YamlLoader` to load YAML documents and access them as `Yaml` objects: ```rust -use yaml_rust2::{YamlLoader, YamlEmitter}; +use saphyr::{YamlLoader, YamlEmitter}; fn main() { let s = @@ -61,7 +66,7 @@ bar: } ``` -Note that `yaml_rust2::Yaml` implements `Index<&'a str>` and `Index`: +Note that `saphyr::Yaml` implements `Index<&'a str>` and `Index`: * `Index` assumes the container is an array * `Index<&'a str>` assumes the container is a string to value map @@ -75,7 +80,6 @@ your objects. * Pure Rust * `Vec`/`HashMap` access API -* Low-level YAML events emission ## Security @@ -85,24 +89,10 @@ communicating with the outside world just by parsing a YAML document. ## Specification Compliance -This implementation is fully compatible with the YAML 1.2 specification. In -order to help with compliance, `yaml-rust2` tests against (and passes) the [YAML -test suite](https://github.com/yaml/yaml-test-suite/). - -## Upgrading from yaml-rust - -You can use `yaml-rust2` as a drop-in replacement for the original `yaml-rust` crate. - -```toml -[dependencies] -yaml-rust = { version = "#.#", package = "yaml-rust2" } -``` - -This `Cargo.toml` declaration allows you to refer to this crate as `yaml_rust` in your code. - -```rust -use yaml_rust::{YamlLoader, YamlEmitter}; -``` +This implementation is fully compatible with the YAML 1.2 specification. The +parser behind this library +([`saphyr-parser`](https://github.com/saphyr-rs/saphyr-parser)) tests against +(and passes) the [YAML test suite](https://github.com/yaml/yaml-test-suite/). ## License @@ -122,10 +112,9 @@ You can find licences in the [`.licenses`](.licenses) subfolder. ## Contribution -[Fork this repository](https://github.com/Ethiraric/yaml-rust2/fork) and -[Create a Pull Request on Github](https://github.com/Ethiraric/yaml-rust2/compare/master...Ethiraric:yaml-rust2:master). +[Fork this repository](https://github.com/saphyr-rs/saphyr/fork) and +[Create a Pull Request on Github](https://github.com/saphyr-rs/saphyr/compare/master...saphyr-rs:saphyr:master). You may need to click on "compare across forks" and select your fork's branch. -Make sure that `Ethiraric` is selected as the base repository, not `chyh1990`. Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall @@ -133,10 +122,12 @@ be dual licensed as above, without any additional terms or conditions. ## Links -* [yaml-rust2 source code repository](https://github.com/Ethiraric/yaml-rust2) +* [saphyr source code repository](https://github.com/saphyr-rs/saphyr) -* [yaml-rust2 releases on crates.io](https://crates.io/crates/yaml-rust2) +* [saphyr releases on crates.io](https://crates.io/crates/saphyr) -* [yaml-rust2 documentation on docs.rs](https://docs.rs/yaml-rust2/latest/yaml_rust2/) +* [saphyr documentation on docs.rs](https://docs.rs/saphyr/latest/saphyr/) + +* [saphyr-parser releases on crates.io](https://crates.io/crates/saphyr-parser) * [yaml-test-suite](https://github.com/yaml/yaml-test-suite) diff --git a/saphyr/examples/dump_yaml.rs b/saphyr/examples/dump_yaml.rs index 1c3c452..1a9f0f5 100644 --- a/saphyr/examples/dump_yaml.rs +++ b/saphyr/examples/dump_yaml.rs @@ -1,7 +1,7 @@ +use saphyr::yaml; use std::env; use std::fs::File; use std::io::prelude::*; -use yaml_rust2::yaml; fn print_indent(indent: usize) { for _ in 0..indent { diff --git a/saphyr/justfile b/saphyr/justfile index f33ee69..47d601c 100644 --- a/saphyr/justfile +++ b/saphyr/justfile @@ -1,4 +1,5 @@ before_commit: + cargo fmt --check cargo clippy --release --all-targets -- -D warnings cargo clippy --all-targets -- -D warnings cargo build --release --all-targets @@ -6,12 +7,4 @@ before_commit: cargo test cargo test --release cargo test --doc - cargo build --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml RUSTDOCFLAGS="-D warnings" cargo doc --all-features - -ethi_bench: - cargo build --release --all-targets - cd ../Yaml-rust && cargo build --release --all-targets - cd ../serde-yaml/ && cargo build --release --all-targets - cd ../libfyaml/build && ninja - cargo bench_compare run_bench diff --git a/saphyr/src/char_traits.rs b/saphyr/src/char_traits.rs index 82f81bd..c54aff0 100644 --- a/saphyr/src/char_traits.rs +++ b/saphyr/src/char_traits.rs @@ -1,115 +1,5 @@ //! Holds functions to determine if a character belongs to a specific character set. -/// Check whether the character is nil (`\0`). -#[inline] -pub(crate) fn is_z(c: char) -> bool { - c == '\0' -} - -/// Check whether the character is a line break (`\r` or `\n`). -#[inline] -pub(crate) fn is_break(c: char) -> bool { - c == '\n' || c == '\r' -} - -/// Check whether the character is nil or a line break (`\0`, `\r`, `\n`). -#[inline] -pub(crate) fn is_breakz(c: char) -> bool { - is_break(c) || is_z(c) -} - -/// Check whether the character is a whitespace (` ` or `\t`). -#[inline] -pub(crate) fn is_blank(c: char) -> bool { - c == ' ' || c == '\t' -} - -/// Check whether the character is nil, a linebreak or a whitespace. -/// -/// `\0`, ` `, `\t`, `\n`, `\r` -#[inline] -pub(crate) fn is_blank_or_breakz(c: char) -> bool { - is_blank(c) || is_breakz(c) -} - -/// Check whether the character is an ascii digit. -#[inline] -pub(crate) fn is_digit(c: char) -> bool { - c.is_ascii_digit() -} - -/// Check whether the character is a digit, letter, `_` or `-`. -#[inline] -pub(crate) fn is_alpha(c: char) -> bool { - matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' | '-') -} - -/// Check whether the character is a hexadecimal character (case insensitive). -#[inline] -pub(crate) fn is_hex(c: char) -> bool { - c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c) -} - -/// Convert the hexadecimal digit to an integer. -#[inline] -pub(crate) fn as_hex(c: char) -> u32 { - match c { - '0'..='9' => (c as u32) - ('0' as u32), - 'a'..='f' => (c as u32) - ('a' as u32) + 10, - 'A'..='F' => (c as u32) - ('A' as u32) + 10, - _ => unreachable!(), - } -} - -/// Check whether the character is a YAML flow character (one of `,[]{}`). -#[inline] -pub(crate) fn is_flow(c: char) -> bool { - matches!(c, ',' | '[' | ']' | '{' | '}') -} - -/// Check whether the character is the BOM character. -#[inline] -pub(crate) fn is_bom(c: char) -> bool { - c == '\u{FEFF}' -} - -/// Check whether the character is a YAML non-breaking character. -#[inline] -pub(crate) fn is_yaml_non_break(c: char) -> bool { - // TODO(ethiraric, 28/12/2023): is_printable - !is_break(c) && !is_bom(c) -} - -/// Check whether the character is NOT a YAML whitespace (` ` / `\t`). -#[inline] -pub(crate) fn is_yaml_non_space(c: char) -> bool { - is_yaml_non_break(c) && !is_blank(c) -} - -/// Check whether the character is a valid YAML anchor name character. -#[inline] -pub(crate) fn is_anchor_char(c: char) -> bool { - is_yaml_non_space(c) && !is_flow(c) && !is_z(c) -} - -/// Check whether the character is a valid word character. -#[inline] -pub(crate) fn is_word_char(c: char) -> bool { - is_alpha(c) && c != '_' -} - -/// Check whether the character is a valid URI character. -#[inline] -pub(crate) fn is_uri_char(c: char) -> bool { - is_word_char(c) || "#;/?:@&=+$,_.!~*\'()[]%".contains(c) -} - -/// Check whether the character is a valid tag character. -#[inline] -pub(crate) fn is_tag_char(c: char) -> bool { - is_uri_char(c) && !is_flow(c) && c != '!' -} - /// Check if the string can be expressed a valid literal block scalar. /// The YAML spec supports all of the following in block literals except `#xFEFF`: /// ```no_compile diff --git a/saphyr/src/debug.rs b/saphyr/src/debug.rs deleted file mode 100644 index c1411cb..0000000 --- a/saphyr/src/debug.rs +++ /dev/null @@ -1,41 +0,0 @@ -//! Debugging helpers. -//! -//! Debugging is governed by two conditions: -//! 1. The build mode. Debugging code is not emitted in release builds and thus not available. -//! 2. The `YAMLALL_DEBUG` environment variable. If built in debug mode, the program must be fed -//! the `YAMLALL_DEBUG` variable in its environment. While debugging code is present in debug -//! build, debug helpers will only trigger if that variable is set when running the program. - -// If a debug build, use stuff in the debug submodule. -#[cfg(feature = "debug_prints")] -pub use debug::enabled; - -// Otherwise, just export dummies for publicly visible functions. -/// Evaluates to nothing. -#[cfg(not(feature = "debug_prints"))] -macro_rules! debug_print { - ($($arg:tt)*) => {{}}; -} - -#[cfg(feature = "debug_prints")] -#[macro_use] -#[allow(clippy::module_inception)] -mod debug { - use std::sync::OnceLock; - - /// If debugging is [`enabled`], print the format string on the error output. - macro_rules! debug_print { - ($($arg:tt)*) => {{ - if $crate::debug::enabled() { - eprintln!($($arg)*) - } - }}; - } - - /// Return whether debugging features are enabled in this execution. - #[cfg(debug_assertions)] - pub fn enabled() -> bool { - static ENABLED: OnceLock = OnceLock::new(); - *ENABLED.get_or_init(|| std::env::var("YAMLRUST2_DEBUG").is_ok()) - } -} diff --git a/saphyr/src/emitter.rs b/saphyr/src/emitter.rs index 48c8b5c..19d8d4a 100644 --- a/saphyr/src/emitter.rs +++ b/saphyr/src/emitter.rs @@ -36,7 +36,7 @@ impl From for EmitError { /// The YAML serializer. /// /// ``` -/// # use yaml_rust2::{YamlLoader, YamlEmitter}; +/// # use saphyr::{YamlLoader, YamlEmitter}; /// let input_string = "a: b\nc: d"; /// let yaml = YamlLoader::load_from_str(input_string).unwrap(); /// @@ -142,6 +142,8 @@ impl<'a> YamlEmitter<'a> { /// In this form, blocks cannot have any properties (such as anchors /// or tags), which should be OK, because this emitter doesn't /// (currently) emit those anyways. + /// + /// TODO(ethiraric, 2024/04/02): We can support those now. pub fn compact(&mut self, compact: bool) { self.compact = compact; } @@ -157,7 +159,7 @@ impl<'a> YamlEmitter<'a> { /// # Examples /// /// ```rust - /// use yaml_rust2::{Yaml, YamlEmitter, YamlLoader}; + /// use saphyr::{Yaml, YamlEmitter, YamlLoader}; /// /// let input = r#"{foo: "bar!\nbar!", baz: 42}"#; /// let parsed = YamlLoader::load_from_str(input).unwrap(); diff --git a/saphyr/src/lib.rs b/saphyr/src/lib.rs index d430177..aaed759 100644 --- a/saphyr/src/lib.rs +++ b/saphyr/src/lib.rs @@ -6,19 +6,22 @@ //! //! # Usage //! -//! This crate is [on github](https://github.com/Ethiraric/yaml-rust2) and can be used by adding -//! `yaml-rust2` to the dependencies in your project's `Cargo.toml`. -//! +//! This crate is [on github](https://github.com/saphyr-rs/saphyr) and can be used by adding +//! `saphyr` to the dependencies in your project's `Cargo.toml`. //! ```toml //! [dependencies] -//! yaml-rust2 = "0.8.0" +//! saphyr = "0.0.1" +//! ``` +//! or by using `cargo add` to get the latest version: +//! ```sh +//! cargo add saphyr //! ``` //! //! # Examples //! Parse a string into `Vec` and then serialize it as a YAML string. //! //! ``` -//! use yaml_rust2::{YamlLoader, YamlEmitter}; +//! use saphyr::{YamlLoader, YamlEmitter}; //! //! let docs = YamlLoader::load_from_str("[1, 2, 3]").unwrap(); //! let doc = &docs[0]; // select the first YAML document @@ -37,28 +40,20 @@ //! Enables encoding-aware decoding of Yaml documents. //! //! The MSRV for this feature is `1.70.0`. -//! -//! #### `debug_prints` -//! Enables the `debug` module and usage of debug prints in the scanner and the parser. Do not -//! enable if you are consuming the crate rather than working on it as this can significantly -//! decrease performance. -//! -//! The MSRV for this feature is `1.70.0`. #![warn(missing_docs, clippy::pedantic)] -extern crate hashlink; - pub(crate) mod char_traits; -#[macro_use] -pub(crate) mod debug; pub mod emitter; -pub mod parser; -pub mod scanner; pub mod yaml; -// reexport key APIs -pub use crate::emitter::{EmitError, YamlEmitter}; -pub use crate::parser::Event; -pub use crate::scanner::ScanError; -pub use crate::yaml::{Yaml, YamlLoader}; +// Re-export main components. +pub use crate::emitter::YamlEmitter; +pub use crate::yaml::{Array, Hash, Yaml, YamlLoader}; + +#[cfg(feature = "encoding")] +pub use crate::yaml::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder}; + +// Re-export `ScanError` as it is used as part of our public API and we want consumers to be able +// to inspect it (e.g. perform a `match`). They wouldn't be able without it. +pub use saphyr_parser::ScanError; diff --git a/saphyr/src/parser.rs b/saphyr/src/parser.rs deleted file mode 100644 index 59869a2..0000000 --- a/saphyr/src/parser.rs +++ /dev/null @@ -1,1143 +0,0 @@ -//! Home to the YAML Parser. -//! -//! The parser takes input from the [`crate::scanner::Scanner`], performs final checks for YAML -//! compliance, and emits a stream of tokens that can be used by the [`crate::YamlLoader`] to -//! construct the [`crate::Yaml`] object. - -use crate::scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType}; -use std::collections::HashMap; - -#[derive(Clone, Copy, PartialEq, Debug, Eq)] -enum State { - /// We await the start of the stream. - StreamStart, - ImplicitDocumentStart, - DocumentStart, - DocumentContent, - DocumentEnd, - BlockNode, - // BlockNodeOrIndentlessSequence, - // FlowNode, - BlockSequenceFirstEntry, - BlockSequenceEntry, - IndentlessSequenceEntry, - BlockMappingFirstKey, - BlockMappingKey, - BlockMappingValue, - FlowSequenceFirstEntry, - FlowSequenceEntry, - FlowSequenceEntryMappingKey, - FlowSequenceEntryMappingValue, - FlowSequenceEntryMappingEnd, - FlowMappingFirstKey, - FlowMappingKey, - FlowMappingValue, - FlowMappingEmptyValue, - End, -} - -/// An event generated by the YAML parser. -/// -/// Events are used in the low-level event-based API (push parser). The API entrypoint is the -/// [`EventReceiver`] trait. -#[derive(Clone, PartialEq, Debug, Eq)] -pub enum Event { - /// Reserved for internal use. - Nothing, - /// Event generated at the very beginning of parsing. - StreamStart, - /// Last event that will be generated by the parser. Signals EOF. - StreamEnd, - /// The YAML start document directive (`---`). - DocumentStart, - /// The YAML end document directive (`...`). - DocumentEnd, - /// A YAML Alias. - Alias( - /// The anchor ID the alias refers to. - usize, - ), - /// Value, style, anchor_id, tag - Scalar(String, TScalarStyle, usize, Option), - /// The start of a YAML sequence (array). - SequenceStart( - /// The anchor ID of the start of the sequence. - usize, - /// An optional tag - Option, - ), - /// The end of a YAML sequence (array). - SequenceEnd, - /// The start of a YAML mapping (object, hash). - MappingStart( - /// The anchor ID of the start of the mapping. - usize, - /// An optional tag - Option, - ), - /// The end of a YAML mapping (object, hash). - MappingEnd, -} - -/// A YAML tag. -#[derive(Clone, PartialEq, Debug, Eq)] -pub struct Tag { - /// Handle of the tag (`!` included). - pub handle: String, - /// The suffix of the tag. - pub suffix: String, -} - -impl Event { - /// Create an empty scalar. - fn empty_scalar() -> Event { - // a null scalar - Event::Scalar("~".to_owned(), TScalarStyle::Plain, 0, None) - } - - /// Create an empty scalar with the given anchor. - fn empty_scalar_with_anchor(anchor: usize, tag: Option) -> Event { - Event::Scalar(String::new(), TScalarStyle::Plain, anchor, tag) - } -} - -/// A YAML parser. -#[derive(Debug)] -pub struct Parser { - scanner: Scanner, - states: Vec, - state: State, - token: Option, - current: Option<(Event, Marker)>, - anchors: HashMap, - anchor_id: usize, - /// The tag directives (`%TAG`) the parser has encountered. - /// - /// Key is the handle, and value is the prefix. - tags: HashMap, - /// Make tags global across all documents. - keep_tags: bool, -} - -/// Trait to be implemented in order to use the low-level parsing API. -/// -/// The low-level parsing API is event-based (a push parser), calling [`EventReceiver::on_event`] -/// for each YAML [`Event`] that occurs. -/// The [`EventReceiver`] trait only receives events. In order to receive both events and their -/// location in the source, use [`MarkedEventReceiver`]. Note that [`EventReceiver`]s implement -/// [`MarkedEventReceiver`] automatically. -/// -/// # Event hierarchy -/// The event stream starts with an [`Event::StreamStart`] event followed by an -/// [`Event::DocumentStart`] event. If the YAML document starts with a mapping (an object), an -/// [`Event::MappingStart`] event is emitted. If it starts with a sequence (an array), an -/// [`Event::SequenceStart`] event is emitted. Otherwise, an [`Event::Scalar`] event is emitted. -/// -/// In a mapping, key-values are sent as consecutive events. The first event after an -/// [`Event::MappingStart`] will be the key, and following its value. If the mapping contains no -/// sub-mapping or sub-sequence, then even events (starting from 0) will always be keys and odd -/// ones will always be values. The mapping ends when an [`Event::MappingEnd`] event is received. -/// -/// In a sequence, values are sent consecutively until the [`Event::SequenceEnd`] event. -/// -/// If a value is a sub-mapping or a sub-sequence, an [`Event::MappingStart`] or -/// [`Event::SequenceStart`] event will be sent respectively. Following events until the associated -/// [`Event::MappingStart`] or [`Event::SequenceEnd`] (beware of nested mappings or sequences) will -/// be part of the value and not another key-value pair or element in the sequence. -/// -/// For instance, the following yaml: -/// ```yaml -/// a: b -/// c: -/// d: e -/// f: -/// - g -/// - h -/// ``` -/// will emit (indented and commented for lisibility): -/// ```text -/// StreamStart, DocumentStart, MappingStart, -/// Scalar("a", ..), Scalar("b", ..) -/// Scalar("c", ..), MappingStart, Scalar("d", ..), Scalar("e", ..), MappingEnd, -/// Scalar("f", ..), SequenceStart, Scalar("g", ..), Scalar("h", ..), SequenceEnd, -/// MappingEnd, DocumentEnd, StreamEnd -/// ``` -/// -/// # Example -/// ``` -/// # use yaml_rust2::parser::{Event, EventReceiver, Parser}; -/// # -/// /// Sink of events. Collects them into an array. -/// struct EventSink { -/// events: Vec, -/// } -/// -/// /// Implement `on_event`, pushing into `self.events`. -/// impl EventReceiver for EventSink { -/// fn on_event(&mut self, ev: Event) { -/// self.events.push(ev); -/// } -/// } -/// -/// /// Load events from a yaml string. -/// fn str_to_events(yaml: &str) -> Vec { -/// let mut sink = EventSink { events: Vec::new() }; -/// let mut parser = Parser::new_from_str(yaml); -/// // Load events using our sink as the receiver. -/// parser.load(&mut sink, true).unwrap(); -/// sink.events -/// } -/// ``` -pub trait EventReceiver { - /// Handler called for each YAML event that is emitted by the parser. - fn on_event(&mut self, ev: Event); -} - -/// Trait to be implemented for using the low-level parsing API. -/// -/// Functionally similar to [`EventReceiver`], but receives a [`Marker`] as well as the event. -pub trait MarkedEventReceiver { - /// Handler called for each event that occurs. - fn on_event(&mut self, ev: Event, _mark: Marker); -} - -impl MarkedEventReceiver for R { - fn on_event(&mut self, ev: Event, _mark: Marker) { - self.on_event(ev); - } -} - -/// A convenience alias for a `Result` of a parser event. -pub type ParseResult = Result<(Event, Marker), ScanError>; - -impl<'a> Parser> { - /// Create a new instance of a parser from a &str. - #[must_use] - pub fn new_from_str(value: &'a str) -> Self { - Parser::new(value.chars()) - } -} - -impl> Parser { - /// Create a new instance of a parser from the given input of characters. - pub fn new(src: T) -> Parser { - Parser { - scanner: Scanner::new(src), - states: Vec::new(), - state: State::StreamStart, - token: None, - current: None, - - anchors: HashMap::new(), - // valid anchor_id starts from 1 - anchor_id: 1, - tags: HashMap::new(), - keep_tags: false, - } - } - - /// Whether to keep tags across multiple documents when parsing. - /// - /// This behavior is non-standard as per the YAML specification but can be encountered in the - /// wild. This boolean allows enabling this non-standard extension. This would result in the - /// parser accepting input from [test - /// QLJ7](https://github.com/yaml/yaml-test-suite/blob/ccfa74e56afb53da960847ff6e6976c0a0825709/src/QLJ7.yaml) - /// of the yaml-test-suite: - /// - /// ```yaml - /// %TAG !prefix! tag:example.com,2011: - /// --- !prefix!A - /// a: b - /// --- !prefix!B - /// c: d - /// --- !prefix!C - /// e: f - /// ``` - /// - /// With `keep_tags` set to `false`, the above YAML is rejected. As per the specification, tags - /// only apply to the document immediately following them. This would error on `!prefix!B`. - /// - /// With `keep_tags` set to `true`, the above YAML is accepted by the parser. - #[must_use] - pub fn keep_tags(mut self, value: bool) -> Self { - self.keep_tags = value; - self - } - - /// Try to load the next event and return it, but do not consuming it from `self`. - /// - /// Any subsequent call to [`Parser::peek`] will return the same value, until a call to - /// [`Iterator::next`] or [`Parser::load`]. - /// # Errors - /// Returns `ScanError` when loading the next event fails. - pub fn peek(&mut self) -> Result<&(Event, Marker), ScanError> { - if let Some(ref x) = self.current { - Ok(x) - } else { - self.current = Some(self.next_token()?); - self.peek() - } - } - - /// Try to load the next event and return it, consuming it from `self`. - /// # Errors - /// Returns `ScanError` when loading the next event fails. - pub fn next_token(&mut self) -> ParseResult { - match self.current.take() { - None => self.parse(), - Some(v) => Ok(v), - } - } - - /// Peek at the next token from the scanner. - fn peek_token(&mut self) -> Result<&Token, ScanError> { - match self.token { - None => { - self.token = Some(self.scan_next_token()?); - Ok(self.token.as_ref().unwrap()) - } - Some(ref tok) => Ok(tok), - } - } - - /// Extract and return the next token from the scanner. - /// - /// This function does _not_ make use of `self.token`. - fn scan_next_token(&mut self) -> Result { - let token = self.scanner.next(); - match token { - None => match self.scanner.get_error() { - None => Err(ScanError::new(self.scanner.mark(), "unexpected eof")), - Some(e) => Err(e), - }, - Some(tok) => Ok(tok), - } - } - - fn fetch_token(&mut self) -> Token { - self.token - .take() - .expect("fetch_token needs to be preceded by peek_token") - } - - /// Skip the next token from the scanner. - fn skip(&mut self) { - self.token = None; - //self.peek_token(); - } - /// Pops the top-most state and make it the current state. - fn pop_state(&mut self) { - self.state = self.states.pop().unwrap(); - } - /// Push a new state atop the state stack. - fn push_state(&mut self, state: State) { - self.states.push(state); - } - - fn parse(&mut self) -> ParseResult { - if self.state == State::End { - return Ok((Event::StreamEnd, self.scanner.mark())); - } - let (ev, mark) = self.state_machine()?; - // println!("EV {:?}", ev); - Ok((ev, mark)) - } - - /// Load the YAML from the stream in `self`, pushing events into `recv`. - /// - /// The contents of the stream are parsed and the corresponding events are sent into the - /// recveiver. For detailed explanations about how events work, see [`EventReceiver`]. - /// - /// If `multi` is set to `true`, the parser will allow parsing of multiple YAML documents - /// inside the stream. - /// - /// Note that any [`EventReceiver`] is also a [`MarkedEventReceiver`], so implementing the - /// former is enough to call this function. - /// # Errors - /// Returns `ScanError` when loading fails. - pub fn load( - &mut self, - recv: &mut R, - multi: bool, - ) -> Result<(), ScanError> { - if !self.scanner.stream_started() { - let (ev, mark) = self.next_token()?; - if ev != Event::StreamStart { - return Err(ScanError::new(mark, "did not find expected ")); - } - recv.on_event(ev, mark); - } - - if self.scanner.stream_ended() { - // XXX has parsed? - recv.on_event(Event::StreamEnd, self.scanner.mark()); - return Ok(()); - } - loop { - let (ev, mark) = self.next_token()?; - if ev == Event::StreamEnd { - recv.on_event(ev, mark); - return Ok(()); - } - // clear anchors before a new document - self.anchors.clear(); - self.load_document(ev, mark, recv)?; - if !multi { - break; - } - } - Ok(()) - } - - fn load_document( - &mut self, - first_ev: Event, - mark: Marker, - recv: &mut R, - ) -> Result<(), ScanError> { - if first_ev != Event::DocumentStart { - return Err(ScanError::new( - mark, - "did not find expected ", - )); - } - recv.on_event(first_ev, mark); - - let (ev, mark) = self.next_token()?; - self.load_node(ev, mark, recv)?; - - // DOCUMENT-END is expected. - let (ev, mark) = self.next_token()?; - assert_eq!(ev, Event::DocumentEnd); - recv.on_event(ev, mark); - - Ok(()) - } - - fn load_node( - &mut self, - first_ev: Event, - mark: Marker, - recv: &mut R, - ) -> Result<(), ScanError> { - match first_ev { - Event::Alias(..) | Event::Scalar(..) => { - recv.on_event(first_ev, mark); - Ok(()) - } - Event::SequenceStart(..) => { - recv.on_event(first_ev, mark); - self.load_sequence(recv) - } - Event::MappingStart(..) => { - recv.on_event(first_ev, mark); - self.load_mapping(recv) - } - _ => { - println!("UNREACHABLE EVENT: {first_ev:?}"); - unreachable!(); - } - } - } - - fn load_mapping(&mut self, recv: &mut R) -> Result<(), ScanError> { - let (mut key_ev, mut key_mark) = self.next_token()?; - while key_ev != Event::MappingEnd { - // key - self.load_node(key_ev, key_mark, recv)?; - - // value - let (ev, mark) = self.next_token()?; - self.load_node(ev, mark, recv)?; - - // next event - let (ev, mark) = self.next_token()?; - key_ev = ev; - key_mark = mark; - } - recv.on_event(key_ev, key_mark); - Ok(()) - } - - fn load_sequence(&mut self, recv: &mut R) -> Result<(), ScanError> { - let (mut ev, mut mark) = self.next_token()?; - while ev != Event::SequenceEnd { - self.load_node(ev, mark, recv)?; - - // next event - let (next_ev, next_mark) = self.next_token()?; - ev = next_ev; - mark = next_mark; - } - recv.on_event(ev, mark); - Ok(()) - } - - fn state_machine(&mut self) -> ParseResult { - // let next_tok = self.peek_token().cloned()?; - // println!("cur_state {:?}, next tok: {:?}", self.state, next_tok); - debug_print!("\n\x1B[;33mParser state: {:?} \x1B[;0m", self.state); - - match self.state { - State::StreamStart => self.stream_start(), - - State::ImplicitDocumentStart => self.document_start(true), - State::DocumentStart => self.document_start(false), - State::DocumentContent => self.document_content(), - State::DocumentEnd => self.document_end(), - - State::BlockNode => self.parse_node(true, false), - // State::BlockNodeOrIndentlessSequence => self.parse_node(true, true), - // State::FlowNode => self.parse_node(false, false), - State::BlockMappingFirstKey => self.block_mapping_key(true), - State::BlockMappingKey => self.block_mapping_key(false), - State::BlockMappingValue => self.block_mapping_value(), - - State::BlockSequenceFirstEntry => self.block_sequence_entry(true), - State::BlockSequenceEntry => self.block_sequence_entry(false), - - State::FlowSequenceFirstEntry => self.flow_sequence_entry(true), - State::FlowSequenceEntry => self.flow_sequence_entry(false), - - State::FlowMappingFirstKey => self.flow_mapping_key(true), - State::FlowMappingKey => self.flow_mapping_key(false), - State::FlowMappingValue => self.flow_mapping_value(false), - - State::IndentlessSequenceEntry => self.indentless_sequence_entry(), - - State::FlowSequenceEntryMappingKey => self.flow_sequence_entry_mapping_key(), - State::FlowSequenceEntryMappingValue => self.flow_sequence_entry_mapping_value(), - State::FlowSequenceEntryMappingEnd => self.flow_sequence_entry_mapping_end(), - State::FlowMappingEmptyValue => self.flow_mapping_value(true), - - /* impossible */ - State::End => unreachable!(), - } - } - - fn stream_start(&mut self) -> ParseResult { - match *self.peek_token()? { - Token(mark, TokenType::StreamStart(_)) => { - self.state = State::ImplicitDocumentStart; - self.skip(); - Ok((Event::StreamStart, mark)) - } - Token(mark, _) => Err(ScanError::new(mark, "did not find expected ")), - } - } - - fn document_start(&mut self, implicit: bool) -> ParseResult { - while let TokenType::DocumentEnd = self.peek_token()?.1 { - self.skip(); - } - - match *self.peek_token()? { - Token(mark, TokenType::StreamEnd) => { - self.state = State::End; - self.skip(); - Ok((Event::StreamEnd, mark)) - } - Token( - _, - TokenType::VersionDirective(..) - | TokenType::TagDirective(..) - | TokenType::DocumentStart, - ) => { - // explicit document - self.explicit_document_start() - } - Token(mark, _) if implicit => { - self.parser_process_directives()?; - self.push_state(State::DocumentEnd); - self.state = State::BlockNode; - Ok((Event::DocumentStart, mark)) - } - _ => { - // explicit document - self.explicit_document_start() - } - } - } - - fn parser_process_directives(&mut self) -> Result<(), ScanError> { - let mut version_directive_received = false; - loop { - let mut tags = HashMap::new(); - match self.peek_token()? { - Token(mark, TokenType::VersionDirective(_, _)) => { - // XXX parsing with warning according to spec - //if major != 1 || minor > 2 { - // return Err(ScanError::new(tok.0, - // "found incompatible YAML document")); - //} - if version_directive_received { - return Err(ScanError::new(*mark, "duplicate version directive")); - } - version_directive_received = true; - } - Token(mark, TokenType::TagDirective(handle, prefix)) => { - if tags.contains_key(handle) { - return Err(ScanError::new(*mark, "the TAG directive must only be given at most once per handle in the same document")); - } - tags.insert(handle.to_string(), prefix.to_string()); - } - _ => break, - } - self.tags = tags; - self.skip(); - } - Ok(()) - } - - fn explicit_document_start(&mut self) -> ParseResult { - self.parser_process_directives()?; - match *self.peek_token()? { - Token(mark, TokenType::DocumentStart) => { - self.push_state(State::DocumentEnd); - self.state = State::DocumentContent; - self.skip(); - Ok((Event::DocumentStart, mark)) - } - Token(mark, _) => Err(ScanError::new( - mark, - "did not find expected ", - )), - } - } - - fn document_content(&mut self) -> ParseResult { - match *self.peek_token()? { - Token( - mark, - TokenType::VersionDirective(..) - | TokenType::TagDirective(..) - | TokenType::DocumentStart - | TokenType::DocumentEnd - | TokenType::StreamEnd, - ) => { - self.pop_state(); - // empty scalar - Ok((Event::empty_scalar(), mark)) - } - _ => self.parse_node(true, false), - } - } - - fn document_end(&mut self) -> ParseResult { - let mut explicit_end = false; - let marker: Marker = match *self.peek_token()? { - Token(mark, TokenType::DocumentEnd) => { - explicit_end = true; - self.skip(); - mark - } - Token(mark, _) => mark, - }; - - if !self.keep_tags { - self.tags.clear(); - } - if explicit_end { - self.state = State::ImplicitDocumentStart; - } else { - if let Token(mark, TokenType::VersionDirective(..) | TokenType::TagDirective(..)) = - *self.peek_token()? - { - return Err(ScanError::new( - mark, - "missing explicit document end marker before directive", - )); - } - self.state = State::DocumentStart; - } - - Ok((Event::DocumentEnd, marker)) - } - - fn register_anchor(&mut self, name: String, _: &Marker) -> usize { - // anchors can be overridden/reused - // if self.anchors.contains_key(name) { - // return Err(ScanError::new(*mark, - // "while parsing anchor, found duplicated anchor")); - // } - let new_id = self.anchor_id; - self.anchor_id += 1; - self.anchors.insert(name, new_id); - new_id - } - - fn parse_node(&mut self, block: bool, indentless_sequence: bool) -> ParseResult { - let mut anchor_id = 0; - let mut tag = None; - match *self.peek_token()? { - Token(_, TokenType::Alias(_)) => { - self.pop_state(); - if let Token(mark, TokenType::Alias(name)) = self.fetch_token() { - match self.anchors.get(&name) { - None => { - return Err(ScanError::new( - mark, - "while parsing node, found unknown anchor", - )) - } - Some(id) => return Ok((Event::Alias(*id), mark)), - } - } - unreachable!() - } - Token(_, TokenType::Anchor(_)) => { - if let Token(mark, TokenType::Anchor(name)) = self.fetch_token() { - anchor_id = self.register_anchor(name, &mark); - if let TokenType::Tag(..) = self.peek_token()?.1 { - if let TokenType::Tag(handle, suffix) = self.fetch_token().1 { - tag = Some(self.resolve_tag(mark, &handle, suffix)?); - } else { - unreachable!() - } - } - } else { - unreachable!() - } - } - Token(mark, TokenType::Tag(..)) => { - if let TokenType::Tag(handle, suffix) = self.fetch_token().1 { - tag = Some(self.resolve_tag(mark, &handle, suffix)?); - if let TokenType::Anchor(_) = &self.peek_token()?.1 { - if let Token(mark, TokenType::Anchor(name)) = self.fetch_token() { - anchor_id = self.register_anchor(name, &mark); - } else { - unreachable!() - } - } - } else { - unreachable!() - } - } - _ => {} - } - match *self.peek_token()? { - Token(mark, TokenType::BlockEntry) if indentless_sequence => { - self.state = State::IndentlessSequenceEntry; - Ok((Event::SequenceStart(anchor_id, tag), mark)) - } - Token(_, TokenType::Scalar(..)) => { - self.pop_state(); - if let Token(mark, TokenType::Scalar(style, v)) = self.fetch_token() { - Ok((Event::Scalar(v, style, anchor_id, tag), mark)) - } else { - unreachable!() - } - } - Token(mark, TokenType::FlowSequenceStart) => { - self.state = State::FlowSequenceFirstEntry; - Ok((Event::SequenceStart(anchor_id, tag), mark)) - } - Token(mark, TokenType::FlowMappingStart) => { - self.state = State::FlowMappingFirstKey; - Ok((Event::MappingStart(anchor_id, tag), mark)) - } - Token(mark, TokenType::BlockSequenceStart) if block => { - self.state = State::BlockSequenceFirstEntry; - Ok((Event::SequenceStart(anchor_id, tag), mark)) - } - Token(mark, TokenType::BlockMappingStart) if block => { - self.state = State::BlockMappingFirstKey; - Ok((Event::MappingStart(anchor_id, tag), mark)) - } - // ex 7.2, an empty scalar can follow a secondary tag - Token(mark, _) if tag.is_some() || anchor_id > 0 => { - self.pop_state(); - Ok((Event::empty_scalar_with_anchor(anchor_id, tag), mark)) - } - Token(mark, _) => Err(ScanError::new( - mark, - "while parsing a node, did not find expected node content", - )), - } - } - - fn block_mapping_key(&mut self, first: bool) -> ParseResult { - // skip BlockMappingStart - if first { - let _ = self.peek_token()?; - //self.marks.push(tok.0); - self.skip(); - } - match *self.peek_token()? { - Token(_, TokenType::Key) => { - self.skip(); - if let Token(mark, TokenType::Key | TokenType::Value | TokenType::BlockEnd) = - *self.peek_token()? - { - self.state = State::BlockMappingValue; - // empty scalar - Ok((Event::empty_scalar(), mark)) - } else { - self.push_state(State::BlockMappingValue); - self.parse_node(true, true) - } - } - // XXX(chenyh): libyaml failed to parse spec 1.2, ex8.18 - Token(mark, TokenType::Value) => { - self.state = State::BlockMappingValue; - Ok((Event::empty_scalar(), mark)) - } - Token(mark, TokenType::BlockEnd) => { - self.pop_state(); - self.skip(); - Ok((Event::MappingEnd, mark)) - } - Token(mark, _) => Err(ScanError::new( - mark, - "while parsing a block mapping, did not find expected key", - )), - } - } - - fn block_mapping_value(&mut self) -> ParseResult { - match *self.peek_token()? { - Token(_, TokenType::Value) => { - self.skip(); - if let Token(mark, TokenType::Key | TokenType::Value | TokenType::BlockEnd) = - *self.peek_token()? - { - self.state = State::BlockMappingKey; - // empty scalar - Ok((Event::empty_scalar(), mark)) - } else { - self.push_state(State::BlockMappingKey); - self.parse_node(true, true) - } - } - Token(mark, _) => { - self.state = State::BlockMappingKey; - // empty scalar - Ok((Event::empty_scalar(), mark)) - } - } - } - - fn flow_mapping_key(&mut self, first: bool) -> ParseResult { - if first { - let _ = self.peek_token()?; - self.skip(); - } - let marker: Marker = { - match *self.peek_token()? { - Token(mark, TokenType::FlowMappingEnd) => mark, - Token(mark, _) => { - if !first { - match *self.peek_token()? { - Token(_, TokenType::FlowEntry) => self.skip(), - Token(mark, _) => return Err(ScanError::new( - mark, - "while parsing a flow mapping, did not find expected ',' or '}'", - )), - } - } - - match *self.peek_token()? { - Token(_, TokenType::Key) => { - self.skip(); - if let Token( - mark, - TokenType::Value | TokenType::FlowEntry | TokenType::FlowMappingEnd, - ) = *self.peek_token()? - { - self.state = State::FlowMappingValue; - return Ok((Event::empty_scalar(), mark)); - } - self.push_state(State::FlowMappingValue); - return self.parse_node(false, false); - } - Token(marker, TokenType::Value) => { - self.state = State::FlowMappingValue; - return Ok((Event::empty_scalar(), marker)); - } - Token(_, TokenType::FlowMappingEnd) => (), - _ => { - self.push_state(State::FlowMappingEmptyValue); - return self.parse_node(false, false); - } - } - - mark - } - } - }; - - self.pop_state(); - self.skip(); - Ok((Event::MappingEnd, marker)) - } - - fn flow_mapping_value(&mut self, empty: bool) -> ParseResult { - let mark: Marker = { - if empty { - let Token(mark, _) = *self.peek_token()?; - self.state = State::FlowMappingKey; - return Ok((Event::empty_scalar(), mark)); - } - match *self.peek_token()? { - Token(marker, TokenType::Value) => { - self.skip(); - match self.peek_token()?.1 { - TokenType::FlowEntry | TokenType::FlowMappingEnd => {} - _ => { - self.push_state(State::FlowMappingKey); - return self.parse_node(false, false); - } - } - marker - } - Token(marker, _) => marker, - } - }; - - self.state = State::FlowMappingKey; - Ok((Event::empty_scalar(), mark)) - } - - fn flow_sequence_entry(&mut self, first: bool) -> ParseResult { - // skip FlowMappingStart - if first { - let _ = self.peek_token()?; - //self.marks.push(tok.0); - self.skip(); - } - match *self.peek_token()? { - Token(mark, TokenType::FlowSequenceEnd) => { - self.pop_state(); - self.skip(); - return Ok((Event::SequenceEnd, mark)); - } - Token(_, TokenType::FlowEntry) if !first => { - self.skip(); - } - Token(mark, _) if !first => { - return Err(ScanError::new( - mark, - "while parsing a flow sequence, expected ',' or ']'", - )); - } - _ => { /* next */ } - } - match *self.peek_token()? { - Token(mark, TokenType::FlowSequenceEnd) => { - self.pop_state(); - self.skip(); - Ok((Event::SequenceEnd, mark)) - } - Token(mark, TokenType::Key) => { - self.state = State::FlowSequenceEntryMappingKey; - self.skip(); - Ok((Event::MappingStart(0, None), mark)) - } - _ => { - self.push_state(State::FlowSequenceEntry); - self.parse_node(false, false) - } - } - } - - fn indentless_sequence_entry(&mut self) -> ParseResult { - match *self.peek_token()? { - Token(_, TokenType::BlockEntry) => (), - Token(mark, _) => { - self.pop_state(); - return Ok((Event::SequenceEnd, mark)); - } - } - self.skip(); - if let Token( - mark, - TokenType::BlockEntry | TokenType::Key | TokenType::Value | TokenType::BlockEnd, - ) = *self.peek_token()? - { - self.state = State::IndentlessSequenceEntry; - Ok((Event::empty_scalar(), mark)) - } else { - self.push_state(State::IndentlessSequenceEntry); - self.parse_node(true, false) - } - } - - fn block_sequence_entry(&mut self, first: bool) -> ParseResult { - // BLOCK-SEQUENCE-START - if first { - let _ = self.peek_token()?; - //self.marks.push(tok.0); - self.skip(); - } - match *self.peek_token()? { - Token(mark, TokenType::BlockEnd) => { - self.pop_state(); - self.skip(); - Ok((Event::SequenceEnd, mark)) - } - Token(_, TokenType::BlockEntry) => { - self.skip(); - if let Token(mark, TokenType::BlockEntry | TokenType::BlockEnd) = - *self.peek_token()? - { - self.state = State::BlockSequenceEntry; - Ok((Event::empty_scalar(), mark)) - } else { - self.push_state(State::BlockSequenceEntry); - self.parse_node(true, false) - } - } - Token(mark, _) => Err(ScanError::new( - mark, - "while parsing a block collection, did not find expected '-' indicator", - )), - } - } - - fn flow_sequence_entry_mapping_key(&mut self) -> ParseResult { - if let Token(mark, TokenType::Value | TokenType::FlowEntry | TokenType::FlowSequenceEnd) = - *self.peek_token()? - { - self.skip(); - self.state = State::FlowSequenceEntryMappingValue; - Ok((Event::empty_scalar(), mark)) - } else { - self.push_state(State::FlowSequenceEntryMappingValue); - self.parse_node(false, false) - } - } - - fn flow_sequence_entry_mapping_value(&mut self) -> ParseResult { - match *self.peek_token()? { - Token(_, TokenType::Value) => { - self.skip(); - self.state = State::FlowSequenceEntryMappingValue; - if let Token(mark, TokenType::FlowEntry | TokenType::FlowSequenceEnd) = - *self.peek_token()? - { - self.state = State::FlowSequenceEntryMappingEnd; - Ok((Event::empty_scalar(), mark)) - } else { - self.push_state(State::FlowSequenceEntryMappingEnd); - self.parse_node(false, false) - } - } - Token(mark, _) => { - self.state = State::FlowSequenceEntryMappingEnd; - Ok((Event::empty_scalar(), mark)) - } - } - } - - #[allow(clippy::unnecessary_wraps)] - fn flow_sequence_entry_mapping_end(&mut self) -> ParseResult { - self.state = State::FlowSequenceEntry; - Ok((Event::MappingEnd, self.scanner.mark())) - } - - /// Resolve a tag from the handle and the suffix. - fn resolve_tag(&self, mark: Marker, handle: &str, suffix: String) -> Result { - if handle == "!!" { - // "!!" is a shorthand for "tag:yaml.org,2002:". However, that default can be - // overridden. - match self.tags.get("!!") { - Some(prefix) => Ok(Tag { - handle: prefix.to_string(), - suffix, - }), - None => Ok(Tag { - handle: "tag:yaml.org,2002:".to_string(), - suffix, - }), - } - } else if handle.is_empty() && suffix == "!" { - // "!" introduces a local tag. Local tags may have their prefix overridden. - match self.tags.get("") { - Some(prefix) => Ok(Tag { - handle: prefix.to_string(), - suffix, - }), - None => Ok(Tag { - handle: String::new(), - suffix, - }), - } - } else { - // Lookup handle in our tag directives. - let prefix = self.tags.get(handle); - if let Some(prefix) = prefix { - Ok(Tag { - handle: prefix.to_string(), - suffix, - }) - } else { - // Otherwise, it may be a local handle. With a local handle, the handle is set to - // "!" and the suffix to whatever follows it ("!foo" -> ("!", "foo")). - // If the handle is of the form "!foo!", this cannot be a local handle and we need - // to error. - if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') { - Err(ScanError::new(mark, "the handle wasn't declared")) - } else { - Ok(Tag { - handle: handle.to_string(), - suffix, - }) - } - } - } - } -} - -#[cfg(test)] -mod test { - use super::{Event, Parser}; - use crate::YamlLoader; - - #[test] - fn test_peek_eq_parse() { - let s = " -a0 bb: val -a1: &x - b1: 4 - b2: d -a2: 4 -a3: [1, 2, 3] -a4: - - [a1, a2] - - 2 -a5: *x -"; - let mut p = Parser::new_from_str(s); - while { - let event_peek = p.peek().unwrap().clone(); - let event = p.next_token().unwrap(); - assert_eq!(event, event_peek); - event.0 != Event::StreamEnd - } {} - } - - #[test] - fn test_keep_tags_across_multiple_documents() { - let text = r#" -%YAML 1.1 -%TAG !t! tag:test,2024: ---- !t!1 &1 -foo: "bar" ---- !t!2 &2 -baz: "qux" -"#; - let mut parser = Parser::new_from_str(text).keep_tags(true); - let result = YamlLoader::load_from_parser(&mut parser); - assert!(result.is_ok()); - let docs = result.unwrap(); - assert_eq!(docs.len(), 2); - let yaml = &docs[0]; - assert_eq!(yaml["foo"].as_str(), Some("bar")); - let yaml = &docs[1]; - assert_eq!(yaml["baz"].as_str(), Some("qux")); - - let mut parser = Parser::new_from_str(text).keep_tags(false); - let result = YamlLoader::load_from_parser(&mut parser); - assert!(result.is_err()); - } -} diff --git a/saphyr/src/scanner.rs b/saphyr/src/scanner.rs deleted file mode 100644 index dece35b..0000000 --- a/saphyr/src/scanner.rs +++ /dev/null @@ -1,2593 +0,0 @@ -//! Home to the YAML Scanner. -//! -//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a -//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`] -//! to check for more context and validity. -//! -//! Due to the grammar of YAML, the scanner has to have some context and is not error-free. - -#![allow(clippy::cast_possible_wrap)] -#![allow(clippy::cast_sign_loss)] - -use std::{char, collections::VecDeque, error::Error, fmt}; - -use arraydeque::ArrayDeque; - -use crate::char_traits::{ - as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, - is_flow, is_hex, is_tag_char, is_uri_char, is_z, -}; - -/// The encoding of the input. Currently, only UTF-8 is supported. -#[derive(Clone, Copy, PartialEq, Debug, Eq)] -pub enum TEncoding { - /// UTF-8 encoding. - Utf8, -} - -/// The style as which the scalar was written in the YAML document. -#[derive(Clone, Copy, PartialEq, Debug, Eq)] -pub enum TScalarStyle { - /// A YAML plain scalar. - Plain, - /// A YAML single quoted scalar. - SingleQuoted, - /// A YAML double quoted scalar. - DoubleQuoted, - - /// A YAML literal block (`|` block). - Literal, - /// A YAML folded block (`>` block). - Folded, -} - -/// A location in a yaml document. -#[derive(Clone, Copy, PartialEq, Debug, Eq)] -pub struct Marker { - /// The index (in chars) in the input string. - index: usize, - /// The line (1-indexed). - line: usize, - /// The column (1-indexed). - col: usize, -} - -impl Marker { - fn new(index: usize, line: usize, col: usize) -> Marker { - Marker { index, line, col } - } - - /// Return the index (in bytes) of the marker in the source. - #[must_use] - pub fn index(&self) -> usize { - self.index - } - - /// Return the line of the marker in the source. - #[must_use] - pub fn line(&self) -> usize { - self.line - } - - /// Return the column of the marker in the source. - #[must_use] - pub fn col(&self) -> usize { - self.col - } -} - -/// An error that occurred while scanning. -#[derive(Clone, PartialEq, Debug, Eq)] -pub struct ScanError { - /// The position at which the error happened in the source. - mark: Marker, - /// Human-readable details about the error. - info: String, -} - -impl ScanError { - /// Create a new error from a location and an error string. - #[must_use] - pub fn new(loc: Marker, info: &str) -> ScanError { - ScanError { - mark: loc, - info: info.to_owned(), - } - } - - /// Return the marker pointing to the error in the source. - #[must_use] - pub fn marker(&self) -> &Marker { - &self.mark - } - - /// Return the information string describing the error that happened. - #[must_use] - pub fn info(&self) -> &str { - self.info.as_ref() - } -} - -impl Error for ScanError { - fn description(&self) -> &str { - self.info.as_ref() - } - - fn cause(&self) -> Option<&dyn Error> { - None - } -} - -impl fmt::Display for ScanError { - // col starts from 0 - fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - write!( - formatter, - "{} at byte {} line {} column {}", - self.info, - self.mark.index, - self.mark.line, - self.mark.col + 1, - ) - } -} - -/// The contents of a scanner token. -#[derive(Clone, PartialEq, Debug, Eq)] -pub enum TokenType { - /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`]. - StreamStart(TEncoding), - /// The end of the stream, EOF. - StreamEnd, - /// A YAML version directive. - VersionDirective( - /// Major - u32, - /// Minor - u32, - ), - /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...). - TagDirective( - /// Handle - String, - /// Prefix - String, - ), - /// The start of a YAML document (`---`). - DocumentStart, - /// The end of a YAML document (`...`). - DocumentEnd, - /// The start of a sequence block. - /// - /// Sequence blocks are arrays starting with a `-`. - BlockSequenceStart, - /// The start of a sequence mapping. - /// - /// Sequence mappings are "dictionaries" with "key: value" entries. - BlockMappingStart, - /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`. - BlockEnd, - /// Start of an inline array (`[ a, b ]`). - FlowSequenceStart, - /// End of an inline array. - FlowSequenceEnd, - /// Start of an inline mapping (`{ a: b, c: d }`). - FlowMappingStart, - /// End of an inline mapping. - FlowMappingEnd, - /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]). - BlockEntry, - /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]). - FlowEntry, - /// A key in a mapping. - Key, - /// A value in a mapping. - Value, - /// A reference to an anchor. - Alias(String), - /// A YAML anchor (`&`/`*`). - Anchor(String), - /// A YAML tag (starting with bangs `!`). - Tag( - /// The handle of the tag. - String, - /// The suffix of the tag. - String, - ), - /// A regular YAML scalar. - Scalar(TScalarStyle, String), -} - -/// A scanner token. -#[derive(Clone, PartialEq, Debug, Eq)] -pub struct Token(pub Marker, pub TokenType); - -/// A scalar that was parsed and may correspond to a simple key. -/// -/// Upon scanning the following yaml: -/// ```yaml -/// a: b -/// ``` -/// We do not know that `a` is a key for a map until we have reached the following `:`. For this -/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be -/// kept inside the scanner until more context is fetched and we are able to know whether it is a -/// plain scalar or a key. -/// -/// For example, see the following 2 yaml documents: -/// ```yaml -/// --- -/// a: b # Here, `a` is a key. -/// ... -/// --- -/// a # Here, `a` is a plain scalar. -/// ... -/// ``` -/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs. -/// -/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with -/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not -/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]). -/// -/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our -/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending -/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the -/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the -/// [`TokenType::Scalar`] token. -/// -/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no -/// [`TokenType::Key`] would be emitted by the scanner. -#[derive(Clone, PartialEq, Debug, Eq)] -struct SimpleKey { - /// Whether the token this [`SimpleKey`] refers to may still be a key. - /// - /// Sometimes, when we have more context, we notice that what we thought could be a key no - /// longer can be. In that case, [`Self::possible`] is set to `false`. - /// - /// For instance, let us consider the following invalid YAML: - /// ```yaml - /// key - /// : value - /// ``` - /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled - /// and [`Self::possible`] set to `false`. - possible: bool, - /// Whether the token this [`SimpleKey`] refers to is required to be a key. - /// - /// With more context, we may know for sure that the token must be a key. If the YAML is - /// invalid, it may happen that the token be deemed not a key. In such event, an error has to - /// be raised. This boolean helps us know when to raise such error. - /// - /// TODO(ethiraric, 30/12/2023): Example of when this happens. - required: bool, - /// The index of the token referred to by the [`SimpleKey`]. - /// - /// This is the index in the scanner, which takes into account both the tokens that have been - /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and - /// [`Scanner::tokens`] for more details. - token_number: usize, - /// The position at which the token the [`SimpleKey`] refers to is. - mark: Marker, -} - -impl SimpleKey { - /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level. - fn new(mark: Marker) -> SimpleKey { - SimpleKey { - possible: false, - required: false, - token_number: 0, - mark, - } - } -} - -/// An indentation level on the stack of indentations. -#[derive(Clone, Debug, Default)] -struct Indent { - /// The former indentation level. - indent: isize, - /// Whether, upon closing, this indents generates a `BlockEnd` token. - /// - /// There are levels of indentation which do not start a block. Examples of this would be: - /// ```yaml - /// - - /// foo # ok - /// - - /// bar # ko, bar needs to be indented further than the `-`. - /// - [ - /// baz, # ok - /// quux # ko, quux needs to be indented further than the '-'. - /// ] # ko, the closing bracket needs to be indented further than the `-`. - /// ``` - /// - /// The indentation level created by the `-` is for a single entry in the sequence. Emitting a - /// `BlockEnd` when this indentation block ends would generate one `BlockEnd` per entry in the - /// sequence, although we must have exactly one to end the sequence. - needs_block_end: bool, -} - -/// The size of the [`Scanner`] buffer. -/// -/// The buffer is statically allocated to avoid conditions for reallocations each time we -/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except: -/// - Escape sequences parsing: some escape codes are 8 characters -/// - Scanning indent in scalars: this looks ahead `indent + 2` characters -/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done -/// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher -/// than that, the code will fall back to a loop of lookaheads. -const BUFFER_LEN: usize = 16; - -/// The YAML scanner. -/// -/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they -/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate -/// some of the constructs. It has understanding of indentation and whitespace and is able to -/// generate error messages for some invalid YAML constructs. -/// -/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid -/// YAML documents. -#[derive(Debug)] -#[allow(clippy::struct_excessive_bools)] -pub struct Scanner { - /// The reader, providing with characters. - rdr: T, - /// The position of the cursor within the reader. - mark: Marker, - /// Buffer for tokens to be returned. - /// - /// This buffer can hold some temporary tokens that are not yet ready to be returned. For - /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping - /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from - /// [`Self::next`] until we have more context. - tokens: VecDeque, - /// Buffer for the next characters to consume. - buffer: ArrayDeque, - /// The last error that happened. - error: Option, - - /// Whether we have already emitted the `StreamStart` token. - stream_start_produced: bool, - /// Whether we have already emitted the `StreamEnd` token. - stream_end_produced: bool, - /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it - /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`. - adjacent_value_allowed_at: usize, - /// Whether a simple key could potentially start at the current position. - /// - /// Simple keys are the opposite of complex keys which are keys starting with `?`. - simple_key_allowed: bool, - /// A stack of potential simple keys. - /// - /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they - /// are. - simple_keys: Vec, - /// The current indentation level. - indent: isize, - /// List of all block indentation levels we are in (except the current one). - indents: Vec, - /// Level of nesting of flow sequences. - flow_level: u8, - /// The number of tokens that have been returned from the scanner. - /// - /// This excludes the tokens from [`Self::tokens`]. - tokens_parsed: usize, - /// Whether a token is ready to be taken from [`Self::tokens`]. - token_available: bool, - /// Whether all characters encountered since the last newline were whitespace. - leading_whitespace: bool, - /// Whether we started a flow mapping. - /// - /// This is used to detect implicit flow mapping starts such as: - /// ```yaml - /// [ : foo ] # { null: "foo" } - /// ``` - flow_mapping_started: bool, - /// Whether we currently are in an implicit flow mapping. - implicit_flow_mapping: bool, -} - -impl> Iterator for Scanner { - type Item = Token; - fn next(&mut self) -> Option { - if self.error.is_some() { - return None; - } - match self.next_token() { - Ok(Some(tok)) => { - debug_print!( - " \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m", - tok.1, - tok.0 - ); - Some(tok) - } - Ok(tok) => tok, - Err(e) => { - self.error = Some(e); - None - } - } - } -} - -/// A convenience alias for scanner functions that may fail without returning a value. -pub type ScanResult = Result<(), ScanError>; - -impl> Scanner { - /// Creates the YAML tokenizer. - pub fn new(rdr: T) -> Scanner { - Scanner { - rdr, - buffer: ArrayDeque::new(), - mark: Marker::new(0, 1, 0), - tokens: VecDeque::new(), - error: None, - - stream_start_produced: false, - stream_end_produced: false, - adjacent_value_allowed_at: 0, - simple_key_allowed: true, - simple_keys: Vec::new(), - indent: -1, - indents: Vec::new(), - flow_level: 0, - tokens_parsed: 0, - token_available: false, - leading_whitespace: true, - flow_mapping_started: false, - implicit_flow_mapping: false, - } - } - - /// Get a copy of the last error that was encountered, if any. - /// - /// This does not clear the error state and further calls to [`Self::get_error`] will return (a - /// clone of) the same error. - #[inline] - pub fn get_error(&self) -> Option { - self.error.clone() - } - - /// Fill `self.buffer` with at least `count` characters. - /// - /// The characters that are extracted this way are not consumed but only placed in the buffer. - #[inline] - fn lookahead(&mut self, count: usize) { - if self.buffer.len() >= count { - return; - } - for _ in 0..(count - self.buffer.len()) { - self.buffer - .push_back(self.rdr.next().unwrap_or('\0')) - .unwrap(); - } - } - - /// Consume the next character. It is assumed the next character is a blank. - #[inline] - fn skip_blank(&mut self) { - self.buffer.pop_front(); - - self.mark.index += 1; - self.mark.col += 1; - } - - /// Consume the next character. It is assumed the next character is not a blank. - #[inline] - fn skip_non_blank(&mut self) { - self.buffer.pop_front(); - - self.mark.index += 1; - self.mark.col += 1; - self.leading_whitespace = false; - } - - /// Consume the next characters. It is assumed none of the next characters are blanks. - #[inline] - fn skip_n_non_blank(&mut self, n: usize) { - self.buffer.drain(0..n); - - self.mark.index += n; - self.mark.col += n; - self.leading_whitespace = false; - } - - /// Consume the next character. It is assumed the next character is a newline. - #[inline] - fn skip_nl(&mut self) { - self.buffer.pop_front(); - - self.mark.index += 1; - self.mark.col = 0; - self.mark.line += 1; - self.leading_whitespace = true; - } - - /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none. - #[inline] - fn skip_line(&mut self) { - if self.buffer[0] == '\r' && self.buffer[1] == '\n' { - // While technically not a blank, this does not matter as `self.leading_whitespace` - // will be reset by `skip_nl`. - self.skip_blank(); - self.skip_nl(); - } else if is_break(self.buffer[0]) { - self.skip_nl(); - } - } - - /// Return the next character in the buffer. - /// - /// The character is not consumed. - #[inline] - fn ch(&self) -> char { - self.buffer[0] - } - - /// Look for the next character and return it. - /// - /// The character is not consumed. - /// Equivalent to calling [`Self::lookahead`] and [`Self::ch`]. - #[inline] - fn look_ch(&mut self) -> char { - self.lookahead(1); - self.ch() - } - - /// Read a character from the input stream, returning it directly. - /// - /// The buffer is bypassed and `self.mark` needs to be updated manually. - #[inline] - #[must_use] - fn raw_read_ch(&mut self) -> char { - self.rdr.next().unwrap_or('\0') - } - - /// Return whether the next character is `c`. - #[inline] - fn ch_is(&self, c: char) -> bool { - self.buffer[0] == c - } - - /// Return whether the [`TokenType::StreamStart`] event has been emitted. - #[inline] - pub fn stream_started(&self) -> bool { - self.stream_start_produced - } - - /// Return whether the [`TokenType::StreamEnd`] event has been emitted. - #[inline] - pub fn stream_ended(&self) -> bool { - self.stream_end_produced - } - - /// Get the current position in the input stream. - #[inline] - pub fn mark(&self) -> Marker { - self.mark - } - - // Read and consume a line break (either `\r`, `\n` or `\r\n`). - // - // A `\n` is pushed into `s`. - // - // # Panics (in debug) - // If the next characters do not correspond to a line break. - #[inline] - fn read_break(&mut self, s: &mut String) { - let c = self.buffer[0]; - let nc = self.buffer[1]; - debug_assert!(is_break(c)); - if c == '\r' && nc == '\n' { - self.skip_blank(); - } - self.skip_nl(); - - s.push('\n'); - } - - /// Check whether the next characters correspond to an end of document. - /// - /// [`Self::lookahead`] must have been called before calling this function. - fn next_is_document_end(&self) -> bool { - assert!(self.buffer.len() >= 4); - self.buffer[0] == '.' - && self.buffer[1] == '.' - && self.buffer[2] == '.' - && is_blank_or_breakz(self.buffer[3]) - } - - /// Check whether the next characters correspond to a document indicator. - /// - /// [`Self::lookahead`] must have been called before calling this function. - #[inline] - fn next_is_document_indicator(&self) -> bool { - assert!(self.buffer.len() >= 4); - self.mark.col == 0 - && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-')) - || ((self.buffer[0] == '.') && (self.buffer[1] == '.') && (self.buffer[2] == '.'))) - && is_blank_or_breakz(self.buffer[3]) - } - - /// Insert a token at the given position. - fn insert_token(&mut self, pos: usize, tok: Token) { - let old_len = self.tokens.len(); - assert!(pos <= old_len); - self.tokens.insert(pos, tok); - } - - fn allow_simple_key(&mut self) { - self.simple_key_allowed = true; - } - - fn disallow_simple_key(&mut self) { - self.simple_key_allowed = false; - } - - /// Fetch the next token in the stream. - /// # Errors - /// Returns `ScanError` when the scanner does not find the next expected token. - pub fn fetch_next_token(&mut self) -> ScanResult { - self.lookahead(1); - // eprintln!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch()); - - if !self.stream_start_produced { - self.fetch_stream_start(); - return Ok(()); - } - self.skip_to_next_token()?; - - debug_print!( - " \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m", - self.mark, - self.ch() - ); - - self.stale_simple_keys()?; - - let mark = self.mark; - self.unroll_indent(mark.col as isize); - - self.lookahead(4); - - if is_z(self.ch()) { - self.fetch_stream_end()?; - return Ok(()); - } - - // Is it a directive? - if self.mark.col == 0 && self.ch_is('%') { - return self.fetch_directive(); - } - - if self.mark.col == 0 - && self.buffer[0] == '-' - && self.buffer[1] == '-' - && self.buffer[2] == '-' - && is_blank_or_breakz(self.buffer[3]) - { - self.fetch_document_indicator(TokenType::DocumentStart)?; - return Ok(()); - } - - if self.mark.col == 0 - && self.buffer[0] == '.' - && self.buffer[1] == '.' - && self.buffer[2] == '.' - && is_blank_or_breakz(self.buffer[3]) - { - self.fetch_document_indicator(TokenType::DocumentEnd)?; - self.skip_ws_to_eol(SkipTabs::Yes)?; - if !is_breakz(self.ch()) { - return Err(ScanError::new( - self.mark, - "invalid content after document end marker", - )); - } - return Ok(()); - } - - if (self.mark.col as isize) < self.indent { - return Err(ScanError::new(self.mark, "invalid indentation")); - } - - let c = self.buffer[0]; - let nc = self.buffer[1]; - match c { - '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart), - '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart), - ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd), - '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd), - ',' => self.fetch_flow_entry(), - '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(), - '?' if is_blank_or_breakz(nc) => self.fetch_key(), - ':' if is_blank_or_breakz(nc) - || (self.flow_level > 0 - && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) => - { - self.fetch_value() - } - // Is it an alias? - '*' => self.fetch_anchor(true), - // Is it an anchor? - '&' => self.fetch_anchor(false), - '!' => self.fetch_tag(), - // Is it a literal scalar? - '|' if self.flow_level == 0 => self.fetch_block_scalar(true), - // Is it a folded scalar? - '>' if self.flow_level == 0 => self.fetch_block_scalar(false), - '\'' => self.fetch_flow_scalar(true), - '"' => self.fetch_flow_scalar(false), - // plain scalar - '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(), - ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => { - self.fetch_plain_scalar() - } - '%' | '@' | '`' => Err(ScanError::new( - self.mark, - &format!("unexpected character: `{c}'"), - )), - _ => self.fetch_plain_scalar(), - } - } - - /// Return the next token in the stream. - /// # Errors - /// Returns `ScanError` when scanning fails to find an expected next token. - pub fn next_token(&mut self) -> Result, ScanError> { - if self.stream_end_produced { - return Ok(None); - } - - if !self.token_available { - self.fetch_more_tokens()?; - } - let Some(t) = self.tokens.pop_front() else { - return Err(ScanError::new( - self.mark, - "did not find expected next token", - )); - }; - self.token_available = false; - self.tokens_parsed += 1; - - if let TokenType::StreamEnd = t.1 { - self.stream_end_produced = true; - } - Ok(Some(t)) - } - - /// Fetch tokens from the token stream. - /// # Errors - /// Returns `ScanError` when loading fails. - pub fn fetch_more_tokens(&mut self) -> ScanResult { - let mut need_more; - loop { - if self.tokens.is_empty() { - need_more = true; - } else { - need_more = false; - // Stale potential keys that we know won't be keys. - self.stale_simple_keys()?; - // If our next token to be emitted may be a key, fetch more context. - for sk in &self.simple_keys { - if sk.possible && sk.token_number == self.tokens_parsed { - need_more = true; - break; - } - } - } - - if !need_more { - break; - } - self.fetch_next_token()?; - } - self.token_available = true; - - Ok(()) - } - - /// Mark simple keys that can no longer be keys as such. - /// - /// This function sets `possible` to `false` to each key that, now we have more context, we - /// know will not be keys. - /// - /// # Errors - /// This function returns an error if one of the key we would stale was required to be a key. - fn stale_simple_keys(&mut self) -> ScanResult { - for sk in &mut self.simple_keys { - if sk.possible - // If not in a flow construct, simple keys cannot span multiple lines. - && self.flow_level == 0 - && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index) - { - if sk.required { - return Err(ScanError::new(self.mark, "simple key expect ':'")); - } - sk.possible = false; - } - } - Ok(()) - } - - /// Skip over all whitespace and comments until the next token. - /// - /// # Errors - /// This function returns an error if a tabulation is encountered where there should not be - /// one. - fn skip_to_next_token(&mut self) -> ScanResult { - loop { - // TODO(chenyh) BOM - match self.look_ch() { - // Tabs may not be used as indentation. - // "Indentation" only exists as long as a block is started, but does not exist - // inside of flow-style constructs. Tabs are allowed as part of leading - // whitespaces outside of indentation. - // If a flow-style construct is in an indented block, its contents must still be - // indented. Also, tabs are allowed anywhere in it if it has no content. - '\t' if self.is_within_block() - && self.leading_whitespace - && (self.mark.col as isize) < self.indent => - { - self.skip_ws_to_eol(SkipTabs::Yes)?; - // If we have content on that line with a tab, return an error. - if !is_breakz(self.ch()) { - return Err(ScanError::new( - self.mark, - "tabs disallowed within this context (block indentation)", - )); - } - } - '\t' | ' ' => self.skip_blank(), - '\n' | '\r' => { - self.lookahead(2); - self.skip_line(); - if self.flow_level == 0 { - self.allow_simple_key(); - } - } - '#' => { - while !is_breakz(self.look_ch()) { - self.skip_non_blank(); - } - } - _ => break, - } - } - Ok(()) - } - - /// Skip over YAML whitespace (` `, `\n`, `\r`). - /// - /// # Errors - /// This function returns an error if no whitespace was found. - fn skip_yaml_whitespace(&mut self) -> ScanResult { - let mut need_whitespace = true; - loop { - match self.look_ch() { - ' ' => { - self.skip_blank(); - - need_whitespace = false; - } - '\n' | '\r' => { - self.lookahead(2); - self.skip_line(); - if self.flow_level == 0 { - self.allow_simple_key(); - } - need_whitespace = false; - } - '#' => { - while !is_breakz(self.look_ch()) { - self.skip_non_blank(); - } - } - _ => break, - } - } - - if need_whitespace { - Err(ScanError::new(self.mark(), "expected whitespace")) - } else { - Ok(()) - } - } - - /// Skip yaml whitespace at most up to eol. Also skips comments. - fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result { - let mut encountered_tab = false; - let mut has_yaml_ws = false; - loop { - match self.look_ch() { - ' ' => { - has_yaml_ws = true; - self.skip_blank(); - } - '\t' if skip_tabs != SkipTabs::No => { - encountered_tab = true; - self.skip_blank(); - } - // YAML comments must be preceded by whitespace. - '#' if !encountered_tab && !has_yaml_ws => { - return Err(ScanError::new( - self.mark, - "comments must be separated from other tokens by whitespace", - )); - } - '#' => { - while !is_breakz(self.look_ch()) { - self.skip_non_blank(); - } - } - _ => break, - } - } - - Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)) - } - - fn fetch_stream_start(&mut self) { - let mark = self.mark; - self.indent = -1; - self.stream_start_produced = true; - self.allow_simple_key(); - self.tokens - .push_back(Token(mark, TokenType::StreamStart(TEncoding::Utf8))); - self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0))); - } - - fn fetch_stream_end(&mut self) -> ScanResult { - // force new line - if self.mark.col != 0 { - self.mark.col = 0; - self.mark.line += 1; - } - - // If the stream ended, we won't have more context. We can stall all the simple keys we - // had. If one was required, however, that was an error and we must propagate it. - for sk in &mut self.simple_keys { - if sk.required && sk.possible { - return Err(ScanError::new(self.mark, "simple key expected")); - } - sk.possible = false; - } - - self.unroll_indent(-1); - self.remove_simple_key()?; - self.disallow_simple_key(); - - self.tokens - .push_back(Token(self.mark, TokenType::StreamEnd)); - Ok(()) - } - - fn fetch_directive(&mut self) -> ScanResult { - self.unroll_indent(-1); - self.remove_simple_key()?; - - self.disallow_simple_key(); - - let tok = self.scan_directive()?; - self.skip_ws_to_eol(SkipTabs::Yes)?; - - self.tokens.push_back(tok); - - Ok(()) - } - - fn scan_directive(&mut self) -> Result { - let start_mark = self.mark; - self.skip_non_blank(); - - let name = self.scan_directive_name()?; - let tok = match name.as_ref() { - "YAML" => self.scan_version_directive_value(&start_mark)?, - "TAG" => self.scan_tag_directive_value(&start_mark)?, - // XXX This should be a warning instead of an error - _ => { - // skip current line - while !is_breakz(self.look_ch()) { - self.skip_non_blank(); - } - // XXX return an empty TagDirective token - Token( - start_mark, - TokenType::TagDirective(String::new(), String::new()), - ) - // return Err(ScanError::new(start_mark, - // "while scanning a directive, found unknown directive name")) - } - }; - - self.skip_ws_to_eol(SkipTabs::Yes)?; - - if !is_breakz(self.ch()) { - return Err(ScanError::new( - start_mark, - "while scanning a directive, did not find expected comment or line break", - )); - } - - // Eat a line break - if is_break(self.ch()) { - self.lookahead(2); - self.skip_line(); - } - - Ok(tok) - } - - fn scan_version_directive_value(&mut self, mark: &Marker) -> Result { - while is_blank(self.look_ch()) { - self.skip_blank(); - } - - let major = self.scan_version_directive_number(mark)?; - - if self.ch() != '.' { - return Err(ScanError::new( - *mark, - "while scanning a YAML directive, did not find expected digit or '.' character", - )); - } - self.skip_non_blank(); - - let minor = self.scan_version_directive_number(mark)?; - - Ok(Token(*mark, TokenType::VersionDirective(major, minor))) - } - - fn scan_directive_name(&mut self) -> Result { - let start_mark = self.mark; - let mut string = String::new(); - while is_alpha(self.look_ch()) { - string.push(self.ch()); - self.skip_non_blank(); - } - - if string.is_empty() { - return Err(ScanError::new( - start_mark, - "while scanning a directive, could not find expected directive name", - )); - } - - if !is_blank_or_breakz(self.ch()) { - return Err(ScanError::new( - start_mark, - "while scanning a directive, found unexpected non-alphabetical character", - )); - } - - Ok(string) - } - - fn scan_version_directive_number(&mut self, mark: &Marker) -> Result { - let mut val = 0u32; - let mut length = 0usize; - while let Some(digit) = self.look_ch().to_digit(10) { - if length + 1 > 9 { - return Err(ScanError::new( - *mark, - "while scanning a YAML directive, found extremely long version number", - )); - } - length += 1; - val = val * 10 + digit; - self.skip_non_blank(); - } - - if length == 0 { - return Err(ScanError::new( - *mark, - "while scanning a YAML directive, did not find expected version number", - )); - } - - Ok(val) - } - - fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result { - /* Eat whitespaces. */ - while is_blank(self.look_ch()) { - self.skip_blank(); - } - let handle = self.scan_tag_handle(true, mark)?; - - /* Eat whitespaces. */ - while is_blank(self.look_ch()) { - self.skip_blank(); - } - - let prefix = self.scan_tag_prefix(mark)?; - - self.lookahead(1); - - if is_blank_or_breakz(self.ch()) { - Ok(Token(*mark, TokenType::TagDirective(handle, prefix))) - } else { - Err(ScanError::new( - *mark, - "while scanning TAG, did not find expected whitespace or line break", - )) - } - } - - fn fetch_tag(&mut self) -> ScanResult { - self.save_simple_key(); - self.disallow_simple_key(); - - let tok = self.scan_tag()?; - self.tokens.push_back(tok); - Ok(()) - } - - fn scan_tag(&mut self) -> Result { - let start_mark = self.mark; - let mut handle = String::new(); - let mut suffix; - - // Check if the tag is in the canonical form (verbatim). - self.lookahead(2); - - if self.buffer[1] == '<' { - suffix = self.scan_verbatim_tag(&start_mark)?; - } else { - // The tag has either the '!suffix' or the '!handle!suffix' - handle = self.scan_tag_handle(false, &start_mark)?; - // Check if it is, indeed, handle. - if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') { - // A tag handle starting with "!!" is a secondary tag handle. - let is_secondary_handle = handle == "!!"; - suffix = - self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?; - } else { - suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?; - handle = "!".to_owned(); - // A special case: the '!' tag. Set the handle to '' and the - // suffix to '!'. - if suffix.is_empty() { - handle.clear(); - suffix = "!".to_owned(); - } - } - } - - if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) { - // XXX: ex 7.2, an empty scalar can follow a secondary tag - Ok(Token(start_mark, TokenType::Tag(handle, suffix))) - } else { - Err(ScanError::new( - start_mark, - "while scanning a tag, did not find expected whitespace or line break", - )) - } - } - - fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result { - let mut string = String::new(); - if self.look_ch() != '!' { - return Err(ScanError::new( - *mark, - "while scanning a tag, did not find expected '!'", - )); - } - - string.push(self.ch()); - self.skip_non_blank(); - - while is_alpha(self.look_ch()) { - string.push(self.ch()); - self.skip_non_blank(); - } - - // Check if the trailing character is '!' and copy it. - if self.ch() == '!' { - string.push(self.ch()); - self.skip_non_blank(); - } else if directive && string != "!" { - // It's either the '!' tag or not really a tag handle. If it's a %TAG - // directive, it's an error. If it's a tag token, it must be a part of - // URI. - return Err(ScanError::new( - *mark, - "while parsing a tag directive, did not find expected '!'", - )); - } - Ok(string) - } - - /// Scan for a tag prefix (6.8.2.2). - /// - /// There are 2 kinds of tag prefixes: - /// - Local: Starts with a `!`, contains only URI chars (`!foo`) - /// - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`) - fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result { - let mut string = String::new(); - - if self.look_ch() == '!' { - // If we have a local tag, insert and skip `!`. - string.push(self.ch()); - self.skip_non_blank(); - } else if !is_tag_char(self.ch()) { - // Otherwise, check if the first global tag character is valid. - return Err(ScanError::new(*start_mark, "invalid global tag character")); - } else if self.ch() == '%' { - // If it is valid and an escape sequence, escape it. - string.push(self.scan_uri_escapes(start_mark)?); - } else { - // Otherwise, push the first character. - string.push(self.ch()); - self.skip_non_blank(); - } - - while is_uri_char(self.look_ch()) { - if self.ch() == '%' { - string.push(self.scan_uri_escapes(start_mark)?); - } else { - string.push(self.ch()); - self.skip_non_blank(); - } - } - - Ok(string) - } - - /// Scan for a verbatim tag. - /// - /// The prefixing `!<` must _not_ have been skipped. - fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result { - // Eat `!<` - self.skip_non_blank(); - self.skip_non_blank(); - - let mut string = String::new(); - while is_uri_char(self.look_ch()) { - if self.ch() == '%' { - string.push(self.scan_uri_escapes(start_mark)?); - } else { - string.push(self.ch()); - self.skip_non_blank(); - } - } - - if self.ch() != '>' { - return Err(ScanError::new( - *start_mark, - "while scanning a verbatim tag, did not find the expected '>'", - )); - } - self.skip_non_blank(); - - Ok(string) - } - - fn scan_tag_shorthand_suffix( - &mut self, - _directive: bool, - _is_secondary: bool, - head: &str, - mark: &Marker, - ) -> Result { - let mut length = head.len(); - let mut string = String::new(); - - // Copy the head if needed. - // Note that we don't copy the leading '!' character. - if length > 1 { - string.extend(head.chars().skip(1)); - } - - while is_tag_char(self.look_ch()) { - // Check if it is a URI-escape sequence. - if self.ch() == '%' { - string.push(self.scan_uri_escapes(mark)?); - } else { - string.push(self.ch()); - self.skip_non_blank(); - } - - length += 1; - } - - if length == 0 { - return Err(ScanError::new( - *mark, - "while parsing a tag, did not find expected tag URI", - )); - } - - Ok(string) - } - - fn scan_uri_escapes(&mut self, mark: &Marker) -> Result { - let mut width = 0usize; - let mut code = 0u32; - loop { - self.lookahead(3); - - if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) { - return Err(ScanError::new( - *mark, - "while parsing a tag, did not find URI escaped octet", - )); - } - - let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]); - if width == 0 { - width = match octet { - _ if octet & 0x80 == 0x00 => 1, - _ if octet & 0xE0 == 0xC0 => 2, - _ if octet & 0xF0 == 0xE0 => 3, - _ if octet & 0xF8 == 0xF0 => 4, - _ => { - return Err(ScanError::new( - *mark, - "while parsing a tag, found an incorrect leading UTF-8 octet", - )); - } - }; - code = octet; - } else { - if octet & 0xc0 != 0x80 { - return Err(ScanError::new( - *mark, - "while parsing a tag, found an incorrect trailing UTF-8 octet", - )); - } - code = (code << 8) + octet; - } - - self.skip_n_non_blank(3); - - width -= 1; - if width == 0 { - break; - } - } - - match char::from_u32(code) { - Some(ch) => Ok(ch), - None => Err(ScanError::new( - *mark, - "while parsing a tag, found an invalid UTF-8 codepoint", - )), - } - } - - fn fetch_anchor(&mut self, alias: bool) -> ScanResult { - self.save_simple_key(); - self.disallow_simple_key(); - - let tok = self.scan_anchor(alias)?; - - self.tokens.push_back(tok); - - Ok(()) - } - - fn scan_anchor(&mut self, alias: bool) -> Result { - let mut string = String::new(); - let start_mark = self.mark; - - self.skip_non_blank(); - while is_anchor_char(self.look_ch()) { - string.push(self.ch()); - self.skip_non_blank(); - } - - if string.is_empty() { - return Err(ScanError::new(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character")); - } - - if alias { - Ok(Token(start_mark, TokenType::Alias(string))) - } else { - Ok(Token(start_mark, TokenType::Anchor(string))) - } - } - - fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult { - // The indicators '[' and '{' may start a simple key. - self.save_simple_key(); - - self.roll_one_col_indent(); - self.increase_flow_level()?; - - self.allow_simple_key(); - - let start_mark = self.mark; - self.skip_non_blank(); - - if tok == TokenType::FlowMappingStart { - self.flow_mapping_started = true; - } - - self.skip_ws_to_eol(SkipTabs::Yes)?; - - self.tokens.push_back(Token(start_mark, tok)); - Ok(()) - } - - fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult { - self.remove_simple_key()?; - self.decrease_flow_level(); - - self.disallow_simple_key(); - - self.end_implicit_mapping(self.mark); - - let start_mark = self.mark; - self.skip_non_blank(); - self.skip_ws_to_eol(SkipTabs::Yes)?; - - // A flow collection within a flow mapping can be a key. In that case, the value may be - // adjacent to the `:`. - // ```yaml - // - [ {a: b}:value ] - // ``` - if self.flow_level > 0 { - self.adjacent_value_allowed_at = self.mark.index; - } - - self.tokens.push_back(Token(start_mark, tok)); - Ok(()) - } - - /// Push the `FlowEntry` token and skip over the `,`. - fn fetch_flow_entry(&mut self) -> ScanResult { - self.remove_simple_key()?; - self.allow_simple_key(); - - self.end_implicit_mapping(self.mark); - - let start_mark = self.mark; - self.skip_non_blank(); - self.skip_ws_to_eol(SkipTabs::Yes)?; - - self.tokens - .push_back(Token(start_mark, TokenType::FlowEntry)); - Ok(()) - } - - fn increase_flow_level(&mut self) -> ScanResult { - self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0))); - self.flow_level = self - .flow_level - .checked_add(1) - .ok_or_else(|| ScanError::new(self.mark, "recursion limit exceeded"))?; - Ok(()) - } - - fn decrease_flow_level(&mut self) { - if self.flow_level > 0 { - self.flow_level -= 1; - self.simple_keys.pop().unwrap(); - } - } - - /// Push the `Block*` token(s) and skip over the `-`. - /// - /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a - /// `BlockEntry` token. - /// This function only skips over the `-` and does not fetch the entry value. - fn fetch_block_entry(&mut self) -> ScanResult { - if self.flow_level > 0 { - // - * only allowed in block - return Err(ScanError::new( - self.mark, - r#""-" is only valid inside a block"#, - )); - } - // Check if we are allowed to start a new entry. - if !self.simple_key_allowed { - return Err(ScanError::new( - self.mark, - "block sequence entries are not allowed in this context", - )); - } - - // ???, fixes test G9HC. - if let Some(Token(mark, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() { - if self.mark.col == 0 && mark.col == 0 && self.indent > -1 { - return Err(ScanError::new(*mark, "invalid indentation for anchor")); - } - } - - // Skip over the `-`. - let mark = self.mark; - self.skip_non_blank(); - - // generate BLOCK-SEQUENCE-START if indented - self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark); - let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs(); - self.lookahead(2); - if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) { - return Err(ScanError::new( - self.mark, - "'-' must be followed by a valid YAML whitespace", - )); - } - - self.skip_ws_to_eol(SkipTabs::No)?; - if is_break(self.look_ch()) || is_flow(self.ch()) { - self.roll_one_col_indent(); - } - - self.remove_simple_key()?; - self.allow_simple_key(); - - self.tokens - .push_back(Token(self.mark, TokenType::BlockEntry)); - - Ok(()) - } - - fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult { - self.unroll_indent(-1); - self.remove_simple_key()?; - self.disallow_simple_key(); - - let mark = self.mark; - - self.skip_n_non_blank(3); - - self.tokens.push_back(Token(mark, t)); - Ok(()) - } - - fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult { - self.save_simple_key(); - self.allow_simple_key(); - let tok = self.scan_block_scalar(literal)?; - - self.tokens.push_back(tok); - Ok(()) - } - - #[allow(clippy::too_many_lines)] - fn scan_block_scalar(&mut self, literal: bool) -> Result { - let start_mark = self.mark; - let mut chomping = Chomping::Clip; - let mut increment: usize = 0; - let mut indent: usize = 0; - let mut trailing_blank: bool; - let mut leading_blank: bool = false; - let style = if literal { - TScalarStyle::Literal - } else { - TScalarStyle::Folded - }; - - let mut string = String::new(); - let mut leading_break = String::new(); - let mut trailing_breaks = String::new(); - let mut chomping_break = String::new(); - - // skip '|' or '>' - self.skip_non_blank(); - self.unroll_non_block_indents(); - - if self.look_ch() == '+' || self.ch() == '-' { - if self.ch() == '+' { - chomping = Chomping::Keep; - } else { - chomping = Chomping::Strip; - } - self.skip_non_blank(); - if is_digit(self.look_ch()) { - if self.ch() == '0' { - return Err(ScanError::new( - start_mark, - "while scanning a block scalar, found an indentation indicator equal to 0", - )); - } - increment = (self.ch() as usize) - ('0' as usize); - self.skip_non_blank(); - } - } else if is_digit(self.ch()) { - if self.ch() == '0' { - return Err(ScanError::new( - start_mark, - "while scanning a block scalar, found an indentation indicator equal to 0", - )); - } - - increment = (self.ch() as usize) - ('0' as usize); - self.skip_non_blank(); - self.lookahead(1); - if self.ch() == '+' || self.ch() == '-' { - if self.ch() == '+' { - chomping = Chomping::Keep; - } else { - chomping = Chomping::Strip; - } - self.skip_non_blank(); - } - } - - self.skip_ws_to_eol(SkipTabs::Yes)?; - - // Check if we are at the end of the line. - if !is_breakz(self.look_ch()) { - return Err(ScanError::new( - start_mark, - "while scanning a block scalar, did not find expected comment or line break", - )); - } - - if is_break(self.ch()) { - self.lookahead(2); - self.read_break(&mut chomping_break); - } - - if self.look_ch() == '\t' { - return Err(ScanError::new( - start_mark, - "a block scalar content cannot start with a tab", - )); - } - - if increment > 0 { - indent = if self.indent >= 0 { - (self.indent + increment as isize) as usize - } else { - increment - } - } - - // Scan the leading line breaks and determine the indentation level if needed. - if indent == 0 { - self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks); - } else { - self.skip_block_scalar_indent(indent, &mut trailing_breaks); - } - - // We have an end-of-stream with no content, e.g.: - // ```yaml - // - |+ - // ``` - if is_z(self.ch()) { - let contents = match chomping { - // We strip trailing linebreaks. Nothing remain. - Chomping::Strip => String::new(), - // There was no newline after the chomping indicator. - _ if self.mark.line == start_mark.line() => String::new(), - // We clip lines, and there was a newline after the chomping indicator. - // All other breaks are ignored. - Chomping::Clip => chomping_break, - // We keep lines. There was a newline after the chomping indicator but nothing - // else. - Chomping::Keep if trailing_breaks.is_empty() => chomping_break, - // Otherwise, the newline after chomping is ignored. - Chomping::Keep => trailing_breaks, - }; - return Ok(Token(start_mark, TokenType::Scalar(style, contents))); - } - - if self.mark.col < indent && (self.mark.col as isize) > self.indent { - return Err(ScanError::new( - self.mark, - "wrongly indented line in block scalar", - )); - } - - let mut line_buffer = String::with_capacity(100); - let start_mark = self.mark; - while self.mark.col == indent && !is_z(self.ch()) { - if indent == 0 { - self.lookahead(4); - if self.next_is_document_end() { - break; - } - } - - // We are at the first content character of a content line. - trailing_blank = is_blank(self.ch()); - if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank { - string.push_str(&trailing_breaks); - if trailing_breaks.is_empty() { - string.push(' '); - } - } else { - string.push_str(&leading_break); - string.push_str(&trailing_breaks); - } - - leading_break.clear(); - trailing_breaks.clear(); - - leading_blank = is_blank(self.ch()); - - self.scan_block_scalar_content_line(&mut string, &mut line_buffer); - - // break on EOF - if is_z(self.ch()) { - break; - } - - self.lookahead(2); - self.read_break(&mut leading_break); - - // Eat the following indentation spaces and line breaks. - self.skip_block_scalar_indent(indent, &mut trailing_breaks); - } - - // Chomp the tail. - if chomping != Chomping::Strip { - string.push_str(&leading_break); - // If we had reached an eof but the last character wasn't an end-of-line, check if the - // last line was indented at least as the rest of the scalar, then we need to consider - // there is a newline. - if is_z(self.ch()) && self.mark.col >= indent.max(1) { - string.push('\n'); - } - } - - if chomping == Chomping::Keep { - string.push_str(&trailing_breaks); - } - - Ok(Token(start_mark, TokenType::Scalar(style, string))) - } - - /// Retrieve the contents of the line, parsing it as a block scalar. - /// - /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to - /// store bytes before pushing them to `string` and thus avoiding reallocating more than - /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be - /// `clear`ed before the end of the function. - /// - /// This function assumed the first character to read is the first content character in the - /// line. This function does not consume the line break character(s) after the line. - fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) { - // Start by evaluating characters in the buffer. - while !self.buffer.is_empty() && !is_breakz(self.ch()) { - string.push(self.ch()); - // We may technically skip non-blank characters. However, the only distinction is - // to determine what is leading whitespace and what is not. Here, we read the - // contents of the line until either eof or a linebreak. We know we will not read - // `self.leading_whitespace` until the end of the line, where it will be reset. - // This allows us to call a slightly less expensive function. - self.skip_blank(); - } - - // All characters that were in the buffer were consumed. We need to check if more - // follow. - if self.buffer.is_empty() { - // We will read all consecutive non-breakz characters. We push them into a - // temporary buffer. The main difference with going through `self.buffer` is that - // characters are appended here as their real size (1B for ascii, or up to 4 bytes for - // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string` - // (using `String::push_str`). - let mut c = self.raw_read_ch(); - while !is_breakz(c) { - line_buffer.push(c); - c = self.raw_read_ch(); - } - - // Our last character read is stored in `c`. It is either an EOF or a break. In any - // case, we need to push it back into `self.buffer` so it may be properly read - // after. We must not insert it in `string`. - self.buffer.push_back(c).unwrap(); - - // We need to manually update our position; we haven't called a `skip` function. - self.mark.col += line_buffer.len(); - self.mark.index += line_buffer.len(); - - // We can now append our bytes to our `string`. - string.reserve(line_buffer.as_bytes().len()); - string.push_str(line_buffer); - // This clears the _contents_ without touching the _capacity_. - line_buffer.clear(); - } - } - - /// Skip the block scalar indentation and empty lines. - fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) { - loop { - // Consume all spaces. Tabs cannot be used as indentation. - if indent < BUFFER_LEN - 2 { - self.lookahead(BUFFER_LEN); - while self.mark.col < indent && self.ch() == ' ' { - self.skip_blank(); - } - } else { - loop { - self.lookahead(BUFFER_LEN); - while !self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ' { - self.skip_blank(); - } - if !(!self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ') { - break; - } - } - self.lookahead(2); - } - - // If our current line is empty, skip over the break and continue looping. - if is_break(self.ch()) { - self.read_break(breaks); - } else { - // Otherwise, we have a content line. Return control. - break; - } - } - } - - /// Determine the indentation level for a block scalar from the first line of its contents. - /// - /// The function skips over whitespace-only lines and sets `indent` to the the longest - /// whitespace line that was encountered. - fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) { - let mut max_indent = 0; - loop { - // Consume all spaces. Tabs cannot be used as indentation. - while self.look_ch() == ' ' { - self.skip_blank(); - } - - if self.mark.col > max_indent { - max_indent = self.mark.col; - } - - if is_break(self.ch()) { - // If our current line is empty, skip over the break and continue looping. - self.lookahead(2); - self.read_break(breaks); - } else { - // Otherwise, we have a content line. Return control. - break; - } - } - - // In case a yaml looks like: - // ```yaml - // | - // foo - // bar - // ``` - // We need to set the indent to 0 and not 1. In all other cases, the indent must be at - // least 1. When in the above example, `self.indent` will be set to -1. - *indent = max_indent.max((self.indent + 1) as usize); - if self.indent > 0 { - *indent = (*indent).max(1); - } - } - - fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult { - self.save_simple_key(); - self.disallow_simple_key(); - - let tok = self.scan_flow_scalar(single)?; - - // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like, - // YAML allows the following value to be specified adjacent to the “:”. - self.skip_to_next_token()?; - self.adjacent_value_allowed_at = self.mark.index; - - self.tokens.push_back(tok); - Ok(()) - } - - #[allow(clippy::too_many_lines)] - fn scan_flow_scalar(&mut self, single: bool) -> Result { - let start_mark = self.mark; - - let mut string = String::new(); - let mut leading_break = String::new(); - let mut trailing_breaks = String::new(); - let mut whitespaces = String::new(); - let mut leading_blanks; - - /* Eat the left quote. */ - self.skip_non_blank(); - - loop { - /* Check for a document indicator. */ - self.lookahead(4); - - if self.mark.col == 0 - && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-')) - || ((self.buffer[0] == '.') - && (self.buffer[1] == '.') - && (self.buffer[2] == '.'))) - && is_blank_or_breakz(self.buffer[3]) - { - return Err(ScanError::new( - start_mark, - "while scanning a quoted scalar, found unexpected document indicator", - )); - } - - if is_z(self.ch()) { - return Err(ScanError::new( - start_mark, - "while scanning a quoted scalar, found unexpected end of stream", - )); - } - - if (self.mark.col as isize) < self.indent { - return Err(ScanError::new( - start_mark, - "invalid indentation in quoted scalar", - )); - } - - leading_blanks = false; - self.consume_flow_scalar_non_whitespace_chars( - single, - &mut string, - &mut leading_blanks, - &start_mark, - )?; - - match self.look_ch() { - '\'' if single => break, - '"' if !single => break, - _ => {} - } - - // Consume blank characters. - while is_blank(self.ch()) || is_break(self.ch()) { - if is_blank(self.ch()) { - // Consume a space or a tab character. - if leading_blanks { - if self.ch() == '\t' && (self.mark.col as isize) < self.indent { - return Err(ScanError::new( - self.mark, - "tab cannot be used as indentation", - )); - } - self.skip_blank(); - } else { - whitespaces.push(self.ch()); - self.skip_blank(); - } - } else { - self.lookahead(2); - // Check if it is a first line break. - if leading_blanks { - self.read_break(&mut trailing_breaks); - } else { - whitespaces.clear(); - self.read_break(&mut leading_break); - leading_blanks = true; - } - } - self.lookahead(1); - } - - // Join the whitespaces or fold line breaks. - if leading_blanks { - if leading_break.is_empty() { - string.push_str(&leading_break); - string.push_str(&trailing_breaks); - trailing_breaks.clear(); - leading_break.clear(); - } else { - if trailing_breaks.is_empty() { - string.push(' '); - } else { - string.push_str(&trailing_breaks); - trailing_breaks.clear(); - } - leading_break.clear(); - } - } else { - string.push_str(&whitespaces); - whitespaces.clear(); - } - } // loop - - // Eat the right quote. - self.skip_non_blank(); - // Ensure there is no invalid trailing content. - self.skip_ws_to_eol(SkipTabs::Yes)?; - match self.ch() { - // These can be encountered in flow sequences or mappings. - ',' | '}' | ']' if self.flow_level > 0 => {} - // An end-of-line / end-of-stream is fine. No trailing content. - c if is_breakz(c) => {} - // ':' can be encountered if our scalar is a key. - // Outside of flow contexts, keys cannot span multiple lines - ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {} - // Inside a flow context, this is allowed. - ':' if self.flow_level > 0 => {} - _ => { - return Err(ScanError::new( - self.mark, - "invalid trailing content after double-quoted scalar", - )); - } - } - - let style = if single { - TScalarStyle::SingleQuoted - } else { - TScalarStyle::DoubleQuoted - }; - Ok(Token(start_mark, TokenType::Scalar(style, string))) - } - - /// Consume successive non-whitespace characters from a flow scalar. - /// - /// This function resolves escape sequences and stops upon encountering a whitespace, the end - /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"` - /// for double quoted scalars). - /// - /// # Errors - /// Return an error if an invalid escape sequence is found. - fn consume_flow_scalar_non_whitespace_chars( - &mut self, - single: bool, - string: &mut String, - leading_blanks: &mut bool, - start_mark: &Marker, - ) -> Result<(), ScanError> { - self.lookahead(2); - while !is_blank_or_breakz(self.ch()) { - match self.ch() { - // Check for an escaped single quote. - '\'' if self.buffer[1] == '\'' && single => { - string.push('\''); - self.skip_n_non_blank(2); - } - // Check for the right quote. - '\'' if single => break, - '"' if !single => break, - // Check for an escaped line break. - '\\' if !single && is_break(self.buffer[1]) => { - self.lookahead(3); - self.skip_non_blank(); - self.skip_line(); - *leading_blanks = true; - break; - } - // Check for an escape sequence. - '\\' if !single => { - string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?); - } - c => { - string.push(c); - self.skip_non_blank(); - } - } - self.lookahead(2); - } - Ok(()) - } - - /// Escape the sequence we encounter in a flow scalar. - /// - /// `self.ch()` must point to the `\` starting the escape sequence. - /// - /// # Errors - /// Return an error if an invalid escape sequence is found. - fn resolve_flow_scalar_escape_sequence( - &mut self, - start_mark: &Marker, - ) -> Result { - let mut code_length = 0usize; - let mut ret = '\0'; - - match self.buffer[1] { - '0' => ret = '\0', - 'a' => ret = '\x07', - 'b' => ret = '\x08', - 't' | '\t' => ret = '\t', - 'n' => ret = '\n', - 'v' => ret = '\x0b', - 'f' => ret = '\x0c', - 'r' => ret = '\x0d', - 'e' => ret = '\x1b', - ' ' => ret = '\x20', - '"' => ret = '"', - '/' => ret = '/', - '\\' => ret = '\\', - // Unicode next line (#x85) - 'N' => ret = char::from_u32(0x85).unwrap(), - // Unicode non-breaking space (#xA0) - '_' => ret = char::from_u32(0xA0).unwrap(), - // Unicode line separator (#x2028) - 'L' => ret = char::from_u32(0x2028).unwrap(), - // Unicode paragraph separator (#x2029) - 'P' => ret = char::from_u32(0x2029).unwrap(), - 'x' => code_length = 2, - 'u' => code_length = 4, - 'U' => code_length = 8, - _ => { - return Err(ScanError::new( - *start_mark, - "while parsing a quoted scalar, found unknown escape character", - )) - } - } - self.skip_n_non_blank(2); - - // Consume an arbitrary escape code. - if code_length > 0 { - self.lookahead(code_length); - let mut value = 0u32; - for i in 0..code_length { - if !is_hex(self.buffer[i]) { - return Err(ScanError::new( - *start_mark, - "while parsing a quoted scalar, did not find expected hexadecimal number", - )); - } - value = (value << 4) + as_hex(self.buffer[i]); - } - - let Some(ch) = char::from_u32(value) else { - return Err(ScanError::new( - *start_mark, - "while parsing a quoted scalar, found invalid Unicode character escape code", - )); - }; - ret = ch; - - self.skip_n_non_blank(code_length); - } - Ok(ret) - } - - fn fetch_plain_scalar(&mut self) -> ScanResult { - self.save_simple_key(); - self.disallow_simple_key(); - - let tok = self.scan_plain_scalar()?; - - self.tokens.push_back(tok); - Ok(()) - } - - /// Scan for a plain scalar. - /// - /// Plain scalars are the most readable but restricted style. They may span multiple lines in - /// some contexts. - #[allow(clippy::too_many_lines)] - fn scan_plain_scalar(&mut self) -> Result { - self.unroll_non_block_indents(); - let indent = self.indent + 1; - let start_mark = self.mark; - - if self.flow_level > 0 && (start_mark.col as isize) < indent { - return Err(ScanError::new( - start_mark, - "invalid indentation in flow construct", - )); - } - - let mut string = String::with_capacity(32); - let mut leading_break = String::with_capacity(32); - let mut trailing_breaks = String::with_capacity(32); - let mut whitespaces = String::with_capacity(32); - - loop { - self.lookahead(4); - if self.next_is_document_indicator() || self.ch() == '#' { - break; - } - - if self.flow_level > 0 && self.ch() == '-' && is_flow(self.buffer[1]) { - return Err(ScanError::new( - self.mark, - "plain scalar cannot start with '-' followed by ,[]{}", - )); - } - - if !is_blank_or_breakz(self.ch()) && self.next_can_be_plain_scalar() { - if self.leading_whitespace { - if leading_break.is_empty() { - string.push_str(&leading_break); - string.push_str(&trailing_breaks); - trailing_breaks.clear(); - leading_break.clear(); - } else { - if trailing_breaks.is_empty() { - string.push(' '); - } else { - string.push_str(&trailing_breaks); - trailing_breaks.clear(); - } - leading_break.clear(); - } - self.leading_whitespace = false; - } else if !whitespaces.is_empty() { - string.push_str(&whitespaces); - whitespaces.clear(); - } - - // We can unroll the first iteration of the loop. - string.push(self.ch()); - self.skip_non_blank(); - self.lookahead(2); - - // Add content non-blank characters to the scalar. - while !is_blank_or_breakz(self.ch()) { - if !self.next_can_be_plain_scalar() { - break; - } - - string.push(self.ch()); - self.skip_non_blank(); - self.lookahead(2); - } - } - - // We may reach the end of a plain scalar if: - // - We reach eof - // - We reach ": " - // - We find a flow character in a flow context - if !(is_blank(self.ch()) || is_break(self.ch())) { - break; - } - - // Process blank characters. - while is_blank(self.look_ch()) || is_break(self.ch()) { - if is_blank(self.ch()) { - if !self.leading_whitespace { - whitespaces.push(self.ch()); - self.skip_blank(); - } else if (self.mark.col as isize) < indent && self.ch() == '\t' { - // Tabs in an indentation columns are allowed if and only if the line is - // empty. Skip to the end of the line. - self.skip_ws_to_eol(SkipTabs::Yes)?; - if !is_breakz(self.ch()) { - return Err(ScanError::new( - start_mark, - "while scanning a plain scalar, found a tab", - )); - } - } else { - self.skip_blank(); - } - } else { - self.lookahead(2); - // Check if it is a first line break - if self.leading_whitespace { - self.read_break(&mut trailing_breaks); - } else { - whitespaces.clear(); - self.read_break(&mut leading_break); - self.leading_whitespace = true; - } - } - } - - // check indentation level - if self.flow_level == 0 && (self.mark.col as isize) < indent { - break; - } - } - - if self.leading_whitespace { - self.allow_simple_key(); - } - - Ok(Token( - start_mark, - TokenType::Scalar(TScalarStyle::Plain, string), - )) - } - - fn fetch_key(&mut self) -> ScanResult { - let start_mark = self.mark; - if self.flow_level == 0 { - // Check if we are allowed to start a new key (not necessarily simple). - if !self.simple_key_allowed { - return Err(ScanError::new( - self.mark, - "mapping keys are not allowed in this context", - )); - } - self.roll_indent( - start_mark.col, - None, - TokenType::BlockMappingStart, - start_mark, - ); - } else { - // The parser, upon receiving a `Key`, will insert a `MappingStart` event. - self.flow_mapping_started = true; - } - - self.remove_simple_key()?; - - if self.flow_level == 0 { - self.allow_simple_key(); - } else { - self.disallow_simple_key(); - } - - self.skip_non_blank(); - self.skip_yaml_whitespace()?; - if self.ch() == '\t' { - return Err(ScanError::new( - self.mark(), - "tabs disallowed in this context", - )); - } - self.tokens.push_back(Token(start_mark, TokenType::Key)); - Ok(()) - } - - /// Fetch a value from a mapping (after a `:`). - fn fetch_value(&mut self) -> ScanResult { - let sk = self.simple_keys.last().unwrap().clone(); - let start_mark = self.mark; - self.implicit_flow_mapping = self.flow_level > 0 && !self.flow_mapping_started; - - // Skip over ':'. - self.skip_non_blank(); - if self.look_ch() == '\t' - && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws() - && (self.ch() == '-' || is_alpha(self.ch())) - { - return Err(ScanError::new( - self.mark, - "':' must be followed by a valid YAML whitespace", - )); - } - - if sk.possible { - // insert simple key - let tok = Token(sk.mark, TokenType::Key); - self.insert_token(sk.token_number - self.tokens_parsed, tok); - if self.implicit_flow_mapping { - if sk.mark.line < start_mark.line { - return Err(ScanError::new( - start_mark, - "illegal placement of ':' indicator", - )); - } - self.insert_token( - sk.token_number - self.tokens_parsed, - Token(self.mark, TokenType::FlowMappingStart), - ); - } - - // Add the BLOCK-MAPPING-START token if needed. - self.roll_indent( - sk.mark.col, - Some(sk.token_number), - TokenType::BlockMappingStart, - start_mark, - ); - self.roll_one_col_indent(); - - self.simple_keys.last_mut().unwrap().possible = false; - self.disallow_simple_key(); - } else { - if self.implicit_flow_mapping { - self.tokens - .push_back(Token(self.mark, TokenType::FlowMappingStart)); - } - // The ':' indicator follows a complex key. - if self.flow_level == 0 { - if !self.simple_key_allowed { - return Err(ScanError::new( - start_mark, - "mapping values are not allowed in this context", - )); - } - - self.roll_indent( - start_mark.col, - None, - TokenType::BlockMappingStart, - start_mark, - ); - } - self.roll_one_col_indent(); - - if self.flow_level == 0 { - self.allow_simple_key(); - } else { - self.disallow_simple_key(); - } - } - self.tokens.push_back(Token(start_mark, TokenType::Value)); - - Ok(()) - } - - /// Add an indentation level to the stack with the given block token, if needed. - /// - /// An indentation level is added only if: - /// - We are not in a flow-style construct (which don't have indentation per-se). - /// - The current column is further indented than the last indent we have registered. - fn roll_indent(&mut self, col: usize, number: Option, tok: TokenType, mark: Marker) { - if self.flow_level > 0 { - return; - } - - // If the last indent was a non-block indent, remove it. - // This means that we prepared an indent that we thought we wouldn't use, but realized just - // now that it is a block indent. - if self.indent <= col as isize { - if let Some(indent) = self.indents.last() { - if !indent.needs_block_end { - self.indent = indent.indent; - self.indents.pop(); - } - } - } - - if self.indent < col as isize { - self.indents.push(Indent { - indent: self.indent, - needs_block_end: true, - }); - self.indent = col as isize; - let tokens_parsed = self.tokens_parsed; - match number { - Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)), - None => self.tokens.push_back(Token(mark, tok)), - } - } - } - - /// Pop indentation levels from the stack as much as needed. - /// - /// Indentation levels are popped from the stack while they are further indented than `col`. - /// If we are in a flow-style construct (which don't have indentation per-se), this function - /// does nothing. - fn unroll_indent(&mut self, col: isize) { - if self.flow_level > 0 { - return; - } - while self.indent > col { - let indent = self.indents.pop().unwrap(); - self.indent = indent.indent; - if indent.needs_block_end { - self.tokens.push_back(Token(self.mark, TokenType::BlockEnd)); - } - } - } - - /// Add an indentation level of 1 column that does not start a block. - /// - /// See the documentation of [`Indent::needs_block_end`] for more details. - /// An indentation is not added if we are inside a flow level or if the last indent is already - /// a non-block indent. - fn roll_one_col_indent(&mut self) { - if self.flow_level == 0 && self.indents.last().map_or(false, |x| x.needs_block_end) { - self.indents.push(Indent { - indent: self.indent, - needs_block_end: false, - }); - self.indent += 1; - } - } - - /// Unroll all last indents created with [`Self::roll_one_col_indent`]. - fn unroll_non_block_indents(&mut self) { - while let Some(indent) = self.indents.last() { - if indent.needs_block_end { - break; - } - self.indent = indent.indent; - self.indents.pop(); - } - } - - /// Mark the next token to be inserted as a potential simple key. - fn save_simple_key(&mut self) { - if self.simple_key_allowed { - let required = self.flow_level == 0 - && self.indent == (self.mark.col as isize) - && self.indents.last().unwrap().needs_block_end; - let mut sk = SimpleKey::new(self.mark); - sk.possible = true; - sk.required = required; - sk.token_number = self.tokens_parsed + self.tokens.len(); - - self.simple_keys.pop(); - self.simple_keys.push(sk); - } - } - - fn remove_simple_key(&mut self) -> ScanResult { - let last = self.simple_keys.last_mut().unwrap(); - if last.possible && last.required { - return Err(ScanError::new(self.mark, "simple key expected")); - } - - last.possible = false; - Ok(()) - } - - /// Check whether the next characters may be part of a plain scalar. - /// - /// This function assumes we are not given a blankz character. - // For some reason, `#[inline]` is not enough. - #[allow(clippy::inline_always)] - #[inline(always)] - fn next_can_be_plain_scalar(&self) -> bool { - match self.ch() { - // indicators can end a plain scalar, see 7.3.3. Plain Style - ':' if is_blank_or_breakz(self.buffer[1]) - || (self.flow_level > 0 && is_flow(self.buffer[1])) => - { - false - } - c if self.flow_level > 0 && is_flow(c) => false, - _ => true, - } - } - - /// Return whether the scanner is inside a block but outside of a flow sequence. - fn is_within_block(&self) -> bool { - !self.indents.is_empty() - } - - /// If an implicit mapping had started, end it. - fn end_implicit_mapping(&mut self, mark: Marker) { - if self.implicit_flow_mapping { - self.implicit_flow_mapping = false; - self.flow_mapping_started = false; - self.tokens - .push_back(Token(mark, TokenType::FlowMappingEnd)); - } - } -} - -/// Behavior to adopt regarding treating tabs as whitespace. -/// -/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space. -#[derive(Copy, Clone, Eq, PartialEq)] -enum SkipTabs { - /// Skip all tabs as whitespace. - Yes, - /// Don't skip any tab. Return from the function when encountering one. - No, - /// Return value from the function. - Result( - /// Whether tabs were encountered. - bool, - /// Whether at least 1 valid yaml whitespace has been encountered. - bool, - ), -} - -impl SkipTabs { - /// Whether tabs were found while skipping whitespace. - /// - /// This function must be called after a call to `skip_ws_to_eol`. - fn found_tabs(self) -> bool { - matches!(self, SkipTabs::Result(true, _)) - } - - /// Whether a valid YAML whitespace has been found in skipped-over content. - /// - /// This function must be called after a call to `skip_ws_to_eol`. - fn has_valid_yaml_ws(self) -> bool { - matches!(self, SkipTabs::Result(_, true)) - } -} - -/// Chomping, how final line breaks and trailing empty lines are interpreted. -/// -/// See YAML spec 8.1.1.2. -#[derive(PartialEq, Eq)] -pub enum Chomping { - /// The final line break and any trailing empty lines are excluded. - Strip, - /// The final line break is preserved, but trailing empty lines are excluded. - Clip, - /// The final line break and trailing empty lines are included. - Keep, -} - -#[cfg(test)] -mod test { - #[test] - fn test_is_anchor_char() { - use super::is_anchor_char; - assert!(is_anchor_char('x')); - } -} diff --git a/saphyr/src/yaml.rs b/saphyr/src/yaml.rs index 3c429d5..3af4bb3 100644 --- a/saphyr/src/yaml.rs +++ b/saphyr/src/yaml.rs @@ -10,8 +10,7 @@ use std::{collections::BTreeMap, convert::TryFrom, mem, ops::Index, ops::IndexMu use encoding_rs::{Decoder, DecoderResult, Encoding}; use hashlink::LinkedHashMap; -use crate::parser::{Event, MarkedEventReceiver, Parser, Tag}; -use crate::scanner::{Marker, ScanError, TScalarStyle}; +use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser, ScanError, TScalarStyle, Tag}; /// A YAML node is stored as this `Yaml` enumeration, which provides an easy way to /// access your YAML document. @@ -19,7 +18,7 @@ use crate::scanner::{Marker, ScanError, TScalarStyle}; /// # Examples /// /// ``` -/// use yaml_rust2::Yaml; +/// use saphyr::Yaml; /// let foo = Yaml::from_str("-123"); // convert the string to the appropriate YAML type /// assert_eq!(foo.as_i64().unwrap(), -123); /// @@ -306,7 +305,7 @@ pub enum YAMLDecodingTrap { /// For example, to read a YAML file while ignoring Unicode decoding errors you can set the /// `encoding_trap` to `encoding::DecoderTrap::Ignore`. /// ```rust -/// use yaml_rust2::yaml::{YamlDecoder, YAMLDecodingTrap}; +/// use saphyr::{YamlDecoder, YAMLDecodingTrap}; /// /// let string = b"--- /// a\xa9: 1 @@ -580,7 +579,7 @@ impl Yaml { /// replace it with a given value `other`. Otherwise, return self unchanged. /// /// ``` - /// use yaml_rust2::yaml::Yaml; + /// use saphyr::Yaml; /// /// assert_eq!(Yaml::BadValue.or(Yaml::Integer(3)), Yaml::Integer(3)); /// assert_eq!(Yaml::Integer(3).or(Yaml::BadValue), Yaml::Integer(3)); @@ -613,7 +612,7 @@ impl Yaml { /// /// # Examples /// ``` - /// # use yaml_rust2::yaml::Yaml; + /// # use saphyr::Yaml; /// assert!(matches!(Yaml::from_str("42"), Yaml::Integer(42))); /// assert!(matches!(Yaml::from_str("0x2A"), Yaml::Integer(42))); /// assert!(matches!(Yaml::from_str("0o52"), Yaml::Integer(42))); diff --git a/saphyr/tests/basic.rs b/saphyr/tests/basic.rs index b769c2b..cc00cb0 100644 --- a/saphyr/tests/basic.rs +++ b/saphyr/tests/basic.rs @@ -1,8 +1,7 @@ #![allow(clippy::bool_assert_comparison)] #![allow(clippy::float_cmp)] -use std::vec; -use yaml_rust2::{Yaml, YamlEmitter, YamlLoader}; +use saphyr::{Yaml, YamlEmitter, YamlLoader}; #[test] fn test_api() { @@ -44,27 +43,6 @@ fn test_api() { assert!(!writer.is_empty()); } -#[test] -fn test_fail() { - let s = " -# syntax error -scalar -key: [1, 2]] -key1:a2 -"; - let Err(error) = YamlLoader::load_from_str(s) else { - panic!() - }; - assert_eq!( - error.info(), - "mapping values are not allowed in this context" - ); - assert_eq!( - error.to_string(), - "mapping values are not allowed in this context at byte 26 line 4 column 4" - ); -} - #[test] fn test_coerce() { let s = "--- @@ -80,51 +58,6 @@ c: [1, 2] assert!(doc["d"][0].is_badvalue()); } -#[test] -fn test_empty_doc() { - let s: String = String::new(); - YamlLoader::load_from_str(&s).unwrap(); - let s: String = "---".to_owned(); - assert_eq!(YamlLoader::load_from_str(&s).unwrap()[0], Yaml::Null); -} - -#[test] -fn test_parser() { - let s: String = " -# comment -a0 bb: val -a1: - b1: 4 - b2: d -a2: 4 # i'm comment -a3: [1, 2, 3] -a4: - - - a1 - - a2 - - 2 -a5: 'single_quoted' -a6: \"double_quoted\" -a7: 你好 -" - .to_owned(); - let out = YamlLoader::load_from_str(&s).unwrap(); - let doc = &out[0]; - assert_eq!(doc["a7"].as_str().unwrap(), "你好"); -} - -#[test] -fn test_multi_doc() { - let s = " -'a scalar' ---- -'a scalar' ---- -'a scalar' -"; - let out = YamlLoader::load_from_str(s).unwrap(); - assert_eq!(out.len(), 3); -} - #[test] fn test_anchor() { let s = " @@ -150,15 +83,6 @@ a1: &DEFAULT assert_eq!(doc["a1"]["b2"], Yaml::BadValue); } -#[test] -fn test_github_27() { - // https://github.com/chyh1990/yaml-rust/issues/27 - let s = "&a"; - let out = YamlLoader::load_from_str(s).unwrap(); - let doc = &out[0]; - assert_eq!(doc.as_str().unwrap(), ""); -} - #[test] fn test_plain_datatype() { let s = " @@ -223,45 +147,6 @@ fn test_plain_datatype() { assert!(!doc[25][1].as_bool().unwrap()); } -#[test] -fn test_bad_hyphen() { - // See: https://github.com/chyh1990/yaml-rust/issues/23 - let s = "{-"; - assert!(YamlLoader::load_from_str(s).is_err()); -} - -#[test] -fn test_issue_65() { - // See: https://github.com/chyh1990/yaml-rust/issues/65 - let b = "\n\"ll\\\"ll\\\r\n\"ll\\\"ll\\\r\r\r\rU\r\r\rU"; - assert!(YamlLoader::load_from_str(b).is_err()); -} - -#[test] -fn test_issue_65_mwe() { - // A MWE for `test_issue_65`. The error over there is that there is invalid trailing content - // after a double quoted string. - let b = r#""foo" l"#; - assert!(YamlLoader::load_from_str(b).is_err()); -} - -#[test] -fn test_bad_docstart() { - assert!(YamlLoader::load_from_str("---This used to cause an infinite loop").is_ok()); - assert_eq!( - YamlLoader::load_from_str("----"), - Ok(vec![Yaml::String(String::from("----"))]) - ); - assert_eq!( - YamlLoader::load_from_str("--- #here goes a comment"), - Ok(vec![Yaml::Null]) - ); - assert_eq!( - YamlLoader::load_from_str("---- #here goes a comment"), - Ok(vec![Yaml::String(String::from("----"))]) - ); -} - #[test] fn test_plain_datatype_with_into_methods() { let s = " @@ -348,95 +233,3 @@ fn test_integer_key() { let first = out.into_iter().next().unwrap(); assert_eq!(first[0]["important"].as_bool().unwrap(), true); } - -#[test] -fn test_indentation_equality() { - let four_spaces = YamlLoader::load_from_str( - r" -hash: - with: - indentations -", - ) - .unwrap() - .into_iter() - .next() - .unwrap(); - - let two_spaces = YamlLoader::load_from_str( - r" -hash: - with: - indentations -", - ) - .unwrap() - .into_iter() - .next() - .unwrap(); - - let one_space = YamlLoader::load_from_str( - r" -hash: - with: - indentations -", - ) - .unwrap() - .into_iter() - .next() - .unwrap(); - - let mixed_spaces = YamlLoader::load_from_str( - r" -hash: - with: - indentations -", - ) - .unwrap() - .into_iter() - .next() - .unwrap(); - - assert_eq!(four_spaces, two_spaces); - assert_eq!(two_spaces, one_space); - assert_eq!(four_spaces, mixed_spaces); -} - -#[test] -fn test_two_space_indentations() { - // https://github.com/kbknapp/clap-rs/issues/965 - - let s = r" -subcommands: - - server: - about: server related commands -subcommands2: - - server: - about: server related commands -subcommands3: - - server: - about: server related commands - "; - - let out = YamlLoader::load_from_str(s).unwrap(); - let doc = &out.into_iter().next().unwrap(); - - println!("{doc:#?}"); - assert_eq!(doc["subcommands"][0]["server"], Yaml::Null); - assert!(doc["subcommands2"][0]["server"].as_hash().is_some()); - assert!(doc["subcommands3"][0]["server"].as_hash().is_some()); -} - -#[test] -fn test_recursion_depth_check_objects() { - let s = "{a:".repeat(10_000) + &"}".repeat(10_000); - assert!(YamlLoader::load_from_str(&s).is_err()); -} - -#[test] -fn test_recursion_depth_check_arrays() { - let s = "[".repeat(10_000) + &"]".repeat(10_000); - assert!(YamlLoader::load_from_str(&s).is_err()); -} diff --git a/saphyr/tests/emitter.rs b/saphyr/tests/emitter.rs index c085a56..53e558f 100644 --- a/saphyr/tests/emitter.rs +++ b/saphyr/tests/emitter.rs @@ -1,4 +1,4 @@ -use yaml_rust2::{YamlEmitter, YamlLoader}; +use saphyr::{YamlEmitter, YamlLoader}; #[allow(clippy::similar_names)] #[test] diff --git a/saphyr/tests/quickcheck.rs b/saphyr/tests/quickcheck.rs index fdf2549..819d064 100644 --- a/saphyr/tests/quickcheck.rs +++ b/saphyr/tests/quickcheck.rs @@ -1,9 +1,9 @@ -extern crate yaml_rust2; #[macro_use] extern crate quickcheck; use quickcheck::TestResult; -use yaml_rust2::{Yaml, YamlEmitter, YamlLoader}; + +use saphyr::{Yaml, YamlEmitter, YamlLoader}; quickcheck! { fn test_check_weird_keys(xs: Vec) -> TestResult { diff --git a/saphyr/tests/scanner.rs b/saphyr/tests/scanner.rs deleted file mode 100644 index 0a09517..0000000 --- a/saphyr/tests/scanner.rs +++ /dev/null @@ -1,440 +0,0 @@ -#![allow(clippy::enum_glob_use)] - -use yaml_rust2::{scanner::TokenType::*, scanner::*}; - -macro_rules! next { - ($p:ident, $tk:pat) => {{ - let tok = $p.next().unwrap(); - match tok.1 { - $tk => {} - _ => panic!("unexpected token: {:?}", tok), - } - }}; -} - -macro_rules! next_scalar { - ($p:ident, $tk:expr, $v:expr) => {{ - let tok = $p.next().unwrap(); - match tok.1 { - Scalar(style, ref v) => { - assert_eq!(style, $tk); - assert_eq!(v, $v); - } - _ => panic!("unexpected token: {:?}", tok), - } - }}; -} - -macro_rules! end { - ($p:ident) => {{ - assert_eq!($p.next(), None); - }}; -} -/// test cases in libyaml scanner.c -#[test] -fn test_empty() { - let s = ""; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_scalar() { - let s = "a scalar"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, Scalar(TScalarStyle::Plain, _)); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_explicit_scalar() { - let s = "--- -'a scalar' -... -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, DocumentStart); - next!(p, Scalar(TScalarStyle::SingleQuoted, _)); - next!(p, DocumentEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_multiple_documents() { - let s = " -'a scalar' ---- -'a scalar' ---- -'a scalar' -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, Scalar(TScalarStyle::SingleQuoted, _)); - next!(p, DocumentStart); - next!(p, Scalar(TScalarStyle::SingleQuoted, _)); - next!(p, DocumentStart); - next!(p, Scalar(TScalarStyle::SingleQuoted, _)); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_a_flow_sequence() { - let s = "[item 1, item 2, item 3]"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, FlowSequenceStart); - next_scalar!(p, TScalarStyle::Plain, "item 1"); - next!(p, FlowEntry); - next!(p, Scalar(TScalarStyle::Plain, _)); - next!(p, FlowEntry); - next!(p, Scalar(TScalarStyle::Plain, _)); - next!(p, FlowSequenceEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_a_flow_mapping() { - let s = " -{ - a simple key: a value, # Note that the KEY token is produced. - ? a complex key: another value, -} -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, FlowMappingStart); - next!(p, Key); - next!(p, Scalar(TScalarStyle::Plain, _)); - next!(p, Value); - next!(p, Scalar(TScalarStyle::Plain, _)); - next!(p, FlowEntry); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "a complex key"); - next!(p, Value); - next!(p, Scalar(TScalarStyle::Plain, _)); - next!(p, FlowEntry); - next!(p, FlowMappingEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_block_sequences() { - let s = " -- item 1 -- item 2 -- - - item 3.1 - - item 3.2 -- - key 1: value 1 - key 2: value 2 -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 1"); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 2"); - next!(p, BlockEntry); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 3.1"); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 3.2"); - next!(p, BlockEnd); - next!(p, BlockEntry); - next!(p, BlockMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key 1"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "value 1"); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key 2"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "value 2"); - next!(p, BlockEnd); - next!(p, BlockEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_block_mappings() { - let s = " -a simple key: a value # The KEY token is produced here. -? a complex key -: another value -a mapping: - key 1: value 1 - key 2: value 2 -a sequence: - - item 1 - - item 2 -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, BlockMappingStart); - next!(p, Key); - next!(p, Scalar(_, _)); - next!(p, Value); - next!(p, Scalar(_, _)); - next!(p, Key); - next!(p, Scalar(_, _)); - next!(p, Value); - next!(p, Scalar(_, _)); - next!(p, Key); - next!(p, Scalar(_, _)); - next!(p, Value); // libyaml comment seems to be wrong - next!(p, BlockMappingStart); - next!(p, Key); - next!(p, Scalar(_, _)); - next!(p, Value); - next!(p, Scalar(_, _)); - next!(p, Key); - next!(p, Scalar(_, _)); - next!(p, Value); - next!(p, Scalar(_, _)); - next!(p, BlockEnd); - next!(p, Key); - next!(p, Scalar(_, _)); - next!(p, Value); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next!(p, Scalar(_, _)); - next!(p, BlockEntry); - next!(p, Scalar(_, _)); - next!(p, BlockEnd); - next!(p, BlockEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_no_block_sequence_start() { - let s = " -key: -- item 1 -- item 2 -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, BlockMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key"); - next!(p, Value); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 1"); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 2"); - next!(p, BlockEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_collections_in_sequence() { - let s = " -- - item 1 - - item 2 -- key 1: value 1 - key 2: value 2 -- ? complex key - : complex value -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 1"); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 2"); - next!(p, BlockEnd); - next!(p, BlockEntry); - next!(p, BlockMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key 1"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "value 1"); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key 2"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "value 2"); - next!(p, BlockEnd); - next!(p, BlockEntry); - next!(p, BlockMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "complex key"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "complex value"); - next!(p, BlockEnd); - next!(p, BlockEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_collections_in_mapping() { - let s = " -? a sequence -: - item 1 - - item 2 -? a mapping -: key 1: value 1 - key 2: value 2 -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, BlockMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "a sequence"); - next!(p, Value); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 1"); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "item 2"); - next!(p, BlockEnd); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "a mapping"); - next!(p, Value); - next!(p, BlockMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key 1"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "value 1"); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "key 2"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "value 2"); - next!(p, BlockEnd); - next!(p, BlockEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_spec_ex7_3() { - let s = " -{ - ? foo :, - : bar, -} -"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, FlowMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "foo"); - next!(p, Value); - next!(p, FlowEntry); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "bar"); - next!(p, FlowEntry); - next!(p, FlowMappingEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_plain_scalar_starting_with_indicators_in_flow() { - // "Plain scalars must not begin with most indicators, as this would cause ambiguity with - // other YAML constructs. However, the “:”, “?” and “-” indicators may be used as the first - // character if followed by a non-space “safe” character, as this causes no ambiguity." - - let s = "{a: :b}"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, FlowMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "a"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, ":b"); - next!(p, FlowMappingEnd); - next!(p, StreamEnd); - end!(p); - - let s = "{a: ?b}"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, FlowMappingStart); - next!(p, Key); - next_scalar!(p, TScalarStyle::Plain, "a"); - next!(p, Value); - next_scalar!(p, TScalarStyle::Plain, "?b"); - next!(p, FlowMappingEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_plain_scalar_starting_with_indicators_in_block() { - let s = ":a"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next_scalar!(p, TScalarStyle::Plain, ":a"); - next!(p, StreamEnd); - end!(p); - - let s = "?a"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next_scalar!(p, TScalarStyle::Plain, "?a"); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_plain_scalar_containing_indicators_in_block() { - let s = "a:,b"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next_scalar!(p, TScalarStyle::Plain, "a:,b"); - next!(p, StreamEnd); - end!(p); - - let s = ":,b"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next_scalar!(p, TScalarStyle::Plain, ":,b"); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_scanner_cr() { - let s = "---\r\n- tok1\r\n- tok2"; - let mut p = Scanner::new(s.chars()); - next!(p, StreamStart(..)); - next!(p, DocumentStart); - next!(p, BlockSequenceStart); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "tok1"); - next!(p, BlockEntry); - next_scalar!(p, TScalarStyle::Plain, "tok2"); - next!(p, BlockEnd); - next!(p, StreamEnd); - end!(p); -} - -#[test] -fn test_uri() { - // TODO -} - -#[test] -fn test_uri_escapes() { - // TODO -} diff --git a/saphyr/tests/spec_test.rs b/saphyr/tests/spec_test.rs index ecf1327..80b6bfd 100644 --- a/saphyr/tests/spec_test.rs +++ b/saphyr/tests/spec_test.rs @@ -1,84 +1,7 @@ -#![allow(dead_code)] -#![allow(non_upper_case_globals)] -extern crate yaml_rust2; - -use yaml_rust2::parser::{Event, EventReceiver, Parser}; -use yaml_rust2::scanner::TScalarStyle; - -// These names match the names used in the C++ test suite. -#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))] -#[derive(Clone, PartialEq, PartialOrd, Debug)] -enum TestEvent { - OnDocumentStart, - OnDocumentEnd, - OnSequenceStart, - OnSequenceEnd, - OnMapStart, - OnMapEnd, - OnScalar, - OnAlias, - OnNull, -} - -struct YamlChecker { - pub evs: Vec, -} - -impl EventReceiver for YamlChecker { - fn on_event(&mut self, ev: Event) { - let tev = match ev { - Event::DocumentStart => TestEvent::OnDocumentStart, - Event::DocumentEnd => TestEvent::OnDocumentEnd, - Event::SequenceStart(..) => TestEvent::OnSequenceStart, - Event::SequenceEnd => TestEvent::OnSequenceEnd, - Event::MappingStart(..) => TestEvent::OnMapStart, - Event::MappingEnd => TestEvent::OnMapEnd, - Event::Scalar(ref v, style, _, _) => { - if v == "~" && style == TScalarStyle::Plain { - TestEvent::OnNull - } else { - TestEvent::OnScalar - } - } - Event::Alias(_) => TestEvent::OnAlias, - _ => return, // ignore other events - }; - self.evs.push(tev); - } -} - -fn str_to_test_events(docs: &str) -> Vec { - let mut p = YamlChecker { evs: Vec::new() }; - let mut parser = Parser::new_from_str(docs); - parser.load(&mut p, true).unwrap(); - p.evs -} - -macro_rules! assert_next { - ($v:expr, $p:pat) => { - match $v.next().unwrap() { - $p => {} - e => { - panic!("unexpected event: {:?} (expected {:?})", e, stringify!($p)); - } - } - }; -} - -// auto generated from handler_spec_test.cpp -include!("specexamples.rs.inc"); -include!("spec_test.rs.inc"); - -// hand-crafted tests -//#[test] -//fn test_hc_alias() { -//} +use saphyr::{Hash, Yaml, YamlEmitter, YamlLoader}; #[test] fn test_mapvec_legal() { - use yaml_rust2::yaml::{Hash, Yaml}; - use yaml_rust2::{YamlEmitter, YamlLoader}; - // Emitting a `map>, _>` should result in legal yaml that // we can parse. diff --git a/saphyr/tests/test_round_trip.rs b/saphyr/tests/test_round_trip.rs index 5f0a7a1..0d03d3e 100644 --- a/saphyr/tests/test_round_trip.rs +++ b/saphyr/tests/test_round_trip.rs @@ -1,6 +1,4 @@ -extern crate yaml_rust2; - -use yaml_rust2::{Yaml, YamlEmitter, YamlLoader}; +use saphyr::{Yaml, YamlEmitter, YamlLoader}; fn roundtrip(original: &Yaml) { let mut emitted = String::new(); diff --git a/saphyr/tests/yaml-test-suite b/saphyr/tests/yaml-test-suite deleted file mode 160000 index 45db50a..0000000 --- a/saphyr/tests/yaml-test-suite +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 45db50aecf9b1520f8258938c88f396e96f30831 diff --git a/saphyr/tests/yaml-test-suite.rs b/saphyr/tests/yaml-test-suite.rs deleted file mode 100644 index 818083f..0000000 --- a/saphyr/tests/yaml-test-suite.rs +++ /dev/null @@ -1,295 +0,0 @@ -use std::fs::{self, DirEntry}; - -use libtest_mimic::{run_tests, Arguments, Outcome, Test}; - -use yaml_rust2::{ - parser::{Event, EventReceiver, Parser, Tag}, - scanner::TScalarStyle, - yaml, ScanError, Yaml, YamlLoader, -}; - -type Result> = std::result::Result; - -struct YamlTest { - yaml_visual: String, - yaml: String, - expected_events: String, - expected_error: bool, -} - -fn main() -> Result<()> { - let mut arguments = Arguments::from_args(); - if arguments.num_threads.is_none() { - arguments.num_threads = Some(1); - } - let tests: Vec> = std::fs::read_dir("tests/yaml-test-suite/src")? - .map(|entry| -> Result<_> { - let entry = entry?; - let tests = load_tests_from_file(&entry)?; - Ok(tests) - }) - .collect::>()?; - let mut tests: Vec<_> = tests.into_iter().flatten().collect(); - tests.sort_by_key(|t| t.name.clone()); - - run_tests(&arguments, tests, run_yaml_test).exit(); -} - -fn run_yaml_test(test: &Test) -> Outcome { - let desc = &test.data; - let actual_events = parse_to_events(&desc.yaml); - let events_diff = actual_events.map(|events| events_differ(&events, &desc.expected_events)); - let mut error_text = match (&events_diff, desc.expected_error) { - (Ok(x), true) => Some(format!("no error when expected: {x:#?}")), - (Err(_), true) | (Ok(None), false) => None, - (Err(e), false) => Some(format!("unexpected error {e:?}")), - (Ok(Some(diff)), false) => Some(format!("events differ: {diff}")), - }; - - // Show a caret on error. - if let Some(text) = &mut error_text { - use std::fmt::Write; - let _ = writeln!(text, "\n### Input:\n{}\n### End", desc.yaml_visual); - if let Err(err) = &events_diff { - writeln!(text, "### Error position").unwrap(); - let mut lines = desc.yaml.lines(); - for _ in 0..(err.marker().line() - 1) { - let l = lines.next().unwrap(); - writeln!(text, "{l}").unwrap(); - } - writeln!(text, "\x1B[91;1m{}", lines.next().unwrap()).unwrap(); - for _ in 0..err.marker().col() { - write!(text, " ").unwrap(); - } - writeln!(text, "^\x1b[m").unwrap(); - for l in lines { - writeln!(text, "{l}").unwrap(); - } - writeln!(text, "### End error position").unwrap(); - } - } - - match error_text { - None => Outcome::Passed, - Some(txt) => Outcome::Failed { msg: Some(txt) }, - } -} - -fn load_tests_from_file(entry: &DirEntry) -> Result>> { - let file_name = entry.file_name().to_string_lossy().to_string(); - let test_name = file_name - .strip_suffix(".yaml") - .ok_or("unexpected filename")?; - let tests = YamlLoader::load_from_str(&fs::read_to_string(entry.path())?)?; - let tests = tests[0].as_vec().ok_or("no test list found in file")?; - - let mut result = vec![]; - let mut current_test = yaml::Hash::new(); - for (idx, test_data) in tests.iter().enumerate() { - let name = if tests.len() > 1 { - format!("{test_name}-{idx:02}") - } else { - test_name.to_string() - }; - - // Test fields except `fail` are "inherited" - let test_data = test_data.as_hash().unwrap(); - current_test.remove(&Yaml::String("fail".into())); - for (key, value) in test_data.clone() { - current_test.insert(key, value); - } - - let current_test = Yaml::Hash(current_test.clone()); // Much better indexing - - if current_test["skip"] != Yaml::BadValue { - continue; - } - - result.push(Test { - name, - kind: String::new(), - is_ignored: false, - is_bench: false, - data: YamlTest { - yaml_visual: current_test["yaml"].as_str().unwrap().to_string(), - yaml: visual_to_raw(current_test["yaml"].as_str().unwrap()), - expected_events: visual_to_raw(current_test["tree"].as_str().unwrap()), - expected_error: current_test["fail"].as_bool() == Some(true), - }, - }); - } - Ok(result) -} - -fn parse_to_events(source: &str) -> Result, ScanError> { - let mut reporter = EventReporter::new(); - Parser::new_from_str(source).load(&mut reporter, true)?; - Ok(reporter.events) -} - -struct EventReporter { - events: Vec, -} - -impl EventReporter { - fn new() -> Self { - Self { events: vec![] } - } -} - -impl EventReceiver for EventReporter { - fn on_event(&mut self, ev: Event) { - let line: String = match ev { - Event::StreamStart => "+STR".into(), - Event::StreamEnd => "-STR".into(), - - Event::DocumentStart => "+DOC".into(), - Event::DocumentEnd => "-DOC".into(), - - Event::SequenceStart(idx, tag) => { - format!("+SEQ{}{}", format_index(idx), format_tag(&tag)) - } - Event::SequenceEnd => "-SEQ".into(), - - Event::MappingStart(idx, tag) => { - format!("+MAP{}{}", format_index(idx), format_tag(&tag)) - } - Event::MappingEnd => "-MAP".into(), - - Event::Scalar(ref text, style, idx, ref tag) => { - let kind = match style { - TScalarStyle::Plain => ":", - TScalarStyle::SingleQuoted => "'", - TScalarStyle::DoubleQuoted => r#"""#, - TScalarStyle::Literal => "|", - TScalarStyle::Folded => ">", - }; - format!( - "=VAL{}{} {}{}", - format_index(idx), - format_tag(tag), - kind, - escape_text(text) - ) - } - Event::Alias(idx) => format!("=ALI *{idx}"), - Event::Nothing => return, - }; - self.events.push(line); - } -} - -fn format_index(idx: usize) -> String { - if idx > 0 { - format!(" &{idx}") - } else { - String::new() - } -} - -fn escape_text(text: &str) -> String { - let mut text = text.to_owned(); - for (ch, replacement) in [ - ('\\', r"\\"), - ('\n', "\\n"), - ('\r', "\\r"), - ('\x08', "\\b"), - ('\t', "\\t"), - ] { - text = text.replace(ch, replacement); - } - text -} - -fn format_tag(tag: &Option) -> String { - if let Some(tag) = tag { - format!(" <{}{}>", tag.handle, tag.suffix) - } else { - String::new() - } -} - -fn events_differ(actual: &[String], expected: &str) -> Option { - let actual = actual.iter().map(Some).chain(std::iter::repeat(None)); - let expected = expected_events(expected); - let expected = expected.iter().map(Some).chain(std::iter::repeat(None)); - for (idx, (act, exp)) in actual.zip(expected).enumerate() { - return match (act, exp) { - (Some(act), Some(exp)) => { - if act == exp { - continue; - } else { - Some(format!( - "line {idx} differs: \n=> expected `{exp}`\n=> found `{act}`", - )) - } - } - (Some(a), None) => Some(format!("extra actual line: {a:?}")), - (None, Some(e)) => Some(format!("extra expected line: {e:?}")), - (None, None) => None, - }; - } - unreachable!() -} - -/// Convert the snippets from "visual" to "actual" representation -fn visual_to_raw(yaml: &str) -> String { - let mut yaml = yaml.to_owned(); - for (pat, replacement) in [ - ("␣", " "), - ("»", "\t"), - ("—", ""), // Tab line continuation ——» - ("←", "\r"), - ("⇔", "\u{FEFF}"), - ("↵", ""), // Trailing newline marker - ("∎\n", ""), - ] { - yaml = yaml.replace(pat, replacement); - } - yaml -} - -/// Adapt the expectations to the yaml-rust reasonable limitations -/// -/// Drop information on node styles (flow/block) and anchor names. -/// Both are things that can be omitted according to spec. -fn expected_events(expected_tree: &str) -> Vec { - let mut anchors = vec![]; - expected_tree - .split('\n') - .map(|s| s.trim_start().to_owned()) - .filter(|s| !s.is_empty()) - .map(|mut s| { - // Anchor name-to-number conversion - if let Some(start) = s.find('&') { - if s[..start].find(':').is_none() { - let len = s[start..].find(' ').unwrap_or(s[start..].len()); - anchors.push(s[start + 1..start + len].to_owned()); - s = s.replace(&s[start..start + len], &format!("&{}", anchors.len())); - } - } - // Alias nodes name-to-number - if s.starts_with("=ALI") { - let start = s.find('*').unwrap(); - let name = &s[start + 1..]; - let idx = anchors - .iter() - .enumerate() - .filter(|(_, v)| v == &name) - .last() - .unwrap() - .0; - s = s.replace(&s[start..], &format!("*{}", idx + 1)); - } - // Dropping style information - match &*s { - "+DOC ---" => "+DOC".into(), - "-DOC ..." => "-DOC".into(), - s if s.starts_with("+SEQ []") => s.replacen("+SEQ []", "+SEQ", 1), - s if s.starts_with("+MAP {}") => s.replacen("+MAP {}", "+MAP", 1), - "=VAL :" => "=VAL :~".into(), // FIXME: known bug - s => s.into(), - } - }) - .collect() -} diff --git a/saphyr/tools/README.md b/saphyr/tools/README.md deleted file mode 100644 index 7728a0f..0000000 --- a/saphyr/tools/README.md +++ /dev/null @@ -1,229 +0,0 @@ -# `yaml-rust2` tools -This directory contains tools that are used to develop the crate. -Due to dependency management, only some of them are available as binaries from the `yaml-rust2` crate. - -| Tool | Invocation | -|------|------------| -| `bench_compare` | `cargo bench_compare` | -| `dump_events` | `cargo run --bin dump_events -- [...]` | -| `gen_large_yaml` | `cargo gen_large_yaml` | -| `run_bench` | `cargo run --bin run_bench -- [...]` | -| `time_parse` | `cargo run --bin time_parse -- [...]` | - -## `bench_compare` -See the [dedicated README file](./bench_compare/README.md). - -## `dump_events` -This is a debugging helper for the parser. It outputs events emitted by the parser for a given file. This can be paired with the `YAMLRUST2_DEBUG` environment variable to have an in-depth overview of which steps the scanner and the parser are taking. - -### Example -Consider the following `input.yaml` YAML file: -```yaml -- foo: bar -- baz: - c: [3, 4, 5] -``` - -Running `cargo run --bin dump_events -- input.yaml` outputs: -``` - ↳ StreamStart - ↳ DocumentStart - ↳ SequenceStart(0, None) - ↳ MappingStart(0, None) - ↳ Scalar("foo", Plain, 0, None) - ↳ Scalar("bar", Plain, 0, None) - ↳ MappingEnd - ↳ MappingStart(0, None) - ↳ Scalar("baz", Plain, 0, None) - ↳ Scalar("~", Plain, 0, None) - ↳ Scalar("c", Plain, 0, None) - ↳ SequenceStart(0, None) - ↳ Scalar("3", Plain, 0, None) - ↳ Scalar("4", Plain, 0, None) - ↳ Scalar("5", Plain, 0, None) - ↳ SequenceEnd - ↳ MappingEnd - ↳ SequenceEnd - ↳ DocumentEnd - ↳ StreamEnd -``` - -Running `YAMLRUST2_DEBUG=1 cargo run --bin dump_events -- input.yaml` outputs much more details: -
- Full output - -``` -Parser state: StreamStart - ↳ StreamStart(Utf8) Marker { index: 0, line: 1, col: 0 } - ↳ StreamStart - -Parser state: ImplicitDocumentStart - → fetch_next_token after whitespace Marker { index: 0, line: 1, col: 0 } '-' - ↳ BlockSequenceStart Marker { index: 0, line: 1, col: 0 } - ↳ DocumentStart - -Parser state: BlockNode - ↳ SequenceStart(0, None) - -Parser state: BlockSequenceFirstEntry - ↳ BlockEntry Marker { index: 2, line: 1, col: 2 } - → fetch_next_token after whitespace Marker { index: 2, line: 1, col: 2 } 'f' - → fetch_next_token after whitespace Marker { index: 5, line: 1, col: 5 } ':' - ↳ BlockMappingStart Marker { index: 5, line: 1, col: 5 } - ↳ MappingStart(0, None) - -Parser state: BlockMappingFirstKey - ↳ Key Marker { index: 2, line: 1, col: 2 } - ↳ Scalar(Plain, "foo") Marker { index: 2, line: 1, col: 2 } - ↳ Scalar("foo", Plain, 0, None) - -Parser state: BlockMappingValue - ↳ Value Marker { index: 5, line: 1, col: 5 } - → fetch_next_token after whitespace Marker { index: 7, line: 1, col: 7 } 'b' - ↳ Scalar(Plain, "bar") Marker { index: 7, line: 1, col: 7 } - ↳ Scalar("bar", Plain, 0, None) - -Parser state: BlockMappingKey - → fetch_next_token after whitespace Marker { index: 11, line: 2, col: 0 } '-' - ↳ BlockEnd Marker { index: 11, line: 2, col: 0 } - ↳ MappingEnd - -Parser state: BlockSequenceEntry - ↳ BlockEntry Marker { index: 13, line: 2, col: 2 } - → fetch_next_token after whitespace Marker { index: 13, line: 2, col: 2 } 'b' - → fetch_next_token after whitespace Marker { index: 16, line: 2, col: 5 } ':' - ↳ BlockMappingStart Marker { index: 16, line: 2, col: 5 } - ↳ MappingStart(0, None) - -Parser state: BlockMappingFirstKey - ↳ Key Marker { index: 13, line: 2, col: 2 } - ↳ Scalar(Plain, "baz") Marker { index: 13, line: 2, col: 2 } - ↳ Scalar("baz", Plain, 0, None) - -Parser state: BlockMappingValue - ↳ Value Marker { index: 16, line: 2, col: 5 } - → fetch_next_token after whitespace Marker { index: 20, line: 3, col: 2 } 'c' - → fetch_next_token after whitespace Marker { index: 21, line: 3, col: 3 } ':' - ↳ Key Marker { index: 20, line: 3, col: 2 } - ↳ Scalar("~", Plain, 0, None) - -Parser state: BlockMappingKey - ↳ Scalar(Plain, "c") Marker { index: 20, line: 3, col: 2 } - ↳ Scalar("c", Plain, 0, None) - -Parser state: BlockMappingValue - ↳ Value Marker { index: 21, line: 3, col: 3 } - → fetch_next_token after whitespace Marker { index: 23, line: 3, col: 5 } '[' - ↳ FlowSequenceStart Marker { index: 23, line: 3, col: 5 } - ↳ SequenceStart(0, None) - -Parser state: FlowSequenceFirstEntry - → fetch_next_token after whitespace Marker { index: 24, line: 3, col: 6 } '3' - → fetch_next_token after whitespace Marker { index: 25, line: 3, col: 7 } ',' - ↳ Scalar(Plain, "3") Marker { index: 24, line: 3, col: 6 } - ↳ Scalar("3", Plain, 0, None) - -Parser state: FlowSequenceEntry - ↳ FlowEntry Marker { index: 25, line: 3, col: 7 } - → fetch_next_token after whitespace Marker { index: 27, line: 3, col: 9 } '4' - → fetch_next_token after whitespace Marker { index: 28, line: 3, col: 10 } ',' - ↳ Scalar(Plain, "4") Marker { index: 27, line: 3, col: 9 } - ↳ Scalar("4", Plain, 0, None) - -Parser state: FlowSequenceEntry - ↳ FlowEntry Marker { index: 28, line: 3, col: 10 } - → fetch_next_token after whitespace Marker { index: 30, line: 3, col: 12 } '5' - → fetch_next_token after whitespace Marker { index: 31, line: 3, col: 13 } ']' - ↳ Scalar(Plain, "5") Marker { index: 30, line: 3, col: 12 } - ↳ Scalar("5", Plain, 0, None) - -Parser state: FlowSequenceEntry - ↳ FlowSequenceEnd Marker { index: 31, line: 3, col: 13 } - ↳ SequenceEnd - -Parser state: BlockMappingKey - → fetch_next_token after whitespace Marker { index: 33, line: 4, col: 0 } '\0' - ↳ BlockEnd Marker { index: 33, line: 4, col: 0 } - ↳ MappingEnd - -Parser state: BlockSequenceEntry - ↳ BlockEnd Marker { index: 33, line: 4, col: 0 } - ↳ SequenceEnd - -Parser state: DocumentEnd - ↳ StreamEnd Marker { index: 33, line: 4, col: 0 } - ↳ DocumentEnd - -Parser state: DocumentStart - ↳ StreamEnd -``` - -
- -While this cannot be shown in Markdown, the output is colored so that it is a bit easier to read. - -## `gen_large_yaml` -It is hard to find large (100+MiB) real-world YAML files that could be used to benchmark a parser. This utility generates multiple large files that are meant to stress the parser with different layouts of YAML files. The resulting files do not look like anything that would be encountered in production, but can serve as a base to test several features of a YAML parser. - -The generated files are the following: - - - `big.yaml`: A large array of records with few fields. One of the fields is a description, a large text block scalar spanning multiple lines. Most of the scanning happens in block scalars. - - `nested.yaml`: Very short key-value pairs that nest deeply. - - `small_objects.yaml`: A large array of 2 key-value mappings. - - `strings_array.yaml`: A large array of lipsum one-liners (~150-175 characters in length). - -All generated files are meant to be between 200 and 250 MiB in size. - -This tool depends on external dependencies that are not part of `yaml-rust2`'s dependencies or `dev-dependencies` and as such can't be called through `cargo run` directly. A dedicated `cargo gen_large_yaml` alias can be used to generate the benchmark files. - -## `run_bench` -This is a benchmarking helper that runs the parser on the given file a given number of times and is able to extract simple metrics out of the results. The `--output-yaml` flag can be specified to make the output a YAML file that can be fed into other tools. - -This binary is made to be used by `bench_compare`. - -Synopsis: `run_bench input.yaml [--output-yaml]` - -### Examples -```sh -$> cargo run --release --bin run_bench -- bench_yaml/big.yaml 10 -Average: 1.631936191s -Min: 1.629654651s -Max: 1.633045284s -95%: 1.633045284s - -$> cargo run --release --bin run_bench -- bench_yaml/big.yaml 10 --output-yaml -parser: yaml-rust2 -input: bench_yaml/big.yaml -average: 1649847674 -min: 1648277149 -max: 1651936305 -percentile95: 1651936305 -iterations: 10 -times: - - 1650216129 - - 1649349978 - - 1649507018 - - 1648277149 - - 1649036548 - - 1650323982 - - 1650917692 - - 1648702081 - - 1650209860 - - 1651936305 -``` - -## `time_parse` -This is a benchmarking helper that times how long it takes for the parser to emit all events. It calls the parser on the given input file, receives parsing events and then immediately discards them. It is advised to run this tool with `--release`. - -### Examples -Loading a small file could output the following: -```sh -$> cargo run --release --bin time_parse -- input.yaml -Loaded 0MiB in 14.189µs -``` - -While loading a larger file could output the following: -```sh -$> cargo run --release --bin time_parse -- bench_yaml/big.yaml -Loaded 220MiB in 1.612677853s -``` diff --git a/saphyr/tools/bench_compare/Cargo.toml b/saphyr/tools/bench_compare/Cargo.toml deleted file mode 100644 index 4ca9b33..0000000 --- a/saphyr/tools/bench_compare/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "bench_compare" -version = "0.6.0" -authors = [ - "Ethiraric " -] -license = "MIT OR Apache-2.0" -description = "Run multiple YAML parsers and compare their times" -repository = "https://github.com/Ethiraric/yaml-rust2" -readme = "README.md" -edition = "2018" - -[dependencies] -anyhow = { version = "1.0.81", features = ["backtrace"] } -serde = { version = "1.0.197", features = ["derive"] } -serde_yaml = "0.9.32" -toml = "0.8.11" - -[profile.release-lto] -inherits = "release" -lto = true diff --git a/saphyr/tools/bench_compare/README.md b/saphyr/tools/bench_compare/README.md deleted file mode 100644 index b9e990b..0000000 --- a/saphyr/tools/bench_compare/README.md +++ /dev/null @@ -1,120 +0,0 @@ -# `bench_compare` -This tool helps with comparing times different YAML parsers take to parse the same input. - -## Synopsis -``` -bench_compare time_parse -bench_compare run_bench -``` - -This will run either `time_parse` or `run_bench` (described below) with the given set of parsers from the configuration file. - -## Parsers requirements -Parsers are expected to be event-based. In order to be fair to this crate's benchmark implementation, parsers should: - -* Load the file into memory (a string, `mmap`, ...) **prior** to starting the clock -* Initialize the parser, if needed -* **Start the clock** -* Read events from the parser while the parser has not finished parsing -* Discard events as they are received (dropping them, `free`ing them or anything similar) so as to not grow their memory consumption too high, and allowing the parser to reuse event structures -* **Stop the clock** -* Destroy the resources, if needed/wanted (parser, file buffer, ...). The kernel will reap after the process exits. - - -## Parsers required binaries -This tool recognizes 2 binaries: `time_parse` and `run_bench`. - -### `time_parse` -Synopsis: -``` -time_parse file.yaml [--short] -``` - -The binary must run the aforementioned steps and display on its output the time the parser took to parse the given file. -With the `--short` option, the binary must only output the benchmark time in nanoseconds. - -```sh -# This is meant to be human-readable. -# The example below is what this crate implements. -$> time_parse file.yaml -Loaded 200MiB in 1.74389s. - -# This will be read by this tool. -# This must output ONLY the time, in nanoseconds. -$> time_parse file.yaml --short -1743892394 -``` - -This tool will always provide the `--short` option. - -### `run_bench` -Synopsis: -``` -run_bench file.yaml [--output-yaml] -``` - -The binary is expected to run `` runs of the aforementioned steps and display on its output relevant information. -The `--output-yaml` instructs the binary to output details about its runs in YAML on its standard output. -The binary may optionally perform some warmup runs prior to running the benchmark. The time it took the binary to run will not be evaluated. - -```sh -# This is meant to be human-readable. -# The example below is what this crate implements. -$> run_bench file.yaml 100 -Average: 1.589485s -Min : 1.583078s -Max : 1.597028s -95% : 1.593219s - -# This will be read by this tool. -# This must output a YAML as described below. -$> run_bench ../file.yaml 10 --output-yaml -parser: yaml-rust2 -input: ../file.yaml -average: 1620303590 -min: 1611632108 -max: 1636401896 -percentile95: 1636401896 -iterations: 10 -times: - - 1636401896 - - 1623914538 - - 1611632108 - - 1612973608 - - 1617748930 - - 1615419514 - - 1612172250 - - 1620791346 - - 1629339306 - - 1622642412 -``` - -The expected fields are (all times in nanoseconds): - -* `parser`: The name of the parser (in case of a mistake renaming files) -* `input`: The path to the input file as given to the binary arguments -* `average`: The average time it took to run the parser -* `min`: The shortest time it took to run the parser -* `max`: The longest time it took to run the parser -* `percentile95`: The 95th percentile time of the runs -* `iterations`: The number of times the parser was run (``) -* `times`: An array of `iterations` times, one for each run, in the order they were run (first run first) - -## Configuration -`bench_compare` is configured through a `bench_compare.toml` file. This file must be located in the current directory. -As of now, default values are unsupported and all fields must be set. The following fields are required: -```toml -yaml_input_dir = "bench_yaml" # The path to the directory containing the input yaml files -iterations = 10 # The number of iterations, if using `run_bench` -yaml_output_dir = "yaml_output" # The directory in which `run_bench`'s yamls are saved -csv_output = "benchmark.csv" # The CSV output aggregating times for each parser and file - -[[parsers]] # A parser, can be repeated as many times as there are parsers -name = "yaml-rust2" # The name of the parser (used for logging) -path = "target/release/" # The path in which the parsers' `run_bench` and `time_parse` are - -# If there is another parser, another block can be added -# [[parsers]] -# name = "libfyaml" -# path = "../libfyaml/build" -``` diff --git a/saphyr/tools/bench_compare/src/main.rs b/saphyr/tools/bench_compare/src/main.rs deleted file mode 100644 index ac33f9c..0000000 --- a/saphyr/tools/bench_compare/src/main.rs +++ /dev/null @@ -1,174 +0,0 @@ -use std::{fs::File, io::BufWriter, io::Write, path::Path}; - -use anyhow::Error; -use serde::{Deserialize, Serialize}; - -fn main() { - if let Err(e) = entrypoint() { - eprintln!("{e:?}"); - std::process::exit(1); - } -} - -fn entrypoint() -> Result<(), Error> { - let config: Config = - toml::from_str(&std::fs::read_to_string("bench_compare.toml").unwrap()).unwrap(); - if config.parsers.is_empty() { - println!("Please add at least one parser. Refer to the README for instructions."); - return Ok(()); - } - let args: Vec<_> = std::env::args().collect(); - if args.len() != 2 - || (args.len() == 2 && !["time_parse", "run_bench"].contains(&args[1].as_str())) - { - println!("Usage: bench_compare "); - return Ok(()); - } - match args[1].as_str() { - "run_bench" => run_bench(&config)?, - "time_parse" => unimplemented!(), - _ => unreachable!(), - } - Ok(()) -} - -/// Run the `run_bench` binary on the given parsers. -fn run_bench(config: &Config) -> Result<(), Error> { - // Create output directory - std::fs::create_dir_all(&config.yaml_output_dir)?; - - let inputs = list_input_files(config)?; - let iterations = format!("{}", config.iterations); - let mut averages = vec![]; - - // Inputs are ordered, so are parsers. - for input in &inputs { - let input_basename = Path::new(&input).file_name().unwrap().to_string_lossy(); - let mut input_times = vec![]; - - // Run each input for each parser. - for parser in &config.parsers { - println!("Running {input_basename} against {}", parser.name); - // Run benchmark - let path = Path::new(&parser.path).join("run_bench"); - let output = std::process::Command::new(path) - .arg(input) - .arg(&iterations) - .arg("--output-yaml") - .output()?; - // Check exit status. - if output.status.code().unwrap_or(1) == 0 { - let s = String::from_utf8_lossy(&output.stdout); - // Get output as yaml. - match serde_yaml::from_str::(&s) { - Ok(output) => { - // Push average into our CSV-to-be. - input_times.push(output.average); - // Save the YAML for later. - serde_yaml::to_writer( - BufWriter::new(File::create(format!( - "{}/{}-{}", - config.yaml_output_dir, parser.name, input_basename - ))?), - &output, - )?; - } - Err(e) => { - // Yaml is invalid, use 0 as "didn't run properly". - println!("Errored: Invalid YAML output: {e}"); - input_times.push(0); - } - } - } else { - // An error happened, use 0 as "didn't run properly". - println!("Errored: process did exit non-zero"); - input_times.push(0); - } - } - averages.push(input_times); - } - - // Finally, save a CSV. - save_run_bench_csv(config, &inputs, &averages) -} - -/// General configuration structure. -#[derive(Serialize, Deserialize)] -struct Config { - /// The path to the directory containing the input yaml files. - yaml_input_dir: String, - /// Number of iterations to run, if using `run_bench`. - iterations: u32, - /// The parsers to run. - parsers: Vec, - /// The path to the directory in which `run_bench`'s yamls are saved. - yaml_output_dir: String, - /// The path to the CSV output aggregating times for each parser and file. - csv_output: String, -} - -/// A parser configuration. -#[derive(Serialize, Deserialize)] -struct Parser { - /// The name of the parser. - name: String, - /// The path in which the parser's `run_bench` and `time_parse` are located. - path: String, -} - -/// Ourput of running `run_bench` on a given parser. -#[derive(Serialize, Deserialize)] -struct BenchYamlOutput { - /// The name of the parser. - parser: String, - /// The file taken as input. - input: String, - /// Average parsing time (ns). - average: u64, - /// Shortest parsing time (ns). - min: u64, - /// Longest parsing time (ns). - max: u64, - /// 95th percentile of parsing times (ns). - percentile95: u64, - /// Number of iterations. - iterations: u64, - /// Parsing times for each run. - times: Vec, -} - -/// Save a CSV file with all averages from `run_bench`. -fn save_run_bench_csv( - config: &Config, - inputs: &[String], - averages: &[Vec], -) -> Result<(), Error> { - let mut csv = BufWriter::new(File::create(&config.csv_output)?); - for parser in &config.parsers { - write!(csv, ",{}", parser.name,)?; - } - writeln!(csv)?; - for (path, averages) in inputs.iter().zip(averages.iter()) { - let filename = Path::new(path).file_name().unwrap().to_string_lossy(); - write!(csv, "{}", filename)?; - for avg in averages { - write!(csv, ",{avg}")?; - } - writeln!(csv)?; - } - - Ok(()) -} - -/// Returns the paths to the input yaml files. -fn list_input_files(config: &Config) -> Result, Error> { - Ok(std::fs::read_dir(&config.yaml_input_dir)? - .filter_map(Result::ok) - .map(|entry| entry.path().to_string_lossy().to_string()) - .filter(|path| { - Path::new(path) - .extension() - .map_or(false, |ext| ext.eq_ignore_ascii_case("yaml")) - }) - .collect()) -} diff --git a/saphyr/tools/dump_events.rs b/saphyr/tools/dump_events.rs deleted file mode 100644 index 747e9b9..0000000 --- a/saphyr/tools/dump_events.rs +++ /dev/null @@ -1,38 +0,0 @@ -use std::env; -use std::fs::File; -use std::io::prelude::*; -use yaml_rust2::{ - parser::{MarkedEventReceiver, Parser}, - scanner::Marker, - Event, -}; - -#[derive(Debug)] -struct EventSink { - events: Vec<(Event, Marker)>, -} - -impl MarkedEventReceiver for EventSink { - fn on_event(&mut self, ev: Event, mark: Marker) { - eprintln!(" \x1B[;34m\u{21B3} {:?}\x1B[;m", &ev); - self.events.push((ev, mark)); - } -} - -fn str_to_events(yaml: &str) -> Vec<(Event, Marker)> { - let mut sink = EventSink { events: Vec::new() }; - let mut parser = Parser::new_from_str(yaml); - // Load events using our sink as the receiver. - parser.load(&mut sink, true).unwrap(); - sink.events -} - -fn main() { - let args: Vec<_> = env::args().collect(); - let mut f = File::open(&args[1]).unwrap(); - let mut s = String::new(); - f.read_to_string(&mut s).unwrap(); - - // dbg!(str_to_events(&s)); - str_to_events(&s); -} diff --git a/saphyr/tools/gen_large_yaml/Cargo.toml b/saphyr/tools/gen_large_yaml/Cargo.toml deleted file mode 100644 index d57bdea..0000000 --- a/saphyr/tools/gen_large_yaml/Cargo.toml +++ /dev/null @@ -1,20 +0,0 @@ -[package] -name = "gen_large_yaml" -version = "0.6.0" -authors = [ - "Ethiraric " -] -license = "MIT OR Apache-2.0" -description = "A helper to generate large YAML files" -repository = "https://github.com/Ethiraric/yaml-rust2" -readme = "README.md" -edition = "2018" - -[dependencies] -yaml-rust2 = { path = "../.." } -rand = { version = "0.8.5", features = [ "small_rng" ] } -lipsum = "0.9.0" - -[profile.release-lto] -inherits = "release" -lto = true diff --git a/saphyr/tools/gen_large_yaml/src/gen.rs b/saphyr/tools/gen_large_yaml/src/gen.rs deleted file mode 100644 index 78d16ba..0000000 --- a/saphyr/tools/gen_large_yaml/src/gen.rs +++ /dev/null @@ -1,156 +0,0 @@ -#![allow(clippy::too_many_arguments)] - -use rand::{distributions::Alphanumeric, rngs::SmallRng, Rng}; - -/// Generate a string with hexadecimal digits of the specified length. -pub fn hex_string(rng: &mut SmallRng, len: usize) -> String { - const DIGITS: &[u8] = b"0123456789abcdef"; - string_from_set(rng, len, len + 1, DIGITS) -} - -/// Generate an e-mail address. -pub fn email(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String { - const CHARSET: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_.0123456789"; - format!( - "{}@example.com", - string_from_set(rng, len_lo, len_hi, CHARSET) - ) -} - -/// Generate a random URL. -pub fn url( - rng: &mut SmallRng, - scheme: &str, - n_paths_lo: usize, - n_paths_hi: usize, - path_len_lo: usize, - path_len_hi: usize, - extension: Option<&str>, -) -> String { - let mut string = format!("{scheme}://example.com"); - for _ in 0..rng.gen_range(n_paths_lo..n_paths_hi) { - string.push('/'); - string.push_str(&alnum_string(rng, path_len_lo, path_len_hi)); - } - if let Some(extension) = extension { - string.push('.'); - string.push_str(extension); - } - string -} - -/// Generate a random integer. -pub fn integer(rng: &mut SmallRng, lo: i64, hi: i64) -> i64 { - rng.gen_range(lo..hi) -} - -/// Generate an alphanumeric string with a length between `lo_len` and `hi_len`. -pub fn alnum_string(rng: &mut SmallRng, lo_len: usize, hi_len: usize) -> String { - let len = rng.gen_range(lo_len..hi_len); - rng.sample_iter(&Alphanumeric) - .take(len) - .map(char::from) - .collect() -} - -/// Generate a string with hexadecimal digits of the specified length. -pub fn string_from_set(rng: &mut SmallRng, len_lo: usize, len_hi: usize, set: &[u8]) -> String { - (0..rng.gen_range(len_lo..len_hi)) - .map(|_| set[rng.gen_range(0..set.len())] as char) - .collect() -} - -/// Generate a lipsum paragraph. -pub fn paragraph( - rng: &mut SmallRng, - lines_lo: usize, - lines_hi: usize, - wps_lo: usize, - wps_hi: usize, - line_maxcol: usize, -) -> Vec { - let mut ret = Vec::new(); - let nlines = rng.gen_range(lines_lo..lines_hi); - - while ret.len() < nlines { - let words_in_sentence = rng.gen_range(wps_lo..wps_hi); - let mut sentence = lipsum::lipsum_words_with_rng(rng.clone(), words_in_sentence); - - if let Some(last_line) = ret.pop() { - sentence = format!("{last_line} {sentence}"); - } - - while sentence.len() > line_maxcol { - let last_space_idx = line_maxcol - - sentence[0..line_maxcol] - .chars() - .rev() - .position(char::is_whitespace) - .unwrap(); - ret.push(sentence[0..last_space_idx].to_string()); - sentence = sentence[last_space_idx + 1..].to_string(); - } - if !sentence.is_empty() { - ret.push(sentence); - } - } - - ret -} - -/// Generate a full name. -pub fn full_name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String { - format!( - "{} {}", - name(rng, len_lo, len_hi), - name(rng, len_lo, len_hi) - ) -} - -/// Generate a name. -pub fn name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String { - const UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - const LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz"; - - let len = rng.gen_range(len_lo..len_hi); - let mut ret = String::new(); - ret.push(UPPER[rng.gen_range(0..UPPER.len())] as char); - ret.push_str(string_from_set(rng, len, len + 1, LOWER).as_str()); - - ret -} - -/// Generate a set of words. -pub fn words(rng: &mut SmallRng, words_lo: usize, words_hi: usize) -> String { - let nwords = rng.gen_range(words_lo..words_hi); - lipsum::lipsum_words_with_rng(rng.clone(), nwords).replace(|c| "-\'\",*:".contains(c), "") -} - -/// Generate a lipsum text. -/// -/// Texts are composed of some paragraphs and empty lines between them. -pub fn text( - rng: &mut SmallRng, - paragraphs_lo: usize, - paragraphs_hi: usize, - lines_lo: usize, - lines_hi: usize, - wps_lo: usize, - wps_hi: usize, - line_maxcol: usize, -) -> Vec { - let mut ret = Vec::new(); - let mut first = true; - - for _ in 0..rng.gen_range(paragraphs_lo..paragraphs_hi) { - if first { - first = false; - } else { - ret.push(String::new()); - } - - ret.extend(paragraph(rng, lines_lo, lines_hi, wps_lo, wps_hi, line_maxcol).into_iter()); - } - - ret -} diff --git a/saphyr/tools/gen_large_yaml/src/main.rs b/saphyr/tools/gen_large_yaml/src/main.rs deleted file mode 100644 index b585c59..0000000 --- a/saphyr/tools/gen_large_yaml/src/main.rs +++ /dev/null @@ -1,261 +0,0 @@ -#![allow(dead_code)] - -mod gen; -mod nested; - -use std::fs::File; -use std::io::BufWriter; -use std::path::Path; - -use rand::{rngs::SmallRng, Rng, SeedableRng}; - -/// The path into which the generated YAML files will be written. -const OUTPUT_DIR: &str = "bench_yaml"; - -fn main() -> std::io::Result<()> { - let mut generator = Generator::new(); - let output_path = Path::new(OUTPUT_DIR); - if !output_path.is_dir() { - std::fs::create_dir(output_path).unwrap(); - } - - println!("Generating big.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("big.yaml")).unwrap()); - generator.gen_record_array(&mut out, 100_000, 100_001)?; - - println!("Generating nested.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("nested.yaml")).unwrap()); - nested::create_deep_object(&mut out, 1_100_000)?; - - println!("Generating small_objects.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("small_objects.yaml")).unwrap()); - generator.gen_authors_array(&mut out, 4_000_000, 4_000_001)?; - - println!("Generating strings_array.yaml"); - let mut out = BufWriter::new(File::create(output_path.join("strings_array.yaml")).unwrap()); - generator.gen_strings_array(&mut out, 1_300_000, 1_300_001, 10, 40)?; - Ok(()) -} - -/// YAML Generator. -struct Generator { - /// The RNG state. - /// - /// We don't need to be cryptographically secure. [`SmallRng`] also implements the - /// [`SeedableRng`] trait, allowing runs to be predictable. - rng: SmallRng, - /// The stack of indentations. - indents: Vec, -} - -type GenFn = dyn FnOnce(&mut Generator, &mut W) -> std::io::Result<()>; - -impl Generator { - /// Create a new generator. - fn new() -> Self { - Generator { - rng: SmallRng::seed_from_u64(42), - indents: vec![0], - } - } - - /// Generate an array of records as per [`Self::gen_record_object`]. - fn gen_record_array( - &mut self, - writer: &mut W, - items_lo: usize, - items_hi: usize, - ) -> std::io::Result<()> { - self.gen_array(writer, items_lo, items_hi, Generator::gen_record_object) - } - - /// Generate an array of lipsum one-liners. - fn gen_strings_array( - &mut self, - writer: &mut W, - items_lo: usize, - items_hi: usize, - words_lo: usize, - words_hi: usize, - ) -> std::io::Result<()> { - self.gen_array(writer, items_lo, items_hi, |gen, writer| { - write!(writer, "{}", gen::words(&mut gen.rng, words_lo, words_hi)) - }) - } - - /// Generate a YAML object/mapping containing a record. - /// - /// Fields are description, hash, version, home, repository and pdf. - /// The `description` field is a long string and puts a lot of weight in plain scalar / block - /// scalar parsing. - fn gen_record_object(&mut self, writer: &mut W) -> std::io::Result<()> { - let fields: Vec<(String, Box>)> = vec![ - ( - "description".to_string(), - Box::new(|gen, w| { - write!(w, "|")?; - gen.push_indent(2); - gen.nl(w)?; - let indent = gen.indent(); - let text = gen::text(&mut gen.rng, 1, 9, 3, 8, 10, 20, 80 - indent); - gen.write_lines(w, &text)?; - gen.pop_indent(); - Ok(()) - }), - ), - ( - "authors".to_string(), - Box::new(|gen, w| { - gen.push_indent(2); - gen.nl(w)?; - gen.gen_authors_array(w, 1, 10)?; - gen.pop_indent(); - Ok(()) - }), - ), - ( - "hash".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::hex_string(&mut gen.rng, 64))), - ), - ( - "version".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::integer(&mut gen.rng, 1, 9))), - ), - ( - "home".to_string(), - Box::new(|gen, w| { - write!(w, "{}", gen::url(&mut gen.rng, "https", 0, 1, 0, 0, None)) - }), - ), - ( - "repository".to_string(), - Box::new(|gen, w| { - write!(w, "{}", gen::url(&mut gen.rng, "git", 1, 4, 10, 20, None)) - }), - ), - ( - "pdf".to_string(), - Box::new(|gen, w| { - write!( - w, - "{}", - gen::url(&mut gen.rng, "https", 1, 4, 10, 30, Some("pdf")) - ) - }), - ), - ]; - self.gen_object(writer, fields) - } - - /// Generate an array of authors as per [`Self::gen_author_object`]. - fn gen_authors_array( - &mut self, - writer: &mut W, - items_lo: usize, - items_hi: usize, - ) -> std::io::Result<()> { - self.gen_array(writer, items_lo, items_hi, Generator::gen_author_object) - } - - /// Generate a small object with 2 string fields. - fn gen_author_object(&mut self, writer: &mut W) -> std::io::Result<()> { - let fields: Vec<(String, Box>)> = vec![ - ( - "name".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::full_name(&mut gen.rng, 10, 15))), - ), - ( - "email".to_string(), - Box::new(|gen, w| write!(w, "{}", gen::email(&mut gen.rng, 1, 9))), - ), - ]; - self.gen_object(writer, fields) - } - - /// Generate a YAML array/sequence containing nodes generated by the given function. - fn gen_array std::io::Result<()>>( - &mut self, - writer: &mut W, - len_lo: usize, - len_hi: usize, - mut obj_creator: F, - ) -> std::io::Result<()> { - let mut first = true; - for _ in 0..self.rng.gen_range(len_lo..len_hi) { - if first { - first = false; - } else { - self.nl(writer)?; - } - write!(writer, "- ")?; - self.push_indent(2); - (obj_creator)(self, writer)?; - self.pop_indent(); - } - Ok(()) - } - - /// Create a Yaml object with some fields in it. - fn gen_object( - &mut self, - writer: &mut W, - fields: Vec<(String, Box>)>, - ) -> std::io::Result<()> { - let mut first = true; - for (key, f) in fields { - if first { - first = false; - } else { - self.nl(writer)?; - } - write!(writer, "{key}: ")?; - f(self, writer)?; - } - Ok(()) - } - - /// Write the given lines at the right indentation. - fn write_lines( - &mut self, - writer: &mut W, - lines: &[String], - ) -> std::io::Result<()> { - let mut first = true; - - for line in lines { - if first { - first = false; - } else { - self.nl(writer)?; - } - write!(writer, "{line}")?; - } - - Ok(()) - } - - /// Write a new line to the writer and indent. - fn nl(&mut self, writer: &mut W) -> std::io::Result<()> { - writeln!(writer)?; - for _ in 0..self.indent() { - write!(writer, " ")?; - } - Ok(()) - } - - /// Return the given indent. - fn indent(&self) -> usize { - *self.indents.last().unwrap() - } - - /// Push a new indent with the given relative offset. - fn push_indent(&mut self, offset: usize) { - self.indents.push(self.indent() + offset); - } - - /// Pops the last indent. - fn pop_indent(&mut self) { - self.indents.pop(); - assert!(!self.indents.is_empty()); - } -} diff --git a/saphyr/tools/gen_large_yaml/src/nested.rs b/saphyr/tools/gen_large_yaml/src/nested.rs deleted file mode 100644 index 0f182a9..0000000 --- a/saphyr/tools/gen_large_yaml/src/nested.rs +++ /dev/null @@ -1,115 +0,0 @@ -use std::{cell::RefCell, rc::Rc}; - -use rand::{rngs::SmallRng, Rng, SeedableRng}; - -/// Create a deep object with the given amount of nodes. -pub fn create_deep_object( - writer: &mut W, - n_nodes: usize, -) -> std::io::Result<()> { - let mut tree = Tree::new(); - for _ in 0..n_nodes { - tree.push_node(); - } - tree.write_to(writer) -} - -/// An n-tree. -/// -/// The algorithm used to generate a potentially deep object is to create a tree, one node at a -/// time, where each node is put as a child of a random existing node in the tree. -struct Tree { - /// The tree-view of the tree. - root: Rc>, - /// Array of all the nodes in the tree, including the root node. - nodes: Vec>>, - /// The RNG state. - /// - /// We don't need to be cryptographically secure. [`SmallRng`] also implements the - /// [`SeedableRng`] trait, allowing runs to be predictable. - rng: SmallRng, -} - -/// A node in a tree. -struct Node { - /// All the children of the node. - children: Vec>>, -} - -impl Tree { - /// Create a new tree. - fn new() -> Self { - let root = Node::new_rc_refcell(); - Tree { - root: root.clone(), - nodes: vec![root], - rng: SmallRng::seed_from_u64(42), - } - } - - /// Add a new node as a child of a random node in the tree. - fn push_node(&mut self) { - let new_node = Node::new_rc_refcell(); - let n_nodes = self.nodes.len(); - // Bias the nodes towards the end so that there is more nesting. - let parent = &mut self.nodes[self.rng.gen_range((3 * n_nodes / 4)..n_nodes)]; - (**parent).borrow_mut().push_child(new_node.clone()); - self.nodes.push(new_node); - } - - /// Write the YAML representation of the tree to `writer`. - fn write_to(&self, writer: &mut W) -> std::io::Result<()> { - (*self.root).borrow().write_to(writer, 0) - } -} - -impl Node { - /// Create a new node. - fn new() -> Self { - Node { children: vec![] } - } - - fn new_rc_refcell() -> Rc> { - Rc::new(RefCell::new(Self::new())) - } - - /// Append a child to the node. - fn push_child(&mut self, child: Rc>) { - self.children.push(child); - } - - /// Write the YAML representation of the node to `writer`. - fn write_to(&self, writer: &mut W, indent: usize) -> std::io::Result<()> { - if self.children.is_empty() { - write_n(writer, ' ', indent)?; - writer.write_all(b"a: 1\n")?; - } else { - for (n, child) in self.children.iter().enumerate() { - write_n(writer, ' ', indent)?; - write_id_for_number(writer, n)?; - writer.write_all(b":\n")?; - (**child).borrow().write_to(writer, indent + 2)?; - } - } - Ok(()) - } -} - -/// Write `n` times `c` to `out`. -fn write_n(out: &mut W, c: char, n: usize) -> std::io::Result<()> { - for _ in 0..n { - write!(out, "{c}")?; - } - Ok(()) -} - -/// Create a valid identifier for the given number. -fn write_id_for_number(out: &mut W, mut n: usize) -> std::io::Result<()> { - const DIGITS: &[u8] = b"_abcdefghijklmnopqrstuvwxyz"; - n += 1; - while n > 0 { - write!(out, "{}", DIGITS[n % DIGITS.len()] as char)?; - n /= DIGITS.len(); - } - Ok(()) -} diff --git a/saphyr/tools/run_bench.rs b/saphyr/tools/run_bench.rs deleted file mode 100644 index 795f7bc..0000000 --- a/saphyr/tools/run_bench.rs +++ /dev/null @@ -1,71 +0,0 @@ -#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] - -use std::{env, fs::File, io::prelude::*}; -use yaml_rust2::{ - parser::{MarkedEventReceiver, Parser}, - scanner::Marker, - Event, -}; - -/// A sink which discards any event sent. -struct NullSink {} - -impl MarkedEventReceiver for NullSink { - fn on_event(&mut self, _: Event, _: Marker) {} -} - -/// Parse the given input, returning elapsed time in nanoseconds. -fn do_parse(input: &str) -> u64 { - let mut sink = NullSink {}; - let mut parser = Parser::new_from_str(input); - let begin = std::time::Instant::now(); - parser.load(&mut sink, true).unwrap(); - let end = std::time::Instant::now(); - (end - begin).as_nanos() as u64 -} - -fn main() { - let args: Vec<_> = env::args().collect(); - let iterations: u64 = args[2].parse().unwrap(); - let output_yaml = args.len() == 4 && args[3] == "--output-yaml"; - let mut f = File::open(&args[1]).unwrap(); - let mut s = String::new(); - f.read_to_string(&mut s).unwrap(); - - // Warmup - do_parse(&s); - do_parse(&s); - do_parse(&s); - - // Bench - let times: Vec<_> = (0..iterations).map(|_| do_parse(&s)).collect(); - - let mut sorted_times = times.clone(); - sorted_times.sort_unstable(); - - // Compute relevant metrics. - let sum: u64 = times.iter().sum(); - let avg = sum / iterations; - let min = sorted_times[0]; - let max = sorted_times[(iterations - 1) as usize]; - let percentile95 = sorted_times[((95 * iterations) / 100) as usize]; - - if output_yaml { - println!("parser: yaml-rust2"); - println!("input: {}", args[1]); - println!("average: {avg}"); - println!("min: {min}"); - println!("max: {max}"); - println!("percentile95: {percentile95}"); - println!("iterations: {iterations}"); - println!("times:"); - for time in × { - println!(" - {time}"); - } - } else { - println!("Average: {}s", (avg as f64) / 1_000_000_000.0); - println!("Min: {}s", (min as f64) / 1_000_000_000.0); - println!("Max: {}s", (max as f64) / 1_000_000_000.0); - println!("95%: {}s", (percentile95 as f64) / 1_000_000_000.0); - } -} diff --git a/saphyr/tools/time_parse.rs b/saphyr/tools/time_parse.rs deleted file mode 100644 index 1555dde..0000000 --- a/saphyr/tools/time_parse.rs +++ /dev/null @@ -1,36 +0,0 @@ -use std::env; -use std::fs::File; -use std::io::prelude::*; -use yaml_rust2::{ - parser::{MarkedEventReceiver, Parser}, - scanner::Marker, - Event, -}; - -/// A sink which discards any event sent. -struct NullSink {} - -impl MarkedEventReceiver for NullSink { - fn on_event(&mut self, _: Event, _: Marker) {} -} - -fn main() { - let args: Vec<_> = env::args().collect(); - let mut f = File::open(&args[1]).unwrap(); - let mut s = String::new(); - f.read_to_string(&mut s).unwrap(); - - let mut sink = NullSink {}; - let mut parser = Parser::new_from_str(&s); - - // Load events using our sink as the receiver. - let begin = std::time::Instant::now(); - parser.load(&mut sink, true).unwrap(); - let end = std::time::Instant::now(); - - if args.len() == 3 && args[2] == "--short" { - println!("{}", (end - begin).as_nanos()); - } else { - println!("Loaded {}MiB in {:?}", s.len() / 1024 / 1024, end - begin); - } -}