From 750c9921216ae1196489e3cc120b9ad69140b24f Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Tue, 2 Jul 2024 01:51:07 +0200 Subject: [PATCH] Add support for nested implicit flow mappings. These are corner cases not tested by the `yaml-test-suite`. Parsing for the reported input has been fixed, as well as other similar-looking inputs which make use of nested implicit flow mappings and implicit null keys and values. Fixes #1. --- parser/src/scanner.rs | 126 ++++++++++++++++++++++++++----- parser/tests/issues.rs | 168 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 277 insertions(+), 17 deletions(-) create mode 100644 parser/tests/issues.rs diff --git a/parser/src/scanner.rs b/parser/src/scanner.rs index 7d9fd28..5bae1ae 100644 --- a/parser/src/scanner.rs +++ b/parser/src/scanner.rs @@ -309,6 +309,40 @@ struct Indent { needs_block_end: bool, } +/// The knowledge we have about an implicit mapping. +/// +/// Implicit mappings occur in flow sequences where the opening `{` for a mapping in a flow +/// sequence is omitted: +/// ```yaml +/// [ a: b, c: d ] +/// # Equivalent to +/// [ { a: b }, { c: d } ] +/// # Equivalent to +/// - a: b +/// - c: d +/// ``` +/// +/// The state must be carefully tracked for each nested flow sequence since we must emit a +/// [`FlowMappingStart`] event when encountering `a` and `c` in our previous example without a +/// character hinting us. Similarly, we must emit a [`FlowMappingEnd`] event when we reach the `,` +/// or the `]`. If the state is not properly tracked, we may omit to emit these events or emit them +/// out-of-order. +/// +/// [`FlowMappingStart`]: TokenType::FlowMappingStart +/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd +#[derive(Debug, PartialEq)] +enum ImplicitMappingState { + /// It is possible there is an implicit mapping. + /// + /// This state is the one when we have just encountered the opening `[`. We need more context + /// to know whether an implicit mapping follows. + Possible, + /// We are inside the implcit mapping. + /// + /// Note that this state is not set immediately (we need to have encountered the `:` to know). + Inside, +} + /// The size of the [`Scanner`] buffer. /// /// The buffer is statically allocated to avoid conditions for reallocations each time we @@ -386,8 +420,19 @@ pub struct Scanner { /// [ : foo ] # { null: "foo" } /// ``` flow_mapping_started: bool, - /// Whether we currently are in an implicit flow mapping. - implicit_flow_mapping: bool, + /// An array of states, representing whether flow sequences have implicit mappings. + /// + /// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence), + /// the state is set to [`Possible`]. + /// When we encounter the `:`, we know we are in an implicit mapping and can set the state to + /// [`Inside`]. + /// + /// There is one entry in this [`Vec`] for each nested flow sequence that we are in. + /// The entries are created with the opening `]` and popped with the closing `]`. + /// + /// [`Possible`]: ImplicitMappingState::Possible + /// [`Inside`]: ImplicitMappingState::Inside + implicit_flow_mapping_states: Vec, } impl> Iterator for Scanner { @@ -439,7 +484,7 @@ impl> Scanner { token_available: false, leading_whitespace: true, flow_mapping_started: false, - implicit_flow_mapping: false, + implicit_flow_mapping_states: vec![], } } @@ -705,11 +750,11 @@ impl> Scanner { ',' => self.fetch_flow_entry(), '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(), '?' if is_blank_or_breakz(nc) => self.fetch_key(), - ':' if is_blank_or_breakz(nc) - || (self.flow_level > 0 - && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) => + ':' if is_blank_or_breakz(nc) => self.fetch_value(), + ':' if self.flow_level > 0 + && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at) => { - self.fetch_value() + self.fetch_flow_value() } // Is it an alias? '*' => self.fetch_anchor(true), @@ -1404,6 +1449,9 @@ impl> Scanner { if tok == TokenType::FlowMappingStart { self.flow_mapping_started = true; + } else { + self.implicit_flow_mapping_states + .push(ImplicitMappingState::Possible); } self.skip_ws_to_eol(SkipTabs::Yes)?; @@ -1418,7 +1466,11 @@ impl> Scanner { self.disallow_simple_key(); - self.end_implicit_mapping(self.mark); + if matches!(tok, TokenType::FlowSequenceEnd) { + self.end_implicit_mapping(self.mark); + // We are out exiting the flow sequence, nesting goes down 1 level. + self.implicit_flow_mapping_states.pop(); + } let start_mark = self.mark; self.skip_non_blank(); @@ -2292,7 +2344,7 @@ impl> Scanner { start_mark, ); } else { - // The parser, upon receiving a `Key`, will insert a `MappingStart` event. + // The scanner, upon emitting a `Key`, will prepend a `MappingStart` event. self.flow_mapping_started = true; } @@ -2316,11 +2368,45 @@ impl> Scanner { Ok(()) } + /// Fetch a value in a mapping inside of a flow collection. + /// + /// This must not be called if [`self.flow_level`] is 0. This ensures the rules surrounding + /// values in flow collections are respected prior to calling [`fetch_value`]. + /// + /// [`self.flow_level`]: Self::flow_level + /// [`fetch_value`]: Self::fetch_value + fn fetch_flow_value(&mut self) -> ScanResult { + let nc = self.buffer[1]; + + // If we encounter a ':' inside a flow collection and it is not immediately + // followed by a blank or breakz: + // - We must check whether an adjacent value is allowed + // `["a":[]]` is valid. If the key is double-quoted, no need for a space. This + // is needed for JSON compatibility. + // - If not, we must ensure there is a space after the ':' and before its value. + // `[a: []]` is valid while `[a:[]]` isn't. `[a:b]` is treated as `["a:b"]`. + // - But if the value is empty (null), then it's okay. + // The last line is for YAMLs like `[a:]`. The ':' is followed by a ']' (which is a + // flow character), but the ']' is not the value. The value is an invisible empty + // space which is represented as null ('~'). + if self.mark.index != self.adjacent_value_allowed_at && (nc == '[' || nc == '{') { + return Err(ScanError::new_str( + self.mark, + "':' may not precede any of `[{` in flow mapping", + )); + } + + self.fetch_value() + } + /// Fetch a value from a mapping (after a `:`). fn fetch_value(&mut self) -> ScanResult { let sk = self.simple_keys.last().unwrap().clone(); let start_mark = self.mark; - self.implicit_flow_mapping = self.flow_level > 0 && !self.flow_mapping_started; + let is_implicit_flow_mapping = self.flow_level > 0 && !self.flow_mapping_started; + if is_implicit_flow_mapping { + *self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Inside; + } // Skip over ':'. self.skip_non_blank(); @@ -2338,7 +2424,7 @@ impl> Scanner { // insert simple key let tok = Token(sk.mark, TokenType::Key); self.insert_token(sk.token_number - self.tokens_parsed, tok); - if self.implicit_flow_mapping { + if is_implicit_flow_mapping { if sk.mark.line < start_mark.line { return Err(ScanError::new_str( start_mark, @@ -2363,7 +2449,7 @@ impl> Scanner { self.simple_keys.last_mut().unwrap().possible = false; self.disallow_simple_key(); } else { - if self.implicit_flow_mapping { + if is_implicit_flow_mapping { self.tokens .push_back(Token(self.mark, TokenType::FlowMappingStart)); } @@ -2527,12 +2613,18 @@ impl> Scanner { } /// If an implicit mapping had started, end it. + /// + /// This function does not pop the state in [`implicit_flow_mapping_states`]. + /// + /// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states fn end_implicit_mapping(&mut self, mark: Marker) { - if self.implicit_flow_mapping { - self.implicit_flow_mapping = false; - self.flow_mapping_started = false; - self.tokens - .push_back(Token(mark, TokenType::FlowMappingEnd)); + if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() { + if *implicit_mapping == ImplicitMappingState::Inside { + self.flow_mapping_started = false; + *implicit_mapping = ImplicitMappingState::Possible; + self.tokens + .push_back(Token(mark, TokenType::FlowMappingEnd)); + } } } } diff --git a/parser/tests/issues.rs b/parser/tests/issues.rs new file mode 100644 index 0000000..ce7c6df --- /dev/null +++ b/parser/tests/issues.rs @@ -0,0 +1,168 @@ +use saphyr_parser::{Event, Parser, ScanError, TScalarStyle}; + +/// Run the parser through the string. +/// +/// # Returns +/// This functions returns the events if parsing succeeds, the error the parser returned otherwise. +fn run_parser(input: &str) -> Result, ScanError> { + let mut events = vec![]; + for x in Parser::new_from_str(input) { + events.push(x?.0); + } + Ok(events) +} + +#[test] +#[allow(clippy::too_many_lines)] +fn test_issue1() { + // https://github.com/saphyr-rs/saphyr-parser/issues/1 + + // Implicit mappings with a nested flow sequence prematurely end the mapping. + // + // [a: [42]] + // ^ This closing sequence character + // ^ is interpreted as closing this sequence, where the implicit mapping starts. + // + // The current implementation does not allow for nested implicit mapping: + // + // [a: [b: c]] + // ^ this implicit mapping would be ignored + let reference = r" +- a: + - 42 +"; + + let expected = [ + Event::StreamStart, + Event::DocumentStart, + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("a".to_string(), TScalarStyle::Plain, 0, None), + Event::SequenceStart(0, None), + Event::Scalar("42".to_string(), TScalarStyle::Plain, 0, None), + Event::SequenceEnd, + Event::MappingEnd, + Event::SequenceEnd, + Event::DocumentEnd, + Event::StreamEnd, + ]; + assert_eq!(run_parser(reference).unwrap(), expected); + assert_eq!(run_parser("[{a: [42]}]").unwrap(), expected); + assert_eq!(run_parser("[a: [42]]").unwrap(), expected); + + // Other test cases derived from the bug + + // Implicit mapping in a complex key. + assert_eq!( + run_parser("[foo: [bar]]: baz").unwrap(), + [ + Event::StreamStart, + Event::DocumentStart, + Event::MappingStart(0, None), + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("foo".to_string(), TScalarStyle::Plain, 0, None), + Event::SequenceStart(0, None), + Event::Scalar("bar".to_string(), TScalarStyle::Plain, 0, None), + Event::SequenceEnd, + Event::MappingEnd, + Event::SequenceEnd, + Event::Scalar("baz".to_string(), TScalarStyle::Plain, 0, None), + Event::MappingEnd, + Event::DocumentEnd, + Event::StreamEnd, + ] + ); + + // Implicit mappings with implicit null keys + assert_eq!( + run_parser("[:]").unwrap(), + [ + Event::StreamStart, + Event::DocumentStart, + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("~".to_string(), TScalarStyle::Plain, 0, None), + Event::Scalar("~".to_string(), TScalarStyle::Plain, 0, None), + Event::MappingEnd, + Event::SequenceEnd, + Event::DocumentEnd, + Event::StreamEnd, + ] + ); + + // Nested implicit mappings with implicit null keys + assert_eq!( + run_parser("[: [:]]").unwrap(), + [ + Event::StreamStart, + Event::DocumentStart, + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("~".to_string(), TScalarStyle::Plain, 0, None), + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("~".to_string(), TScalarStyle::Plain, 0, None), + Event::Scalar("~".to_string(), TScalarStyle::Plain, 0, None), + Event::MappingEnd, + Event::SequenceEnd, + Event::MappingEnd, + Event::SequenceEnd, + Event::DocumentEnd, + Event::StreamEnd, + ] + ); + + // Interleaved nested implicit and non-implicit mappings. + assert_eq!( + run_parser("[a: [ [b:]]]").unwrap(), + // ^ ^ ^ has an implicit mapping + // | ` has no implicit mapping + // ` has an implicit mapping + // We must make sure that the `MappingEnd` events are correctly issued for the first and + // third nested sequences, but not the second. + [ + Event::StreamStart, + Event::DocumentStart, + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("a".to_string(), TScalarStyle::Plain, 0, None), + Event::SequenceStart(0, None), + // No `MappingStart` here. + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("b".to_string(), TScalarStyle::Plain, 0, None), + Event::Scalar("~".to_string(), TScalarStyle::Plain, 0, None), + Event::MappingEnd, + Event::SequenceEnd, + // No `MappingEnd` here. + Event::SequenceEnd, + Event::MappingEnd, + Event::SequenceEnd, + Event::DocumentEnd, + Event::StreamEnd, + ] + ); + + // There needs to be a space between a `:` in a flow sequence and the value. + assert!(run_parser("[:[:]]").is_err()); + assert!(run_parser("[a:[42]]").is_err()); + + // Double-quoted keys may have a value without a space for JSON-compatibility. + assert_eq!( + run_parser(r#"["a":[]]"#).unwrap(), + [ + Event::StreamStart, + Event::DocumentStart, + Event::SequenceStart(0, None), + Event::MappingStart(0, None), + Event::Scalar("a".to_string(), TScalarStyle::DoubleQuoted, 0, None), + Event::SequenceStart(0, None), + Event::SequenceEnd, + Event::MappingEnd, + Event::SequenceEnd, + Event::DocumentEnd, + Event::StreamEnd, + ] + ); +}