saphyr-serde/parser/src/parser.rs
2024-06-13 22:05:43 +02:00

1196 lines
41 KiB
Rust

//! Home to the YAML Parser.
//!
//! The parser takes input from the [`crate::scanner::Scanner`], performs final checks for YAML
//! compliance, and emits a stream of YAML events. This stream can for instance be used to create
//! YAML objects.
use crate::scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType};
use std::collections::HashMap;
/// The states of the parser's state machine.
///
/// `Parser::state_machine` dispatches on the current state to the handler noted on each variant.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
enum State {
/// Initial state. Handled by `stream_start`.
StreamStart,
/// Handled by `document_start(true)`.
ImplicitDocumentStart,
/// Handled by `document_start(false)`.
DocumentStart,
/// Handled by `document_content`.
DocumentContent,
/// Handled by `document_end`.
DocumentEnd,
/// Handled by `parse_node(true, false)`.
BlockNode,
/// Handled by `block_sequence_entry(true)`.
BlockSequenceFirstEntry,
/// Handled by `block_sequence_entry(false)`.
BlockSequenceEntry,
/// Handled by `indentless_sequence_entry`.
IndentlessSequenceEntry,
/// Handled by `block_mapping_key(true)`.
BlockMappingFirstKey,
/// Handled by `block_mapping_key(false)`.
BlockMappingKey,
/// Handled by `block_mapping_value`.
BlockMappingValue,
/// Handled by `flow_sequence_entry(true)`.
FlowSequenceFirstEntry,
/// Handled by `flow_sequence_entry(false)`.
FlowSequenceEntry,
/// Handled by `flow_sequence_entry_mapping_key`.
FlowSequenceEntryMappingKey,
/// Handled by `flow_sequence_entry_mapping_value`.
FlowSequenceEntryMappingValue,
/// Handled by `flow_sequence_entry_mapping_end`.
FlowSequenceEntryMappingEnd,
/// Handled by `flow_mapping_key(true)`.
FlowMappingFirstKey,
/// Handled by `flow_mapping_key(false)`.
FlowMappingKey,
/// Handled by `flow_mapping_value(false)`.
FlowMappingValue,
/// Handled by `flow_mapping_value(true)`.
FlowMappingEmptyValue,
/// Terminal state, entered once the stream-end token has been consumed. `Parser::parse` returns
/// [`Event::StreamEnd`] for it without running the state machine.
End,
}
/// An event generated by the YAML parser.
///
/// Events are used in the low-level event-based API (push parser). The API entrypoint is the
/// [`EventReceiver`] trait.
///
/// Anchor IDs carried by events are assigned by the parser starting from 1; the parser uses 0
/// for nodes that have no anchor.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum Event {
/// Reserved for internal use.
Nothing,
/// Event generated at the very beginning of parsing.
StreamStart,
/// Last event that will be generated by the parser. Signals EOF.
StreamEnd,
/// The YAML start document directive (`---`).
DocumentStart,
/// The YAML end document directive (`...`).
DocumentEnd,
/// A YAML Alias.
Alias(
/// The anchor ID the alias refers to.
usize,
),
/// A YAML scalar.
///
/// The fields are, in order: the value, the scalar style, the anchor ID and the optional tag.
Scalar(String, TScalarStyle, usize, Option<Tag>),
/// The start of a YAML sequence (array).
SequenceStart(
/// The anchor ID of the start of the sequence.
usize,
/// An optional tag
Option<Tag>,
),
/// The end of a YAML sequence (array).
SequenceEnd,
/// The start of a YAML mapping (object, hash).
MappingStart(
/// The anchor ID of the start of the mapping.
usize,
/// An optional tag
Option<Tag>,
),
/// The end of a YAML mapping (object, hash).
MappingEnd,
}
/// A YAML tag.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Tag {
/// Handle of the tag.
///
/// For tags produced by the parser, this holds the prefix the handle resolved to when a
/// matching `%TAG` directive (or the default for `!!`, `tag:yaml.org,2002:`) applies, and the
/// handle as written in the source otherwise.
pub handle: String,
/// The suffix of the tag.
pub suffix: String,
}
impl Event {
    /// Build the event emitted for an absent node.
    ///
    /// The result is a plain scalar whose value is `~` (null), with no anchor (ID 0) and no tag.
    fn empty_scalar() -> Event {
        let null_value = String::from("~");
        Event::Scalar(null_value, TScalarStyle::Plain, 0, None)
    }

    /// Build the event emitted for a node that has properties but no content.
    ///
    /// The result is a plain scalar with an empty value carrying the given `anchor` ID and `tag`.
    fn empty_scalar_with_anchor(anchor: usize, tag: Option<Tag>) -> Event {
        let empty_value = String::new();
        Event::Scalar(empty_value, TScalarStyle::Plain, anchor, tag)
    }
}
/// A YAML parser.
#[derive(Debug)]
pub struct Parser<T> {
/// The underlying scanner from which we pull tokens.
scanner: Scanner<T>,
/// The stack of _previous_ states we were in.
///
/// States are pushed in the context of subobjects to this stack. The top-most element is the
/// state in which to come back to when exiting the current state.
states: Vec<State>,
/// The state in which we currently are.
state: State,
/// The next token from the scanner.
///
/// Filled by `peek_token` and emptied by `fetch_token` / `skip`.
token: Option<Token>,
/// The next YAML event to emit.
///
/// Filled by [`Self::peek`] and drained by the next call that retrieves an event.
current: Option<(Event, Marker)>,
/// Anchors that have been encountered in the YAML document.
anchors: HashMap<String, usize>,
/// Next ID available for an anchor.
///
/// Every anchor is given a unique ID. We use an incrementing ID and this is both the ID to
/// return for the next anchor and the count of anchor IDs emitted.
anchor_id_count: usize,
/// The tag directives (`%TAG`) the parser has encountered.
///
/// Key is the handle, and value is the prefix.
tags: HashMap<String, String>,
/// Whether we have emitted [`Event::StreamEnd`].
///
/// Emitted means that it has been returned from [`Self::next_event`]. If the event has only
/// been buffered in `current` by [`Self::peek`], this is still `false`.
stream_end_emitted: bool,
/// Make tags global across all documents.
keep_tags: bool,
}
/// Trait to be implemented in order to use the low-level parsing API.
///
/// The low-level parsing API is event-based (a push parser), calling [`EventReceiver::on_event`]
/// for each YAML [`Event`] that occurs.
/// The [`EventReceiver`] trait only receives events. In order to receive both events and their
/// location in the source, use [`MarkedEventReceiver`]. Note that [`EventReceiver`]s implement
/// [`MarkedEventReceiver`] automatically.
///
/// # Event hierarchy
/// The event stream starts with an [`Event::StreamStart`] event followed by an
/// [`Event::DocumentStart`] event. If the YAML document starts with a mapping (an object), an
/// [`Event::MappingStart`] event is emitted. If it starts with a sequence (an array), an
/// [`Event::SequenceStart`] event is emitted. Otherwise, an [`Event::Scalar`] event is emitted.
///
/// In a mapping, key-values are sent as consecutive events. The first event after an
/// [`Event::MappingStart`] will be the key, and the following one its value. If the mapping
/// contains no sub-mapping or sub-sequence, then even events (starting from 0) will always be
/// keys and odd ones will always be values. The mapping ends when an [`Event::MappingEnd`] event
/// is received.
///
/// In a sequence, values are sent consecutively until the [`Event::SequenceEnd`] event.
///
/// If a value is a sub-mapping or a sub-sequence, an [`Event::MappingStart`] or
/// [`Event::SequenceStart`] event will be sent respectively. Following events until the associated
/// [`Event::MappingEnd`] or [`Event::SequenceEnd`] (beware of nested mappings or sequences) will
/// be part of the value and not another key-value pair or element in the sequence.
///
/// For instance, the following yaml:
/// ```yaml
/// a: b
/// c:
///   d: e
/// f:
///   - g
///   - h
/// ```
/// will emit (indented and commented for readability):
/// ```text
/// StreamStart, DocumentStart, MappingStart,
///   Scalar("a", ..), Scalar("b", ..)
///   Scalar("c", ..), MappingStart, Scalar("d", ..), Scalar("e", ..), MappingEnd,
///   Scalar("f", ..), SequenceStart, Scalar("g", ..), Scalar("h", ..), SequenceEnd,
/// MappingEnd, DocumentEnd, StreamEnd
/// ```
///
/// # Example
/// ```
/// # use saphyr_parser::{Event, EventReceiver, Parser};
/// #
/// /// Sink of events. Collects them into an array.
/// struct EventSink {
///     events: Vec<Event>,
/// }
///
/// /// Implement `on_event`, pushing into `self.events`.
/// impl EventReceiver for EventSink {
///     fn on_event(&mut self, ev: Event) {
///         self.events.push(ev);
///     }
/// }
///
/// /// Load events from a yaml string.
/// fn str_to_events(yaml: &str) -> Vec<Event> {
///     let mut sink = EventSink { events: Vec::new() };
///     let mut parser = Parser::new_from_str(yaml);
///     // Load events using our sink as the receiver.
///     parser.load(&mut sink, true).unwrap();
///     sink.events
/// }
/// ```
pub trait EventReceiver {
/// Handler called for each YAML event that is emitted by the parser.
fn on_event(&mut self, ev: Event);
}
/// Trait to be implemented for using the low-level parsing API.
///
/// Functionally similar to [`EventReceiver`], but receives a [`Marker`] as well as the event.
/// Every [`EventReceiver`] gets an implementation of this trait that drops the marker.
pub trait MarkedEventReceiver {
/// Handler called for each event that occurs.
///
/// `ev` is the event and `_mark` the [`Marker`] the parser associated with it.
fn on_event(&mut self, ev: Event, _mark: Marker);
}
impl<R: EventReceiver> MarkedEventReceiver for R {
fn on_event(&mut self, ev: Event, _mark: Marker) {
self.on_event(ev);
}
}
/// A convenience alias for a `Result` of a parser event.
///
/// On success it carries the [`Event`] together with the [`Marker`] associated with it; on
/// failure it carries the [`ScanError`] describing the problem.
pub type ParseResult = Result<(Event, Marker), ScanError>;
impl<'a> Parser<core::str::Chars<'a>> {
/// Create a new instance of a parser from a &str.
#[must_use]
pub fn new_from_str(value: &'a str) -> Self {
Parser::new(value.chars())
}
}
impl<T: Iterator<Item = char>> Parser<T> {
/// Create a new instance of a parser from the given input of characters.
pub fn new(src: T) -> Parser<T> {
Parser {
scanner: Scanner::new(src),
states: Vec::new(),
state: State::StreamStart,
token: None,
current: None,
anchors: HashMap::new(),
// valid anchor_id starts from 1
anchor_id_count: 1,
tags: HashMap::new(),
stream_end_emitted: false,
keep_tags: false,
}
}
/// Whether to keep tags across multiple documents when parsing.
///
/// This behavior is non-standard as per the YAML specification but can be encountered in the
/// wild. This boolean allows enabling this non-standard extension. This would result in the
/// parser accepting input from [test
/// QLJ7](https://github.com/yaml/yaml-test-suite/blob/ccfa74e56afb53da960847ff6e6976c0a0825709/src/QLJ7.yaml)
/// of the yaml-test-suite:
///
/// ```yaml
/// %TAG !prefix! tag:example.com,2011:
/// --- !prefix!A
/// a: b
/// --- !prefix!B
/// c: d
/// --- !prefix!C
/// e: f
/// ```
///
/// With `keep_tags` set to `false`, the above YAML is rejected. As per the specification, tags
/// only apply to the document immediately following them. This would error on `!prefix!B`.
///
/// With `keep_tags` set to `true`, the above YAML is accepted by the parser.
#[must_use]
pub fn keep_tags(mut self, value: bool) -> Self {
self.keep_tags = value;
self
}
/// Try to load the next event and return a reference to it without consuming it.
///
/// The event is buffered: any subsequent call to [`Parser::peek`] returns the same value,
/// until a call to [`Iterator::next`] or [`Parser::load`] takes it. Returns `None` once
/// [`Event::StreamEnd`] has been emitted and nothing is buffered.
///
/// # Errors
/// Returns `ScanError` when loading the next event fails.
pub fn peek(&mut self) -> Option<Result<&(Event, Marker), ScanError>> {
    if self.current.is_none() {
        // Nothing buffered yet: fetch one event, unless the stream is already finished.
        if self.stream_end_emitted {
            return None;
        }
        match self.next_event_impl() {
            Ok(event) => self.current = Some(event),
            Err(err) => return Some(Err(err)),
        }
    }
    self.current.as_ref().map(Ok)
}
/// Try to load the next event and return it, consuming it from `self`.
///
/// Returns `None` once [`Event::StreamEnd`] has been handed out by a previous call.
///
/// # Errors
/// Returns `ScanError` when loading the next event fails.
pub fn next_event(&mut self) -> Option<ParseResult> {
    if self.stream_end_emitted {
        return None;
    }
    let result = self.next_event_impl();
    // Remember that the final event went out so later calls report exhaustion.
    if let Ok((Event::StreamEnd, _)) = &result {
        self.stream_end_emitted = true;
    }
    Some(result)
}
/// Option-free core of [`Self::next_event`].
///
/// [`Self::next_event`] follows the [`Iterator`] convention and wraps its result in an
/// `Option`, which is cumbersome for internal callers. This function hands out the event
/// buffered by [`Self::peek`] if there is one, and otherwise parses a fresh event.
fn next_event_impl(&mut self) -> ParseResult {
    if let Some(buffered) = self.current.take() {
        Ok(buffered)
    } else {
        self.parse()
    }
}
/// Peek at the next token from the scanner.
///
/// The token is pulled from the scanner on first use and cached in `self.token`, so repeated
/// calls return the same token until it is taken by `fetch_token` or dropped by `skip`.
///
/// # Errors
/// Propagates the error from `scan_next_token` when no token is cached and scanning fails.
fn peek_token(&mut self) -> Result<&Token, ScanError> {
    if self.token.is_none() {
        let scanned = self.scan_next_token()?;
        self.token = Some(scanned);
    }
    // The cache was filled just above if it was empty.
    Ok(self.token.as_ref().unwrap())
}
/// Extract and return the next token from the scanner.
///
/// This function does _not_ make use of `self.token`.
///
/// # Errors
/// When the scanner yields no token, returns the scanner's own error if it has one, and an
/// "unexpected eof" error at the scanner's current mark otherwise.
fn scan_next_token(&mut self) -> Result<Token, ScanError> {
    if let Some(tok) = self.scanner.next() {
        return Ok(tok);
    }
    if let Some(scan_err) = self.scanner.get_error() {
        Err(scan_err)
    } else {
        Err(ScanError::new_str(self.scanner.mark(), "unexpected eof"))
    }
}
/// Take ownership of the token cached by `peek_token`, leaving the cache empty.
///
/// # Panics
/// Panics if no token is cached, i.e. if `peek_token` was not called beforehand.
fn fetch_token(&mut self) -> Token {
    let cached = self.token.take();
    cached.expect("fetch_token needs to be preceded by peek_token")
}
/// Discard the token cached by `peek_token`, if any, so the next peek scans a fresh one.
fn skip(&mut self) {
    drop(self.token.take());
}
/// Pop the top-most saved state and make it the current state.
///
/// # Panics
/// Panics if the state stack is empty.
fn pop_state(&mut self) {
    let previous = self.states.pop().unwrap();
    self.state = previous;
}
/// Save `state` on top of the state stack, to be restored later by `pop_state`.
fn push_state(&mut self, state: State) {
    Vec::push(&mut self.states, state);
}
/// Produce the next event from the state machine.
///
/// Once the parser has reached [`State::End`], this keeps returning [`Event::StreamEnd`] at
/// the scanner's current mark instead of running the state machine.
fn parse(&mut self) -> ParseResult {
    match self.state {
        State::End => Ok((Event::StreamEnd, self.scanner.mark())),
        _ => self.state_machine(),
    }
}
/// Load the YAML from the stream in `self`, pushing events into `recv`.
///
/// The contents of the stream are parsed and the corresponding events are sent to the
/// receiver. For detailed explanations about how events work, see [`EventReceiver`].
///
/// If `multi` is set to `true`, the parser keeps going after the first document and loads
/// every YAML document of the stream. Otherwise it returns after the first one.
///
/// Note that any [`EventReceiver`] is also a [`MarkedEventReceiver`], so implementing the
/// former is enough to call this function.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load<R: MarkedEventReceiver>(
    &mut self,
    recv: &mut R,
    multi: bool,
) -> Result<(), ScanError> {
    // The very first event of a fresh stream must be the stream start.
    if !self.scanner.stream_started() {
        let (ev, mark) = self.next_event_impl()?;
        if !matches!(ev, Event::StreamStart) {
            return Err(ScanError::new_str(
                mark,
                "did not find expected <stream-start>",
            ));
        }
        recv.on_event(ev, mark);
    }

    // Nothing left to read: report the end of the stream right away.
    if self.scanner.stream_ended() {
        recv.on_event(Event::StreamEnd, self.scanner.mark());
        return Ok(());
    }

    let mut keep_going = true;
    while keep_going {
        let (ev, mark) = self.next_event_impl()?;
        if matches!(ev, Event::StreamEnd) {
            recv.on_event(ev, mark);
            break;
        }
        // Anchors are scoped to a document: forget those of the previous one.
        self.anchors.clear();
        self.load_document(ev, mark, recv)?;
        keep_going = multi;
    }
    Ok(())
}
/// Forward one whole document to `recv`.
///
/// `first_ev` / `mark` is the event that was already pulled by the caller; it must be
/// [`Event::DocumentStart`]. The root node and the closing [`Event::DocumentEnd`] are then
/// pulled from the parser and forwarded.
///
/// # Errors
/// Returns `ScanError` if `first_ev` is not a document start or if parsing fails.
///
/// # Panics
/// Panics if the event following the root node is not [`Event::DocumentEnd`].
fn load_document<R: MarkedEventReceiver>(
    &mut self,
    first_ev: Event,
    mark: Marker,
    recv: &mut R,
) -> Result<(), ScanError> {
    if !matches!(first_ev, Event::DocumentStart) {
        return Err(ScanError::new_str(
            mark,
            "did not find expected <document-start>",
        ));
    }
    recv.on_event(first_ev, mark);

    // Root node of the document.
    let (root_ev, root_mark) = self.next_event_impl()?;
    self.load_node(root_ev, root_mark, recv)?;

    // DOCUMENT-END is expected.
    let (end_ev, end_mark) = self.next_event_impl()?;
    assert_eq!(end_ev, Event::DocumentEnd);
    recv.on_event(end_ev, end_mark);
    Ok(())
}
/// Forward the node that starts with `first_ev` to `recv`.
///
/// Aliases and scalars are single events. For a sequence or mapping start, the start event is
/// forwarded and the rest of the collection is loaded through `load_sequence` /
/// `load_mapping`.
///
/// # Errors
/// Returns `ScanError` if parsing the contents of a collection fails.
///
/// # Panics
/// Panics if `first_ev` is not an event that can start a node. The offending event is part of
/// the panic message; nothing is written to stdout.
fn load_node<R: MarkedEventReceiver>(
    &mut self,
    first_ev: Event,
    mark: Marker,
    recv: &mut R,
) -> Result<(), ScanError> {
    match first_ev {
        Event::Alias(..) | Event::Scalar(..) => {
            recv.on_event(first_ev, mark);
            Ok(())
        }
        Event::SequenceStart(..) => {
            recv.on_event(first_ev, mark);
            self.load_sequence(recv)
        }
        Event::MappingStart(..) => {
            recv.on_event(first_ev, mark);
            self.load_mapping(recv)
        }
        // Listed explicitly so that adding an `Event` variant forces a decision here.
        Event::Nothing
        | Event::StreamStart
        | Event::StreamEnd
        | Event::DocumentStart
        | Event::DocumentEnd
        | Event::SequenceEnd
        | Event::MappingEnd => {
            unreachable!("UNREACHABLE EVENT: {:?}", first_ev)
        }
    }
}
/// Forward the contents of a mapping to `recv`, up to and including its
/// [`Event::MappingEnd`].
///
/// The [`Event::MappingStart`] must already have been forwarded by the caller. Entries are
/// read as a key node followed by a value node.
///
/// # Errors
/// Returns `ScanError` if parsing fails.
fn load_mapping<R: MarkedEventReceiver>(&mut self, recv: &mut R) -> Result<(), ScanError> {
    loop {
        let (key_ev, key_mark) = self.next_event_impl()?;
        if key_ev == Event::MappingEnd {
            recv.on_event(key_ev, key_mark);
            return Ok(());
        }
        // key
        self.load_node(key_ev, key_mark, recv)?;
        // value
        let (value_ev, value_mark) = self.next_event_impl()?;
        self.load_node(value_ev, value_mark, recv)?;
    }
}
/// Forward the contents of a sequence to `recv`, up to and including its
/// [`Event::SequenceEnd`].
///
/// The [`Event::SequenceStart`] must already have been forwarded by the caller.
///
/// # Errors
/// Returns `ScanError` if parsing fails.
fn load_sequence<R: MarkedEventReceiver>(&mut self, recv: &mut R) -> Result<(), ScanError> {
    loop {
        let (item_ev, item_mark) = self.next_event_impl()?;
        if item_ev == Event::SequenceEnd {
            recv.on_event(item_ev, item_mark);
            return Ok(());
        }
        self.load_node(item_ev, item_mark, recv)?;
    }
}
/// Run one step of the state machine: dispatch to the handler of the current state.
///
/// # Panics
/// Panics if called in [`State::End`]; `parse` handles that state itself.
fn state_machine(&mut self) -> ParseResult {
    debug_print!("\n\x1B[;33mParser state: {:?} \x1B[;0m", self.state);
    let current = self.state;
    match current {
        // Stream and document structure.
        State::StreamStart => self.stream_start(),
        State::ImplicitDocumentStart => self.document_start(true),
        State::DocumentStart => self.document_start(false),
        State::DocumentContent => self.document_content(),
        State::DocumentEnd => self.document_end(),

        // Block constructs.
        State::BlockNode => self.parse_node(true, false),
        State::BlockSequenceFirstEntry => self.block_sequence_entry(true),
        State::BlockSequenceEntry => self.block_sequence_entry(false),
        State::IndentlessSequenceEntry => self.indentless_sequence_entry(),
        State::BlockMappingFirstKey => self.block_mapping_key(true),
        State::BlockMappingKey => self.block_mapping_key(false),
        State::BlockMappingValue => self.block_mapping_value(),

        // Flow sequences, including single-pair mappings written inside them.
        State::FlowSequenceFirstEntry => self.flow_sequence_entry(true),
        State::FlowSequenceEntry => self.flow_sequence_entry(false),
        State::FlowSequenceEntryMappingKey => self.flow_sequence_entry_mapping_key(),
        State::FlowSequenceEntryMappingValue => self.flow_sequence_entry_mapping_value(),
        State::FlowSequenceEntryMappingEnd => self.flow_sequence_entry_mapping_end(),

        // Flow mappings.
        State::FlowMappingFirstKey => self.flow_mapping_key(true),
        State::FlowMappingKey => self.flow_mapping_key(false),
        State::FlowMappingValue => self.flow_mapping_value(false),
        State::FlowMappingEmptyValue => self.flow_mapping_value(true),

        /* impossible */
        State::End => unreachable!(),
    }
}
/// Handler for [`State::StreamStart`]: consume the stream-start token.
///
/// On success the parser moves to [`State::ImplicitDocumentStart`] and
/// [`Event::StreamStart`] is returned.
///
/// # Errors
/// Returns `ScanError` if the next token is not a stream start.
fn stream_start(&mut self) -> ParseResult {
    let tok = self.peek_token()?;
    let mark = tok.0;
    if !matches!(tok.1, TokenType::StreamStart(_)) {
        return Err(ScanError::new_str(
            mark,
            "did not find expected <stream-start>",
        ));
    }
    self.state = State::ImplicitDocumentStart;
    self.skip();
    Ok((Event::StreamStart, mark))
}
/// Handler for the document-start states.
///
/// Leading document-end tokens are skipped first. Then:
/// - a stream-end token finishes parsing ([`State::End`], [`Event::StreamEnd`]);
/// - a directive or a document-start token begins an explicit document;
/// - any other token begins an implicit document when `implicit` is `true`, and is handed to
///   `explicit_document_start` otherwise.
fn document_start(&mut self, implicit: bool) -> ParseResult {
    while matches!(self.peek_token()?.1, TokenType::DocumentEnd) {
        self.skip();
    }

    let tok = self.peek_token()?;
    let mark = tok.0;
    let at_stream_end = matches!(tok.1, TokenType::StreamEnd);
    let announces_explicit_document = matches!(
        tok.1,
        TokenType::VersionDirective(..) | TokenType::TagDirective(..) | TokenType::DocumentStart
    );

    if at_stream_end {
        self.state = State::End;
        self.skip();
        return Ok((Event::StreamEnd, mark));
    }
    if announces_explicit_document || !implicit {
        // explicit document
        return self.explicit_document_start();
    }

    // Implicit document: the content starts right at the current token.
    self.parser_process_directives()?;
    self.push_state(State::DocumentEnd);
    self.state = State::BlockNode;
    Ok((Event::DocumentStart, mark))
}
/// Consume the directives (`%YAML`, `%TAG`) that precede a document.
///
/// Tokens are consumed until the first one that is not a directive. The tag directives found
/// are collected for the whole directive block and then merged into `self.tags`, so that
/// several `%TAG` lines all take effect. Handles declared here override any entry already in
/// `self.tags`; other existing entries are left untouched (`document_end` clears the table
/// between documents unless `keep_tags` is enabled).
///
/// # Errors
/// Returns `ScanError` if the version directive appears more than once, if the same tag
/// handle is declared more than once in this directive block, or if scanning fails.
fn parser_process_directives(&mut self) -> Result<(), ScanError> {
    let mut version_directive_received = false;
    // Must outlive the loop: it is both the duplicate-detection set and the result.
    let mut tags = HashMap::new();
    loop {
        match self.peek_token()? {
            Token(mark, TokenType::VersionDirective(_, _)) => {
                // XXX parsing with warning according to spec: the version numbers are
                // currently not checked for compatibility.
                if version_directive_received {
                    return Err(ScanError::new_str(*mark, "duplicate version directive"));
                }
                version_directive_received = true;
            }
            Token(mark, TokenType::TagDirective(handle, prefix)) => {
                if tags.contains_key(handle) {
                    return Err(ScanError::new_str(*mark, "the TAG directive must only be given at most once per handle in the same document"));
                }
                tags.insert(handle.to_string(), prefix.to_string());
            }
            _ => break,
        }
        self.skip();
    }
    self.tags.extend(tags);
    Ok(())
}
/// Begin an explicit document: process its directives, then require a document-start token.
///
/// On success [`State::DocumentEnd`] is saved on the state stack, the parser moves to
/// [`State::DocumentContent`] and [`Event::DocumentStart`] is returned.
///
/// # Errors
/// Returns `ScanError` if the directives are invalid or if no document-start token follows
/// them.
fn explicit_document_start(&mut self) -> ParseResult {
    self.parser_process_directives()?;

    let tok = self.peek_token()?;
    let mark = tok.0;
    if !matches!(tok.1, TokenType::DocumentStart) {
        return Err(ScanError::new_str(
            mark,
            "did not find expected <document start>",
        ));
    }

    self.push_state(State::DocumentEnd);
    self.state = State::DocumentContent;
    self.skip();
    Ok((Event::DocumentStart, mark))
}
/// Handler for [`State::DocumentContent`]: parse the root node of an explicit document.
///
/// If the next token is a directive, a document start/end or the stream end, the document has
/// no content: the saved state is restored and an empty scalar is returned. Otherwise the node
/// is parsed as a block node.
fn document_content(&mut self) -> ParseResult {
    let tok = self.peek_token()?;
    let mark = tok.0;
    let document_is_empty = matches!(
        tok.1,
        TokenType::VersionDirective(..)
            | TokenType::TagDirective(..)
            | TokenType::DocumentStart
            | TokenType::DocumentEnd
            | TokenType::StreamEnd
    );

    if document_is_empty {
        self.pop_state();
        // empty scalar
        Ok((Event::empty_scalar(), mark))
    } else {
        self.parse_node(true, false)
    }
}
/// Handler for [`State::DocumentEnd`]: close the current document.
///
/// An explicit document-end token is consumed if present. Tag directives are dropped unless
/// `keep_tags` is enabled. After an explicit end the next document may be implicit; without
/// one, the parser moves to [`State::DocumentStart`].
///
/// # Errors
/// Returns `ScanError` if a directive follows a document that was not explicitly ended.
fn document_end(&mut self) -> ParseResult {
    let tok = self.peek_token()?;
    let marker = tok.0;
    let explicit_end = matches!(tok.1, TokenType::DocumentEnd);
    if explicit_end {
        self.skip();
    }

    if !self.keep_tags {
        self.tags.clear();
    }

    if explicit_end {
        self.state = State::ImplicitDocumentStart;
    } else {
        let next = self.peek_token()?;
        let next_mark = next.0;
        let is_directive = matches!(
            next.1,
            TokenType::VersionDirective(..) | TokenType::TagDirective(..)
        );
        if is_directive {
            return Err(ScanError::new_str(
                next_mark,
                "missing explicit document end marker before directive",
            ));
        }
        self.state = State::DocumentStart;
    }

    Ok((Event::DocumentEnd, marker))
}
/// Record the anchor `name` and return the fresh ID assigned to it.
///
/// Anchors can be overridden/reused: registering a name that already exists is not an error,
/// the name simply maps to the new ID from then on. The marker argument is currently unused.
fn register_anchor(&mut self, name: String, _: &Marker) -> usize {
    let assigned = self.anchor_id_count;
    self.anchors.insert(name, assigned);
    self.anchor_id_count += 1;
    assigned
}
/// Parse a node: optional properties (anchor and/or tag) followed by its content.
///
/// `block` allows block sequences and block mappings as content. `indentless_sequence` allows
/// a sequence to start directly on a block-entry token.
///
/// For an alias or a scalar the saved state is restored with `pop_state`. For a collection,
/// only the start event is returned: the state is set to the matching "first entry/key" state
/// and the collection-start token is left for that handler to consume.
///
/// # Errors
/// Returns `ScanError` for an alias to an unknown anchor, for a tag that cannot be resolved,
/// or when no node content is found and the node has neither tag nor anchor.
fn parse_node(&mut self, block: bool, indentless_sequence: bool) -> ParseResult {
// 0 means "no anchor"; real anchor IDs come from `register_anchor`.
let mut anchor_id = 0;
let mut tag = None;
// Step 1: node properties. An alias is a complete node and returns right away.
match *self.peek_token()? {
Token(_, TokenType::Alias(_)) => {
self.pop_state();
if let Token(mark, TokenType::Alias(name)) = self.fetch_token() {
match self.anchors.get(&name) {
None => {
return Err(ScanError::new_str(
mark,
"while parsing node, found unknown anchor",
))
}
Some(id) => return Ok((Event::Alias(*id), mark)),
}
}
unreachable!()
}
// Anchor first, optionally followed by a tag.
Token(_, TokenType::Anchor(_)) => {
if let Token(mark, TokenType::Anchor(name)) = self.fetch_token() {
anchor_id = self.register_anchor(name, &mark);
if let TokenType::Tag(..) = self.peek_token()?.1 {
if let TokenType::Tag(handle, suffix) = self.fetch_token().1 {
// NOTE(review): `mark` here is the anchor's marker, so a tag resolution error
// is reported at the anchor rather than at the tag.
tag = Some(self.resolve_tag(mark, &handle, suffix)?);
} else {
unreachable!()
}
}
} else {
unreachable!()
}
}
// Tag first, optionally followed by an anchor.
Token(mark, TokenType::Tag(..)) => {
if let TokenType::Tag(handle, suffix) = self.fetch_token().1 {
tag = Some(self.resolve_tag(mark, &handle, suffix)?);
if let TokenType::Anchor(_) = &self.peek_token()?.1 {
if let Token(mark, TokenType::Anchor(name)) = self.fetch_token() {
anchor_id = self.register_anchor(name, &mark);
} else {
unreachable!()
}
}
} else {
unreachable!()
}
}
_ => {}
}
// Step 2: node content, selected by the next token.
match *self.peek_token()? {
// The block-entry token is not consumed here; `indentless_sequence_entry` handles it.
Token(mark, TokenType::BlockEntry) if indentless_sequence => {
self.state = State::IndentlessSequenceEntry;
Ok((Event::SequenceStart(anchor_id, tag), mark))
}
Token(_, TokenType::Scalar(..)) => {
self.pop_state();
if let Token(mark, TokenType::Scalar(style, v)) = self.fetch_token() {
Ok((Event::Scalar(v, style, anchor_id, tag), mark))
} else {
unreachable!()
}
}
Token(mark, TokenType::FlowSequenceStart) => {
self.state = State::FlowSequenceFirstEntry;
Ok((Event::SequenceStart(anchor_id, tag), mark))
}
Token(mark, TokenType::FlowMappingStart) => {
self.state = State::FlowMappingFirstKey;
Ok((Event::MappingStart(anchor_id, tag), mark))
}
Token(mark, TokenType::BlockSequenceStart) if block => {
self.state = State::BlockSequenceFirstEntry;
Ok((Event::SequenceStart(anchor_id, tag), mark))
}
Token(mark, TokenType::BlockMappingStart) if block => {
self.state = State::BlockMappingFirstKey;
Ok((Event::MappingStart(anchor_id, tag), mark))
}
// ex 7.2, an empty scalar can follow a secondary tag
// Properties without content: emit an empty scalar that carries them.
Token(mark, _) if tag.is_some() || anchor_id > 0 => {
self.pop_state();
Ok((Event::empty_scalar_with_anchor(anchor_id, tag), mark))
}
Token(mark, _) => Err(ScanError::new_str(
mark,
"while parsing a node, did not find expected node content",
)),
}
}
/// Handler for the block-mapping key states.
///
/// With `first` set, the token that opened the mapping is consumed first. Then:
/// - a key token is consumed and followed either by a key node or, when no node follows, by
///   an empty scalar;
/// - a value token without a key yields an empty scalar as the key;
/// - a block-end token closes the mapping with [`Event::MappingEnd`].
///
/// # Errors
/// Returns `ScanError` if none of these tokens is found.
fn block_mapping_key(&mut self, first: bool) -> ParseResult {
    if first {
        // skip BlockMappingStart
        self.peek_token()?;
        self.skip();
    }

    let tok = self.peek_token()?;
    let mark = tok.0;
    match tok.1 {
        TokenType::Key => {
            self.skip();
            let next = self.peek_token()?;
            let next_mark = next.0;
            let key_is_empty = matches!(
                next.1,
                TokenType::Key | TokenType::Value | TokenType::BlockEnd
            );
            if key_is_empty {
                self.state = State::BlockMappingValue;
                // empty scalar
                Ok((Event::empty_scalar(), next_mark))
            } else {
                self.push_state(State::BlockMappingValue);
                self.parse_node(true, true)
            }
        }
        // XXX(chenyh): libyaml failed to parse spec 1.2, ex8.18
        TokenType::Value => {
            self.state = State::BlockMappingValue;
            Ok((Event::empty_scalar(), mark))
        }
        TokenType::BlockEnd => {
            self.pop_state();
            self.skip();
            Ok((Event::MappingEnd, mark))
        }
        _ => Err(ScanError::new_str(
            mark,
            "while parsing a block mapping, did not find expected key",
        )),
    }
}
/// Handler for [`State::BlockMappingValue`].
///
/// If a value token is present it is consumed and the value node is parsed, unless the next
/// token shows that there is no node. In every case without a value node an empty scalar is
/// returned and the parser moves to [`State::BlockMappingKey`].
fn block_mapping_value(&mut self) -> ParseResult {
    if matches!(self.peek_token()?.1, TokenType::Value) {
        self.skip();
        let value_follows = !matches!(
            self.peek_token()?.1,
            TokenType::Key | TokenType::Value | TokenType::BlockEnd
        );
        if value_follows {
            self.push_state(State::BlockMappingKey);
            return self.parse_node(true, true);
        }
    }

    // No value node: report an empty scalar at the token currently peeked.
    let mark = self.peek_token()?.0;
    self.state = State::BlockMappingKey;
    // empty scalar
    Ok((Event::empty_scalar(), mark))
}
/// Handler for the flow-mapping key states.
///
/// With `first` set, the token that opened the mapping is consumed first. Every path that
/// yields a key returns early from inside the `marker` block; the code after that block is
/// only reached when the mapping ends, and emits [`Event::MappingEnd`].
///
/// # Errors
/// Returns `ScanError` if, for an entry other than the first, the next token is neither the
/// end of the mapping nor an entry separator.
fn flow_mapping_key(&mut self, first: bool) -> ParseResult {
if first {
let _ = self.peek_token()?;
self.skip();
}
let marker: Marker = {
match *self.peek_token()? {
Token(mark, TokenType::FlowMappingEnd) => mark,
Token(mark, _) => {
// Entries after the first one must be introduced by a flow-entry token, which is
// consumed here.
if !first {
match *self.peek_token()? {
Token(_, TokenType::FlowEntry) => self.skip(),
Token(mark, _) => return Err(ScanError::new_str(
mark,
"while parsing a flow mapping, did not find expected ',' or '}'",
)),
}
}
match *self.peek_token()? {
// Explicit key: consume the key token, then parse the key node or emit an empty
// scalar if no node follows.
Token(_, TokenType::Key) => {
self.skip();
if let Token(
mark,
TokenType::Value | TokenType::FlowEntry | TokenType::FlowMappingEnd,
) = *self.peek_token()?
{
self.state = State::FlowMappingValue;
return Ok((Event::empty_scalar(), mark));
}
self.push_state(State::FlowMappingValue);
return self.parse_node(false, false);
}
// Value token without a key: the key is an empty scalar.
Token(marker, TokenType::Value) => {
self.state = State::FlowMappingValue;
return Ok((Event::empty_scalar(), marker));
}
// End of the mapping reached after a separator: fall through to the closing code.
Token(_, TokenType::FlowMappingEnd) => (),
// Anything else is parsed as a key node; `FlowMappingEmptyValue` then makes
// `flow_mapping_value(true)` emit an empty scalar as its value.
_ => {
self.push_state(State::FlowMappingEmptyValue);
return self.parse_node(false, false);
}
}
mark
}
}
};
// End of the mapping: restore the saved state and consume the end token.
self.pop_state();
self.skip();
Ok((Event::MappingEnd, marker))
}
/// Handler for the flow-mapping value states.
///
/// With `empty` set, an empty scalar is returned without looking for a value. Otherwise a
/// value token, if present, is consumed and the value node is parsed unless the entry or the
/// mapping ends right after it. Whenever no value node is parsed, an empty scalar is returned
/// at the mark of the token that was peeked on entry, and the parser moves to
/// [`State::FlowMappingKey`].
fn flow_mapping_value(&mut self, empty: bool) -> ParseResult {
    let tok = self.peek_token()?;
    let mark = tok.0;
    let has_value_indicator = matches!(tok.1, TokenType::Value);

    if !empty && has_value_indicator {
        self.skip();
        let value_follows = !matches!(
            self.peek_token()?.1,
            TokenType::FlowEntry | TokenType::FlowMappingEnd
        );
        if value_follows {
            self.push_state(State::FlowMappingKey);
            return self.parse_node(false, false);
        }
    }

    self.state = State::FlowMappingKey;
    Ok((Event::empty_scalar(), mark))
}
/// Handler for the flow-sequence entry states.
///
/// With `first` set, the token that opened the sequence is consumed first. The handler then
/// either closes the sequence with [`Event::SequenceEnd`], starts a single-pair mapping when
/// the entry begins with a key token, or parses the entry as a flow node.
///
/// # Errors
/// Returns `ScanError` if, for an entry other than the first, the next token is neither the
/// end of the sequence nor an entry separator.
fn flow_sequence_entry(&mut self, first: bool) -> ParseResult {
// skip FlowSequenceStart
if first {
let _ = self.peek_token()?;
//self.marks.push(tok.0);
self.skip();
}
// Step 1: handle the end of the sequence and, past the first entry, the separator.
match *self.peek_token()? {
Token(mark, TokenType::FlowSequenceEnd) => {
self.pop_state();
self.skip();
return Ok((Event::SequenceEnd, mark));
}
Token(_, TokenType::FlowEntry) if !first => {
self.skip();
}
Token(mark, _) if !first => {
return Err(ScanError::new_str(
mark,
"while parsing a flow sequence, expected ',' or ']'",
));
}
_ => { /* next */ }
}
// Step 2: the entry itself. The end token is checked again because it may directly follow
// the separator consumed above.
match *self.peek_token()? {
Token(mark, TokenType::FlowSequenceEnd) => {
self.pop_state();
self.skip();
Ok((Event::SequenceEnd, mark))
}
// A key token starts a mapping for this entry, with no anchor (0) and no tag. The key
// token is consumed here.
Token(mark, TokenType::Key) => {
self.state = State::FlowSequenceEntryMappingKey;
self.skip();
Ok((Event::MappingStart(0, None), mark))
}
_ => {
self.push_state(State::FlowSequenceEntry);
self.parse_node(false, false)
}
}
}
/// Handler for [`State::IndentlessSequenceEntry`].
///
/// Any token other than a block entry ends the sequence: the saved state is restored and
/// [`Event::SequenceEnd`] is returned without consuming the token. Otherwise the block-entry
/// token is consumed and the entry is parsed, or an empty scalar is returned when no node
/// follows.
fn indentless_sequence_entry(&mut self) -> ParseResult {
    let tok = self.peek_token()?;
    let mark = tok.0;
    if !matches!(tok.1, TokenType::BlockEntry) {
        self.pop_state();
        return Ok((Event::SequenceEnd, mark));
    }
    self.skip();

    let next = self.peek_token()?;
    let next_mark = next.0;
    let entry_is_empty = matches!(
        next.1,
        TokenType::BlockEntry | TokenType::Key | TokenType::Value | TokenType::BlockEnd
    );
    if entry_is_empty {
        self.state = State::IndentlessSequenceEntry;
        Ok((Event::empty_scalar(), next_mark))
    } else {
        self.push_state(State::IndentlessSequenceEntry);
        self.parse_node(true, false)
    }
}
/// Handler for the block-sequence entry states.
///
/// With `first` set, the token that opened the sequence is consumed first. A block-end token
/// closes the sequence with [`Event::SequenceEnd`]. A block-entry token is consumed and
/// followed by the entry node, or by an empty scalar when no node follows.
///
/// # Errors
/// Returns `ScanError` if the next token is neither a block end nor a block entry.
fn block_sequence_entry(&mut self, first: bool) -> ParseResult {
    if first {
        // BLOCK-SEQUENCE-START
        self.peek_token()?;
        self.skip();
    }

    let tok = self.peek_token()?;
    let mark = tok.0;
    match tok.1 {
        TokenType::BlockEnd => {
            self.pop_state();
            self.skip();
            Ok((Event::SequenceEnd, mark))
        }
        TokenType::BlockEntry => {
            self.skip();
            let next = self.peek_token()?;
            let next_mark = next.0;
            let entry_is_empty =
                matches!(next.1, TokenType::BlockEntry | TokenType::BlockEnd);
            if entry_is_empty {
                self.state = State::BlockSequenceEntry;
                Ok((Event::empty_scalar(), next_mark))
            } else {
                self.push_state(State::BlockSequenceEntry);
                self.parse_node(true, false)
            }
        }
        _ => Err(ScanError::new_str(
            mark,
            "while parsing a block collection, did not find expected '-' indicator",
        )),
    }
}
/// Handler for [`State::FlowSequenceEntryMappingKey`]: parse the key of a single-pair mapping
/// written inside a flow sequence.
///
/// The key token itself was already consumed by `flow_sequence_entry`. If the next token is a
/// value indicator, an entry separator or the end of the sequence, the key is empty: an empty
/// scalar is returned and the token is left in place, because the following states need it
/// (`flow_sequence_entry_mapping_value` looks for the value indicator, `flow_sequence_entry`
/// for the separator or the end of the sequence). Otherwise the key is parsed as a flow node.
fn flow_sequence_entry_mapping_key(&mut self) -> ParseResult {
    if let Token(mark, TokenType::Value | TokenType::FlowEntry | TokenType::FlowSequenceEnd) =
        *self.peek_token()?
    {
        // Do not consume the peeked token here (see above).
        self.state = State::FlowSequenceEntryMappingValue;
        Ok((Event::empty_scalar(), mark))
    } else {
        self.push_state(State::FlowSequenceEntryMappingValue);
        self.parse_node(false, false)
    }
}
/// Handler for [`State::FlowSequenceEntryMappingValue`]: parse the value of a single-pair
/// mapping written inside a flow sequence.
///
/// Without a value token, or when the entry or the sequence ends right after it, an empty
/// scalar is returned. Otherwise the value is parsed as a flow node. In all cases the mapping
/// is closed next, through [`State::FlowSequenceEntryMappingEnd`].
fn flow_sequence_entry_mapping_value(&mut self) -> ParseResult {
    let tok = self.peek_token()?;
    let mark = tok.0;
    if !matches!(tok.1, TokenType::Value) {
        self.state = State::FlowSequenceEntryMappingEnd;
        return Ok((Event::empty_scalar(), mark));
    }

    self.skip();
    self.state = State::FlowSequenceEntryMappingValue;

    let next = self.peek_token()?;
    let next_mark = next.0;
    let value_is_empty = matches!(
        next.1,
        TokenType::FlowEntry | TokenType::FlowSequenceEnd
    );
    if value_is_empty {
        self.state = State::FlowSequenceEntryMappingEnd;
        Ok((Event::empty_scalar(), next_mark))
    } else {
        self.push_state(State::FlowSequenceEntryMappingEnd);
        self.parse_node(false, false)
    }
}
/// Handler for [`State::FlowSequenceEntryMappingEnd`]: close a single-pair mapping written
/// inside a flow sequence.
///
/// No token is consumed. [`Event::MappingEnd`] is reported at the scanner's current mark and
/// the parser goes back to [`State::FlowSequenceEntry`].
// Returns a `ParseResult` like every other state handler, even though it cannot fail.
#[allow(clippy::unnecessary_wraps)]
fn flow_sequence_entry_mapping_end(&mut self) -> ParseResult {
    let mark = self.scanner.mark();
    self.state = State::FlowSequenceEntry;
    Ok((Event::MappingEnd, mark))
}
/// Resolve a tag from the handle and the suffix.
///
/// The `handle` field of the returned [`Tag`] is chosen as follows:
/// - for `!!`: the prefix declared for `!!`, or `tag:yaml.org,2002:` if none was declared;
/// - for an empty handle with suffix `!`: the prefix declared for the empty handle, or an
///   empty string if none was declared;
/// - for any handle with a declared prefix: that prefix;
/// - otherwise: the handle itself, as a local handle.
///
/// # Errors
/// Returns `ScanError` at `mark` if the handle has the form `!name!` but was not declared.
fn resolve_tag(&self, mark: Marker, handle: &str, suffix: String) -> Result<Tag, ScanError> {
    let resolved_handle = if handle == "!!" {
        // "!!" is a shorthand for "tag:yaml.org,2002:". However, that default can be
        // overridden.
        self.tags
            .get("!!")
            .map_or_else(|| "tag:yaml.org,2002:".to_string(), ToString::to_string)
    } else if handle.is_empty() && suffix == "!" {
        // "!" introduces a local tag. Local tags may have their prefix overridden.
        self.tags
            .get("")
            .map_or_else(String::new, ToString::to_string)
    } else if let Some(prefix) = self.tags.get(handle) {
        // The handle was declared by a tag directive.
        prefix.to_string()
    } else if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
        // A handle of the form "!foo!" cannot be a local handle: it must have been declared.
        return Err(ScanError::new_str(mark, "the handle wasn't declared"));
    } else {
        // Local handle: the handle is "!" and the suffix is whatever follows it
        // ("!foo" -> ("!", "foo")).
        handle.to_string()
    };

    Ok(Tag {
        handle: resolved_handle,
        suffix,
    })
}
}
impl<T: Iterator<Item = char>> Iterator for Parser<T> {
type Item = Result<(Event, Marker), ScanError>;
fn next(&mut self) -> Option<Self::Item> {
self.next_event()
}
}
#[cfg(test)]
mod test {
use super::{Event, Parser};
/// `peek` must return the same event as the `next_event` call that follows it, for every
/// event of the stream up to and including `StreamEnd`.
#[test]
fn test_peek_eq_parse() {
let s = "
a0 bb: val
a1: &x
b1: 4
b2: d
a2: 4
a3: [1, 2, 3]
a4:
- [a1, a2]
- 2
a5: *x
";
let mut p = Parser::new_from_str(s);
loop {
// Clone the peeked event so the borrow of `p` ends before it is consumed.
let event_peek = p.peek().unwrap().unwrap().clone();
let event = p.next_event().unwrap().unwrap();
assert_eq!(event, event_peek);
if event.0 == Event::StreamEnd {
break;
}
}
}
/// A `%TAG` directive given once must resolve in every document when `keep_tags(true)` is
/// set, and must cause an error in a later document when `keep_tags(false)` is set.
#[test]
fn test_keep_tags_across_multiple_documents() {
let text = r#"
%YAML 1.1
%TAG !t! tag:test,2024:
--- !t!1 &1
foo: "bar"
--- !t!2 &2
baz: "qux"
"#;
// With tags kept, every mapping must carry the resolved prefix as its tag handle.
for x in Parser::new_from_str(text).keep_tags(true) {
let x = x.unwrap();
if let Event::MappingStart(_, tag) = x.0 {
let tag = tag.unwrap();
assert_eq!(tag.handle, "tag:test,2024:");
}
}
// Without tags kept, parsing must fail at some point.
for x in Parser::new_from_str(text).keep_tags(false) {
if x.is_err() {
// Test successful
return;
}
}
panic!("Test failed, did not encounter error")
}
}