saphyr-serde/parser/src/scanner.rs

2617 lines
90 KiB
Rust
Raw Normal View History

2024-03-20 14:50:48 +00:00
//! Home to the YAML Scanner.
//!
//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
//! to check for more context and validity.
//!
//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
2023-08-11 23:54:46 +00:00
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_sign_loss)]
use std::{char, collections::VecDeque, error::Error, fmt};
use crate::{
char_traits::{
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
},
input::Input,
};
2015-05-24 06:27:42 +00:00
2024-03-20 14:50:48 +00:00
/// The encoding of the input. Currently, only UTF-8 is supported.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    /// UTF-8 encoding.
    Utf8,
}
2024-03-20 14:50:48 +00:00
/// The style in which a scalar was written in the YAML document.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TScalarStyle {
    /// A YAML plain scalar.
    Plain,
    /// A YAML single quoted scalar.
    SingleQuoted,
    /// A YAML double quoted scalar.
    DoubleQuoted,
    /// A YAML literal block (`|` block).
    Literal,
    /// A YAML folded block (`>` block).
    Folded,
}
/// A location in a yaml document.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Default)]
pub struct Marker {
    /// The index (in chars) in the input string.
    ///
    /// The scanner advances this by one for every character it consumes.
    index: usize,
    /// The line (1-indexed for positions produced by the scanner).
    line: usize,
    /// The column (0-indexed).
    ///
    /// The scanner starts at column 0 and resets the column to 0 after every line break.
    /// [`ScanError`]'s `Display` implementation adds 1 when printing it.
    col: usize,
}

impl Marker {
    /// Create a new [`Marker`] at the given position.
    ///
    /// The values are stored as given; no validation is performed.
    #[must_use]
    pub fn new(index: usize, line: usize, col: usize) -> Marker {
        Marker { index, line, col }
    }

    /// Return the index (in chars) of the marker in the source.
    #[must_use]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Return the line of the marker in the source.
    #[must_use]
    pub fn line(&self) -> usize {
        self.line
    }

    /// Return the column of the marker in the source.
    #[must_use]
    pub fn col(&self) -> usize {
        self.col
    }
}
2024-03-25 11:01:58 +00:00
/// An error that occurred while scanning.
2015-05-24 06:27:42 +00:00
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct ScanError {
/// The position at which the error happened in the source.
2015-05-24 06:27:42 +00:00
mark: Marker,
/// Human-readable details about the error.
2015-05-24 06:27:42 +00:00
info: String,
}
impl ScanError {
/// Create a new error from a location and an error string.
2023-08-11 23:54:46 +00:00
#[must_use]
pub fn new(loc: Marker, info: String) -> ScanError {
ScanError { mark: loc, info }
}
/// Convenience alias for string slices.
#[must_use]
pub fn new_str(loc: Marker, info: &str) -> ScanError {
2015-05-24 06:27:42 +00:00
ScanError {
mark: loc,
2018-09-15 16:49:04 +00:00
info: info.to_owned(),
2015-05-24 06:27:42 +00:00
}
}
2017-11-15 01:36:16 +00:00
/// Return the marker pointing to the error in the source.
2023-08-11 23:54:46 +00:00
#[must_use]
2017-11-15 03:41:39 +00:00
pub fn marker(&self) -> &Marker {
&self.mark
2017-11-15 01:36:16 +00:00
}
/// Return the information string describing the error that happened.
#[must_use]
pub fn info(&self) -> &str {
self.info.as_ref()
}
2015-05-24 06:27:42 +00:00
}
impl Error for ScanError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
None
}
}
impl fmt::Display for ScanError {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
2018-09-15 16:49:04 +00:00
write!(
formatter,
2021-08-18 02:07:07 +00:00
"{} at byte {} line {} column {}",
2018-09-15 16:49:04 +00:00
self.info,
2021-08-18 02:07:07 +00:00
self.mark.index,
2018-09-15 16:49:04 +00:00
self.mark.line,
2021-08-18 02:07:07 +00:00
self.mark.col + 1,
2018-09-15 16:49:04 +00:00
)
}
}
/// The contents of a scanner token.
2015-05-24 06:27:42 +00:00
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType {
2024-03-28 20:03:14 +00:00
/// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
StreamStart(TEncoding),
2023-11-19 00:09:41 +00:00
/// The end of the stream, EOF.
StreamEnd,
2024-03-20 14:50:48 +00:00
/// A YAML version directive.
2023-11-19 00:09:41 +00:00
VersionDirective(
/// Major
u32,
/// Minor
u32,
),
2024-03-20 14:50:48 +00:00
/// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
2023-11-19 00:09:41 +00:00
TagDirective(
/// Handle
String,
/// Prefix
String,
),
/// The start of a YAML document (`---`).
DocumentStart,
2023-11-19 00:09:41 +00:00
/// The end of a YAML document (`...`).
DocumentEnd,
2023-12-20 23:14:08 +00:00
/// The start of a sequence block.
///
/// Sequence blocks are arrays starting with a `-`.
BlockSequenceStart,
2023-12-20 23:14:08 +00:00
/// The start of a sequence mapping.
///
/// Sequence mappings are "dictionaries" with "key: value" entries.
BlockMappingStart,
2023-12-20 23:14:08 +00:00
/// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
BlockEnd,
2023-11-19 00:09:41 +00:00
/// Start of an inline array (`[ a, b ]`).
FlowSequenceStart,
2023-11-19 00:09:41 +00:00
/// End of an inline array.
FlowSequenceEnd,
2023-11-19 00:09:41 +00:00
/// Start of an inline mapping (`{ a: b, c: d }`).
FlowMappingStart,
2023-11-19 00:09:41 +00:00
/// End of an inline mapping.
FlowMappingEnd,
/// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
BlockEntry,
/// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
FlowEntry,
/// A key in a mapping.
Key,
/// A value in a mapping.
Value,
/// A reference to an anchor.
Alias(String),
2023-11-19 00:09:41 +00:00
/// A YAML anchor (`&`/`*`).
Anchor(String),
/// A YAML tag (starting with bangs `!`).
Tag(
/// The handle of the tag.
String,
/// The suffix of the tag.
String,
),
/// A regular YAML scalar.
2018-09-15 16:49:04 +00:00
Scalar(TScalarStyle, String),
2015-05-24 06:27:42 +00:00
}
/// A scanner token.
2015-05-24 06:27:42 +00:00
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token(pub Marker, pub TokenType);
/// A scalar that was parsed and may correspond to a simple key.
///
/// Upon scanning the following yaml:
/// ```yaml
/// a: b
/// ```
/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
/// kept inside the scanner until more context is fetched and we are able to know whether it is a
/// plain scalar or a key.
///
/// For example, see the following 2 yaml documents:
/// ```yaml
/// ---
/// a: b # Here, `a` is a key.
/// ...
/// ---
/// a # Here, `a` is a plain scalar.
/// ...
/// ```
/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
///
/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
///
/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
/// [`TokenType::Scalar`] token.
///
/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
/// [`TokenType::Key`] would be emitted by the scanner.
2015-05-24 06:27:42 +00:00
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
/// Whether the token this [`SimpleKey`] refers to may still be a key.
///
/// Sometimes, when we have more context, we notice that what we thought could be a key no
/// longer can be. In that case, [`Self::possible`] is set to `false`.
///
/// For instance, let us consider the following invalid YAML:
/// ```yaml
/// key
/// : value
/// ```
/// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
/// and [`Self::possible`] set to `false`.
2015-05-24 06:27:42 +00:00
possible: bool,
/// Whether the token this [`SimpleKey`] refers to is required to be a key.
///
/// With more context, we may know for sure that the token must be a key. If the YAML is
/// invalid, it may happen that the token be deemed not a key. In such event, an error has to
/// be raised. This boolean helps us know when to raise such error.
///
/// TODO(ethiraric, 30/12/2023): Example of when this happens.
2015-05-24 06:27:42 +00:00
required: bool,
/// The index of the token referred to by the [`SimpleKey`].
///
/// This is the index in the scanner, which takes into account both the tokens that have been
/// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
/// [`Scanner::tokens`] for more details.
2015-05-24 06:27:42 +00:00
token_number: usize,
/// The position at which the token the [`SimpleKey`] refers to is.
2015-05-24 06:27:42 +00:00
mark: Marker,
}
impl SimpleKey {
/// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
2015-05-24 06:27:42 +00:00
fn new(mark: Marker) -> SimpleKey {
SimpleKey {
possible: false,
required: false,
token_number: 0,
2018-09-15 17:03:55 +00:00
mark,
2015-05-24 06:27:42 +00:00
}
}
}
/// One entry of the scanner's stack of indentation levels.
#[derive(Debug, Clone, Default)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Tells whether closing this indentation level must produce a `BlockEnd` token.
    ///
    /// Not every indentation level starts a block. For instance:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The level opened by a `-` only covers a single entry of the sequence. If a `BlockEnd` were
    /// emitted each time such a level ends, there would be one `BlockEnd` per entry, whereas the
    /// sequence must be closed by exactly one.
    needs_block_end: bool,
}
/// What the scanner knows about an implicit mapping.
///
/// An implicit mapping is a mapping inside a flow sequence whose opening `{` has been left out:
/// ```yaml
/// [ a: b, c: d ]
/// # Equivalent to
/// [ { a: b }, { c: d } ]
/// # Equivalent to
/// - a: b
/// - c: d
/// ```
///
/// This state has to be tracked separately for every nested flow sequence. In the example above,
/// a [`FlowMappingStart`] event must be emitted when reaching `a` and `c` even though no character
/// announces it, and a [`FlowMappingEnd`] event must be emitted upon the `,` or the `]`. Without
/// accurate tracking, those events could be missing or emitted out-of-order.
///
/// [`FlowMappingStart`]: TokenType::FlowMappingStart
/// [`FlowMappingEnd`]: TokenType::FlowMappingEnd
#[derive(PartialEq, Debug)]
enum ImplicitMappingState {
    /// An implicit mapping may follow.
    ///
    /// This is the state right after the opening `[`: more context is required to tell whether
    /// an implicit mapping actually follows.
    Possible,
    /// We are inside the implicit mapping.
    ///
    /// This state is not entered immediately: the `:` must have been encountered first.
    Inside,
}
/// The YAML scanner.
///
/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
/// some of the constructs. It has understanding of indentation and whitespace and is able to
/// generate error messages for some invalid YAML constructs.
///
2024-03-28 20:03:14 +00:00
/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
/// YAML documents.
2015-05-24 06:27:42 +00:00
#[derive(Debug)]
2023-08-11 23:54:46 +00:00
#[allow(clippy::struct_excessive_bools)]
2015-05-24 06:27:42 +00:00
pub struct Scanner<T> {
/// The input source.
///
/// This must implement [`Input`].
input: T,
/// The position of the cursor within the reader.
2015-05-24 06:27:42 +00:00
mark: Marker,
/// Buffer for tokens to be returned.
///
/// This buffer can hold some temporary tokens that are not yet ready to be returned. For
/// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
/// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
/// [`Self::next`] until we have more context.
2015-05-24 06:27:42 +00:00
tokens: VecDeque<Token>,
/// The last error that happened.
2015-05-28 14:07:59 +00:00
error: Option<ScanError>,
2015-05-24 06:27:42 +00:00
/// Whether we have already emitted the `StreamStart` token.
2015-05-24 06:27:42 +00:00
stream_start_produced: bool,
/// Whether we have already emitted the `StreamEnd` token.
2015-05-24 06:27:42 +00:00
stream_end_produced: bool,
/// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
/// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
adjacent_value_allowed_at: usize,
2023-12-20 23:14:08 +00:00
/// Whether a simple key could potentially start at the current position.
///
/// Simple keys are the opposite of complex keys which are keys starting with `?`.
2015-05-24 06:27:42 +00:00
simple_key_allowed: bool,
/// A stack of potential simple keys.
///
/// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
/// are.
2015-05-24 06:27:42 +00:00
simple_keys: Vec<SimpleKey>,
/// The current indentation level.
2015-05-24 06:27:42 +00:00
indent: isize,
/// List of all block indentation levels we are in (except the current one).
indents: Vec<Indent>,
/// Level of nesting of flow sequences.
2018-09-15 10:33:26 +00:00
flow_level: u8,
/// The number of tokens that have been returned from the scanner.
///
/// This excludes the tokens from [`Self::tokens`].
2015-05-24 06:27:42 +00:00
tokens_parsed: usize,
/// Whether a token is ready to be taken from [`Self::tokens`].
2015-05-24 06:27:42 +00:00
token_available: bool,
2023-12-20 22:14:22 +00:00
/// Whether all characters encountered since the last newline were whitespace.
leading_whitespace: bool,
2023-12-26 18:11:17 +00:00
/// Whether we started a flow mapping.
///
/// This is used to detect implicit flow mapping starts such as:
/// ```yaml
/// [ : foo ] # { null: "foo" }
/// ```
flow_mapping_started: bool,
/// An array of states, representing whether flow sequences have implicit mappings.
///
/// When a flow mapping is possible (when encountering the first `[` or a `,` in a sequence),
/// the state is set to [`Possible`].
/// When we encounter the `:`, we know we are in an implicit mapping and can set the state to
/// [`Inside`].
///
/// There is one entry in this [`Vec`] for each nested flow sequence that we are in.
/// The entries are created with the opening `]` and popped with the closing `]`.
///
/// [`Possible`]: ImplicitMappingState::Possible
/// [`Inside`]: ImplicitMappingState::Inside
implicit_flow_mapping_states: Vec<ImplicitMappingState>,
2015-05-24 06:27:42 +00:00
}
impl<T: Input> Iterator for Scanner<T> {
2015-05-24 06:27:42 +00:00
type Item = Token;
fn next(&mut self) -> Option<Token> {
2015-05-28 14:07:59 +00:00
if self.error.is_some() {
return None;
}
2015-05-24 06:27:42 +00:00
match self.next_token() {
2023-12-26 17:08:21 +00:00
Ok(Some(tok)) => {
debug_print!(
" \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
tok.1,
tok.0
);
2023-12-26 17:08:21 +00:00
Some(tok)
}
2015-05-24 06:27:42 +00:00
Ok(tok) => tok,
Err(e) => {
2015-05-28 14:07:59 +00:00
self.error = Some(e);
2015-05-24 06:27:42 +00:00
None
}
}
}
}
2024-03-20 14:50:48 +00:00
/// A convenience alias for scanner functions that may fail without returning a value.
2015-05-24 06:27:42 +00:00
pub type ScanResult = Result<(), ScanError>;
impl<T: Input> Scanner<T> {
2015-05-24 06:27:42 +00:00
/// Creates the YAML tokenizer.
pub fn new(input: T) -> Scanner<T> {
2015-05-24 06:37:36 +00:00
Scanner {
input,
2015-05-24 06:27:42 +00:00
mark: Marker::new(0, 1, 0),
tokens: VecDeque::new(),
2015-05-28 14:07:59 +00:00
error: None,
2015-05-24 06:27:42 +00:00
stream_start_produced: false,
stream_end_produced: false,
adjacent_value_allowed_at: 0,
2015-05-24 06:27:42 +00:00
simple_key_allowed: true,
simple_keys: Vec::new(),
indent: -1,
indents: Vec::new(),
flow_level: 0,
tokens_parsed: 0,
token_available: false,
2023-12-20 22:14:22 +00:00
leading_whitespace: true,
2023-12-26 18:11:17 +00:00
flow_mapping_started: false,
implicit_flow_mapping_states: vec![],
2015-05-24 06:37:36 +00:00
}
2015-05-24 06:27:42 +00:00
}
/// Get a copy of the last error that was encountered, if any.
///
/// This does not clear the error state and further calls to [`Self::get_error`] will return (a
/// clone of) the same error.
2015-05-28 14:07:59 +00:00
#[inline]
pub fn get_error(&self) -> Option<ScanError> {
2024-02-13 22:10:17 +00:00
self.error.clone()
2015-05-28 14:07:59 +00:00
}
2015-05-24 06:27:42 +00:00
/// Consume the next character. It is assumed the next character is a blank.
2015-05-24 19:21:53 +00:00
#[inline]
fn skip_blank(&mut self) {
self.input.skip();
2015-05-24 06:27:42 +00:00
self.mark.index += 1;
self.mark.col += 1;
}
/// Advance past one character which the caller knows is not a blank.
///
/// The index and column move forward by one, and the current line is flagged as no longer
/// consisting solely of leading whitespace.
#[inline]
fn skip_non_blank(&mut self) {
    // A non-blank character has now been seen on this line.
    self.leading_whitespace = false;
    self.mark.col += 1;
    self.mark.index += 1;
    self.input.skip();
}
/// Advance past `count` characters, none of which is assumed to be a blank.
///
/// The index and column move forward by `count`, and the current line is flagged as no longer
/// consisting solely of leading whitespace.
#[inline]
fn skip_n_non_blank(&mut self, count: usize) {
    // Non-blank characters have now been seen on this line.
    self.leading_whitespace = false;
    self.mark.col += count;
    self.mark.index += count;
    self.input.skip_n(count);
}
/// Consume the next character. It is assumed the next character is a newline.
#[inline]
fn skip_nl(&mut self) {
self.input.skip();
self.mark.index += 1;
self.mark.col = 0;
self.mark.line += 1;
self.leading_whitespace = true;
2015-05-24 06:27:42 +00:00
}
2023-12-20 22:14:22 +00:00
/// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
2015-05-24 19:21:53 +00:00
#[inline]
2024-04-16 10:03:42 +00:00
fn skip_linebreak(&mut self) {
if self.input.next_2_are('\r', '\n') {
// While technically not a blank, this does not matter as `self.leading_whitespace`
// will be reset by `skip_nl`.
self.skip_blank();
self.skip_nl();
} else if is_break(self.input.peek()) {
self.skip_nl();
2015-05-24 19:21:53 +00:00
}
}
2024-03-20 14:50:48 +00:00
/// Return whether the [`TokenType::StreamStart`] event has been emitted.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
pub fn stream_started(&self) -> bool {
self.stream_start_produced
}
2024-03-20 14:50:48 +00:00
/// Return whether the [`TokenType::StreamEnd`] event has been emitted.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
pub fn stream_ended(&self) -> bool {
self.stream_end_produced
}
2024-03-20 14:50:48 +00:00
/// Get the current position in the input stream.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
pub fn mark(&self) -> Marker {
self.mark
}
2023-11-19 16:08:28 +00:00
// Read and consume a line break (either `\r`, `\n` or `\r\n`).
//
// A `\n` is pushed into `s`.
//
2024-01-24 16:14:52 +00:00
// # Panics (in debug)
2023-11-19 16:08:28 +00:00
// If the next characters do not correspond to a line break.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn read_break(&mut self, s: &mut String) {
let c = self.input.peek();
let nc = self.input.peek_nth(1);
2024-01-24 16:14:52 +00:00
debug_assert!(is_break(c));
if c == '\r' && nc == '\n' {
self.skip_blank();
2015-05-24 06:27:42 +00:00
}
self.skip_nl();
2024-01-24 16:14:52 +00:00
s.push('\n');
2015-05-24 06:27:42 +00:00
}
2023-12-20 23:14:08 +00:00
/// Insert a token at the given position.
2015-05-24 06:27:42 +00:00
fn insert_token(&mut self, pos: usize, tok: Token) {
let old_len = self.tokens.len();
assert!(pos <= old_len);
2023-12-20 23:14:08 +00:00
self.tokens.insert(pos, tok);
2015-05-24 06:27:42 +00:00
}
2015-05-24 06:27:42 +00:00
fn allow_simple_key(&mut self) {
2018-09-15 16:49:04 +00:00
self.simple_key_allowed = true;
2015-05-24 06:27:42 +00:00
}
2015-05-24 06:27:42 +00:00
fn disallow_simple_key(&mut self) {
2018-09-15 16:49:04 +00:00
self.simple_key_allowed = false;
2015-05-24 06:27:42 +00:00
}
/// Fetch the next token in the stream.
///
/// # Errors
/// Returns `ScanError` when the scanner does not find the next expected token.
2015-05-24 06:27:42 +00:00
pub fn fetch_next_token(&mut self) -> ScanResult {
self.input.lookahead(1);
2015-05-24 06:27:42 +00:00
if !self.stream_start_produced {
self.fetch_stream_start();
return Ok(());
}
2023-12-20 22:14:22 +00:00
self.skip_to_next_token()?;
2023-12-30 02:35:43 +00:00
debug_print!(
" \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
self.mark,
2024-04-18 17:25:16 +00:00
self.input.peek()
);
2015-05-24 06:27:42 +00:00
2018-09-16 06:58:48 +00:00
self.stale_simple_keys()?;
2015-05-24 06:27:42 +00:00
let mark = self.mark;
self.unroll_indent(mark.col as isize);
self.input.lookahead(4);
2015-05-24 06:27:42 +00:00
2024-04-18 17:25:16 +00:00
if is_z(self.input.peek()) {
2018-09-16 06:58:48 +00:00
self.fetch_stream_end()?;
2015-05-24 06:27:42 +00:00
return Ok(());
}
if self.mark.col == 0 {
if self.input.next_char_is('%') {
return self.fetch_directive();
} else if self.input.next_is_document_start() {
return self.fetch_document_indicator(TokenType::DocumentStart);
} else if self.input.next_is_document_end() {
self.fetch_document_indicator(TokenType::DocumentEnd)?;
self.skip_ws_to_eol(SkipTabs::Yes)?;
2024-04-18 17:25:16 +00:00
if !is_breakz(self.input.peek()) {
return Err(ScanError::new_str(
self.mark,
"invalid content after document end marker",
));
}
return Ok(());
}
2015-05-24 06:27:42 +00:00
}
if (self.mark.col as isize) < self.indent {
return Err(ScanError::new_str(self.mark, "invalid indentation"));
}
let c = self.input.peek();
let nc = self.input.peek_nth(1);
2015-05-24 06:27:42 +00:00
match c {
'[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
'{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
'}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
2015-05-28 17:56:03 +00:00
',' => self.fetch_flow_entry(),
'-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
'?' if is_blank_or_breakz(nc) => self.fetch_key(),
':' if is_blank_or_breakz(nc) => self.fetch_value(),
':' if self.flow_level > 0
&& (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at) =>
{
self.fetch_flow_value()
}
2015-05-28 17:56:03 +00:00
// Is it an alias?
'*' => self.fetch_anchor(true),
// Is it an anchor?
'&' => self.fetch_anchor(false),
'!' => self.fetch_tag(),
2015-05-26 16:29:40 +00:00
// Is it a literal scalar?
2015-05-28 17:56:03 +00:00
'|' if self.flow_level == 0 => self.fetch_block_scalar(true),
2015-05-26 16:29:40 +00:00
// Is it a folded scalar?
2015-05-28 17:56:03 +00:00
'>' if self.flow_level == 0 => self.fetch_block_scalar(false),
'\'' => self.fetch_flow_scalar(true),
'"' => self.fetch_flow_scalar(false),
2015-05-24 06:27:42 +00:00
// plain scalar
'-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
self.fetch_plain_scalar()
}
2018-09-15 16:49:04 +00:00
'%' | '@' | '`' => Err(ScanError::new(
self.mark,
format!("unexpected character: `{c}'"),
2018-09-15 16:49:04 +00:00
)),
2015-05-28 17:56:03 +00:00
_ => self.fetch_plain_scalar(),
2015-05-24 06:27:42 +00:00
}
}
/// Return the next token in the stream.
/// # Errors
/// Returns `ScanError` when scanning fails to find an expected next token.
2015-05-24 06:27:42 +00:00
pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
if self.stream_end_produced {
return Ok(None);
}
if !self.token_available {
2018-09-16 06:58:48 +00:00
self.fetch_more_tokens()?;
2015-05-24 06:27:42 +00:00
}
let Some(t) = self.tokens.pop_front() else {
return Err(ScanError::new_str(
self.mark,
"did not find expected next token",
));
};
2015-05-24 06:27:42 +00:00
self.token_available = false;
self.tokens_parsed += 1;
if let TokenType::StreamEnd = t.1 {
self.stream_end_produced = true;
2015-05-24 06:27:42 +00:00
}
Ok(Some(t))
}
/// Fetch tokens from the token stream.
/// # Errors
/// Returns `ScanError` when loading fails.
2015-05-24 06:27:42 +00:00
pub fn fetch_more_tokens(&mut self) -> ScanResult {
2015-05-24 06:37:36 +00:00
let mut need_more;
2015-05-24 06:27:42 +00:00
loop {
if self.tokens.is_empty() {
need_more = true;
} else {
2023-12-20 23:14:08 +00:00
need_more = false;
// Stale potential keys that we know won't be keys.
2018-09-16 06:58:48 +00:00
self.stale_simple_keys()?;
// If our next token to be emitted may be a key, fetch more context.
2015-05-24 06:27:42 +00:00
for sk in &self.simple_keys {
if sk.possible && sk.token_number == self.tokens_parsed {
need_more = true;
break;
}
}
}
2018-09-15 16:49:04 +00:00
if !need_more {
break;
}
2018-09-16 06:58:48 +00:00
self.fetch_next_token()?;
2015-05-24 06:27:42 +00:00
}
self.token_available = true;
Ok(())
}
/// Mark simple keys that can no longer be keys as such.
///
/// This function sets `possible` to `false` to each key that, now we have more context, we
/// know will not be keys.
///
/// # Errors
/// This function returns an error if one of the key we would stale was required to be a key.
2015-05-24 06:27:42 +00:00
fn stale_simple_keys(&mut self) -> ScanResult {
2024-02-13 22:10:17 +00:00
for sk in &mut self.simple_keys {
2018-09-15 16:49:04 +00:00
if sk.possible
// If not in a flow construct, simple keys cannot span multiple lines.
&& self.flow_level == 0
&& (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
2018-09-15 16:49:04 +00:00
{
if sk.required {
return Err(ScanError::new_str(self.mark, "simple key expect ':'"));
2015-05-24 06:27:42 +00:00
}
2018-09-15 16:49:04 +00:00
sk.possible = false;
}
2015-05-24 06:27:42 +00:00
}
Ok(())
}
/// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
2023-12-22 14:43:28 +00:00
///
/// # Errors
/// This function returns an error if a tabulation is encountered where there should not be
/// one.
2023-12-20 22:14:22 +00:00
fn skip_to_next_token(&mut self) -> ScanResult {
2015-05-24 06:27:42 +00:00
loop {
// TODO(chenyh) BOM
match self.input.look_ch() {
2023-12-20 22:14:22 +00:00
// Tabs may not be used as indentation.
// "Indentation" only exists as long as a block is started, but does not exist
// inside of flow-style constructs. Tabs are allowed as part of leading
2023-12-20 22:14:22 +00:00
// whitespaces outside of indentation.
// If a flow-style construct is in an indented block, its contents must still be
// indented. Also, tabs are allowed anywhere in it if it has no content.
2023-12-22 14:43:28 +00:00
'\t' if self.is_within_block()
&& self.leading_whitespace
&& (self.mark.col as isize) < self.indent =>
{
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
// If we have content on that line with a tab, return an error.
2024-04-18 17:25:16 +00:00
if !is_breakz(self.input.peek()) {
return Err(ScanError::new_str(
self.mark,
"tabs disallowed within this context (block indentation)",
));
}
2023-12-20 22:14:22 +00:00
}
2024-03-17 09:18:39 +00:00
'\t' | ' ' => self.skip_blank(),
2015-05-24 06:27:42 +00:00
'\n' | '\r' => {
self.input.lookahead(2);
2024-04-16 10:03:42 +00:00
self.skip_linebreak();
2015-05-24 06:27:42 +00:00
if self.flow_level == 0 {
self.allow_simple_key();
}
2018-09-15 16:49:04 +00:00
}
2018-12-13 07:35:01 +00:00
'#' => {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
2018-12-13 07:35:01 +00:00
}
}
2018-09-15 16:49:04 +00:00
_ => break,
2015-05-24 06:27:42 +00:00
}
}
2023-12-20 22:14:22 +00:00
Ok(())
2015-05-24 06:27:42 +00:00
}
2023-12-22 15:11:07 +00:00
/// Skip over YAML whitespace (` `, `\n`, `\r`).
///
/// # Errors
2023-12-23 22:25:14 +00:00
/// This function returns an error if no whitespace was found.
2023-12-22 15:11:07 +00:00
fn skip_yaml_whitespace(&mut self) -> ScanResult {
let mut need_whitespace = true;
loop {
match self.input.look_ch() {
2023-12-22 15:11:07 +00:00
' ' => {
self.skip_blank();
2023-12-22 15:11:07 +00:00
need_whitespace = false;
}
'\n' | '\r' => {
self.input.lookahead(2);
2024-04-16 10:03:42 +00:00
self.skip_linebreak();
2023-12-22 15:11:07 +00:00
if self.flow_level == 0 {
self.allow_simple_key();
}
need_whitespace = false;
}
'#' => {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
2023-12-22 15:11:07 +00:00
}
}
_ => break,
}
}
if need_whitespace {
Err(ScanError::new_str(self.mark(), "expected whitespace"))
2023-12-22 15:11:07 +00:00
} else {
Ok(())
}
}
/// Skip yaml whitespace at most up to eol. Also skips comments.
2024-01-19 15:21:56 +00:00
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
2023-12-23 23:02:42 +00:00
let mut encountered_tab = false;
2023-12-25 23:34:29 +00:00
let mut has_yaml_ws = false;
loop {
match self.input.look_ch() {
2023-12-25 23:34:29 +00:00
' ' => {
has_yaml_ws = true;
self.skip_blank();
2023-12-25 23:34:29 +00:00
}
2023-12-23 23:02:42 +00:00
'\t' if skip_tabs != SkipTabs::No => {
encountered_tab = true;
self.skip_blank();
2023-12-23 23:02:42 +00:00
}
2024-01-19 15:21:56 +00:00
// YAML comments must be preceded by whitespace.
'#' if !encountered_tab && !has_yaml_ws => {
return Err(ScanError::new_str(
2024-01-19 15:21:56 +00:00
self.mark,
"comments must be separated from other tokens by whitespace",
));
}
'#' => {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
}
}
_ => break,
}
}
2023-12-23 23:02:42 +00:00
2024-01-19 15:21:56 +00:00
Ok(SkipTabs::Result(encountered_tab, has_yaml_ws))
}
2015-05-24 06:27:42 +00:00
fn fetch_stream_start(&mut self) {
let mark = self.mark;
self.indent = -1;
self.stream_start_produced = true;
self.allow_simple_key();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(mark, TokenType::StreamStart(TEncoding::Utf8)));
self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2015-05-24 06:27:42 +00:00
}
fn fetch_stream_end(&mut self) -> ScanResult {
// force new line
if self.mark.col != 0 {
self.mark.col = 0;
self.mark.line += 1;
}
// If the stream ended, we won't have more context. We can stall all the simple keys we
// had. If one was required, however, that was an error and we must propagate it.
for sk in &mut self.simple_keys {
if sk.required && sk.possible {
return Err(ScanError::new_str(self.mark, "simple key expected"));
}
sk.possible = false;
}
2015-05-24 06:27:42 +00:00
self.unroll_indent(-1);
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.disallow_simple_key();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(self.mark, TokenType::StreamEnd));
2015-05-24 06:27:42 +00:00
Ok(())
}
2015-05-28 14:07:59 +00:00
/// Scan a directive and queue the resulting token.
///
/// The indentation is unrolled to `-1`, the current simple key is removed and simple keys are
/// disallowed before the directive itself is scanned.
///
/// # Errors
/// Propagates errors from `remove_simple_key` and `scan_directive`.
fn fetch_directive(&mut self) -> ScanResult {
    self.unroll_indent(-1);
    self.remove_simple_key()?;
    self.disallow_simple_key();

    self.scan_directive()
        .map(|directive| self.tokens.push_back(directive))
}
fn scan_directive(&mut self) -> Result<Token, ScanError> {
let start_mark = self.mark;
self.skip_non_blank();
2015-05-28 14:07:59 +00:00
2018-09-16 06:58:48 +00:00
let name = self.scan_directive_name()?;
2015-05-28 14:07:59 +00:00
let tok = match name.as_ref() {
2018-09-16 06:58:48 +00:00
"YAML" => self.scan_version_directive_value(&start_mark)?,
"TAG" => self.scan_tag_directive_value(&start_mark)?,
2015-05-28 18:57:41 +00:00
// XXX This should be a warning instead of an error
2015-05-30 10:49:54 +00:00
_ => {
// skip current line
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
2015-05-30 10:49:54 +00:00
}
// XXX return an empty TagDirective token
2018-09-15 16:49:04 +00:00
Token(
start_mark,
TokenType::TagDirective(String::new(), String::new()),
)
// return Err(ScanError::new_str(start_mark,
2015-05-30 10:49:54 +00:00
// "while scanning a directive, found unknown directive name"))
}
2015-05-28 14:07:59 +00:00
};
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
2015-05-28 14:07:59 +00:00
2024-04-18 17:25:16 +00:00
if is_breakz(self.input.peek()) {
self.input.lookahead(2);
2024-04-16 10:03:42 +00:00
self.skip_linebreak();
Ok(tok)
} else {
Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a directive, did not find expected comment or line break",
2024-04-16 10:03:42 +00:00
))
2015-05-28 14:07:59 +00:00
}
}
fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
while is_blank(self.input.look_ch()) {
self.skip_blank();
2015-05-28 14:07:59 +00:00
}
2018-09-16 06:58:48 +00:00
let major = self.scan_version_directive_number(mark)?;
2015-05-28 14:07:59 +00:00
2024-04-18 17:25:16 +00:00
if self.input.peek() != '.' {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while scanning a YAML directive, did not find expected digit or '.' character",
));
2015-05-28 14:07:59 +00:00
}
self.skip_non_blank();
2015-05-28 14:07:59 +00:00
2018-09-16 06:58:48 +00:00
let minor = self.scan_version_directive_number(mark)?;
2015-05-28 14:07:59 +00:00
Ok(Token(*mark, TokenType::VersionDirective(major, minor)))
2015-05-28 14:07:59 +00:00
}
fn scan_directive_name(&mut self) -> Result<String, ScanError> {
let start_mark = self.mark;
let mut string = String::new();
while is_alpha(self.input.look_ch()) {
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2015-05-28 14:07:59 +00:00
}
if string.is_empty() {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a directive, could not find expected directive name",
));
2015-05-28 14:07:59 +00:00
}
2024-04-18 17:25:16 +00:00
if !is_blank_or_breakz(self.input.peek()) {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a directive, found unexpected non-alphabetical character",
));
2015-05-28 14:07:59 +00:00
}
Ok(string)
}
fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
let mut val = 0u32;
let mut length = 0usize;
while let Some(digit) = self.input.look_ch().to_digit(10) {
2015-05-28 14:07:59 +00:00
if length + 1 > 9 {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while scanning a YAML directive, found extremely long version number",
));
2015-05-28 14:07:59 +00:00
}
length += 1;
2024-01-19 15:21:56 +00:00
val = val * 10 + digit;
self.skip_non_blank();
2015-05-28 14:07:59 +00:00
}
if length == 0 {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while scanning a YAML directive, did not find expected version number",
));
2015-05-28 14:07:59 +00:00
}
Ok(val)
}
fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
2015-05-28 18:57:41 +00:00
/* Eat whitespaces. */
while is_blank(self.input.look_ch()) {
self.skip_blank();
2015-05-28 18:57:41 +00:00
}
2018-09-16 06:58:48 +00:00
let handle = self.scan_tag_handle(true, mark)?;
2015-05-28 18:57:41 +00:00
/* Eat whitespaces. */
while is_blank(self.input.look_ch()) {
self.skip_blank();
2015-05-28 18:57:41 +00:00
}
2024-01-22 23:04:46 +00:00
let prefix = self.scan_tag_prefix(mark)?;
2015-05-28 18:57:41 +00:00
self.input.lookahead(1);
2015-05-28 18:57:41 +00:00
2024-04-18 17:25:16 +00:00
if is_blank_or_breakz(self.input.peek()) {
2016-03-10 12:49:02 +00:00
Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
} else {
Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while scanning TAG, did not find expected whitespace or line break",
))
2015-05-28 18:57:41 +00:00
}
2015-05-28 14:07:59 +00:00
}
2015-05-28 17:56:03 +00:00
/// Scan a tag and queue the resulting token.
///
/// A possible simple key is saved first, then simple keys are disallowed.
///
/// # Errors
/// Propagates errors from `scan_tag`.
fn fetch_tag(&mut self) -> ScanResult {
    self.save_simple_key();
    self.disallow_simple_key();

    let tag_token = self.scan_tag()?;
    self.tokens.push_back(tag_token);

    Ok(())
}
fn scan_tag(&mut self) -> Result<Token, ScanError> {
let start_mark = self.mark;
let mut handle = String::new();
2015-05-28 18:57:41 +00:00
let mut suffix;
2015-05-28 17:56:03 +00:00
// Check if the tag is in the canonical form (verbatim).
self.input.lookahead(2);
2015-05-28 17:56:03 +00:00
if self.input.nth_char_is(1, '<') {
2024-01-22 23:04:46 +00:00
suffix = self.scan_verbatim_tag(&start_mark)?;
2015-05-28 17:56:03 +00:00
} else {
2015-06-29 16:31:22 +00:00
// The tag has either the '!suffix' or the '!handle!suffix'
2018-09-16 06:58:48 +00:00
handle = self.scan_tag_handle(false, &start_mark)?;
2015-05-28 17:56:03 +00:00
// Check if it is, indeed, handle.
if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
// A tag handle starting with "!!" is a secondary tag handle.
let is_secondary_handle = handle == "!!";
2024-01-22 23:04:46 +00:00
suffix =
self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?;
2015-05-28 17:56:03 +00:00
} else {
2024-01-22 23:04:46 +00:00
suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?;
2024-06-13 20:05:43 +00:00
"!".clone_into(&mut handle);
2015-05-28 17:56:03 +00:00
// A special case: the '!' tag. Set the handle to '' and the
// suffix to '!'.
if suffix.is_empty() {
2015-05-28 17:56:03 +00:00
handle.clear();
2024-06-13 20:05:43 +00:00
"!".clone_into(&mut suffix);
2015-05-28 17:56:03 +00:00
}
}
}
2024-04-18 17:25:16 +00:00
if is_blank_or_breakz(self.input.look_ch())
|| (self.flow_level > 0 && is_flow(self.input.peek()))
{
2015-05-28 17:56:03 +00:00
// XXX: ex 7.2, an empty scalar can follow a secondary tag
Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
2015-05-28 17:56:03 +00:00
} else {
Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a tag, did not find expected whitespace or line break",
))
2015-05-28 17:56:03 +00:00
}
}
fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new();
if self.input.look_ch() != '!' {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while scanning a tag, did not find expected '!'",
));
2015-05-28 17:56:03 +00:00
}
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2015-05-28 17:56:03 +00:00
while is_alpha(self.input.look_ch()) {
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2015-05-28 17:56:03 +00:00
}
// Check if the trailing character is '!' and copy it.
2024-04-18 17:25:16 +00:00
if self.input.peek() == '!' {
string.push(self.input.peek());
self.skip_non_blank();
} else if directive && string != "!" {
2015-05-28 17:56:03 +00:00
// It's either the '!' tag or not really a tag handle. If it's a %TAG
// directive, it's an error. If it's a tag token, it must be a part of
// URI.
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while parsing a tag directive, did not find expected '!'",
));
2015-05-28 17:56:03 +00:00
}
Ok(string)
}
2024-01-22 23:04:46 +00:00
/// Scan for a tag prefix (6.8.2.2).
///
/// There are 2 kinds of tag prefixes:
/// - Local: Starts with a `!`, contains only URI chars (`!foo`)
/// - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new();
if self.input.look_ch() == '!' {
2024-01-22 23:04:46 +00:00
// If we have a local tag, insert and skip `!`.
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2024-04-18 17:25:16 +00:00
} else if !is_tag_char(self.input.peek()) {
2024-01-22 23:04:46 +00:00
// Otherwise, check if the first global tag character is valid.
return Err(ScanError::new_str(
*start_mark,
"invalid global tag character",
));
2024-04-18 17:25:16 +00:00
} else if self.input.peek() == '%' {
2024-01-22 23:04:46 +00:00
// If it is valid and an escape sequence, escape it.
string.push(self.scan_uri_escapes(start_mark)?);
} else {
// Otherwise, push the first character.
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2024-01-22 23:04:46 +00:00
}
while is_uri_char(self.input.look_ch()) {
2024-04-18 17:25:16 +00:00
if self.input.peek() == '%' {
2024-01-22 23:04:46 +00:00
string.push(self.scan_uri_escapes(start_mark)?);
} else {
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2024-01-22 23:04:46 +00:00
}
}
Ok(string)
}
/// Scan for a verbatim tag.
///
/// The prefixing `!<` must _not_ have been skipped.
fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
// Eat `!<`
self.skip_non_blank();
self.skip_non_blank();
2024-01-22 23:04:46 +00:00
let mut string = String::new();
while is_uri_char(self.input.look_ch()) {
2024-04-18 17:25:16 +00:00
if self.input.peek() == '%' {
2024-01-22 23:04:46 +00:00
string.push(self.scan_uri_escapes(start_mark)?);
} else {
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2024-01-22 23:04:46 +00:00
}
}
2024-04-18 17:25:16 +00:00
if self.input.peek() != '>' {
return Err(ScanError::new_str(
2024-01-22 23:04:46 +00:00
*start_mark,
"while scanning a verbatim tag, did not find the expected '>'",
));
}
self.skip_non_blank();
2024-01-22 23:04:46 +00:00
Ok(string)
}
fn scan_tag_shorthand_suffix(
2018-09-15 16:49:04 +00:00
&mut self,
2024-01-22 23:04:46 +00:00
_directive: bool,
2018-09-15 16:49:04 +00:00
_is_secondary: bool,
head: &str,
mark: &Marker,
) -> Result<String, ScanError> {
2015-05-28 17:56:03 +00:00
let mut length = head.len();
let mut string = String::new();
// Copy the head if needed.
// Note that we don't copy the leading '!' character.
if length > 1 {
string.extend(head.chars().skip(1));
}
while is_tag_char(self.input.look_ch()) {
2015-05-28 17:56:03 +00:00
// Check if it is a URI-escape sequence.
2024-04-18 17:25:16 +00:00
if self.input.peek() == '%' {
2024-01-22 23:04:46 +00:00
string.push(self.scan_uri_escapes(mark)?);
2015-05-28 17:56:03 +00:00
} else {
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2015-05-28 17:56:03 +00:00
}
length += 1;
}
if length == 0 {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while parsing a tag, did not find expected tag URI",
));
2015-05-28 17:56:03 +00:00
}
Ok(string)
}
2024-01-22 23:04:46 +00:00
fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
2015-05-30 10:49:54 +00:00
let mut width = 0usize;
let mut code = 0u32;
loop {
self.input.lookahead(3);
let c = self.input.peek_nth(1);
let nc = self.input.peek_nth(2);
2015-05-30 10:49:54 +00:00
2024-04-18 17:25:16 +00:00
if !(self.input.peek() == '%' && is_hex(c) && is_hex(nc)) {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while parsing a tag, found an invalid escape sequence",
2018-09-15 16:49:04 +00:00
));
2015-05-30 10:49:54 +00:00
}
let byte = (as_hex(c) << 4) + as_hex(nc);
2015-05-30 10:49:54 +00:00
if width == 0 {
width = match byte {
_ if byte & 0x80 == 0x00 => 1,
_ if byte & 0xE0 == 0xC0 => 2,
_ if byte & 0xF0 == 0xE0 => 3,
_ if byte & 0xF8 == 0xF0 => 4,
2015-05-30 10:49:54 +00:00
_ => {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while parsing a tag, found an incorrect leading UTF-8 byte",
2018-09-15 16:49:04 +00:00
));
2015-05-30 10:49:54 +00:00
}
};
code = byte;
2015-05-30 10:49:54 +00:00
} else {
if byte & 0xc0 != 0x80 {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while parsing a tag, found an incorrect trailing UTF-8 byte",
2018-09-15 16:49:04 +00:00
));
2015-05-30 10:49:54 +00:00
}
code = (code << 8) + byte;
2015-05-30 10:49:54 +00:00
}
self.skip_n_non_blank(3);
2015-05-30 10:49:54 +00:00
width -= 1;
if width == 0 {
break;
}
}
match char::from_u32(code) {
Some(ch) => Ok(ch),
None => Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
*mark,
"while parsing a tag, found an invalid UTF-8 codepoint",
)),
2015-05-30 10:49:54 +00:00
}
}
2015-05-28 17:56:03 +00:00
/// Scan an anchor (`alias == false`) or an alias (`alias == true`) and queue its token.
///
/// A possible simple key is saved first, then simple keys are disallowed.
///
/// # Errors
/// Propagates errors from `scan_anchor`.
fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
    self.save_simple_key();
    self.disallow_simple_key();

    self.scan_anchor(alias)
        .map(|token| self.tokens.push_back(token))
}
2018-09-15 16:49:04 +00:00
fn scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError> {
2015-05-28 17:56:03 +00:00
let mut string = String::new();
let start_mark = self.mark;
self.skip_non_blank();
while is_anchor_char(self.input.look_ch()) {
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
2015-05-28 17:56:03 +00:00
}
2023-12-28 00:48:19 +00:00
if string.is_empty() {
return Err(ScanError::new_str(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
2015-05-28 17:56:03 +00:00
}
if alias {
Ok(Token(start_mark, TokenType::Alias(string)))
2015-05-28 17:56:03 +00:00
} else {
Ok(Token(start_mark, TokenType::Anchor(string)))
2015-05-28 17:56:03 +00:00
}
}
2018-09-15 16:49:04 +00:00
fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
2015-05-24 06:27:42 +00:00
// The indicators '[' and '{' may start a simple key.
self.save_simple_key();
2015-05-24 06:27:42 +00:00
self.roll_one_col_indent();
2018-09-15 10:33:26 +00:00
self.increase_flow_level()?;
2015-05-24 06:27:42 +00:00
self.allow_simple_key();
let start_mark = self.mark;
self.skip_non_blank();
2015-05-24 06:27:42 +00:00
2023-12-26 18:11:17 +00:00
if tok == TokenType::FlowMappingStart {
self.flow_mapping_started = true;
} else {
self.implicit_flow_mapping_states
.push(ImplicitMappingState::Possible);
2023-12-26 18:11:17 +00:00
}
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
2015-05-24 06:27:42 +00:00
self.tokens.push_back(Token(start_mark, tok));
Ok(())
}
2018-09-15 16:49:04 +00:00
fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.decrease_flow_level();
self.disallow_simple_key();
if matches!(tok, TokenType::FlowSequenceEnd) {
self.end_implicit_mapping(self.mark);
// We are out exiting the flow sequence, nesting goes down 1 level.
self.implicit_flow_mapping_states.pop();
}
2023-12-26 18:11:17 +00:00
2015-05-24 06:27:42 +00:00
let start_mark = self.mark;
self.skip_non_blank();
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
2015-05-24 06:27:42 +00:00
// A flow collection within a flow mapping can be a key. In that case, the value may be
// adjacent to the `:`.
// ```yaml
// - [ {a: b}:value ]
// ```
if self.flow_level > 0 {
self.adjacent_value_allowed_at = self.mark.index;
}
2015-05-24 06:27:42 +00:00
self.tokens.push_back(Token(start_mark, tok));
Ok(())
}
/// Push the `FlowEntry` token and skip over the `,`.
2015-05-24 06:27:42 +00:00
fn fetch_flow_entry(&mut self) -> ScanResult {
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.allow_simple_key();
2023-12-26 18:11:17 +00:00
self.end_implicit_mapping(self.mark);
2015-05-24 06:27:42 +00:00
let start_mark = self.mark;
self.skip_non_blank();
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
2015-05-24 06:27:42 +00:00
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(start_mark, TokenType::FlowEntry));
2015-05-24 06:27:42 +00:00
Ok(())
}
2018-09-15 10:33:26 +00:00
fn increase_flow_level(&mut self) -> ScanResult {
2018-09-15 16:49:04 +00:00
self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
self.flow_level = self
.flow_level
.checked_add(1)
.ok_or_else(|| ScanError::new_str(self.mark, "recursion limit exceeded"))?;
2018-09-15 10:33:26 +00:00
Ok(())
2015-05-24 06:27:42 +00:00
}
2015-05-24 06:27:42 +00:00
/// Leave one level of flow collection nesting, if we are inside any.
///
/// The simple key belonging to the level is discarded along with it.
///
/// # Panics
/// Panics if the flow level is positive while the simple key stack is empty.
fn decrease_flow_level(&mut self) {
    let inside_flow = self.flow_level > 0;
    if inside_flow {
        self.flow_level -= 1;
        // `increase_flow_level` pushes one simple key per level; drop the matching one.
        self.simple_keys.pop().unwrap();
    }
}
/// Push the `Block*` token(s) and skip over the `-`.
///
/// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
/// `BlockEntry` token.
/// This function only skips over the `-` and does not fetch the entry value.
2015-05-24 06:27:42 +00:00
fn fetch_block_entry(&mut self) -> ScanResult {
if self.flow_level > 0 {
2015-05-24 06:27:42 +00:00
// - * only allowed in block
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
self.mark,
r#""-" is only valid inside a block"#,
));
2015-05-24 06:27:42 +00:00
}
// Check if we are allowed to start a new entry.
if !self.simple_key_allowed {
return Err(ScanError::new_str(
self.mark,
"block sequence entries are not allowed in this context",
));
}
2015-05-24 06:27:42 +00:00
// ???, fixes test G9HC.
if let Some(Token(mark, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
if self.mark.col == 0 && mark.col == 0 && self.indent > -1 {
return Err(ScanError::new_str(*mark, "invalid indentation for anchor"));
}
}
// Skip over the `-`.
let mark = self.mark;
self.skip_non_blank();
2015-05-24 06:27:42 +00:00
// generate BLOCK-SEQUENCE-START if indented
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2024-01-19 15:21:56 +00:00
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
self.input.lookahead(2);
if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
{
return Err(ScanError::new_str(
2023-12-23 23:02:42 +00:00
self.mark,
"'-' must be followed by a valid YAML whitespace",
));
2023-12-23 22:25:14 +00:00
}
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::No)?;
2024-04-18 17:25:16 +00:00
if is_break(self.input.look_ch()) || is_flow(self.input.peek()) {
2023-12-26 17:06:20 +00:00
self.roll_one_col_indent();
}
self.remove_simple_key()?;
self.allow_simple_key();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(self.mark, TokenType::BlockEntry));
2015-05-24 06:27:42 +00:00
Ok(())
}
2015-06-29 16:31:22 +00:00
2015-05-24 06:27:42 +00:00
fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
self.unroll_indent(-1);
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.disallow_simple_key();
let mark = self.mark;
self.skip_n_non_blank(3);
2015-05-24 06:27:42 +00:00
self.tokens.push_back(Token(mark, t));
Ok(())
}
2015-05-26 16:29:40 +00:00
/// Scan a block scalar and queue the resulting token.
///
/// `literal` is forwarded to `scan_block_scalar` and selects the literal style (`true`) or the
/// folded style (`false`). A possible simple key is saved first, then simple keys are allowed.
///
/// # Errors
/// Propagates errors from `scan_block_scalar`.
fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
    self.save_simple_key();
    self.allow_simple_key();

    let scalar = self.scan_block_scalar(literal)?;
    self.tokens.push_back(scalar);

    Ok(())
}
2023-08-11 23:54:46 +00:00
/// Scan a block scalar (`|` literal or `>` folded) and return it as a `Scalar` token.
///
/// `literal` selects the style: `true` yields `TScalarStyle::Literal`, `false` yields
/// `TScalarStyle::Folded`. The header may carry a chomping indicator (`+` keep, `-` strip) and
/// a single-digit indentation indicator, in either order. Only blanks and a comment may follow
/// them on the header line.
///
/// # Errors
/// Fails when the indentation indicator is `0`, when the header is followed by something other
/// than blanks, a comment or a line break, when the content starts with a tab, or when the
/// first content line is indented less than the scalar but more than the enclosing block.
#[allow(clippy::too_many_lines)]
2015-05-26 16:29:40 +00:00
fn scan_block_scalar(&mut self, literal: bool) -> Result<Token, ScanError> {
let start_mark = self.mark;
2024-01-19 19:21:36 +00:00
let mut chomping = Chomping::Clip;
2015-05-26 16:29:40 +00:00
// Value of the explicit indentation indicator (0 when absent).
let mut increment: usize = 0;
// Column at which content lines start (0 until it is known).
let mut indent: usize = 0;
2015-05-28 09:18:20 +00:00
let mut trailing_blank: bool;
2015-05-26 16:29:40 +00:00
let mut leading_blank: bool = false;
2024-01-19 19:21:36 +00:00
let style = if literal {
TScalarStyle::Literal
} else {
TScalarStyle::Folded
};
2015-05-26 16:29:40 +00:00
let mut string = String::new();
// Line break that ended the previous content line.
let mut leading_break = String::new();
// Line breaks of the empty lines that followed the previous content line.
let mut trailing_breaks = String::new();
2024-01-19 19:21:36 +00:00
// Line break that ended the header line.
let mut chomping_break = String::new();
2015-05-26 16:29:40 +00:00
// skip '|' or '>'
self.skip_non_blank();
2023-12-26 17:06:20 +00:00
self.unroll_non_block_indents();
2015-05-26 16:29:40 +00:00
2024-04-18 17:25:16 +00:00
// Header form 1: chomping indicator first, optionally followed by an indentation indicator.
if self.input.look_ch() == '+' || self.input.peek() == '-' {
if self.input.peek() == '+' {
2024-01-19 19:21:36 +00:00
chomping = Chomping::Keep;
2015-05-26 16:29:40 +00:00
} else {
2024-01-19 19:21:36 +00:00
chomping = Chomping::Strip;
2015-05-26 16:29:40 +00:00
}
self.skip_non_blank();
if is_digit(self.input.look_ch()) {
2024-04-18 17:25:16 +00:00
if self.input.peek() == '0' {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a block scalar, found an indentation indicator equal to 0",
2018-09-15 16:49:04 +00:00
));
2015-05-26 16:29:40 +00:00
}
2024-04-18 17:25:16 +00:00
increment = (self.input.peek() as usize) - ('0' as usize);
self.skip_non_blank();
2015-05-26 16:29:40 +00:00
}
2024-04-18 17:25:16 +00:00
// Header form 2: indentation indicator first, optionally followed by a chomping indicator.
} else if is_digit(self.input.peek()) {
if self.input.peek() == '0' {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a block scalar, found an indentation indicator equal to 0",
2018-09-15 16:49:04 +00:00
));
2015-05-26 16:29:40 +00:00
}
2024-04-18 17:25:16 +00:00
increment = (self.input.peek() as usize) - ('0' as usize);
self.skip_non_blank();
self.input.lookahead(1);
2024-04-18 17:25:16 +00:00
if self.input.peek() == '+' || self.input.peek() == '-' {
if self.input.peek() == '+' {
2024-01-19 19:21:36 +00:00
chomping = Chomping::Keep;
2015-05-26 16:29:40 +00:00
} else {
2024-01-19 19:21:36 +00:00
chomping = Chomping::Strip;
2015-05-26 16:29:40 +00:00
}
self.skip_non_blank();
2015-05-26 16:29:40 +00:00
}
}
2024-01-19 15:21:56 +00:00
// Blanks and a comment may follow the indicators on the header line.
self.skip_ws_to_eol(SkipTabs::Yes)?;
2015-05-26 16:29:40 +00:00
// Check if we are at the end of the line.
if !is_breakz(self.input.look_ch()) {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a block scalar, did not find expected comment or line break",
));
2015-05-26 16:29:40 +00:00
}
2024-04-18 17:25:16 +00:00
if is_break(self.input.peek()) {
self.input.lookahead(2);
2024-01-19 19:21:36 +00:00
self.read_break(&mut chomping_break);
2015-05-26 16:29:40 +00:00
}
if self.input.look_ch() == '\t' {
return Err(ScanError::new_str(
2023-12-22 14:43:28 +00:00
start_mark,
"a block scalar content cannot start with a tab",
));
}
2015-05-26 16:29:40 +00:00
// An explicit indentation indicator is relative to the current indentation level, when
// there is one (`self.indent >= 0`).
if increment > 0 {
2018-09-15 16:49:04 +00:00
indent = if self.indent >= 0 {
(self.indent + increment as isize) as usize
} else {
increment
}
2015-05-26 16:29:40 +00:00
}
2023-12-21 19:02:56 +00:00
2015-05-26 16:29:40 +00:00
// Scan the leading line breaks and determine the indentation level if needed.
2023-12-21 19:02:56 +00:00
if indent == 0 {
self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
} else {
self.skip_block_scalar_indent(indent, &mut trailing_breaks);
}
2015-06-29 16:31:22 +00:00
2024-01-19 19:21:36 +00:00
// We have an end-of-stream with no content, e.g.:
// ```yaml
// - |+
// ```
2024-04-18 17:25:16 +00:00
if is_z(self.input.peek()) {
2024-01-19 19:21:36 +00:00
let contents = match chomping {
// We strip trailing linebreaks. Nothing remain.
Chomping::Strip => String::new(),
// There was no newline after the chomping indicator.
_ if self.mark.line == start_mark.line() => String::new(),
// We clip lines, and there was a newline after the chomping indicator.
// All other breaks are ignored.
Chomping::Clip => chomping_break,
// We keep lines. There was a newline after the chomping indicator but nothing
// else.
Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
// Otherwise, the newline after chomping is ignored.
Chomping::Keep => trailing_breaks,
};
return Ok(Token(start_mark, TokenType::Scalar(style, contents)));
}
2015-05-26 16:29:40 +00:00
2024-01-22 22:09:20 +00:00
// The first content line sits between the enclosing indentation and the scalar's own
// indentation: it belongs to neither.
if self.mark.col < indent && (self.mark.col as isize) > self.indent {
return Err(ScanError::new_str(
2024-01-22 22:09:20 +00:00
self.mark,
"wrongly indented line in block scalar",
));
}
2015-05-26 16:29:40 +00:00
let mut line_buffer = String::with_capacity(100);
2024-01-22 22:09:20 +00:00
// From here on, `start_mark` is the position reached after the leading indentation and
// empty lines were skipped; it is the mark stored in the returned token.
let start_mark = self.mark;
2024-04-18 17:25:16 +00:00
// Read content lines for as long as they start exactly at `indent`.
while self.mark.col == indent && !is_z(self.input.peek()) {
// At column 0, a document end marker terminates the scalar.
if indent == 0 {
self.input.lookahead(4);
if self.input.next_is_document_end() {
break;
}
}
2024-01-19 19:21:36 +00:00
// We are at the first content character of a content line.
2024-04-18 17:25:16 +00:00
trailing_blank = is_blank(self.input.peek());
2018-09-15 16:49:04 +00:00
// Folding: in folded style, when neither this line nor the previous one starts with a
// blank, the break that ended the previous line is dropped. It is replaced by the breaks
// of the empty lines in between, or by a single space when there are none.
if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
string.push_str(&trailing_breaks);
2018-09-15 16:49:04 +00:00
if trailing_breaks.is_empty() {
string.push(' ');
}
2015-05-26 16:29:40 +00:00
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
string.push_str(&trailing_breaks);
2015-05-26 16:29:40 +00:00
}
leading_break.clear();
2015-05-26 16:29:40 +00:00
trailing_breaks.clear();
2024-04-18 17:25:16 +00:00
leading_blank = is_blank(self.input.peek());
2015-05-26 16:29:40 +00:00
self.scan_block_scalar_content_line(&mut string, &mut line_buffer);
2015-09-15 07:27:32 +00:00
// break on EOF
2024-04-18 17:25:16 +00:00
if is_z(self.input.peek()) {
2018-09-15 16:49:04 +00:00
break;
}
2015-05-26 16:29:40 +00:00
self.input.lookahead(2);
2015-09-15 07:27:32 +00:00
self.read_break(&mut leading_break);
2015-05-26 16:29:40 +00:00
// Eat the following indentation spaces and line breaks.
2023-12-21 19:02:56 +00:00
self.skip_block_scalar_indent(indent, &mut trailing_breaks);
2015-05-26 16:29:40 +00:00
}
// Chomp the tail.
2024-01-19 19:21:36 +00:00
if chomping != Chomping::Strip {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
2024-01-19 19:21:36 +00:00
// If we had reached an eof but the last character wasn't an end-of-line, check if the
// last line was indented at least as the rest of the scalar, then we need to consider
// there is a newline.
2024-04-18 17:25:16 +00:00
if is_z(self.input.peek()) && self.mark.col >= indent.max(1) {
2024-01-19 19:21:36 +00:00
string.push('\n');
}
2015-05-26 16:29:40 +00:00
}
2024-01-19 19:21:36 +00:00
// Only the "keep" chomping mode retains the trailing empty lines.
if chomping == Chomping::Keep {
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2015-05-26 16:29:40 +00:00
}
2024-01-19 19:21:36 +00:00
Ok(Token(start_mark, TokenType::Scalar(style, string)))
2015-05-26 16:29:40 +00:00
}
/// Retrieve the contents of the line, parsing it as a block scalar.
///
/// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
/// store bytes before pushing them to `string` and thus avoiding reallocating more than
/// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
/// `clear`ed before the end of the function.
///
/// This function assumed the first character to read is the first content character in the
/// line. This function does not consume the line break character(s) after the line.
fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
// Start by evaluating characters in the buffer.
2024-04-18 17:25:16 +00:00
while !self.input.buf_is_empty() && !is_breakz(self.input.peek()) {
string.push(self.input.peek());
// We may technically skip non-blank characters. However, the only distinction is
// to determine what is leading whitespace and what is not. Here, we read the
// contents of the line until either eof or a linebreak. We know we will not read
// `self.leading_whitespace` until the end of the line, where it will be reset.
// This allows us to call a slightly less expensive function.
self.skip_blank();
}
// All characters that were in the buffer were consumed. We need to check if more
// follow.
if self.input.buf_is_empty() {
// We will read all consecutive non-breakz characters. We push them into a
// temporary buffer. The main difference with going through `self.buffer` is that
// characters are appended here as their real size (1B for ascii, or up to 4 bytes for
// UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
// (using `String::push_str`).
2024-04-18 17:25:16 +00:00
let mut c = self.input.raw_read_ch();
while !is_breakz(c) {
line_buffer.push(c);
2024-04-18 17:25:16 +00:00
c = self.input.raw_read_ch();
}
// Our last character read is stored in `c`. It is either an EOF or a break. In any
// case, we need to push it back into `self.buffer` so it may be properly read
// after. We must not insert it in `string`.
self.input.push_back(c);
// We need to manually update our position; we haven't called a `skip` function.
self.mark.col += line_buffer.len();
self.mark.index += line_buffer.len();
// We can now append our bytes to our `string`.
string.reserve(line_buffer.as_bytes().len());
string.push_str(line_buffer);
// This clears the _contents_ without touching the _capacity_.
line_buffer.clear();
}
}
2023-12-21 19:02:56 +00:00
/// Skip the block scalar indentation and empty lines.
fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
loop {
// Consume all spaces. Tabs cannot be used as indentation.
if indent < self.input.bufmaxlen() - 2 {
self.input.lookahead(self.input.bufmaxlen());
2024-04-18 17:25:16 +00:00
while self.mark.col < indent && self.input.peek() == ' ' {
self.skip_blank();
}
} else {
loop {
self.input.lookahead(self.input.bufmaxlen());
2024-04-18 17:25:16 +00:00
while !self.input.buf_is_empty()
&& self.mark.col < indent
&& self.input.peek() == ' '
{
self.skip_blank();
}
// If we reached our indent, we can break. We must also break if we have
// reached content or EOF; that is, the buffer is not empty and the next
// character is not a space.
2024-04-18 17:25:16 +00:00
if self.mark.col == indent
|| (!self.input.buf_is_empty() && self.input.peek() != ' ')
{
break;
}
}
self.input.lookahead(2);
2023-12-21 19:02:56 +00:00
}
// If our current line is empty, skip over the break and continue looping.
2024-04-18 17:25:16 +00:00
if is_break(self.input.peek()) {
2023-12-21 19:02:56 +00:00
self.read_break(breaks);
} else {
// Otherwise, we have a content line. Return control.
break;
}
}
}
/// Determine the indentation level for a block scalar from the first line of its contents.
///
/// The function skips over whitespace-only lines and sets `indent` to the the longest
/// whitespace line that was encountered.
fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
2015-05-26 16:29:40 +00:00
let mut max_indent = 0;
loop {
2023-11-19 16:08:28 +00:00
// Consume all spaces. Tabs cannot be used as indentation.
while self.input.look_ch() == ' ' {
self.skip_blank();
2015-05-26 16:29:40 +00:00
}
if self.mark.col > max_indent {
max_indent = self.mark.col;
}
2024-04-18 17:25:16 +00:00
if is_break(self.input.peek()) {
2023-12-21 19:02:56 +00:00
// If our current line is empty, skip over the break and continue looping.
self.input.lookahead(2);
2023-12-21 19:02:56 +00:00
self.read_break(breaks);
} else {
// Otherwise, we have a content line. Return control.
2015-05-26 16:29:40 +00:00
break;
}
}
// In case a yaml looks like:
// ```yaml
// |
// foo
// bar
// ```
// We need to set the indent to 0 and not 1. In all other cases, the indent must be at
// least 1. When in the above example, `self.indent` will be set to -1.
*indent = max_indent.max((self.indent + 1) as usize);
if self.indent > 0 {
*indent = (*indent).max(1);
}
2015-05-26 16:29:40 +00:00
}
2015-05-24 19:21:53 +00:00
/// Scan a quoted scalar and queue the resulting token.
///
/// `single` is forwarded to `scan_flow_scalar`. After the scalar, the scanner moves on to the
/// next token and records that position as one where an adjacent value is allowed.
///
/// # Errors
/// Propagates errors from `scan_flow_scalar` and `skip_to_next_token`.
fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
    self.save_simple_key();
    self.disallow_simple_key();

    let scalar = self.scan_flow_scalar(single)?;

    // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
    // YAML allows the following value to be specified adjacent to the “:”.
    self.skip_to_next_token()?;
    let next_token_index = self.mark.index;
    self.adjacent_value_allowed_at = next_token_index;

    self.tokens.push_back(scalar);
    Ok(())
}
2023-08-11 23:54:46 +00:00
#[allow(clippy::too_many_lines)]
2015-05-24 19:21:53 +00:00
fn scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError> {
2015-05-24 06:27:42 +00:00
let start_mark = self.mark;
let mut string = String::new();
let mut leading_break = String::new();
let mut trailing_breaks = String::new();
let mut whitespaces = String::new();
2015-05-28 09:18:20 +00:00
let mut leading_blanks;
2015-05-24 06:27:42 +00:00
2015-05-24 19:21:53 +00:00
/* Eat the left quote. */
self.skip_non_blank();
2015-05-24 19:21:53 +00:00
2015-05-24 06:27:42 +00:00
loop {
/* Check for a document indicator. */
self.input.lookahead(4);
if self.mark.col == 0 && self.input.next_is_document_indicator() {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a quoted scalar, found unexpected document indicator",
));
}
2015-05-24 06:27:42 +00:00
2024-04-18 17:25:16 +00:00
if is_z(self.input.peek()) {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"while scanning a quoted scalar, found unexpected end of stream",
));
2015-05-24 19:21:53 +00:00
}
2024-01-22 22:03:02 +00:00
if (self.mark.col as isize) < self.indent {
return Err(ScanError::new_str(
2024-01-22 22:09:20 +00:00
start_mark,
2024-03-25 11:01:58 +00:00
"invalid indentation in quoted scalar",
2024-01-22 22:09:20 +00:00
));
2024-01-22 22:03:02 +00:00
}
2015-05-24 19:21:53 +00:00
leading_blanks = false;
2024-01-18 18:15:19 +00:00
self.consume_flow_scalar_non_whitespace_chars(
single,
&mut string,
&mut leading_blanks,
&start_mark,
)?;
2015-05-24 19:21:53 +00:00
match self.input.look_ch() {
2018-09-15 16:49:04 +00:00
'\'' if single => break,
'"' if !single => break,
2015-05-24 19:21:53 +00:00
_ => {}
}
// Consume blank characters.
2024-04-18 17:25:16 +00:00
while is_blank(self.input.peek()) || is_break(self.input.peek()) {
if is_blank(self.input.peek()) {
2015-05-24 19:21:53 +00:00
// Consume a space or a tab character.
2016-03-10 12:49:02 +00:00
if leading_blanks {
2024-04-18 17:25:16 +00:00
if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent {
return Err(ScanError::new_str(
2023-12-26 17:06:20 +00:00
self.mark,
"tab cannot be used as indentation",
));
}
self.skip_blank();
2015-05-24 19:21:53 +00:00
} else {
2024-04-18 17:25:16 +00:00
whitespaces.push(self.input.peek());
self.skip_blank();
2015-05-24 19:21:53 +00:00
}
} else {
self.input.lookahead(2);
2015-05-24 19:21:53 +00:00
// Check if it is a first line break.
2016-03-10 12:49:02 +00:00
if leading_blanks {
self.read_break(&mut trailing_breaks);
} else {
2015-05-24 19:21:53 +00:00
whitespaces.clear();
self.read_break(&mut leading_break);
leading_blanks = true;
}
}
self.input.lookahead(1);
2015-05-24 19:21:53 +00:00
}
2024-01-18 18:15:19 +00:00
2015-05-24 19:21:53 +00:00
// Join the whitespaces or fold line breaks.
if leading_blanks {
2016-03-10 12:49:02 +00:00
if leading_break.is_empty() {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
string.push_str(&trailing_breaks);
2016-03-10 12:49:02 +00:00
trailing_breaks.clear();
leading_break.clear();
} else {
2015-05-24 19:21:53 +00:00
if trailing_breaks.is_empty() {
string.push(' ');
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2015-05-24 19:21:53 +00:00
trailing_breaks.clear();
}
leading_break.clear();
}
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&whitespaces);
2015-05-24 19:21:53 +00:00
whitespaces.clear();
}
} // loop
// Eat the right quote.
self.skip_non_blank();
// Ensure there is no invalid trailing content.
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
2024-04-18 17:25:16 +00:00
match self.input.peek() {
// These can be encountered in flow sequences or mappings.
',' | '}' | ']' if self.flow_level > 0 => {}
// An end-of-line / end-of-stream is fine. No trailing content.
c if is_breakz(c) => {}
// ':' can be encountered if our scalar is a key.
// Outside of flow contexts, keys cannot span multiple lines
':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
// Inside a flow context, this is allowed.
':' if self.flow_level > 0 => {}
_ => {
return Err(ScanError::new_str(
self.mark,
"invalid trailing content after double-quoted scalar",
));
}
}
2015-05-24 19:21:53 +00:00
2024-01-18 18:15:19 +00:00
let style = if single {
TScalarStyle::SingleQuoted
2015-05-24 19:21:53 +00:00
} else {
2024-01-18 18:15:19 +00:00
TScalarStyle::DoubleQuoted
};
Ok(Token(start_mark, TokenType::Scalar(style, string)))
}
/// Consume successive non-whitespace characters from a flow scalar.
///
/// This function resolves escape sequences and stops upon encountering a whitespace, the end
/// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
/// for double quoted scalars).
///
/// # Errors
/// Return an error if an invalid escape sequence is found.
fn consume_flow_scalar_non_whitespace_chars(
&mut self,
single: bool,
string: &mut String,
leading_blanks: &mut bool,
start_mark: &Marker,
) -> Result<(), ScanError> {
self.input.lookahead(2);
2024-04-18 17:25:16 +00:00
while !is_blank_or_breakz(self.input.peek()) {
match self.input.peek() {
2024-01-18 18:15:19 +00:00
// Check for an escaped single quote.
'\'' if self.input.peek_nth(1) == '\'' && single => {
2024-01-18 18:15:19 +00:00
string.push('\'');
self.skip_n_non_blank(2);
2024-01-18 18:15:19 +00:00
}
// Check for the right quote.
'\'' if single => break,
'"' if !single => break,
// Check for an escaped line break.
'\\' if !single && is_break(self.input.peek_nth(1)) => {
self.input.lookahead(3);
self.skip_non_blank();
2024-04-16 10:03:42 +00:00
self.skip_linebreak();
2024-01-18 18:15:19 +00:00
*leading_blanks = true;
break;
}
// Check for an escape sequence.
'\\' if !single => {
string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
}
c => {
string.push(c);
self.skip_non_blank();
2024-01-18 18:15:19 +00:00
}
}
self.input.lookahead(2);
2024-01-18 18:15:19 +00:00
}
Ok(())
}
/// Escape the sequence we encounter in a flow scalar.
///
2024-04-18 17:25:16 +00:00
/// `self.input.peek()` must point to the `\` starting the escape sequence.
2024-01-18 18:15:19 +00:00
///
/// # Errors
/// Return an error if an invalid escape sequence is found.
fn resolve_flow_scalar_escape_sequence(
&mut self,
start_mark: &Marker,
) -> Result<char, ScanError> {
let mut code_length = 0usize;
let mut ret = '\0';
match self.input.peek_nth(1) {
2024-01-18 18:15:19 +00:00
'0' => ret = '\0',
'a' => ret = '\x07',
'b' => ret = '\x08',
't' | '\t' => ret = '\t',
'n' => ret = '\n',
'v' => ret = '\x0b',
'f' => ret = '\x0c',
'r' => ret = '\x0d',
'e' => ret = '\x1b',
' ' => ret = '\x20',
'"' => ret = '"',
2024-01-22 21:56:18 +00:00
'/' => ret = '/',
2024-01-18 18:15:19 +00:00
'\\' => ret = '\\',
// Unicode next line (#x85)
'N' => ret = char::from_u32(0x85).unwrap(),
// Unicode non-breaking space (#xA0)
'_' => ret = char::from_u32(0xA0).unwrap(),
// Unicode line separator (#x2028)
'L' => ret = char::from_u32(0x2028).unwrap(),
// Unicode paragraph separator (#x2029)
'P' => ret = char::from_u32(0x2029).unwrap(),
'x' => code_length = 2,
'u' => code_length = 4,
'U' => code_length = 8,
_ => {
return Err(ScanError::new_str(
2024-01-18 18:15:19 +00:00
*start_mark,
"while parsing a quoted scalar, found unknown escape character",
))
}
}
self.skip_n_non_blank(2);
2024-01-18 18:15:19 +00:00
// Consume an arbitrary escape code.
if code_length > 0 {
self.input.lookahead(code_length);
2024-01-18 18:15:19 +00:00
let mut value = 0u32;
for i in 0..code_length {
let c = self.input.peek_nth(i);
if !is_hex(c) {
return Err(ScanError::new_str(
2024-01-18 18:15:19 +00:00
*start_mark,
"while parsing a quoted scalar, did not find expected hexadecimal number",
));
}
value = (value << 4) + as_hex(c);
2024-01-18 18:15:19 +00:00
}
let Some(ch) = char::from_u32(value) else {
return Err(ScanError::new_str(
2024-01-18 18:15:19 +00:00
*start_mark,
"while parsing a quoted scalar, found invalid Unicode character escape code",
));
};
ret = ch;
self.skip_n_non_blank(code_length);
2015-05-24 19:21:53 +00:00
}
2024-01-18 18:15:19 +00:00
Ok(ret)
2015-05-24 19:21:53 +00:00
}
/// Fetch a plain (unquoted) scalar and queue its token.
///
/// The scalar is registered as a potential simple key before being scanned.
///
/// # Errors
/// Propagates any error from `scan_plain_scalar`.
fn fetch_plain_scalar(&mut self) -> ScanResult {
    self.save_simple_key();
    self.disallow_simple_key();

    let tok = self.scan_plain_scalar()?;

    self.tokens.push_back(tok);
    Ok(())
}
/// Scan for a plain scalar.
///
/// Plain scalars are the most readable but restricted style. They may span multiple lines in
/// some contexts.
2024-01-19 20:57:39 +00:00
#[allow(clippy::too_many_lines)]
2015-05-24 19:21:53 +00:00
fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
2023-12-26 17:06:20 +00:00
self.unroll_non_block_indents();
2015-05-24 19:21:53 +00:00
let indent = self.indent + 1;
let start_mark = self.mark;
if self.flow_level > 0 && (start_mark.col as isize) < indent {
return Err(ScanError::new_str(
2024-01-22 22:03:02 +00:00
start_mark,
"invalid indentation in flow construct",
));
}
2024-01-31 21:02:53 +00:00
let mut string = String::with_capacity(32);
let mut leading_break = String::with_capacity(32);
let mut trailing_breaks = String::with_capacity(32);
let mut whitespaces = String::with_capacity(32);
2015-05-24 19:21:53 +00:00
loop {
self.input.lookahead(4);
if self.input.next_is_document_indicator() || self.input.peek() == '#' {
2018-09-15 16:49:04 +00:00
break;
}
2024-01-19 20:57:39 +00:00
2024-04-18 17:25:16 +00:00
if self.flow_level > 0 && self.input.peek() == '-' && is_flow(self.input.peek_nth(1)) {
return Err(ScanError::new_str(
2024-01-19 20:57:39 +00:00
self.mark,
"plain scalar cannot start with '-' followed by ,[]{}",
));
}
if !is_blank_or_breakz(self.input.peek())
&& next_can_be_plain_scalar(
self.input.peek(),
self.input.peek_nth(1),
self.flow_level > 0,
)
{
if self.leading_whitespace {
if leading_break.is_empty() {
string.push_str(&leading_break);
string.push_str(&trailing_breaks);
trailing_breaks.clear();
leading_break.clear();
} else {
if trailing_breaks.is_empty() {
string.push(' ');
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2016-03-10 12:49:02 +00:00
trailing_breaks.clear();
2015-05-24 06:27:42 +00:00
}
leading_break.clear();
2015-05-24 06:27:42 +00:00
}
self.leading_whitespace = false;
2024-01-31 21:02:53 +00:00
} else if !whitespaces.is_empty() {
string.push_str(&whitespaces);
whitespaces.clear();
}
2015-05-24 06:27:42 +00:00
2024-01-31 21:02:53 +00:00
// We can unroll the first iteration of the loop.
2024-04-18 17:25:16 +00:00
string.push(self.input.peek());
self.skip_non_blank();
string.reserve(self.input.bufmaxlen());
2024-01-31 21:02:53 +00:00
// Add content non-blank characters to the scalar.
let mut end = false;
while !end {
// Fill the buffer once and process all characters in the buffer until the next
// fetch. Note that `next_can_be_plain_scalar` needs 2 lookahead characters,
// hence the `for` loop looping `self.input.bufmaxlen() - 1` times.
self.input.lookahead(self.input.bufmaxlen());
for _ in 0..self.input.bufmaxlen() - 1 {
// We need to have `c` and `nc`'s assignations at the beginning of the
// loop. If at the end of it, we will peek one index further than we
// looked ahead. On the first iteration of the loop, `c` is a characte we
// already pushed in `string` a bit earlier.
if is_blank_or_breakz(self.input.peek())
|| !next_can_be_plain_scalar(
self.input.peek(),
self.input.peek_nth(1),
self.flow_level > 0,
)
{
end = true;
break;
}
string.push(self.input.peek());
self.skip_non_blank();
2024-01-31 21:02:53 +00:00
}
}
2015-05-24 06:27:42 +00:00
}
// We may reach the end of a plain scalar if:
// - We reach eof
// - We reach ": "
// - We find a flow character in a flow context
2024-04-18 17:25:16 +00:00
if !(is_blank(self.input.peek()) || is_break(self.input.peek())) {
2018-09-15 16:49:04 +00:00
break;
}
2015-05-24 06:27:42 +00:00
// Process blank characters.
2024-04-18 17:25:16 +00:00
while is_blank(self.input.look_ch()) || is_break(self.input.peek()) {
if is_blank(self.input.peek()) {
if !self.leading_whitespace {
2024-04-18 17:25:16 +00:00
whitespaces.push(self.input.peek());
self.skip_blank();
2024-04-18 17:25:16 +00:00
} else if (self.mark.col as isize) < indent && self.input.peek() == '\t' {
// Tabs in an indentation columns are allowed if and only if the line is
// empty. Skip to the end of the line.
2024-01-19 15:21:56 +00:00
self.skip_ws_to_eol(SkipTabs::Yes)?;
2024-04-18 17:25:16 +00:00
if !is_breakz(self.input.peek()) {
return Err(ScanError::new_str(
2024-01-31 21:02:53 +00:00
start_mark,
"while scanning a plain scalar, found a tab",
));
}
} else {
self.skip_blank();
2015-05-24 06:27:42 +00:00
}
} else {
self.input.lookahead(2);
2015-05-24 06:27:42 +00:00
// Check if it is a first line break
if self.leading_whitespace {
2016-03-10 12:49:02 +00:00
self.read_break(&mut trailing_breaks);
} else {
2015-05-24 06:27:42 +00:00
whitespaces.clear();
self.read_break(&mut leading_break);
self.leading_whitespace = true;
2015-05-24 06:27:42 +00:00
}
}
}
// check indentation level
2015-05-24 06:27:42 +00:00
if self.flow_level == 0 && (self.mark.col as isize) < indent {
break;
}
}
if self.leading_whitespace {
2015-05-24 06:27:42 +00:00
self.allow_simple_key();
}
2018-09-15 16:49:04 +00:00
Ok(Token(
start_mark,
TokenType::Scalar(TScalarStyle::Plain, string),
))
2015-05-24 06:27:42 +00:00
}
2015-05-25 11:31:33 +00:00
fn fetch_key(&mut self) -> ScanResult {
let start_mark = self.mark;
if self.flow_level == 0 {
// Check if we are allowed to start a new key (not necessarily simple).
2015-05-25 11:31:33 +00:00
if !self.simple_key_allowed {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
self.mark,
"mapping keys are not allowed in this context",
));
2015-05-25 11:31:33 +00:00
}
2018-09-15 16:49:04 +00:00
self.roll_indent(
start_mark.col,
None,
TokenType::BlockMappingStart,
start_mark,
);
2023-12-26 18:11:17 +00:00
} else {
// The scanner, upon emitting a `Key`, will prepend a `MappingStart` event.
2023-12-26 18:11:17 +00:00
self.flow_mapping_started = true;
2015-05-25 11:31:33 +00:00
}
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-25 11:31:33 +00:00
if self.flow_level == 0 {
self.allow_simple_key();
} else {
self.disallow_simple_key();
}
self.skip_non_blank();
2023-12-22 15:11:07 +00:00
self.skip_yaml_whitespace()?;
2024-04-18 17:25:16 +00:00
if self.input.peek() == '\t' {
return Err(ScanError::new_str(
2023-12-23 22:25:14 +00:00
self.mark(),
"tabs disallowed in this context",
));
}
self.tokens.push_back(Token(start_mark, TokenType::Key));
2015-05-25 11:31:33 +00:00
Ok(())
}
/// Fetch the value of a mapping entry located inside a flow collection.
///
/// Callers must only use this when [`self.flow_level`] is greater than 0. It enforces the
/// flow-specific rules around `:` and then delegates to [`fetch_value`].
///
/// # Errors
/// Return an error if the `:` is directly followed by `[` or `{` at a position where an
/// adjacent value is not allowed. Any error from [`fetch_value`] is propagated.
///
/// [`self.flow_level`]: Self::flow_level
/// [`fetch_value`]: Self::fetch_value
fn fetch_flow_value(&mut self) -> ScanResult {
    // A ':' inside a flow collection that is not immediately followed by a blank or breakz
    // needs care:
    //   - `["a":[]]` is valid: after a quoted (JSON-like) key, the value may be adjacent to
    //     the ':'. The position where this is allowed is `adjacent_value_allowed_at`.
    //   - Otherwise a space is needed between the ':' and its value: `[a: []]` is valid
    //     while `[a:[]]` isn't, and `[a:b]` is treated as `["a:b"]`.
    //   - An empty (null) value is fine. In `[a:]`, the ']' following the ':' is a flow
    //     character but not the value; the value is an invisible empty node ('~').
    let adjacent_value_allowed = self.mark.index == self.adjacent_value_allowed_at;
    let opens_flow_collection = matches!(self.input.peek_nth(1), '[' | '{');

    if opens_flow_collection && !adjacent_value_allowed {
        return Err(ScanError::new_str(
            self.mark,
            "':' may not precede any of `[{` in flow mapping",
        ));
    }

    self.fetch_value()
}
/// Fetch a value from a mapping (after a `:`).
2015-05-24 06:27:42 +00:00
fn fetch_value(&mut self) -> ScanResult {
let sk = self.simple_keys.last().unwrap().clone();
let start_mark = self.mark;
let is_implicit_flow_mapping = self.flow_level > 0 && !self.flow_mapping_started;
if is_implicit_flow_mapping {
*self.implicit_flow_mapping_states.last_mut().unwrap() = ImplicitMappingState::Inside;
}
2023-12-25 23:34:29 +00:00
// Skip over ':'.
self.skip_non_blank();
if self.input.look_ch() == '\t'
2024-01-19 15:21:56 +00:00
&& !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
2024-04-18 17:25:16 +00:00
&& (self.input.peek() == '-' || is_alpha(self.input.peek()))
2023-12-25 23:34:29 +00:00
{
return Err(ScanError::new_str(
2023-12-25 23:34:29 +00:00
self.mark,
"':' must be followed by a valid YAML whitespace",
));
}
2015-05-24 06:27:42 +00:00
if sk.possible {
2015-05-25 11:31:33 +00:00
// insert simple key
let tok = Token(sk.mark, TokenType::Key);
self.insert_token(sk.token_number - self.tokens_parsed, tok);
if is_implicit_flow_mapping {
if sk.mark.line < start_mark.line {
return Err(ScanError::new_str(
start_mark,
"illegal placement of ':' indicator",
));
}
2023-12-26 18:11:17 +00:00
self.insert_token(
sk.token_number - self.tokens_parsed,
2023-12-26 18:11:17 +00:00
Token(self.mark, TokenType::FlowMappingStart),
);
}
2015-05-24 06:27:42 +00:00
// Add the BLOCK-MAPPING-START token if needed.
2018-09-15 16:49:04 +00:00
self.roll_indent(
sk.mark.col,
Some(sk.token_number),
TokenType::BlockMappingStart,
start_mark,
);
2023-12-26 17:06:20 +00:00
self.roll_one_col_indent();
2015-05-24 06:27:42 +00:00
self.simple_keys.last_mut().unwrap().possible = false;
self.disallow_simple_key();
} else {
if is_implicit_flow_mapping {
2023-12-26 18:11:17 +00:00
self.tokens
.push_back(Token(self.mark, TokenType::FlowMappingStart));
}
2015-05-24 06:27:42 +00:00
// The ':' indicator follows a complex key.
2015-05-25 11:31:33 +00:00
if self.flow_level == 0 {
if !self.simple_key_allowed {
return Err(ScanError::new_str(
2018-09-15 16:49:04 +00:00
start_mark,
"mapping values are not allowed in this context",
));
2015-05-25 11:31:33 +00:00
}
2015-05-24 06:27:42 +00:00
2018-09-15 16:49:04 +00:00
self.roll_indent(
start_mark.col,
None,
TokenType::BlockMappingStart,
start_mark,
);
2015-05-25 11:31:33 +00:00
}
2023-12-26 17:06:20 +00:00
self.roll_one_col_indent();
2015-05-25 11:31:33 +00:00
if self.flow_level == 0 {
self.allow_simple_key();
} else {
self.disallow_simple_key();
}
}
self.tokens.push_back(Token(start_mark, TokenType::Value));
2015-05-24 06:27:42 +00:00
Ok(())
}
2023-12-20 23:14:08 +00:00
/// Add an indentation level to the stack with the given block token, if needed.
///
/// An indentation level is added only if:
/// - We are not in a flow-style construct (which don't have indentation per-se).
/// - The current column is further indented than the last indent we have registered.
2018-09-15 16:49:04 +00:00
fn roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker) {
2015-05-24 06:27:42 +00:00
if self.flow_level > 0 {
return;
}
// If the last indent was a non-block indent, remove it.
// This means that we prepared an indent that we thought we wouldn't use, but realized just
// now that it is a block indent.
2023-12-26 17:06:20 +00:00
if self.indent <= col as isize {
if let Some(indent) = self.indents.last() {
if !indent.needs_block_end {
self.indent = indent.indent;
self.indents.pop();
}
}
}
2015-05-24 06:27:42 +00:00
if self.indent < col as isize {
self.indents.push(Indent {
indent: self.indent,
needs_block_end: true,
});
2015-05-24 06:27:42 +00:00
self.indent = col as isize;
let tokens_parsed = self.tokens_parsed;
match number {
Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)),
2018-09-15 16:49:04 +00:00
None => self.tokens.push_back(Token(mark, tok)),
2015-05-24 06:27:42 +00:00
}
}
}
2023-12-20 23:14:08 +00:00
/// Pop indentation levels from the stack as much as needed.
///
/// Indentation levels are popped from the stack while they are further indented than `col`.
/// If we are in a flow-style construct (which don't have indentation per-se), this function
/// does nothing.
2015-05-24 06:27:42 +00:00
fn unroll_indent(&mut self, col: isize) {
if self.flow_level > 0 {
return;
}
while self.indent > col {
let indent = self.indents.pop().unwrap();
self.indent = indent.indent;
if indent.needs_block_end {
self.tokens.push_back(Token(self.mark, TokenType::BlockEnd));
}
2015-05-24 06:27:42 +00:00
}
}
2023-12-26 17:06:20 +00:00
/// Add an indentation level of 1 column that does not start a block.
///
/// See the documentation of [`Indent::needs_block_end`] for more details.
/// An indentation is not added if we are inside a flow level or if the last indent is already
/// a non-block indent.
2023-12-26 17:06:20 +00:00
fn roll_one_col_indent(&mut self) {
if self.flow_level == 0 && self.indents.last().map_or(false, |x| x.needs_block_end) {
2023-12-26 17:06:20 +00:00
self.indents.push(Indent {
indent: self.indent,
needs_block_end: false,
});
self.indent += 1;
}
}
/// Unroll all last indents created with [`Self::roll_one_col_indent`].
fn unroll_non_block_indents(&mut self) {
while let Some(indent) = self.indents.last() {
if indent.needs_block_end {
break;
}
self.indent = indent.indent;
self.indents.pop();
2023-12-26 17:06:20 +00:00
}
}
/// Mark the next token to be inserted as a potential simple key.
fn save_simple_key(&mut self) {
2015-05-24 06:27:42 +00:00
if self.simple_key_allowed {
let required = self.flow_level == 0
&& self.indent == (self.mark.col as isize)
&& self.indents.last().unwrap().needs_block_end;
2015-05-24 06:27:42 +00:00
let mut sk = SimpleKey::new(self.mark);
sk.possible = true;
sk.required = required;
sk.token_number = self.tokens_parsed + self.tokens.len();
self.simple_keys.pop();
self.simple_keys.push(sk);
}
}
fn remove_simple_key(&mut self) -> ScanResult {
let last = self.simple_keys.last_mut().unwrap();
if last.possible && last.required {
return Err(ScanError::new_str(self.mark, "simple key expected"));
2015-05-24 06:27:42 +00:00
}
last.possible = false;
Ok(())
}
2023-12-20 22:14:22 +00:00
/// Return whether the scanner is inside a block but outside of a flow sequence.
fn is_within_block(&self) -> bool {
!self.indents.is_empty()
2023-12-20 22:14:22 +00:00
}
2023-12-26 18:11:17 +00:00
/// If an implicit mapping had started, end it.
///
/// This function does not pop the state in [`implicit_flow_mapping_states`].
///
/// [`implicit_flow_mapping_states`]: Self::implicit_flow_mapping_states
2023-12-26 18:11:17 +00:00
fn end_implicit_mapping(&mut self, mark: Marker) {
if let Some(implicit_mapping) = self.implicit_flow_mapping_states.last_mut() {
if *implicit_mapping == ImplicitMappingState::Inside {
self.flow_mapping_started = false;
*implicit_mapping = ImplicitMappingState::Possible;
self.tokens
.push_back(Token(mark, TokenType::FlowMappingEnd));
}
2023-12-26 18:11:17 +00:00
}
}
2015-05-24 06:27:42 +00:00
}
2023-12-23 23:02:42 +00:00
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// `Yes` and `No` are requests; `Result` is the outcome reported back by the skipping function.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}

impl SkipTabs {
    /// Whether tabs were found while skipping whitespace.
    ///
    /// This function must be called after a call to `skip_ws_to_eol`. It returns `false` for
    /// any value that is not a `Result`.
    fn found_tabs(self) -> bool {
        matches!(self, SkipTabs::Result(true, _))
    }

    /// Whether a valid YAML whitespace has been found in skipped-over content.
    ///
    /// This function must be called after a call to `skip_ws_to_eol`. It returns `false` for
    /// any value that is not a `Result`.
    fn has_valid_yaml_ws(self) -> bool {
        matches!(self, SkipTabs::Result(_, true))
    }
}
2023-12-28 00:48:19 +00:00
2024-01-19 19:21:36 +00:00
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// See YAML spec 8.1.1.2.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
/// Tell whether `c`, given the character `nc` that follows it, may belong to a plain scalar.
///
/// `in_flow` states whether we are scanning inside a flow collection, where flow indicators
/// terminate plain scalars.
///
/// This function assumes we are not given a blankz character.
// For some reason, `#[inline]` is not enough.
#[allow(clippy::inline_always)]
#[inline(always)]
pub fn next_can_be_plain_scalar(c: char, nc: char, in_flow: bool) -> bool {
    // Indicators can end a plain scalar, see 7.3.3. Plain Style.
    // A ':' ends the scalar when followed by a blank, a break, the end of the stream or, in a
    // flow context, a flow indicator.
    if c == ':' && (is_blank_or_breakz(nc) || (in_flow && is_flow(nc))) {
        return false;
    }
    // In a flow context, flow indicators are never part of a plain scalar.
    !(in_flow && is_flow(c))
}
2023-12-28 00:48:19 +00:00
#[cfg(test)]
mod test {
    use super::is_anchor_char;

    /// An ASCII letter is accepted as an anchor character.
    #[test]
    fn test_is_anchor_char() {
        assert!(is_anchor_char('x'));
    }
}