Add Input interface.

Hiding character fetching behind this interface allows us to create more
specific implementations when appropriate. For instance, an instance
of `Input` can be created for a `&str`, allowing for borrowing and more
efficient peeking and traversing than if we were to fetch characters one
at a time and place them into a temporary buffer.
This commit is contained in:
Ethiraric 2024-04-18 17:48:49 +02:00
parent 11cffc6df8
commit d9bb7a1693
8 changed files with 384 additions and 229 deletions

View file

@ -0,0 +1,99 @@
use crate::input::Input;
use arraydeque::ArrayDeque;
/// The size of the [`BufferedInput`] buffer.
///
/// The buffer is statically allocated to avoid conditions for reallocations each time we
/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except:
/// - Escape sequences parsing: some escape codes are 8 characters
/// - Scanning indent in scalars: this looks ahead `indent + 2` characters
///
/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done
/// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher
/// than that, the code will fall back to a loop of lookaheads.
///
/// This value is used as the capacity of the [`BufferedInput`] character buffer and is what its
/// `bufmaxlen` implementation returns.
const BUFFER_LEN: usize = 16;
/// A wrapper around an [`Iterator`] of [`char`]s with a buffer.
///
/// The YAML scanner often needs some lookahead. With fully allocated buffers such as `String` or
/// `&str`, this is not an issue. However, with streams, we need to have a way of peeking multiple
/// characters at a time and sometimes pushing some back into the stream.
/// There is no "easy" way of doing this without itertools. In order to avoid pulling the entirety
/// of itertools for one method, we use this structure.
pub struct BufferedInput<T: Iterator<Item = char>> {
/// The iterator source.
input: T,
/// Buffer for the next characters to consume.
///
/// Its capacity is fixed at [`BUFFER_LEN`] characters.
buffer: ArrayDeque<char, BUFFER_LEN>,
}
impl<T: Iterator<Item = char>> BufferedInput<T> {
    /// Build a [`BufferedInput`] that pulls its characters from `input`.
    ///
    /// The lookahead buffer starts out empty; no character is read from `input` here.
    pub fn new(input: T) -> Self {
        let buffer = ArrayDeque::default();
        Self { input, buffer }
    }
}
impl<T: Iterator<Item = char>> Input for BufferedInput<T> {
    /// Make sure at least `count` characters are available in the buffer.
    ///
    /// Missing characters are pulled from the underlying iterator. Once the iterator is
    /// exhausted, `'\0'` is buffered in place of each missing character. If the buffer already
    /// holds `count` characters or more, nothing is read.
    ///
    /// # Panics
    /// Panics if `count` is greater than the buffer capacity ([`BUFFER_LEN`]).
    #[inline]
    fn lookahead(&mut self, count: usize) {
        let buffered = self.buffer.len();
        if buffered >= count {
            return;
        }
        for _ in buffered..count {
            self.buffer
                .push_back(self.input.next().unwrap_or('\0'))
                .unwrap();
        }
    }

    /// Return the number of characters currently held in the buffer.
    #[inline]
    fn buflen(&self) -> usize {
        self.buffer.len()
    }

    /// Return the capacity of the buffer, i.e. [`BUFFER_LEN`].
    #[inline]
    fn bufmaxlen(&self) -> usize {
        BUFFER_LEN
    }

    /// Read one character straight from the underlying iterator, ignoring the buffer.
    ///
    /// Returns `'\0'` once the iterator is exhausted.
    #[inline]
    fn raw_read_ch(&mut self) -> char {
        self.input.next().unwrap_or('\0')
    }

    /// Append `c` at the back of the buffer.
    ///
    /// NOTE(review): since `c` is appended at the back, it only becomes the next character to be
    /// read if the buffer is empty; presumably callers only push back in that situation -- confirm
    /// against the scanner.
    ///
    /// # Panics
    /// Panics if the buffer is already full.
    #[inline]
    fn push_back(&mut self, c: char) {
        self.buffer.push_back(c).unwrap();
    }

    /// Discard the character at the front of the buffer, if any.
    #[inline]
    fn skip(&mut self) {
        self.buffer.pop_front();
    }

    /// Discard the first `count` characters of the buffer.
    ///
    /// `count` must not exceed the number of buffered characters.
    #[inline]
    fn skip_n(&mut self, count: usize) {
        self.buffer.drain(0..count);
    }

    /// Return the character at the front of the buffer without consuming it.
    ///
    /// # Panics
    /// Panics if the buffer is empty; call [`Input::lookahead`] first.
    #[inline]
    fn peek(&self) -> char {
        self.buffer[0]
    }

    /// Return the `n`-th buffered character (0-based) without consuming it.
    ///
    /// # Panics
    /// Panics if fewer than `n + 1` characters are buffered; call [`Input::lookahead`] first.
    #[inline]
    fn peek_nth(&self, n: usize) -> char {
        self.buffer[n]
    }

    /// Return whether the buffered characters start with `pat`.
    ///
    /// # Panics
    /// Panics if fewer characters are buffered than there are characters in `pat`.
    #[inline]
    fn next_is(&self, pat: &str) -> bool {
        // The buffer stores `char`s, so the precondition must be expressed in characters.
        // `pat.len()` is a byte length and would over-count any non-ASCII pattern, making the
        // assertion fail even though enough characters are buffered.
        assert!(self.buffer.len() >= pat.chars().count());
        self.buffer.iter().zip(pat.chars()).all(|(a, b)| *a == b)
    }
}

111
parser/src/input.rs Normal file
View file

@ -0,0 +1,111 @@
/// Abstraction over the place the scanner gets its characters from.
///
/// Putting the input behind a trait serves two main purposes:
///   * Each kind of input can be optimized on its own (for instance, relying on `str` methods
///     rather than manually moving one `char` at a time into a buffer).
///   * `&str`s borrowing from the input string can be returned, sparing potentially costly
///     allocations. Users who need an owned version of the data can still call `.to_owned()`
///     on their YAML object.
pub trait Input {
    /// Tell the input source that `count` characters are about to be read.
    ///
    /// This call must not consume the characters, though it may store them in an internal
    /// buffer. Once the input is exhausted, `\0` can be used as padding for the missing
    /// characters and returned later on.
    ///
    /// An implementation for which buffering brings no performance improvement may make this
    /// method a no-op.
    ///
    /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer:
    /// the parser keeps track of how many characters are buffered and acts accordingly.
    fn lookahead(&mut self, count: usize);

    /// Number of characters currently buffered in `self`.
    #[must_use]
    fn buflen(&self) -> usize;

    /// Capacity of the buffer in `self`.
    #[must_use]
    fn bufmaxlen(&self) -> usize;

    /// Whether the buffer holds no character.
    ///
    /// This is about the buffer only, not about the underlying stream.
    #[inline]
    #[must_use]
    fn buf_is_empty(&self) -> bool {
        matches!(self.buflen(), 0)
    }

    /// Fetch one character from the input stream and hand it back immediately.
    ///
    /// The internal buffer (if any) is bypassed.
    #[must_use]
    fn raw_read_ch(&mut self) -> char;

    /// Return a character to the buffer.
    ///
    /// This is only called after one character too many has been read, and the character given
    /// back is exactly the last one that was read. It is never called several times in a row.
    fn push_back(&mut self, c: char);

    /// Consume the next character.
    fn skip(&mut self);

    /// Consume the next `count` characters.
    fn skip_n(&mut self, count: usize);

    /// Give the next character without consuming it.
    ///
    /// Users of the [`Input`] are responsible for having loaded the character through a prior
    /// call to [`Input::lookahead`]. Implementors of [`Input`] may take for granted that such a
    /// valid call has been made.
    ///
    /// # Return
    /// The next character to be fed into the scanner if the input source is not exhausted,
    /// `\0` otherwise.
    #[must_use]
    fn peek(&self) -> char;

    /// Give the `n`-th character in the buffer without consuming it.
    ///
    /// The n-th character of the input is assumed to have already been fetched through
    /// [`Input::lookahead`].
    #[must_use]
    fn peek_nth(&self, n: usize) -> char;

    /// Load the next character and return it, leaving it unconsumed.
    ///
    /// Same as calling [`Input::lookahead`] for one character, then [`Input::peek`].
    #[inline]
    #[must_use]
    fn look_ch(&mut self) -> char {
        self.lookahead(1);
        let next = self.peek();
        next
    }

    /// Whether the next character in the input source is `c`.
    ///
    /// The next character of the input is assumed to have already been fetched through
    /// [`Input::lookahead`].
    #[inline]
    #[must_use]
    fn next_char_is(&self, c: char) -> bool {
        let next = self.peek();
        next == c
    }

    /// Whether the `n`-th character in the input source is `c`.
    ///
    /// The n-th character of the input is assumed to have already been fetched through
    /// [`Input::lookahead`].
    #[inline]
    #[must_use]
    fn nth_char_is(&self, n: usize, c: char) -> bool {
        let nth = self.peek_nth(n);
        nth == c
    }

    /// Whether the upcoming characters in the input source match the pattern `pat`.
    ///
    /// The next `pat.len()` characters of the input are assumed to have already been fetched
    /// through [`Input::lookahead`].
    #[must_use]
    fn next_is(&self, pat: &str) -> bool;
}

View file

@ -32,11 +32,14 @@
#![warn(missing_docs, clippy::pedantic)] #![warn(missing_docs, clippy::pedantic)]
pub(crate) mod char_traits; mod buffered_input;
mod char_traits;
#[macro_use] #[macro_use]
pub(crate) mod debug; mod debug;
pub mod parser; mod input;
pub mod scanner; mod parser;
mod scanner;
pub use crate::buffered_input::BufferedInput;
pub use crate::parser::{Event, EventReceiver, MarkedEventReceiver, Parser, Tag}; pub use crate::parser::{Event, EventReceiver, MarkedEventReceiver, Parser, Tag};
pub use crate::scanner::{Marker, ScanError, TScalarStyle}; pub use crate::scanner::{Marker, ScanError, TScalarStyle};

View file

@ -4,7 +4,11 @@
//! compliance, and emits a stream of YAML events. This stream can for instance be used to create //! compliance, and emits a stream of YAML events. This stream can for instance be used to create
//! YAML objects. //! YAML objects.
use crate::scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType}; use crate::{
input::Input,
scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
BufferedInput,
};
use std::collections::HashMap; use std::collections::HashMap;
#[derive(Clone, Copy, PartialEq, Debug, Eq)] #[derive(Clone, Copy, PartialEq, Debug, Eq)]
@ -100,7 +104,7 @@ impl Event {
/// A YAML parser. /// A YAML parser.
#[derive(Debug)] #[derive(Debug)]
pub struct Parser<T> { pub struct Parser<T: Input> {
/// The underlying scanner from which we pull tokens. /// The underlying scanner from which we pull tokens.
scanner: Scanner<T>, scanner: Scanner<T>,
/// The stack of _previous_ states we were in. /// The stack of _previous_ states we were in.
@ -225,15 +229,15 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
/// A convenience alias for a `Result` of a parser event. /// A convenience alias for a `Result` of a parser event.
pub type ParseResult = Result<(Event, Marker), ScanError>; pub type ParseResult = Result<(Event, Marker), ScanError>;
impl<'a> Parser<core::str::Chars<'a>> { impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
/// Create a new instance of a parser from a &str. /// Create a new instance of a parser from a &str.
#[must_use] #[must_use]
pub fn new_from_str(value: &'a str) -> Self { pub fn new_from_str(value: &'a str) -> Self {
Parser::new(value.chars()) Parser::new(BufferedInput::new(value.chars()))
} }
} }
impl<T: Iterator<Item = char>> Parser<T> { impl<T: Input> Parser<T> {
/// Create a new instance of a parser from the given input of characters. /// Create a new instance of a parser from the given input of characters.
pub fn new(src: T) -> Parser<T> { pub fn new(src: T) -> Parser<T> {
Parser { Parser {
@ -1130,7 +1134,7 @@ impl<T: Iterator<Item = char>> Parser<T> {
} }
} }
impl<T: Iterator<Item = char>> Iterator for Parser<T> { impl<T: Input> Iterator for Parser<T> {
type Item = Result<(Event, Marker), ScanError>; type Item = Result<(Event, Marker), ScanError>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {

View file

@ -11,11 +11,12 @@
use std::{char, collections::VecDeque, error::Error, fmt}; use std::{char, collections::VecDeque, error::Error, fmt};
use arraydeque::ArrayDeque; use crate::{
char_traits::{
use crate::char_traits::{ as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
is_flow, is_hex, is_tag_char, is_uri_char, is_z, },
input::Input,
}; };
/// The encoding of the input. Currently, only UTF-8 is supported. /// The encoding of the input. Currently, only UTF-8 is supported.
@ -343,18 +344,6 @@ enum ImplicitMappingState {
Inside, Inside,
} }
/// The size of the [`Scanner`] buffer.
///
/// The buffer is statically allocated to avoid conditions for reallocations each time we
/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except:
/// - Escape sequences parsing: some escape codes are 8 characters
/// - Scanning indent in scalars: this looks ahead `indent + 2` characters
///
/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done
/// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher
/// than that, the code will fall back to a loop of lookaheads.
const BUFFER_LEN: usize = 16;
/// The YAML scanner. /// The YAML scanner.
/// ///
/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they /// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
@ -367,8 +356,10 @@ const BUFFER_LEN: usize = 16;
#[derive(Debug)] #[derive(Debug)]
#[allow(clippy::struct_excessive_bools)] #[allow(clippy::struct_excessive_bools)]
pub struct Scanner<T> { pub struct Scanner<T> {
/// The reader, providing with characters. /// The input source.
rdr: T, ///
/// This must implement [`Input`].
input: T,
/// The position of the cursor within the reader. /// The position of the cursor within the reader.
mark: Marker, mark: Marker,
/// Buffer for tokens to be returned. /// Buffer for tokens to be returned.
@ -378,8 +369,6 @@ pub struct Scanner<T> {
/// follows. In this case, the token stays in the `VecDeque` but cannot be returned from /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
/// [`Self::next`] until we have more context. /// [`Self::next`] until we have more context.
tokens: VecDeque<Token>, tokens: VecDeque<Token>,
/// Buffer for the next characters to consume.
buffer: ArrayDeque<char, BUFFER_LEN>,
/// The last error that happened. /// The last error that happened.
error: Option<ScanError>, error: Option<ScanError>,
@ -435,7 +424,7 @@ pub struct Scanner<T> {
implicit_flow_mapping_states: Vec<ImplicitMappingState>, implicit_flow_mapping_states: Vec<ImplicitMappingState>,
} }
impl<T: Iterator<Item = char>> Iterator for Scanner<T> { impl<T: Input> Iterator for Scanner<T> {
type Item = Token; type Item = Token;
fn next(&mut self) -> Option<Token> { fn next(&mut self) -> Option<Token> {
if self.error.is_some() { if self.error.is_some() {
@ -462,12 +451,11 @@ impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
/// A convenience alias for scanner functions that may fail without returning a value. /// A convenience alias for scanner functions that may fail without returning a value.
pub type ScanResult = Result<(), ScanError>; pub type ScanResult = Result<(), ScanError>;
impl<T: Iterator<Item = char>> Scanner<T> { impl<T: Input> Scanner<T> {
/// Creates the YAML tokenizer. /// Creates the YAML tokenizer.
pub fn new(rdr: T) -> Scanner<T> { pub fn new(input: T) -> Scanner<T> {
Scanner { Scanner {
rdr, input,
buffer: ArrayDeque::new(),
mark: Marker::new(0, 1, 0), mark: Marker::new(0, 1, 0),
tokens: VecDeque::new(), tokens: VecDeque::new(),
error: None, error: None,
@ -497,25 +485,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.error.clone() self.error.clone()
} }
/// Fill `self.buffer` with at least `count` characters.
///
/// The characters that are extracted this way are not consumed but only placed in the buffer.
#[inline]
fn lookahead(&mut self, count: usize) {
if self.buffer.len() >= count {
return;
}
for _ in 0..(count - self.buffer.len()) {
self.buffer
.push_back(self.rdr.next().unwrap_or('\0'))
.unwrap();
}
}
/// Consume the next character. It is assumed the next character is a blank. /// Consume the next character. It is assumed the next character is a blank.
#[inline] #[inline]
fn skip_blank(&mut self) { fn skip_blank(&mut self) {
self.buffer.pop_front(); self.input.skip();
self.mark.index += 1; self.mark.index += 1;
self.mark.col += 1; self.mark.col += 1;
@ -524,7 +497,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Consume the next character. It is assumed the next character is not a blank. /// Consume the next character. It is assumed the next character is not a blank.
#[inline] #[inline]
fn skip_non_blank(&mut self) { fn skip_non_blank(&mut self) {
self.buffer.pop_front(); self.input.skip();
self.mark.index += 1; self.mark.index += 1;
self.mark.col += 1; self.mark.col += 1;
@ -533,18 +506,18 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Consume the next characters. It is assumed none of the next characters are blanks. /// Consume the next characters. It is assumed none of the next characters are blanks.
#[inline] #[inline]
fn skip_n_non_blank(&mut self, n: usize) { fn skip_n_non_blank(&mut self, count: usize) {
self.buffer.drain(0..n); self.input.skip_n(count);
self.mark.index += n; self.mark.index += count;
self.mark.col += n; self.mark.col += count;
self.leading_whitespace = false; self.leading_whitespace = false;
} }
/// Consume the next character. It is assumed the next character is a newline. /// Consume the next character. It is assumed the next character is a newline.
#[inline] #[inline]
fn skip_nl(&mut self) { fn skip_nl(&mut self) {
self.buffer.pop_front(); self.input.skip();
self.mark.index += 1; self.mark.index += 1;
self.mark.col = 0; self.mark.col = 0;
@ -555,12 +528,12 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none. /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
#[inline] #[inline]
fn skip_linebreak(&mut self) { fn skip_linebreak(&mut self) {
if self.buffer[0] == '\r' && self.buffer[1] == '\n' { if self.input.next_is("\r\n") {
// While technically not a blank, this does not matter as `self.leading_whitespace` // While technically not a blank, this does not matter as `self.leading_whitespace`
// will be reset by `skip_nl`. // will be reset by `skip_nl`.
self.skip_blank(); self.skip_blank();
self.skip_nl(); self.skip_nl();
} else if is_break(self.buffer[0]) { } else if is_break(self.input.peek()) {
self.skip_nl(); self.skip_nl();
} }
} }
@ -570,32 +543,16 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// The character is not consumed. /// The character is not consumed.
#[inline] #[inline]
fn ch(&self) -> char { fn ch(&self) -> char {
self.buffer[0] self.input.peek()
}
/// Look for the next character and return it.
///
/// The character is not consumed.
/// Equivalent to calling [`Self::lookahead`] and [`Self::ch`].
#[inline]
fn look_ch(&mut self) -> char {
self.lookahead(1);
self.ch()
} }
/// Read a character from the input stream, returning it directly. /// Read a character from the input stream, returning it directly.
/// ///
/// The buffer is bypassed and `self.mark` needs to be updated manually. /// The buffer (if any) is bypassed and `self.mark` needs to be updated manually.
#[inline] #[inline]
#[must_use] #[must_use]
fn raw_read_ch(&mut self) -> char { fn raw_read_ch(&mut self) -> char {
self.rdr.next().unwrap_or('\0') self.input.raw_read_ch()
}
/// Return whether the next character is `c`.
#[inline]
fn ch_is(&self, c: char) -> bool {
self.buffer[0] == c
} }
/// Return whether the [`TokenType::StreamStart`] event has been emitted. /// Return whether the [`TokenType::StreamStart`] event has been emitted.
@ -624,8 +581,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// If the next characters do not correspond to a line break. // If the next characters do not correspond to a line break.
#[inline] #[inline]
fn read_break(&mut self, s: &mut String) { fn read_break(&mut self, s: &mut String) {
let c = self.buffer[0]; let c = self.input.peek();
let nc = self.buffer[1]; let nc = self.input.peek_nth(1);
debug_assert!(is_break(c)); debug_assert!(is_break(c));
if c == '\r' && nc == '\n' { if c == '\r' && nc == '\n' {
self.skip_blank(); self.skip_blank();
@ -635,15 +592,20 @@ impl<T: Iterator<Item = char>> Scanner<T> {
s.push('\n'); s.push('\n');
} }
/// Check whether the next characters correspond to a start of document.
///
/// [`Self::lookahead`] must have been called before calling this function.
fn next_is_document_start(&self) -> bool {
assert!(self.input.buflen() >= 4);
self.input.next_is("---") && is_blank_or_breakz(self.input.peek_nth(3))
}
/// Check whether the next characters correspond to an end of document. /// Check whether the next characters correspond to an end of document.
/// ///
/// [`Self::lookahead`] must have been called before calling this function. /// [`Self::lookahead`] must have been called before calling this function.
fn next_is_document_end(&self) -> bool { fn next_is_document_end(&self) -> bool {
assert!(self.buffer.len() >= 4); assert!(self.input.buflen() >= 4);
self.buffer[0] == '.' self.input.next_is("...") && is_blank_or_breakz(self.input.peek_nth(3))
&& self.buffer[1] == '.'
&& self.buffer[2] == '.'
&& is_blank_or_breakz(self.buffer[3])
} }
/// Check whether the next characters correspond to a document indicator. /// Check whether the next characters correspond to a document indicator.
@ -651,11 +613,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// [`Self::lookahead`] must have been called before calling this function. /// [`Self::lookahead`] must have been called before calling this function.
#[inline] #[inline]
fn next_is_document_indicator(&self) -> bool { fn next_is_document_indicator(&self) -> bool {
assert!(self.buffer.len() >= 4); assert!(self.input.buflen() >= 4);
self.mark.col == 0 is_blank_or_breakz(self.input.peek_nth(3))
&& (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-')) && (self.input.next_is("...") || self.input.next_is("---"))
|| ((self.buffer[0] == '.') && (self.buffer[1] == '.') && (self.buffer[2] == '.')))
&& is_blank_or_breakz(self.buffer[3])
} }
/// Insert a token at the given position. /// Insert a token at the given position.
@ -674,11 +634,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
/// Fetch the next token in the stream. /// Fetch the next token in the stream.
///
/// # Errors /// # Errors
/// Returns `ScanError` when the scanner does not find the next expected token. /// Returns `ScanError` when the scanner does not find the next expected token.
pub fn fetch_next_token(&mut self) -> ScanResult { pub fn fetch_next_token(&mut self) -> ScanResult {
self.lookahead(1); self.input.lookahead(1);
// eprintln!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
if !self.stream_start_produced { if !self.stream_start_produced {
self.fetch_stream_start(); self.fetch_stream_start();
@ -697,51 +657,37 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mark = self.mark; let mark = self.mark;
self.unroll_indent(mark.col as isize); self.unroll_indent(mark.col as isize);
self.lookahead(4); self.input.lookahead(4);
if is_z(self.ch()) { if is_z(self.ch()) {
self.fetch_stream_end()?; self.fetch_stream_end()?;
return Ok(()); return Ok(());
} }
// Is it a directive? if self.mark.col == 0 {
if self.mark.col == 0 && self.ch_is('%') { if self.input.next_char_is('%') {
return self.fetch_directive(); return self.fetch_directive();
} } else if self.next_is_document_start() {
return self.fetch_document_indicator(TokenType::DocumentStart);
if self.mark.col == 0 } else if self.next_is_document_end() {
&& self.buffer[0] == '-' self.fetch_document_indicator(TokenType::DocumentEnd)?;
&& self.buffer[1] == '-' self.skip_ws_to_eol(SkipTabs::Yes)?;
&& self.buffer[2] == '-' if !is_breakz(self.ch()) {
&& is_blank_or_breakz(self.buffer[3]) return Err(ScanError::new_str(
{ self.mark,
self.fetch_document_indicator(TokenType::DocumentStart)?; "invalid content after document end marker",
return Ok(()); ));
} }
return Ok(());
if self.mark.col == 0
&& self.buffer[0] == '.'
&& self.buffer[1] == '.'
&& self.buffer[2] == '.'
&& is_blank_or_breakz(self.buffer[3])
{
self.fetch_document_indicator(TokenType::DocumentEnd)?;
self.skip_ws_to_eol(SkipTabs::Yes)?;
if !is_breakz(self.ch()) {
return Err(ScanError::new_str(
self.mark,
"invalid content after document end marker",
));
} }
return Ok(());
} }
if (self.mark.col as isize) < self.indent { if (self.mark.col as isize) < self.indent {
return Err(ScanError::new_str(self.mark, "invalid indentation")); return Err(ScanError::new_str(self.mark, "invalid indentation"));
} }
let c = self.buffer[0]; let c = self.input.peek();
let nc = self.buffer[1]; let nc = self.input.peek_nth(1);
match c { match c {
'[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart), '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
'{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart), '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
@ -860,7 +806,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
Ok(()) Ok(())
} }
/// Skip over all whitespace and comments until the next token. /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
/// ///
/// # Errors /// # Errors
/// This function returns an error if a tabulation is encountered where there should not be /// This function returns an error if a tabulation is encountered where there should not be
@ -868,7 +814,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn skip_to_next_token(&mut self) -> ScanResult { fn skip_to_next_token(&mut self) -> ScanResult {
loop { loop {
// TODO(chenyh) BOM // TODO(chenyh) BOM
match self.look_ch() { match self.input.look_ch() {
// Tabs may not be used as indentation. // Tabs may not be used as indentation.
// "Indentation" only exists as long as a block is started, but does not exist // "Indentation" only exists as long as a block is started, but does not exist
// inside of flow-style constructs. Tabs are allowed as part of leading // inside of flow-style constructs. Tabs are allowed as part of leading
@ -890,14 +836,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
'\t' | ' ' => self.skip_blank(), '\t' | ' ' => self.skip_blank(),
'\n' | '\r' => { '\n' | '\r' => {
self.lookahead(2); self.input.lookahead(2);
self.skip_linebreak(); self.skip_linebreak();
if self.flow_level == 0 { if self.flow_level == 0 {
self.allow_simple_key(); self.allow_simple_key();
} }
} }
'#' => { '#' => {
while !is_breakz(self.look_ch()) { while !is_breakz(self.input.look_ch()) {
self.skip_non_blank(); self.skip_non_blank();
} }
} }
@ -914,14 +860,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn skip_yaml_whitespace(&mut self) -> ScanResult { fn skip_yaml_whitespace(&mut self) -> ScanResult {
let mut need_whitespace = true; let mut need_whitespace = true;
loop { loop {
match self.look_ch() { match self.input.look_ch() {
' ' => { ' ' => {
self.skip_blank(); self.skip_blank();
need_whitespace = false; need_whitespace = false;
} }
'\n' | '\r' => { '\n' | '\r' => {
self.lookahead(2); self.input.lookahead(2);
self.skip_linebreak(); self.skip_linebreak();
if self.flow_level == 0 { if self.flow_level == 0 {
self.allow_simple_key(); self.allow_simple_key();
@ -929,7 +875,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
need_whitespace = false; need_whitespace = false;
} }
'#' => { '#' => {
while !is_breakz(self.look_ch()) { while !is_breakz(self.input.look_ch()) {
self.skip_non_blank(); self.skip_non_blank();
} }
} }
@ -949,7 +895,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut encountered_tab = false; let mut encountered_tab = false;
let mut has_yaml_ws = false; let mut has_yaml_ws = false;
loop { loop {
match self.look_ch() { match self.input.look_ch() {
' ' => { ' ' => {
has_yaml_ws = true; has_yaml_ws = true;
self.skip_blank(); self.skip_blank();
@ -966,7 +912,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
)); ));
} }
'#' => { '#' => {
while !is_breakz(self.look_ch()) { while !is_breakz(self.input.look_ch()) {
self.skip_non_blank(); self.skip_non_blank();
} }
} }
@ -1035,7 +981,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// XXX This should be a warning instead of an error // XXX This should be a warning instead of an error
_ => { _ => {
// skip current line // skip current line
while !is_breakz(self.look_ch()) { while !is_breakz(self.input.look_ch()) {
self.skip_non_blank(); self.skip_non_blank();
} }
// XXX return an empty TagDirective token // XXX return an empty TagDirective token
@ -1051,7 +997,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_ws_to_eol(SkipTabs::Yes)?; self.skip_ws_to_eol(SkipTabs::Yes)?;
if is_breakz(self.ch()) { if is_breakz(self.ch()) {
self.lookahead(2); self.input.lookahead(2);
self.skip_linebreak(); self.skip_linebreak();
Ok(tok) Ok(tok)
} else { } else {
@ -1063,7 +1009,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> { fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
while is_blank(self.look_ch()) { while is_blank(self.input.look_ch()) {
self.skip_blank(); self.skip_blank();
} }
@ -1085,7 +1031,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_directive_name(&mut self) -> Result<String, ScanError> { fn scan_directive_name(&mut self) -> Result<String, ScanError> {
let start_mark = self.mark; let start_mark = self.mark;
let mut string = String::new(); let mut string = String::new();
while is_alpha(self.look_ch()) { while is_alpha(self.input.look_ch()) {
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
} }
@ -1110,7 +1056,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> { fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
let mut val = 0u32; let mut val = 0u32;
let mut length = 0usize; let mut length = 0usize;
while let Some(digit) = self.look_ch().to_digit(10) { while let Some(digit) = self.input.look_ch().to_digit(10) {
if length + 1 > 9 { if length + 1 > 9 {
return Err(ScanError::new_str( return Err(ScanError::new_str(
*mark, *mark,
@ -1134,19 +1080,19 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> { fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
/* Eat whitespaces. */ /* Eat whitespaces. */
while is_blank(self.look_ch()) { while is_blank(self.input.look_ch()) {
self.skip_blank(); self.skip_blank();
} }
let handle = self.scan_tag_handle(true, mark)?; let handle = self.scan_tag_handle(true, mark)?;
/* Eat whitespaces. */ /* Eat whitespaces. */
while is_blank(self.look_ch()) { while is_blank(self.input.look_ch()) {
self.skip_blank(); self.skip_blank();
} }
let prefix = self.scan_tag_prefix(mark)?; let prefix = self.scan_tag_prefix(mark)?;
self.lookahead(1); self.input.lookahead(1);
if is_blank_or_breakz(self.ch()) { if is_blank_or_breakz(self.ch()) {
Ok(Token(*mark, TokenType::TagDirective(handle, prefix))) Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
@ -1173,9 +1119,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut suffix; let mut suffix;
// Check if the tag is in the canonical form (verbatim). // Check if the tag is in the canonical form (verbatim).
self.lookahead(2); self.input.lookahead(2);
if self.buffer[1] == '<' { if self.input.nth_char_is(1, '<') {
suffix = self.scan_verbatim_tag(&start_mark)?; suffix = self.scan_verbatim_tag(&start_mark)?;
} else { } else {
// The tag has either the '!suffix' or the '!handle!suffix' // The tag has either the '!suffix' or the '!handle!suffix'
@ -1198,7 +1144,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
} }
if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) { if is_blank_or_breakz(self.input.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
// XXX: ex 7.2, an empty scalar can follow a secondary tag // XXX: ex 7.2, an empty scalar can follow a secondary tag
Ok(Token(start_mark, TokenType::Tag(handle, suffix))) Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
} else { } else {
@ -1211,7 +1157,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> { fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new(); let mut string = String::new();
if self.look_ch() != '!' { if self.input.look_ch() != '!' {
return Err(ScanError::new_str( return Err(ScanError::new_str(
*mark, *mark,
"while scanning a tag, did not find expected '!'", "while scanning a tag, did not find expected '!'",
@ -1221,7 +1167,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
while is_alpha(self.look_ch()) { while is_alpha(self.input.look_ch()) {
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
} }
@ -1250,7 +1196,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> { fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new(); let mut string = String::new();
if self.look_ch() == '!' { if self.input.look_ch() == '!' {
// If we have a local tag, insert and skip `!`. // If we have a local tag, insert and skip `!`.
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
@ -1269,7 +1215,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank(); self.skip_non_blank();
} }
while is_uri_char(self.look_ch()) { while is_uri_char(self.input.look_ch()) {
if self.ch() == '%' { if self.ch() == '%' {
string.push(self.scan_uri_escapes(start_mark)?); string.push(self.scan_uri_escapes(start_mark)?);
} else { } else {
@ -1290,7 +1236,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank(); self.skip_non_blank();
let mut string = String::new(); let mut string = String::new();
while is_uri_char(self.look_ch()) { while is_uri_char(self.input.look_ch()) {
if self.ch() == '%' { if self.ch() == '%' {
string.push(self.scan_uri_escapes(start_mark)?); string.push(self.scan_uri_escapes(start_mark)?);
} else { } else {
@ -1326,7 +1272,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.extend(head.chars().skip(1)); string.extend(head.chars().skip(1));
} }
while is_tag_char(self.look_ch()) { while is_tag_char(self.input.look_ch()) {
// Check if it is a URI-escape sequence. // Check if it is a URI-escape sequence.
if self.ch() == '%' { if self.ch() == '%' {
string.push(self.scan_uri_escapes(mark)?); string.push(self.scan_uri_escapes(mark)?);
@ -1352,38 +1298,41 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut width = 0usize; let mut width = 0usize;
let mut code = 0u32; let mut code = 0u32;
loop { loop {
self.lookahead(3); self.input.lookahead(3);
if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) { let c = self.input.peek_nth(1);
let nc = self.input.peek_nth(2);
if !(self.ch() == '%' && is_hex(c) && is_hex(nc)) {
return Err(ScanError::new_str( return Err(ScanError::new_str(
*mark, *mark,
"while parsing a tag, did not find URI escaped octet", "while parsing a tag, found an invalid escape sequence",
)); ));
} }
let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]); let byte = (as_hex(c) << 4) + as_hex(nc);
if width == 0 { if width == 0 {
width = match octet { width = match byte {
_ if octet & 0x80 == 0x00 => 1, _ if byte & 0x80 == 0x00 => 1,
_ if octet & 0xE0 == 0xC0 => 2, _ if byte & 0xE0 == 0xC0 => 2,
_ if octet & 0xF0 == 0xE0 => 3, _ if byte & 0xF0 == 0xE0 => 3,
_ if octet & 0xF8 == 0xF0 => 4, _ if byte & 0xF8 == 0xF0 => 4,
_ => { _ => {
return Err(ScanError::new_str( return Err(ScanError::new_str(
*mark, *mark,
"while parsing a tag, found an incorrect leading UTF-8 octet", "while parsing a tag, found an incorrect leading UTF-8 byte",
)); ));
} }
}; };
code = octet; code = byte;
} else { } else {
if octet & 0xc0 != 0x80 { if byte & 0xc0 != 0x80 {
return Err(ScanError::new_str( return Err(ScanError::new_str(
*mark, *mark,
"while parsing a tag, found an incorrect trailing UTF-8 octet", "while parsing a tag, found an incorrect trailing UTF-8 byte",
)); ));
} }
code = (code << 8) + octet; code = (code << 8) + byte;
} }
self.skip_n_non_blank(3); self.skip_n_non_blank(3);
@ -1419,7 +1368,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let start_mark = self.mark; let start_mark = self.mark;
self.skip_non_blank(); self.skip_non_blank();
while is_anchor_char(self.look_ch()) { while is_anchor_char(self.input.look_ch()) {
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
} }
@ -1556,8 +1505,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// generate BLOCK-SEQUENCE-START if indented // generate BLOCK-SEQUENCE-START if indented
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark); self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs(); let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
self.lookahead(2); self.input.lookahead(2);
if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) { if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
{
return Err(ScanError::new_str( return Err(ScanError::new_str(
self.mark, self.mark,
"'-' must be followed by a valid YAML whitespace", "'-' must be followed by a valid YAML whitespace",
@ -1565,7 +1515,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
self.skip_ws_to_eol(SkipTabs::No)?; self.skip_ws_to_eol(SkipTabs::No)?;
if is_break(self.look_ch()) || is_flow(self.ch()) { if is_break(self.input.look_ch()) || is_flow(self.ch()) {
self.roll_one_col_indent(); self.roll_one_col_indent();
} }
@ -1623,14 +1573,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank(); self.skip_non_blank();
self.unroll_non_block_indents(); self.unroll_non_block_indents();
if self.look_ch() == '+' || self.ch() == '-' { if self.input.look_ch() == '+' || self.ch() == '-' {
if self.ch() == '+' { if self.ch() == '+' {
chomping = Chomping::Keep; chomping = Chomping::Keep;
} else { } else {
chomping = Chomping::Strip; chomping = Chomping::Strip;
} }
self.skip_non_blank(); self.skip_non_blank();
if is_digit(self.look_ch()) { if is_digit(self.input.look_ch()) {
if self.ch() == '0' { if self.ch() == '0' {
return Err(ScanError::new_str( return Err(ScanError::new_str(
start_mark, start_mark,
@ -1650,7 +1600,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
increment = (self.ch() as usize) - ('0' as usize); increment = (self.ch() as usize) - ('0' as usize);
self.skip_non_blank(); self.skip_non_blank();
self.lookahead(1); self.input.lookahead(1);
if self.ch() == '+' || self.ch() == '-' { if self.ch() == '+' || self.ch() == '-' {
if self.ch() == '+' { if self.ch() == '+' {
chomping = Chomping::Keep; chomping = Chomping::Keep;
@ -1664,7 +1614,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_ws_to_eol(SkipTabs::Yes)?; self.skip_ws_to_eol(SkipTabs::Yes)?;
// Check if we are at the end of the line. // Check if we are at the end of the line.
if !is_breakz(self.look_ch()) { if !is_breakz(self.input.look_ch()) {
return Err(ScanError::new_str( return Err(ScanError::new_str(
start_mark, start_mark,
"while scanning a block scalar, did not find expected comment or line break", "while scanning a block scalar, did not find expected comment or line break",
@ -1672,11 +1622,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
if is_break(self.ch()) { if is_break(self.ch()) {
self.lookahead(2); self.input.lookahead(2);
self.read_break(&mut chomping_break); self.read_break(&mut chomping_break);
} }
if self.look_ch() == '\t' { if self.input.look_ch() == '\t' {
return Err(ScanError::new_str( return Err(ScanError::new_str(
start_mark, start_mark,
"a block scalar content cannot start with a tab", "a block scalar content cannot start with a tab",
@ -1731,7 +1681,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let start_mark = self.mark; let start_mark = self.mark;
while self.mark.col == indent && !is_z(self.ch()) { while self.mark.col == indent && !is_z(self.ch()) {
if indent == 0 { if indent == 0 {
self.lookahead(4); self.input.lookahead(4);
if self.next_is_document_end() { if self.next_is_document_end() {
break; break;
} }
@ -1761,7 +1711,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
break; break;
} }
self.lookahead(2); self.input.lookahead(2);
self.read_break(&mut leading_break); self.read_break(&mut leading_break);
// Eat the following indentation spaces and line breaks. // Eat the following indentation spaces and line breaks.
@ -1797,7 +1747,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// line. This function does not consume the line break character(s) after the line. /// line. This function does not consume the line break character(s) after the line.
fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) { fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
// Start by evaluating characters in the buffer. // Start by evaluating characters in the buffer.
while !self.buffer.is_empty() && !is_breakz(self.ch()) { while !self.input.buf_is_empty() && !is_breakz(self.ch()) {
string.push(self.ch()); string.push(self.ch());
// We may technically skip non-blank characters. However, the only distinction is // We may technically skip non-blank characters. However, the only distinction is
// to determine what is leading whitespace and what is not. Here, we read the // to determine what is leading whitespace and what is not. Here, we read the
@ -1809,7 +1759,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// All characters that were in the buffer were consumed. We need to check if more // All characters that were in the buffer were consumed. We need to check if more
// follow. // follow.
if self.buffer.is_empty() { if self.input.buf_is_empty() {
// We will read all consecutive non-breakz characters. We push them into a // We will read all consecutive non-breakz characters. We push them into a
// temporary buffer. The main difference with going through `self.buffer` is that // temporary buffer. The main difference with going through `self.buffer` is that
// characters are appended here as their real size (1B for ascii, or up to 4 bytes for // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
@ -1824,7 +1774,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// Our last character read is stored in `c`. It is either an EOF or a break. In any // Our last character read is stored in `c`. It is either an EOF or a break. In any
// case, we need to push it back into `self.buffer` so it may be properly read // case, we need to push it back into `self.buffer` so it may be properly read
// after. We must not insert it in `string`. // after. We must not insert it in `string`.
self.buffer.push_back(c).unwrap(); self.input.push_back(c);
// We need to manually update our position; we haven't called a `skip` function. // We need to manually update our position; we haven't called a `skip` function.
self.mark.col += line_buffer.len(); self.mark.col += line_buffer.len();
@ -1842,25 +1792,25 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) { fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
loop { loop {
// Consume all spaces. Tabs cannot be used as indentation. // Consume all spaces. Tabs cannot be used as indentation.
if indent < BUFFER_LEN - 2 { if indent < self.input.bufmaxlen() - 2 {
self.lookahead(BUFFER_LEN); self.input.lookahead(self.input.bufmaxlen());
while self.mark.col < indent && self.ch() == ' ' { while self.mark.col < indent && self.ch() == ' ' {
self.skip_blank(); self.skip_blank();
} }
} else { } else {
loop { loop {
self.lookahead(BUFFER_LEN); self.input.lookahead(self.input.bufmaxlen());
while !self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ' { while !self.input.buf_is_empty() && self.mark.col < indent && self.ch() == ' ' {
self.skip_blank(); self.skip_blank();
} }
// If we reached our indent, we can break. We must also break if we have // If we reached our indent, we can break. We must also break if we have
// reached content or EOF; that is, the buffer is not empty and the next // reached content or EOF; that is, the buffer is not empty and the next
// character is not a space. // character is not a space.
if self.mark.col == indent || (!self.buffer.is_empty() && self.ch() != ' ') { if self.mark.col == indent || (!self.input.buf_is_empty() && self.ch() != ' ') {
break; break;
} }
} }
self.lookahead(2); self.input.lookahead(2);
} }
// If our current line is empty, skip over the break and continue looping. // If our current line is empty, skip over the break and continue looping.
@ -1881,7 +1831,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut max_indent = 0; let mut max_indent = 0;
loop { loop {
// Consume all spaces. Tabs cannot be used as indentation. // Consume all spaces. Tabs cannot be used as indentation.
while self.look_ch() == ' ' { while self.input.look_ch() == ' ' {
self.skip_blank(); self.skip_blank();
} }
@ -1891,7 +1841,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
if is_break(self.ch()) { if is_break(self.ch()) {
// If our current line is empty, skip over the break and continue looping. // If our current line is empty, skip over the break and continue looping.
self.lookahead(2); self.input.lookahead(2);
self.read_break(breaks); self.read_break(breaks);
} else { } else {
// Otherwise, we have a content line. Return control. // Otherwise, we have a content line. Return control.
@ -1943,15 +1893,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
loop { loop {
/* Check for a document indicator. */ /* Check for a document indicator. */
self.lookahead(4); self.input.lookahead(4);
if self.mark.col == 0 if self.mark.col == 0 && self.next_is_document_indicator() {
&& (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
|| ((self.buffer[0] == '.')
&& (self.buffer[1] == '.')
&& (self.buffer[2] == '.')))
&& is_blank_or_breakz(self.buffer[3])
{
return Err(ScanError::new_str( return Err(ScanError::new_str(
start_mark, start_mark,
"while scanning a quoted scalar, found unexpected document indicator", "while scanning a quoted scalar, found unexpected document indicator",
@ -1980,7 +1924,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
&start_mark, &start_mark,
)?; )?;
match self.look_ch() { match self.input.look_ch() {
'\'' if single => break, '\'' if single => break,
'"' if !single => break, '"' if !single => break,
_ => {} _ => {}
@ -2003,7 +1947,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_blank(); self.skip_blank();
} }
} else { } else {
self.lookahead(2); self.input.lookahead(2);
// Check if it is a first line break. // Check if it is a first line break.
if leading_blanks { if leading_blanks {
self.read_break(&mut trailing_breaks); self.read_break(&mut trailing_breaks);
@ -2013,7 +1957,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
leading_blanks = true; leading_blanks = true;
} }
} }
self.lookahead(1); self.input.lookahead(1);
} }
// Join the whitespaces or fold line breaks. // Join the whitespaces or fold line breaks.
@ -2083,11 +2027,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
leading_blanks: &mut bool, leading_blanks: &mut bool,
start_mark: &Marker, start_mark: &Marker,
) -> Result<(), ScanError> { ) -> Result<(), ScanError> {
self.lookahead(2); self.input.lookahead(2);
while !is_blank_or_breakz(self.ch()) { while !is_blank_or_breakz(self.ch()) {
match self.ch() { match self.ch() {
// Check for an escaped single quote. // Check for an escaped single quote.
'\'' if self.buffer[1] == '\'' && single => { '\'' if self.input.peek_nth(1) == '\'' && single => {
string.push('\''); string.push('\'');
self.skip_n_non_blank(2); self.skip_n_non_blank(2);
} }
@ -2095,8 +2039,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
'\'' if single => break, '\'' if single => break,
'"' if !single => break, '"' if !single => break,
// Check for an escaped line break. // Check for an escaped line break.
'\\' if !single && is_break(self.buffer[1]) => { '\\' if !single && is_break(self.input.peek_nth(1)) => {
self.lookahead(3); self.input.lookahead(3);
self.skip_non_blank(); self.skip_non_blank();
self.skip_linebreak(); self.skip_linebreak();
*leading_blanks = true; *leading_blanks = true;
@ -2111,7 +2055,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank(); self.skip_non_blank();
} }
} }
self.lookahead(2); self.input.lookahead(2);
} }
Ok(()) Ok(())
} }
@ -2129,7 +2073,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut code_length = 0usize; let mut code_length = 0usize;
let mut ret = '\0'; let mut ret = '\0';
match self.buffer[1] { match self.input.peek_nth(1) {
'0' => ret = '\0', '0' => ret = '\0',
'a' => ret = '\x07', 'a' => ret = '\x07',
'b' => ret = '\x08', 'b' => ret = '\x08',
@ -2165,16 +2109,17 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// Consume an arbitrary escape code. // Consume an arbitrary escape code.
if code_length > 0 { if code_length > 0 {
self.lookahead(code_length); self.input.lookahead(code_length);
let mut value = 0u32; let mut value = 0u32;
for i in 0..code_length { for i in 0..code_length {
if !is_hex(self.buffer[i]) { let c = self.input.peek_nth(i);
if !is_hex(c) {
return Err(ScanError::new_str( return Err(ScanError::new_str(
*start_mark, *start_mark,
"while parsing a quoted scalar, did not find expected hexadecimal number", "while parsing a quoted scalar, did not find expected hexadecimal number",
)); ));
} }
value = (value << 4) + as_hex(self.buffer[i]); value = (value << 4) + as_hex(c);
} }
let Some(ch) = char::from_u32(value) else { let Some(ch) = char::from_u32(value) else {
@ -2223,12 +2168,12 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut whitespaces = String::with_capacity(32); let mut whitespaces = String::with_capacity(32);
loop { loop {
self.lookahead(4); self.input.lookahead(4);
if self.next_is_document_indicator() || self.ch() == '#' { if self.next_is_document_indicator() || self.ch() == '#' {
break; break;
} }
if self.flow_level > 0 && self.ch() == '-' && is_flow(self.buffer[1]) { if self.flow_level > 0 && self.ch() == '-' && is_flow(self.input.peek_nth(1)) {
return Err(ScanError::new_str( return Err(ScanError::new_str(
self.mark, self.mark,
"plain scalar cannot start with '-' followed by ,[]{}", "plain scalar cannot start with '-' followed by ,[]{}",
@ -2260,7 +2205,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// We can unroll the first iteration of the loop. // We can unroll the first iteration of the loop.
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
self.lookahead(2); self.input.lookahead(2);
// Add content non-blank characters to the scalar. // Add content non-blank characters to the scalar.
while !is_blank_or_breakz(self.ch()) { while !is_blank_or_breakz(self.ch()) {
@ -2270,7 +2215,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
self.lookahead(2); self.input.lookahead(2);
} }
} }
@ -2283,7 +2228,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
// Process blank characters. // Process blank characters.
while is_blank(self.look_ch()) || is_break(self.ch()) { while is_blank(self.input.look_ch()) || is_break(self.ch()) {
if is_blank(self.ch()) { if is_blank(self.ch()) {
if !self.leading_whitespace { if !self.leading_whitespace {
whitespaces.push(self.ch()); whitespaces.push(self.ch());
@ -2302,7 +2247,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_blank(); self.skip_blank();
} }
} else { } else {
self.lookahead(2); self.input.lookahead(2);
// Check if it is a first line break // Check if it is a first line break
if self.leading_whitespace { if self.leading_whitespace {
self.read_break(&mut trailing_breaks); self.read_break(&mut trailing_breaks);
@ -2379,7 +2324,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// [`self.flow_level`]: Self::flow_level /// [`self.flow_level`]: Self::flow_level
/// [`fetch_value`]: Self::fetch_value /// [`fetch_value`]: Self::fetch_value
fn fetch_flow_value(&mut self) -> ScanResult { fn fetch_flow_value(&mut self) -> ScanResult {
let nc = self.buffer[1]; let nc = self.input.peek_nth(1);
// If we encounter a ':' inside a flow collection and it is not immediately // If we encounter a ':' inside a flow collection and it is not immediately
// followed by a blank or breakz: // followed by a blank or breakz:
@ -2413,7 +2358,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// Skip over ':'. // Skip over ':'.
self.skip_non_blank(); self.skip_non_blank();
if self.look_ch() == '\t' if self.input.look_ch() == '\t'
&& !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws() && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
&& (self.ch() == '-' || is_alpha(self.ch())) && (self.ch() == '-' || is_alpha(self.ch()))
{ {
@ -2600,8 +2545,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn next_can_be_plain_scalar(&self) -> bool { fn next_can_be_plain_scalar(&self) -> bool {
match self.ch() { match self.ch() {
// indicators can end a plain scalar, see 7.3.3. Plain Style // indicators can end a plain scalar, see 7.3.3. Plain Style
':' if is_blank_or_breakz(self.buffer[1]) ':' if is_blank_or_breakz(self.input.peek_nth(1))
|| (self.flow_level > 0 && is_flow(self.buffer[1])) => || (self.flow_level > 0 && is_flow(self.input.peek_nth(1))) =>
{ {
false false
} }

View file

@ -231,7 +231,7 @@ a: |-
#[test] #[test]
fn test_bad_docstart() { fn test_bad_docstart() {
assert!(run_parser("---This used to cause an infinite loop").is_ok()); run_parser("---This used to cause an infinite loop").unwrap();
assert_eq!( assert_eq!(
run_parser("----").unwrap(), run_parser("----").unwrap(),
[ [

View file

@ -2,11 +2,7 @@ use std::env;
use std::fs::File; use std::fs::File;
use std::io::prelude::*; use std::io::prelude::*;
use saphyr_parser::{ use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
#[derive(Debug)] #[derive(Debug)]
struct EventSink { struct EventSink {

View file

@ -1,12 +1,9 @@
use saphyr_parser::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
use std::env; use std::env;
use std::fs::File; use std::fs::File;
use std::io::prelude::*; use std::io::prelude::*;
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
/// A sink which discards any event sent. /// A sink which discards any event sent.
struct NullSink {} struct NullSink {}