Add Input interface.

Hiding character fetching behind this interface allows us to create more
specific implementations when appropriate. For instance, an instance
of `Input` can be created for a `&str`, allowing for borrowing and more
efficient peeking and traversing than if we were to fetch characters one
at a time and place them into a temporary buffer.
This commit is contained in:
Ethiraric 2024-04-18 17:48:49 +02:00
parent 11cffc6df8
commit d9bb7a1693
8 changed files with 384 additions and 229 deletions

View file

@ -0,0 +1,99 @@
use crate::input::Input;
use arraydeque::ArrayDeque;
/// The size of the [`BufferedInput`] buffer.
///
/// The buffer is statically allocated to avoid conditions for reallocations each time we
/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except:
///   - Escape sequences parsing: some escape codes are 8 characters
///   - Scanning indent in scalars: this looks ahead `indent + 2` characters
///
/// This constant must be set to at least 8. The scanner reads this value through
/// `Input::bufmaxlen`. When scanning indent in scalars, the lookahead is done in a single call if
/// and only if the indent is strictly less than `BUFFER_LEN - 2`. If the indent is higher than
/// that, the code will fall back to a loop of lookaheads.
const BUFFER_LEN: usize = 16;
/// A wrapper around an [`Iterator`] of [`char`]s with a buffer.
///
/// The YAML scanner often needs some lookahead. With fully allocated buffers such as `String` or
/// `&str`, this is not an issue. However, with streams, we need to have a way of peeking multiple
/// characters at a time and sometimes pushing some back into the stream.
/// There is no "easy" way of doing this without itertools. In order to avoid pulling the entirety
/// of itertools for one method, we use this structure.
///
/// `Debug` is derived so that types which derive `Debug` themselves and are generic over their
/// input (such as the parser and the scanner) remain debug-printable when they are instantiated
/// with a [`BufferedInput`].
#[derive(Debug)]
pub struct BufferedInput<T: Iterator<Item = char>> {
    /// The iterator source.
    input: T,
    /// Buffer for the next characters to consume.
    buffer: ArrayDeque<char, BUFFER_LEN>,
}
impl<T: Iterator<Item = char>> BufferedInput<T> {
    /// Wrap the given character iterator into a [`BufferedInput`].
    ///
    /// No character is pulled from `input` here; the lookahead buffer starts out empty.
    pub fn new(input: T) -> Self {
        let buffer = ArrayDeque::new();
        Self { input, buffer }
    }
}
impl<T: Iterator<Item = char>> Input for BufferedInput<T> {
    /// Make sure at least `count` characters are available in the buffer.
    ///
    /// Characters are pulled from the underlying iterator one at a time. Once the iterator is
    /// exhausted, the buffer is padded with `\0`.
    ///
    /// # Panics
    /// Panics if satisfying the request would push more characters than the buffer can hold.
    #[inline]
    fn lookahead(&mut self, count: usize) {
        // Number of characters still missing; zero if the buffer already holds enough.
        let missing = count.saturating_sub(self.buffer.len());
        for _ in 0..missing {
            self.buffer
                .push_back(self.input.next().unwrap_or('\0'))
                .expect("lookahead must not request more characters than the buffer can hold");
        }
    }

    /// Return the number of characters currently held in the buffer.
    #[inline]
    fn buflen(&self) -> usize {
        self.buffer.len()
    }

    /// Return the capacity of the buffer, which is [`BUFFER_LEN`].
    #[inline]
    fn bufmaxlen(&self) -> usize {
        BUFFER_LEN
    }

    /// Read one character straight from the underlying iterator, bypassing the buffer.
    ///
    /// Returns `\0` once the iterator is exhausted.
    #[inline]
    fn raw_read_ch(&mut self) -> char {
        self.input.next().unwrap_or('\0')
    }

    /// Append `c` at the back of the buffer.
    ///
    /// # Panics
    /// Panics if the buffer is already full.
    #[inline]
    fn push_back(&mut self, c: char) {
        self.buffer
            .push_back(c)
            .expect("push_back must not be called on a full buffer");
    }

    /// Drop the first buffered character, if any.
    #[inline]
    fn skip(&mut self) {
        self.buffer.pop_front();
    }

    /// Drop the first `count` buffered characters.
    ///
    /// `count` must not exceed the number of buffered characters.
    #[inline]
    fn skip_n(&mut self, count: usize) {
        self.buffer.drain(0..count);
    }

    /// Return the first buffered character without consuming it.
    ///
    /// The character must have been loaded by a prior call to `lookahead`.
    #[inline]
    fn peek(&self) -> char {
        self.buffer[0]
    }

    /// Return the `n`-th buffered character (0-based) without consuming it.
    ///
    /// The character must have been loaded by a prior call to `lookahead`.
    #[inline]
    fn peek_nth(&self, n: usize) -> char {
        self.buffer[n]
    }

    /// Return whether the buffered characters start with `pat`.
    ///
    /// # Panics
    /// Panics if fewer characters than `pat` contains are buffered.
    #[inline]
    fn next_is(&self, pat: &str) -> bool {
        // The buffer stores `char`s, so the precondition must be expressed in characters:
        // `pat.len()` would count UTF-8 bytes and over-estimate for non-ASCII patterns.
        assert!(self.buffer.len() >= pat.chars().count());
        self.buffer.iter().zip(pat.chars()).all(|(a, b)| *a == b)
    }
}

111
parser/src/input.rs Normal file
View file

@ -0,0 +1,111 @@
/// Interface for a source of characters.
///
/// Hiding the input's implementation behind this trait allows mostly:
///  * For input-specific optimizations (for instance, using `str` methods instead of manually
///    transferring one `char` at a time to a buffer).
///  * To return `&str`s referencing the input string, thus avoiding potentially costly
///    allocations. Should users need an owned version of the data, they can always `.to_owned()`
///    their YAML object.
pub trait Input {
    /// A hint to the input source that we will need to read `count` characters.
    ///
    /// If the input is exhausted, `\0` can be used to pad the last characters and later returned.
    /// The characters must not be consumed, but may be placed in an internal buffer.
    ///
    /// This method may be a no-op if buffering yields no performance improvement.
    ///
    /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The
    /// parser tracks how many characters are loaded in the buffer and acts accordingly.
    fn lookahead(&mut self, count: usize);

    /// Return the number of buffered characters in `self`.
    #[must_use]
    fn buflen(&self) -> usize;

    /// Return the capacity of the buffer in `self`.
    #[must_use]
    fn bufmaxlen(&self) -> usize;

    /// Return whether the buffer (!= stream) is empty.
    #[inline]
    #[must_use]
    fn buf_is_empty(&self) -> bool {
        self.buflen() == 0
    }

    /// Read a character from the input stream and return it directly.
    ///
    /// The internal buffer (if any) is bypassed.
    #[must_use]
    fn raw_read_ch(&mut self) -> char;

    /// Put a character back in the buffer.
    ///
    /// This function is only called when we read one too many characters and the pushed back
    /// character is exactly the last character that was read. This function will not be called
    /// multiple times consecutively.
    fn push_back(&mut self, c: char);

    /// Consume the next character.
    fn skip(&mut self);

    /// Consume the next `count` characters.
    fn skip_n(&mut self, count: usize);

    /// Return the next character, without consuming it.
    ///
    /// Users of the [`Input`] must make sure that the character has been loaded through a prior
    /// call to [`Input::lookahead`]. Implementors of [`Input`] may assume that a valid call to
    /// [`Input::lookahead`] has been made beforehand.
    ///
    /// # Return
    /// If the input source is not exhausted, returns the next character to be fed into the
    /// scanner. Otherwise, returns `\0`.
    #[must_use]
    fn peek(&self) -> char;

    /// Return the `n`-th character in the buffer, without consuming it.
    ///
    /// This function assumes that the n-th character in the input has already been fetched through
    /// [`Input::lookahead`].
    #[must_use]
    fn peek_nth(&self, n: usize) -> char;

    /// Look for the next character and return it.
    ///
    /// The character is not consumed.
    /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`].
    #[inline]
    #[must_use]
    fn look_ch(&mut self) -> char {
        self.lookahead(1);
        self.peek()
    }

    /// Return whether the next character in the input source is equal to `c`.
    ///
    /// This function assumes that the next character in the input has already been fetched through
    /// [`Input::lookahead`].
    #[inline]
    #[must_use]
    fn next_char_is(&self, c: char) -> bool {
        self.peek() == c
    }

    /// Return whether the `n`-th character in the input source is equal to `c`.
    ///
    /// This function assumes that the n-th character in the input has already been fetched through
    /// [`Input::lookahead`].
    #[inline]
    #[must_use]
    fn nth_char_is(&self, n: usize, c: char) -> bool {
        self.peek_nth(n) == c
    }

    /// Return whether the next characters in the input source match the given pattern.
    ///
    /// This function assumes that as many characters as `pat` contains have already been fetched
    /// through [`Input::lookahead`].
    ///
    /// The default implementation compares `pat` one character at a time through
    /// [`Input::peek_nth`]. Implementors may override it when the input allows for a more
    /// efficient comparison.
    #[inline]
    #[must_use]
    fn next_is(&self, pat: &str) -> bool {
        pat.chars()
            .enumerate()
            .all(|(i, c)| self.peek_nth(i) == c)
    }
}

View file

@ -32,11 +32,14 @@
#![warn(missing_docs, clippy::pedantic)]
pub(crate) mod char_traits;
mod buffered_input;
mod char_traits;
#[macro_use]
pub(crate) mod debug;
pub mod parser;
pub mod scanner;
mod debug;
mod input;
mod parser;
mod scanner;
pub use crate::buffered_input::BufferedInput;
pub use crate::parser::{Event, EventReceiver, MarkedEventReceiver, Parser, Tag};
pub use crate::scanner::{Marker, ScanError, TScalarStyle};

View file

@ -4,7 +4,11 @@
//! compliance, and emits a stream of YAML events. This stream can for instance be used to create
//! YAML objects.
use crate::scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType};
use crate::{
input::Input,
scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
BufferedInput,
};
use std::collections::HashMap;
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
@ -100,7 +104,7 @@ impl Event {
/// A YAML parser.
#[derive(Debug)]
pub struct Parser<T> {
pub struct Parser<T: Input> {
/// The underlying scanner from which we pull tokens.
scanner: Scanner<T>,
/// The stack of _previous_ states we were in.
@ -225,15 +229,15 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
/// A convenience alias for a `Result` of a parser event.
pub type ParseResult = Result<(Event, Marker), ScanError>;
impl<'a> Parser<core::str::Chars<'a>> {
impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
/// Create a new instance of a parser from a &str.
#[must_use]
pub fn new_from_str(value: &'a str) -> Self {
Parser::new(value.chars())
Parser::new(BufferedInput::new(value.chars()))
}
}
impl<T: Iterator<Item = char>> Parser<T> {
impl<T: Input> Parser<T> {
/// Create a new instance of a parser from the given input of characters.
pub fn new(src: T) -> Parser<T> {
Parser {
@ -1130,7 +1134,7 @@ impl<T: Iterator<Item = char>> Parser<T> {
}
}
impl<T: Iterator<Item = char>> Iterator for Parser<T> {
impl<T: Input> Iterator for Parser<T> {
type Item = Result<(Event, Marker), ScanError>;
fn next(&mut self) -> Option<Self::Item> {

View file

@ -11,11 +11,12 @@
use std::{char, collections::VecDeque, error::Error, fmt};
use arraydeque::ArrayDeque;
use crate::char_traits::{
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit,
is_flow, is_hex, is_tag_char, is_uri_char, is_z,
use crate::{
char_traits::{
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
},
input::Input,
};
/// The encoding of the input. Currently, only UTF-8 is supported.
@ -343,18 +344,6 @@ enum ImplicitMappingState {
Inside,
}
/// The size of the [`Scanner`] buffer.
///
/// The buffer is statically allocated to avoid conditions for reallocations each time we
/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except:
/// - Escape sequences parsing: some escape codes are 8 characters
/// - Scanning indent in scalars: this looks ahead `indent + 2` characters
///
/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done
/// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher
/// than that, the code will fall back to a loop of lookaheads.
const BUFFER_LEN: usize = 16;
/// The YAML scanner.
///
/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
@ -367,8 +356,10 @@ const BUFFER_LEN: usize = 16;
#[derive(Debug)]
#[allow(clippy::struct_excessive_bools)]
pub struct Scanner<T> {
/// The reader, providing with characters.
rdr: T,
/// The input source.
///
/// This must implement [`Input`].
input: T,
/// The position of the cursor within the reader.
mark: Marker,
/// Buffer for tokens to be returned.
@ -378,8 +369,6 @@ pub struct Scanner<T> {
/// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
/// [`Self::next`] until we have more context.
tokens: VecDeque<Token>,
/// Buffer for the next characters to consume.
buffer: ArrayDeque<char, BUFFER_LEN>,
/// The last error that happened.
error: Option<ScanError>,
@ -435,7 +424,7 @@ pub struct Scanner<T> {
implicit_flow_mapping_states: Vec<ImplicitMappingState>,
}
impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
impl<T: Input> Iterator for Scanner<T> {
type Item = Token;
fn next(&mut self) -> Option<Token> {
if self.error.is_some() {
@ -462,12 +451,11 @@ impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
/// A convenience alias for scanner functions that may fail without returning a value.
pub type ScanResult = Result<(), ScanError>;
impl<T: Iterator<Item = char>> Scanner<T> {
impl<T: Input> Scanner<T> {
/// Creates the YAML tokenizer.
pub fn new(rdr: T) -> Scanner<T> {
pub fn new(input: T) -> Scanner<T> {
Scanner {
rdr,
buffer: ArrayDeque::new(),
input,
mark: Marker::new(0, 1, 0),
tokens: VecDeque::new(),
error: None,
@ -497,25 +485,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.error.clone()
}
/// Fill `self.buffer` with at least `count` characters.
///
/// The characters that are extracted this way are not consumed but only placed in the buffer.
#[inline]
fn lookahead(&mut self, count: usize) {
if self.buffer.len() >= count {
return;
}
for _ in 0..(count - self.buffer.len()) {
self.buffer
.push_back(self.rdr.next().unwrap_or('\0'))
.unwrap();
}
}
/// Consume the next character. It is assumed the next character is a blank.
#[inline]
fn skip_blank(&mut self) {
self.buffer.pop_front();
self.input.skip();
self.mark.index += 1;
self.mark.col += 1;
@ -524,7 +497,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Consume the next character. It is assumed the next character is not a blank.
#[inline]
fn skip_non_blank(&mut self) {
self.buffer.pop_front();
self.input.skip();
self.mark.index += 1;
self.mark.col += 1;
@ -533,18 +506,18 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Consume the next characters. It is assumed none of the next characters are blanks.
#[inline]
fn skip_n_non_blank(&mut self, n: usize) {
self.buffer.drain(0..n);
fn skip_n_non_blank(&mut self, count: usize) {
self.input.skip_n(count);
self.mark.index += n;
self.mark.col += n;
self.mark.index += count;
self.mark.col += count;
self.leading_whitespace = false;
}
/// Consume the next character. It is assumed the next character is a newline.
#[inline]
fn skip_nl(&mut self) {
self.buffer.pop_front();
self.input.skip();
self.mark.index += 1;
self.mark.col = 0;
@ -555,12 +528,12 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
#[inline]
fn skip_linebreak(&mut self) {
if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
if self.input.next_is("\r\n") {
// While technically not a blank, this does not matter as `self.leading_whitespace`
// will be reset by `skip_nl`.
self.skip_blank();
self.skip_nl();
} else if is_break(self.buffer[0]) {
} else if is_break(self.input.peek()) {
self.skip_nl();
}
}
@ -570,32 +543,16 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// The character is not consumed.
#[inline]
fn ch(&self) -> char {
self.buffer[0]
}
/// Look for the next character and return it.
///
/// The character is not consumed.
/// Equivalent to calling [`Self::lookahead`] and [`Self::ch`].
#[inline]
fn look_ch(&mut self) -> char {
self.lookahead(1);
self.ch()
self.input.peek()
}
/// Read a character from the input stream, returning it directly.
///
/// The buffer is bypassed and `self.mark` needs to be updated manually.
/// The buffer (if any) is bypassed and `self.mark` needs to be updated manually.
#[inline]
#[must_use]
fn raw_read_ch(&mut self) -> char {
self.rdr.next().unwrap_or('\0')
}
/// Return whether the next character is `c`.
#[inline]
fn ch_is(&self, c: char) -> bool {
self.buffer[0] == c
self.input.raw_read_ch()
}
/// Return whether the [`TokenType::StreamStart`] event has been emitted.
@ -624,8 +581,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// If the next characters do not correspond to a line break.
#[inline]
fn read_break(&mut self, s: &mut String) {
let c = self.buffer[0];
let nc = self.buffer[1];
let c = self.input.peek();
let nc = self.input.peek_nth(1);
debug_assert!(is_break(c));
if c == '\r' && nc == '\n' {
self.skip_blank();
@ -635,15 +592,20 @@ impl<T: Iterator<Item = char>> Scanner<T> {
s.push('\n');
}
/// Check whether the next characters correspond to a start of document.
///
/// [`Self::lookahead`] must have been called before calling this function.
fn next_is_document_start(&self) -> bool {
assert!(self.input.buflen() >= 4);
self.input.next_is("---") && is_blank_or_breakz(self.input.peek_nth(3))
}
/// Check whether the next characters correspond to an end of document.
///
/// [`Self::lookahead`] must have been called before calling this function.
fn next_is_document_end(&self) -> bool {
assert!(self.buffer.len() >= 4);
self.buffer[0] == '.'
&& self.buffer[1] == '.'
&& self.buffer[2] == '.'
&& is_blank_or_breakz(self.buffer[3])
assert!(self.input.buflen() >= 4);
self.input.next_is("...") && is_blank_or_breakz(self.input.peek_nth(3))
}
/// Check whether the next characters correspond to a document indicator.
@ -651,11 +613,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// [`Self::lookahead`] must have been called before calling this function.
#[inline]
fn next_is_document_indicator(&self) -> bool {
assert!(self.buffer.len() >= 4);
self.mark.col == 0
&& (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
|| ((self.buffer[0] == '.') && (self.buffer[1] == '.') && (self.buffer[2] == '.')))
&& is_blank_or_breakz(self.buffer[3])
assert!(self.input.buflen() >= 4);
is_blank_or_breakz(self.input.peek_nth(3))
&& (self.input.next_is("...") || self.input.next_is("---"))
}
/// Insert a token at the given position.
@ -674,11 +634,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
/// Fetch the next token in the stream.
///
/// # Errors
/// Returns `ScanError` when the scanner does not find the next expected token.
pub fn fetch_next_token(&mut self) -> ScanResult {
self.lookahead(1);
// eprintln!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
self.input.lookahead(1);
if !self.stream_start_produced {
self.fetch_stream_start();
@ -697,34 +657,19 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mark = self.mark;
self.unroll_indent(mark.col as isize);
self.lookahead(4);
self.input.lookahead(4);
if is_z(self.ch()) {
self.fetch_stream_end()?;
return Ok(());
}
// Is it a directive?
if self.mark.col == 0 && self.ch_is('%') {
if self.mark.col == 0 {
if self.input.next_char_is('%') {
return self.fetch_directive();
}
if self.mark.col == 0
&& self.buffer[0] == '-'
&& self.buffer[1] == '-'
&& self.buffer[2] == '-'
&& is_blank_or_breakz(self.buffer[3])
{
self.fetch_document_indicator(TokenType::DocumentStart)?;
return Ok(());
}
if self.mark.col == 0
&& self.buffer[0] == '.'
&& self.buffer[1] == '.'
&& self.buffer[2] == '.'
&& is_blank_or_breakz(self.buffer[3])
{
} else if self.next_is_document_start() {
return self.fetch_document_indicator(TokenType::DocumentStart);
} else if self.next_is_document_end() {
self.fetch_document_indicator(TokenType::DocumentEnd)?;
self.skip_ws_to_eol(SkipTabs::Yes)?;
if !is_breakz(self.ch()) {
@ -735,13 +680,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
return Ok(());
}
}
if (self.mark.col as isize) < self.indent {
return Err(ScanError::new_str(self.mark, "invalid indentation"));
}
let c = self.buffer[0];
let nc = self.buffer[1];
let c = self.input.peek();
let nc = self.input.peek_nth(1);
match c {
'[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
'{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
@ -860,7 +806,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
Ok(())
}
/// Skip over all whitespace and comments until the next token.
/// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
///
/// # Errors
/// This function returns an error if a tabulation is encountered where there should not be
@ -868,7 +814,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn skip_to_next_token(&mut self) -> ScanResult {
loop {
// TODO(chenyh) BOM
match self.look_ch() {
match self.input.look_ch() {
// Tabs may not be used as indentation.
// "Indentation" only exists as long as a block is started, but does not exist
// inside of flow-style constructs. Tabs are allowed as part of leading
@ -890,14 +836,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
'\t' | ' ' => self.skip_blank(),
'\n' | '\r' => {
self.lookahead(2);
self.input.lookahead(2);
self.skip_linebreak();
if self.flow_level == 0 {
self.allow_simple_key();
}
}
'#' => {
while !is_breakz(self.look_ch()) {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
}
}
@ -914,14 +860,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn skip_yaml_whitespace(&mut self) -> ScanResult {
let mut need_whitespace = true;
loop {
match self.look_ch() {
match self.input.look_ch() {
' ' => {
self.skip_blank();
need_whitespace = false;
}
'\n' | '\r' => {
self.lookahead(2);
self.input.lookahead(2);
self.skip_linebreak();
if self.flow_level == 0 {
self.allow_simple_key();
@ -929,7 +875,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
need_whitespace = false;
}
'#' => {
while !is_breakz(self.look_ch()) {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
}
}
@ -949,7 +895,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut encountered_tab = false;
let mut has_yaml_ws = false;
loop {
match self.look_ch() {
match self.input.look_ch() {
' ' => {
has_yaml_ws = true;
self.skip_blank();
@ -966,7 +912,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
));
}
'#' => {
while !is_breakz(self.look_ch()) {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
}
}
@ -1035,7 +981,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// XXX This should be a warning instead of an error
_ => {
// skip current line
while !is_breakz(self.look_ch()) {
while !is_breakz(self.input.look_ch()) {
self.skip_non_blank();
}
// XXX return an empty TagDirective token
@ -1051,7 +997,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_ws_to_eol(SkipTabs::Yes)?;
if is_breakz(self.ch()) {
self.lookahead(2);
self.input.lookahead(2);
self.skip_linebreak();
Ok(tok)
} else {
@ -1063,7 +1009,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
while is_blank(self.look_ch()) {
while is_blank(self.input.look_ch()) {
self.skip_blank();
}
@ -1085,7 +1031,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_directive_name(&mut self) -> Result<String, ScanError> {
let start_mark = self.mark;
let mut string = String::new();
while is_alpha(self.look_ch()) {
while is_alpha(self.input.look_ch()) {
string.push(self.ch());
self.skip_non_blank();
}
@ -1110,7 +1056,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
let mut val = 0u32;
let mut length = 0usize;
while let Some(digit) = self.look_ch().to_digit(10) {
while let Some(digit) = self.input.look_ch().to_digit(10) {
if length + 1 > 9 {
return Err(ScanError::new_str(
*mark,
@ -1134,19 +1080,19 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
/* Eat whitespaces. */
while is_blank(self.look_ch()) {
while is_blank(self.input.look_ch()) {
self.skip_blank();
}
let handle = self.scan_tag_handle(true, mark)?;
/* Eat whitespaces. */
while is_blank(self.look_ch()) {
while is_blank(self.input.look_ch()) {
self.skip_blank();
}
let prefix = self.scan_tag_prefix(mark)?;
self.lookahead(1);
self.input.lookahead(1);
if is_blank_or_breakz(self.ch()) {
Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
@ -1173,9 +1119,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut suffix;
// Check if the tag is in the canonical form (verbatim).
self.lookahead(2);
self.input.lookahead(2);
if self.buffer[1] == '<' {
if self.input.nth_char_is(1, '<') {
suffix = self.scan_verbatim_tag(&start_mark)?;
} else {
// The tag has either the '!suffix' or the '!handle!suffix'
@ -1198,7 +1144,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
}
if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
if is_blank_or_breakz(self.input.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
// XXX: ex 7.2, an empty scalar can follow a secondary tag
Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
} else {
@ -1211,7 +1157,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new();
if self.look_ch() != '!' {
if self.input.look_ch() != '!' {
return Err(ScanError::new_str(
*mark,
"while scanning a tag, did not find expected '!'",
@ -1221,7 +1167,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.push(self.ch());
self.skip_non_blank();
while is_alpha(self.look_ch()) {
while is_alpha(self.input.look_ch()) {
string.push(self.ch());
self.skip_non_blank();
}
@ -1250,7 +1196,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new();
if self.look_ch() == '!' {
if self.input.look_ch() == '!' {
// If we have a local tag, insert and skip `!`.
string.push(self.ch());
self.skip_non_blank();
@ -1269,7 +1215,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank();
}
while is_uri_char(self.look_ch()) {
while is_uri_char(self.input.look_ch()) {
if self.ch() == '%' {
string.push(self.scan_uri_escapes(start_mark)?);
} else {
@ -1290,7 +1236,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank();
let mut string = String::new();
while is_uri_char(self.look_ch()) {
while is_uri_char(self.input.look_ch()) {
if self.ch() == '%' {
string.push(self.scan_uri_escapes(start_mark)?);
} else {
@ -1326,7 +1272,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.extend(head.chars().skip(1));
}
while is_tag_char(self.look_ch()) {
while is_tag_char(self.input.look_ch()) {
// Check if it is a URI-escape sequence.
if self.ch() == '%' {
string.push(self.scan_uri_escapes(mark)?);
@ -1352,38 +1298,41 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut width = 0usize;
let mut code = 0u32;
loop {
self.lookahead(3);
self.input.lookahead(3);
if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) {
let c = self.input.peek_nth(1);
let nc = self.input.peek_nth(2);
if !(self.ch() == '%' && is_hex(c) && is_hex(nc)) {
return Err(ScanError::new_str(
*mark,
"while parsing a tag, did not find URI escaped octet",
"while parsing a tag, found an invalid escape sequence",
));
}
let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]);
let byte = (as_hex(c) << 4) + as_hex(nc);
if width == 0 {
width = match octet {
_ if octet & 0x80 == 0x00 => 1,
_ if octet & 0xE0 == 0xC0 => 2,
_ if octet & 0xF0 == 0xE0 => 3,
_ if octet & 0xF8 == 0xF0 => 4,
width = match byte {
_ if byte & 0x80 == 0x00 => 1,
_ if byte & 0xE0 == 0xC0 => 2,
_ if byte & 0xF0 == 0xE0 => 3,
_ if byte & 0xF8 == 0xF0 => 4,
_ => {
return Err(ScanError::new_str(
*mark,
"while parsing a tag, found an incorrect leading UTF-8 octet",
"while parsing a tag, found an incorrect leading UTF-8 byte",
));
}
};
code = octet;
code = byte;
} else {
if octet & 0xc0 != 0x80 {
if byte & 0xc0 != 0x80 {
return Err(ScanError::new_str(
*mark,
"while parsing a tag, found an incorrect trailing UTF-8 octet",
"while parsing a tag, found an incorrect trailing UTF-8 byte",
));
}
code = (code << 8) + octet;
code = (code << 8) + byte;
}
self.skip_n_non_blank(3);
@ -1419,7 +1368,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let start_mark = self.mark;
self.skip_non_blank();
while is_anchor_char(self.look_ch()) {
while is_anchor_char(self.input.look_ch()) {
string.push(self.ch());
self.skip_non_blank();
}
@ -1556,8 +1505,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// generate BLOCK-SEQUENCE-START if indented
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
self.lookahead(2);
if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) {
self.input.lookahead(2);
if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
{
return Err(ScanError::new_str(
self.mark,
"'-' must be followed by a valid YAML whitespace",
@ -1565,7 +1515,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
self.skip_ws_to_eol(SkipTabs::No)?;
if is_break(self.look_ch()) || is_flow(self.ch()) {
if is_break(self.input.look_ch()) || is_flow(self.ch()) {
self.roll_one_col_indent();
}
@ -1623,14 +1573,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank();
self.unroll_non_block_indents();
if self.look_ch() == '+' || self.ch() == '-' {
if self.input.look_ch() == '+' || self.ch() == '-' {
if self.ch() == '+' {
chomping = Chomping::Keep;
} else {
chomping = Chomping::Strip;
}
self.skip_non_blank();
if is_digit(self.look_ch()) {
if is_digit(self.input.look_ch()) {
if self.ch() == '0' {
return Err(ScanError::new_str(
start_mark,
@ -1650,7 +1600,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
increment = (self.ch() as usize) - ('0' as usize);
self.skip_non_blank();
self.lookahead(1);
self.input.lookahead(1);
if self.ch() == '+' || self.ch() == '-' {
if self.ch() == '+' {
chomping = Chomping::Keep;
@ -1664,7 +1614,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_ws_to_eol(SkipTabs::Yes)?;
// Check if we are at the end of the line.
if !is_breakz(self.look_ch()) {
if !is_breakz(self.input.look_ch()) {
return Err(ScanError::new_str(
start_mark,
"while scanning a block scalar, did not find expected comment or line break",
@ -1672,11 +1622,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
if is_break(self.ch()) {
self.lookahead(2);
self.input.lookahead(2);
self.read_break(&mut chomping_break);
}
if self.look_ch() == '\t' {
if self.input.look_ch() == '\t' {
return Err(ScanError::new_str(
start_mark,
"a block scalar content cannot start with a tab",
@ -1731,7 +1681,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let start_mark = self.mark;
while self.mark.col == indent && !is_z(self.ch()) {
if indent == 0 {
self.lookahead(4);
self.input.lookahead(4);
if self.next_is_document_end() {
break;
}
@ -1761,7 +1711,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
break;
}
self.lookahead(2);
self.input.lookahead(2);
self.read_break(&mut leading_break);
// Eat the following indentation spaces and line breaks.
@ -1797,7 +1747,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// line. This function does not consume the line break character(s) after the line.
fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
// Start by evaluating characters in the buffer.
while !self.buffer.is_empty() && !is_breakz(self.ch()) {
while !self.input.buf_is_empty() && !is_breakz(self.ch()) {
string.push(self.ch());
// We may technically skip non-blank characters. However, the only distinction is
// to determine what is leading whitespace and what is not. Here, we read the
@ -1809,7 +1759,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// All characters that were in the buffer were consumed. We need to check if more
// follow.
if self.buffer.is_empty() {
if self.input.buf_is_empty() {
// We will read all consecutive non-breakz characters. We push them into a
// temporary buffer. The main difference with going through `self.buffer` is that
// characters are appended here as their real size (1B for ascii, or up to 4 bytes for
@ -1824,7 +1774,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// Our last character read is stored in `c`. It is either an EOF or a break. In any
// case, we need to push it back into `self.buffer` so it may be properly read
// after. We must not insert it in `string`.
self.buffer.push_back(c).unwrap();
self.input.push_back(c);
// We need to manually update our position; we haven't called a `skip` function.
self.mark.col += line_buffer.len();
@ -1842,25 +1792,25 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
loop {
// Consume all spaces. Tabs cannot be used as indentation.
if indent < BUFFER_LEN - 2 {
self.lookahead(BUFFER_LEN);
if indent < self.input.bufmaxlen() - 2 {
self.input.lookahead(self.input.bufmaxlen());
while self.mark.col < indent && self.ch() == ' ' {
self.skip_blank();
}
} else {
loop {
self.lookahead(BUFFER_LEN);
while !self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ' {
self.input.lookahead(self.input.bufmaxlen());
while !self.input.buf_is_empty() && self.mark.col < indent && self.ch() == ' ' {
self.skip_blank();
}
// If we reached our indent, we can break. We must also break if we have
// reached content or EOF; that is, the buffer is not empty and the next
// character is not a space.
if self.mark.col == indent || (!self.buffer.is_empty() && self.ch() != ' ') {
if self.mark.col == indent || (!self.input.buf_is_empty() && self.ch() != ' ') {
break;
}
}
self.lookahead(2);
self.input.lookahead(2);
}
// If our current line is empty, skip over the break and continue looping.
@ -1881,7 +1831,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut max_indent = 0;
loop {
// Consume all spaces. Tabs cannot be used as indentation.
while self.look_ch() == ' ' {
while self.input.look_ch() == ' ' {
self.skip_blank();
}
@ -1891,7 +1841,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
if is_break(self.ch()) {
// If our current line is empty, skip over the break and continue looping.
self.lookahead(2);
self.input.lookahead(2);
self.read_break(breaks);
} else {
// Otherwise, we have a content line. Return control.
@ -1943,15 +1893,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
loop {
/* Check for a document indicator. */
self.lookahead(4);
self.input.lookahead(4);
if self.mark.col == 0
&& (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
|| ((self.buffer[0] == '.')
&& (self.buffer[1] == '.')
&& (self.buffer[2] == '.')))
&& is_blank_or_breakz(self.buffer[3])
{
if self.mark.col == 0 && self.next_is_document_indicator() {
return Err(ScanError::new_str(
start_mark,
"while scanning a quoted scalar, found unexpected document indicator",
@ -1980,7 +1924,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
&start_mark,
)?;
match self.look_ch() {
match self.input.look_ch() {
'\'' if single => break,
'"' if !single => break,
_ => {}
@ -2003,7 +1947,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_blank();
}
} else {
self.lookahead(2);
self.input.lookahead(2);
// Check if it is a first line break.
if leading_blanks {
self.read_break(&mut trailing_breaks);
@ -2013,7 +1957,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
leading_blanks = true;
}
}
self.lookahead(1);
self.input.lookahead(1);
}
// Join the whitespaces or fold line breaks.
@ -2083,11 +2027,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
leading_blanks: &mut bool,
start_mark: &Marker,
) -> Result<(), ScanError> {
self.lookahead(2);
self.input.lookahead(2);
while !is_blank_or_breakz(self.ch()) {
match self.ch() {
// Check for an escaped single quote.
'\'' if self.buffer[1] == '\'' && single => {
'\'' if self.input.peek_nth(1) == '\'' && single => {
string.push('\'');
self.skip_n_non_blank(2);
}
@ -2095,8 +2039,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
'\'' if single => break,
'"' if !single => break,
// Check for an escaped line break.
'\\' if !single && is_break(self.buffer[1]) => {
self.lookahead(3);
'\\' if !single && is_break(self.input.peek_nth(1)) => {
self.input.lookahead(3);
self.skip_non_blank();
self.skip_linebreak();
*leading_blanks = true;
@ -2111,7 +2055,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_non_blank();
}
}
self.lookahead(2);
self.input.lookahead(2);
}
Ok(())
}
@ -2129,7 +2073,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut code_length = 0usize;
let mut ret = '\0';
match self.buffer[1] {
match self.input.peek_nth(1) {
'0' => ret = '\0',
'a' => ret = '\x07',
'b' => ret = '\x08',
@ -2165,16 +2109,17 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// Consume an arbitrary escape code.
if code_length > 0 {
self.lookahead(code_length);
self.input.lookahead(code_length);
let mut value = 0u32;
for i in 0..code_length {
if !is_hex(self.buffer[i]) {
let c = self.input.peek_nth(i);
if !is_hex(c) {
return Err(ScanError::new_str(
*start_mark,
"while parsing a quoted scalar, did not find expected hexadecimal number",
));
}
value = (value << 4) + as_hex(self.buffer[i]);
value = (value << 4) + as_hex(c);
}
let Some(ch) = char::from_u32(value) else {
@ -2223,12 +2168,12 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut whitespaces = String::with_capacity(32);
loop {
self.lookahead(4);
self.input.lookahead(4);
if self.next_is_document_indicator() || self.ch() == '#' {
break;
}
if self.flow_level > 0 && self.ch() == '-' && is_flow(self.buffer[1]) {
if self.flow_level > 0 && self.ch() == '-' && is_flow(self.input.peek_nth(1)) {
return Err(ScanError::new_str(
self.mark,
"plain scalar cannot start with '-' followed by ,[]{}",
@ -2260,7 +2205,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// We can unroll the first iteration of the loop.
string.push(self.ch());
self.skip_non_blank();
self.lookahead(2);
self.input.lookahead(2);
// Add content non-blank characters to the scalar.
while !is_blank_or_breakz(self.ch()) {
@ -2270,7 +2215,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.push(self.ch());
self.skip_non_blank();
self.lookahead(2);
self.input.lookahead(2);
}
}
@ -2283,7 +2228,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
}
// Process blank characters.
while is_blank(self.look_ch()) || is_break(self.ch()) {
while is_blank(self.input.look_ch()) || is_break(self.ch()) {
if is_blank(self.ch()) {
if !self.leading_whitespace {
whitespaces.push(self.ch());
@ -2302,7 +2247,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip_blank();
}
} else {
self.lookahead(2);
self.input.lookahead(2);
// Check if it is a first line break
if self.leading_whitespace {
self.read_break(&mut trailing_breaks);
@ -2379,7 +2324,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// [`self.flow_level`]: Self::flow_level
/// [`fetch_value`]: Self::fetch_value
fn fetch_flow_value(&mut self) -> ScanResult {
let nc = self.buffer[1];
let nc = self.input.peek_nth(1);
// If we encounter a ':' inside a flow collection and it is not immediately
// followed by a blank or breakz:
@ -2413,7 +2358,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
// Skip over ':'.
self.skip_non_blank();
if self.look_ch() == '\t'
if self.input.look_ch() == '\t'
&& !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
&& (self.ch() == '-' || is_alpha(self.ch()))
{
@ -2600,8 +2545,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
fn next_can_be_plain_scalar(&self) -> bool {
match self.ch() {
// indicators can end a plain scalar, see 7.3.3. Plain Style
':' if is_blank_or_breakz(self.buffer[1])
|| (self.flow_level > 0 && is_flow(self.buffer[1])) =>
':' if is_blank_or_breakz(self.input.peek_nth(1))
|| (self.flow_level > 0 && is_flow(self.input.peek_nth(1))) =>
{
false
}

View file

@ -231,7 +231,7 @@ a: |-
#[test]
fn test_bad_docstart() {
assert!(run_parser("---This used to cause an infinite loop").is_ok());
run_parser("---This used to cause an infinite loop").unwrap();
assert_eq!(
run_parser("----").unwrap(),
[

View file

@ -2,11 +2,7 @@ use std::env;
use std::fs::File;
use std::io::prelude::*;
use saphyr_parser::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
#[derive(Debug)]
struct EventSink {

View file

@ -1,12 +1,9 @@
use saphyr_parser::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
use std::env;
use std::fs::File;
use std::io::prelude::*;
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
/// A sink which discards any event sent.
struct NullSink {}