Add Input interface.

Hiding character fetching behind this interface allows us to create more specific implementations when appropriate. For instance, an instance of `Input` can be created for a `&str`, allowing for borrowing and more efficient peeking and traversing than if we were to fetch characters one at a time and place them into a temporary buffer.
2024-04-18 17:48:49 +02:00 · 2024-04-18 17:48:49 +02:00 · d9bb7a1693
commit d9bb7a1693
parent 11cffc6df8
8 changed files with 384 additions and 229 deletions
--- a/parser/src/buffered_input.rs
+++ b/parser/src/buffered_input.rs
@ -0,0 +1,99 @@
 use crate::input::Input;
 use arraydeque::ArrayDeque;
 /// The size of the [`BufferedInput`] buffer.
 ///
 /// The buffer is statically allocated so that no reallocation can occur when a character is
 /// consumed or pushed. As of now, almost all lookaheads are 4 characters maximum, except:
 ///   - Escape sequences parsing: some escape codes are 8 characters
 ///   - Scanning indent in scalars: this looks ahead `indent + 2` characters
 ///
 /// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done
 /// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher
 /// than that, the code will fall back to a loop of lookaheads.
 const BUFFER_LEN: usize = 16;
 /// A wrapper around an [`Iterator`] of [`char`]s with a buffer.
 ///
 /// The YAML scanner often needs some lookahead. With fully allocated buffers such as `String` or
 /// `&str`, this is not an issue. However, with streams, we need to have a way of peeking multiple
 /// characters at a time and sometimes pushing some back into the stream.
 /// There is no "easy" way of doing this without itertools. In order to avoid pulling the entirety
 /// of itertools for one method, we use this structure.
 ///
 /// `Debug` is derived so that types which derive `Debug` while owning their input (such as the
 /// scanner and the parser) remain printable when they are fed a [`BufferedInput`].
 #[derive(Debug)]
 pub struct BufferedInput<T: Iterator<Item = char>> {
    /// The iterator source.
    input: T,
    /// Buffer for the next characters to consume.
    buffer: ArrayDeque<char, BUFFER_LEN>,
 }
 impl<T: Iterator<Item = char>> BufferedInput<T> {
    /// Wrap `input` in a [`BufferedInput`] whose lookahead buffer starts out empty.
    pub fn new(input: T) -> Self {
        let buffer = ArrayDeque::default();
        Self { input, buffer }
    }
 }
 impl<T: Iterator<Item = char>> Input for BufferedInput<T> {
    /// Make sure at least `count` characters are available in the buffer.
    ///
    /// Missing characters are pulled from the underlying iterator. Once the iterator is
    /// exhausted, the buffer is padded with `\0`.
    ///
    /// # Panics
    /// Panics if the buffer cannot hold `count` characters.
    #[inline]
    fn lookahead(&mut self, count: usize) {
        let missing = count.saturating_sub(self.buffer.len());
        for _ in 0..missing {
            let c = self.input.next().unwrap_or('\0');
            self.buffer
                .push_back(c)
                .expect("lookahead must not exceed the buffer capacity");
        }
    }
    /// Return the number of characters currently held in the buffer.
    #[inline]
    fn buflen(&self) -> usize {
        self.buffer.len()
    }
    /// Return the capacity of the buffer, which is `BUFFER_LEN`.
    #[inline]
    fn bufmaxlen(&self) -> usize {
        BUFFER_LEN
    }
    /// Read the next character straight from the underlying iterator, bypassing the buffer.
    ///
    /// Returns `\0` once the iterator is exhausted.
    #[inline]
    fn raw_read_ch(&mut self) -> char {
        self.input.next().unwrap_or('\0')
    }
    /// Append `c` at the back of the buffer.
    ///
    /// # Panics
    /// Panics if the buffer is already full.
    #[inline]
    fn push_back(&mut self, c: char) {
        self.buffer
            .push_back(c)
            .expect("cannot push a character back into a full buffer");
    }
    /// Drop the first buffered character, if any.
    #[inline]
    fn skip(&mut self) {
        self.buffer.pop_front();
    }
    /// Drop the first `count` buffered characters.
    #[inline]
    fn skip_n(&mut self, count: usize) {
        self.buffer.drain(0..count);
    }
    /// Return the first buffered character without consuming it.
    ///
    /// # Panics
    /// Panics if the buffer is empty.
    #[inline]
    fn peek(&self) -> char {
        self.buffer[0]
    }
    /// Return the `n`-th buffered character (0-based) without consuming it.
    ///
    /// # Panics
    /// Panics if fewer than `n + 1` characters are buffered.
    #[inline]
    fn peek_nth(&self, n: usize) -> char {
        self.buffer[n]
    }
    /// Return whether the buffer starts with the characters of `pat`.
    ///
    /// # Panics
    /// Panics if fewer characters are buffered than `pat` contains.
    #[inline]
    fn next_is(&self, pat: &str) -> bool {
        // Count characters, not bytes: `str::len` is a byte length and would make this check
        // spuriously fail for non-ASCII patterns.
        assert!(self.buffer.len() >= pat.chars().count());
        self.buffer
            .iter()
            .zip(pat.chars())
            .all(|(&have, want)| have == want)
    }
 }
--- a/parser/src/input.rs
+++ b/parser/src/input.rs
@ -0,0 +1,111 @@
 /// Abstraction over a source of characters feeding the scanner.
 ///
 /// Putting the input behind a trait mainly makes it possible:
 ///  * To optimize per kind of input (for instance, relying on `str` methods rather than moving
 ///    one `char` at a time into a buffer).
 ///  * To hand out `&str`s that borrow from the input string, sparing potentially costly
 ///    allocations. Users who need an owned version of the data can always `.to_owned()` their
 ///    YAML object.
 pub trait Input {
    /// Tell the input source that the next `count` characters are about to be read.
    ///
    /// When the input runs out, the missing trailing characters may be padded with `\0`, which
    /// is then what gets returned for them. Nothing may be consumed by this call, though the
    /// characters may be stored in an internal buffer.
    ///
    /// Sources that gain nothing from buffering may implement this as a no-op.
    ///
    /// Implementers of [`Input`] must _not_ buffer more than `count` characters: the parser
    /// keeps track of how many characters are buffered and behaves accordingly.
    fn lookahead(&mut self, count: usize);
    /// Return how many characters `self` currently has buffered.
    #[must_use]
    fn buflen(&self) -> usize;
    /// Return how many characters the buffer in `self` can hold.
    #[must_use]
    fn bufmaxlen(&self) -> usize;
    /// Return whether the buffer (as opposed to the stream) holds no character.
    #[inline]
    #[must_use]
    fn buf_is_empty(&self) -> bool {
        matches!(self.buflen(), 0)
    }
    /// Pull one character from the input stream and return it immediately.
    ///
    /// The internal buffer (if any) is bypassed.
    #[must_use]
    fn raw_read_ch(&mut self) -> char;
    /// Return a character to the buffer.
    ///
    /// This is only called after reading one character too many, and the character given back
    /// is exactly the last one that was read. It is never called several times in a row.
    fn push_back(&mut self, c: char);
    /// Consume the next character.
    fn skip(&mut self);
    /// Consume the next `count` characters.
    fn skip_n(&mut self, count: usize);
    /// Return the next character and leave it in place.
    ///
    /// Users of the [`Input`] are responsible for having loaded the character through an earlier
    /// call to [`Input::lookahead`]. Implementors of [`Input`] may rely on such a valid call to
    /// [`Input::lookahead`] having been made.
    ///
    /// # Return
    /// The next character to be fed into the scanner while the input source is not exhausted,
    /// `\0` otherwise.
    #[must_use]
    fn peek(&self) -> char;
    /// Return the `n`-th buffered character and leave it in place.
    ///
    /// The `n`-th character of the input is assumed to have been fetched already through
    /// [`Input::lookahead`].
    #[must_use]
    fn peek_nth(&self, n: usize) -> char;
    /// Load the next character and return it.
    ///
    /// The character stays in the input.
    /// Same as calling [`Input::lookahead`] followed by [`Input::peek`].
    #[inline]
    #[must_use]
    fn look_ch(&mut self) -> char {
        Self::lookahead(self, 1);
        Self::peek(self)
    }
    /// Return whether `c` is the next character in the input source.
    ///
    /// The next character of the input is assumed to have been fetched already through
    /// [`Input::lookahead`].
    #[inline]
    #[must_use]
    fn next_char_is(&self, c: char) -> bool {
        c == self.peek()
    }
    /// Return whether `c` is the `n`-th character in the input source.
    ///
    /// The `n`-th character of the input is assumed to have been fetched already through
    /// [`Input::lookahead`].
    #[inline]
    #[must_use]
    fn nth_char_is(&self, n: usize, c: char) -> bool {
        c == self.peek_nth(n)
    }
    /// Return whether the input source continues with the given pattern.
    ///
    /// The next `pat.len()` characters of the input are assumed to have been fetched already
    /// through [`Input::lookahead`].
    #[must_use]
    fn next_is(&self, pat: &str) -> bool;
 }
--- a/parser/src/lib.rs
+++ b/parser/src/lib.rs
@ -32,11 +32,14 @@
 #![warn(missing_docs, clippy::pedantic)]
-pub(crate) mod char_traits;
+mod buffered_input;
 mod char_traits;
 #[macro_use]
-pub(crate) mod debug;
+mod debug;
-pub mod parser;
+mod input;
-pub mod scanner;
+mod parser;
 mod scanner;
 pub use crate::buffered_input::BufferedInput;
 pub use crate::parser::{Event, EventReceiver, MarkedEventReceiver, Parser, Tag};
 pub use crate::scanner::{Marker, ScanError, TScalarStyle};
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@ -4,7 +4,11 @@
 //! compliance, and emits a stream of YAML events. This stream can for instance be used to create
 //! YAML objects.
-use crate::scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType};
+use crate::{
    input::Input,
    scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
    BufferedInput,
 };
 use std::collections::HashMap;
 #[derive(Clone, Copy, PartialEq, Debug, Eq)]
@ -100,7 +104,7 @@ impl Event {
 /// A YAML parser.
 #[derive(Debug)]
-pub struct Parser<T> {
+pub struct Parser<T: Input> {
    /// The underlying scanner from which we pull tokens.
    scanner: Scanner<T>,
    /// The stack of _previous_ states we were in.
@ -225,15 +229,15 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
 /// A convenience alias for a `Result` of a parser event.
 pub type ParseResult = Result<(Event, Marker), ScanError>;
-impl<'a> Parser<core::str::Chars<'a>> {
+impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
    /// Create a new instance of a parser from a &str.
    #[must_use]
    pub fn new_from_str(value: &'a str) -> Self {
-        Parser::new(value.chars())
+        Parser::new(BufferedInput::new(value.chars()))
    }
 }
-impl<T: Iterator<Item = char>> Parser<T> {
+impl<T: Input> Parser<T> {
    /// Create a new instance of a parser from the given input of characters.
    pub fn new(src: T) -> Parser<T> {
        Parser {
@ -1130,7 +1134,7 @@ impl<T: Iterator<Item = char>> Parser<T> {
    }
 }
-impl<T: Iterator<Item = char>> Iterator for Parser<T> {
+impl<T: Input> Iterator for Parser<T> {
    type Item = Result<(Event, Marker), ScanError>;
    fn next(&mut self) -> Option<Self::Item> {
--- a/parser/src/scanner.rs
+++ b/parser/src/scanner.rs
@ -11,11 +11,12 @@
 use std::{char, collections::VecDeque, error::Error, fmt};
-use arraydeque::ArrayDeque;
+use crate::{
-
+    char_traits::{
-use crate::char_traits::{
+        as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
-    as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit,
+        is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
-    is_flow, is_hex, is_tag_char, is_uri_char, is_z,
+    },
    input::Input,
 };
 /// The encoding of the input. Currently, only UTF-8 is supported.
@ -343,18 +344,6 @@ enum ImplicitMappingState {
    Inside,
 }
 /// The size of the [`Scanner`] buffer.
 ///
 /// The buffer is statically allocated to avoid conditions for reallocations each time we
 /// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except:
 ///   - Escape sequences parsing: some escape codes are 8 characters
 ///   - Scanning indent in scalars: this looks ahead `indent + 2` characters
 ///
 /// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done
 /// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher
 /// than that, the code will fall back to a loop of lookaheads.
 const BUFFER_LEN: usize = 16;
 /// The YAML scanner.
 ///
 /// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
@ -367,8 +356,10 @@ const BUFFER_LEN: usize = 16;
 #[derive(Debug)]
 #[allow(clippy::struct_excessive_bools)]
 pub struct Scanner<T> {
-    /// The reader, providing with characters.
+    /// The input source.
-    rdr: T,
+    ///
    /// This must implement [`Input`].
    input: T,
    /// The position of the cursor within the reader.
    mark: Marker,
    /// Buffer for tokens to be returned.
@ -378,8 +369,6 @@ pub struct Scanner<T> {
    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
    /// [`Self::next`] until we have more context.
    tokens: VecDeque<Token>,
    /// Buffer for the next characters to consume.
    buffer: ArrayDeque<char, BUFFER_LEN>,
    /// The last error that happened.
    error: Option<ScanError>,
@ -435,7 +424,7 @@ pub struct Scanner<T> {
    implicit_flow_mapping_states: Vec<ImplicitMappingState>,
 }
-impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
+impl<T: Input> Iterator for Scanner<T> {
    type Item = Token;
    fn next(&mut self) -> Option<Token> {
        if self.error.is_some() {
@ -462,12 +451,11 @@ impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
 /// A convenience alias for scanner functions that may fail without returning a value.
 pub type ScanResult = Result<(), ScanError>;
-impl<T: Iterator<Item = char>> Scanner<T> {
+impl<T: Input> Scanner<T> {
    /// Creates the YAML tokenizer.
-    pub fn new(rdr: T) -> Scanner<T> {
+    pub fn new(input: T) -> Scanner<T> {
        Scanner {
-            rdr,
+            input,
            buffer: ArrayDeque::new(),
            mark: Marker::new(0, 1, 0),
            tokens: VecDeque::new(),
            error: None,
@ -497,25 +485,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        self.error.clone()
    }
    /// Fill `self.buffer` with at least `count` characters.
    ///
    /// The characters that are extracted this way are not consumed but only placed in the buffer.
    #[inline]
    fn lookahead(&mut self, count: usize) {
        if self.buffer.len() >= count {
            return;
        }
        for _ in 0..(count - self.buffer.len()) {
            self.buffer
                .push_back(self.rdr.next().unwrap_or('\0'))
                .unwrap();
        }
    }
    /// Consume the next character. It is assumed the next character is a blank.
    #[inline]
    fn skip_blank(&mut self) {
-        self.buffer.pop_front();
+        self.input.skip();
        self.mark.index += 1;
        self.mark.col += 1;
@ -524,7 +497,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// Consume the next character. It is assumed the next character is not a blank.
    #[inline]
    fn skip_non_blank(&mut self) {
-        self.buffer.pop_front();
+        self.input.skip();
        self.mark.index += 1;
        self.mark.col += 1;
@ -533,18 +506,18 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// Consume the next characters. It is assumed none of the next characters are blanks.
    #[inline]
-    fn skip_n_non_blank(&mut self, n: usize) {
+    fn skip_n_non_blank(&mut self, count: usize) {
-        self.buffer.drain(0..n);
+        self.input.skip_n(count);
-        self.mark.index += n;
+        self.mark.index += count;
-        self.mark.col += n;
+        self.mark.col += count;
        self.leading_whitespace = false;
    }
    /// Consume the next character. It is assumed the next character is a newline.
    #[inline]
    fn skip_nl(&mut self) {
-        self.buffer.pop_front();
+        self.input.skip();
        self.mark.index += 1;
        self.mark.col = 0;
@ -555,12 +528,12 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
    #[inline]
    fn skip_linebreak(&mut self) {
-        if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
+        if self.input.next_is("\r\n") {
            // While technically not a blank, this does not matter as `self.leading_whitespace`
            // will be reset by `skip_nl`.
            self.skip_blank();
            self.skip_nl();
-        } else if is_break(self.buffer[0]) {
+        } else if is_break(self.input.peek()) {
            self.skip_nl();
        }
    }
@ -570,32 +543,16 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// The character is not consumed.
    #[inline]
    fn ch(&self) -> char {
-        self.buffer[0]
+        self.input.peek()
    }
    /// Look for the next character and return it.
    ///
    /// The character is not consumed.
    /// Equivalent to calling [`Self::lookahead`] and [`Self::ch`].
    #[inline]
    fn look_ch(&mut self) -> char {
        self.lookahead(1);
        self.ch()
    }
    /// Read a character from the input stream, returning it directly.
    ///
-    /// The buffer is bypassed and `self.mark` needs to be updated manually.
+    /// The buffer (if any) is bypassed and `self.mark` needs to be updated manually.
    #[inline]
    #[must_use]
    fn raw_read_ch(&mut self) -> char {
-        self.rdr.next().unwrap_or('\0')
+        self.input.raw_read_ch()
    }
    /// Return whether the next character is `c`.
    #[inline]
    fn ch_is(&self, c: char) -> bool {
        self.buffer[0] == c
    }
    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
@ -624,8 +581,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    // If the next characters do not correspond to a line break.
    #[inline]
    fn read_break(&mut self, s: &mut String) {
-        let c = self.buffer[0];
+        let c = self.input.peek();
-        let nc = self.buffer[1];
+        let nc = self.input.peek_nth(1);
        debug_assert!(is_break(c));
        if c == '\r' && nc == '\n' {
            self.skip_blank();
@ -635,15 +592,20 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        s.push('\n');
    }
    /// Check whether the next characters correspond to a start of document.
    ///
    /// [`Self::lookahead`] must have been called before calling this function.
    fn next_is_document_start(&self) -> bool {
        assert!(self.input.buflen() >= 4);
        self.input.next_is("---") && is_blank_or_breakz(self.input.peek_nth(3))
    }
    /// Check whether the next characters correspond to an end of document.
    ///
    /// [`Self::lookahead`] must have been called before calling this function.
    fn next_is_document_end(&self) -> bool {
-        assert!(self.buffer.len() >= 4);
+        assert!(self.input.buflen() >= 4);
-        self.buffer[0] == '.'
+        self.input.next_is("...") && is_blank_or_breakz(self.input.peek_nth(3))
            && self.buffer[1] == '.'
            && self.buffer[2] == '.'
            && is_blank_or_breakz(self.buffer[3])
    }
    /// Check whether the next characters correspond to a document indicator.
@ -651,11 +613,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// [`Self::lookahead`] must have been called before calling this function.
    #[inline]
    fn next_is_document_indicator(&self) -> bool {
-        assert!(self.buffer.len() >= 4);
+        assert!(self.input.buflen() >= 4);
-        self.mark.col == 0
+        is_blank_or_breakz(self.input.peek_nth(3))
-            && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
+            && (self.input.next_is("...") || self.input.next_is("---"))
                || ((self.buffer[0] == '.') && (self.buffer[1] == '.') && (self.buffer[2] == '.')))
            && is_blank_or_breakz(self.buffer[3])
    }
    /// Insert a token at the given position.
@ -674,11 +634,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    }
    /// Fetch the next token in the stream.
    ///
    /// # Errors
    /// Returns `ScanError` when the scanner does not find the next expected token.
    pub fn fetch_next_token(&mut self) -> ScanResult {
-        self.lookahead(1);
+        self.input.lookahead(1);
        // eprintln!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
        if !self.stream_start_produced {
            self.fetch_stream_start();
@ -697,51 +657,37 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mark = self.mark;
        self.unroll_indent(mark.col as isize);
-        self.lookahead(4);
+        self.input.lookahead(4);
        if is_z(self.ch()) {
            self.fetch_stream_end()?;
            return Ok(());
        }
-        // Is it a directive?
+        if self.mark.col == 0 {
-        if self.mark.col == 0 && self.ch_is('%') {
+            if self.input.next_char_is('%') {
-            return self.fetch_directive();
+                return self.fetch_directive();
-        }
+            } else if self.next_is_document_start() {
-
+                return self.fetch_document_indicator(TokenType::DocumentStart);
-        if self.mark.col == 0
+            } else if self.next_is_document_end() {
-            && self.buffer[0] == '-'
+                self.fetch_document_indicator(TokenType::DocumentEnd)?;
-            && self.buffer[1] == '-'
+                self.skip_ws_to_eol(SkipTabs::Yes)?;
-            && self.buffer[2] == '-'
+                if !is_breakz(self.ch()) {
-            && is_blank_or_breakz(self.buffer[3])
+                    return Err(ScanError::new_str(
-        {
+                        self.mark,
-            self.fetch_document_indicator(TokenType::DocumentStart)?;
+                        "invalid content after document end marker",
-            return Ok(());
+                    ));
-        }
+                }
-
+                return Ok(());
        if self.mark.col == 0
            && self.buffer[0] == '.'
            && self.buffer[1] == '.'
            && self.buffer[2] == '.'
            && is_blank_or_breakz(self.buffer[3])
        {
            self.fetch_document_indicator(TokenType::DocumentEnd)?;
            self.skip_ws_to_eol(SkipTabs::Yes)?;
            if !is_breakz(self.ch()) {
                return Err(ScanError::new_str(
                    self.mark,
                    "invalid content after document end marker",
                ));
            }
            return Ok(());
        }
        if (self.mark.col as isize) < self.indent {
            return Err(ScanError::new_str(self.mark, "invalid indentation"));
        }
-        let c = self.buffer[0];
+        let c = self.input.peek();
-        let nc = self.buffer[1];
+        let nc = self.input.peek_nth(1);
        match c {
            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
@ -860,7 +806,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        Ok(())
    }
-    /// Skip over all whitespace and comments until the next token.
+    /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token.
    ///
    /// # Errors
    /// This function returns an error if a tabulation is encountered where there should not be
@ -868,7 +814,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn skip_to_next_token(&mut self) -> ScanResult {
        loop {
            // TODO(chenyh) BOM
-            match self.look_ch() {
+            match self.input.look_ch() {
                // Tabs may not be used as indentation.
                // "Indentation" only exists as long as a block is started, but does not exist
                // inside of flow-style constructs. Tabs are allowed as part of leading
@ -890,14 +836,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                }
                '\t' | ' ' => self.skip_blank(),
                '\n' | '\r' => {
-                    self.lookahead(2);
+                    self.input.lookahead(2);
                    self.skip_linebreak();
                    if self.flow_level == 0 {
                        self.allow_simple_key();
                    }
                }
                '#' => {
-                    while !is_breakz(self.look_ch()) {
+                    while !is_breakz(self.input.look_ch()) {
                        self.skip_non_blank();
                    }
                }
@ -914,14 +860,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn skip_yaml_whitespace(&mut self) -> ScanResult {
        let mut need_whitespace = true;
        loop {
-            match self.look_ch() {
+            match self.input.look_ch() {
                ' ' => {
                    self.skip_blank();
                    need_whitespace = false;
                }
                '\n' | '\r' => {
-                    self.lookahead(2);
+                    self.input.lookahead(2);
                    self.skip_linebreak();
                    if self.flow_level == 0 {
                        self.allow_simple_key();
@ -929,7 +875,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                    need_whitespace = false;
                }
                '#' => {
-                    while !is_breakz(self.look_ch()) {
+                    while !is_breakz(self.input.look_ch()) {
                        self.skip_non_blank();
                    }
                }
@ -949,7 +895,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mut encountered_tab = false;
        let mut has_yaml_ws = false;
        loop {
-            match self.look_ch() {
+            match self.input.look_ch() {
                ' ' => {
                    has_yaml_ws = true;
                    self.skip_blank();
@ -966,7 +912,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                    ));
                }
                '#' => {
-                    while !is_breakz(self.look_ch()) {
+                    while !is_breakz(self.input.look_ch()) {
                        self.skip_non_blank();
                    }
                }
@ -1035,7 +981,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            // XXX This should be a warning instead of an error
            _ => {
                // skip current line
-                while !is_breakz(self.look_ch()) {
+                while !is_breakz(self.input.look_ch()) {
                    self.skip_non_blank();
                }
                // XXX return an empty TagDirective token
@ -1051,7 +997,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        self.skip_ws_to_eol(SkipTabs::Yes)?;
        if is_breakz(self.ch()) {
-            self.lookahead(2);
+            self.input.lookahead(2);
            self.skip_linebreak();
            Ok(tok)
        } else {
@ -1063,7 +1009,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    }
    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
-        while is_blank(self.look_ch()) {
+        while is_blank(self.input.look_ch()) {
            self.skip_blank();
        }
@ -1085,7 +1031,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
        let start_mark = self.mark;
        let mut string = String::new();
-        while is_alpha(self.look_ch()) {
+        while is_alpha(self.input.look_ch()) {
            string.push(self.ch());
            self.skip_non_blank();
        }
@ -1110,7 +1056,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
        let mut val = 0u32;
        let mut length = 0usize;
-        while let Some(digit) = self.look_ch().to_digit(10) {
+        while let Some(digit) = self.input.look_ch().to_digit(10) {
            if length + 1 > 9 {
                return Err(ScanError::new_str(
                    *mark,
@ -1134,19 +1080,19 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
        /* Eat whitespaces. */
-        while is_blank(self.look_ch()) {
+        while is_blank(self.input.look_ch()) {
            self.skip_blank();
        }
        let handle = self.scan_tag_handle(true, mark)?;
        /* Eat whitespaces. */
-        while is_blank(self.look_ch()) {
+        while is_blank(self.input.look_ch()) {
            self.skip_blank();
        }
        let prefix = self.scan_tag_prefix(mark)?;
-        self.lookahead(1);
+        self.input.lookahead(1);
        if is_blank_or_breakz(self.ch()) {
            Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
@ -1173,9 +1119,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mut suffix;
        // Check if the tag is in the canonical form (verbatim).
-        self.lookahead(2);
+        self.input.lookahead(2);
-        if self.buffer[1] == '<' {
+        if self.input.nth_char_is(1, '<') {
            suffix = self.scan_verbatim_tag(&start_mark)?;
        } else {
            // The tag has either the '!suffix' or the '!handle!suffix'
@ -1198,7 +1144,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            }
        }
-        if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
+        if is_blank_or_breakz(self.input.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
            // XXX: ex 7.2, an empty scalar can follow a secondary tag
            Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
        } else {
@ -1211,7 +1157,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
        let mut string = String::new();
-        if self.look_ch() != '!' {
+        if self.input.look_ch() != '!' {
            return Err(ScanError::new_str(
                *mark,
                "while scanning a tag, did not find expected '!'",
@ -1221,7 +1167,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        string.push(self.ch());
        self.skip_non_blank();
-        while is_alpha(self.look_ch()) {
+        while is_alpha(self.input.look_ch()) {
            string.push(self.ch());
            self.skip_non_blank();
        }
@ -1250,7 +1196,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
        let mut string = String::new();
-        if self.look_ch() == '!' {
+        if self.input.look_ch() == '!' {
            // If we have a local tag, insert and skip `!`.
            string.push(self.ch());
            self.skip_non_blank();
@ -1269,7 +1215,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            self.skip_non_blank();
        }
-        while is_uri_char(self.look_ch()) {
+        while is_uri_char(self.input.look_ch()) {
            if self.ch() == '%' {
                string.push(self.scan_uri_escapes(start_mark)?);
            } else {
@ -1290,7 +1236,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        self.skip_non_blank();
        let mut string = String::new();
-        while is_uri_char(self.look_ch()) {
+        while is_uri_char(self.input.look_ch()) {
            if self.ch() == '%' {
                string.push(self.scan_uri_escapes(start_mark)?);
            } else {
@ -1326,7 +1272,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            string.extend(head.chars().skip(1));
        }
-        while is_tag_char(self.look_ch()) {
+        while is_tag_char(self.input.look_ch()) {
            // Check if it is a URI-escape sequence.
            if self.ch() == '%' {
                string.push(self.scan_uri_escapes(mark)?);
@ -1352,38 +1298,41 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mut width = 0usize;
        let mut code = 0u32;
        loop {
-            self.lookahead(3);
+            self.input.lookahead(3);
-            if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) {
+            let c = self.input.peek_nth(1);
+            let nc = self.input.peek_nth(2);
+            if !(self.ch() == '%' && is_hex(c) && is_hex(nc)) {
                return Err(ScanError::new_str(
                    *mark,
-                    "while parsing a tag, did not find URI escaped octet",
+                    "while parsing a tag, found an invalid escape sequence",
                ));
            }
-            let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]);
+            let byte = (as_hex(c) << 4) + as_hex(nc);
            if width == 0 {
-                width = match octet {
+                width = match byte {
-                    _ if octet & 0x80 == 0x00 => 1,
+                    _ if byte & 0x80 == 0x00 => 1,
-                    _ if octet & 0xE0 == 0xC0 => 2,
+                    _ if byte & 0xE0 == 0xC0 => 2,
-                    _ if octet & 0xF0 == 0xE0 => 3,
+                    _ if byte & 0xF0 == 0xE0 => 3,
-                    _ if octet & 0xF8 == 0xF0 => 4,
+                    _ if byte & 0xF8 == 0xF0 => 4,
                    _ => {
                        return Err(ScanError::new_str(
                            *mark,
-                            "while parsing a tag, found an incorrect leading UTF-8 octet",
+                            "while parsing a tag, found an incorrect leading UTF-8 byte",
                        ));
                    }
                };
-                code = octet;
+                code = byte;
            } else {
-                if octet & 0xc0 != 0x80 {
+                if byte & 0xc0 != 0x80 {
                    return Err(ScanError::new_str(
                        *mark,
-                        "while parsing a tag, found an incorrect trailing UTF-8 octet",
+                        "while parsing a tag, found an incorrect trailing UTF-8 byte",
                    ));
                }
-                code = (code << 8) + octet;
+                code = (code << 8) + byte;
            }
            self.skip_n_non_blank(3);
@ -1419,7 +1368,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let start_mark = self.mark;
        self.skip_non_blank();
-        while is_anchor_char(self.look_ch()) {
+        while is_anchor_char(self.input.look_ch()) {
            string.push(self.ch());
            self.skip_non_blank();
        }
@ -1556,8 +1505,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        // generate BLOCK-SEQUENCE-START if indented
        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
-        self.lookahead(2);
+        self.input.lookahead(2);
-        if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) {
+        if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1))
+        {
            return Err(ScanError::new_str(
                self.mark,
                "'-' must be followed by a valid YAML whitespace",
@ -1565,7 +1515,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        }
        self.skip_ws_to_eol(SkipTabs::No)?;
-        if is_break(self.look_ch()) || is_flow(self.ch()) {
+        if is_break(self.input.look_ch()) || is_flow(self.ch()) {
            self.roll_one_col_indent();
        }
@ -1623,14 +1573,14 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        self.skip_non_blank();
        self.unroll_non_block_indents();
-        if self.look_ch() == '+' || self.ch() == '-' {
+        if self.input.look_ch() == '+' || self.ch() == '-' {
            if self.ch() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
-            if is_digit(self.look_ch()) {
+            if is_digit(self.input.look_ch()) {
                if self.ch() == '0' {
                    return Err(ScanError::new_str(
                        start_mark,
@ -1650,7 +1600,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            increment = (self.ch() as usize) - ('0' as usize);
            self.skip_non_blank();
-            self.lookahead(1);
+            self.input.lookahead(1);
            if self.ch() == '+' || self.ch() == '-' {
                if self.ch() == '+' {
                    chomping = Chomping::Keep;
@ -1664,7 +1614,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        self.skip_ws_to_eol(SkipTabs::Yes)?;
        // Check if we are at the end of the line.
-        if !is_breakz(self.look_ch()) {
+        if !is_breakz(self.input.look_ch()) {
            return Err(ScanError::new_str(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
@ -1672,11 +1622,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        }
        if is_break(self.ch()) {
-            self.lookahead(2);
+            self.input.lookahead(2);
            self.read_break(&mut chomping_break);
        }
-        if self.look_ch() == '\t' {
+        if self.input.look_ch() == '\t' {
            return Err(ScanError::new_str(
                start_mark,
                "a block scalar content cannot start with a tab",
@ -1731,7 +1681,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let start_mark = self.mark;
        while self.mark.col == indent && !is_z(self.ch()) {
            if indent == 0 {
-                self.lookahead(4);
+                self.input.lookahead(4);
                if self.next_is_document_end() {
                    break;
                }
@ -1761,7 +1711,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                break;
            }
-            self.lookahead(2);
+            self.input.lookahead(2);
            self.read_break(&mut leading_break);
            // Eat the following indentation spaces and line breaks.
@ -1797,7 +1747,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// line. This function does not consume the line break character(s) after the line.
    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
        // Start by evaluating characters in the buffer.
-        while !self.buffer.is_empty() && !is_breakz(self.ch()) {
+        while !self.input.buf_is_empty() && !is_breakz(self.ch()) {
            string.push(self.ch());
            // We may technically skip non-blank characters. However, the only distinction is
            // to determine what is leading whitespace and what is not. Here, we read the
@ -1809,7 +1759,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        // All characters that were in the buffer were consumed. We need to check if more
        // follow.
-        if self.buffer.is_empty() {
+        if self.input.buf_is_empty() {
            // We will read all consecutive non-breakz characters. We push them into a
            // temporary buffer. The main difference with going through `self.buffer` is that
            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
@ -1824,7 +1774,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            // Our last character read is stored in `c`. It is either an EOF or a break. In any
            // case, we need to push it back into `self.buffer` so it may be properly read
            // after. We must not insert it in `string`.
-            self.buffer.push_back(c).unwrap();
+            self.input.push_back(c);
            // We need to manually update our position; we haven't called a `skip` function.
            self.mark.col += line_buffer.len();
@ -1842,25 +1792,25 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
        loop {
            // Consume all spaces. Tabs cannot be used as indentation.
-            if indent < BUFFER_LEN - 2 {
+            if indent < self.input.bufmaxlen() - 2 {
-                self.lookahead(BUFFER_LEN);
+                self.input.lookahead(self.input.bufmaxlen());
                while self.mark.col < indent && self.ch() == ' ' {
                    self.skip_blank();
                }
            } else {
                loop {
-                    self.lookahead(BUFFER_LEN);
+                    self.input.lookahead(self.input.bufmaxlen());
-                    while !self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ' {
+                    while !self.input.buf_is_empty() && self.mark.col < indent && self.ch() == ' ' {
                        self.skip_blank();
                    }
                    // If we reached our indent, we can break. We must also break if we have
                    // reached content or EOF; that is, the buffer is not empty and the next
                    // character is not a space.
-                    if self.mark.col == indent || (!self.buffer.is_empty() && self.ch() != ' ') {
+                    if self.mark.col == indent || (!self.input.buf_is_empty() && self.ch() != ' ') {
                        break;
                    }
                }
-                self.lookahead(2);
+                self.input.lookahead(2);
            }
            // If our current line is empty, skip over the break and continue looping.
@ -1881,7 +1831,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mut max_indent = 0;
        loop {
            // Consume all spaces. Tabs cannot be used as indentation.
-            while self.look_ch() == ' ' {
+            while self.input.look_ch() == ' ' {
                self.skip_blank();
            }
@ -1891,7 +1841,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            if is_break(self.ch()) {
                // If our current line is empty, skip over the break and continue looping.
-                self.lookahead(2);
+                self.input.lookahead(2);
                self.read_break(breaks);
            } else {
                // Otherwise, we have a content line. Return control.
@ -1943,15 +1893,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        loop {
            /* Check for a document indicator. */
-            self.lookahead(4);
+            self.input.lookahead(4);
-            if self.mark.col == 0
+            if self.mark.col == 0 && self.next_is_document_indicator() {
-                && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
-                    || ((self.buffer[0] == '.')
-                        && (self.buffer[1] == '.')
-                        && (self.buffer[2] == '.')))
-                && is_blank_or_breakz(self.buffer[3])
-            {
                return Err(ScanError::new_str(
                    start_mark,
                    "while scanning a quoted scalar, found unexpected document indicator",
@ -1980,7 +1924,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                &start_mark,
            )?;
-            match self.look_ch() {
+            match self.input.look_ch() {
                '\'' if single => break,
                '"' if !single => break,
                _ => {}
@ -2003,7 +1947,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                        self.skip_blank();
                    }
                } else {
-                    self.lookahead(2);
+                    self.input.lookahead(2);
                    // Check if it is a first line break.
                    if leading_blanks {
                        self.read_break(&mut trailing_breaks);
@ -2013,7 +1957,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                        leading_blanks = true;
                    }
                }
-                self.lookahead(1);
+                self.input.lookahead(1);
            }
            // Join the whitespaces or fold line breaks.
@ -2083,11 +2027,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        leading_blanks: &mut bool,
        start_mark: &Marker,
    ) -> Result<(), ScanError> {
-        self.lookahead(2);
+        self.input.lookahead(2);
        while !is_blank_or_breakz(self.ch()) {
            match self.ch() {
                // Check for an escaped single quote.
-                '\'' if self.buffer[1] == '\'' && single => {
+                '\'' if self.input.peek_nth(1) == '\'' && single => {
                    string.push('\'');
                    self.skip_n_non_blank(2);
                }
@ -2095,8 +2039,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                '\'' if single => break,
                '"' if !single => break,
                // Check for an escaped line break.
-                '\\' if !single && is_break(self.buffer[1]) => {
+                '\\' if !single && is_break(self.input.peek_nth(1)) => {
-                    self.lookahead(3);
+                    self.input.lookahead(3);
                    self.skip_non_blank();
                    self.skip_linebreak();
                    *leading_blanks = true;
@ -2111,7 +2055,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                    self.skip_non_blank();
                }
            }
-            self.lookahead(2);
+            self.input.lookahead(2);
        }
        Ok(())
    }
@ -2129,7 +2073,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mut code_length = 0usize;
        let mut ret = '\0';
-        match self.buffer[1] {
+        match self.input.peek_nth(1) {
            '0' => ret = '\0',
            'a' => ret = '\x07',
            'b' => ret = '\x08',
@ -2165,16 +2109,17 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        // Consume an arbitrary escape code.
        if code_length > 0 {
-            self.lookahead(code_length);
+            self.input.lookahead(code_length);
            let mut value = 0u32;
            for i in 0..code_length {
-                if !is_hex(self.buffer[i]) {
+                let c = self.input.peek_nth(i);
+                if !is_hex(c) {
                    return Err(ScanError::new_str(
                        *start_mark,
                        "while parsing a quoted scalar, did not find expected hexadecimal number",
                    ));
                }
-                value = (value << 4) + as_hex(self.buffer[i]);
+                value = (value << 4) + as_hex(c);
            }
            let Some(ch) = char::from_u32(value) else {
@ -2223,12 +2168,12 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        let mut whitespaces = String::with_capacity(32);
        loop {
-            self.lookahead(4);
+            self.input.lookahead(4);
            if self.next_is_document_indicator() || self.ch() == '#' {
                break;
            }
-            if self.flow_level > 0 && self.ch() == '-' && is_flow(self.buffer[1]) {
+            if self.flow_level > 0 && self.ch() == '-' && is_flow(self.input.peek_nth(1)) {
                return Err(ScanError::new_str(
                    self.mark,
                    "plain scalar cannot start with '-' followed by ,[]{}",
@ -2260,7 +2205,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                // We can unroll the first iteration of the loop.
                string.push(self.ch());
                self.skip_non_blank();
-                self.lookahead(2);
+                self.input.lookahead(2);
                // Add content non-blank characters to the scalar.
                while !is_blank_or_breakz(self.ch()) {
@ -2270,7 +2215,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                    string.push(self.ch());
                    self.skip_non_blank();
-                    self.lookahead(2);
+                    self.input.lookahead(2);
                }
            }
@ -2283,7 +2228,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
            }
            // Process blank characters.
-            while is_blank(self.look_ch()) || is_break(self.ch()) {
+            while is_blank(self.input.look_ch()) || is_break(self.ch()) {
                if is_blank(self.ch()) {
                    if !self.leading_whitespace {
                        whitespaces.push(self.ch());
@ -2302,7 +2247,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
                        self.skip_blank();
                    }
                } else {
-                    self.lookahead(2);
+                    self.input.lookahead(2);
                    // Check if it is a first line break
                    if self.leading_whitespace {
                        self.read_break(&mut trailing_breaks);
@ -2379,7 +2324,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    /// [`self.flow_level`]: Self::flow_level
    /// [`fetch_value`]: Self::fetch_value
    fn fetch_flow_value(&mut self) -> ScanResult {
-        let nc = self.buffer[1];
+        let nc = self.input.peek_nth(1);
        // If we encounter a ':' inside a flow collection and it is not immediately
        // followed by a blank or breakz:
@ -2413,7 +2358,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
        // Skip over ':'.
        self.skip_non_blank();
-        if self.look_ch() == '\t'
+        if self.input.look_ch() == '\t'
            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
            && (self.ch() == '-' || is_alpha(self.ch()))
        {
@ -2600,8 +2545,8 @@ impl<T: Iterator<Item = char>> Scanner<T> {
    fn next_can_be_plain_scalar(&self) -> bool {
        match self.ch() {
            // indicators can end a plain scalar, see 7.3.3. Plain Style
-            ':' if is_blank_or_breakz(self.buffer[1])
+            ':' if is_blank_or_breakz(self.input.peek_nth(1))
-                || (self.flow_level > 0 && is_flow(self.buffer[1])) =>
+                || (self.flow_level > 0 && is_flow(self.input.peek_nth(1))) =>
            {
                false
            }
--- a/parser/tests/basic.rs
+++ b/parser/tests/basic.rs
@ -231,7 +231,7 @@ a: |-
 #[test]
 fn test_bad_docstart() {
-    assert!(run_parser("---This used to cause an infinite loop").is_ok());
+    run_parser("---This used to cause an infinite loop").unwrap();
    assert_eq!(
        run_parser("----").unwrap(),
        [
--- a/parser/tools/dump_events.rs
+++ b/parser/tools/dump_events.rs
@ -2,11 +2,7 @@ use std::env;
 use std::fs::File;
 use std::io::prelude::*;
-use saphyr_parser::{
+use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
-    parser::{MarkedEventReceiver, Parser},
-    scanner::Marker,
-    Event,
-};
 #[derive(Debug)]
 struct EventSink {
--- a/parser/tools/time_parse.rs
+++ b/parser/tools/time_parse.rs
@ -1,12 +1,9 @@
-use saphyr_parser::{
-    parser::{MarkedEventReceiver, Parser},
-    scanner::Marker,
-    Event,
-};
 use std::env;
 use std::fs::File;
 use std::io::prelude::*;
+use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
 /// A sink which discards any event sent.
 struct NullSink {}