From d9bb7a1693a9b53a36b9b35d4fda171ae311b959 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Thu, 18 Apr 2024 17:48:49 +0200 Subject: [PATCH] Add `Input` interface. Hiding character fetching behind this interface allows us to create more specific implementations when appropriate. For instance, an instance of `Input` can be created for a `&str`, allowing for borrowing and more efficient peeking and traversing than if we were to fetch characters one at a time and place them into a temporary buffer. --- parser/src/buffered_input.rs | 99 ++++++++++ parser/src/input.rs | 111 +++++++++++ parser/src/lib.rs | 11 +- parser/src/parser.rs | 16 +- parser/src/scanner.rs | 361 +++++++++++++++-------------------- parser/tests/basic.rs | 2 +- parser/tools/dump_events.rs | 6 +- parser/tools/time_parse.rs | 7 +- 8 files changed, 384 insertions(+), 229 deletions(-) create mode 100644 parser/src/buffered_input.rs create mode 100644 parser/src/input.rs diff --git a/parser/src/buffered_input.rs b/parser/src/buffered_input.rs new file mode 100644 index 0000000..cdd8b59 --- /dev/null +++ b/parser/src/buffered_input.rs @@ -0,0 +1,99 @@ +use crate::input::Input; + +use arraydeque::ArrayDeque; + +/// The size of the [`BufferedInput`] buffer. +/// +/// The buffer is statically allocated to avoid conditions for reallocations each time we +/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except: +/// - Escape sequences parsing: some escape codes are 8 characters +/// - Scanning indent in scalars: this looks ahead `indent + 2` characters +/// +/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done +/// in a single call if and only if the indent is less than `BUFFER_LEN - 2`. Otherwise, the code +/// will fall back to a loop of lookaheads. +const BUFFER_LEN: usize = 16; + +/// A wrapper around an [`Iterator`] of [`char`]s with a buffer. 
+/// +/// The YAML scanner often needs some lookahead. With fully allocated buffers such as `String` or +/// `&str`, this is not an issue. However, with streams, we need to have a way of peeking multiple +/// characters at a time and sometimes pushing some back into the stream. +/// There is no "easy" way of doing this without itertools. In order to avoid pulling the entirety +/// of itertools for one method, we use this structure. +pub struct BufferedInput> { + /// The iterator source. + input: T, + /// Buffer for the next characters to consume. + buffer: ArrayDeque, +} + +impl> BufferedInput { + /// Create a new [`BufferedInput`] with the given input. + pub fn new(input: T) -> Self { + Self { + input, + buffer: ArrayDeque::default(), + } + } +} + +impl> Input for BufferedInput { + #[inline] + fn lookahead(&mut self, count: usize) { + if self.buffer.len() >= count { + return; + } + for _ in 0..(count - self.buffer.len()) { + self.buffer + .push_back(self.input.next().unwrap_or('\0')) + .unwrap(); + } + } + + #[inline] + fn buflen(&self) -> usize { + self.buffer.len() + } + + #[inline] + fn bufmaxlen(&self) -> usize { + BUFFER_LEN + } + + #[inline] + fn raw_read_ch(&mut self) -> char { + self.input.next().unwrap_or('\0') + } + + #[inline] + fn push_back(&mut self, c: char) { + self.buffer.push_back(c).unwrap(); + } + + #[inline] + fn skip(&mut self) { + self.buffer.pop_front(); + } + + #[inline] + fn skip_n(&mut self, count: usize) { + self.buffer.drain(0..count); + } + + #[inline] + fn peek(&self) -> char { + self.buffer[0] + } + + #[inline] + fn peek_nth(&self, n: usize) -> char { + self.buffer[n] + } + + #[inline] + fn next_is(&self, pat: &str) -> bool { + assert!(self.buffer.len() >= pat.len()); + self.buffer.iter().zip(pat.chars()).all(|(a, b)| *a == b) + } +} diff --git a/parser/src/input.rs b/parser/src/input.rs new file mode 100644 index 0000000..d6b0715 --- /dev/null +++ b/parser/src/input.rs @@ -0,0 +1,111 @@ +/// Interface for a source of characters. 
+/// +/// Hiding the input's implementation behind this trait allows mostly: +/// * For input-specific optimizations (for instance, using `str` methods instead of manually +/// transferring one `char` at a time to a buffer). +/// * To return `&str`s referencing the input string, thus avoiding potentially costly +/// allocations. Should users need an owned version of the data, they can always `.to_owned()` +/// their YAML object. +pub trait Input { + /// A hint to the input source that we will need to read `count` characters. + /// + /// If the input is exhausted, `\0` can be used to pad the last characters and later returned. + /// The characters must not be consumed, but may be placed in an internal buffer. + /// + /// This method may be a no-op if buffering yields no performance improvement. + /// + /// Implementers of [`Input`] must _not_ load more than `count` characters into the buffer. The + /// parser tracks how many characters are loaded in the buffer and acts accordingly. + fn lookahead(&mut self, count: usize); + + /// Return the number of buffered characters in `self`. + #[must_use] + fn buflen(&self) -> usize; + + /// Return the capacity of the buffer in `self`. + #[must_use] + fn bufmaxlen(&self) -> usize; + + /// Return whether the buffer (!= stream) is empty. + #[inline] + #[must_use] + fn buf_is_empty(&self) -> bool { + self.buflen() == 0 + } + + /// Read a character from the input stream and return it directly. + /// + /// The internal buffer (if any) is bypassed. + #[must_use] + fn raw_read_ch(&mut self) -> char; + + /// Put a character back in the buffer. + /// + /// This function is only called when we read one too many characters and the pushed back + /// character is exactly the last character that was read. This function will not be called + /// multiple times consecutively. + fn push_back(&mut self, c: char); + + /// Consume the next character. + fn skip(&mut self); + + /// Consume the next `count` characters. 
+ fn skip_n(&mut self, count: usize); + + /// Return the next character, without consuming it. + /// + /// Users of the [`Input`] must make sure that the character has been loaded through a prior + /// call to [`Input::lookahead`]. Implementers of [`Input`] may assume that a valid call to + /// [`Input::lookahead`] has been made beforehand. + /// + /// # Return + /// If the input source is not exhausted, returns the next character to be fed into the + /// scanner. Otherwise, returns `\0`. + #[must_use] + fn peek(&self) -> char; + + /// Return the `n`-th character in the buffer, without consuming it. + /// + /// This function assumes that the n-th character in the input has already been fetched through + /// [`Input::lookahead`]. + #[must_use] + fn peek_nth(&self, n: usize) -> char; + + /// Look for the next character and return it. + /// + /// The character is not consumed. + /// Equivalent to calling [`Input::lookahead`] and [`Input::peek`]. + #[inline] + #[must_use] + fn look_ch(&mut self) -> char { + self.lookahead(1); + self.peek() + } + + /// Return whether the next character in the input source is equal to `c`. + /// + /// This function assumes that the next character in the input has already been fetched through + /// [`Input::lookahead`]. + #[inline] + #[must_use] + fn next_char_is(&self, c: char) -> bool { + self.peek() == c + } + + /// Return whether the `n`-th character in the input source is equal to `c`. + /// + /// This function assumes that the n-th character in the input has already been fetched through + /// [`Input::lookahead`]. + #[inline] + #[must_use] + fn nth_char_is(&self, n: usize, c: char) -> bool { + self.peek_nth(n) == c + } + + /// Return whether the next characters in the input source match the given pattern. + /// + /// This function assumes that the next `pat.len()` characters in the input have already been + /// fetched through [`Input::lookahead`]. 
+ #[must_use] + fn next_is(&self, pat: &str) -> bool; +} diff --git a/parser/src/lib.rs b/parser/src/lib.rs index 8cb802e..8cd4731 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -32,11 +32,14 @@ #![warn(missing_docs, clippy::pedantic)] -pub(crate) mod char_traits; +mod buffered_input; +mod char_traits; #[macro_use] -pub(crate) mod debug; -pub mod parser; -pub mod scanner; +mod debug; +mod input; +mod parser; +mod scanner; +pub use crate::buffered_input::BufferedInput; pub use crate::parser::{Event, EventReceiver, MarkedEventReceiver, Parser, Tag}; pub use crate::scanner::{Marker, ScanError, TScalarStyle}; diff --git a/parser/src/parser.rs b/parser/src/parser.rs index ea234f6..0d15275 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -4,7 +4,11 @@ //! compliance, and emits a stream of YAML events. This stream can for instance be used to create //! YAML objects. -use crate::scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType}; +use crate::{ + input::Input, + scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType}, + BufferedInput, +}; use std::collections::HashMap; #[derive(Clone, Copy, PartialEq, Debug, Eq)] @@ -100,7 +104,7 @@ impl Event { /// A YAML parser. #[derive(Debug)] -pub struct Parser { +pub struct Parser { /// The underlying scanner from which we pull tokens. scanner: Scanner, /// The stack of _previous_ states we were in. @@ -225,15 +229,15 @@ impl MarkedEventReceiver for R { /// A convenience alias for a `Result` of a parser event. pub type ParseResult = Result<(Event, Marker), ScanError>; -impl<'a> Parser> { +impl<'a> Parser>> { /// Create a new instance of a parser from a &str. #[must_use] pub fn new_from_str(value: &'a str) -> Self { - Parser::new(value.chars()) + Parser::new(BufferedInput::new(value.chars())) } } -impl> Parser { +impl Parser { /// Create a new instance of a parser from the given input of characters. 
pub fn new(src: T) -> Parser { Parser { @@ -1130,7 +1134,7 @@ impl> Parser { } } -impl> Iterator for Parser { +impl Iterator for Parser { type Item = Result<(Event, Marker), ScanError>; fn next(&mut self) -> Option { diff --git a/parser/src/scanner.rs b/parser/src/scanner.rs index 6eac393..9528dfb 100644 --- a/parser/src/scanner.rs +++ b/parser/src/scanner.rs @@ -11,11 +11,12 @@ use std::{char, collections::VecDeque, error::Error, fmt}; -use arraydeque::ArrayDeque; - -use crate::char_traits::{ - as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, - is_flow, is_hex, is_tag_char, is_uri_char, is_z, +use crate::{ + char_traits::{ + as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, + is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z, + }, + input::Input, }; /// The encoding of the input. Currently, only UTF-8 is supported. @@ -343,18 +344,6 @@ enum ImplicitMappingState { Inside, } -/// The size of the [`Scanner`] buffer. -/// -/// The buffer is statically allocated to avoid conditions for reallocations each time we -/// consume/push a character. As of now, almost all lookaheads are 4 characters maximum, except: -/// - Escape sequences parsing: some escape codes are 8 characters -/// - Scanning indent in scalars: this looks ahead `indent + 2` characters -/// -/// This constant must be set to at least 8. When scanning indent in scalars, the lookahead is done -/// in a single call if and only if the indent is `BUFFER_LEN - 2` or less. If the indent is higher -/// than that, the code will fall back to a loop of lookaheads. -const BUFFER_LEN: usize = 16; - /// The YAML scanner. /// /// This corresponds to the low-level interface when reading YAML. The scanner emits token as they @@ -367,8 +356,10 @@ const BUFFER_LEN: usize = 16; #[derive(Debug)] #[allow(clippy::struct_excessive_bools)] pub struct Scanner { - /// The reader, providing with characters. - rdr: T, + /// The input source. 
+ /// + /// This must implement [`Input`]. + input: T, /// The position of the cursor within the reader. mark: Marker, /// Buffer for tokens to be returned. @@ -378,8 +369,6 @@ pub struct Scanner { /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from /// [`Self::next`] until we have more context. tokens: VecDeque, - /// Buffer for the next characters to consume. - buffer: ArrayDeque, /// The last error that happened. error: Option, @@ -435,7 +424,7 @@ pub struct Scanner { implicit_flow_mapping_states: Vec, } -impl> Iterator for Scanner { +impl Iterator for Scanner { type Item = Token; fn next(&mut self) -> Option { if self.error.is_some() { @@ -462,12 +451,11 @@ impl> Iterator for Scanner { /// A convenience alias for scanner functions that may fail without returning a value. pub type ScanResult = Result<(), ScanError>; -impl> Scanner { +impl Scanner { /// Creates the YAML tokenizer. - pub fn new(rdr: T) -> Scanner { + pub fn new(input: T) -> Scanner { Scanner { - rdr, - buffer: ArrayDeque::new(), + input, mark: Marker::new(0, 1, 0), tokens: VecDeque::new(), error: None, @@ -497,25 +485,10 @@ impl> Scanner { self.error.clone() } - /// Fill `self.buffer` with at least `count` characters. - /// - /// The characters that are extracted this way are not consumed but only placed in the buffer. - #[inline] - fn lookahead(&mut self, count: usize) { - if self.buffer.len() >= count { - return; - } - for _ in 0..(count - self.buffer.len()) { - self.buffer - .push_back(self.rdr.next().unwrap_or('\0')) - .unwrap(); - } - } - /// Consume the next character. It is assumed the next character is a blank. #[inline] fn skip_blank(&mut self) { - self.buffer.pop_front(); + self.input.skip(); self.mark.index += 1; self.mark.col += 1; @@ -524,7 +497,7 @@ impl> Scanner { /// Consume the next character. It is assumed the next character is not a blank. 
#[inline] fn skip_non_blank(&mut self) { - self.buffer.pop_front(); + self.input.skip(); self.mark.index += 1; self.mark.col += 1; @@ -533,18 +506,18 @@ impl> Scanner { /// Consume the next characters. It is assumed none of the next characters are blanks. #[inline] - fn skip_n_non_blank(&mut self, n: usize) { - self.buffer.drain(0..n); + fn skip_n_non_blank(&mut self, count: usize) { + self.input.skip_n(count); - self.mark.index += n; - self.mark.col += n; + self.mark.index += count; + self.mark.col += count; self.leading_whitespace = false; } /// Consume the next character. It is assumed the next character is a newline. #[inline] fn skip_nl(&mut self) { - self.buffer.pop_front(); + self.input.skip(); self.mark.index += 1; self.mark.col = 0; @@ -555,12 +528,12 @@ impl> Scanner { /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none. #[inline] fn skip_linebreak(&mut self) { - if self.buffer[0] == '\r' && self.buffer[1] == '\n' { + if self.input.next_is("\r\n") { // While technically not a blank, this does not matter as `self.leading_whitespace` // will be reset by `skip_nl`. self.skip_blank(); self.skip_nl(); - } else if is_break(self.buffer[0]) { + } else if is_break(self.input.peek()) { self.skip_nl(); } } @@ -570,32 +543,16 @@ impl> Scanner { /// The character is not consumed. #[inline] fn ch(&self) -> char { - self.buffer[0] - } - - /// Look for the next character and return it. - /// - /// The character is not consumed. - /// Equivalent to calling [`Self::lookahead`] and [`Self::ch`]. - #[inline] - fn look_ch(&mut self) -> char { - self.lookahead(1); - self.ch() + self.input.peek() } /// Read a character from the input stream, returning it directly. /// - /// The buffer is bypassed and `self.mark` needs to be updated manually. + /// The buffer (if any) is bypassed and `self.mark` needs to be updated manually. 
#[inline] #[must_use] fn raw_read_ch(&mut self) -> char { - self.rdr.next().unwrap_or('\0') - } - - /// Return whether the next character is `c`. - #[inline] - fn ch_is(&self, c: char) -> bool { - self.buffer[0] == c + self.input.raw_read_ch() } /// Return whether the [`TokenType::StreamStart`] event has been emitted. @@ -624,8 +581,8 @@ impl> Scanner { // If the next characters do not correspond to a line break. #[inline] fn read_break(&mut self, s: &mut String) { - let c = self.buffer[0]; - let nc = self.buffer[1]; + let c = self.input.peek(); + let nc = self.input.peek_nth(1); debug_assert!(is_break(c)); if c == '\r' && nc == '\n' { self.skip_blank(); @@ -635,15 +592,20 @@ impl> Scanner { s.push('\n'); } + /// Check whether the next characters correspond to a start of document. + /// + /// [`Self::lookahead`] must have been called before calling this function. + fn next_is_document_start(&self) -> bool { + assert!(self.input.buflen() >= 4); + self.input.next_is("---") && is_blank_or_breakz(self.input.peek_nth(3)) + } + /// Check whether the next characters correspond to an end of document. /// /// [`Self::lookahead`] must have been called before calling this function. fn next_is_document_end(&self) -> bool { - assert!(self.buffer.len() >= 4); - self.buffer[0] == '.' - && self.buffer[1] == '.' - && self.buffer[2] == '.' - && is_blank_or_breakz(self.buffer[3]) + assert!(self.input.buflen() >= 4); + self.input.next_is("...") && is_blank_or_breakz(self.input.peek_nth(3)) } /// Check whether the next characters correspond to a document indicator. @@ -651,11 +613,9 @@ impl> Scanner { /// [`Self::lookahead`] must have been called before calling this function. 
#[inline] fn next_is_document_indicator(&self) -> bool { - assert!(self.buffer.len() >= 4); - self.mark.col == 0 - && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-')) - || ((self.buffer[0] == '.') && (self.buffer[1] == '.') && (self.buffer[2] == '.'))) - && is_blank_or_breakz(self.buffer[3]) + assert!(self.input.buflen() >= 4); + is_blank_or_breakz(self.input.peek_nth(3)) + && (self.input.next_is("...") || self.input.next_is("---")) } /// Insert a token at the given position. @@ -674,11 +634,11 @@ impl> Scanner { } /// Fetch the next token in the stream. + /// /// # Errors /// Returns `ScanError` when the scanner does not find the next expected token. pub fn fetch_next_token(&mut self) -> ScanResult { - self.lookahead(1); - // eprintln!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch()); + self.input.lookahead(1); if !self.stream_start_produced { self.fetch_stream_start(); @@ -697,51 +657,37 @@ impl> Scanner { let mark = self.mark; self.unroll_indent(mark.col as isize); - self.lookahead(4); + self.input.lookahead(4); if is_z(self.ch()) { self.fetch_stream_end()?; return Ok(()); } - // Is it a directive? - if self.mark.col == 0 && self.ch_is('%') { - return self.fetch_directive(); - } - - if self.mark.col == 0 - && self.buffer[0] == '-' - && self.buffer[1] == '-' - && self.buffer[2] == '-' - && is_blank_or_breakz(self.buffer[3]) - { - self.fetch_document_indicator(TokenType::DocumentStart)?; - return Ok(()); - } - - if self.mark.col == 0 - && self.buffer[0] == '.' - && self.buffer[1] == '.' - && self.buffer[2] == '.' 
- && is_blank_or_breakz(self.buffer[3]) - { - self.fetch_document_indicator(TokenType::DocumentEnd)?; - self.skip_ws_to_eol(SkipTabs::Yes)?; - if !is_breakz(self.ch()) { - return Err(ScanError::new_str( - self.mark, - "invalid content after document end marker", - )); + if self.mark.col == 0 { + if self.input.next_char_is('%') { + return self.fetch_directive(); + } else if self.next_is_document_start() { + return self.fetch_document_indicator(TokenType::DocumentStart); + } else if self.next_is_document_end() { + self.fetch_document_indicator(TokenType::DocumentEnd)?; + self.skip_ws_to_eol(SkipTabs::Yes)?; + if !is_breakz(self.ch()) { + return Err(ScanError::new_str( + self.mark, + "invalid content after document end marker", + )); + } + return Ok(()); } - return Ok(()); } if (self.mark.col as isize) < self.indent { return Err(ScanError::new_str(self.mark, "invalid indentation")); } - let c = self.buffer[0]; - let nc = self.buffer[1]; + let c = self.input.peek(); + let nc = self.input.peek_nth(1); match c { '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart), '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart), @@ -860,7 +806,7 @@ impl> Scanner { Ok(()) } - /// Skip over all whitespace and comments until the next token. + /// Skip over all whitespace (`\t`, ` `, `\n`, `\r`) and comments until the next token. /// /// # Errors /// This function returns an error if a tabulation is encountered where there should not be @@ -868,7 +814,7 @@ impl> Scanner { fn skip_to_next_token(&mut self) -> ScanResult { loop { // TODO(chenyh) BOM - match self.look_ch() { + match self.input.look_ch() { // Tabs may not be used as indentation. // "Indentation" only exists as long as a block is started, but does not exist // inside of flow-style constructs. 
Tabs are allowed as part of leading @@ -890,14 +836,14 @@ impl> Scanner { } '\t' | ' ' => self.skip_blank(), '\n' | '\r' => { - self.lookahead(2); + self.input.lookahead(2); self.skip_linebreak(); if self.flow_level == 0 { self.allow_simple_key(); } } '#' => { - while !is_breakz(self.look_ch()) { + while !is_breakz(self.input.look_ch()) { self.skip_non_blank(); } } @@ -914,14 +860,14 @@ impl> Scanner { fn skip_yaml_whitespace(&mut self) -> ScanResult { let mut need_whitespace = true; loop { - match self.look_ch() { + match self.input.look_ch() { ' ' => { self.skip_blank(); need_whitespace = false; } '\n' | '\r' => { - self.lookahead(2); + self.input.lookahead(2); self.skip_linebreak(); if self.flow_level == 0 { self.allow_simple_key(); @@ -929,7 +875,7 @@ impl> Scanner { need_whitespace = false; } '#' => { - while !is_breakz(self.look_ch()) { + while !is_breakz(self.input.look_ch()) { self.skip_non_blank(); } } @@ -949,7 +895,7 @@ impl> Scanner { let mut encountered_tab = false; let mut has_yaml_ws = false; loop { - match self.look_ch() { + match self.input.look_ch() { ' ' => { has_yaml_ws = true; self.skip_blank(); @@ -966,7 +912,7 @@ impl> Scanner { )); } '#' => { - while !is_breakz(self.look_ch()) { + while !is_breakz(self.input.look_ch()) { self.skip_non_blank(); } } @@ -1035,7 +981,7 @@ impl> Scanner { // XXX This should be a warning instead of an error _ => { // skip current line - while !is_breakz(self.look_ch()) { + while !is_breakz(self.input.look_ch()) { self.skip_non_blank(); } // XXX return an empty TagDirective token @@ -1051,7 +997,7 @@ impl> Scanner { self.skip_ws_to_eol(SkipTabs::Yes)?; if is_breakz(self.ch()) { - self.lookahead(2); + self.input.lookahead(2); self.skip_linebreak(); Ok(tok) } else { @@ -1063,7 +1009,7 @@ impl> Scanner { } fn scan_version_directive_value(&mut self, mark: &Marker) -> Result { - while is_blank(self.look_ch()) { + while is_blank(self.input.look_ch()) { self.skip_blank(); } @@ -1085,7 +1031,7 @@ impl> Scanner { fn 
scan_directive_name(&mut self) -> Result { let start_mark = self.mark; let mut string = String::new(); - while is_alpha(self.look_ch()) { + while is_alpha(self.input.look_ch()) { string.push(self.ch()); self.skip_non_blank(); } @@ -1110,7 +1056,7 @@ impl> Scanner { fn scan_version_directive_number(&mut self, mark: &Marker) -> Result { let mut val = 0u32; let mut length = 0usize; - while let Some(digit) = self.look_ch().to_digit(10) { + while let Some(digit) = self.input.look_ch().to_digit(10) { if length + 1 > 9 { return Err(ScanError::new_str( *mark, @@ -1134,19 +1080,19 @@ impl> Scanner { fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result { /* Eat whitespaces. */ - while is_blank(self.look_ch()) { + while is_blank(self.input.look_ch()) { self.skip_blank(); } let handle = self.scan_tag_handle(true, mark)?; /* Eat whitespaces. */ - while is_blank(self.look_ch()) { + while is_blank(self.input.look_ch()) { self.skip_blank(); } let prefix = self.scan_tag_prefix(mark)?; - self.lookahead(1); + self.input.lookahead(1); if is_blank_or_breakz(self.ch()) { Ok(Token(*mark, TokenType::TagDirective(handle, prefix))) @@ -1173,9 +1119,9 @@ impl> Scanner { let mut suffix; // Check if the tag is in the canonical form (verbatim). - self.lookahead(2); + self.input.lookahead(2); - if self.buffer[1] == '<' { + if self.input.nth_char_is(1, '<') { suffix = self.scan_verbatim_tag(&start_mark)?; } else { // The tag has either the '!suffix' or the '!handle!suffix' @@ -1198,7 +1144,7 @@ impl> Scanner { } } - if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) { + if is_blank_or_breakz(self.input.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) { // XXX: ex 7.2, an empty scalar can follow a secondary tag Ok(Token(start_mark, TokenType::Tag(handle, suffix))) } else { @@ -1211,7 +1157,7 @@ impl> Scanner { fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result { let mut string = String::new(); - if self.look_ch() != '!' 
{ + if self.input.look_ch() != '!' { return Err(ScanError::new_str( *mark, "while scanning a tag, did not find expected '!'", @@ -1221,7 +1167,7 @@ impl> Scanner { string.push(self.ch()); self.skip_non_blank(); - while is_alpha(self.look_ch()) { + while is_alpha(self.input.look_ch()) { string.push(self.ch()); self.skip_non_blank(); } @@ -1250,7 +1196,7 @@ impl> Scanner { fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result { let mut string = String::new(); - if self.look_ch() == '!' { + if self.input.look_ch() == '!' { // If we have a local tag, insert and skip `!`. string.push(self.ch()); self.skip_non_blank(); @@ -1269,7 +1215,7 @@ impl> Scanner { self.skip_non_blank(); } - while is_uri_char(self.look_ch()) { + while is_uri_char(self.input.look_ch()) { if self.ch() == '%' { string.push(self.scan_uri_escapes(start_mark)?); } else { @@ -1290,7 +1236,7 @@ impl> Scanner { self.skip_non_blank(); let mut string = String::new(); - while is_uri_char(self.look_ch()) { + while is_uri_char(self.input.look_ch()) { if self.ch() == '%' { string.push(self.scan_uri_escapes(start_mark)?); } else { @@ -1326,7 +1272,7 @@ impl> Scanner { string.extend(head.chars().skip(1)); } - while is_tag_char(self.look_ch()) { + while is_tag_char(self.input.look_ch()) { // Check if it is a URI-escape sequence. 
if self.ch() == '%' { string.push(self.scan_uri_escapes(mark)?); @@ -1352,38 +1298,41 @@ impl> Scanner { let mut width = 0usize; let mut code = 0u32; loop { - self.lookahead(3); + self.input.lookahead(3); - if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) { + let c = self.input.peek_nth(1); + let nc = self.input.peek_nth(2); + + if !(self.ch() == '%' && is_hex(c) && is_hex(nc)) { return Err(ScanError::new_str( *mark, - "while parsing a tag, did not find URI escaped octet", + "while parsing a tag, found an invalid escape sequence", )); } - let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]); + let byte = (as_hex(c) << 4) + as_hex(nc); if width == 0 { - width = match octet { - _ if octet & 0x80 == 0x00 => 1, - _ if octet & 0xE0 == 0xC0 => 2, - _ if octet & 0xF0 == 0xE0 => 3, - _ if octet & 0xF8 == 0xF0 => 4, + width = match byte { + _ if byte & 0x80 == 0x00 => 1, + _ if byte & 0xE0 == 0xC0 => 2, + _ if byte & 0xF0 == 0xE0 => 3, + _ if byte & 0xF8 == 0xF0 => 4, _ => { return Err(ScanError::new_str( *mark, - "while parsing a tag, found an incorrect leading UTF-8 octet", + "while parsing a tag, found an incorrect leading UTF-8 byte", )); } }; - code = octet; + code = byte; } else { - if octet & 0xc0 != 0x80 { + if byte & 0xc0 != 0x80 { return Err(ScanError::new_str( *mark, - "while parsing a tag, found an incorrect trailing UTF-8 octet", + "while parsing a tag, found an incorrect trailing UTF-8 byte", )); } - code = (code << 8) + octet; + code = (code << 8) + byte; } self.skip_n_non_blank(3); @@ -1419,7 +1368,7 @@ impl> Scanner { let start_mark = self.mark; self.skip_non_blank(); - while is_anchor_char(self.look_ch()) { + while is_anchor_char(self.input.look_ch()) { string.push(self.ch()); self.skip_non_blank(); } @@ -1556,8 +1505,9 @@ impl> Scanner { // generate BLOCK-SEQUENCE-START if indented self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark); let found_tabs = 
self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs(); - self.lookahead(2); - if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) { + self.input.lookahead(2); + if found_tabs && self.input.next_char_is('-') && is_blank_or_breakz(self.input.peek_nth(1)) + { return Err(ScanError::new_str( self.mark, "'-' must be followed by a valid YAML whitespace", @@ -1565,7 +1515,7 @@ impl> Scanner { } self.skip_ws_to_eol(SkipTabs::No)?; - if is_break(self.look_ch()) || is_flow(self.ch()) { + if is_break(self.input.look_ch()) || is_flow(self.ch()) { self.roll_one_col_indent(); } @@ -1623,14 +1573,14 @@ impl> Scanner { self.skip_non_blank(); self.unroll_non_block_indents(); - if self.look_ch() == '+' || self.ch() == '-' { + if self.input.look_ch() == '+' || self.ch() == '-' { if self.ch() == '+' { chomping = Chomping::Keep; } else { chomping = Chomping::Strip; } self.skip_non_blank(); - if is_digit(self.look_ch()) { + if is_digit(self.input.look_ch()) { if self.ch() == '0' { return Err(ScanError::new_str( start_mark, @@ -1650,7 +1600,7 @@ impl> Scanner { increment = (self.ch() as usize) - ('0' as usize); self.skip_non_blank(); - self.lookahead(1); + self.input.lookahead(1); if self.ch() == '+' || self.ch() == '-' { if self.ch() == '+' { chomping = Chomping::Keep; @@ -1664,7 +1614,7 @@ impl> Scanner { self.skip_ws_to_eol(SkipTabs::Yes)?; // Check if we are at the end of the line. 
- if !is_breakz(self.look_ch()) { + if !is_breakz(self.input.look_ch()) { return Err(ScanError::new_str( start_mark, "while scanning a block scalar, did not find expected comment or line break", @@ -1672,11 +1622,11 @@ impl> Scanner { } if is_break(self.ch()) { - self.lookahead(2); + self.input.lookahead(2); self.read_break(&mut chomping_break); } - if self.look_ch() == '\t' { + if self.input.look_ch() == '\t' { return Err(ScanError::new_str( start_mark, "a block scalar content cannot start with a tab", @@ -1731,7 +1681,7 @@ impl> Scanner { let start_mark = self.mark; while self.mark.col == indent && !is_z(self.ch()) { if indent == 0 { - self.lookahead(4); + self.input.lookahead(4); if self.next_is_document_end() { break; } @@ -1761,7 +1711,7 @@ impl> Scanner { break; } - self.lookahead(2); + self.input.lookahead(2); self.read_break(&mut leading_break); // Eat the following indentation spaces and line breaks. @@ -1797,7 +1747,7 @@ impl> Scanner { /// line. This function does not consume the line break character(s) after the line. fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) { // Start by evaluating characters in the buffer. - while !self.buffer.is_empty() && !is_breakz(self.ch()) { + while !self.input.buf_is_empty() && !is_breakz(self.ch()) { string.push(self.ch()); // We may technically skip non-blank characters. However, the only distinction is // to determine what is leading whitespace and what is not. Here, we read the @@ -1809,7 +1759,7 @@ impl> Scanner { // All characters that were in the buffer were consumed. We need to check if more // follow. - if self.buffer.is_empty() { + if self.input.buf_is_empty() { // We will read all consecutive non-breakz characters. We push them into a // temporary buffer. 
The main difference with going through `self.buffer` is that // characters are appended here as their real size (1B for ascii, or up to 4 bytes for @@ -1824,7 +1774,7 @@ impl> Scanner { // Our last character read is stored in `c`. It is either an EOF or a break. In any // case, we need to push it back into `self.buffer` so it may be properly read // after. We must not insert it in `string`. - self.buffer.push_back(c).unwrap(); + self.input.push_back(c); // We need to manually update our position; we haven't called a `skip` function. self.mark.col += line_buffer.len(); @@ -1842,25 +1792,25 @@ impl> Scanner { fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) { loop { // Consume all spaces. Tabs cannot be used as indentation. - if indent < BUFFER_LEN - 2 { - self.lookahead(BUFFER_LEN); + if indent < self.input.bufmaxlen() - 2 { + self.input.lookahead(self.input.bufmaxlen()); while self.mark.col < indent && self.ch() == ' ' { self.skip_blank(); } } else { loop { - self.lookahead(BUFFER_LEN); - while !self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ' { + self.input.lookahead(self.input.bufmaxlen()); + while !self.input.buf_is_empty() && self.mark.col < indent && self.ch() == ' ' { self.skip_blank(); } // If we reached our indent, we can break. We must also break if we have // reached content or EOF; that is, the buffer is not empty and the next // character is not a space. - if self.mark.col == indent || (!self.buffer.is_empty() && self.ch() != ' ') { + if self.mark.col == indent || (!self.input.buf_is_empty() && self.ch() != ' ') { break; } } - self.lookahead(2); + self.input.lookahead(2); } // If our current line is empty, skip over the break and continue looping. @@ -1881,7 +1831,7 @@ impl> Scanner { let mut max_indent = 0; loop { // Consume all spaces. Tabs cannot be used as indentation. 
- while self.look_ch() == ' ' { + while self.input.look_ch() == ' ' { self.skip_blank(); } @@ -1891,7 +1841,7 @@ impl> Scanner { if is_break(self.ch()) { // If our current line is empty, skip over the break and continue looping. - self.lookahead(2); + self.input.lookahead(2); self.read_break(breaks); } else { // Otherwise, we have a content line. Return control. @@ -1943,15 +1893,9 @@ impl> Scanner { loop { /* Check for a document indicator. */ - self.lookahead(4); + self.input.lookahead(4); - if self.mark.col == 0 - && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-')) - || ((self.buffer[0] == '.') - && (self.buffer[1] == '.') - && (self.buffer[2] == '.'))) - && is_blank_or_breakz(self.buffer[3]) - { + if self.mark.col == 0 && self.next_is_document_indicator() { return Err(ScanError::new_str( start_mark, "while scanning a quoted scalar, found unexpected document indicator", @@ -1980,7 +1924,7 @@ impl> Scanner { &start_mark, )?; - match self.look_ch() { + match self.input.look_ch() { '\'' if single => break, '"' if !single => break, _ => {} @@ -2003,7 +1947,7 @@ impl> Scanner { self.skip_blank(); } } else { - self.lookahead(2); + self.input.lookahead(2); // Check if it is a first line break. if leading_blanks { self.read_break(&mut trailing_breaks); @@ -2013,7 +1957,7 @@ impl> Scanner { leading_blanks = true; } } - self.lookahead(1); + self.input.lookahead(1); } // Join the whitespaces or fold line breaks. @@ -2083,11 +2027,11 @@ impl> Scanner { leading_blanks: &mut bool, start_mark: &Marker, ) -> Result<(), ScanError> { - self.lookahead(2); + self.input.lookahead(2); while !is_blank_or_breakz(self.ch()) { match self.ch() { // Check for an escaped single quote. - '\'' if self.buffer[1] == '\'' && single => { + '\'' if self.input.peek_nth(1) == '\'' && single => { string.push('\''); self.skip_n_non_blank(2); } @@ -2095,8 +2039,8 @@ impl> Scanner { '\'' if single => break, '"' if !single => break, // Check for an escaped line break. 
- '\\' if !single && is_break(self.buffer[1]) => { - self.lookahead(3); + '\\' if !single && is_break(self.input.peek_nth(1)) => { + self.input.lookahead(3); self.skip_non_blank(); self.skip_linebreak(); *leading_blanks = true; @@ -2111,7 +2055,7 @@ impl> Scanner { self.skip_non_blank(); } } - self.lookahead(2); + self.input.lookahead(2); } Ok(()) } @@ -2129,7 +2073,7 @@ impl> Scanner { let mut code_length = 0usize; let mut ret = '\0'; - match self.buffer[1] { + match self.input.peek_nth(1) { '0' => ret = '\0', 'a' => ret = '\x07', 'b' => ret = '\x08', @@ -2165,16 +2109,17 @@ impl> Scanner { // Consume an arbitrary escape code. if code_length > 0 { - self.lookahead(code_length); + self.input.lookahead(code_length); let mut value = 0u32; for i in 0..code_length { - if !is_hex(self.buffer[i]) { + let c = self.input.peek_nth(i); + if !is_hex(c) { return Err(ScanError::new_str( *start_mark, "while parsing a quoted scalar, did not find expected hexadecimal number", )); } - value = (value << 4) + as_hex(self.buffer[i]); + value = (value << 4) + as_hex(c); } let Some(ch) = char::from_u32(value) else { @@ -2223,12 +2168,12 @@ impl> Scanner { let mut whitespaces = String::with_capacity(32); loop { - self.lookahead(4); + self.input.lookahead(4); if self.next_is_document_indicator() || self.ch() == '#' { break; } - if self.flow_level > 0 && self.ch() == '-' && is_flow(self.buffer[1]) { + if self.flow_level > 0 && self.ch() == '-' && is_flow(self.input.peek_nth(1)) { return Err(ScanError::new_str( self.mark, "plain scalar cannot start with '-' followed by ,[]{}", @@ -2260,7 +2205,7 @@ impl> Scanner { // We can unroll the first iteration of the loop. string.push(self.ch()); self.skip_non_blank(); - self.lookahead(2); + self.input.lookahead(2); // Add content non-blank characters to the scalar. 
while !is_blank_or_breakz(self.ch()) { @@ -2270,7 +2215,7 @@ impl> Scanner { string.push(self.ch()); self.skip_non_blank(); - self.lookahead(2); + self.input.lookahead(2); } } @@ -2283,7 +2228,7 @@ impl> Scanner { } // Process blank characters. - while is_blank(self.look_ch()) || is_break(self.ch()) { + while is_blank(self.input.look_ch()) || is_break(self.ch()) { if is_blank(self.ch()) { if !self.leading_whitespace { whitespaces.push(self.ch()); @@ -2302,7 +2247,7 @@ impl> Scanner { self.skip_blank(); } } else { - self.lookahead(2); + self.input.lookahead(2); // Check if it is a first line break if self.leading_whitespace { self.read_break(&mut trailing_breaks); @@ -2379,7 +2324,7 @@ impl> Scanner { /// [`self.flow_level`]: Self::flow_level /// [`fetch_value`]: Self::fetch_value fn fetch_flow_value(&mut self) -> ScanResult { - let nc = self.buffer[1]; + let nc = self.input.peek_nth(1); // If we encounter a ':' inside a flow collection and it is not immediately // followed by a blank or breakz: @@ -2413,7 +2358,7 @@ impl> Scanner { // Skip over ':'. self.skip_non_blank(); - if self.look_ch() == '\t' + if self.input.look_ch() == '\t' && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws() && (self.ch() == '-' || is_alpha(self.ch())) { @@ -2600,8 +2545,8 @@ impl> Scanner { fn next_can_be_plain_scalar(&self) -> bool { match self.ch() { // indicators can end a plain scalar, see 7.3.3. 
Plain Style - ':' if is_blank_or_breakz(self.buffer[1]) - || (self.flow_level > 0 && is_flow(self.buffer[1])) => + ':' if is_blank_or_breakz(self.input.peek_nth(1)) + || (self.flow_level > 0 && is_flow(self.input.peek_nth(1))) => { false } diff --git a/parser/tests/basic.rs b/parser/tests/basic.rs index c3532fa..9782a32 100644 --- a/parser/tests/basic.rs +++ b/parser/tests/basic.rs @@ -231,7 +231,7 @@ a: |- #[test] fn test_bad_docstart() { - assert!(run_parser("---This used to cause an infinite loop").is_ok()); + run_parser("---This used to cause an infinite loop").unwrap(); assert_eq!( run_parser("----").unwrap(), [ diff --git a/parser/tools/dump_events.rs b/parser/tools/dump_events.rs index 16908b9..0ec84c9 100644 --- a/parser/tools/dump_events.rs +++ b/parser/tools/dump_events.rs @@ -2,11 +2,7 @@ use std::env; use std::fs::File; use std::io::prelude::*; -use saphyr_parser::{ - parser::{MarkedEventReceiver, Parser}, - scanner::Marker, - Event, -}; +use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser}; #[derive(Debug)] struct EventSink { diff --git a/parser/tools/time_parse.rs b/parser/tools/time_parse.rs index 24cb875..154d698 100644 --- a/parser/tools/time_parse.rs +++ b/parser/tools/time_parse.rs @@ -1,12 +1,9 @@ -use saphyr_parser::{ - parser::{MarkedEventReceiver, Parser}, - scanner::Marker, - Event, -}; use std::env; use std::fs::File; use std::io::prelude::*; +use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser}; + /// A sink which discards any event sent. struct NullSink {}