From db4f26da425ce3f2d6a7431328f2bf8fd4f306fd Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Fri, 5 Jul 2024 21:38:34 +0200 Subject: [PATCH] Add `StrInput`. --- parser/justfile | 2 +- parser/src/input.rs | 1 + parser/src/input/str.rs | 251 ++++++++++++++++++++++++++++++++++++++++ parser/src/parser.rs | 7 +- 4 files changed, 256 insertions(+), 5 deletions(-) create mode 100644 parser/src/input/str.rs diff --git a/parser/justfile b/parser/justfile index 9644f63..b214570 100644 --- a/parser/justfile +++ b/parser/justfile @@ -24,4 +24,4 @@ ethi_build_dump: ethi_compare: ethi_build_dump cg_file=`\ls -1t callgrind.out.* | head -n1` && callgrind_annotate $cg_file --auto=no --threshold=99.99 > cg/WORK && rm $cg_file - callgrind_differ cg/00{05,06,07,08,09,10,11,12}* cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a + callgrind_differ `\ls cg/0*` cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a diff --git a/parser/src/input.rs b/parser/src/input.rs index a9402bb..f407257 100644 --- a/parser/src/input.rs +++ b/parser/src/input.rs @@ -1,4 +1,5 @@ pub mod buffered; +pub mod str; #[allow(clippy::module_name_repetitions)] pub use buffered::BufferedInput; diff --git a/parser/src/input/str.rs b/parser/src/input/str.rs new file mode 100644 index 0000000..deaf58d --- /dev/null +++ b/parser/src/input/str.rs @@ -0,0 +1,251 @@ +use crate::{char_traits::is_blank_or_breakz, input::Input}; + +#[allow(clippy::module_name_repetitions)] +pub struct StrInput<'a> { + /// The input str buffer. + buffer: &'a str, + /// The number of characters (**not** bytes) in the buffer. + n_chars: usize, + /// The largest number of characters the parser has asked us to look ahead so far. + /// + /// The input is fully available, so nothing is buffered; we only record the request so + /// that we can return the correct value in [`Self::buflen`]. + lookahead: usize, +} + +impl<'a> StrInput<'a> { + /// Create a new [`StrInput`] with the given str.
+ pub fn new(input: &'a str) -> Self { + Self { + buffer: input, + n_chars: input.chars().count(), + lookahead: 0, + } + } +} + +impl<'a> Input for StrInput<'a> { + #[inline] + fn lookahead(&mut self, x: usize) { + // We already have all characters that we need. + // We cannot add '\0's to the buffer should we prematurely reach EOF. + // Returning '\0's is left to the character-retrieving functions. + self.lookahead = self.lookahead.max(x); + } + + #[inline] + fn buflen(&self) -> usize { + self.lookahead + } + + #[inline] + fn bufmaxlen(&self) -> usize { + BUFFER_LEN + } + + fn buf_is_empty(&self) -> bool { + self.buflen() == 0 + } + + #[inline] + fn raw_read_ch(&mut self) -> char { + let mut chars = self.buffer.chars(); + if let Some(c) = chars.next() { + self.buffer = chars.as_str(); + self.n_chars -= 1; + c + } else { + '\0' + } + } + + #[inline] + fn push_back(&mut self, c: char) { + let n_bytes = c.len_utf8(); + + // SAFETY: The character that gets pushed back is guaranteed to be the one that is + // immediately preceding our buffer. We can compute the length of the character and move + // our buffer back that many bytes. NOTE(review): `n_chars` is not restored here; confirm.
+ unsafe { + let buffer_byte_len = self.buffer.len(); + let mut now_ptr = self.buffer.as_ptr(); + now_ptr = now_ptr.wrapping_sub(n_bytes); + self.buffer = std::str::from_utf8_unchecked(std::slice::from_raw_parts( + now_ptr, + buffer_byte_len + n_bytes, + )); + } + } + + #[inline] + fn skip(&mut self) { + let mut chars = self.buffer.chars(); + if chars.next().is_some() { + self.buffer = chars.as_str(); + self.n_chars -= 1; + } + } + + #[inline] + fn skip_n(&mut self, count: usize) { + let mut chars = self.buffer.chars(); + for _ in 0..count { + if chars.next().is_none() { + break; + } + } + self.buffer = chars.as_str(); + self.n_chars = self.n_chars.saturating_sub(count); + } + + #[inline] + fn peek(&self) -> char { + self.buffer.chars().next().unwrap_or('\0') + } + + #[inline] + fn peek_nth(&self, n: usize) -> char { + let mut chars = self.buffer.chars(); + for _ in 0..n { + if chars.next().is_none() { + return '\0'; + } + } + chars.next().unwrap_or('\0') + } + + #[inline] + fn look_ch(&mut self) -> char { + self.lookahead(1); + self.peek() + } + + #[inline] + fn next_char_is(&self, c: char) -> bool { + self.peek() == c + } + + #[inline] + fn nth_char_is(&self, n: usize, c: char) -> bool { + self.peek_nth(n) == c + } + + #[inline] + fn next_2_are(&self, c1: char, c2: char) -> bool { + let mut chars = self.buffer.chars(); + chars.next().is_some_and(|c| c == c1) && chars.next().is_some_and(|c| c == c2) + } + + #[inline] + fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool { + let mut chars = self.buffer.chars(); + chars.next().is_some_and(|c| c == c1) + && chars.next().is_some_and(|c| c == c2) + && chars.next().is_some_and(|c| c == c3) + } + + #[inline] + fn next_is_document_indicator(&self) -> bool { + if self.buffer.len() < 3 { + false + } else { + // Since all characters we look for are ASCII, we can directly use the byte API of str.
+ (if self.buffer.len() == 3 { + true + } else { + is_blank_or_breakz(self.buffer.as_bytes()[3] as char) + }) && (self.buffer.starts_with("...") || self.buffer.starts_with("---")) + } + } + + #[inline] + fn next_is_document_start(&self) -> bool { + if self.buffer.len() < 3 { + false + } else { + // Since all characters we look for are ASCII, we can directly use the byte API of str. + (if self.buffer.len() == 3 { + true + } else { + is_blank_or_breakz(self.buffer.as_bytes()[3] as char) + }) && self.buffer.starts_with("---") + } + } + + #[inline] + fn next_is_document_end(&self) -> bool { + if self.buffer.len() < 3 { + false + } else { + // Since all characters we look for are ASCII, we can directly use the byte API of str. + (if self.buffer.len() == 3 { + true + } else { + is_blank_or_breakz(self.buffer.as_bytes()[3] as char) + }) && self.buffer.starts_with("...") + } + } +} + +/// The buffer size we return to the scanner. +/// +/// This does not correspond to any allocated buffer size. In practice, the scanner can withdraw +/// any character it wants. If it's within the input buffer, the given character is returned, +/// otherwise `\0` is returned. +/// +/// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size +/// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It +/// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return +/// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner +/// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its +/// call. +/// +/// This creates a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is +/// called with, which in turn dictates what [`buflen`] returns.
In order to avoid breaking any +/// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line +/// at a time, should fit what we expect to be a good balance between memory consumption and what +/// we expect the maximum line length to be. +/// +/// [`lookahead`]: `StrInput::lookahead` +/// [`bufmaxlen`]: `StrInput::bufmaxlen` +/// [`buflen`]: `StrInput::buflen` +const BUFFER_LEN: usize = 128; + +#[cfg(test)] +mod test { + use crate::input::Input; + + use super::StrInput; + + #[test] + pub fn is_document_start() { + let input = StrInput::new("---\n"); + assert!(input.next_is_document_start()); + assert!(input.next_is_document_indicator()); + let input = StrInput::new("---"); + assert!(input.next_is_document_start()); + assert!(input.next_is_document_indicator()); + let input = StrInput::new("...\n"); + assert!(!input.next_is_document_start()); + assert!(input.next_is_document_indicator()); + let input = StrInput::new("--- "); + assert!(input.next_is_document_start()); + assert!(input.next_is_document_indicator()); + } + + #[test] + pub fn is_document_end() { + let input = StrInput::new("...\n"); + assert!(input.next_is_document_end()); + assert!(input.next_is_document_indicator()); + let input = StrInput::new("..."); + assert!(input.next_is_document_end()); + assert!(input.next_is_document_indicator()); + let input = StrInput::new("---\n"); + assert!(!input.next_is_document_end()); + assert!(input.next_is_document_indicator()); + let input = StrInput::new("... "); + assert!(input.next_is_document_end()); + assert!(input.next_is_document_indicator()); + } +} diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 0d15275..76dbe67 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -5,9 +5,8 @@ //! YAML objects.
use crate::{ - input::Input, + input::{str::StrInput, Input}, scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType}, - BufferedInput, }; use std::collections::HashMap; @@ -229,11 +228,11 @@ impl MarkedEventReceiver for R { /// A convenience alias for a `Result` of a parser event. pub type ParseResult = Result<(Event, Marker), ScanError>; -impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> { +impl<'a> Parser<StrInput<'a>> { /// Create a new instance of a parser from a &str. #[must_use] pub fn new_from_str(value: &'a str) -> Self { - Parser::new(BufferedInput::new(value.chars())) + Parser::new(StrInput::new(value)) } }