Add StrInput.

2024-07-05 21:38:34 +02:00 · 2024-07-05 21:38:34 +02:00 · db4f26da42
commit db4f26da42
parent 0e9cee18f2
4 changed files with 256 additions and 5 deletions
--- a/parser/justfile
+++ b/parser/justfile
@ -24,4 +24,4 @@ ethi_build_dump:
 # Annotate the newest callgrind output into cg/WORK, delete the raw output, then diff cg/WORK against the saved profiles in cg/.
 ethi_compare: ethi_build_dump
  cg_file=`\ls -1t callgrind.out.* | head -n1` && callgrind_annotate $cg_file --auto=no --threshold=99.99 > cg/WORK && rm $cg_file
-  callgrind_differ cg/00{05,06,07,08,09,10,11,12}* cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
+  callgrind_differ `\ls cg/0*` cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
--- a/parser/src/input.rs
+++ b/parser/src/input.rs
@ -1,4 +1,5 @@
 pub mod buffered;
 pub mod str;
 #[allow(clippy::module_name_repetitions)]
 pub use buffered::BufferedInput;
--- a/parser/src/input/str.rs
+++ b/parser/src/input/str.rs
@ -0,0 +1,251 @@
 use crate::{char_traits::is_blank_or_breakz, input::Input};
 /// An input source that reads characters directly out of a borrowed `&str`.
 ///
 /// No characters are copied into an intermediate buffer: consuming a character only moves the
 /// start of the borrowed slice forward.
 // The `Input` suffix intentionally repeats the name of the enclosing `input` module.
 #[allow(clippy::module_name_repetitions)]
 pub struct StrInput<'a> {
    /// The input str buffer.
    ///
    /// This only ever holds the part of the input that has not been consumed yet.
    buffer: &'a str,
    /// The number of characters (**not** bytes) in the buffer.
    n_chars: usize,
    /// The largest number of characters the parser has asked us to look ahead for so far.
    ///
    /// Looking ahead requires no work since the whole input is already available. We must
    /// nevertheless keep track of how many characters the parser asked us to look ahead for so
    /// that we can return the correct value in [`Self::buflen`].
    lookahead: usize,
 }
 impl<'a> StrInput<'a> {
    /// Create a new [`StrInput`] reading from the given str.
    ///
    /// The characters of `input` are counted once up front, which walks the whole string.
    /// No lookahead has been requested yet on the returned value.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            buffer: input,
            n_chars: input.chars().count(),
            lookahead: 0,
        }
    }
 }
 /// [`Input`] implementation that serves characters straight from the borrowed str.
 ///
 /// Reading past the end of the input never fails: the character-returning methods yield `'\0'`
 /// instead.
 impl<'a> Input for StrInput<'a> {
    /// Record that the caller wants `x` characters of lookahead.
    ///
    /// Nothing is buffered; only the largest request seen so far is remembered so that
    /// [`Self::buflen`] can report it.
    #[inline]
    fn lookahead(&mut self, x: usize) {
        // We already have all characters that we need.
        // We cannot add '\0's to the buffer should we prematurely reach EOF.
        // Returning '\0's befalls the character-retrieving functions.
        self.lookahead = self.lookahead.max(x);
    }
    /// Return the largest lookahead requested so far through [`Self::lookahead`].
    #[inline]
    fn buflen(&self) -> usize {
        self.lookahead
    }
    /// Return the (virtual) buffer capacity advertised to the scanner, `BUFFER_LEN`.
    #[inline]
    fn bufmaxlen(&self) -> usize {
        BUFFER_LEN
    }
    /// Return whether [`Self::buflen`] is zero, i.e. no lookahead has been requested yet.
    #[inline]
    fn buf_is_empty(&self) -> bool {
        self.buflen() == 0
    }
    /// Consume and return the next character, or return `'\0'` if the input is exhausted.
    #[inline]
    fn raw_read_ch(&mut self) -> char {
        let mut chars = self.buffer.chars();
        match chars.next() {
            Some(c) => {
                self.buffer = chars.as_str();
                self.n_chars -= 1;
                c
            }
            None => '\0',
        }
    }
    /// Un-read `c`, which must be the character that was consumed last.
    ///
    /// The buffer is widened backwards by the UTF-8 length of `c`; `c` itself is never written
    /// anywhere.
    #[inline]
    fn push_back(&mut self, c: char) {
        let n_bytes = c.len_utf8();
        // SAFETY: The character that gets pushed back is guaranteed to be the one that is
        // immediately preceding our buffer. We can compute the length of the character and move
        // our buffer back that many bytes.
        //
        // NOTE(review): the pointer is derived from the already-shrunk slice, so this relies on
        // the bytes in front of it still being readable through it -- confirm under Miri.
        unsafe {
            let widened_len = self.buffer.len() + n_bytes;
            let start = self.buffer.as_ptr().wrapping_sub(n_bytes);
            self.buffer =
                std::str::from_utf8_unchecked(std::slice::from_raw_parts(start, widened_len));
        }
        // The buffer holds one more character again. Without this, `n_chars` drifts below the
        // real count and a later `raw_read_ch`/`skip` underflows it.
        self.n_chars += 1;
    }
    /// Consume the next character, if there is one.
    #[inline]
    fn skip(&mut self) {
        let mut chars = self.buffer.chars();
        if chars.next().is_some() {
            self.buffer = chars.as_str();
            self.n_chars -= 1;
        }
    }
    /// Consume up to `count` characters, stopping early at the end of the input.
    #[inline]
    fn skip_n(&mut self, count: usize) {
        let mut chars = self.buffer.chars();
        // `take` stops by itself once the input runs out.
        chars.by_ref().take(count).for_each(drop);
        self.buffer = chars.as_str();
        self.n_chars = self.n_chars.saturating_sub(count);
    }
    /// Return the next character without consuming it, or `'\0'` at the end of the input.
    #[inline]
    fn peek(&self) -> char {
        self.buffer.chars().next().unwrap_or('\0')
    }
    /// Return the character `n` positions ahead (0 being the next one) without consuming
    /// anything, or `'\0'` if the input is too short.
    #[inline]
    fn peek_nth(&self, n: usize) -> char {
        self.buffer.chars().nth(n).unwrap_or('\0')
    }
    /// Request one character of lookahead and return the next character without consuming it.
    #[inline]
    fn look_ch(&mut self) -> char {
        self.lookahead(1);
        self.peek()
    }
    /// Return whether the next character is `c`.
    #[inline]
    fn next_char_is(&self, c: char) -> bool {
        self.peek() == c
    }
    /// Return whether the character `n` positions ahead is `c`.
    #[inline]
    fn nth_char_is(&self, n: usize, c: char) -> bool {
        self.peek_nth(n) == c
    }
    /// Return whether the next two characters are `c1` then `c2`.
    #[inline]
    fn next_2_are(&self, c1: char, c2: char) -> bool {
        let mut chars = self.buffer.chars();
        chars.next() == Some(c1) && chars.next() == Some(c2)
    }
    /// Return whether the next three characters are `c1`, `c2` then `c3`.
    #[inline]
    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
        let mut chars = self.buffer.chars();
        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
    }
    /// Return whether the input continues with `---` or `...`, followed by a blank, a line
    /// break, a `'\0'` or the end of the input.
    #[inline]
    fn next_is_document_indicator(&self) -> bool {
        (self.buffer.starts_with("...") || self.buffer.starts_with("---"))
            && marker_is_terminated(self.buffer)
    }
    /// Return whether the input continues with `---`, followed by a blank, a line break, a
    /// `'\0'` or the end of the input.
    #[inline]
    fn next_is_document_start(&self) -> bool {
        self.buffer.starts_with("---") && marker_is_terminated(self.buffer)
    }
    /// Return whether the input continues with `...`, followed by a blank, a line break, a
    /// `'\0'` or the end of the input.
    #[inline]
    fn next_is_document_end(&self) -> bool {
        self.buffer.starts_with("...") && marker_is_terminated(self.buffer)
    }
 }
 /// Return whether a three-byte document marker at the start of `buffer` is terminated.
 ///
 /// The marker is terminated when nothing follows it or when the byte right after it satisfies
 /// [`is_blank_or_breakz`]. The caller is expected to have checked that `buffer` starts with an
 /// ASCII marker (`---` or `...`).
 #[inline]
 fn marker_is_terminated(buffer: &str) -> bool {
    // Since all characters we look for are ascii, we can directly use the byte API of str.
    buffer
        .as_bytes()
        .get(3)
        .map_or(true, |&byte| is_blank_or_breakz(char::from(byte)))
 }
 /// The buffer size we return to the scanner.
 ///
 /// This does not correspond to any allocated buffer size. In practice, the scanner can request
 /// any character it wants. If it's within the input buffer, the given character is returned,
 /// otherwise `\0` is returned.
 ///
 /// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size
 /// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It
 /// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return
 /// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner
 /// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its
 /// call.
 ///
 /// This creates a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is
 /// called with, which in turn dictates what [`buflen`] returns. In order to avoid breaking any
 /// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line
 /// at a time, should be a good balance between memory consumption and what we expect the maximum
 /// line length to be.
 ///
 /// [`lookahead`]: `StrInput::lookahead`
 /// [`bufmaxlen`]: `StrInput::bufmaxlen`
 /// [`buflen`]: `StrInput::buflen`
 const BUFFER_LEN: usize = 128;
 #[cfg(test)]
 mod test {
    use crate::input::Input;
    use super::StrInput;
    /// Whether a fresh [`StrInput`] over `text` reports a document start marker.
    fn at_document_start(text: &str) -> bool {
        StrInput::new(text).next_is_document_start()
    }
    /// Whether a fresh [`StrInput`] over `text` reports a document end marker.
    fn at_document_end(text: &str) -> bool {
        StrInput::new(text).next_is_document_end()
    }
    /// Whether a fresh [`StrInput`] over `text` reports any document indicator.
    fn at_document_indicator(text: &str) -> bool {
        StrInput::new(text).next_is_document_indicator()
    }
    #[test]
    pub fn is_document_start() {
        // Marker followed by a line break.
        assert!(at_document_start("---\n"));
        assert!(at_document_indicator("---\n"));
        // Marker at the very end of the input.
        assert!(at_document_start("---"));
        assert!(at_document_indicator("---"));
        // An end marker is an indicator, but not a start.
        assert!(!at_document_start("...\n"));
        assert!(at_document_indicator("...\n"));
        // Marker followed by a blank.
        assert!(at_document_start("--- "));
        assert!(at_document_indicator("--- "));
    }
    #[test]
    pub fn is_document_end() {
        // Marker followed by a line break.
        assert!(at_document_end("...\n"));
        assert!(at_document_indicator("...\n"));
        // Marker at the very end of the input.
        assert!(at_document_end("..."));
        assert!(at_document_indicator("..."));
        // A start marker is an indicator, but not an end.
        assert!(!at_document_end("---\n"));
        assert!(at_document_indicator("---\n"));
        // Marker followed by a blank.
        assert!(at_document_end("... "));
        assert!(at_document_indicator("... "));
    }
 }
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@ -5,9 +5,8 @@
 //! YAML objects.
 use crate::{
-    input::Input,
+    input::{str::StrInput, Input},
    scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
    BufferedInput,
 };
 use std::collections::HashMap;
@ -229,11 +228,11 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
 /// A convenience alias for a `Result` of a parser event.
 pub type ParseResult = Result<(Event, Marker), ScanError>;
-impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
+impl<'a> Parser<StrInput<'a>> {
    /// Create a new instance of a parser from a `&str`.
    ///
    /// The parser borrows `value` for `'a` and reads its input from it.
    #[must_use]
    pub fn new_from_str(value: &'a str) -> Self {
-        Parser::new(BufferedInput::new(value.chars()))
+        Parser::new(StrInput::new(value))
    }
 }