Add StrInput.

2024-07-05 21:38:34 +02:00 · 2024-07-05 21:38:34 +02:00 · db4f26da42
commit db4f26da42
parent 0e9cee18f2
4 changed files with 256 additions and 5 deletions
--- a/parser/justfile
+++ b/parser/justfile
@ -24,4 +24,4 @@ ethi_build_dump:

 ethi_compare: ethi_build_dump
  cg_file=`\ls -1t callgrind.out.* | head -n1` && callgrind_annotate $cg_file --auto=no --threshold=99.99 > cg/WORK && rm $cg_file
-  callgrind_differ cg/00{05,06,07,08,09,10,11,12}* cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
+  callgrind_differ `\ls cg/0*` cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
--- a/parser/src/input.rs
+++ b/parser/src/input.rs
@ -1,4 +1,5 @@
 pub mod buffered;
+pub mod str;

 #[allow(clippy::module_name_repetitions)]
 pub use buffered::BufferedInput;
--- a/parser/src/input/str.rs
+++ b/parser/src/input/str.rs
@ -0,0 +1,251 @@
+use crate::{char_traits::is_blank_or_breakz, input::Input};
+
+#[allow(clippy::module_name_repetitions)]
+pub struct StrInput<'a> {
+    /// The part of the input `str` that has not been consumed yet.
+    buffer: &'a str,
+    /// The number of characters (**not** bytes) in the buffer.
+    n_chars: usize,
+    /// The largest number of characters the parser has asked us to look ahead.
+    ///
+    /// Nothing is buffered, but we must remember what the parser asked for so that we can
+    /// return the correct value in [`Self::buflen`].
+    lookahead: usize,
+}
+
+impl<'a> StrInput<'a> {
+    /// Build a [`StrInput`] that reads from `input`.
+    ///
+    /// The characters of `input` are counted once up front and no lookahead is recorded yet.
+    pub fn new(input: &'a str) -> Self {
+        let n_chars = input.chars().count();
+        Self {
+            buffer: input,
+            n_chars,
+            lookahead: 0,
+        }
+    }
+}
+
+impl<'a> Input for StrInput<'a> {
+    #[inline]
+    fn lookahead(&mut self, x: usize) {
+        // The whole input already lives in `buffer`, so there is nothing to fetch.
+        // We cannot append '\0's to a borrowed `str` should EOF come early either; producing
+        // those '\0's is left to the character-retrieving functions.
+        // All that remains is to remember the largest lookahead requested so far.
+        if x > self.lookahead {
+            self.lookahead = x;
+        }
+    }
+
+    /// Report the lookahead recorded by `lookahead`; nothing is physically buffered.
+    #[inline]
+    fn buflen(&self) -> usize {
+        self.lookahead
+    }
+
+    /// Report the fixed, virtual buffer capacity advertised to the scanner.
+    #[inline]
+    fn bufmaxlen(&self) -> usize {
+        BUFFER_LEN
+    }
+
+    /// Whether no lookahead has been recorded yet.
+    fn buf_is_empty(&self) -> bool {
+        self.lookahead == 0
+    }
+
+    /// Consume and return the next character, or `'\0'` once the input is exhausted.
+    #[inline]
+    fn raw_read_ch(&mut self) -> char {
+        let mut rest = self.buffer.chars();
+        let Some(ch) = rest.next() else {
+            return '\0';
+        };
+        // Drop the consumed character from the front of the buffer.
+        self.buffer = rest.as_str();
+        self.n_chars -= 1;
+        ch
+    }
+
+    #[inline]
+    fn push_back(&mut self, c: char) {
+        let n_bytes = c.len_utf8();
+
+        // Reading or skipping a character decrements `n_chars`, so handing one back must restore
+        // it. Otherwise the count drifts and underflows once the character is consumed again.
+        self.n_chars += 1;
+
+        // SAFETY: This relies on the caller only pushing back the character that immediately
+        // precedes our buffer in the original input. Under that contract, the `n_bytes` bytes
+        // before the buffer are that character's UTF-8 encoding, so moving the start of the
+        // buffer back by that many bytes yields a valid `str` within the same input.
+        unsafe {
+            let now_ptr = self.buffer.as_ptr().wrapping_sub(n_bytes);
+            let bytes = std::slice::from_raw_parts(now_ptr, self.buffer.len() + n_bytes);
+            self.buffer = std::str::from_utf8_unchecked(bytes);
+        }
+    }
+
+    /// Discard the next character; does nothing at end of input.
+    #[inline]
+    fn skip(&mut self) {
+        let mut rest = self.buffer.chars();
+        if rest.next().is_none() {
+            return;
+        }
+        self.buffer = rest.as_str();
+        self.n_chars -= 1;
+    }
+
+    /// Discard up to `count` characters, stopping early if the input runs out.
+    #[inline]
+    fn skip_n(&mut self, count: usize) {
+        let mut rest = self.buffer.chars();
+        // Pull at most `count` characters out of the iterator; we only care about where it ends.
+        for _ in rest.by_ref().take(count) {}
+        self.buffer = rest.as_str();
+        // The caller may ask for more than what is left, hence the saturating subtraction.
+        self.n_chars = self.n_chars.saturating_sub(count);
+    }
+
+    /// Return the next character without consuming it, or `'\0'` at end of input.
+    #[inline]
+    fn peek(&self) -> char {
+        // `char::default()` is `'\0'`, which is exactly our end-of-input sentinel.
+        self.buffer.chars().next().unwrap_or_default()
+    }
+
+    /// Return the character `n` positions ahead (0 being the next one) without consuming
+    /// anything, or `'\0'` if the input ends before that position.
+    #[inline]
+    fn peek_nth(&self, n: usize) -> char {
+        self.buffer.chars().nth(n).unwrap_or('\0')
+    }
+
+    /// Record a one-character lookahead, then return the next character without consuming it.
+    #[inline]
+    fn look_ch(&mut self) -> char {
+        // Same bookkeeping as `lookahead(1)`: never shrink an earlier, larger request.
+        self.lookahead = self.lookahead.max(1);
+        self.buffer.chars().next().unwrap_or('\0')
+    }
+
+    /// Whether the next character is `c`; an exhausted input reads as `'\0'`.
+    #[inline]
+    fn next_char_is(&self, c: char) -> bool {
+        let next = self.buffer.chars().next().unwrap_or('\0');
+        next == c
+    }
+
+    /// Whether the character `n` positions ahead is `c`; positions past the end read as `'\0'`.
+    #[inline]
+    fn nth_char_is(&self, n: usize, c: char) -> bool {
+        let nth = self.buffer.chars().nth(n).unwrap_or('\0');
+        nth == c
+    }
+
+    /// Whether the next two characters are `c1` then `c2`.
+    ///
+    /// Unlike `next_char_is`, running out of input never matches, even against `'\0'`.
+    #[inline]
+    fn next_2_are(&self, c1: char, c2: char) -> bool {
+        let mut upcoming = self.buffer.chars();
+        upcoming.next() == Some(c1) && upcoming.next() == Some(c2)
+    }
+
+    /// Whether the next three characters are `c1`, `c2` and `c3`, in that order.
+    ///
+    /// Running out of input never matches, even against `'\0'`.
+    #[inline]
+    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
+        let mut upcoming = self.buffer.chars();
+        upcoming.next() == Some(c1)
+            && upcoming.next() == Some(c2)
+            && upcoming.next() == Some(c3)
+    }
+
+    /// Whether the buffer starts with `...` or `---` that either ends the input or is followed
+    /// by a character accepted by `is_blank_or_breakz`.
+    #[inline]
+    fn next_is_document_indicator(&self) -> bool {
+        // Since all characters we look for are ascii, we can directly use the byte API of str.
+        let bytes = self.buffer.as_bytes();
+        if bytes.len() < 3 {
+            return false;
+        }
+        // With at least 3 bytes, a missing 4th byte means the marker is all that is left.
+        let terminated = match bytes.get(3) {
+            None => true,
+            Some(&byte) => is_blank_or_breakz(byte as char),
+        };
+        terminated && (self.buffer.starts_with("...") || self.buffer.starts_with("---"))
+    }
+
+    /// Whether the buffer starts with `---` that either ends the input or is followed by a
+    /// character accepted by `is_blank_or_breakz`.
+    #[inline]
+    fn next_is_document_start(&self) -> bool {
+        // Since all characters we look for are ascii, we can directly use the byte API of str.
+        let bytes = self.buffer.as_bytes();
+        if bytes.len() < 3 {
+            return false;
+        }
+        // With at least 3 bytes, a missing 4th byte means the marker is all that is left.
+        let terminated = match bytes.get(3) {
+            None => true,
+            Some(&byte) => is_blank_or_breakz(byte as char),
+        };
+        terminated && self.buffer.starts_with("---")
+    }
+
+    /// Whether the buffer starts with `...` that either ends the input or is followed by a
+    /// character accepted by `is_blank_or_breakz`.
+    #[inline]
+    fn next_is_document_end(&self) -> bool {
+        // Since all characters we look for are ascii, we can directly use the byte API of str.
+        let bytes = self.buffer.as_bytes();
+        if bytes.len() < 3 {
+            return false;
+        }
+        // With at least 3 bytes, a missing 4th byte means the marker is all that is left.
+        let terminated = match bytes.get(3) {
+            None => true,
+            Some(&byte) => is_blank_or_breakz(byte as char),
+        };
+        terminated && self.buffer.starts_with("...")
+    }
+}
+
+/// The buffer size we return to the scanner.
+///
+/// This does not correspond to any allocated buffer size. In practice, the scanner can withdraw
+/// any character it wants. If it's within the input buffer, the given character is returned,
+/// otherwise `\0` is returned.
+///
+/// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size
+/// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It
+/// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return
+/// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner
+/// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its
+/// call.
+///
+/// This creates a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is
+/// called with, which in turn dictates what [`buflen`] returns. In order to avoid breaking any
+/// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line
+/// at a time, should fit what we expect to be a good balance between memory consumption and what
+/// we expect the maximum line length to be.
+///
+/// [`lookahead`]: `StrInput::lookahead`
+/// [`bufmaxlen`]: `StrInput::bufmaxlen`
+/// [`buflen`]: `StrInput::buflen`
+const BUFFER_LEN: usize = 128;
+
+#[cfg(test)]
+mod test {
+    use crate::input::Input;
+
+    use super::StrInput;
+
+    /// `---` counts as a document start (and indicator) at end of input or before a blank/break.
+    #[test]
+    pub fn is_document_start() {
+        for text in ["---\n", "---", "--- "] {
+            let input = StrInput::new(text);
+            assert!(input.next_is_document_start());
+            assert!(input.next_is_document_indicator());
+        }
+
+        // `...` is an indicator, but not a document start.
+        let input = StrInput::new("...\n");
+        assert!(!input.next_is_document_start());
+        assert!(input.next_is_document_indicator());
+    }
+
+    /// `...` counts as a document end (and indicator) at end of input or before a blank/break.
+    #[test]
+    pub fn is_document_end() {
+        for text in ["...\n", "...", "... "] {
+            let input = StrInput::new(text);
+            assert!(input.next_is_document_end());
+            assert!(input.next_is_document_indicator());
+        }
+
+        // `---` is an indicator, but not a document end.
+        let input = StrInput::new("---\n");
+        assert!(!input.next_is_document_end());
+        assert!(input.next_is_document_indicator());
+    }
+}
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@ -5,9 +5,8 @@
 //! YAML objects.

 use crate::{
-    input::Input,
+    input::{str::StrInput, Input},
    scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
-    BufferedInput,
 };
 use std::collections::HashMap;

@ -229,11 +228,11 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
 /// A convenience alias for a `Result` of a parser event.
 pub type ParseResult = Result<(Event, Marker), ScanError>;

-impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
+impl<'a> Parser<StrInput<'a>> {
    /// Create a new instance of a parser from a &str.
    #[must_use]
    pub fn new_from_str(value: &'a str) -> Self {
-        Parser::new(BufferedInput::new(value.chars()))
+        Parser::new(StrInput::new(value))
    }
 }