Move skip_ws_to_eol to Input trait.

2024-07-10 17:25:11 +02:00 · 2024-07-10 17:25:11 +02:00 · 8d7c3a1c1b
commit 8d7c3a1c1b
parent db4f26da42
3 changed files with 230 additions and 103 deletions
--- a/parser/src/input.rs
+++ b/parser/src/input.rs
@ -4,7 +4,7 @@ pub mod str;
 #[allow(clippy::module_name_repetitions)]
 pub use buffered::BufferedInput;

-use crate::char_traits::is_blank_or_breakz;
+use crate::char_traits::{is_blank_or_breakz, is_breakz};

 /// Interface for a source of characters.
 ///
@ -165,4 +165,88 @@ pub trait Input {
        assert!(self.buflen() >= 4);
        self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
    }
+
+    /// Skip YAML whitespace up to (but not including) the end of the line, advancing the input.
+    ///
+    /// Spaces are always skipped. Tabs are skipped only when `skip_tabs` is not
+    /// [`SkipTabs::No`]; otherwise the first tab stops the scan. A comment (from `#` up to the
+    /// last character before the line break) is skipped as well, provided at least one space or
+    /// tab was skipped before it.
+    ///
+    /// # Return
+    /// Return a tuple with the number of characters that were consumed and the result of skipping
+    /// whitespace. The number of characters returned can be used to advance the index and columns,
+    /// since no end-of-line character will be consumed.
+    /// See [`SkipTabs`] for more details on the success variant.
+    ///
+    /// # Errors
+    /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
+    /// the first tuple element will contain the number of characters consumed prior to reaching
+    /// the `#`.
+    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
+        let mut encountered_tab = false;
+        let mut has_yaml_ws = false;
+        let mut chars_consumed = 0;
+        loop {
+            match self.look_ch() {
+                ' ' => {
+                    has_yaml_ws = true;
+                    self.skip();
+                }
+                '\t' if skip_tabs != SkipTabs::No => {
+                    encountered_tab = true;
+                    self.skip();
+                }
+                // YAML comments must be preceded by whitespace.
+                '#' if !encountered_tab && !has_yaml_ws => {
+                    return (
+                        chars_consumed,
+                        Err("comments must be separated from other tokens by whitespace"),
+                    );
+                }
+                '#' => {
+                    // Consume the `#` on its own: like every other arm, it is accounted for by
+                    // the shared increment at the bottom of the loop. Letting the `while` below
+                    // count it too would report one character more than was actually skipped.
+                    self.skip();
+                    while !is_breakz(self.look_ch()) {
+                        self.skip();
+                        chars_consumed += 1;
+                    }
+                }
+                _ => break,
+            }
+            chars_consumed += 1;
+        }
+
+        (
+            chars_consumed,
+            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
+        )
+    }
+}
+
+/// Behavior to adopt regarding treating tabs as whitespace.
+///
+/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
+///
+/// The same type is used both as the request passed to `skip_ws_to_eol` ([`SkipTabs::Yes`] or
+/// [`SkipTabs::No`]) and as the value it returns on success ([`SkipTabs::Result`]).
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum SkipTabs {
+    /// Skip all tabs as whitespace.
+    Yes,
+    /// Don't skip any tab. Return from the function when encountering one.
+    No,
+    /// Return value from the function.
+    Result(
+        /// Whether tabs were encountered.
+        bool,
+        /// Whether at least 1 valid yaml whitespace has been encountered.
+        bool,
+    ),
+}
+
+impl SkipTabs {
+    /// Report whether tabs were encountered while skipping whitespace.
+    ///
+    /// Reads the first field of [`SkipTabs::Result`], so it is only meaningful on the value
+    /// returned by `skip_ws_to_eol`. The `Yes` and `No` request variants always yield `false`.
+    pub fn found_tabs(self) -> bool {
+        match self {
+            SkipTabs::Result(tabs, _) => tabs,
+            SkipTabs::Yes | SkipTabs::No => false,
+        }
+    }
+
+    /// Report whether valid YAML whitespace was found in the skipped-over content.
+    ///
+    /// Reads the second field of [`SkipTabs::Result`], so it is only meaningful on the value
+    /// returned by `skip_ws_to_eol`. The `Yes` and `No` request variants always yield `false`.
+    pub fn has_valid_yaml_ws(self) -> bool {
+        match self {
+            SkipTabs::Result(_, yaml_ws) => yaml_ws,
+            SkipTabs::Yes | SkipTabs::No => false,
+        }
+    }
 }
--- a/parser/src/input/str.rs
+++ b/parser/src/input/str.rs
@ -1,11 +1,12 @@
-use crate::{char_traits::is_blank_or_breakz, input::Input};
+use crate::{
+    char_traits::{is_blank_or_breakz, is_breakz},
+    input::{Input, SkipTabs},
+};

 #[allow(clippy::module_name_repetitions)]
 pub struct StrInput<'a> {
    /// The input str buffer.
    buffer: &'a str,
-    /// The number of characters (**not** bytes) in the buffer.
-    n_chars: usize,
    /// The number of characters we have looked ahead.
    ///
    /// We must however keep track of how many characters the parser asked us to look ahead for so
@ -18,7 +19,6 @@ impl<'a> StrInput<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            buffer: input,
-            n_chars: input.chars().count(),
            lookahead: 0,
        }
    }
@ -52,7 +52,6 @@ impl<'a> Input for StrInput<'a> {
        let mut chars = self.buffer.chars();
        if let Some(c) = chars.next() {
            self.buffer = chars.as_str();
-            self.n_chars -= 1;
            c
        } else {
            '\0'
@ -61,20 +60,7 @@ impl<'a> Input for StrInput<'a> {

    #[inline]
    fn push_back(&mut self, c: char) {
-        let n_bytes = c.len_utf8();
-
-        // SAFETY: The character that gets pushed back is guaranteed to be the one that is
-        // immediately preceding our buffer. We can compute the length of the character and move
-        // our buffer back that many bytes.
-        unsafe {
-            let buffer_byte_len = self.buffer.len();
-            let mut now_ptr = self.buffer.as_ptr();
-            now_ptr = now_ptr.wrapping_sub(n_bytes);
-            self.buffer = std::str::from_utf8_unchecked(std::slice::from_raw_parts(
-                now_ptr,
-                buffer_byte_len + n_bytes,
-            ));
-        }
+        self.buffer = put_back_in_str(self.buffer, c);
    }

    #[inline]
@ -82,7 +68,6 @@ impl<'a> Input for StrInput<'a> {
        let mut chars = self.buffer.chars();
        if chars.next().is_some() {
            self.buffer = chars.as_str();
-            self.n_chars -= 1;
        }
    }

@ -95,7 +80,6 @@ impl<'a> Input for StrInput<'a> {
            }
        }
        self.buffer = chars.as_str();
-        self.n_chars = self.n_chars.saturating_sub(count);
    }

    #[inline]
@ -150,11 +134,11 @@ impl<'a> Input for StrInput<'a> {
            false
        } else {
            // Since all characters we look for are ascii, we can directly use the byte API of str.
-            (if self.buffer.len() == 3 {
-                true
-            } else {
-                is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
-            }) && (self.buffer.starts_with("...") || self.buffer.starts_with("---"))
+            let bytes = self.buffer.as_bytes();
+            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
+                && (bytes[0] == b'.' || bytes[0] == b'-')
+                && bytes[0] == bytes[1]
+                && bytes[1] == bytes[2]
        }
    }

@ -164,11 +148,11 @@ impl<'a> Input for StrInput<'a> {
            false
        } else {
            // Since all characters we look for are ascii, we can directly use the byte API of str.
-            (if self.buffer.len() == 3 {
-                true
-            } else {
-                is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
-            }) && self.buffer.starts_with("---")
+            let bytes = self.buffer.as_bytes();
+            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
+                && bytes[0] == b'-'
+                && bytes[1] == b'-'
+                && bytes[2] == b'-'
        }
    }

@ -178,13 +162,92 @@ impl<'a> Input for StrInput<'a> {
            false
        } else {
            // Since all characters we look for are ascii, we can directly use the byte API of str.
-            (if self.buffer.len() == 3 {
-                true
-            } else {
-                is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
-            }) && self.buffer.starts_with("...")
+            let bytes = self.buffer.as_bytes();
+            (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
+                && bytes[0] == b'.'
+                && bytes[1] == b'.'
+                && bytes[2] == b'.'
        }
    }
+
+    /// Byte-oriented override of the trait's default `skip_ws_to_eol`.
+    ///
+    /// Skips leading spaces (and tabs when `skip_tabs` is [`SkipTabs::Yes`]), then a comment if
+    /// one follows, leaving the buffer positioned on the line break or end of input. Returns the
+    /// number of characters skipped together with the [`SkipTabs::Result`] flags.
+    ///
+    /// If a `#` is found without any preceding skipped space or tab, an error is returned and
+    /// the buffer is left untouched.
+    ///
+    /// # Panics
+    /// Panics if `skip_tabs` is a [`SkipTabs::Result`], which is only valid as a return value.
+    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
+        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
+
+        let buffer = self.buffer;
+        let bytes = buffer.as_bytes();
+        let mut has_yaml_ws = false;
+        let mut encountered_tab = false;
+
+        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
+        // while keeping track of whether we encountered spaces and/or tabs.
+        let mut i = 0;
+        if skip_tabs == SkipTabs::Yes {
+            while i < bytes.len() {
+                if bytes[i] == b' ' {
+                    has_yaml_ws = true;
+                } else if bytes[i] == b'\t' {
+                    encountered_tab = true;
+                } else {
+                    break;
+                }
+                i += 1;
+            }
+        } else {
+            while i < bytes.len() {
+                if bytes[i] != b' ' {
+                    break;
+                }
+                i += 1;
+            }
+            has_yaml_ws = i != 0;
+        }
+
+        // Every byte skipped so far is an ASCII space or tab, so `i` is both the number of
+        // characters consumed and a valid char boundary: the slice below cannot panic and no
+        // `unsafe` re-interpretation of the bytes is needed.
+        let mut chars_consumed = i;
+        let mut rest = &buffer[i..];
+
+        if rest.as_bytes().first() == Some(&b'#') {
+            if !encountered_tab && !has_yaml_ws {
+                return (
+                    chars_consumed,
+                    Err("comments must be separated from other tokens by whitespace"),
+                );
+            }
+
+            // Count the characters of the comment and remember the byte offset of the first
+            // breakz, if any. Without a breakz the comment runs to the end of the buffer.
+            let mut comment_end = rest.len();
+            for (offset, c) in rest.char_indices() {
+                if is_breakz(c) {
+                    comment_end = offset;
+                    break;
+                }
+                chars_consumed += 1;
+            }
+
+            // `comment_end` comes from `char_indices` (or is the length), so it is a valid char
+            // boundary. The breakz itself is left in the buffer.
+            rest = &rest[comment_end..];
+        }
+
+        self.buffer = rest;
+
+        (
+            chars_consumed,
+            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
+        )
+    }
 }

 /// The buffer size we return to the scanner.
@ -211,9 +274,40 @@ impl<'a> Input for StrInput<'a> {
 /// [`buflen`]: `StrInput::buflen`
 const BUFFER_LEN: usize = 128;

+/// Fake prepending a character to the given string.
+///
+/// Nothing is copied: the returned slice starts `c.len_utf8()` bytes before `s` and covers
+/// those bytes followed by all of `s`.
+///
+/// The character given as parameter MUST be the one that immediately precedes `s` in the
+/// buffer `s` was sliced from.
+///
+/// # Example
+/// ```ignore
+/// let s1 = "foo";
+/// let s2 = &s1[1..];
+/// let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding
+/// // let s3 = put_back_in_str(s2, 'g'); // Not allowed, 'g' does not precede `s2`
+/// assert_eq!(s1, s3);
+/// assert_eq!(s1.as_ptr(), s3.as_ptr());
+/// ```
+fn put_back_in_str(s: &str, c: char) -> &str {
+    // SAFETY: The caller guarantees that `c` is the character stored immediately before `s`.
+    // Moving the start of the slice back by the encoded length of `c` therefore re-exposes
+    // exactly that character and lands on a character boundary.
+    // NOTE(review): this precondition is not checked here although the function is safe to
+    // call; every call site must uphold it.
+    unsafe { extend_left(s, c.len_utf8()) }
+}
+
+/// Extend the string by moving the start pointer to the left by `n` bytes.
+///
+/// The returned slice is `n` bytes longer than `s` and ends where `s` ends.
+///
+/// # Safety
+/// The `n` bytes immediately before `s` must be part of the same allocation as `s`, must stay
+/// readable and unmodified for as long as the returned reference is used, and must form valid
+/// UTF-8 when followed by the contents of `s`.
+#[inline]
+unsafe fn extend_left(s: &str, n: usize) -> &str {
+    let start = s.as_ptr().wrapping_sub(n);
+    let bytes = std::slice::from_raw_parts(start, s.len() + n);
+    std::str::from_utf8_unchecked(bytes)
+}
+
 #[cfg(test)]
 mod test {
-    use crate::input::Input;
+    use crate::input::{str::put_back_in_str, Input};

    use super::StrInput;

@ -248,4 +342,13 @@ mod test {
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
    }
+
+    /// Putting back the character that was sliced off yields the original string, at the
+    /// original address.
+    #[test]
+    pub fn put_back_in_str_example() {
+        let whole = "foo";
+        let tail = &whole[1..];
+        // 'f' is the character immediately preceding `tail` inside `whole`.
+        let restored = put_back_in_str(tail, 'f');
+        assert_eq!(whole, restored);
+        assert_eq!(whole.as_ptr(), restored.as_ptr());
+    }
 }
--- a/parser/src/scanner.rs
+++ b/parser/src/scanner.rs
@ -16,7 +16,7 @@ use crate::{
        as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
        is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
    },
-    input::Input,
+    input::{Input, SkipTabs},
 };

 /// The encoding of the input. Currently, only UTF-8 is supported.
@ -847,37 +847,11 @@ impl<T: Input> Scanner<T> {
        }
    }

-    /// Skip yaml whitespace at most up to eol. Also skips comments.
    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
-        let mut encountered_tab = false;
-        let mut has_yaml_ws = false;
-        loop {
-            match self.input.look_ch() {
-                ' ' => {
-                    has_yaml_ws = true;
-                    self.skip_blank();
-                }
-                '\t' if skip_tabs != SkipTabs::No => {
-                    encountered_tab = true;
-                    self.skip_blank();
-                }
-                // YAML comments must be preceded by whitespace.
-                '#' if !encountered_tab && !has_yaml_ws => {
-                    return Err(ScanError::new_str(
-                        self.mark,
-                        "comments must be separated from other tokens by whitespace",
-                    ));
-                }
-                '#' => {
-                    while !is_breakz(self.input.look_ch()) {
-                        self.skip_non_blank();
-                    }
-                }
-                _ => break,
-            }
-        }
-
-        Ok(SkipTabs::Result(encountered_tab, has_yaml_ws))
+        let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
+        self.mark.col += n_bytes;
+        self.mark.index += n_bytes;
+        result.map_err(|msg| ScanError::new_str(self.mark, msg))
    }

    fn fetch_stream_start(&mut self) {
@ -2544,40 +2518,6 @@ impl<T: Input> Scanner<T> {
    }
 }

-/// Behavior to adopt regarding treating tabs as whitespace.
-///
-/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
-#[derive(Copy, Clone, Eq, PartialEq)]
-enum SkipTabs {
-    /// Skip all tabs as whitespace.
-    Yes,
-    /// Don't skip any tab. Return from the function when encountering one.
-    No,
-    /// Return value from the function.
-    Result(
-        /// Whether tabs were encountered.
-        bool,
-        /// Whether at least 1 valid yaml whitespace has been encountered.
-        bool,
-    ),
-}
-
-impl SkipTabs {
-    /// Whether tabs were found while skipping whitespace.
-    ///
-    /// This function must be called after a call to `skip_ws_to_eol`.
-    fn found_tabs(self) -> bool {
-        matches!(self, SkipTabs::Result(true, _))
-    }
-
-    /// Whether a valid YAML whitespace has been found in skipped-over content.
-    ///
-    /// This function must be called after a call to `skip_ws_to_eol`.
-    fn has_valid_yaml_ws(self) -> bool {
-        matches!(self, SkipTabs::Result(_, true))
-    }
-}
-
 /// Chomping, how final line breaks and trailing empty lines are interpreted.
 ///
 /// See YAML spec 8.1.1.2.