From 8d7c3a1c1bad1b145bc836867d91e10567b8ab70 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Wed, 10 Jul 2024 17:25:11 +0200 Subject: [PATCH] Move `skip_ws_to_eol` to `Input` trait. --- parser/src/input.rs | 86 ++++++++++++++++++- parser/src/input/str.rs | 177 +++++++++++++++++++++++++++++++--------- parser/src/scanner.rs | 70 ++-------------- 3 files changed, 230 insertions(+), 103 deletions(-) diff --git a/parser/src/input.rs b/parser/src/input.rs index f407257..3fc3d0e 100644 --- a/parser/src/input.rs +++ b/parser/src/input.rs @@ -4,7 +4,7 @@ pub mod str; #[allow(clippy::module_name_repetitions)] pub use buffered::BufferedInput; -use crate::char_traits::is_blank_or_breakz; +use crate::char_traits::{is_blank_or_breakz, is_breakz}; /// Interface for a source of characters. /// @@ -165,4 +165,88 @@ pub trait Input { assert!(self.buflen() >= 4); self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3)) } + + /// Skip yaml whitespace at most up to eol. Also skips comments. Advances the input. + /// + /// # Return + /// Return a tuple with the number of characters that were consumed and the result of skipping + /// whitespace. The number of characters returned can be used to advance the index and columns, + /// since no end-of-line character will be consumed. + /// See [`SkipTabs`] For more details on the success variant. + /// + /// # Errors + /// Errors if a comment is encountered but it was not preceded by a whitespace. In that event, + /// the first tuple element will contain the number of characters consumed prior to reaching + /// the `#`. + fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result) { + let mut encountered_tab = false; + let mut has_yaml_ws = false; + let mut chars_consumed = 0; + loop { + match self.look_ch() { + ' ' => { + has_yaml_ws = true; + self.skip(); + } + '\t' if skip_tabs != SkipTabs::No => { + encountered_tab = true; + self.skip(); + } + // YAML comments must be preceded by whitespace. 
+ '#' if !encountered_tab && !has_yaml_ws => { + return ( + chars_consumed, + Err("comments must be separated from other tokens by whitespace"), + ); + } + '#' => { + while !is_breakz(self.look_ch()) { + self.skip(); + chars_consumed += 1; + } + } + _ => break, + } + chars_consumed += 1; + } + + ( + chars_consumed, + Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)), + ) + } +} + +/// Behavior to adopt regarding treating tabs as whitespace. +/// +/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space. +#[derive(Copy, Clone, Eq, PartialEq)] +pub enum SkipTabs { + /// Skip all tabs as whitespace. + Yes, + /// Don't skip any tab. Return from the function when encountering one. + No, + /// Return value from the function. + Result( + /// Whether tabs were encountered. + bool, + /// Whether at least 1 valid yaml whitespace has been encountered. + bool, + ), +} + +impl SkipTabs { + /// Whether tabs were found while skipping whitespace. + /// + /// This function must be called after a call to `skip_ws_to_eol`. + pub fn found_tabs(self) -> bool { + matches!(self, SkipTabs::Result(true, _)) + } + + /// Whether a valid YAML whitespace has been found in skipped-over content. + /// + /// This function must be called after a call to `skip_ws_to_eol`. + pub fn has_valid_yaml_ws(self) -> bool { + matches!(self, SkipTabs::Result(_, true)) + } } diff --git a/parser/src/input/str.rs b/parser/src/input/str.rs index deaf58d..05e78f6 100644 --- a/parser/src/input/str.rs +++ b/parser/src/input/str.rs @@ -1,11 +1,12 @@ -use crate::{char_traits::is_blank_or_breakz, input::Input}; +use crate::{ + char_traits::{is_blank_or_breakz, is_breakz}, + input::{Input, SkipTabs}, +}; #[allow(clippy::module_name_repetitions)] pub struct StrInput<'a> { /// The input str buffer. buffer: &'a str, - /// The number of characters (**not** bytes) in the buffer. - n_chars: usize, /// The number of characters we have looked ahead. 
/// /// We must however keep track of how many characters the parser asked us to look ahead for so @@ -18,7 +19,6 @@ impl<'a> StrInput<'a> { pub fn new(input: &'a str) -> Self { Self { buffer: input, - n_chars: input.chars().count(), lookahead: 0, } } @@ -52,7 +52,6 @@ impl<'a> Input for StrInput<'a> { let mut chars = self.buffer.chars(); if let Some(c) = chars.next() { self.buffer = chars.as_str(); - self.n_chars -= 1; c } else { '\0' @@ -61,20 +60,7 @@ impl<'a> Input for StrInput<'a> { #[inline] fn push_back(&mut self, c: char) { - let n_bytes = c.len_utf8(); - - // SAFETY: The character that gets pushed back is guaranteed to be the one that is - // immediately preceding our buffer. We can compute the length of the character and move - // our buffer back that many bytes. - unsafe { - let buffer_byte_len = self.buffer.len(); - let mut now_ptr = self.buffer.as_ptr(); - now_ptr = now_ptr.wrapping_sub(n_bytes); - self.buffer = std::str::from_utf8_unchecked(std::slice::from_raw_parts( - now_ptr, - buffer_byte_len + n_bytes, - )); - } + self.buffer = put_back_in_str(self.buffer, c); } #[inline] @@ -82,7 +68,6 @@ impl<'a> Input for StrInput<'a> { let mut chars = self.buffer.chars(); if chars.next().is_some() { self.buffer = chars.as_str(); - self.n_chars -= 1; } } @@ -95,7 +80,6 @@ impl<'a> Input for StrInput<'a> { } } self.buffer = chars.as_str(); - self.n_chars = self.n_chars.saturating_sub(count); } #[inline] @@ -150,11 +134,11 @@ impl<'a> Input for StrInput<'a> { false } else { // Since all characters we look for are ascii, we can directly use the byte API of str. - (if self.buffer.len() == 3 { - true - } else { - is_blank_or_breakz(self.buffer.as_bytes()[3] as char) - }) && (self.buffer.starts_with("...") || self.buffer.starts_with("---")) + let bytes = self.buffer.as_bytes(); + (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char)) + && (bytes[0] == b'.' 
|| bytes[0] == b'-') + && bytes[0] == bytes[1] + && bytes[1] == bytes[2] } } @@ -164,11 +148,11 @@ impl<'a> Input for StrInput<'a> { false } else { // Since all characters we look for are ascii, we can directly use the byte API of str. - (if self.buffer.len() == 3 { - true - } else { - is_blank_or_breakz(self.buffer.as_bytes()[3] as char) - }) && self.buffer.starts_with("---") + let bytes = self.buffer.as_bytes(); + (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char)) + && bytes[0] == b'-' + && bytes[1] == b'-' + && bytes[2] == b'-' } } @@ -178,13 +162,92 @@ impl<'a> Input for StrInput<'a> { false } else { // Since all characters we look for are ascii, we can directly use the byte API of str. - (if self.buffer.len() == 3 { - true - } else { - is_blank_or_breakz(self.buffer.as_bytes()[3] as char) - }) && self.buffer.starts_with("...") + let bytes = self.buffer.as_bytes(); + (bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char)) + && bytes[0] == b'.' + && bytes[1] == b'.' + && bytes[2] == b'.' } } + + fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result) { + assert!(!matches!(skip_tabs, SkipTabs::Result(..))); + + let mut new_str = self.buffer.as_bytes(); + let mut has_yaml_ws = false; + let mut encountered_tab = false; + + // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found + // while keeping track of whether we encountered spaces and/or tabs. + if skip_tabs == SkipTabs::Yes { + let mut i = 0; + while i < new_str.len() { + if new_str[i] == b' ' { + has_yaml_ws = true; + } else if new_str[i] == b'\t' { + encountered_tab = true; + } else { + break; + } + i += 1; + } + new_str = &new_str[i..]; + } else { + let mut i = 0; + while i < new_str.len() { + if new_str[i] != b' ' { + break; + } + i += 1; + } + has_yaml_ws = i != 0; + new_str = &new_str[i..]; + } + + // All characters consumed were ascii. We can use the byte length difference to count the + // number of whitespace ignored. 
+ let mut chars_consumed = self.buffer.len() - new_str.len(); + // SAFETY: We only trimmed spaces and tabs, both of which are bytes. This means we won't + // start the string outside of a valid UTF-8 boundary. + // It is assumed the input string is valid UTF-8, so the rest of the string is assumed to + // be valid UTF-8 as well. + let mut new_str = unsafe { std::str::from_utf8_unchecked(new_str) }; + + if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' { + if !encountered_tab && !has_yaml_ws { + return ( + chars_consumed, + Err("comments must be separated from other tokens by whitespace"), + ); + } + + let mut chars = new_str.chars(); + let mut found_breakz = false; + // Iterate over all remaining chars until we hit a breakz. + for c in chars.by_ref() { + if is_breakz(c) { + found_breakz = true; + break; + } + chars_consumed += 1; + } + + new_str = if found_breakz { + // SAFETY: The last character we pulled out of the `chars()` is a breakz, one of + // '\0', '\r', '\n'. All 3 of them are 1-byte long. + unsafe { extend_left(chars.as_str(), 1) } + } else { + chars.as_str() + }; + } + + self.buffer = new_str; + + ( + chars_consumed, + Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)), + ) + } } /// The buffer size we return to the scanner. @@ -211,9 +274,40 @@ impl<'a> Input for StrInput<'a> { /// [`buflen`]: `StrInput::buflen` const BUFFER_LEN: usize = 128; +/// Fake prepending a character to the given string. +/// +/// The character given as parameter MUST be the one that precedes the given string. 
+/// +/// # Example +/// ```ignore +/// let s1 = "foo"; +/// let s2 = &s1[1..]; +/// let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding +/// // let s3 = put_back_in_str(s2, 'g'); // Not allowed +/// assert_eq!(s1, s3); +/// assert_eq!(s1.as_ptr(), s3.as_ptr()); +/// ``` +fn put_back_in_str(s: &str, c: char) -> &str { + let n_bytes = c.len_utf8(); + + // SAFETY: The character that gets pushed back is guaranteed to be the one that is + // immediately preceding our buffer. We can compute the length of the character and move + // our buffer back that many bytes. + unsafe { extend_left(s, n_bytes) } +} + +/// Extend the string by moving the start pointer to the left by `n` bytes. +#[inline] +unsafe fn extend_left(s: &str, n: usize) -> &str { + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + s.as_ptr().wrapping_sub(n), + s.len() + n, + )) +} + #[cfg(test)] mod test { - use crate::input::Input; + use crate::input::{str::put_back_in_str, Input}; use super::StrInput; @@ -248,4 +342,13 @@ mod test { assert!(input.next_is_document_end()); assert!(input.next_is_document_indicator()); } + + #[test] + pub fn put_back_in_str_example() { + let s1 = "foo"; + let s2 = &s1[1..]; + let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding + assert_eq!(s1, s3); + assert_eq!(s1.as_ptr(), s3.as_ptr()); + } } diff --git a/parser/src/scanner.rs b/parser/src/scanner.rs index 7e37976..7c9f69b 100644 --- a/parser/src/scanner.rs +++ b/parser/src/scanner.rs @@ -16,7 +16,7 @@ use crate::{ as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z, }, - input::Input, + input::{Input, SkipTabs}, }; /// The encoding of the input. Currently, only UTF-8 is supported. @@ -847,37 +847,11 @@ impl Scanner { } } - /// Skip yaml whitespace at most up to eol. Also skips comments. 
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result { - let mut encountered_tab = false; - let mut has_yaml_ws = false; - loop { - match self.input.look_ch() { - ' ' => { - has_yaml_ws = true; - self.skip_blank(); - } - '\t' if skip_tabs != SkipTabs::No => { - encountered_tab = true; - self.skip_blank(); - } - // YAML comments must be preceded by whitespace. - '#' if !encountered_tab && !has_yaml_ws => { - return Err(ScanError::new_str( - self.mark, - "comments must be separated from other tokens by whitespace", - )); - } - '#' => { - while !is_breakz(self.input.look_ch()) { - self.skip_non_blank(); - } - } - _ => break, - } - } - - Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)) + let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs); + self.mark.col += n_bytes; + self.mark.index += n_bytes; + result.map_err(|msg| ScanError::new_str(self.mark, msg)) } fn fetch_stream_start(&mut self) { @@ -2544,40 +2518,6 @@ impl Scanner { } } -/// Behavior to adopt regarding treating tabs as whitespace. -/// -/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space. -#[derive(Copy, Clone, Eq, PartialEq)] -enum SkipTabs { - /// Skip all tabs as whitespace. - Yes, - /// Don't skip any tab. Return from the function when encountering one. - No, - /// Return value from the function. - Result( - /// Whether tabs were encountered. - bool, - /// Whether at least 1 valid yaml whitespace has been encountered. - bool, - ), -} - -impl SkipTabs { - /// Whether tabs were found while skipping whitespace. - /// - /// This function must be called after a call to `skip_ws_to_eol`. - fn found_tabs(self) -> bool { - matches!(self, SkipTabs::Result(true, _)) - } - - /// Whether a valid YAML whitespace has been found in skipped-over content. - /// - /// This function must be called after a call to `skip_ws_to_eol`. 
- fn has_valid_yaml_ws(self) -> bool { - matches!(self, SkipTabs::Result(_, true)) - } -} - /// Chomping, how final line breaks and trailing empty lines are interpreted. /// /// See YAML spec 8.1.1.2.