From e215f546f35a05f0059f0281ccd311afcb24d3a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Mon, 12 Aug 2024 17:06:14 +0200 Subject: [PATCH] Remove all unsafe code. * Use `str::strip_prefix` to avoid using `str::from_utf8_unchecked` * Avoid most uses of `extend_left` unsafe function * Remove `Input::push_back` and remaining unsafe --- parser/src/input.rs | 14 ++-- parser/src/input/buffered.rs | 14 +++- parser/src/input/str.rs | 121 ++++++++++------------------------- parser/src/scanner.rs | 9 +-- 4 files changed, 55 insertions(+), 103 deletions(-) diff --git a/parser/src/input.rs b/parser/src/input.rs index f8ff3b5..13c309c 100644 --- a/parser/src/input.rs +++ b/parser/src/input.rs @@ -45,16 +45,18 @@ pub trait Input { /// Read a character from the input stream and return it directly. /// - /// The internal buffer (is any) is bypassed. + /// The internal buffer (if any) is bypassed. #[must_use] fn raw_read_ch(&mut self) -> char; - /// Put a character back in the buffer. + /// Read a non-breakz a character from the input stream and return it directly. /// - /// This function is only called when we read one too many characters and the pushed back - /// character is exactly the last character that was read. This function will not be called - /// multiple times consecutively. - fn push_back(&mut self, c: char); + /// The internal buffer (if any) is bypassed. + /// + /// If the next character is a breakz, it is either not consumed or placed into the buffer (if + /// any). + #[must_use] + fn raw_read_non_breakz_ch(&mut self) -> Option; /// Consume the next character. fn skip(&mut self); diff --git a/parser/src/input/buffered.rs b/parser/src/input/buffered.rs index c51efac..f162e77 100644 --- a/parser/src/input/buffered.rs +++ b/parser/src/input/buffered.rs @@ -1,3 +1,4 @@ +use crate::char_traits::is_breakz; use crate::input::Input; use arraydeque::ArrayDeque; @@ -68,8 +69,17 @@ impl> Input for BufferedInput { } #[inline] - fn push_back(&mut self, c: char) { - self.buffer.push_back(c).unwrap(); + fn raw_read_non_breakz_ch(&mut self) -> Option { + if let Some(c) = self.input.next() { + if is_breakz(c) { + self.buffer.push_back(c).unwrap(); + None + } else { + Some(c) + } + } else { + None + } } #[inline] diff --git a/parser/src/input/str.rs b/parser/src/input/str.rs index 32bd136..e44dcee 100644 --- a/parser/src/input/str.rs +++ b/parser/src/input/str.rs @@ -61,10 +61,17 @@ impl<'a> Input for StrInput<'a> { } #[inline] - fn push_back(&mut self, c: char) { - // SAFETY: The preconditions of this function is that the character we are given is the one - // immediately preceding `self.buffer`. - self.buffer = unsafe { put_back_in_str(self.buffer, c) }; + fn raw_read_non_breakz_ch(&mut self) -> Option { + if let Some((c, sub_str)) = split_first_char(self.buffer) { + if is_breakz(c) { + None + } else { + self.buffer = sub_str; + Some(c) + } + } else { + None + } } #[inline] @@ -177,45 +184,34 @@ impl<'a> Input for StrInput<'a> { fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result) { assert!(!matches!(skip_tabs, SkipTabs::Result(..))); - let mut new_str = self.buffer.as_bytes(); + let mut new_str = self.buffer; let mut has_yaml_ws = false; let mut encountered_tab = false; // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found // while keeping track of whether we encountered spaces and/or tabs. if skip_tabs == SkipTabs::Yes { - let mut i = 0; - while i < new_str.len() { - if new_str[i] == b' ' { + loop { + if let Some(sub_str) = new_str.strip_prefix(' ') { has_yaml_ws = true; - } else if new_str[i] == b'\t' { + new_str = sub_str; + } else if let Some(sub_str) = new_str.strip_prefix('\t') { encountered_tab = true; + new_str = sub_str; } else { break; } - i += 1; } - new_str = &new_str[i..]; } else { - let mut i = 0; - while i < new_str.len() { - if new_str[i] != b' ' { - break; - } - i += 1; + while let Some(sub_str) = new_str.strip_prefix(' ') { + has_yaml_ws = true; + new_str = sub_str; } - has_yaml_ws = i != 0; - new_str = &new_str[i..]; } // All characters consumed were ascii. We can use the byte length difference to count the // number of whitespace ignored. let mut chars_consumed = self.buffer.len() - new_str.len(); - // SAFETY: We only trimmed spaces and tabs, both of which are bytes. This means we won't - // start the string outside of a valid UTF-8 boundary. - // It is assumed the input string is valid UTF-8, so the rest of the string is assumed to - // be valid UTF-8 as well. - let mut new_str = unsafe { std::str::from_utf8_unchecked(new_str) }; if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' { if !encountered_tab && !has_yaml_ws { @@ -225,24 +221,14 @@ impl<'a> Input for StrInput<'a> { ); } - let mut chars = new_str.chars(); - let mut found_breakz = false; - // Iterate over all remaining chars until we hit a breakz. - for c in chars.by_ref() { + // Skip remaining characters until we hit a breakz. + while let Some((c, sub_str)) = split_first_char(new_str) { if is_breakz(c) { - found_breakz = true; break; } + new_str = sub_str; chars_consumed += 1; } - - new_str = if found_breakz { - // SAFETY: The last character we pulled out of the `chars()` is a breakz, one of - // '\0', '\r', '\n'. All 3 of them are 1-byte long. - unsafe { extend_left(chars.as_str(), 1) } - } else { - chars.as_str() - }; } self.buffer = new_str; @@ -325,27 +311,19 @@ impl<'a> Input for StrInput<'a> { } fn skip_while_non_breakz(&mut self) -> usize { - let mut found_breakz = false; + let mut new_str = self.buffer; let mut count = 0; // Skip over all non-breaks. - let mut chars = self.buffer.chars(); - for c in chars.by_ref() { + while let Some((c, sub_str)) = split_first_char(new_str) { if is_breakz(c) { - found_breakz = true; break; } + new_str = sub_str; count += 1; } - self.buffer = if found_breakz { - // If we read a breakz, we need to put it back to the buffer. - // SAFETY: The last character we extracted is either a '\n', '\r' or '\0', all of which - // are 1-byte long. - unsafe { extend_left(chars.as_str(), 1) } - } else { - chars.as_str() - }; + self.buffer = new_str; count } @@ -416,40 +394,18 @@ impl<'a> Input for StrInput<'a> { /// [`buflen`]: `StrInput::buflen` const BUFFER_LEN: usize = 128; -/// Fake prepending a character to the given string. -/// -/// The character given as parameter MUST be the one that precedes the given string. -/// -/// # Exmaple -/// ```ignore -/// let s1 = "foo"; -/// let s2 = &s1[1..]; -/// let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding -/// // let s3 = put_back_in_str('g'); // Not allowed -/// assert_eq!(s1, s3); -/// assert_eq!(s1.as_ptr(), s3.as_ptr()); -/// ``` -unsafe fn put_back_in_str(s: &str, c: char) -> &str { - let n_bytes = c.len_utf8(); - - // SAFETY: The character that gets pushed back is guaranteed to be the one that is - // immediately preceding our buffer. We can compute the length of the character and move - // our buffer back that many bytes. - extend_left(s, n_bytes) -} - -/// Extend the string by moving the start pointer to the left by `n` bytes. +/// Splits the first character of the given string and returns it along with the rest of the +/// string. #[inline] -unsafe fn extend_left(s: &str, n: usize) -> &str { - std::str::from_utf8_unchecked(std::slice::from_raw_parts( - s.as_ptr().wrapping_sub(n), - s.len() + n, - )) +fn split_first_char(s: &str) -> Option<(char, &str)> { + let mut chars = s.chars(); + let c = chars.next()?; + Some((c, chars.as_str())) } #[cfg(test)] mod test { - use crate::input::{str::put_back_in_str, Input}; + use crate::input::Input; use super::StrInput; @@ -484,13 +440,4 @@ mod test { assert!(input.next_is_document_end()); assert!(input.next_is_document_indicator()); } - - #[test] - pub fn put_back_in_str_example() { - let s1 = "foo"; - let s2 = &s1[1..]; - let s3 = unsafe { put_back_in_str(s2, 'f') }; // OK, 'f' is the character immediately preceding - assert_eq!(s1, s3); - assert_eq!(s1.as_ptr(), s3.as_ptr()); - } } diff --git a/parser/src/scanner.rs b/parser/src/scanner.rs index d488005..1f4a2bb 100644 --- a/parser/src/scanner.rs +++ b/parser/src/scanner.rs @@ -1770,17 +1770,10 @@ impl Scanner { // characters are appended here as their real size (1B for ascii, or up to 4 bytes for // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string` // (using `String::push_str`). - let mut c = self.input.raw_read_ch(); - while !is_breakz(c) { + while let Some(c) = self.input.raw_read_non_breakz_ch() { line_buffer.push(c); - c = self.input.raw_read_ch(); } - // Our last character read is stored in `c`. It is either an EOF or a break. In any - // case, we need to push it back into `self.buffer` so it may be properly read - // after. We must not insert it in `string`. - self.input.push_back(c); - // We need to manually update our position; we haven't called a `skip` function. self.mark.col += line_buffer.len(); self.mark.index += line_buffer.len();