From 93b7e55bcf1b820e8e4a4d97a73fc990c7075545 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Wed, 10 Jul 2024 22:29:59 +0200 Subject: [PATCH] Move scanning low-level functions to `Input`. --- parser/src/input.rs | 188 +++++++++++++++++++++++++++++++++++++++- parser/src/input/str.rs | 130 +++++++++++++++++++++++++-- parser/src/scanner.rs | 126 ++++++++++++++------------- 3 files changed, 375 insertions(+), 69 deletions(-) diff --git a/parser/src/input.rs b/parser/src/input.rs index 14b5e95..f8ff3b5 100644 --- a/parser/src/input.rs +++ b/parser/src/input.rs @@ -4,7 +4,9 @@ pub mod str; #[allow(clippy::module_name_repetitions)] pub use buffered::BufferedInput; -use crate::char_traits::{is_blank_or_breakz, is_breakz, is_flow}; +use crate::char_traits::{ + is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z, +}; /// Interface for a source of characters. /// @@ -170,7 +172,7 @@ pub trait Input { /// /// # Return /// Return a tuple with the number of characters that were consumed and the result of skipping - /// whitespace. The number of characters returned can be used to advance the index and columns, + /// whitespace. The number of characters returned can be used to advance the index and column, /// since no end-of-line character will be consumed. /// See [`SkipTabs`] For more details on the success variant. /// @@ -230,6 +232,188 @@ pub trait Input { _ => true, } } + + /// Check whether the next character is [a blank] or [a break]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a blank] or [a break], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a blank]: is_blank + /// [a break]: is_break + #[inline] + fn next_is_blank_or_break(&self) -> bool { + is_blank(self.peek()) || is_break(self.peek()) + } + + /// Check whether the next character is [a blank] or [a breakz]. 
+ /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a blank] or [a breakz], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a blank]: is_blank + /// [a breakz]: is_breakz + #[inline] + fn next_is_blank_or_breakz(&self) -> bool { + is_blank(self.peek()) || is_breakz(self.peek()) + } + + /// Check whether the next character is [a blank]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a blank], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a blank]: is_blank + #[inline] + fn next_is_blank(&self) -> bool { + is_blank(self.peek()) + } + + /// Check whether the next character is [a break]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a break], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a break]: is_break + #[inline] + fn next_is_break(&self) -> bool { + is_break(self.peek()) + } + + /// Check whether the next character is [a breakz]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a breakz], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a breakz]: is_breakz + #[inline] + fn next_is_breakz(&self) -> bool { + is_breakz(self.peek()) + } + + /// Check whether the next character is [a z]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a z], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a z]: is_z + #[inline] + fn next_is_z(&self) -> bool { + is_z(self.peek()) + } + + /// Check whether the next character is [a flow]. 
+ /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a flow], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a flow]: is_flow + #[inline] + fn next_is_flow(&self) -> bool { + is_flow(self.peek()) + } + + /// Check whether the next character is [a digit]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a digit], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a digit]: is_digit + #[inline] + fn next_is_digit(&self) -> bool { + is_digit(self.peek()) + } + + /// Check whether the next character is [a letter]. + /// + /// The character must have previously been fetched through [`lookahead`] + /// + /// # Return + /// Returns true if the character is [a letter], false otherwise. + /// + /// [`lookahead`]: Input::lookahead + /// [a letter]: is_alpha + #[inline] + fn next_is_alpha(&self) -> bool { + is_alpha(self.peek()) + } + + /// Skip characters from the input until a [breakz] is found. + /// + /// The characters are consumed from the input. + /// + /// # Return + /// Return the number of characters that were consumed. The number of characters returned can + /// be used to advance the index and column, since no end-of-line character will be consumed. + /// + /// [breakz]: is_breakz + #[inline] + fn skip_while_non_breakz(&mut self) -> usize { + let mut count = 0; + while !is_breakz(self.look_ch()) { + count += 1; + self.skip(); + } + count + } + + /// Skip characters from the input while [blanks] are found. + /// + /// The characters are consumed from the input. + /// + /// # Return + /// Return the number of characters that were consumed. The number of characters returned can + /// be used to advance the index and column, since no end-of-line character will be consumed. 
+ /// + /// [blanks]: is_blank + fn skip_while_blank(&mut self) -> usize { + let mut n_chars = 0; + while is_blank(self.look_ch()) { + n_chars += 1; + self.skip(); + } + n_chars + } + + /// Fetch characters from the input while we encounter letters and store them in `out`. + /// + /// The characters are consumed from the input. + /// + /// # Return + /// Return the number of characters that were consumed. The number of characters returned can + /// be used to advance the index and column, since no end-of-line character will be consumed. + fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize { + let mut n_chars = 0; + while is_alpha(self.look_ch()) { + n_chars += 1; + out.push(self.peek()); + self.skip(); + } + n_chars + } } /// Behavior to adopt regarding treating tabs as whitespace. diff --git a/parser/src/input/str.rs b/parser/src/input/str.rs index e50adac..32bd136 100644 --- a/parser/src/input/str.rs +++ b/parser/src/input/str.rs @@ -1,5 +1,7 @@ use crate::{ - char_traits::{is_blank_or_breakz, is_breakz, is_flow}, + char_traits::{ + is_alpha, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit, is_flow, is_z, + }, input::{Input, SkipTabs}, }; @@ -60,7 +62,9 @@ impl<'a> Input for StrInput<'a> { #[inline] fn push_back(&mut self, c: char) { - self.buffer = put_back_in_str(self.buffer, c); + // SAFETY: The precondition of this function is that the character we are given is the one + // immediately preceding `self.buffer`. 
+ self.buffer = unsafe { put_back_in_str(self.buffer, c) }; } #[inline] @@ -270,6 +274,122 @@ impl<'a> Input for StrInput<'a> { } } } + + #[inline] + fn next_is_blank_or_break(&self) -> bool { + !self.buffer.is_empty() + && (is_blank(self.buffer.as_bytes()[0] as char) + || is_break(self.buffer.as_bytes()[0] as char)) + } + + #[inline] + fn next_is_blank_or_breakz(&self) -> bool { + self.buffer.is_empty() + || (is_blank(self.buffer.as_bytes()[0] as char) + || is_breakz(self.buffer.as_bytes()[0] as char)) + } + + #[inline] + fn next_is_blank(&self) -> bool { + !self.buffer.is_empty() && is_blank(self.buffer.as_bytes()[0] as char) + } + + #[inline] + fn next_is_break(&self) -> bool { + !self.buffer.is_empty() && is_break(self.buffer.as_bytes()[0] as char) + } + + #[inline] + fn next_is_breakz(&self) -> bool { + self.buffer.is_empty() || is_breakz(self.buffer.as_bytes()[0] as char) + } + + #[inline] + fn next_is_z(&self) -> bool { + self.buffer.is_empty() || is_z(self.buffer.as_bytes()[0] as char) + } + + #[inline] + fn next_is_flow(&self) -> bool { + !self.buffer.is_empty() && is_flow(self.buffer.as_bytes()[0] as char) + } + + #[inline] + fn next_is_digit(&self) -> bool { + !self.buffer.is_empty() && is_digit(self.buffer.as_bytes()[0] as char) + } + + #[inline] + fn next_is_alpha(&self) -> bool { + !self.buffer.is_empty() && is_alpha(self.buffer.as_bytes()[0] as char) + } + + fn skip_while_non_breakz(&mut self) -> usize { + let mut found_breakz = false; + let mut count = 0; + + // Skip over all non-breaks. + let mut chars = self.buffer.chars(); + for c in chars.by_ref() { + if is_breakz(c) { + found_breakz = true; + break; + } + count += 1; + } + + self.buffer = if found_breakz { + // If we read a breakz, we need to put it back to the buffer. + // SAFETY: The last character we extracted is either a '\n', '\r' or '\0', all of which + // are 1-byte long. 
+ unsafe { extend_left(chars.as_str(), 1) } + } else { + chars.as_str() + }; + + count + } + + fn skip_while_blank(&mut self) -> usize { + // Since all characters we look for are ascii, we can directly use the byte API of str. + let mut i = 0; + while i < self.buffer.len() { + if !is_blank(self.buffer.as_bytes()[i] as char) { + break; + } + i += 1; + } + self.buffer = &self.buffer[i..]; + i + } + + fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize { + let mut not_alpha = None; + + // Skip while we have alpha characters. + let mut chars = self.buffer.chars(); + for c in chars.by_ref() { + if !is_alpha(c) { + not_alpha = Some(c); + break; + } + } + + let remaining_string = if let Some(c) = not_alpha { + let n_bytes_read = chars.as_str().as_ptr() as usize - self.buffer.as_ptr() as usize; + let last_char_bytes = c.len_utf8(); + &self.buffer[n_bytes_read - last_char_bytes..] + } else { + chars.as_str() + }; + + let n_bytes_to_append = remaining_string.as_ptr() as usize - self.buffer.as_ptr() as usize; + out.reserve(n_bytes_to_append); + out.push_str(&self.buffer[..n_bytes_to_append]); + self.buffer = remaining_string; + + n_bytes_to_append + } } /// The buffer size we return to the scanner. @@ -309,13 +429,13 @@ const BUFFER_LEN: usize = 128; /// assert_eq!(s1, s3); /// assert_eq!(s1.as_ptr(), s3.as_ptr()); /// ``` -fn put_back_in_str(s: &str, c: char) -> &str { +unsafe fn put_back_in_str(s: &str, c: char) -> &str { let n_bytes = c.len_utf8(); // SAFETY: The character that gets pushed back is guaranteed to be the one that is // immediately preceding our buffer. We can compute the length of the character and move // our buffer back that many bytes. - unsafe { extend_left(s, n_bytes) } + extend_left(s, n_bytes) } /// Extend the string by moving the start pointer to the left by `n` bytes. 
@@ -369,7 +489,7 @@ mod test { pub fn put_back_in_str_example() { let s1 = "foo"; let s2 = &s1[1..]; - let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding + let s3 = unsafe { put_back_in_str(s2, 'f') }; // OK, 'f' is the character immediately preceding assert_eq!(s1, s3); assert_eq!(s1.as_ptr(), s3.as_ptr()); } diff --git a/parser/src/scanner.rs b/parser/src/scanner.rs index e2c92ec..b1cb1f5 100644 --- a/parser/src/scanner.rs +++ b/parser/src/scanner.rs @@ -13,8 +13,8 @@ use std::{char, collections::VecDeque, error::Error, fmt}; use crate::{ char_traits::{ - as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, - is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z, + as_hex, is_anchor_char, is_blank_or_breakz, is_break, is_breakz, is_flow, is_hex, + is_tag_char, is_uri_char, }, input::{Input, SkipTabs}, }; @@ -533,7 +533,7 @@ impl Scanner { // will be reset by `skip_nl`. self.skip_blank(); self.skip_nl(); - } else if is_break(self.input.peek()) { + } else if self.input.next_is_break() { self.skip_nl(); } } @@ -616,7 +616,7 @@ impl Scanner { self.input.lookahead(4); - if is_z(self.input.peek()) { + if self.input.next_is_z() { self.fetch_stream_end()?; return Ok(()); } @@ -629,7 +629,7 @@ impl Scanner { } else if self.input.next_is_document_end() { self.fetch_document_indicator(TokenType::DocumentEnd)?; self.skip_ws_to_eol(SkipTabs::Yes)?; - if !is_breakz(self.input.peek()) { + if !self.input.next_is_breakz() { return Err(ScanError::new_str( self.mark, "invalid content after document end marker", @@ -784,7 +784,7 @@ impl Scanner { { self.skip_ws_to_eol(SkipTabs::Yes)?; // If we have content on that line with a tab, return an error. 
- if !is_breakz(self.input.peek()) { + if !self.input.next_is_breakz() { return Err(ScanError::new_str( self.mark, "tabs disallowed within this context (block indentation)", @@ -800,9 +800,9 @@ impl Scanner { } } '#' => { - while !is_breakz(self.input.look_ch()) { - self.skip_non_blank(); - } + let comment_length = self.input.skip_while_non_breakz(); + self.mark.index += comment_length; + self.mark.col += comment_length; } _ => break, } @@ -832,9 +832,9 @@ impl Scanner { need_whitespace = false; } '#' => { - while !is_breakz(self.input.look_ch()) { - self.skip_non_blank(); - } + let comment_length = self.input.skip_while_non_breakz(); + self.mark.index += comment_length; + self.mark.col += comment_length; } _ => break, } @@ -912,9 +912,9 @@ impl Scanner { // XXX This should be a warning instead of an error _ => { // skip current line - while !is_breakz(self.input.look_ch()) { - self.skip_non_blank(); - } + let line_len = self.input.skip_while_non_breakz(); + self.mark.index += line_len; + self.mark.col += line_len; // XXX return an empty TagDirective token Token( start_mark, @@ -927,7 +927,7 @@ impl Scanner { self.skip_ws_to_eol(SkipTabs::Yes)?; - if is_breakz(self.input.peek()) { + if self.input.next_is_breakz() { self.input.lookahead(2); self.skip_linebreak(); Ok(tok) @@ -940,9 +940,9 @@ impl Scanner { } fn scan_version_directive_value(&mut self, mark: &Marker) -> Result { - while is_blank(self.input.look_ch()) { - self.skip_blank(); - } + let n_blanks = self.input.skip_while_blank(); + self.mark.index += n_blanks; + self.mark.col += n_blanks; let major = self.scan_version_directive_number(mark)?; @@ -962,10 +962,10 @@ impl Scanner { fn scan_directive_name(&mut self) -> Result { let start_mark = self.mark; let mut string = String::new(); - while is_alpha(self.input.look_ch()) { - string.push(self.input.peek()); - self.skip_non_blank(); - } + + let n_chars = self.input.fetch_while_is_alpha(&mut string); + self.mark.index += n_chars; + self.mark.col += n_chars; if 
string.is_empty() { return Err(ScanError::new_str( @@ -1010,22 +1010,21 @@ impl Scanner { } fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result { - /* Eat whitespaces. */ - while is_blank(self.input.look_ch()) { - self.skip_blank(); - } + let n_blanks = self.input.skip_while_blank(); + self.mark.index += n_blanks; + self.mark.col += n_blanks; + let handle = self.scan_tag_handle(true, mark)?; - /* Eat whitespaces. */ - while is_blank(self.input.look_ch()) { - self.skip_blank(); - } + let n_blanks = self.input.skip_while_blank(); + self.mark.index += n_blanks; + self.mark.col += n_blanks; let prefix = self.scan_tag_prefix(mark)?; self.input.lookahead(1); - if is_blank_or_breakz(self.input.peek()) { + if self.input.next_is_blank_or_breakz() { Ok(Token(*mark, TokenType::TagDirective(handle, prefix))) } else { Err(ScanError::new_str( @@ -1076,7 +1075,7 @@ impl Scanner { } if is_blank_or_breakz(self.input.look_ch()) - || (self.flow_level > 0 && is_flow(self.input.peek())) + || (self.flow_level > 0 && self.input.next_is_flow()) { // XXX: ex 7.2, an empty scalar can follow a secondary tag Ok(Token(start_mark, TokenType::Tag(handle, suffix))) @@ -1100,10 +1099,9 @@ impl Scanner { string.push(self.input.peek()); self.skip_non_blank(); - while is_alpha(self.input.look_ch()) { - string.push(self.input.peek()); - self.skip_non_blank(); - } + let n_chars = self.input.fetch_while_is_alpha(&mut string); + self.mark.index += n_chars; + self.mark.col += n_chars; // Check if the trailing character is '!' and copy it. if self.input.peek() == '!' 
{ @@ -1448,7 +1446,8 @@ impl Scanner { } self.skip_ws_to_eol(SkipTabs::No)?; - if is_break(self.input.look_ch()) || is_flow(self.input.peek()) { + self.input.lookahead(1); + if self.input.next_is_break() || self.input.next_is_flow() { self.roll_one_col_indent(); } @@ -1513,7 +1512,8 @@ impl Scanner { chomping = Chomping::Strip; } self.skip_non_blank(); - if is_digit(self.input.look_ch()) { + self.input.lookahead(1); + if self.input.next_is_digit() { if self.input.peek() == '0' { return Err(ScanError::new_str( start_mark, @@ -1523,7 +1523,7 @@ impl Scanner { increment = (self.input.peek() as usize) - ('0' as usize); self.skip_non_blank(); } - } else if is_digit(self.input.peek()) { + } else if self.input.next_is_digit() { if self.input.peek() == '0' { return Err(ScanError::new_str( start_mark, @@ -1547,14 +1547,15 @@ impl Scanner { self.skip_ws_to_eol(SkipTabs::Yes)?; // Check if we are at the end of the line. - if !is_breakz(self.input.look_ch()) { + self.input.lookahead(1); + if !self.input.next_is_breakz() { return Err(ScanError::new_str( start_mark, "while scanning a block scalar, did not find expected comment or line break", )); } - if is_break(self.input.peek()) { + if self.input.next_is_break() { self.input.lookahead(2); self.read_break(&mut chomping_break); } @@ -1585,7 +1586,7 @@ impl Scanner { // ```yaml // - |+ // ``` - if is_z(self.input.peek()) { + if self.input.next_is_z() { let contents = match chomping { // We strip trailing linebreaks. Nothing remain. Chomping::Strip => String::new(), @@ -1612,7 +1613,7 @@ impl Scanner { let mut line_buffer = String::with_capacity(100); let start_mark = self.mark; - while self.mark.col == indent && !is_z(self.input.peek()) { + while self.mark.col == indent && !self.input.next_is_z() { if indent == 0 { self.input.lookahead(4); if self.input.next_is_document_end() { @@ -1621,7 +1622,7 @@ impl Scanner { } // We are at the first content character of a content line. 
- trailing_blank = is_blank(self.input.peek()); + trailing_blank = self.input.next_is_blank(); if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank { string.push_str(&trailing_breaks); if trailing_breaks.is_empty() { @@ -1635,12 +1636,12 @@ impl Scanner { leading_break.clear(); trailing_breaks.clear(); - leading_blank = is_blank(self.input.peek()); + leading_blank = self.input.next_is_blank(); self.scan_block_scalar_content_line(&mut string, &mut line_buffer); // break on EOF - if is_z(self.input.peek()) { + if self.input.next_is_z() { break; } @@ -1657,7 +1658,7 @@ impl Scanner { // If we had reached an eof but the last character wasn't an end-of-line, check if the // last line was indented at least as the rest of the scalar, then we need to consider // there is a newline. - if is_z(self.input.peek()) && self.mark.col >= indent.max(1) { + if self.input.next_is_z() && self.mark.col >= indent.max(1) { string.push('\n'); } } @@ -1680,7 +1681,7 @@ impl Scanner { /// line. This function does not consume the line break character(s) after the line. fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) { // Start by evaluating characters in the buffer. - while !self.input.buf_is_empty() && !is_breakz(self.input.peek()) { + while !self.input.buf_is_empty() && !self.input.next_is_breakz() { string.push(self.input.peek()); // We may technically skip non-blank characters. However, the only distinction is // to determine what is leading whitespace and what is not. Here, we read the @@ -1752,7 +1753,7 @@ impl Scanner { } // If our current line is empty, skip over the break and continue looping. - if is_break(self.input.peek()) { + if self.input.next_is_break() { self.read_break(breaks); } else { // Otherwise, we have a content line. Return control. 
@@ -1777,7 +1778,7 @@ impl Scanner { max_indent = self.mark.col; } - if is_break(self.input.peek()) { + if self.input.next_is_break() { // If our current line is empty, skip over the break and continue looping. self.input.lookahead(2); self.read_break(breaks); @@ -1840,7 +1841,7 @@ impl Scanner { )); } - if is_z(self.input.peek()) { + if self.input.next_is_z() { return Err(ScanError::new_str( start_mark, "while scanning a quoted scalar, found unexpected end of stream", @@ -1869,8 +1870,8 @@ impl Scanner { } // Consume blank characters. - while is_blank(self.input.peek()) || is_break(self.input.peek()) { - if is_blank(self.input.peek()) { + while self.input.next_is_blank() || self.input.next_is_break() { + if self.input.next_is_blank() { // Consume a space or a tab character. if leading_blanks { if self.input.peek() == '\t' && (self.mark.col as isize) < self.indent { @@ -2118,7 +2119,7 @@ impl Scanner { )); } - if !is_blank_or_breakz(self.input.peek()) + if !self.input.next_is_blank_or_breakz() && self.input.next_can_be_plain_scalar(self.flow_level > 0) { if self.leading_whitespace { @@ -2155,7 +2156,7 @@ impl Scanner { // hence the `for` loop looping `self.input.bufmaxlen() - 1` times. self.input.lookahead(self.input.bufmaxlen()); for _ in 0..self.input.bufmaxlen() - 1 { - if is_blank_or_breakz(self.input.peek()) + if self.input.next_is_blank_or_breakz() || !self.input.next_can_be_plain_scalar(self.flow_level > 0) { end = true; @@ -2172,13 +2173,14 @@ impl Scanner { // - We reach eof // - We reach ": " // - We find a flow character in a flow context - if !(is_blank(self.input.peek()) || is_break(self.input.peek())) { + if !(self.input.next_is_blank() || self.input.next_is_break()) { break; } // Process blank characters. 
- while is_blank(self.input.look_ch()) || is_break(self.input.peek()) { - if is_blank(self.input.peek()) { + self.input.lookahead(1); + while self.input.next_is_blank_or_break() { + if self.input.next_is_blank() { if !self.leading_whitespace { whitespaces.push(self.input.peek()); self.skip_blank(); @@ -2186,7 +2188,7 @@ impl Scanner { // Tabs in an indentation columns are allowed if and only if the line is // empty. Skip to the end of the line. self.skip_ws_to_eol(SkipTabs::Yes)?; - if !is_breakz(self.input.peek()) { + if !self.input.next_is_breakz() { return Err(ScanError::new_str( start_mark, "while scanning a plain scalar, found a tab", @@ -2196,7 +2198,6 @@ impl Scanner { self.skip_blank(); } } else { - self.input.lookahead(2); // Check if it is a first line break if self.leading_whitespace { self.read_break(&mut trailing_breaks); @@ -2206,6 +2207,7 @@ impl Scanner { self.leading_whitespace = true; } } + self.input.lookahead(1); } // check indentation level @@ -2309,7 +2311,7 @@ impl Scanner { self.skip_non_blank(); if self.input.look_ch() == '\t' && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws() - && (self.input.peek() == '-' || is_alpha(self.input.peek())) + && (self.input.peek() == '-' || self.input.next_is_alpha()) { return Err(ScanError::new_str( self.mark,