Move skip_ws_to_eol
to Input
trait.
This commit is contained in:
parent
db4f26da42
commit
8d7c3a1c1b
3 changed files with 230 additions and 103 deletions
|
@ -4,7 +4,7 @@ pub mod str;
|
||||||
#[allow(clippy::module_name_repetitions)]
|
#[allow(clippy::module_name_repetitions)]
|
||||||
pub use buffered::BufferedInput;
|
pub use buffered::BufferedInput;
|
||||||
|
|
||||||
use crate::char_traits::is_blank_or_breakz;
|
use crate::char_traits::{is_blank_or_breakz, is_breakz};
|
||||||
|
|
||||||
/// Interface for a source of characters.
|
/// Interface for a source of characters.
|
||||||
///
|
///
|
||||||
|
@ -165,4 +165,88 @@ pub trait Input {
|
||||||
assert!(self.buflen() >= 4);
|
assert!(self.buflen() >= 4);
|
||||||
self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
|
self.next_3_are('.', '.', '.') && is_blank_or_breakz(self.peek_nth(3))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Skip yaml whitespace at most up to eol. Also skips comments. Advances the input.
|
||||||
|
///
|
||||||
|
/// # Return
|
||||||
|
/// Return a tuple with the number of characters that were consumed and the result of skipping
|
||||||
|
/// whitespace. The number of characters returned can be used to advance the index and columns,
|
||||||
|
/// since no end-of-line character will be consumed.
|
||||||
|
/// See [`SkipTabs`] For more details on the success variant.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Errors if a comment is encountered but it was not preceded by a whitespace. In that event,
|
||||||
|
/// the first tuple element will contain the number of characters consumed prior to reaching
|
||||||
|
/// the `#`.
|
||||||
|
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
|
||||||
|
let mut encountered_tab = false;
|
||||||
|
let mut has_yaml_ws = false;
|
||||||
|
let mut chars_consumed = 0;
|
||||||
|
loop {
|
||||||
|
match self.look_ch() {
|
||||||
|
' ' => {
|
||||||
|
has_yaml_ws = true;
|
||||||
|
self.skip();
|
||||||
|
}
|
||||||
|
'\t' if skip_tabs != SkipTabs::No => {
|
||||||
|
encountered_tab = true;
|
||||||
|
self.skip();
|
||||||
|
}
|
||||||
|
// YAML comments must be preceded by whitespace.
|
||||||
|
'#' if !encountered_tab && !has_yaml_ws => {
|
||||||
|
return (
|
||||||
|
chars_consumed,
|
||||||
|
Err("comments must be separated from other tokens by whitespace"),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
'#' => {
|
||||||
|
while !is_breakz(self.look_ch()) {
|
||||||
|
self.skip();
|
||||||
|
chars_consumed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => break,
|
||||||
|
}
|
||||||
|
chars_consumed += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
(
|
||||||
|
chars_consumed,
|
||||||
|
Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// The same type is used both as the *request* passed to `skip_ws_to_eol` ([`SkipTabs::Yes`] /
/// [`SkipTabs::No`]) and as its *outcome* ([`SkipTabs::Result`]).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}

impl SkipTabs {
    /// Whether tabs were found while skipping whitespace.
    ///
    /// This function must be called after a call to `skip_ws_to_eol`: it only returns `true` for
    /// a [`SkipTabs::Result`] whose first field is `true`, and `false` for every other value.
    #[must_use]
    pub fn found_tabs(self) -> bool {
        matches!(self, SkipTabs::Result(true, _))
    }

    /// Whether a valid YAML whitespace has been found in skipped-over content.
    ///
    /// This function must be called after a call to `skip_ws_to_eol`: it only returns `true` for
    /// a [`SkipTabs::Result`] whose second field is `true`, and `false` for every other value.
    #[must_use]
    pub fn has_valid_yaml_ws(self) -> bool {
        matches!(self, SkipTabs::Result(_, true))
    }
}
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
use crate::{char_traits::is_blank_or_breakz, input::Input};
|
use crate::{
|
||||||
|
char_traits::{is_blank_or_breakz, is_breakz},
|
||||||
|
input::{Input, SkipTabs},
|
||||||
|
};
|
||||||
|
|
||||||
#[allow(clippy::module_name_repetitions)]
|
#[allow(clippy::module_name_repetitions)]
|
||||||
pub struct StrInput<'a> {
|
pub struct StrInput<'a> {
|
||||||
/// The input str buffer.
|
/// The input str buffer.
|
||||||
buffer: &'a str,
|
buffer: &'a str,
|
||||||
/// The number of characters (**not** bytes) in the buffer.
|
|
||||||
n_chars: usize,
|
|
||||||
/// The number of characters we have looked ahead.
|
/// The number of characters we have looked ahead.
|
||||||
///
|
///
|
||||||
/// We must however keep track of how many characters the parser asked us to look ahead for so
|
/// We must however keep track of how many characters the parser asked us to look ahead for so
|
||||||
|
@ -18,7 +19,6 @@ impl<'a> StrInput<'a> {
|
||||||
pub fn new(input: &'a str) -> Self {
|
pub fn new(input: &'a str) -> Self {
|
||||||
Self {
|
Self {
|
||||||
buffer: input,
|
buffer: input,
|
||||||
n_chars: input.chars().count(),
|
|
||||||
lookahead: 0,
|
lookahead: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,7 +52,6 @@ impl<'a> Input for StrInput<'a> {
|
||||||
let mut chars = self.buffer.chars();
|
let mut chars = self.buffer.chars();
|
||||||
if let Some(c) = chars.next() {
|
if let Some(c) = chars.next() {
|
||||||
self.buffer = chars.as_str();
|
self.buffer = chars.as_str();
|
||||||
self.n_chars -= 1;
|
|
||||||
c
|
c
|
||||||
} else {
|
} else {
|
||||||
'\0'
|
'\0'
|
||||||
|
@ -61,20 +60,7 @@ impl<'a> Input for StrInput<'a> {
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn push_back(&mut self, c: char) {
|
fn push_back(&mut self, c: char) {
|
||||||
let n_bytes = c.len_utf8();
|
self.buffer = put_back_in_str(self.buffer, c);
|
||||||
|
|
||||||
// SAFETY: The character that gets pushed back is guaranteed to be the one that is
|
|
||||||
// immediately preceding our buffer. We can compute the length of the character and move
|
|
||||||
// our buffer back that many bytes.
|
|
||||||
unsafe {
|
|
||||||
let buffer_byte_len = self.buffer.len();
|
|
||||||
let mut now_ptr = self.buffer.as_ptr();
|
|
||||||
now_ptr = now_ptr.wrapping_sub(n_bytes);
|
|
||||||
self.buffer = std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
|
||||||
now_ptr,
|
|
||||||
buffer_byte_len + n_bytes,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -82,7 +68,6 @@ impl<'a> Input for StrInput<'a> {
|
||||||
let mut chars = self.buffer.chars();
|
let mut chars = self.buffer.chars();
|
||||||
if chars.next().is_some() {
|
if chars.next().is_some() {
|
||||||
self.buffer = chars.as_str();
|
self.buffer = chars.as_str();
|
||||||
self.n_chars -= 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -95,7 +80,6 @@ impl<'a> Input for StrInput<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.buffer = chars.as_str();
|
self.buffer = chars.as_str();
|
||||||
self.n_chars = self.n_chars.saturating_sub(count);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -150,11 +134,11 @@ impl<'a> Input for StrInput<'a> {
|
||||||
false
|
false
|
||||||
} else {
|
} else {
|
||||||
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
||||||
(if self.buffer.len() == 3 {
|
let bytes = self.buffer.as_bytes();
|
||||||
true
|
(bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
|
||||||
} else {
|
&& (bytes[0] == b'.' || bytes[0] == b'-')
|
||||||
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
|
&& bytes[0] == bytes[1]
|
||||||
}) && (self.buffer.starts_with("...") || self.buffer.starts_with("---"))
|
&& bytes[1] == bytes[2]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -164,11 +148,11 @@ impl<'a> Input for StrInput<'a> {
|
||||||
false
|
false
|
||||||
} else {
|
} else {
|
||||||
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
||||||
(if self.buffer.len() == 3 {
|
let bytes = self.buffer.as_bytes();
|
||||||
true
|
(bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
|
||||||
} else {
|
&& bytes[0] == b'-'
|
||||||
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
|
&& bytes[1] == b'-'
|
||||||
}) && self.buffer.starts_with("---")
|
&& bytes[2] == b'-'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -178,13 +162,92 @@ impl<'a> Input for StrInput<'a> {
|
||||||
false
|
false
|
||||||
} else {
|
} else {
|
||||||
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
||||||
(if self.buffer.len() == 3 {
|
let bytes = self.buffer.as_bytes();
|
||||||
true
|
(bytes.len() == 3 || is_blank_or_breakz(bytes[3] as char))
|
||||||
} else {
|
&& bytes[0] == b'.'
|
||||||
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
|
&& bytes[1] == b'.'
|
||||||
}) && self.buffer.starts_with("...")
|
&& bytes[2] == b'.'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
|
||||||
|
assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
|
||||||
|
|
||||||
|
let mut new_str = self.buffer.as_bytes();
|
||||||
|
let mut has_yaml_ws = false;
|
||||||
|
let mut encountered_tab = false;
|
||||||
|
|
||||||
|
// This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
|
||||||
|
// while keeping track of whether we encountered spaces and/or tabs.
|
||||||
|
if skip_tabs == SkipTabs::Yes {
|
||||||
|
let mut i = 0;
|
||||||
|
while i < new_str.len() {
|
||||||
|
if new_str[i] == b' ' {
|
||||||
|
has_yaml_ws = true;
|
||||||
|
} else if new_str[i] == b'\t' {
|
||||||
|
encountered_tab = true;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
new_str = &new_str[i..];
|
||||||
|
} else {
|
||||||
|
let mut i = 0;
|
||||||
|
while i < new_str.len() {
|
||||||
|
if new_str[i] != b' ' {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
has_yaml_ws = i != 0;
|
||||||
|
new_str = &new_str[i..];
|
||||||
|
}
|
||||||
|
|
||||||
|
// All characters consumed were ascii. We can use the byte length difference to count the
|
||||||
|
// number of whitespace ignored.
|
||||||
|
let mut chars_consumed = self.buffer.len() - new_str.len();
|
||||||
|
// SAFETY: We only trimmed spaces and tabs, both of which are bytes. This means we won't
|
||||||
|
// start the string outside of a valid UTF-8 boundary.
|
||||||
|
// It is assumed the input string is valid UTF-8, so the rest of the string is assumed to
|
||||||
|
// be valid UTF-8 as well.
|
||||||
|
let mut new_str = unsafe { std::str::from_utf8_unchecked(new_str) };
|
||||||
|
|
||||||
|
if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
|
||||||
|
if !encountered_tab && !has_yaml_ws {
|
||||||
|
return (
|
||||||
|
chars_consumed,
|
||||||
|
Err("comments must be separated from other tokens by whitespace"),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut chars = new_str.chars();
|
||||||
|
let mut found_breakz = false;
|
||||||
|
// Iterate over all remaining chars until we hit a breakz.
|
||||||
|
for c in chars.by_ref() {
|
||||||
|
if is_breakz(c) {
|
||||||
|
found_breakz = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
chars_consumed += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
new_str = if found_breakz {
|
||||||
|
// SAFETY: The last character we pulled out of the `chars()` is a breakz, one of
|
||||||
|
// '\0', '\r', '\n'. All 3 of them are 1-byte long.
|
||||||
|
unsafe { extend_left(chars.as_str(), 1) }
|
||||||
|
} else {
|
||||||
|
chars.as_str()
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
self.buffer = new_str;
|
||||||
|
|
||||||
|
(
|
||||||
|
chars_consumed,
|
||||||
|
Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The buffer size we return to the scanner.
|
/// The buffer size we return to the scanner.
|
||||||
|
@ -211,9 +274,40 @@ impl<'a> Input for StrInput<'a> {
|
||||||
/// [`buflen`]: `StrInput::buflen`
|
/// [`buflen`]: `StrInput::buflen`
|
||||||
const BUFFER_LEN: usize = 128;
|
const BUFFER_LEN: usize = 128;
|
||||||
|
|
||||||
|
/// Fake prepending a character to the given string.
|
||||||
|
///
|
||||||
|
/// The character given as parameter MUST be the one that precedes the given string.
|
||||||
|
///
|
||||||
|
/// # Exmaple
|
||||||
|
/// ```ignore
|
||||||
|
/// let s1 = "foo";
|
||||||
|
/// let s2 = &s1[1..];
|
||||||
|
/// let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding
|
||||||
|
/// // let s3 = put_back_in_str('g'); // Not allowed
|
||||||
|
/// assert_eq!(s1, s3);
|
||||||
|
/// assert_eq!(s1.as_ptr(), s3.as_ptr());
|
||||||
|
/// ```
|
||||||
|
fn put_back_in_str(s: &str, c: char) -> &str {
|
||||||
|
let n_bytes = c.len_utf8();
|
||||||
|
|
||||||
|
// SAFETY: The character that gets pushed back is guaranteed to be the one that is
|
||||||
|
// immediately preceding our buffer. We can compute the length of the character and move
|
||||||
|
// our buffer back that many bytes.
|
||||||
|
unsafe { extend_left(s, n_bytes) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extend the string by moving the start pointer to the left by `n` bytes.
///
/// The returned slice starts `n` bytes before `s` and is `n` bytes longer; it ends where `s`
/// ends.
///
/// # Safety
/// The caller must guarantee that:
/// - the `n` bytes immediately preceding `s` are part of the same string `s` was sliced from
///   and stay valid for the lifetime of the returned reference;
/// - those `n` bytes followed by `s` form valid UTF-8, i.e. the new start lies on a character
///   boundary.
#[inline]
unsafe fn extend_left(s: &str, n: usize) -> &str {
    // SAFETY: Per this function's contract, the range starting `n` bytes before `s` and
    // spanning `s.len() + n` bytes is readable and is valid UTF-8.
    unsafe {
        std::str::from_utf8_unchecked(std::slice::from_raw_parts(
            s.as_ptr().wrapping_sub(n),
            s.len() + n,
        ))
    }
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use crate::input::Input;
|
use crate::input::{str::put_back_in_str, Input};
|
||||||
|
|
||||||
use super::StrInput;
|
use super::StrInput;
|
||||||
|
|
||||||
|
@ -248,4 +342,13 @@ mod test {
|
||||||
assert!(input.next_is_document_end());
|
assert!(input.next_is_document_end());
|
||||||
assert!(input.next_is_document_indicator());
|
assert!(input.next_is_document_indicator());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Mirrors the example from the `put_back_in_str` documentation: putting back the character
/// that was sliced off must yield the original string, at the original address.
#[test]
pub fn put_back_in_str_example() {
    let whole = "foo";
    let tail = &whole[1..];
    // OK, 'f' is the character immediately preceding `tail`.
    let restored = put_back_in_str(tail, 'f');
    assert_eq!(whole, restored);
    assert_eq!(whole.as_ptr(), restored.as_ptr());
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,7 @@ use crate::{
|
||||||
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
|
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz,
|
||||||
is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
|
is_digit, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
|
||||||
},
|
},
|
||||||
input::Input,
|
input::{Input, SkipTabs},
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The encoding of the input. Currently, only UTF-8 is supported.
|
/// The encoding of the input. Currently, only UTF-8 is supported.
|
||||||
|
@ -847,37 +847,11 @@ impl<T: Input> Scanner<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Skip yaml whitespace at most up to eol. Also skips comments.
|
|
||||||
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
|
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
|
||||||
let mut encountered_tab = false;
|
let (n_bytes, result) = self.input.skip_ws_to_eol(skip_tabs);
|
||||||
let mut has_yaml_ws = false;
|
self.mark.col += n_bytes;
|
||||||
loop {
|
self.mark.index += n_bytes;
|
||||||
match self.input.look_ch() {
|
result.map_err(|msg| ScanError::new_str(self.mark, msg))
|
||||||
' ' => {
|
|
||||||
has_yaml_ws = true;
|
|
||||||
self.skip_blank();
|
|
||||||
}
|
|
||||||
'\t' if skip_tabs != SkipTabs::No => {
|
|
||||||
encountered_tab = true;
|
|
||||||
self.skip_blank();
|
|
||||||
}
|
|
||||||
// YAML comments must be preceded by whitespace.
|
|
||||||
'#' if !encountered_tab && !has_yaml_ws => {
|
|
||||||
return Err(ScanError::new_str(
|
|
||||||
self.mark,
|
|
||||||
"comments must be separated from other tokens by whitespace",
|
|
||||||
));
|
|
||||||
}
|
|
||||||
'#' => {
|
|
||||||
while !is_breakz(self.input.look_ch()) {
|
|
||||||
self.skip_non_blank();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => break,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(SkipTabs::Result(encountered_tab, has_yaml_ws))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fetch_stream_start(&mut self) {
|
fn fetch_stream_start(&mut self) {
|
||||||
|
@ -2544,40 +2518,6 @@ impl<T: Input> Scanner<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Behavior to adopt regarding treating tabs as whitespace.
|
|
||||||
///
|
|
||||||
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
|
|
||||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
|
||||||
enum SkipTabs {
|
|
||||||
/// Skip all tabs as whitespace.
|
|
||||||
Yes,
|
|
||||||
/// Don't skip any tab. Return from the function when encountering one.
|
|
||||||
No,
|
|
||||||
/// Return value from the function.
|
|
||||||
Result(
|
|
||||||
/// Whether tabs were encountered.
|
|
||||||
bool,
|
|
||||||
/// Whether at least 1 valid yaml whitespace has been encountered.
|
|
||||||
bool,
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SkipTabs {
|
|
||||||
/// Whether tabs were found while skipping whitespace.
|
|
||||||
///
|
|
||||||
/// This function must be called after a call to `skip_ws_to_eol`.
|
|
||||||
fn found_tabs(self) -> bool {
|
|
||||||
matches!(self, SkipTabs::Result(true, _))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Whether a valid YAML whitespace has been found in skipped-over content.
|
|
||||||
///
|
|
||||||
/// This function must be called after a call to `skip_ws_to_eol`.
|
|
||||||
fn has_valid_yaml_ws(self) -> bool {
|
|
||||||
matches!(self, SkipTabs::Result(_, true))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Chomping, how final line breaks and trailing empty lines are interpreted.
|
/// Chomping, how final line breaks and trailing empty lines are interpreted.
|
||||||
///
|
///
|
||||||
/// See YAML spec 8.1.1.2.
|
/// See YAML spec 8.1.1.2.
|
||||||
|
|
Loading…
Reference in a new issue