Improve scan_plain_scalar readability.

Take whitespace checking out of the innermost loop for performance.
This commit is contained in:
Ethiraric 2024-01-25 03:06:18 +01:00
parent f535e505a7
commit 99fb05c937
2 changed files with 89 additions and 64 deletions

View file

@ -28,7 +28,7 @@ pub(crate) fn is_blank(c: char) -> bool {
/// ///
/// `\0`, ` `, `\t`, `\n`, `\r` /// `\0`, ` `, `\t`, `\n`, `\r`
#[inline] #[inline]
pub(crate) fn is_blankz(c: char) -> bool { pub(crate) fn is_blank_or_breakz(c: char) -> bool {
is_blank(c) || is_breakz(c) is_blank(c) || is_breakz(c)
} }

View file

@ -4,8 +4,8 @@
use std::{char, collections::VecDeque, error::Error, fmt}; use std::{char, collections::VecDeque, error::Error, fmt};
use crate::char_traits::{ use crate::char_traits::{
as_hex, is_alpha, is_anchor_char, is_blank, is_blankz, is_break, is_breakz, is_digit, is_flow, as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit,
is_hex, is_tag_char, is_uri_char, is_z, is_flow, is_hex, is_tag_char, is_uri_char, is_z,
}; };
#[derive(Clone, Copy, PartialEq, Debug, Eq)] #[derive(Clone, Copy, PartialEq, Debug, Eq)]
@ -505,7 +505,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
/// Read a character from the input stream, returning it directly. /// Read a character from the input stream, returning it directly.
/// ///
/// The buffer is bypassed and `self.mark` would need to be updated manually. /// The buffer is bypassed and `self.mark` needs to be updated manually.
#[inline] #[inline]
#[must_use] #[must_use]
fn raw_read_ch(&mut self) -> char { fn raw_read_ch(&mut self) -> char {
@ -559,7 +559,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.buffer[0] == '.' self.buffer[0] == '.'
&& self.buffer[1] == '.' && self.buffer[1] == '.'
&& self.buffer[2] == '.' && self.buffer[2] == '.'
&& is_blankz(self.buffer[3]) && is_blank_or_breakz(self.buffer[3])
} }
/// Insert a token at the given position. /// Insert a token at the given position.
@ -614,7 +614,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
&& self.buffer[0] == '-' && self.buffer[0] == '-'
&& self.buffer[1] == '-' && self.buffer[1] == '-'
&& self.buffer[2] == '-' && self.buffer[2] == '-'
&& is_blankz(self.buffer[3]) && is_blank_or_breakz(self.buffer[3])
{ {
self.fetch_document_indicator(TokenType::DocumentStart)?; self.fetch_document_indicator(TokenType::DocumentStart)?;
return Ok(()); return Ok(());
@ -624,7 +624,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
&& self.buffer[0] == '.' && self.buffer[0] == '.'
&& self.buffer[1] == '.' && self.buffer[1] == '.'
&& self.buffer[2] == '.' && self.buffer[2] == '.'
&& is_blankz(self.buffer[3]) && is_blank_or_breakz(self.buffer[3])
{ {
self.fetch_document_indicator(TokenType::DocumentEnd)?; self.fetch_document_indicator(TokenType::DocumentEnd)?;
self.skip_ws_to_eol(SkipTabs::Yes)?; self.skip_ws_to_eol(SkipTabs::Yes)?;
@ -649,9 +649,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd), ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
'}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd), '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
',' => self.fetch_flow_entry(), ',' => self.fetch_flow_entry(),
'-' if is_blankz(nc) => self.fetch_block_entry(), '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
'?' if is_blankz(nc) => self.fetch_key(), '?' if is_blank_or_breakz(nc) => self.fetch_key(),
':' if is_blankz(nc) ':' if is_blank_or_breakz(nc)
|| (self.flow_level > 0 || (self.flow_level > 0
&& (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) => && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
{ {
@ -669,8 +669,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
'\'' => self.fetch_flow_scalar(true), '\'' => self.fetch_flow_scalar(true),
'"' => self.fetch_flow_scalar(false), '"' => self.fetch_flow_scalar(false),
// plain scalar // plain scalar
'-' if !is_blankz(nc) => self.fetch_plain_scalar(), '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
':' | '?' if !is_blankz(nc) && self.flow_level == 0 => self.fetch_plain_scalar(), ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
self.fetch_plain_scalar()
}
'%' | '@' | '`' => Err(ScanError::new( '%' | '@' | '`' => Err(ScanError::new(
self.mark, self.mark,
&format!("unexpected character: `{c}'"), &format!("unexpected character: `{c}'"),
@ -992,7 +994,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
)); ));
} }
if !is_blankz(self.ch()) { if !is_blank_or_breakz(self.ch()) {
return Err(ScanError::new( return Err(ScanError::new(
start_mark, start_mark,
"while scanning a directive, found unexpected non-alphabetical character", "while scanning a directive, found unexpected non-alphabetical character",
@ -1043,7 +1045,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.lookahead(1); self.lookahead(1);
if is_blankz(self.ch()) { if is_blank_or_breakz(self.ch()) {
Ok(Token(*mark, TokenType::TagDirective(handle, prefix))) Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
} else { } else {
Err(ScanError::new( Err(ScanError::new(
@ -1093,7 +1095,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
} }
if is_blankz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) { if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
// XXX: ex 7.2, an empty scalar can follow a secondary tag // XXX: ex 7.2, an empty scalar can follow a secondary tag
Ok(Token(start_mark, TokenType::Tag(handle, suffix))) Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
} else { } else {
@ -1442,7 +1444,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark); self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs(); let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
self.lookahead(2); self.lookahead(2);
if found_tabs && self.buffer[0] == '-' && is_blankz(self.buffer[1]) { if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) {
return Err(ScanError::new( return Err(ScanError::new(
self.mark, self.mark,
"'-' must be followed by a valid YAML whitespace", "'-' must be followed by a valid YAML whitespace",
@ -1819,7 +1821,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|| ((self.buffer[0] == '.') || ((self.buffer[0] == '.')
&& (self.buffer[1] == '.') && (self.buffer[1] == '.')
&& (self.buffer[2] == '.'))) && (self.buffer[2] == '.')))
&& is_blankz(self.buffer[3]) && is_blank_or_breakz(self.buffer[3])
{ {
return Err(ScanError::new( return Err(ScanError::new(
start_mark, start_mark,
@ -1953,7 +1955,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
start_mark: &Marker, start_mark: &Marker,
) -> Result<(), ScanError> { ) -> Result<(), ScanError> {
self.lookahead(2); self.lookahead(2);
while !is_blankz(self.ch()) { while !is_blank_or_breakz(self.ch()) {
match self.ch() { match self.ch() {
// Check for an escaped single quote. // Check for an escaped single quote.
'\'' if self.buffer[1] == '\'' && single => { '\'' if self.buffer[1] == '\'' && single => {
@ -2069,6 +2071,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
Ok(()) Ok(())
} }
/// Scan for a plain scalar.
///
/// Plain scalars are the most readable but restricted style. They may span multiple lines in
/// some contexts.
#[allow(clippy::too_many_lines)] #[allow(clippy::too_many_lines)]
fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> { fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
self.unroll_non_block_indents(); self.unroll_non_block_indents();
@ -2086,7 +2092,6 @@ impl<T: Iterator<Item = char>> Scanner<T> {
let mut leading_break = String::new(); let mut leading_break = String::new();
let mut trailing_breaks = String::new(); let mut trailing_breaks = String::new();
let mut whitespaces = String::new(); let mut whitespaces = String::new();
let mut leading_blanks = true;
loop { loop {
/* Check for a document indicator. */ /* Check for a document indicator. */
@ -2096,7 +2101,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|| ((self.buffer[0] == '.') || ((self.buffer[0] == '.')
&& (self.buffer[1] == '.') && (self.buffer[1] == '.')
&& (self.buffer[2] == '.'))) && (self.buffer[2] == '.')))
&& is_blankz(self.buffer[3]) && is_blank_or_breakz(self.buffer[3])
{ {
break; break;
} }
@ -2112,20 +2117,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
)); ));
} }
while !is_blankz(self.ch()) { if !is_blank_or_breakz(self.ch())
// indicators can end a plain scalar, see 7.3.3. Plain Style && self.next_can_be_plain_scalar()
match self.ch() { && (self.leading_whitespace || !whitespaces.is_empty())
':' if is_blankz(self.buffer[1])
|| (self.flow_level > 0 && is_flow(self.buffer[1])) =>
{ {
break; if self.leading_whitespace {
}
c if is_flow(c) && self.flow_level > 0 => break,
_ => {}
}
if leading_blanks || !whitespaces.is_empty() {
if leading_blanks {
if leading_break.is_empty() { if leading_break.is_empty() {
string.push_str(&leading_break); string.push_str(&leading_break);
string.push_str(&trailing_breaks); string.push_str(&trailing_breaks);
@ -2140,50 +2136,60 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
leading_break.clear(); leading_break.clear();
} }
leading_blanks = false; self.leading_whitespace = false;
} else { } else {
string.push_str(&whitespaces); string.push_str(&whitespaces);
whitespaces.clear(); whitespaces.clear();
} }
} }
// Add content non-blank characters to the scalar.
while !is_blank_or_breakz(self.ch()) {
if !self.next_can_be_plain_scalar() {
break;
}
string.push(self.ch()); string.push(self.ch());
self.skip_non_blank(); self.skip_non_blank();
self.lookahead(2); self.lookahead(2);
} }
// is the end?
// We may reach the end of a plain scalar if:
// - We reach eof
// - We reach ": "
// - We find a flow character in a flow context
if !(is_blank(self.ch()) || is_break(self.ch())) { if !(is_blank(self.ch()) || is_break(self.ch())) {
break; break;
} }
// Process blank characters.
while is_blank(self.look_ch()) || is_break(self.ch()) { while is_blank(self.look_ch()) || is_break(self.ch()) {
if is_blank(self.ch()) { if is_blank(self.ch()) {
if leading_blanks && (self.mark.col as isize) < indent && self.ch() == '\t' { if !self.leading_whitespace {
// If our line contains only whitespace, this is not an error. whitespaces.push(self.ch());
// Skip over it. self.skip_blank();
} else if (self.mark.col as isize) < indent && self.ch() == '\t' {
// Tabs in indentation columns are allowed if and only if the line is
// empty. Skip to the end of the line.
self.skip_ws_to_eol(SkipTabs::Yes)?; self.skip_ws_to_eol(SkipTabs::Yes)?;
if is_breakz(self.ch()) { if !is_breakz(self.ch()) {
continue;
}
return Err(ScanError::new( return Err(ScanError::new(
start_mark, start_mark,
"while scanning a plain scalar, found a tab", "while scanning a plain scalar, found a tab",
)); ));
} }
} else {
if !leading_blanks {
whitespaces.push(self.ch());
}
self.skip_blank(); self.skip_blank();
}
} else { } else {
self.lookahead(2); self.lookahead(2);
// Check if it is a first line break // Check if it is a first line break
if leading_blanks { if self.leading_whitespace {
self.read_break(&mut trailing_breaks); self.read_break(&mut trailing_breaks);
} else { } else {
whitespaces.clear(); whitespaces.clear();
self.read_break(&mut leading_break); self.read_break(&mut leading_break);
leading_blanks = true; self.leading_whitespace = true;
} }
} }
} }
@ -2194,7 +2200,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
} }
} }
if leading_blanks { if self.leading_whitespace {
self.allow_simple_key(); self.allow_simple_key();
} }
@ -2432,6 +2438,25 @@ impl<T: Iterator<Item = char>> Scanner<T> {
Ok(()) Ok(())
} }
/// Tell whether the character under the cursor may continue a plain scalar.
///
/// The caller is expected to have already ruled out blank / breakz characters (see
/// `is_blank_or_breakz`); this function does not check for them itself.
// For some reason, `#[inline]` is not enough.
#[allow(clippy::inline_always)]
#[inline(always)]
fn next_can_be_plain_scalar(&self) -> bool {
    let current = self.ch();
    let in_flow = self.flow_level > 0;

    // Indicators can end a plain scalar, see 7.3.3. Plain Style.
    // A ':' ends the scalar when the character after it is a blank / breakz, or a flow
    // indicator while inside a flow context.
    if current == ':' {
        let following = self.buffer[1];
        if is_blank_or_breakz(following) || (in_flow && is_flow(following)) {
            return false;
        }
    }

    // Inside a flow context, a flow indicator also ends the scalar.
    !(in_flow && is_flow(current))
}
/// Return whether the scanner is inside a block but outside of a flow sequence. /// Return whether the scanner is inside a block but outside of a flow sequence.
fn is_within_block(&self) -> bool { fn is_within_block(&self) -> bool {
!self.indents.is_empty() !self.indents.is_empty()