Improve scan_plain_scalar readability.
Take whitespace checking out of the innermost loop for performance.
This commit is contained in:
parent
28893c4567
commit
4fee65f27a
2 changed files with 89 additions and 64 deletions
|
@ -28,7 +28,7 @@ pub(crate) fn is_blank(c: char) -> bool {
|
||||||
///
|
///
|
||||||
/// `\0`, ` `, `\t`, `\n`, `\r`
|
/// `\0`, ` `, `\t`, `\n`, `\r`
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn is_blankz(c: char) -> bool {
|
pub(crate) fn is_blank_or_breakz(c: char) -> bool {
|
||||||
is_blank(c) || is_breakz(c)
|
is_blank(c) || is_breakz(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
use std::{char, collections::VecDeque, error::Error, fmt};
|
use std::{char, collections::VecDeque, error::Error, fmt};
|
||||||
|
|
||||||
use crate::char_traits::{
|
use crate::char_traits::{
|
||||||
as_hex, is_alpha, is_anchor_char, is_blank, is_blankz, is_break, is_breakz, is_digit, is_flow,
|
as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit,
|
||||||
is_hex, is_tag_char, is_uri_char, is_z,
|
is_flow, is_hex, is_tag_char, is_uri_char, is_z,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
|
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
|
||||||
|
@ -505,7 +505,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
|
|
||||||
/// Read a character from the input stream, returning it directly.
|
/// Read a character from the input stream, returning it directly.
|
||||||
///
|
///
|
||||||
/// The buffer is bypassed and `self.mark` would need to be updated manually.
|
/// The buffer is bypassed and `self.mark` needs to be updated manually.
|
||||||
#[inline]
|
#[inline]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
fn raw_read_ch(&mut self) -> char {
|
fn raw_read_ch(&mut self) -> char {
|
||||||
|
@ -559,7 +559,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
self.buffer[0] == '.'
|
self.buffer[0] == '.'
|
||||||
&& self.buffer[1] == '.'
|
&& self.buffer[1] == '.'
|
||||||
&& self.buffer[2] == '.'
|
&& self.buffer[2] == '.'
|
||||||
&& is_blankz(self.buffer[3])
|
&& is_blank_or_breakz(self.buffer[3])
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Insert a token at the given position.
|
/// Insert a token at the given position.
|
||||||
|
@ -614,7 +614,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
&& self.buffer[0] == '-'
|
&& self.buffer[0] == '-'
|
||||||
&& self.buffer[1] == '-'
|
&& self.buffer[1] == '-'
|
||||||
&& self.buffer[2] == '-'
|
&& self.buffer[2] == '-'
|
||||||
&& is_blankz(self.buffer[3])
|
&& is_blank_or_breakz(self.buffer[3])
|
||||||
{
|
{
|
||||||
self.fetch_document_indicator(TokenType::DocumentStart)?;
|
self.fetch_document_indicator(TokenType::DocumentStart)?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
|
@ -624,7 +624,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
&& self.buffer[0] == '.'
|
&& self.buffer[0] == '.'
|
||||||
&& self.buffer[1] == '.'
|
&& self.buffer[1] == '.'
|
||||||
&& self.buffer[2] == '.'
|
&& self.buffer[2] == '.'
|
||||||
&& is_blankz(self.buffer[3])
|
&& is_blank_or_breakz(self.buffer[3])
|
||||||
{
|
{
|
||||||
self.fetch_document_indicator(TokenType::DocumentEnd)?;
|
self.fetch_document_indicator(TokenType::DocumentEnd)?;
|
||||||
self.skip_ws_to_eol(SkipTabs::Yes)?;
|
self.skip_ws_to_eol(SkipTabs::Yes)?;
|
||||||
|
@ -649,9 +649,9 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
|
']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
|
||||||
'}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
|
'}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
|
||||||
',' => self.fetch_flow_entry(),
|
',' => self.fetch_flow_entry(),
|
||||||
'-' if is_blankz(nc) => self.fetch_block_entry(),
|
'-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
|
||||||
'?' if is_blankz(nc) => self.fetch_key(),
|
'?' if is_blank_or_breakz(nc) => self.fetch_key(),
|
||||||
':' if is_blankz(nc)
|
':' if is_blank_or_breakz(nc)
|
||||||
|| (self.flow_level > 0
|
|| (self.flow_level > 0
|
||||||
&& (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
|
&& (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
|
||||||
{
|
{
|
||||||
|
@ -669,8 +669,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
'\'' => self.fetch_flow_scalar(true),
|
'\'' => self.fetch_flow_scalar(true),
|
||||||
'"' => self.fetch_flow_scalar(false),
|
'"' => self.fetch_flow_scalar(false),
|
||||||
// plain scalar
|
// plain scalar
|
||||||
'-' if !is_blankz(nc) => self.fetch_plain_scalar(),
|
'-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
|
||||||
':' | '?' if !is_blankz(nc) && self.flow_level == 0 => self.fetch_plain_scalar(),
|
':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
|
||||||
|
self.fetch_plain_scalar()
|
||||||
|
}
|
||||||
'%' | '@' | '`' => Err(ScanError::new(
|
'%' | '@' | '`' => Err(ScanError::new(
|
||||||
self.mark,
|
self.mark,
|
||||||
&format!("unexpected character: `{c}'"),
|
&format!("unexpected character: `{c}'"),
|
||||||
|
@ -992,7 +994,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if !is_blankz(self.ch()) {
|
if !is_blank_or_breakz(self.ch()) {
|
||||||
return Err(ScanError::new(
|
return Err(ScanError::new(
|
||||||
start_mark,
|
start_mark,
|
||||||
"while scanning a directive, found unexpected non-alphabetical character",
|
"while scanning a directive, found unexpected non-alphabetical character",
|
||||||
|
@ -1043,7 +1045,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
|
|
||||||
self.lookahead(1);
|
self.lookahead(1);
|
||||||
|
|
||||||
if is_blankz(self.ch()) {
|
if is_blank_or_breakz(self.ch()) {
|
||||||
Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
|
Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
|
||||||
} else {
|
} else {
|
||||||
Err(ScanError::new(
|
Err(ScanError::new(
|
||||||
|
@ -1093,7 +1095,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if is_blankz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
|
if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
|
||||||
// XXX: ex 7.2, an empty scalar can follow a secondary tag
|
// XXX: ex 7.2, an empty scalar can follow a secondary tag
|
||||||
Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
|
Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
|
||||||
} else {
|
} else {
|
||||||
|
@ -1442,7 +1444,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
|
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
|
||||||
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
|
let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
|
||||||
self.lookahead(2);
|
self.lookahead(2);
|
||||||
if found_tabs && self.buffer[0] == '-' && is_blankz(self.buffer[1]) {
|
if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) {
|
||||||
return Err(ScanError::new(
|
return Err(ScanError::new(
|
||||||
self.mark,
|
self.mark,
|
||||||
"'-' must be followed by a valid YAML whitespace",
|
"'-' must be followed by a valid YAML whitespace",
|
||||||
|
@ -1819,7 +1821,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
|| ((self.buffer[0] == '.')
|
|| ((self.buffer[0] == '.')
|
||||||
&& (self.buffer[1] == '.')
|
&& (self.buffer[1] == '.')
|
||||||
&& (self.buffer[2] == '.')))
|
&& (self.buffer[2] == '.')))
|
||||||
&& is_blankz(self.buffer[3])
|
&& is_blank_or_breakz(self.buffer[3])
|
||||||
{
|
{
|
||||||
return Err(ScanError::new(
|
return Err(ScanError::new(
|
||||||
start_mark,
|
start_mark,
|
||||||
|
@ -1953,7 +1955,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
start_mark: &Marker,
|
start_mark: &Marker,
|
||||||
) -> Result<(), ScanError> {
|
) -> Result<(), ScanError> {
|
||||||
self.lookahead(2);
|
self.lookahead(2);
|
||||||
while !is_blankz(self.ch()) {
|
while !is_blank_or_breakz(self.ch()) {
|
||||||
match self.ch() {
|
match self.ch() {
|
||||||
// Check for an escaped single quote.
|
// Check for an escaped single quote.
|
||||||
'\'' if self.buffer[1] == '\'' && single => {
|
'\'' if self.buffer[1] == '\'' && single => {
|
||||||
|
@ -2069,6 +2071,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Scan for a plain scalar.
|
||||||
|
///
|
||||||
|
/// Plain scalars are the most readable but restricted style. They may span multiple lines in
|
||||||
|
/// some contexts.
|
||||||
#[allow(clippy::too_many_lines)]
|
#[allow(clippy::too_many_lines)]
|
||||||
fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
|
fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
|
||||||
self.unroll_non_block_indents();
|
self.unroll_non_block_indents();
|
||||||
|
@ -2086,7 +2092,6 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
let mut leading_break = String::new();
|
let mut leading_break = String::new();
|
||||||
let mut trailing_breaks = String::new();
|
let mut trailing_breaks = String::new();
|
||||||
let mut whitespaces = String::new();
|
let mut whitespaces = String::new();
|
||||||
let mut leading_blanks = true;
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
/* Check for a document indicator. */
|
/* Check for a document indicator. */
|
||||||
|
@ -2096,7 +2101,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
|| ((self.buffer[0] == '.')
|
|| ((self.buffer[0] == '.')
|
||||||
&& (self.buffer[1] == '.')
|
&& (self.buffer[1] == '.')
|
||||||
&& (self.buffer[2] == '.')))
|
&& (self.buffer[2] == '.')))
|
||||||
&& is_blankz(self.buffer[3])
|
&& is_blank_or_breakz(self.buffer[3])
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -2112,20 +2117,11 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
while !is_blankz(self.ch()) {
|
if !is_blank_or_breakz(self.ch())
|
||||||
// indicators can end a plain scalar, see 7.3.3. Plain Style
|
&& self.next_can_be_plain_scalar()
|
||||||
match self.ch() {
|
&& (self.leading_whitespace || !whitespaces.is_empty())
|
||||||
':' if is_blankz(self.buffer[1])
|
|
||||||
|| (self.flow_level > 0 && is_flow(self.buffer[1])) =>
|
|
||||||
{
|
{
|
||||||
break;
|
if self.leading_whitespace {
|
||||||
}
|
|
||||||
c if is_flow(c) && self.flow_level > 0 => break,
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
if leading_blanks || !whitespaces.is_empty() {
|
|
||||||
if leading_blanks {
|
|
||||||
if leading_break.is_empty() {
|
if leading_break.is_empty() {
|
||||||
string.push_str(&leading_break);
|
string.push_str(&leading_break);
|
||||||
string.push_str(&trailing_breaks);
|
string.push_str(&trailing_breaks);
|
||||||
|
@ -2140,50 +2136,60 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
}
|
}
|
||||||
leading_break.clear();
|
leading_break.clear();
|
||||||
}
|
}
|
||||||
leading_blanks = false;
|
self.leading_whitespace = false;
|
||||||
} else {
|
} else {
|
||||||
string.push_str(&whitespaces);
|
string.push_str(&whitespaces);
|
||||||
whitespaces.clear();
|
whitespaces.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add non-blank content characters to the scalar.
|
||||||
|
while !is_blank_or_breakz(self.ch()) {
|
||||||
|
if !self.next_can_be_plain_scalar() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
string.push(self.ch());
|
string.push(self.ch());
|
||||||
self.skip_non_blank();
|
self.skip_non_blank();
|
||||||
self.lookahead(2);
|
self.lookahead(2);
|
||||||
}
|
}
|
||||||
// is the end?
|
|
||||||
|
// We may reach the end of a plain scalar if:
|
||||||
|
// - We reach eof
|
||||||
|
// - We reach ": "
|
||||||
|
// - We find a flow character in a flow context
|
||||||
if !(is_blank(self.ch()) || is_break(self.ch())) {
|
if !(is_blank(self.ch()) || is_break(self.ch())) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Process blank characters.
|
||||||
while is_blank(self.look_ch()) || is_break(self.ch()) {
|
while is_blank(self.look_ch()) || is_break(self.ch()) {
|
||||||
if is_blank(self.ch()) {
|
if is_blank(self.ch()) {
|
||||||
if leading_blanks && (self.mark.col as isize) < indent && self.ch() == '\t' {
|
if !self.leading_whitespace {
|
||||||
// If our line contains only whitespace, this is not an error.
|
whitespaces.push(self.ch());
|
||||||
// Skip over it.
|
self.skip_blank();
|
||||||
|
} else if (self.mark.col as isize) < indent && self.ch() == '\t' {
|
||||||
|
// Tabs in indentation columns are allowed if and only if the line is
|
||||||
|
// empty. Skip to the end of the line.
|
||||||
self.skip_ws_to_eol(SkipTabs::Yes)?;
|
self.skip_ws_to_eol(SkipTabs::Yes)?;
|
||||||
if is_breakz(self.ch()) {
|
if !is_breakz(self.ch()) {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
return Err(ScanError::new(
|
return Err(ScanError::new(
|
||||||
start_mark,
|
start_mark,
|
||||||
"while scanning a plain scalar, found a tab",
|
"while scanning a plain scalar, found a tab",
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
if !leading_blanks {
|
|
||||||
whitespaces.push(self.ch());
|
|
||||||
}
|
|
||||||
self.skip_blank();
|
self.skip_blank();
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
self.lookahead(2);
|
self.lookahead(2);
|
||||||
// Check if it is a first line break
|
// Check if it is a first line break
|
||||||
if leading_blanks {
|
if self.leading_whitespace {
|
||||||
self.read_break(&mut trailing_breaks);
|
self.read_break(&mut trailing_breaks);
|
||||||
} else {
|
} else {
|
||||||
whitespaces.clear();
|
whitespaces.clear();
|
||||||
self.read_break(&mut leading_break);
|
self.read_break(&mut leading_break);
|
||||||
leading_blanks = true;
|
self.leading_whitespace = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2194,7 +2200,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if leading_blanks {
|
if self.leading_whitespace {
|
||||||
self.allow_simple_key();
|
self.allow_simple_key();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2432,6 +2438,25 @@ impl<T: Iterator<Item = char>> Scanner<T> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check whether the next characters may be part of a plain scalar.
|
||||||
|
///
|
||||||
|
/// This function assumes the current character does not satisfy `is_blank_or_breakz`.
|
||||||
|
// For some reason, `#[inline]` is not enough.
|
||||||
|
#[allow(clippy::inline_always)]
|
||||||
|
#[inline(always)]
|
||||||
|
fn next_can_be_plain_scalar(&self) -> bool {
|
||||||
|
match self.ch() {
|
||||||
|
// indicators can end a plain scalar, see 7.3.3. Plain Style
|
||||||
|
':' if is_blank_or_breakz(self.buffer[1])
|
||||||
|
|| (self.flow_level > 0 && is_flow(self.buffer[1])) =>
|
||||||
|
{
|
||||||
|
false
|
||||||
|
}
|
||||||
|
c if self.flow_level > 0 && is_flow(c) => false,
|
||||||
|
_ => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Return whether the scanner is inside a block but outside of a flow sequence.
|
/// Return whether the scanner is inside a block but outside of a flow sequence.
|
||||||
fn is_within_block(&self) -> bool {
|
fn is_within_block(&self) -> bool {
|
||||||
!self.indents.is_empty()
|
!self.indents.is_empty()
|
||||||
|
|
Loading…
Reference in a new issue