// saphyr-serde/parser/src/scanner.rs
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::cast_sign_loss)]
2015-05-24 06:27:42 +00:00
use std::collections::VecDeque;
use std::error::Error;
2018-09-15 16:49:04 +00:00
use std::{char, fmt};
/// Character encoding of the scanned stream.
///
/// UTF-8 is the only encoding; it is what the scanner reports in its stream-start token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// UTF-8.
    Utf8,
}
/// The style attached to a scalar token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TScalarStyle {
    Any,
    Plain,
    SingleQuoted,
    DoubleQuoted,

    Literal,
    // NOTE(review): `Foled` looks like a misspelling of `Folded`. It is part of the public API,
    // so renaming it would need a deprecation path; it is left untouched here.
    Foled,
}
/// A location in a yaml document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Marker {
    /// Offset from the start of the source. The scanner advances it by one for every `char`
    /// it consumes, so it counts characters, not bytes.
    index: usize,
    /// Line number. The scanner starts at line 1.
    line: usize,
    /// Column number. The scanner starts at column 0.
    col: usize,
}

impl Marker {
    /// Build a marker from its three raw components.
    fn new(index: usize, line: usize, col: usize) -> Self {
        Self { index, line, col }
    }

    /// Return the index of the marker in the source.
    ///
    /// The scanner increments this once per consumed `char`, so it is a character offset.
    #[must_use]
    pub fn index(&self) -> usize {
        let Self { index, .. } = *self;
        index
    }

    /// Return the line of the marker in the source.
    #[must_use]
    pub fn line(&self) -> usize {
        let Self { line, .. } = *self;
        line
    }

    /// Return the column of the marker in the source.
    #[must_use]
    pub fn col(&self) -> usize {
        let Self { col, .. } = *self;
        col
    }
}
/// An error that occured while scanning.
2015-05-24 06:27:42 +00:00
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct ScanError {
mark: Marker,
info: String,
}
impl ScanError {
/// Create a new error from a location and an error string.
2023-08-11 23:54:46 +00:00
#[must_use]
2015-05-24 06:27:42 +00:00
pub fn new(loc: Marker, info: &str) -> ScanError {
ScanError {
mark: loc,
2018-09-15 16:49:04 +00:00
info: info.to_owned(),
2015-05-24 06:27:42 +00:00
}
}
2017-11-15 01:36:16 +00:00
/// Return the marker pointing to the error in the source.
2023-08-11 23:54:46 +00:00
#[must_use]
2017-11-15 03:41:39 +00:00
pub fn marker(&self) -> &Marker {
&self.mark
2017-11-15 01:36:16 +00:00
}
/// Return the information string describing the error that happened.
#[must_use]
pub fn info(&self) -> &str {
self.info.as_ref()
}
2015-05-24 06:27:42 +00:00
}
impl Error for ScanError {
fn description(&self) -> &str {
self.info.as_ref()
}
2020-05-27 06:15:28 +00:00
fn cause(&self) -> Option<&dyn Error> {
None
}
}
impl fmt::Display for ScanError {
// col starts from 0
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
2018-09-15 16:49:04 +00:00
write!(
formatter,
"{} at line {} column {}",
self.info,
self.mark.line,
self.mark.col + 1
)
}
}
2015-05-24 06:27:42 +00:00
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType {
NoToken,
2023-11-19 00:09:41 +00:00
/// The start of the stream. Sent first, before even [`DocumentStart`].
StreamStart(TEncoding),
2023-11-19 00:09:41 +00:00
/// The end of the stream, EOF.
StreamEnd,
2023-11-19 00:09:41 +00:00
VersionDirective(
/// Major
u32,
/// Minor
u32,
),
TagDirective(
/// Handle
String,
/// Prefix
String,
),
/// The start of a YAML document (`---`).
DocumentStart,
2023-11-19 00:09:41 +00:00
/// The end of a YAML document (`...`).
DocumentEnd,
BlockSequenceStart,
BlockMappingStart,
BlockEnd,
2023-11-19 00:09:41 +00:00
/// Start of an inline array (`[ a, b ]`).
FlowSequenceStart,
2023-11-19 00:09:41 +00:00
/// End of an inline array.
FlowSequenceEnd,
2023-11-19 00:09:41 +00:00
/// Start of an inline mapping (`{ a: b, c: d }`).
FlowMappingStart,
2023-11-19 00:09:41 +00:00
/// End of an inline mapping.
FlowMappingEnd,
BlockEntry,
FlowEntry,
Key,
Value,
Alias(String),
2023-11-19 00:09:41 +00:00
/// A YAML anchor (`&`/`*`).
Anchor(String),
2015-05-28 18:57:41 +00:00
/// handle, suffix
Tag(String, String),
2018-09-15 16:49:04 +00:00
Scalar(TScalarStyle, String),
2015-05-24 06:27:42 +00:00
}
/// A scanned token: the position it was recorded at, followed by its kind.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token(pub Marker, pub TokenType);
#[derive(Clone, PartialEq, Debug, Eq)]
struct SimpleKey {
possible: bool,
required: bool,
token_number: usize,
mark: Marker,
}
impl SimpleKey {
fn new(mark: Marker) -> SimpleKey {
SimpleKey {
possible: false,
required: false,
token_number: 0,
2018-09-15 17:03:55 +00:00
mark,
2015-05-24 06:27:42 +00:00
}
}
}
#[derive(Debug)]
2023-08-11 23:54:46 +00:00
#[allow(clippy::struct_excessive_bools)]
2015-05-24 06:27:42 +00:00
pub struct Scanner<T> {
rdr: T,
mark: Marker,
tokens: VecDeque<Token>,
buffer: VecDeque<char>,
2015-05-28 14:07:59 +00:00
error: Option<ScanError>,
2015-05-24 06:27:42 +00:00
stream_start_produced: bool,
stream_end_produced: bool,
adjacent_value_allowed_at: usize,
2015-05-24 06:27:42 +00:00
simple_key_allowed: bool,
simple_keys: Vec<SimpleKey>,
indent: isize,
indents: Vec<isize>,
2018-09-15 10:33:26 +00:00
flow_level: u8,
2015-05-24 06:27:42 +00:00
tokens_parsed: usize,
token_available: bool,
2023-12-20 22:14:22 +00:00
/// Whether all characters encountered since the last newline were whitespace.
leading_whitespace: bool,
2015-05-24 06:27:42 +00:00
}
2018-09-15 16:49:04 +00:00
impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
2015-05-24 06:27:42 +00:00
type Item = Token;
fn next(&mut self) -> Option<Token> {
2015-05-28 14:07:59 +00:00
if self.error.is_some() {
return None;
}
2015-05-24 06:27:42 +00:00
match self.next_token() {
Ok(tok) => tok,
Err(e) => {
2015-05-28 14:07:59 +00:00
self.error = Some(e);
2015-05-24 06:27:42 +00:00
None
}
}
}
}
/// Check whether the character is nil (`\0`).
#[inline]
fn is_z(c: char) -> bool {
    matches!(c, '\0')
}
/// Check whether the character is a line break (`\r` or `\n`).
#[inline]
fn is_break(c: char) -> bool {
    matches!(c, '\r' | '\n')
}
2023-11-19 18:13:01 +00:00
/// Check whether the character is nil or a line break (`\0`, `\r`, `\n`).
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn is_breakz(c: char) -> bool {
is_break(c) || is_z(c)
}
/// Check whether the character is a whitespace (` ` or `\t`).
#[inline]
fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
2023-11-19 18:13:01 +00:00
/// Check whether the character is nil or a whitespace (`\0`, ` `, `\t`).
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn is_blankz(c: char) -> bool {
is_blank(c) || is_breakz(c)
}
/// Check whether the character is an ascii digit.
#[inline]
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}
/// Check whether the character is a digit, letter, `_` or `-`.
///
/// Only ASCII digits and letters qualify.
#[inline]
fn is_alpha(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '-'
}
/// Check whether the character is a hexadecimal character (case insensitive).
#[inline]
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
/// Convert the hexadecimal digit to an integer.
///
/// # Panics
/// If `c` is not one of `0-9`, `a-f` or `A-F`.
#[inline]
fn as_hex(c: char) -> u32 {
    c.to_digit(16).unwrap_or_else(|| unreachable!())
}
/// Check whether the character is a YAML flow character (one of `,[]{}`).
#[inline]
fn is_flow(c: char) -> bool {
    [',', '[', ']', '{', '}'].contains(&c)
}
2015-05-24 06:27:42 +00:00
pub type ScanResult = Result<(), ScanError>;
2018-09-15 16:49:04 +00:00
impl<T: Iterator<Item = char>> Scanner<T> {
2015-05-24 06:27:42 +00:00
/// Creates the YAML tokenizer.
pub fn new(rdr: T) -> Scanner<T> {
2015-05-24 06:37:36 +00:00
Scanner {
2018-09-15 17:03:55 +00:00
rdr,
2015-05-24 06:27:42 +00:00
buffer: VecDeque::new(),
mark: Marker::new(0, 1, 0),
tokens: VecDeque::new(),
2015-05-28 14:07:59 +00:00
error: None,
2015-05-24 06:27:42 +00:00
stream_start_produced: false,
stream_end_produced: false,
adjacent_value_allowed_at: 0,
2015-05-24 06:27:42 +00:00
simple_key_allowed: true,
simple_keys: Vec::new(),
indent: -1,
indents: Vec::new(),
flow_level: 0,
tokens_parsed: 0,
token_available: false,
2023-12-20 22:14:22 +00:00
leading_whitespace: true,
2015-05-24 06:37:36 +00:00
}
2015-05-24 06:27:42 +00:00
}
2015-05-28 14:07:59 +00:00
#[inline]
pub fn get_error(&self) -> Option<ScanError> {
2023-08-11 23:54:46 +00:00
self.error.as_ref().map(std::clone::Clone::clone)
2015-05-28 14:07:59 +00:00
}
2015-05-24 06:27:42 +00:00
/// Fill `self.buffer` with at least `count` characters.
///
/// The characters that are extracted this way are not consumed but only placed in the buffer.
#[inline]
2015-05-24 06:38:54 +00:00
fn lookahead(&mut self, count: usize) {
2015-05-24 06:27:42 +00:00
if self.buffer.len() >= count {
return;
}
2015-05-24 06:37:36 +00:00
for _ in 0..(count - self.buffer.len()) {
2015-05-24 06:27:42 +00:00
self.buffer.push_back(self.rdr.next().unwrap_or('\0'));
}
}
2023-12-20 22:14:22 +00:00
/// Consume the next character. Remove from buffer and update mark.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn skip(&mut self) {
let c = self.buffer.pop_front().unwrap();
self.mark.index += 1;
if c == '\n' {
2023-12-20 22:14:22 +00:00
self.leading_whitespace = true;
2015-05-24 06:27:42 +00:00
self.mark.line += 1;
self.mark.col = 0;
} else {
2023-12-20 22:14:22 +00:00
// TODO(ethiraric, 20/12/2023): change to `self.leading_whitespace &= is_blank(c)`?
if self.leading_whitespace && !is_blank(c) {
self.leading_whitespace = false;
}
2015-05-24 06:27:42 +00:00
self.mark.col += 1;
}
}
2023-12-20 22:14:22 +00:00
/// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
2015-05-24 19:21:53 +00:00
#[inline]
fn skip_line(&mut self) {
if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
self.skip();
self.skip();
} else if is_break(self.buffer[0]) {
self.skip();
}
}
/// Return the next character in the buffer.
///
/// The character is not consumed.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn ch(&self) -> char {
self.buffer[0]
}
/// Look for the next character and return it.
///
/// The character is not consumed.
/// Equivalent to calling [`Self::lookahead`] and [`Self::ch`].
#[inline]
fn look_ch(&mut self) -> char {
    self.lookahead(1);
    self.buffer[0]
}
/// Consume and return the next character.
///
/// Equivalent to calling [`Self::ch`] and [`Self::skip`].
#[inline]
fn ch_skip(&mut self) -> char {
    let consumed = self.buffer[0];
    self.skip();
    consumed
}
/// Return whether the next character is `c`.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn ch_is(&self, c: char) -> bool {
self.buffer[0] == c
}
2015-05-24 06:37:36 +00:00
#[allow(dead_code)]
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn eof(&self) -> bool {
self.ch_is('\0')
}
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
pub fn stream_started(&self) -> bool {
self.stream_start_produced
}
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
pub fn stream_ended(&self) -> bool {
self.stream_end_produced
}
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
pub fn mark(&self) -> Marker {
self.mark
}
2023-11-19 16:08:28 +00:00
// Read and consume a line break (either `\r`, `\n` or `\r\n`).
//
// A `\n` is pushed into `s`.
//
// # Panics
// If the next characters do not correspond to a line break.
2015-05-24 19:21:53 +00:00
#[inline]
2015-05-24 06:27:42 +00:00
fn read_break(&mut self, s: &mut String) {
if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
s.push('\n');
self.skip();
self.skip();
} else if self.buffer[0] == '\r' || self.buffer[0] == '\n' {
s.push('\n');
self.skip();
} else {
unreachable!();
}
}
2015-05-24 06:27:42 +00:00
fn insert_token(&mut self, pos: usize, tok: Token) {
let old_len = self.tokens.len();
assert!(pos <= old_len);
self.tokens.push_back(tok);
for i in 0..old_len - pos {
self.tokens.swap(old_len - i, old_len - i - 1);
}
}
2015-05-24 06:27:42 +00:00
fn allow_simple_key(&mut self) {
2018-09-15 16:49:04 +00:00
self.simple_key_allowed = true;
2015-05-24 06:27:42 +00:00
}
2015-05-24 06:27:42 +00:00
fn disallow_simple_key(&mut self) {
2018-09-15 16:49:04 +00:00
self.simple_key_allowed = false;
2015-05-24 06:27:42 +00:00
}
pub fn fetch_next_token(&mut self) -> ScanResult {
2015-05-24 06:38:54 +00:00
self.lookahead(1);
2015-05-24 06:27:42 +00:00
// println!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
if !self.stream_start_produced {
self.fetch_stream_start();
return Ok(());
}
2023-12-20 22:14:22 +00:00
self.skip_to_next_token()?;
2015-05-24 06:27:42 +00:00
2018-09-16 06:58:48 +00:00
self.stale_simple_keys()?;
2015-05-24 06:27:42 +00:00
let mark = self.mark;
self.unroll_indent(mark.col as isize);
2015-05-24 06:38:54 +00:00
self.lookahead(4);
2015-05-24 06:27:42 +00:00
if is_z(self.ch()) {
2018-09-16 06:58:48 +00:00
self.fetch_stream_end()?;
2015-05-24 06:27:42 +00:00
return Ok(());
}
2015-05-28 14:07:59 +00:00
// Is it a directive?
2015-05-24 06:27:42 +00:00
if self.mark.col == 0 && self.ch_is('%') {
2015-05-28 14:07:59 +00:00
return self.fetch_directive();
2015-05-24 06:27:42 +00:00
}
if self.mark.col == 0
&& self.buffer[0] == '-'
&& self.buffer[1] == '-'
&& self.buffer[2] == '-'
2018-09-15 16:49:04 +00:00
&& is_blankz(self.buffer[3])
{
2018-09-16 06:58:48 +00:00
self.fetch_document_indicator(TokenType::DocumentStart)?;
2015-05-24 06:27:42 +00:00
return Ok(());
}
if self.mark.col == 0
&& self.buffer[0] == '.'
&& self.buffer[1] == '.'
&& self.buffer[2] == '.'
2018-09-15 16:49:04 +00:00
&& is_blankz(self.buffer[3])
{
2018-09-16 06:58:48 +00:00
self.fetch_document_indicator(TokenType::DocumentEnd)?;
2015-05-24 06:27:42 +00:00
return Ok(());
}
let c = self.buffer[0];
let nc = self.buffer[1];
match c {
'[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
'{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
'}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
2015-05-28 17:56:03 +00:00
',' => self.fetch_flow_entry(),
'-' if is_blankz(nc) => self.fetch_block_entry(),
'?' if is_blankz(nc) => self.fetch_key(),
':' if is_blankz(nc)
|| (self.flow_level > 0
&& (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
{
self.fetch_value()
}
2015-05-28 17:56:03 +00:00
// Is it an alias?
'*' => self.fetch_anchor(true),
// Is it an anchor?
'&' => self.fetch_anchor(false),
'!' => self.fetch_tag(),
2015-05-26 16:29:40 +00:00
// Is it a literal scalar?
2015-05-28 17:56:03 +00:00
'|' if self.flow_level == 0 => self.fetch_block_scalar(true),
2015-05-26 16:29:40 +00:00
// Is it a folded scalar?
2015-05-28 17:56:03 +00:00
'>' if self.flow_level == 0 => self.fetch_block_scalar(false),
'\'' => self.fetch_flow_scalar(true),
'"' => self.fetch_flow_scalar(false),
2015-05-24 06:27:42 +00:00
// plain scalar
2015-05-28 17:56:03 +00:00
'-' if !is_blankz(nc) => self.fetch_plain_scalar(),
':' | '?' if !is_blankz(nc) && self.flow_level == 0 => self.fetch_plain_scalar(),
2018-09-15 16:49:04 +00:00
'%' | '@' | '`' => Err(ScanError::new(
self.mark,
2023-08-11 23:54:46 +00:00
&format!("unexpected character: `{c}'"),
2018-09-15 16:49:04 +00:00
)),
2015-05-28 17:56:03 +00:00
_ => self.fetch_plain_scalar(),
2015-05-24 06:27:42 +00:00
}
}
/// Return the next token, scanning more input if needed.
///
/// Returns `Ok(None)` once the stream-end token has been returned.
///
/// # Errors
/// Propagates any error raised while fetching more tokens.
///
/// # Panics
/// If no token is queued after fetching.
pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
    if self.stream_end_produced {
        return Ok(None);
    }

    if !self.token_available {
        self.fetch_more_tokens()?;
    }

    let token = self.tokens.pop_front().unwrap();
    self.token_available = false;
    self.tokens_parsed += 1;

    if matches!(token.1, TokenType::StreamEnd) {
        self.stream_end_produced = true;
    }
    Ok(Some(token))
}
/// Fetch tokens until the front of the queue can safely be returned.
///
/// More input is needed while the queue is empty, or while a still-possible simple key is
/// attached to the very next token to be returned.
///
/// # Errors
/// Propagates errors from expiring simple keys and from fetching tokens.
pub fn fetch_more_tokens(&mut self) -> ScanResult {
    loop {
        let need_more = if self.tokens.is_empty() {
            true
        } else {
            self.stale_simple_keys()?;
            let next_number = self.tokens_parsed;
            self.simple_keys
                .iter()
                .any(|key| key.possible && key.token_number == next_number)
        };

        if !need_more {
            break;
        }
        self.fetch_next_token()?;
    }

    self.token_available = true;
    Ok(())
}
fn stale_simple_keys(&mut self) -> ScanResult {
for sk in &mut self.simple_keys {
2018-09-15 16:49:04 +00:00
if sk.possible
&& (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
{
if sk.required {
return Err(ScanError::new(self.mark, "simple key expect ':'"));
2015-05-24 06:27:42 +00:00
}
2018-09-15 16:49:04 +00:00
sk.possible = false;
}
2015-05-24 06:27:42 +00:00
}
Ok(())
}
2023-12-20 22:14:22 +00:00
fn skip_to_next_token(&mut self) -> ScanResult {
2015-05-24 06:27:42 +00:00
loop {
// TODO(chenyh) BOM
2023-12-20 22:14:22 +00:00
match self.look_ch() {
2015-05-24 06:27:42 +00:00
' ' => self.skip(),
2023-12-20 22:14:22 +00:00
// Tabs may not be used as indentation.
// "Indentation" only exists as long as a block is started, but does not exist
// inside of flow-style constructs. Tabs are allowed as part of leaading
// whitespaces outside of indentation.
'\t' if self.is_within_block() && self.leading_whitespace => {
return Err(ScanError::new(
self.mark,
"tabs disallowed within this context (block indentation)",
))
}
2015-05-24 06:27:42 +00:00
'\t' if self.flow_level > 0 || !self.simple_key_allowed => self.skip(),
'\n' | '\r' => {
2015-05-24 19:21:53 +00:00
self.lookahead(2);
self.skip_line();
2015-05-24 06:27:42 +00:00
if self.flow_level == 0 {
self.allow_simple_key();
}
2018-09-15 16:49:04 +00:00
}
2018-12-13 07:35:01 +00:00
'#' => {
while !is_breakz(self.ch()) {
self.skip();
self.lookahead(1);
}
}
2018-09-15 16:49:04 +00:00
_ => break,
2015-05-24 06:27:42 +00:00
}
}
2023-12-20 22:14:22 +00:00
Ok(())
2015-05-24 06:27:42 +00:00
}
fn fetch_stream_start(&mut self) {
let mark = self.mark;
self.indent = -1;
self.stream_start_produced = true;
self.allow_simple_key();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(mark, TokenType::StreamStart(TEncoding::Utf8)));
self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
2015-05-24 06:27:42 +00:00
}
fn fetch_stream_end(&mut self) -> ScanResult {
// force new line
if self.mark.col != 0 {
self.mark.col = 0;
self.mark.line += 1;
}
self.unroll_indent(-1);
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.disallow_simple_key();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(self.mark, TokenType::StreamEnd));
2015-05-24 06:27:42 +00:00
Ok(())
}
2015-05-28 14:07:59 +00:00
fn fetch_directive(&mut self) -> ScanResult {
self.unroll_indent(-1);
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-28 14:07:59 +00:00
self.disallow_simple_key();
2018-09-16 06:58:48 +00:00
let tok = self.scan_directive()?;
2015-05-28 14:07:59 +00:00
self.tokens.push_back(tok);
Ok(())
}
fn scan_directive(&mut self) -> Result<Token, ScanError> {
let start_mark = self.mark;
self.skip();
2018-09-16 06:58:48 +00:00
let name = self.scan_directive_name()?;
2015-05-28 14:07:59 +00:00
let tok = match name.as_ref() {
2018-09-16 06:58:48 +00:00
"YAML" => self.scan_version_directive_value(&start_mark)?,
"TAG" => self.scan_tag_directive_value(&start_mark)?,
2015-05-28 18:57:41 +00:00
// XXX This should be a warning instead of an error
2015-05-30 10:49:54 +00:00
_ => {
// skip current line
self.lookahead(1);
while !is_breakz(self.ch()) {
self.skip();
self.lookahead(1);
}
// XXX return an empty TagDirective token
2018-09-15 16:49:04 +00:00
Token(
start_mark,
TokenType::TagDirective(String::new(), String::new()),
)
2015-05-30 10:49:54 +00:00
// return Err(ScanError::new(start_mark,
// "while scanning a directive, found unknown directive name"))
}
2015-05-28 14:07:59 +00:00
};
self.lookahead(1);
while is_blank(self.ch()) {
self.skip();
self.lookahead(1);
}
if self.ch() == '#' {
while !is_breakz(self.ch()) {
self.skip();
self.lookahead(1);
}
}
if !is_breakz(self.ch()) {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a directive, did not find expected comment or line break",
));
2015-05-28 14:07:59 +00:00
}
// Eat a line break
if is_break(self.ch()) {
self.lookahead(2);
self.skip_line();
}
Ok(tok)
}
fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
self.lookahead(1);
while is_blank(self.ch()) {
self.skip();
self.lookahead(1);
}
2018-09-16 06:58:48 +00:00
let major = self.scan_version_directive_number(mark)?;
2015-05-28 14:07:59 +00:00
if self.ch() != '.' {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while scanning a YAML directive, did not find expected digit or '.' character",
));
2015-05-28 14:07:59 +00:00
}
self.skip();
2018-09-16 06:58:48 +00:00
let minor = self.scan_version_directive_number(mark)?;
2015-05-28 14:07:59 +00:00
Ok(Token(*mark, TokenType::VersionDirective(major, minor)))
2015-05-28 14:07:59 +00:00
}
fn scan_directive_name(&mut self) -> Result<String, ScanError> {
let start_mark = self.mark;
let mut string = String::new();
self.lookahead(1);
2015-05-28 17:56:03 +00:00
while is_alpha(self.ch()) {
2015-05-28 14:07:59 +00:00
string.push(self.ch());
self.skip();
self.lookahead(1);
}
if string.is_empty() {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a directive, could not find expected directive name",
));
2015-05-28 14:07:59 +00:00
}
if !is_blankz(self.ch()) {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a directive, found unexpected non-alphabetical character",
));
2015-05-28 14:07:59 +00:00
}
Ok(string)
}
fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
let mut val = 0u32;
let mut length = 0usize;
self.lookahead(1);
while is_digit(self.ch()) {
if length + 1 > 9 {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while scanning a YAML directive, found extremely long version number",
));
2015-05-28 14:07:59 +00:00
}
length += 1;
val = val * 10 + ((self.ch() as u32) - ('0' as u32));
self.skip();
self.lookahead(1);
}
if length == 0 {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while scanning a YAML directive, did not find expected version number",
));
2015-05-28 14:07:59 +00:00
}
Ok(val)
}
fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
2015-05-28 18:57:41 +00:00
self.lookahead(1);
/* Eat whitespaces. */
while is_blank(self.ch()) {
self.skip();
self.lookahead(1);
}
2018-09-16 06:58:48 +00:00
let handle = self.scan_tag_handle(true, mark)?;
2015-05-28 18:57:41 +00:00
self.lookahead(1);
/* Eat whitespaces. */
while is_blank(self.ch()) {
self.skip();
self.lookahead(1);
}
let is_secondary = handle == "!!";
2023-08-11 23:54:46 +00:00
let prefix = self.scan_tag_uri(true, is_secondary, "", mark)?;
2015-05-28 18:57:41 +00:00
self.lookahead(1);
2016-03-10 12:49:02 +00:00
if is_blankz(self.ch()) {
Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
} else {
2018-09-15 16:49:04 +00:00
Err(ScanError::new(
*mark,
"while scanning TAG, did not find expected whitespace or line break",
))
2015-05-28 18:57:41 +00:00
}
2015-05-28 14:07:59 +00:00
}
2015-05-28 17:56:03 +00:00
fn fetch_tag(&mut self) -> ScanResult {
2018-09-16 06:58:48 +00:00
self.save_simple_key()?;
2015-05-28 17:56:03 +00:00
self.disallow_simple_key();
2018-09-16 06:58:48 +00:00
let tok = self.scan_tag()?;
2015-05-28 17:56:03 +00:00
self.tokens.push_back(tok);
Ok(())
}
fn scan_tag(&mut self) -> Result<Token, ScanError> {
let start_mark = self.mark;
let mut handle = String::new();
2015-05-28 18:57:41 +00:00
let mut suffix;
2015-05-28 17:56:03 +00:00
// Check if the tag is in the canonical form (verbatim).
self.lookahead(2);
if self.buffer[1] == '<' {
// Eat '!<'
self.skip();
self.skip();
2023-08-11 23:54:46 +00:00
suffix = self.scan_tag_uri(false, false, "", &start_mark)?;
2015-05-28 17:56:03 +00:00
if self.ch() != '>' {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a tag, did not find the expected '>'",
));
2015-05-28 17:56:03 +00:00
}
self.skip();
} else {
2015-06-29 16:31:22 +00:00
// The tag has either the '!suffix' or the '!handle!suffix'
2018-09-16 06:58:48 +00:00
handle = self.scan_tag_handle(false, &start_mark)?;
2015-05-28 17:56:03 +00:00
// Check if it is, indeed, handle.
if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
// A tag handle starting with "!!" is a secondary tag handle.
let is_secondary_handle = handle == "!!";
suffix = self.scan_tag_uri(false, is_secondary_handle, "", &start_mark)?;
2015-05-28 17:56:03 +00:00
} else {
2018-09-16 06:58:48 +00:00
suffix = self.scan_tag_uri(false, false, &handle, &start_mark)?;
handle = "!".to_owned();
2015-05-28 17:56:03 +00:00
// A special case: the '!' tag. Set the handle to '' and the
// suffix to '!'.
if suffix.is_empty() {
2015-05-28 17:56:03 +00:00
handle.clear();
suffix = "!".to_owned();
2015-05-28 17:56:03 +00:00
}
}
}
if is_blankz(self.look_ch()) {
2015-05-28 17:56:03 +00:00
// XXX: ex 7.2, an empty scalar can follow a secondary tag
Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
2015-05-28 17:56:03 +00:00
} else {
2018-09-15 16:49:04 +00:00
Err(ScanError::new(
start_mark,
"while scanning a tag, did not find expected whitespace or line break",
))
2015-05-28 17:56:03 +00:00
}
}
fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
let mut string = String::new();
if self.look_ch() != '!' {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while scanning a tag, did not find expected '!'",
));
2015-05-28 17:56:03 +00:00
}
string.push(self.ch_skip());
2015-05-28 17:56:03 +00:00
while is_alpha(self.look_ch()) {
string.push(self.ch_skip());
2015-05-28 17:56:03 +00:00
}
// Check if the trailing character is '!' and copy it.
if self.ch() == '!' {
string.push(self.ch_skip());
} else if directive && string != "!" {
2015-05-28 17:56:03 +00:00
// It's either the '!' tag or not really a tag handle. If it's a %TAG
// directive, it's an error. If it's a tag token, it must be a part of
// URI.
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while parsing a tag directive, did not find expected '!'",
));
2015-05-28 17:56:03 +00:00
}
Ok(string)
}
2018-09-15 16:49:04 +00:00
fn scan_tag_uri(
&mut self,
directive: bool,
_is_secondary: bool,
head: &str,
mark: &Marker,
) -> Result<String, ScanError> {
2015-05-28 17:56:03 +00:00
let mut length = head.len();
let mut string = String::new();
// Copy the head if needed.
// Note that we don't copy the leading '!' character.
if length > 1 {
string.extend(head.chars().skip(1));
}
/*
* The set of characters that may appear in URI is as follows:
*
* '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
* '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
* '%'.
*/
while match self.look_ch() {
2015-05-28 18:57:41 +00:00
';' | '/' | '?' | ':' | '@' | '&' => true,
'=' | '+' | '$' | ',' | '.' | '!' | '~' | '*' | '\'' | '(' | ')' | '[' | ']' => true,
2015-05-28 17:56:03 +00:00
'%' => true,
c if is_alpha(c) => true,
2018-09-15 16:49:04 +00:00
_ => false,
2015-05-28 17:56:03 +00:00
} {
// Check if it is a URI-escape sequence.
if self.ch() == '%' {
2018-09-16 06:58:48 +00:00
string.push(self.scan_uri_escapes(directive, mark)?);
2015-05-28 17:56:03 +00:00
} else {
string.push(self.ch());
self.skip();
}
length += 1;
}
if length == 0 {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while parsing a tag, did not find expected tag URI",
));
2015-05-28 17:56:03 +00:00
}
Ok(string)
}
2018-09-15 16:49:04 +00:00
fn scan_uri_escapes(&mut self, _directive: bool, mark: &Marker) -> Result<char, ScanError> {
2015-05-30 10:49:54 +00:00
let mut width = 0usize;
let mut code = 0u32;
loop {
self.lookahead(3);
2018-09-15 16:49:04 +00:00
if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) {
return Err(ScanError::new(
*mark,
"while parsing a tag, did not find URI escaped octet",
));
2015-05-30 10:49:54 +00:00
}
let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]);
if width == 0 {
width = match octet {
_ if octet & 0x80 == 0x00 => 1,
_ if octet & 0xE0 == 0xC0 => 2,
_ if octet & 0xF0 == 0xE0 => 3,
_ if octet & 0xF8 == 0xF0 => 4,
_ => {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while parsing a tag, found an incorrect leading UTF-8 octet",
));
2015-05-30 10:49:54 +00:00
}
};
code = octet;
} else {
if octet & 0xc0 != 0x80 {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
*mark,
"while parsing a tag, found an incorrect trailing UTF-8 octet",
));
2015-05-30 10:49:54 +00:00
}
code = (code << 8) + octet;
}
self.skip();
self.skip();
self.skip();
width -= 1;
if width == 0 {
break;
}
}
match char::from_u32(code) {
Some(ch) => Ok(ch),
2018-09-15 16:49:04 +00:00
None => Err(ScanError::new(
*mark,
"while parsing a tag, found an invalid UTF-8 codepoint",
)),
2015-05-30 10:49:54 +00:00
}
}
2015-05-28 17:56:03 +00:00
fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
2018-09-16 06:58:48 +00:00
self.save_simple_key()?;
2015-05-28 17:56:03 +00:00
self.disallow_simple_key();
2018-09-16 06:58:48 +00:00
let tok = self.scan_anchor(alias)?;
2015-05-28 17:56:03 +00:00
self.tokens.push_back(tok);
Ok(())
}
2018-09-15 16:49:04 +00:00
fn scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError> {
2015-05-28 17:56:03 +00:00
let mut string = String::new();
let start_mark = self.mark;
self.skip();
self.lookahead(1);
while is_alpha(self.ch()) || self.ch_is(':') {
2015-05-28 17:56:03 +00:00
string.push(self.ch());
self.skip();
self.lookahead(1);
}
2018-12-13 07:35:01 +00:00
if string.is_empty()
|| match self.ch() {
c if is_blankz(c) => false,
'?' | ',' | ']' | '}' | '%' | '@' | '`' => false,
2018-12-13 07:35:01 +00:00
_ => true,
}
{
2015-05-28 17:56:03 +00:00
return Err(ScanError::new(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
}
if alias {
Ok(Token(start_mark, TokenType::Alias(string)))
2015-05-28 17:56:03 +00:00
} else {
Ok(Token(start_mark, TokenType::Anchor(string)))
2015-05-28 17:56:03 +00:00
}
}
2018-09-15 16:49:04 +00:00
fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
2015-05-24 06:27:42 +00:00
// The indicators '[' and '{' may start a simple key.
2018-09-16 06:58:48 +00:00
self.save_simple_key()?;
2015-05-24 06:27:42 +00:00
2018-09-15 10:33:26 +00:00
self.increase_flow_level()?;
2015-05-24 06:27:42 +00:00
self.allow_simple_key();
let start_mark = self.mark;
self.skip();
self.tokens.push_back(Token(start_mark, tok));
Ok(())
}
2018-09-15 16:49:04 +00:00
fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.decrease_flow_level();
self.disallow_simple_key();
let start_mark = self.mark;
self.skip();
self.tokens.push_back(Token(start_mark, tok));
Ok(())
}
fn fetch_flow_entry(&mut self) -> ScanResult {
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.allow_simple_key();
let start_mark = self.mark;
self.skip();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(start_mark, TokenType::FlowEntry));
2015-05-24 06:27:42 +00:00
Ok(())
}
2018-09-15 10:33:26 +00:00
fn increase_flow_level(&mut self) -> ScanResult {
2018-09-15 16:49:04 +00:00
self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
self.flow_level = self
.flow_level
.checked_add(1)
.ok_or_else(|| ScanError::new(self.mark, "recursion limit exceeded"))?;
2018-09-15 10:33:26 +00:00
Ok(())
2015-05-24 06:27:42 +00:00
}
/// Leave the current flow level, if any.
///
/// Does nothing when `flow_level` is not positive; otherwise decrements it
/// and drops the simple-key slot belonging to the level being left.
///
/// # Panics
/// Panics if `simple_keys` is empty while `flow_level` is positive.
fn decrease_flow_level(&mut self) {
    if self.flow_level > 0 {
        self.flow_level -= 1;
        let _dropped_slot = self.simple_keys.pop().unwrap();
    }
}
fn fetch_block_entry(&mut self) -> ScanResult {
if self.flow_level == 0 {
// Check if we are allowed to start a new entry.
if !self.simple_key_allowed {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
self.mark,
"block sequence entries are not allowed in this context",
));
2015-05-24 06:27:42 +00:00
}
let mark = self.mark;
// generate BLOCK-SEQUENCE-START if indented
self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
2015-05-24 06:27:42 +00:00
} else {
// - * only allowed in block
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
self.mark,
r#""-" is only valid inside a block"#,
));
2015-05-24 06:27:42 +00:00
}
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.allow_simple_key();
let start_mark = self.mark;
self.skip();
2018-09-15 16:49:04 +00:00
self.tokens
.push_back(Token(start_mark, TokenType::BlockEntry));
2015-05-24 06:27:42 +00:00
Ok(())
}
2015-06-29 16:31:22 +00:00
2015-05-24 06:27:42 +00:00
fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
self.unroll_indent(-1);
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.disallow_simple_key();
let mark = self.mark;
self.skip();
self.skip();
self.skip();
self.tokens.push_back(Token(mark, t));
Ok(())
}
2015-05-26 16:29:40 +00:00
/// Scan a block scalar and queue the resulting token.
///
/// `literal` is forwarded to `scan_block_scalar` and selects the literal
/// style over the folded one.
///
/// # Errors
/// Propagates failures from `save_simple_key` and `scan_block_scalar`.
fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
    self.save_simple_key()?;
    self.allow_simple_key();

    let scalar = self.scan_block_scalar(literal)?;
    self.tokens.push_back(scalar);
    Ok(())
}
2023-08-11 23:54:46 +00:00
#[allow(clippy::too_many_lines)]
2015-05-26 16:29:40 +00:00
fn scan_block_scalar(&mut self, literal: bool) -> Result<Token, ScanError> {
let start_mark = self.mark;
let mut chomping: i32 = 0;
let mut increment: usize = 0;
let mut indent: usize = 0;
2015-05-28 09:18:20 +00:00
let mut trailing_blank: bool;
2015-05-26 16:29:40 +00:00
let mut leading_blank: bool = false;
let mut string = String::new();
let mut leading_break = String::new();
let mut trailing_breaks = String::new();
// skip '|' or '>'
self.skip();
self.lookahead(1);
if self.ch() == '+' || self.ch() == '-' {
if self.ch() == '+' {
chomping = 1;
} else {
chomping = -1;
}
self.skip();
self.lookahead(1);
if is_digit(self.ch()) {
if self.ch() == '0' {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a block scalar, found an indentation indicator equal to 0",
2018-09-15 16:49:04 +00:00
));
2015-05-26 16:29:40 +00:00
}
increment = (self.ch() as usize) - ('0' as usize);
self.skip();
}
} else if is_digit(self.ch()) {
if self.ch() == '0' {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a block scalar, found an indentation indicator equal to 0",
2018-09-15 16:49:04 +00:00
));
2015-05-26 16:29:40 +00:00
}
increment = (self.ch() as usize) - ('0' as usize);
self.skip();
self.lookahead(1);
if self.ch() == '+' || self.ch() == '-' {
if self.ch() == '+' {
chomping = 1;
} else {
chomping = -1;
}
self.skip();
}
}
// Eat whitespaces and comments to the end of the line.
self.lookahead(1);
while is_blank(self.ch()) {
self.skip();
self.lookahead(1);
}
if self.ch() == '#' {
while !is_breakz(self.ch()) {
self.skip();
self.lookahead(1);
}
}
// Check if we are at the end of the line.
if !is_breakz(self.ch()) {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a block scalar, did not find expected comment or line break",
));
2015-05-26 16:29:40 +00:00
}
if is_break(self.ch()) {
self.lookahead(2);
self.skip_line();
}
if increment > 0 {
2018-09-15 16:49:04 +00:00
indent = if self.indent >= 0 {
(self.indent + increment as isize) as usize
} else {
increment
}
2015-05-26 16:29:40 +00:00
}
// Scan the leading line breaks and determine the indentation level if needed.
2023-11-19 16:08:28 +00:00
self.block_scalar_breaks(&mut indent, &mut trailing_breaks);
2015-06-29 16:31:22 +00:00
2015-05-26 16:29:40 +00:00
self.lookahead(1);
let start_mark = self.mark;
while self.mark.col == indent && !is_z(self.ch()) {
// We are at the beginning of a non-empty line.
trailing_blank = is_blank(self.ch());
2018-09-15 16:49:04 +00:00
if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
if trailing_breaks.is_empty() {
string.push(' ');
}
leading_break.clear();
2015-05-26 16:29:40 +00:00
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
2015-05-26 16:29:40 +00:00
leading_break.clear();
}
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2015-05-26 16:29:40 +00:00
trailing_breaks.clear();
leading_blank = is_blank(self.ch());
while !is_breakz(self.ch()) {
string.push(self.ch());
self.skip();
self.lookahead(1);
}
2015-09-15 07:27:32 +00:00
// break on EOF
2018-09-15 16:49:04 +00:00
if is_z(self.ch()) {
break;
}
2015-05-26 16:29:40 +00:00
self.lookahead(2);
2015-09-15 07:27:32 +00:00
self.read_break(&mut leading_break);
2015-05-26 16:29:40 +00:00
// Eat the following indentation spaces and line breaks.
2023-11-19 16:08:28 +00:00
self.block_scalar_breaks(&mut indent, &mut trailing_breaks);
2015-05-26 16:29:40 +00:00
}
// Chomp the tail.
if chomping != -1 {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
2015-05-26 16:29:40 +00:00
}
if chomping == 1 {
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2015-05-26 16:29:40 +00:00
}
if literal {
2018-09-15 16:49:04 +00:00
Ok(Token(
start_mark,
TokenType::Scalar(TScalarStyle::Literal, string),
))
2015-05-26 16:29:40 +00:00
} else {
2018-09-15 16:49:04 +00:00
Ok(Token(
start_mark,
TokenType::Scalar(TScalarStyle::Foled, string),
))
2015-05-26 16:29:40 +00:00
}
}
2023-11-19 16:08:28 +00:00
fn block_scalar_breaks(&mut self, indent: &mut usize, breaks: &mut String) {
2015-05-26 16:29:40 +00:00
let mut max_indent = 0;
2023-11-19 16:08:28 +00:00
// Consume all empty lines.
2015-05-26 16:29:40 +00:00
loop {
2023-11-19 16:08:28 +00:00
// Consume all spaces. Tabs cannot be used as indentation.
while (*indent == 0 || self.mark.col < *indent) && self.look_ch() == ' ' {
2018-09-15 16:49:04 +00:00
self.skip();
2015-05-26 16:29:40 +00:00
}
if self.mark.col > max_indent {
max_indent = self.mark.col;
}
2023-11-19 16:08:28 +00:00
// If our current line is not empty, break out of the loop.
if !is_break(self.look_ch()) {
2015-05-26 16:29:40 +00:00
break;
}
// Consume the line break.
2023-11-19 16:08:28 +00:00
self.lookahead(2);
2015-05-26 16:29:40 +00:00
self.read_break(breaks);
}
if *indent == 0 {
2023-11-19 16:08:28 +00:00
*indent = max_indent.max((self.indent + 1) as usize).max(1);
2015-05-26 16:29:40 +00:00
}
}
2015-05-24 19:21:53 +00:00
/// Scan a quoted scalar and queue the resulting token.
///
/// `single` is forwarded to `scan_flow_scalar` and selects single-quoted
/// over double-quoted scanning.
///
/// # Errors
/// Propagates failures from `save_simple_key` and `scan_flow_scalar`.
fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
    self.save_simple_key()?;
    self.disallow_simple_key();

    let scalar = self.scan_flow_scalar(single)?;
    self.tokens.push_back(scalar);

    // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
    // YAML allows the following value to be specified adjacent to the ":".
    // Remember where the scalar ended so that an adjacent value can be recognised.
    self.adjacent_value_allowed_at = self.mark.index;
    Ok(())
}
2023-08-11 23:54:46 +00:00
#[allow(clippy::too_many_lines)]
2015-05-24 19:21:53 +00:00
fn scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError> {
2015-05-24 06:27:42 +00:00
let start_mark = self.mark;
let mut string = String::new();
let mut leading_break = String::new();
let mut trailing_breaks = String::new();
let mut whitespaces = String::new();
2015-05-28 09:18:20 +00:00
let mut leading_blanks;
2015-05-24 06:27:42 +00:00
2015-05-24 19:21:53 +00:00
/* Eat the left quote. */
self.skip();
2015-05-24 06:27:42 +00:00
loop {
/* Check for a document indicator. */
2015-05-24 06:38:54 +00:00
self.lookahead(4);
2015-05-24 06:27:42 +00:00
2018-09-15 16:49:04 +00:00
if self.mark.col == 0
&& (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
|| ((self.buffer[0] == '.')
&& (self.buffer[1] == '.')
&& (self.buffer[2] == '.')))
&& is_blankz(self.buffer[3])
{
return Err(ScanError::new(
start_mark,
"while scanning a quoted scalar, found unexpected document indicator",
));
}
2015-05-24 06:27:42 +00:00
2015-05-24 19:21:53 +00:00
if is_z(self.ch()) {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"while scanning a quoted scalar, found unexpected end of stream",
));
2015-05-24 19:21:53 +00:00
}
self.lookahead(2);
leading_blanks = false;
// Consume non-blank characters.
while !is_blankz(self.ch()) {
match self.ch() {
// Check for an escaped single quote.
'\'' if self.buffer[1] == '\'' && single => {
string.push('\'');
self.skip();
self.skip();
2018-09-15 16:49:04 +00:00
}
2015-05-24 19:21:53 +00:00
// Check for the right quote.
2018-09-15 16:49:04 +00:00
'\'' if single => break,
'"' if !single => break,
2015-05-24 19:21:53 +00:00
// Check for an escaped line break.
'\\' if !single && is_break(self.buffer[1]) => {
self.lookahead(3);
self.skip();
self.skip_line();
leading_blanks = true;
break;
}
// Check for an escape sequence.
'\\' if !single => {
let mut code_length = 0usize;
match self.buffer[1] {
'0' => string.push('\0'),
'a' => string.push('\x07'),
'b' => string.push('\x08'),
't' | '\t' => string.push('\t'),
'n' => string.push('\n'),
'v' => string.push('\x0b'),
'f' => string.push('\x0c'),
'r' => string.push('\x0d'),
'e' => string.push('\x1b'),
' ' => string.push('\x20'),
'"' => string.push('"'),
'\'' => string.push('\''),
'\\' => string.push('\\'),
2015-05-28 14:19:06 +00:00
// NEL (#x85)
'N' => string.push(char::from_u32(0x85).unwrap()),
// #xA0
'_' => string.push(char::from_u32(0xA0).unwrap()),
// LS (#x2028)
'L' => string.push(char::from_u32(0x2028).unwrap()),
// PS (#x2029)
'P' => string.push(char::from_u32(0x2029).unwrap()),
2015-05-24 19:21:53 +00:00
'x' => code_length = 2,
'u' => code_length = 4,
'U' => code_length = 8,
2018-09-15 16:49:04 +00:00
_ => {
return Err(ScanError::new(
start_mark,
"while parsing a quoted scalar, found unknown escape character",
))
}
2015-05-24 19:21:53 +00:00
}
self.skip();
self.skip();
// Consume an arbitrary escape code.
if code_length > 0 {
self.lookahead(code_length);
2015-05-27 08:35:13 +00:00
let mut value = 0u32;
for i in 0..code_length {
if !is_hex(self.buffer[i]) {
return Err(ScanError::new(start_mark,
"while parsing a quoted scalar, did not find expected hexadecimal number"));
2015-05-27 08:35:13 +00:00
}
value = (value << 4) + as_hex(self.buffer[i]);
}
2023-08-11 23:54:46 +00:00
let Some(ch) = char::from_u32(value) else {
return Err(ScanError::new(start_mark, "while parsing a quoted scalar, found invalid Unicode character escape code"));
2015-05-27 08:35:13 +00:00
};
string.push(ch);
2015-05-28 09:18:20 +00:00
for _ in 0..code_length {
2015-05-27 08:35:13 +00:00
self.skip();
}
2015-05-24 19:21:53 +00:00
}
2018-09-15 16:49:04 +00:00
}
c => {
string.push(c);
self.skip();
}
2015-05-24 19:21:53 +00:00
}
self.lookahead(2);
}
2017-05-13 12:48:48 +00:00
self.lookahead(1);
2015-05-24 19:21:53 +00:00
match self.ch() {
2018-09-15 16:49:04 +00:00
'\'' if single => break,
'"' if !single => break,
2015-05-24 19:21:53 +00:00
_ => {}
}
// Consume blank characters.
while is_blank(self.ch()) || is_break(self.ch()) {
if is_blank(self.ch()) {
// Consume a space or a tab character.
2016-03-10 12:49:02 +00:00
if leading_blanks {
2015-05-24 19:21:53 +00:00
self.skip();
} else {
2016-03-10 12:49:02 +00:00
whitespaces.push(self.ch());
2015-05-24 19:21:53 +00:00
self.skip();
}
} else {
self.lookahead(2);
// Check if it is a first line break.
2016-03-10 12:49:02 +00:00
if leading_blanks {
self.read_break(&mut trailing_breaks);
} else {
2015-05-24 19:21:53 +00:00
whitespaces.clear();
self.read_break(&mut leading_break);
leading_blanks = true;
}
}
self.lookahead(1);
}
// Join the whitespaces or fold line breaks.
if leading_blanks {
2016-03-10 12:49:02 +00:00
if leading_break.is_empty() {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
string.push_str(&trailing_breaks);
2016-03-10 12:49:02 +00:00
trailing_breaks.clear();
leading_break.clear();
} else {
2015-05-24 19:21:53 +00:00
if trailing_breaks.is_empty() {
string.push(' ');
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2015-05-24 19:21:53 +00:00
trailing_breaks.clear();
}
leading_break.clear();
}
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&whitespaces);
2015-05-24 19:21:53 +00:00
whitespaces.clear();
}
} // loop
// Eat the right quote.
self.skip();
if single {
2018-09-15 16:49:04 +00:00
Ok(Token(
start_mark,
TokenType::Scalar(TScalarStyle::SingleQuoted, string),
))
2015-05-24 19:21:53 +00:00
} else {
2018-09-15 16:49:04 +00:00
Ok(Token(
start_mark,
TokenType::Scalar(TScalarStyle::DoubleQuoted, string),
))
2015-05-24 19:21:53 +00:00
}
}
/// Scan a plain (unquoted) scalar and queue the resulting token.
///
/// # Errors
/// Propagates failures from `save_simple_key` and `scan_plain_scalar`.
fn fetch_plain_scalar(&mut self) -> ScanResult {
    self.save_simple_key()?;
    self.disallow_simple_key();

    let scalar = self.scan_plain_scalar()?;
    self.tokens.push_back(scalar);
    Ok(())
}
fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
let indent = self.indent + 1;
let start_mark = self.mark;
let mut string = String::new();
let mut leading_break = String::new();
let mut trailing_breaks = String::new();
let mut whitespaces = String::new();
2023-12-20 23:13:53 +00:00
let mut leading_blanks = true;
2015-05-24 19:21:53 +00:00
loop {
/* Check for a document indicator. */
self.lookahead(4);
2018-09-15 16:49:04 +00:00
if self.mark.col == 0
&& (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
|| ((self.buffer[0] == '.')
&& (self.buffer[1] == '.')
&& (self.buffer[2] == '.')))
&& is_blankz(self.buffer[3])
{
break;
}
2015-05-24 19:21:53 +00:00
2018-09-15 16:49:04 +00:00
if self.ch() == '#' {
break;
}
2015-05-24 06:27:42 +00:00
while !is_blankz(self.ch()) {
// indicators can end a plain scalar, see 7.3.3. Plain Style
2015-05-24 06:27:42 +00:00
match self.ch() {
':' if is_blankz(self.buffer[1])
|| (self.flow_level > 0 && is_flow(self.buffer[1])) =>
{
break;
}
',' | '[' | ']' | '{' | '}' if self.flow_level > 0 => break,
2015-05-24 06:27:42 +00:00
_ => {}
}
if leading_blanks || !whitespaces.is_empty() {
if leading_blanks {
2016-03-10 12:49:02 +00:00
if leading_break.is_empty() {
2016-11-24 10:10:49 +00:00
string.push_str(&leading_break);
string.push_str(&trailing_breaks);
2016-03-10 12:49:02 +00:00
trailing_breaks.clear();
leading_break.clear();
} else {
2015-05-24 06:27:42 +00:00
if trailing_breaks.is_empty() {
string.push(' ');
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&trailing_breaks);
2015-05-24 06:27:42 +00:00
trailing_breaks.clear();
}
leading_break.clear();
}
leading_blanks = false;
} else {
2016-11-24 10:10:49 +00:00
string.push_str(&whitespaces);
2015-05-24 06:27:42 +00:00
whitespaces.clear();
}
}
string.push(self.ch());
self.skip();
2015-05-24 06:38:54 +00:00
self.lookahead(2);
2015-05-24 06:27:42 +00:00
}
// is the end?
2018-09-15 16:49:04 +00:00
if !(is_blank(self.ch()) || is_break(self.ch())) {
break;
}
2015-05-24 06:27:42 +00:00
2023-12-20 23:13:53 +00:00
while is_blank(self.look_ch()) || is_break(self.ch()) {
2015-05-24 06:27:42 +00:00
if is_blank(self.ch()) {
2018-09-15 16:49:04 +00:00
if leading_blanks && (self.mark.col as isize) < indent && self.ch() == '\t' {
return Err(ScanError::new(
start_mark,
"while scanning a plain scalar, found a tab",
));
2015-05-24 06:27:42 +00:00
}
2023-12-20 22:14:22 +00:00
if !leading_blanks {
2016-03-10 12:49:02 +00:00
whitespaces.push(self.ch());
2015-05-24 06:27:42 +00:00
}
2023-12-20 22:14:22 +00:00
self.skip();
2015-05-24 06:27:42 +00:00
} else {
2015-05-24 06:38:54 +00:00
self.lookahead(2);
2015-05-24 06:27:42 +00:00
// Check if it is a first line break
2016-03-10 12:49:02 +00:00
if leading_blanks {
self.read_break(&mut trailing_breaks);
} else {
2015-05-24 06:27:42 +00:00
whitespaces.clear();
self.read_break(&mut leading_break);
leading_blanks = true;
}
}
}
// check indentation level
2015-05-24 06:27:42 +00:00
if self.flow_level == 0 && (self.mark.col as isize) < indent {
break;
}
}
if leading_blanks {
self.allow_simple_key();
}
2018-09-15 16:49:04 +00:00
Ok(Token(
start_mark,
TokenType::Scalar(TScalarStyle::Plain, string),
))
2015-05-24 06:27:42 +00:00
}
2015-05-25 11:31:33 +00:00
fn fetch_key(&mut self) -> ScanResult {
let start_mark = self.mark;
if self.flow_level == 0 {
// Check if we are allowed to start a new key (not necessarily simple).
2015-05-25 11:31:33 +00:00
if !self.simple_key_allowed {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
self.mark,
"mapping keys are not allowed in this context",
));
2015-05-25 11:31:33 +00:00
}
2018-09-15 16:49:04 +00:00
self.roll_indent(
start_mark.col,
None,
TokenType::BlockMappingStart,
start_mark,
);
2015-05-25 11:31:33 +00:00
}
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-25 11:31:33 +00:00
if self.flow_level == 0 {
self.allow_simple_key();
} else {
self.disallow_simple_key();
}
self.skip();
self.tokens.push_back(Token(start_mark, TokenType::Key));
2015-05-25 11:31:33 +00:00
Ok(())
}
2015-05-24 06:27:42 +00:00
fn fetch_value(&mut self) -> ScanResult {
let sk = self.simple_keys.last().unwrap().clone();
let start_mark = self.mark;
if sk.possible {
2015-05-25 11:31:33 +00:00
// insert simple key
let tok = Token(sk.mark, TokenType::Key);
2015-05-24 06:27:42 +00:00
let tokens_parsed = self.tokens_parsed;
2015-06-29 16:31:22 +00:00
self.insert_token(sk.token_number - tokens_parsed, tok);
2015-05-24 06:27:42 +00:00
// Add the BLOCK-MAPPING-START token if needed.
2018-09-15 16:49:04 +00:00
self.roll_indent(
sk.mark.col,
Some(sk.token_number),
TokenType::BlockMappingStart,
start_mark,
);
2015-05-24 06:27:42 +00:00
self.simple_keys.last_mut().unwrap().possible = false;
self.disallow_simple_key();
} else {
// The ':' indicator follows a complex key.
2015-05-25 11:31:33 +00:00
if self.flow_level == 0 {
if !self.simple_key_allowed {
2018-09-15 16:49:04 +00:00
return Err(ScanError::new(
start_mark,
"mapping values are not allowed in this context",
));
2015-05-25 11:31:33 +00:00
}
2015-05-24 06:27:42 +00:00
2018-09-15 16:49:04 +00:00
self.roll_indent(
start_mark.col,
None,
TokenType::BlockMappingStart,
start_mark,
);
2015-05-25 11:31:33 +00:00
}
if self.flow_level == 0 {
self.allow_simple_key();
} else {
self.disallow_simple_key();
}
}
2015-05-24 06:27:42 +00:00
self.skip();
self.tokens.push_back(Token(start_mark, TokenType::Value));
2015-05-24 06:27:42 +00:00
Ok(())
}
2018-09-15 16:49:04 +00:00
fn roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker) {
2015-05-24 06:27:42 +00:00
if self.flow_level > 0 {
return;
}
if self.indent < col as isize {
self.indents.push(self.indent);
self.indent = col as isize;
let tokens_parsed = self.tokens_parsed;
match number {
Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)),
2018-09-15 16:49:04 +00:00
None => self.tokens.push_back(Token(mark, tok)),
2015-05-24 06:27:42 +00:00
}
}
}
/// Close every indentation level deeper than `col`.
///
/// Does nothing inside a flow collection. For each level closed, a
/// `BlockEnd` token is queued at the current mark and the previous indent is
/// restored from `indents`.
///
/// # Panics
/// Panics if `indents` runs empty while the current indent is still greater
/// than `col`.
fn unroll_indent(&mut self, col: isize) {
    if self.flow_level > 0 {
        return;
    }
    while col < self.indent {
        let block_end = Token(self.mark, TokenType::BlockEnd);
        self.tokens.push_back(block_end);
        self.indent = self.indents.pop().unwrap();
    }
}
2023-11-19 20:16:52 +00:00
fn save_simple_key(&mut self) -> ScanResult {
2015-05-24 06:27:42 +00:00
let required = self.flow_level > 0 && self.indent == (self.mark.col as isize);
if self.simple_key_allowed {
let mut sk = SimpleKey::new(self.mark);
sk.possible = true;
sk.required = required;
sk.token_number = self.tokens_parsed + self.tokens.len();
2018-09-16 06:58:48 +00:00
self.remove_simple_key()?;
2015-05-24 06:27:42 +00:00
self.simple_keys.pop();
self.simple_keys.push(sk);
}
Ok(())
}
fn remove_simple_key(&mut self) -> ScanResult {
let last = self.simple_keys.last_mut().unwrap();
if last.possible && last.required {
return Err(ScanError::new(self.mark, "simple key expected"));
2015-05-24 06:27:42 +00:00
}
last.possible = false;
Ok(())
}
2023-12-20 22:14:22 +00:00
/// Tell whether the scanner is currently in block context: outside of any
/// flow collection (`flow_level == 0`) with at least one saved indentation
/// level on `indents`.
fn is_within_block(&self) -> bool {
    self.flow_level == 0 && !self.indents.is_empty()
}
2015-05-24 06:27:42 +00:00
}