Split yaml.rs into sizeable files.

This commit is contained in:
Ethiraric 2024-06-10 18:05:25 +02:00
parent 976007017d
commit 2b8eb3f62b
5 changed files with 533 additions and 520 deletions

View file

@ -1,4 +1,4 @@
use saphyr::yaml;
use saphyr::{Yaml, YamlLoader};
use std::env;
use std::fs::File;
use std::io::prelude::*;
@ -9,14 +9,14 @@ fn print_indent(indent: usize) {
}
}
fn dump_node(doc: &yaml::Yaml, indent: usize) {
fn dump_node(doc: &Yaml, indent: usize) {
match *doc {
yaml::Yaml::Array(ref v) => {
Yaml::Array(ref v) => {
for x in v {
dump_node(x, indent + 1);
}
}
yaml::Yaml::Hash(ref h) => {
Yaml::Hash(ref h) => {
for (k, v) in h {
print_indent(indent);
println!("{k:?}:");
@ -36,7 +36,7 @@ fn main() {
let mut s = String::new();
f.read_to_string(&mut s).unwrap();
let docs = yaml::YamlLoader::load_from_str(&s).unwrap();
let docs = YamlLoader::load_from_str(&s).unwrap();
for doc in &docs {
println!("---");
dump_node(doc, 0);

289
saphyr/src/encoding.rs Normal file
View file

@ -0,0 +1,289 @@
//! Encoding utilities. Available only with the `encoding` feature.
use std::{borrow::Cow, ops::ControlFlow};
use encoding_rs::{Decoder, DecoderResult, Encoding};
use crate::{loader::LoadError, Yaml, YamlLoader};
/// The signature of the function to call when using [`YAMLDecodingTrap::Call`].
///
/// The arguments are as follows:
/// * `malformation_length`: The length of the sequence the decoder failed to decode.
/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after
/// the malformation.
/// * `input_at_malformation`: What the input buffer is at the malformation.
/// This is the buffer starting at the malformation. The first `malformation_length` bytes are
/// the problematic sequence. The following `bytes_read_after_malformation` are already stored
/// in the decoder and will not be re-fed.
/// * `output`: The output string.
///
/// The function must modify `output` as it feels is best. For instance, one could recreate the
/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`]
/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning
/// [`ControlFlow::Break`].
///
/// # Returns
/// The function must return [`ControlFlow::Continue`] if decoding may continue or
/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied.
pub type YAMLDecodingTrapFn = fn(
malformation_length: u8,
bytes_read_after_malformation: u8,
input_at_malformation: &[u8],
output: &mut String,
) -> ControlFlow<Cow<'static, str>>;
/// The behavior [`YamlDecoder`] must have when an decoding error occurs.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum YAMLDecodingTrap {
/// Ignore the offending bytes, remove them from the output.
Ignore,
/// Error out.
Strict,
/// Replace them with the Unicode REPLACEMENT CHARACTER.
Replace,
/// Call the user-supplied function upon decoding malformation.
Call(YAMLDecodingTrapFn),
}
/// `YamlDecoder` is a `YamlLoader` builder that allows you to supply your own encoding error trap.
/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the
/// `encoding_trap` to `encoding::DecoderTrap::Ignore`.
/// ```rust
/// use saphyr::{YamlDecoder, YAMLDecodingTrap};
///
/// let string = b"---
/// a\xa9: 1
/// b: 2.2
/// c: [1, 2]
/// ";
/// let out = YamlDecoder::read(string as &[u8])
/// .encoding_trap(YAMLDecodingTrap::Ignore)
/// .decode()
/// .unwrap();
/// ```
pub struct YamlDecoder<T: std::io::Read> {
/// The input stream.
source: T,
/// The behavior to adopt when encountering a malformed encoding.
trap: YAMLDecodingTrap,
}
impl<T: std::io::Read> YamlDecoder<T> {
/// Create a `YamlDecoder` decoding the given source.
pub fn read(source: T) -> YamlDecoder<T> {
YamlDecoder {
source,
trap: YAMLDecodingTrap::Strict,
}
}
/// Set the behavior of the decoder when the encoding is invalid.
pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self {
self.trap = trap;
self
}
/// Run the decode operation with the source and trap the `YamlDecoder` was built with.
///
/// # Errors
/// Returns `LoadError` when decoding fails.
pub fn decode(&mut self) -> Result<Vec<Yaml>, LoadError> {
let mut buffer = Vec::new();
self.source.read_to_end(&mut buffer)?;
// Check if the `encoding` library can detect encoding from the BOM, otherwise use
// `detect_utf16_endianness`.
let (encoding, _) =
Encoding::for_bom(&buffer).unwrap_or_else(|| (detect_utf16_endianness(&buffer), 2));
let mut decoder = encoding.new_decoder();
let mut output = String::new();
// Decode the input buffer.
decode_loop(&buffer, &mut output, &mut decoder, self.trap)?;
YamlLoader::load_from_str(&output).map_err(LoadError::Scan)
}
}
/// Perform a loop of [`Decoder::decode_to_string`], reallocating `output` if needed.
fn decode_loop(
input: &[u8],
output: &mut String,
decoder: &mut Decoder,
trap: YAMLDecodingTrap,
) -> Result<(), LoadError> {
use crate::loader::LoadError;
output.reserve(input.len());
let mut total_bytes_read = 0;
loop {
match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true)
{
// If the input is empty, we processed the whole input.
(DecoderResult::InputEmpty, _) => break Ok(()),
// If the output is full, we must reallocate.
(DecoderResult::OutputFull, bytes_read) => {
total_bytes_read += bytes_read;
// The output is already reserved to the size of the input. We slowly resize. Here,
// we're expecting that 10% of bytes will double in size when converting to UTF-8.
output.reserve(input.len() / 10);
}
(DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => {
total_bytes_read += bytes_read;
match trap {
// Ignore (skip over) malformed character.
YAMLDecodingTrap::Ignore => {}
// Replace them with the Unicode REPLACEMENT CHARACTER.
YAMLDecodingTrap::Replace => {
output.push('\u{FFFD}');
}
// Otherwise error, getting as much context as possible.
YAMLDecodingTrap::Strict => {
let malformed_len = malformed_len as usize;
let bytes_after_malformed = bytes_after_malformed as usize;
let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed);
let malformed_sequence = &input[byte_idx..byte_idx + malformed_len];
break Err(LoadError::Decode(Cow::Owned(format!(
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
))));
}
YAMLDecodingTrap::Call(callback) => {
let byte_idx =
total_bytes_read - ((malformed_len + bytes_after_malformed) as usize);
let malformed_sequence =
&input[byte_idx..byte_idx + malformed_len as usize];
if let ControlFlow::Break(error) = callback(
malformed_len,
bytes_after_malformed,
&input[byte_idx..],
output,
) {
if error.is_empty() {
break Err(LoadError::Decode(Cow::Owned(format!(
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
))));
}
break Err(LoadError::Decode(error));
}
}
}
}
}
}
}
/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the
/// bytestream starts with BOM codepoint.
/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since
/// in the general case the bytestream could start with a codepoint that uses both bytes.
///
/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
/// This allows the encoding to be deduced by the pattern of null (#x00) characters.
//
/// See spec at <https://yaml.org/spec/1.2/spec.html#id2771184>
fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding {
if b.len() > 1 && (b[0] != b[1]) {
if b[0] == 0 {
return encoding_rs::UTF_16BE;
} else if b[1] == 0 {
return encoding_rs::UTF_16LE;
}
}
encoding_rs::UTF_8
}
#[cfg(test)]
mod test {
use super::{YAMLDecodingTrap, Yaml, YamlDecoder};
#[test]
fn test_read_bom() {
let s = b"\xef\xbb\xbf---
a: 1
b: 2.2
c: [1, 2]
";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_utf16le() {
let s = b"\xff\xfe-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64) <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_utf16be() {
let s = b"\xfe\xff\x00-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_utf16le_nobom() {
let s = b"-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_trap() {
let s = b"---
a\xa9: 1
b: 2.2
c: [1, 2]
";
let out = YamlDecoder::read(s as &[u8])
.encoding_trap(YAMLDecodingTrap::Ignore)
.decode()
.unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_or() {
assert_eq!(Yaml::Null.or(Yaml::Integer(3)), Yaml::Integer(3));
assert_eq!(Yaml::Integer(3).or(Yaml::Integer(7)), Yaml::Integer(3));
}
}

View file

@ -43,16 +43,20 @@
#![warn(missing_docs, clippy::pedantic)]
pub(crate) mod char_traits;
pub mod emitter;
pub mod yaml;
mod char_traits;
mod emitter;
mod loader;
mod yaml;
// Re-export main components.
pub use crate::emitter::YamlEmitter;
pub use crate::yaml::{Array, Hash, Yaml, YamlLoader};
pub use crate::loader::YamlLoader;
pub use crate::yaml::{Array, Hash, Yaml};
#[cfg(feature = "encoding")]
pub use crate::yaml::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder};
mod encoding;
#[cfg(feature = "encoding")]
pub use crate::encoding::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder};
// Re-export `ScanError` as it is used as part of our public API and we want consumers to be able
// to inspect it (e.g. perform a `match`). They wouldn't be able without it.

227
saphyr/src/loader.rs Normal file
View file

@ -0,0 +1,227 @@
//! The default loader.
use std::collections::BTreeMap;
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser, ScanError, TScalarStyle, Tag};
use crate::{Hash, Yaml};
/// Main structure for quickly parsing YAML.
///
/// See [`YamlLoader::load_from_str`].
#[derive(Default)]
#[allow(clippy::module_name_repetitions)]
pub struct YamlLoader {
/// The different YAML documents that are loaded.
docs: Vec<Yaml>,
// states
// (current node, anchor_id) tuple
doc_stack: Vec<(Yaml, usize)>,
key_stack: Vec<Yaml>,
anchor_map: BTreeMap<usize, Yaml>,
}
impl MarkedEventReceiver for YamlLoader {
fn on_event(&mut self, ev: Event, _: Marker) {
// println!("EV {:?}", ev);
match ev {
Event::DocumentStart | Event::Nothing | Event::StreamStart | Event::StreamEnd => {
// do nothing
}
Event::DocumentEnd => {
match self.doc_stack.len() {
// empty document
0 => self.docs.push(Yaml::BadValue),
1 => self.docs.push(self.doc_stack.pop().unwrap().0),
_ => unreachable!(),
}
}
Event::SequenceStart(aid, _) => {
self.doc_stack.push((Yaml::Array(Vec::new()), aid));
}
Event::SequenceEnd => {
let node = self.doc_stack.pop().unwrap();
self.insert_new_node(node);
}
Event::MappingStart(aid, _) => {
self.doc_stack.push((Yaml::Hash(Hash::new()), aid));
self.key_stack.push(Yaml::BadValue);
}
Event::MappingEnd => {
self.key_stack.pop().unwrap();
let node = self.doc_stack.pop().unwrap();
self.insert_new_node(node);
}
Event::Scalar(v, style, aid, tag) => {
let node = if style != TScalarStyle::Plain {
Yaml::String(v)
} else if let Some(Tag {
ref handle,
ref suffix,
}) = tag
{
if handle == "tag:yaml.org,2002:" {
match suffix.as_ref() {
"bool" => {
// "true" or "false"
match v.parse::<bool>() {
Err(_) => Yaml::BadValue,
Ok(v) => Yaml::Boolean(v),
}
}
"int" => match v.parse::<i64>() {
Err(_) => Yaml::BadValue,
Ok(v) => Yaml::Integer(v),
},
"float" => match parse_f64(&v) {
Some(_) => Yaml::Real(v),
None => Yaml::BadValue,
},
"null" => match v.as_ref() {
"~" | "null" => Yaml::Null,
_ => Yaml::BadValue,
},
_ => Yaml::String(v),
}
} else {
Yaml::String(v)
}
} else {
// Datatype is not specified, or unrecognized
Yaml::from_str(&v)
};
self.insert_new_node((node, aid));
}
Event::Alias(id) => {
let n = match self.anchor_map.get(&id) {
Some(v) => v.clone(),
None => Yaml::BadValue,
};
self.insert_new_node((n, 0));
}
}
// println!("DOC {:?}", self.doc_stack);
}
}
/// An error that happened when loading a YAML document.
#[derive(Debug)]
pub enum LoadError {
/// An I/O error.
IO(std::io::Error),
/// An error within the scanner. This indicates a malformed YAML input.
Scan(ScanError),
/// A decoding error (e.g.: Invalid UTF-8).
Decode(std::borrow::Cow<'static, str>),
}
impl From<std::io::Error> for LoadError {
fn from(error: std::io::Error) -> Self {
LoadError::IO(error)
}
}
impl std::error::Error for LoadError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
Some(match &self {
LoadError::IO(e) => e,
LoadError::Scan(e) => e,
LoadError::Decode(_) => return None,
})
}
}
impl std::fmt::Display for LoadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
LoadError::IO(e) => e.fmt(f),
LoadError::Scan(e) => e.fmt(f),
LoadError::Decode(e) => e.fmt(f),
}
}
}
impl YamlLoader {
fn insert_new_node(&mut self, node: (Yaml, usize)) {
// valid anchor id starts from 1
if node.1 > 0 {
self.anchor_map.insert(node.1, node.0.clone());
}
if self.doc_stack.is_empty() {
self.doc_stack.push(node);
} else {
let parent = self.doc_stack.last_mut().unwrap();
match *parent {
(Yaml::Array(ref mut v), _) => v.push(node.0),
(Yaml::Hash(ref mut h), _) => {
let cur_key = self.key_stack.last_mut().unwrap();
// current node is a key
if cur_key.is_badvalue() {
*cur_key = node.0;
// current node is a value
} else {
let mut newkey = Yaml::BadValue;
std::mem::swap(&mut newkey, cur_key);
h.insert(newkey, node.0);
}
}
_ => unreachable!(),
}
}
}
/// Load the given string as a set of YAML documents.
///
/// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only
/// if all documents are parsed successfully. An error in a latter document prevents the former
/// from being returned.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load_from_str(source: &str) -> Result<Vec<Yaml>, ScanError> {
Self::load_from_iter(source.chars())
}
/// Load the contents of the given iterator as a set of YAML documents.
///
/// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only
/// if all documents are parsed successfully. An error in a latter document prevents the former
/// from being returned.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load_from_iter<I: Iterator<Item = char>>(source: I) -> Result<Vec<Yaml>, ScanError> {
let mut parser = Parser::new(source);
Self::load_from_parser(&mut parser)
}
/// Load the contents from the specified Parser as a set of YAML documents.
///
/// Parsing succeeds if and only if all documents are parsed successfully.
/// An error in a latter document prevents the former from being returned.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load_from_parser<I: Iterator<Item = char>>(
parser: &mut Parser<I>,
) -> Result<Vec<Yaml>, ScanError> {
let mut loader = YamlLoader::default();
parser.load(&mut loader, true)?;
Ok(loader.docs)
}
/// Return a reference to the parsed Yaml documents.
#[must_use]
pub fn documents(&self) -> &[Yaml] {
&self.docs
}
}
// parse f64 as Core schema
// See: https://github.com/chyh1990/yaml-rust/issues/51
pub(crate) fn parse_f64(v: &str) -> Option<f64> {
match v {
".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY),
"-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY),
".nan" | "NaN" | ".NAN" => Some(f64::NAN),
_ => v.parse::<f64>().ok(),
}
}

View file

@ -2,15 +2,11 @@
#![allow(clippy::module_name_repetitions)]
use std::borrow::Cow;
use std::ops::ControlFlow;
use std::{collections::BTreeMap, convert::TryFrom, mem, ops::Index, ops::IndexMut};
use std::{convert::TryFrom, ops::Index, ops::IndexMut};
#[cfg(feature = "encoding")]
use encoding_rs::{Decoder, DecoderResult, Encoding};
use hashlink::LinkedHashMap;
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser, ScanError, TScalarStyle, Tag};
use crate::loader::parse_f64;
/// A YAML node is stored as this `Yaml` enumeration, which provides an easy way to
/// access your YAML document.
@ -60,416 +56,6 @@ pub type Array = Vec<Yaml>;
/// The type contained in the `Yaml::Hash` variant. This corresponds to YAML mappings.
pub type Hash = LinkedHashMap<Yaml, Yaml>;
// parse f64 as Core schema
// See: https://github.com/chyh1990/yaml-rust/issues/51
fn parse_f64(v: &str) -> Option<f64> {
match v {
".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY),
"-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY),
".nan" | "NaN" | ".NAN" => Some(f64::NAN),
_ => v.parse::<f64>().ok(),
}
}
/// Main structure for quickly parsing YAML.
///
/// See [`YamlLoader::load_from_str`].
#[derive(Default)]
pub struct YamlLoader {
/// The different YAML documents that are loaded.
docs: Vec<Yaml>,
// states
// (current node, anchor_id) tuple
doc_stack: Vec<(Yaml, usize)>,
key_stack: Vec<Yaml>,
anchor_map: BTreeMap<usize, Yaml>,
}
impl MarkedEventReceiver for YamlLoader {
fn on_event(&mut self, ev: Event, _: Marker) {
// println!("EV {:?}", ev);
match ev {
Event::DocumentStart | Event::Nothing | Event::StreamStart | Event::StreamEnd => {
// do nothing
}
Event::DocumentEnd => {
match self.doc_stack.len() {
// empty document
0 => self.docs.push(Yaml::BadValue),
1 => self.docs.push(self.doc_stack.pop().unwrap().0),
_ => unreachable!(),
}
}
Event::SequenceStart(aid, _) => {
self.doc_stack.push((Yaml::Array(Vec::new()), aid));
}
Event::SequenceEnd => {
let node = self.doc_stack.pop().unwrap();
self.insert_new_node(node);
}
Event::MappingStart(aid, _) => {
self.doc_stack.push((Yaml::Hash(Hash::new()), aid));
self.key_stack.push(Yaml::BadValue);
}
Event::MappingEnd => {
self.key_stack.pop().unwrap();
let node = self.doc_stack.pop().unwrap();
self.insert_new_node(node);
}
Event::Scalar(v, style, aid, tag) => {
let node = if style != TScalarStyle::Plain {
Yaml::String(v)
} else if let Some(Tag {
ref handle,
ref suffix,
}) = tag
{
if handle == "tag:yaml.org,2002:" {
match suffix.as_ref() {
"bool" => {
// "true" or "false"
match v.parse::<bool>() {
Err(_) => Yaml::BadValue,
Ok(v) => Yaml::Boolean(v),
}
}
"int" => match v.parse::<i64>() {
Err(_) => Yaml::BadValue,
Ok(v) => Yaml::Integer(v),
},
"float" => match parse_f64(&v) {
Some(_) => Yaml::Real(v),
None => Yaml::BadValue,
},
"null" => match v.as_ref() {
"~" | "null" => Yaml::Null,
_ => Yaml::BadValue,
},
_ => Yaml::String(v),
}
} else {
Yaml::String(v)
}
} else {
// Datatype is not specified, or unrecognized
Yaml::from_str(&v)
};
self.insert_new_node((node, aid));
}
Event::Alias(id) => {
let n = match self.anchor_map.get(&id) {
Some(v) => v.clone(),
None => Yaml::BadValue,
};
self.insert_new_node((n, 0));
}
}
// println!("DOC {:?}", self.doc_stack);
}
}
/// An error that happened when loading a YAML document.
#[derive(Debug)]
pub enum LoadError {
/// An I/O error.
IO(std::io::Error),
/// An error within the scanner. This indicates a malformed YAML input.
Scan(ScanError),
/// A decoding error (e.g.: Invalid UTF_8).
Decode(std::borrow::Cow<'static, str>),
}
impl From<std::io::Error> for LoadError {
fn from(error: std::io::Error) -> Self {
LoadError::IO(error)
}
}
impl std::error::Error for LoadError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
Some(match &self {
LoadError::IO(e) => e,
LoadError::Scan(e) => e,
LoadError::Decode(_) => return None,
})
}
}
impl std::fmt::Display for LoadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
LoadError::IO(e) => e.fmt(f),
LoadError::Scan(e) => e.fmt(f),
LoadError::Decode(e) => e.fmt(f),
}
}
}
impl YamlLoader {
fn insert_new_node(&mut self, node: (Yaml, usize)) {
// valid anchor id starts from 1
if node.1 > 0 {
self.anchor_map.insert(node.1, node.0.clone());
}
if self.doc_stack.is_empty() {
self.doc_stack.push(node);
} else {
let parent = self.doc_stack.last_mut().unwrap();
match *parent {
(Yaml::Array(ref mut v), _) => v.push(node.0),
(Yaml::Hash(ref mut h), _) => {
let cur_key = self.key_stack.last_mut().unwrap();
// current node is a key
if cur_key.is_badvalue() {
*cur_key = node.0;
// current node is a value
} else {
let mut newkey = Yaml::BadValue;
mem::swap(&mut newkey, cur_key);
h.insert(newkey, node.0);
}
}
_ => unreachable!(),
}
}
}
/// Load the given string as a set of YAML documents.
///
/// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only
/// if all documents are parsed successfully. An error in a latter document prevents the former
/// from being returned.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load_from_str(source: &str) -> Result<Vec<Yaml>, ScanError> {
Self::load_from_iter(source.chars())
}
/// Load the contents of the given iterator as a set of YAML documents.
///
/// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only
/// if all documents are parsed successfully. An error in a latter document prevents the former
/// from being returned.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load_from_iter<I: Iterator<Item = char>>(source: I) -> Result<Vec<Yaml>, ScanError> {
let mut parser = Parser::new(source);
Self::load_from_parser(&mut parser)
}
/// Load the contents from the specified Parser as a set of YAML documents.
///
/// Parsing succeeds if and only if all documents are parsed successfully.
/// An error in a latter document prevents the former from being returned.
/// # Errors
/// Returns `ScanError` when loading fails.
pub fn load_from_parser<I: Iterator<Item = char>>(
parser: &mut Parser<I>,
) -> Result<Vec<Yaml>, ScanError> {
let mut loader = YamlLoader::default();
parser.load(&mut loader, true)?;
Ok(loader.docs)
}
/// Return a reference to the parsed Yaml documents.
#[must_use]
pub fn documents(&self) -> &[Yaml] {
&self.docs
}
}
/// The signature of the function to call when using [`YAMLDecodingTrap::Call`].
///
/// The arguments are as follows:
/// * `malformation_length`: The length of the sequence the decoder failed to decode.
/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after
/// the malformation.
/// * `input_at_malformation`: What the input buffer is at the malformation.
/// This is the buffer starting at the malformation. The first `malformation_length` bytes are
/// the problematic sequence. The following `bytes_read_after_malformation` are already stored
/// in the decoder and will not be re-fed.
/// * `output`: The output string.
///
/// The function must modify `output` as it feels is best. For instance, one could recreate the
/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`]
/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning
/// [`ControlFlow::Break`].
///
/// # Returns
/// The function must return [`ControlFlow::Continue`] if decoding may continue or
/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied.
#[cfg(feature = "encoding")]
pub type YAMLDecodingTrapFn = fn(
malformation_length: u8,
bytes_read_after_malformation: u8,
input_at_malformation: &[u8],
output: &mut String,
) -> ControlFlow<Cow<'static, str>>;
/// The behavior [`YamlDecoder`] must have when an decoding error occurs.
#[cfg(feature = "encoding")]
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum YAMLDecodingTrap {
/// Ignore the offending bytes, remove them from the output.
Ignore,
/// Error out.
Strict,
/// Replace them with the Unicode REPLACEMENT CHARACTER.
Replace,
/// Call the user-supplied function upon decoding malformation.
Call(YAMLDecodingTrapFn),
}
/// `YamlDecoder` is a `YamlLoader` builder that allows you to supply your own encoding error trap.
/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the
/// `encoding_trap` to `encoding::DecoderTrap::Ignore`.
/// ```rust
/// use saphyr::{YamlDecoder, YAMLDecodingTrap};
///
/// let string = b"---
/// a\xa9: 1
/// b: 2.2
/// c: [1, 2]
/// ";
/// let out = YamlDecoder::read(string as &[u8])
/// .encoding_trap(YAMLDecodingTrap::Ignore)
/// .decode()
/// .unwrap();
/// ```
#[cfg(feature = "encoding")]
pub struct YamlDecoder<T: std::io::Read> {
source: T,
trap: YAMLDecodingTrap,
}
#[cfg(feature = "encoding")]
impl<T: std::io::Read> YamlDecoder<T> {
/// Create a `YamlDecoder` decoding the given source.
pub fn read(source: T) -> YamlDecoder<T> {
YamlDecoder {
source,
trap: YAMLDecodingTrap::Strict,
}
}
/// Set the behavior of the decoder when the encoding is invalid.
pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self {
self.trap = trap;
self
}
/// Run the decode operation with the source and trap the `YamlDecoder` was built with.
///
/// # Errors
/// Returns `LoadError` when decoding fails.
pub fn decode(&mut self) -> Result<Vec<Yaml>, LoadError> {
let mut buffer = Vec::new();
self.source.read_to_end(&mut buffer)?;
// Check if the `encoding` library can detect encoding from the BOM, otherwise use
// `detect_utf16_endianness`.
let (encoding, _) =
Encoding::for_bom(&buffer).unwrap_or_else(|| (detect_utf16_endianness(&buffer), 2));
let mut decoder = encoding.new_decoder();
let mut output = String::new();
// Decode the input buffer.
decode_loop(&buffer, &mut output, &mut decoder, self.trap)?;
YamlLoader::load_from_str(&output).map_err(LoadError::Scan)
}
}
/// Perform a loop of [`Decoder::decode_to_string`], reallocating `output` if needed.
#[cfg(feature = "encoding")]
fn decode_loop(
input: &[u8],
output: &mut String,
decoder: &mut Decoder,
trap: YAMLDecodingTrap,
) -> Result<(), LoadError> {
output.reserve(input.len());
let mut total_bytes_read = 0;
loop {
match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true)
{
// If the input is empty, we processed the whole input.
(DecoderResult::InputEmpty, _) => break Ok(()),
// If the output is full, we must reallocate.
(DecoderResult::OutputFull, bytes_read) => {
total_bytes_read += bytes_read;
// The output is already reserved to the size of the input. We slowly resize. Here,
// we're expecting that 10% of bytes will double in size when converting to UTF-8.
output.reserve(input.len() / 10);
}
(DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => {
total_bytes_read += bytes_read;
match trap {
// Ignore (skip over) malformed character.
YAMLDecodingTrap::Ignore => {}
// Replace them with the Unicode REPLACEMENT CHARACTER.
YAMLDecodingTrap::Replace => {
output.push('\u{FFFD}');
}
// Otherwise error, getting as much context as possible.
YAMLDecodingTrap::Strict => {
let malformed_len = malformed_len as usize;
let bytes_after_malformed = bytes_after_malformed as usize;
let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed);
let malformed_sequence = &input[byte_idx..byte_idx + malformed_len];
break Err(LoadError::Decode(Cow::Owned(format!(
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
))));
}
YAMLDecodingTrap::Call(callback) => {
let byte_idx =
total_bytes_read - ((malformed_len + bytes_after_malformed) as usize);
let malformed_sequence =
&input[byte_idx..byte_idx + malformed_len as usize];
if let ControlFlow::Break(error) = callback(
malformed_len,
bytes_after_malformed,
&input[byte_idx..],
output,
) {
if error.is_empty() {
break Err(LoadError::Decode(Cow::Owned(format!(
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
))));
}
break Err(LoadError::Decode(error));
}
}
}
}
}
}
}
/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the
/// bytestream starts with BOM codepoint.
/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since
/// in the general case the bytestream could start with a codepoint that uses both bytes.
///
/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
/// This allows the encoding to be deduced by the pattern of null (#x00) characters.
//
/// See spec at <https://yaml.org/spec/1.2/spec.html#id2771184>
#[cfg(feature = "encoding")]
fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding {
if b.len() > 1 && (b[0] != b[1]) {
if b[0] == 0 {
return encoding_rs::UTF_16BE;
} else if b[1] == 0 {
return encoding_rs::UTF_16LE;
}
}
encoding_rs::UTF_8
}
macro_rules! define_as (
($name:ident, $t:ident, $yt:ident) => (
/// Get a copy of the inner object in the YAML enum if it is a `$t`.
@ -623,7 +209,7 @@ impl Yaml {
}
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
#[allow(clippy::should_implement_trait)]
impl Yaml {
/// Convert a string to a [`Yaml`] node.
///
@ -757,96 +343,3 @@ impl Iterator for YamlIter {
self.yaml.next()
}
}
#[cfg(test)]
mod test {
use super::{YAMLDecodingTrap, Yaml, YamlDecoder};
#[test]
fn test_read_bom() {
let s = b"\xef\xbb\xbf---
a: 1
b: 2.2
c: [1, 2]
";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_utf16le() {
let s = b"\xff\xfe-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64) <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_utf16be() {
let s = b"\xfe\xff\x00-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_utf16le_nobom() {
let s = b"-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_read_trap() {
let s = b"---
a\xa9: 1
b: 2.2
c: [1, 2]
";
let out = YamlDecoder::read(s as &[u8])
.encoding_trap(YAMLDecodingTrap::Ignore)
.decode()
.unwrap();
let doc = &out[0];
println!("GOT: {doc:?}");
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
assert!(doc["d"][0].is_badvalue());
}
#[test]
fn test_or() {
assert_eq!(Yaml::Null.or(Yaml::Integer(3)), Yaml::Integer(3));
assert_eq!(Yaml::Integer(3).or(Yaml::Integer(7)), Yaml::Integer(3));
}
}