From 2b8eb3f62b3e267262a66827ae8d01be1c1df754 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Mon, 10 Jun 2024 18:05:25 +0200 Subject: [PATCH] Split `yaml.rs` into sizeable files. --- saphyr/examples/dump_yaml.rs | 10 +- saphyr/src/encoding.rs | 289 ++++++++++++++++++++ saphyr/src/lib.rs | 14 +- saphyr/src/loader.rs | 227 ++++++++++++++++ saphyr/src/yaml.rs | 513 +---------------------------------- 5 files changed, 533 insertions(+), 520 deletions(-) create mode 100644 saphyr/src/encoding.rs create mode 100644 saphyr/src/loader.rs diff --git a/saphyr/examples/dump_yaml.rs b/saphyr/examples/dump_yaml.rs index 1a9f0f5..8d85d7e 100644 --- a/saphyr/examples/dump_yaml.rs +++ b/saphyr/examples/dump_yaml.rs @@ -1,4 +1,4 @@ -use saphyr::yaml; +use saphyr::{Yaml, YamlLoader}; use std::env; use std::fs::File; use std::io::prelude::*; @@ -9,14 +9,14 @@ fn print_indent(indent: usize) { } } -fn dump_node(doc: &yaml::Yaml, indent: usize) { +fn dump_node(doc: &Yaml, indent: usize) { match *doc { - yaml::Yaml::Array(ref v) => { + Yaml::Array(ref v) => { for x in v { dump_node(x, indent + 1); } } - yaml::Yaml::Hash(ref h) => { + Yaml::Hash(ref h) => { for (k, v) in h { print_indent(indent); println!("{k:?}:"); @@ -36,7 +36,7 @@ fn main() { let mut s = String::new(); f.read_to_string(&mut s).unwrap(); - let docs = yaml::YamlLoader::load_from_str(&s).unwrap(); + let docs = YamlLoader::load_from_str(&s).unwrap(); for doc in &docs { println!("---"); dump_node(doc, 0); diff --git a/saphyr/src/encoding.rs b/saphyr/src/encoding.rs new file mode 100644 index 0000000..6d46dd3 --- /dev/null +++ b/saphyr/src/encoding.rs @@ -0,0 +1,289 @@ +//! Encoding utilities. Available only with the `encoding` feature. + +use std::{borrow::Cow, ops::ControlFlow}; + +use encoding_rs::{Decoder, DecoderResult, Encoding}; + +use crate::{loader::LoadError, Yaml, YamlLoader}; + +/// The signature of the function to call when using [`YAMLDecodingTrap::Call`]. 
+/// +/// The arguments are as follows: +/// * `malformation_length`: The length of the sequence the decoder failed to decode. +/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after +/// the malformation. +/// * `input_at_malformation`: What the input buffer is at the malformation. +/// This is the buffer starting at the malformation. The first `malformation_length` bytes are +/// the problematic sequence. The following `bytes_read_after_malformation` are already stored +/// in the decoder and will not be re-fed. +/// * `output`: The output string. +/// +/// The function must modify `output` as it sees fit. For instance, one could recreate the +/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`] +/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning +/// [`ControlFlow::Break`]. +/// +/// # Returns +/// The function must return [`ControlFlow::Continue`] if decoding may continue or +/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied. +pub type YAMLDecodingTrapFn = fn( + malformation_length: u8, + bytes_read_after_malformation: u8, + input_at_malformation: &[u8], + output: &mut String, +) -> ControlFlow<Cow<'static, str>>; + +/// The behavior [`YamlDecoder`] must have when a decoding error occurs. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum YAMLDecodingTrap { + /// Ignore the offending bytes, remove them from the output. + Ignore, + /// Error out. + Strict, + /// Replace them with the Unicode REPLACEMENT CHARACTER. + Replace, + /// Call the user-supplied function upon decoding malformation. + Call(YAMLDecodingTrapFn), +} + +/// `YamlDecoder` is a `YamlLoader` builder that allows you to supply your own encoding error trap. +/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the +/// `encoding_trap` to [`YAMLDecodingTrap::Ignore`]. 
+/// ```rust +/// use saphyr::{YamlDecoder, YAMLDecodingTrap}; +/// +/// let string = b"--- +/// a\xa9: 1 +/// b: 2.2 +/// c: [1, 2] +/// "; +/// let out = YamlDecoder::read(string as &[u8]) +/// .encoding_trap(YAMLDecodingTrap::Ignore) +/// .decode() +/// .unwrap(); +/// ``` +pub struct YamlDecoder { + /// The input stream. + source: T, + /// The behavior to adopt when encountering a malformed encoding. + trap: YAMLDecodingTrap, +} + +impl YamlDecoder { + /// Create a `YamlDecoder` decoding the given source. + pub fn read(source: T) -> YamlDecoder { + YamlDecoder { + source, + trap: YAMLDecodingTrap::Strict, + } + } + + /// Set the behavior of the decoder when the encoding is invalid. + pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self { + self.trap = trap; + self + } + + /// Run the decode operation with the source and trap the `YamlDecoder` was built with. + /// + /// # Errors + /// Returns `LoadError` when decoding fails. + pub fn decode(&mut self) -> Result, LoadError> { + let mut buffer = Vec::new(); + self.source.read_to_end(&mut buffer)?; + + // Check if the `encoding` library can detect encoding from the BOM, otherwise use + // `detect_utf16_endianness`. + let (encoding, _) = + Encoding::for_bom(&buffer).unwrap_or_else(|| (detect_utf16_endianness(&buffer), 2)); + let mut decoder = encoding.new_decoder(); + let mut output = String::new(); + + // Decode the input buffer. + decode_loop(&buffer, &mut output, &mut decoder, self.trap)?; + + YamlLoader::load_from_str(&output).map_err(LoadError::Scan) + } +} + +/// Perform a loop of [`Decoder::decode_to_string`], reallocating `output` if needed. 
+fn decode_loop( + input: &[u8], + output: &mut String, + decoder: &mut Decoder, + trap: YAMLDecodingTrap, +) -> Result<(), LoadError> { + use crate::loader::LoadError; + + output.reserve(input.len()); + let mut total_bytes_read = 0; + + loop { + match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true) + { + // If the input is empty, we processed the whole input. + (DecoderResult::InputEmpty, _) => break Ok(()), + // If the output is full, we must reallocate. + (DecoderResult::OutputFull, bytes_read) => { + total_bytes_read += bytes_read; + // The output is already reserved to the size of the input. We slowly resize. Here, + // we're expecting that 10% of bytes will double in size when converting to UTF-8. + output.reserve(input.len() / 10); + } + (DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => { + total_bytes_read += bytes_read; + match trap { + // Ignore (skip over) malformed character. + YAMLDecodingTrap::Ignore => {} + // Replace them with the Unicode REPLACEMENT CHARACTER. + YAMLDecodingTrap::Replace => { + output.push('\u{FFFD}'); + } + // Otherwise error, getting as much context as possible. 
+ YAMLDecodingTrap::Strict => { + let malformed_len = malformed_len as usize; + let bytes_after_malformed = bytes_after_malformed as usize; + let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed); + let malformed_sequence = &input[byte_idx..byte_idx + malformed_len]; + + break Err(LoadError::Decode(Cow::Owned(format!( + "Invalid character sequence at {byte_idx}: {malformed_sequence:?}", + )))); + } + YAMLDecodingTrap::Call(callback) => { + let byte_idx = + total_bytes_read - ((malformed_len + bytes_after_malformed) as usize); + let malformed_sequence = + &input[byte_idx..byte_idx + malformed_len as usize]; + if let ControlFlow::Break(error) = callback( + malformed_len, + bytes_after_malformed, + &input[byte_idx..], + output, + ) { + if error.is_empty() { + break Err(LoadError::Decode(Cow::Owned(format!( + "Invalid character sequence at {byte_idx}: {malformed_sequence:?}", + )))); + } + break Err(LoadError::Decode(error)); + } + } + } + } + } + } +} + +/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and UTF-16BE, when the +/// bytestream starts with a BOM codepoint. +/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since +/// in the general case the bytestream could start with a codepoint that uses both bytes. +/// +/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character. +/// This allows the encoding to be deduced by the pattern of null (#x00) characters. 
+/// +/// See the "Character Encodings" section of the YAML 1.2 specification. +fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding { + if b.len() > 1 && (b[0] != b[1]) { + if b[0] == 0 { + return encoding_rs::UTF_16BE; + } else if b[1] == 0 { + return encoding_rs::UTF_16LE; + } + } + encoding_rs::UTF_8 +} + +#[cfg(test)] +mod test { + use super::{YAMLDecodingTrap, Yaml, YamlDecoder}; + + #[test] + fn test_read_bom() { + let s = b"\xef\xbb\xbf--- +a: 1 +b: 2.2 +c: [1, 2] +"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_utf16le() { + let s = b"\xff\xfe-\x00-\x00-\x00 +\x00a\x00:\x00 \x001\x00 +\x00b\x00:\x00 \x002\x00.\x002\x00 +\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 +\x00"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {doc:?}"); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert!((doc["b"].as_f64().unwrap() - 2.2f64) <= f64::EPSILON); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_utf16be() { + let s = b"\xfe\xff\x00-\x00-\x00-\x00 +\x00a\x00:\x00 \x001\x00 +\x00b\x00:\x00 \x002\x00.\x002\x00 +\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 +"; + let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {doc:?}"); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_utf16le_nobom() { + let s = b"-\x00-\x00-\x00 +\x00a\x00:\x00 \x001\x00 +\x00b\x00:\x00 \x002\x00.\x002\x00 +\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 +\x00"; + let out = YamlDecoder::read(s as 
&[u8]).decode().unwrap(); + let doc = &out[0]; + println!("GOT: {doc:?}"); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_read_trap() { + let s = b"--- +a\xa9: 1 +b: 2.2 +c: [1, 2] +"; + let out = YamlDecoder::read(s as &[u8]) + .encoding_trap(YAMLDecodingTrap::Ignore) + .decode() + .unwrap(); + let doc = &out[0]; + println!("GOT: {doc:?}"); + assert_eq!(doc["a"].as_i64().unwrap(), 1i64); + assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); + assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); + assert!(doc["d"][0].is_badvalue()); + } + + #[test] + fn test_or() { + assert_eq!(Yaml::Null.or(Yaml::Integer(3)), Yaml::Integer(3)); + assert_eq!(Yaml::Integer(3).or(Yaml::Integer(7)), Yaml::Integer(3)); + } +} diff --git a/saphyr/src/lib.rs b/saphyr/src/lib.rs index aaed759..ede027b 100644 --- a/saphyr/src/lib.rs +++ b/saphyr/src/lib.rs @@ -43,16 +43,20 @@ #![warn(missing_docs, clippy::pedantic)] -pub(crate) mod char_traits; -pub mod emitter; -pub mod yaml; +mod char_traits; +mod emitter; +mod loader; +mod yaml; // Re-export main components. pub use crate::emitter::YamlEmitter; -pub use crate::yaml::{Array, Hash, Yaml, YamlLoader}; +pub use crate::loader::YamlLoader; +pub use crate::yaml::{Array, Hash, Yaml}; #[cfg(feature = "encoding")] -pub use crate::yaml::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder}; +mod encoding; +#[cfg(feature = "encoding")] +pub use crate::encoding::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder}; // Re-export `ScanError` as it is used as part of our public API and we want consumers to be able // to inspect it (e.g. perform a `match`). They wouldn't be able without it. diff --git a/saphyr/src/loader.rs b/saphyr/src/loader.rs new file mode 100644 index 0000000..f2706bb --- /dev/null +++ b/saphyr/src/loader.rs @@ -0,0 +1,227 @@ +//! 
The default loader. + +use std::collections::BTreeMap; + +use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser, ScanError, TScalarStyle, Tag}; + +use crate::{Hash, Yaml}; + +/// Main structure for quickly parsing YAML. +/// +/// See [`YamlLoader::load_from_str`]. +#[derive(Default)] +#[allow(clippy::module_name_repetitions)] +pub struct YamlLoader { + /// The different YAML documents that are loaded. + docs: Vec, + // states + // (current node, anchor_id) tuple + doc_stack: Vec<(Yaml, usize)>, + key_stack: Vec, + anchor_map: BTreeMap, +} + +impl MarkedEventReceiver for YamlLoader { + fn on_event(&mut self, ev: Event, _: Marker) { + // println!("EV {:?}", ev); + match ev { + Event::DocumentStart | Event::Nothing | Event::StreamStart | Event::StreamEnd => { + // do nothing + } + Event::DocumentEnd => { + match self.doc_stack.len() { + // empty document + 0 => self.docs.push(Yaml::BadValue), + 1 => self.docs.push(self.doc_stack.pop().unwrap().0), + _ => unreachable!(), + } + } + Event::SequenceStart(aid, _) => { + self.doc_stack.push((Yaml::Array(Vec::new()), aid)); + } + Event::SequenceEnd => { + let node = self.doc_stack.pop().unwrap(); + self.insert_new_node(node); + } + Event::MappingStart(aid, _) => { + self.doc_stack.push((Yaml::Hash(Hash::new()), aid)); + self.key_stack.push(Yaml::BadValue); + } + Event::MappingEnd => { + self.key_stack.pop().unwrap(); + let node = self.doc_stack.pop().unwrap(); + self.insert_new_node(node); + } + Event::Scalar(v, style, aid, tag) => { + let node = if style != TScalarStyle::Plain { + Yaml::String(v) + } else if let Some(Tag { + ref handle, + ref suffix, + }) = tag + { + if handle == "tag:yaml.org,2002:" { + match suffix.as_ref() { + "bool" => { + // "true" or "false" + match v.parse::() { + Err(_) => Yaml::BadValue, + Ok(v) => Yaml::Boolean(v), + } + } + "int" => match v.parse::() { + Err(_) => Yaml::BadValue, + Ok(v) => Yaml::Integer(v), + }, + "float" => match parse_f64(&v) { + Some(_) => Yaml::Real(v), + None => 
Yaml::BadValue, + }, + "null" => match v.as_ref() { + "~" | "null" => Yaml::Null, + _ => Yaml::BadValue, + }, + _ => Yaml::String(v), + } + } else { + Yaml::String(v) + } + } else { + // Datatype is not specified, or unrecognized + Yaml::from_str(&v) + }; + + self.insert_new_node((node, aid)); + } + Event::Alias(id) => { + let n = match self.anchor_map.get(&id) { + Some(v) => v.clone(), + None => Yaml::BadValue, + }; + self.insert_new_node((n, 0)); + } + } + // println!("DOC {:?}", self.doc_stack); + } +} + +/// An error that happened when loading a YAML document. +#[derive(Debug)] +pub enum LoadError { + /// An I/O error. + IO(std::io::Error), + /// An error within the scanner. This indicates a malformed YAML input. + Scan(ScanError), + /// A decoding error (e.g.: Invalid UTF-8). + Decode(std::borrow::Cow<'static, str>), +} + +impl From for LoadError { + fn from(error: std::io::Error) -> Self { + LoadError::IO(error) + } +} + +impl std::error::Error for LoadError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + Some(match &self { + LoadError::IO(e) => e, + LoadError::Scan(e) => e, + LoadError::Decode(_) => return None, + }) + } +} + +impl std::fmt::Display for LoadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LoadError::IO(e) => e.fmt(f), + LoadError::Scan(e) => e.fmt(f), + LoadError::Decode(e) => e.fmt(f), + } + } +} + +impl YamlLoader { + fn insert_new_node(&mut self, node: (Yaml, usize)) { + // valid anchor id starts from 1 + if node.1 > 0 { + self.anchor_map.insert(node.1, node.0.clone()); + } + if self.doc_stack.is_empty() { + self.doc_stack.push(node); + } else { + let parent = self.doc_stack.last_mut().unwrap(); + match *parent { + (Yaml::Array(ref mut v), _) => v.push(node.0), + (Yaml::Hash(ref mut h), _) => { + let cur_key = self.key_stack.last_mut().unwrap(); + // current node is a key + if cur_key.is_badvalue() { + *cur_key = node.0; + // current node is a value + } else { + let mut 
newkey = Yaml::BadValue; + std::mem::swap(&mut newkey, cur_key); + h.insert(newkey, node.0); + } + } + _ => unreachable!(), + } + } + } + + /// Load the given string as a set of YAML documents. + /// + /// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only + /// if all documents are parsed successfully. An error in a latter document prevents the former + /// from being returned. + /// # Errors + /// Returns `ScanError` when loading fails. + pub fn load_from_str(source: &str) -> Result, ScanError> { + Self::load_from_iter(source.chars()) + } + + /// Load the contents of the given iterator as a set of YAML documents. + /// + /// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only + /// if all documents are parsed successfully. An error in a latter document prevents the former + /// from being returned. + /// # Errors + /// Returns `ScanError` when loading fails. + pub fn load_from_iter>(source: I) -> Result, ScanError> { + let mut parser = Parser::new(source); + Self::load_from_parser(&mut parser) + } + + /// Load the contents from the specified Parser as a set of YAML documents. + /// + /// Parsing succeeds if and only if all documents are parsed successfully. + /// An error in a latter document prevents the former from being returned. + /// # Errors + /// Returns `ScanError` when loading fails. + pub fn load_from_parser>( + parser: &mut Parser, + ) -> Result, ScanError> { + let mut loader = YamlLoader::default(); + parser.load(&mut loader, true)?; + Ok(loader.docs) + } + + /// Return a reference to the parsed Yaml documents. 
+ #[must_use] + pub fn documents(&self) -> &[Yaml] { + &self.docs + } +} + +// parse f64 as Core schema +// See: https://github.com/chyh1990/yaml-rust/issues/51 +pub(crate) fn parse_f64(v: &str) -> Option { + match v { + ".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY), + "-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY), + ".nan" | "NaN" | ".NAN" => Some(f64::NAN), + _ => v.parse::().ok(), + } +} diff --git a/saphyr/src/yaml.rs b/saphyr/src/yaml.rs index abe2174..acd8f68 100644 --- a/saphyr/src/yaml.rs +++ b/saphyr/src/yaml.rs @@ -2,15 +2,11 @@ #![allow(clippy::module_name_repetitions)] -use std::borrow::Cow; -use std::ops::ControlFlow; -use std::{collections::BTreeMap, convert::TryFrom, mem, ops::Index, ops::IndexMut}; +use std::{convert::TryFrom, ops::Index, ops::IndexMut}; -#[cfg(feature = "encoding")] -use encoding_rs::{Decoder, DecoderResult, Encoding}; use hashlink::LinkedHashMap; -use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser, ScanError, TScalarStyle, Tag}; +use crate::loader::parse_f64; /// A YAML node is stored as this `Yaml` enumeration, which provides an easy way to /// access your YAML document. @@ -60,416 +56,6 @@ pub type Array = Vec; /// The type contained in the `Yaml::Hash` variant. This corresponds to YAML mappings. pub type Hash = LinkedHashMap; -// parse f64 as Core schema -// See: https://github.com/chyh1990/yaml-rust/issues/51 -fn parse_f64(v: &str) -> Option { - match v { - ".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY), - "-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY), - ".nan" | "NaN" | ".NAN" => Some(f64::NAN), - _ => v.parse::().ok(), - } -} - -/// Main structure for quickly parsing YAML. -/// -/// See [`YamlLoader::load_from_str`]. -#[derive(Default)] -pub struct YamlLoader { - /// The different YAML documents that are loaded. 
- docs: Vec, - // states - // (current node, anchor_id) tuple - doc_stack: Vec<(Yaml, usize)>, - key_stack: Vec, - anchor_map: BTreeMap, -} - -impl MarkedEventReceiver for YamlLoader { - fn on_event(&mut self, ev: Event, _: Marker) { - // println!("EV {:?}", ev); - match ev { - Event::DocumentStart | Event::Nothing | Event::StreamStart | Event::StreamEnd => { - // do nothing - } - Event::DocumentEnd => { - match self.doc_stack.len() { - // empty document - 0 => self.docs.push(Yaml::BadValue), - 1 => self.docs.push(self.doc_stack.pop().unwrap().0), - _ => unreachable!(), - } - } - Event::SequenceStart(aid, _) => { - self.doc_stack.push((Yaml::Array(Vec::new()), aid)); - } - Event::SequenceEnd => { - let node = self.doc_stack.pop().unwrap(); - self.insert_new_node(node); - } - Event::MappingStart(aid, _) => { - self.doc_stack.push((Yaml::Hash(Hash::new()), aid)); - self.key_stack.push(Yaml::BadValue); - } - Event::MappingEnd => { - self.key_stack.pop().unwrap(); - let node = self.doc_stack.pop().unwrap(); - self.insert_new_node(node); - } - Event::Scalar(v, style, aid, tag) => { - let node = if style != TScalarStyle::Plain { - Yaml::String(v) - } else if let Some(Tag { - ref handle, - ref suffix, - }) = tag - { - if handle == "tag:yaml.org,2002:" { - match suffix.as_ref() { - "bool" => { - // "true" or "false" - match v.parse::() { - Err(_) => Yaml::BadValue, - Ok(v) => Yaml::Boolean(v), - } - } - "int" => match v.parse::() { - Err(_) => Yaml::BadValue, - Ok(v) => Yaml::Integer(v), - }, - "float" => match parse_f64(&v) { - Some(_) => Yaml::Real(v), - None => Yaml::BadValue, - }, - "null" => match v.as_ref() { - "~" | "null" => Yaml::Null, - _ => Yaml::BadValue, - }, - _ => Yaml::String(v), - } - } else { - Yaml::String(v) - } - } else { - // Datatype is not specified, or unrecognized - Yaml::from_str(&v) - }; - - self.insert_new_node((node, aid)); - } - Event::Alias(id) => { - let n = match self.anchor_map.get(&id) { - Some(v) => v.clone(), - None => Yaml::BadValue, 
- }; - self.insert_new_node((n, 0)); - } - } - // println!("DOC {:?}", self.doc_stack); - } -} - -/// An error that happened when loading a YAML document. -#[derive(Debug)] -pub enum LoadError { - /// An I/O error. - IO(std::io::Error), - /// An error within the scanner. This indicates a malformed YAML input. - Scan(ScanError), - /// A decoding error (e.g.: Invalid UTF_8). - Decode(std::borrow::Cow<'static, str>), -} - -impl From for LoadError { - fn from(error: std::io::Error) -> Self { - LoadError::IO(error) - } -} - -impl std::error::Error for LoadError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - Some(match &self { - LoadError::IO(e) => e, - LoadError::Scan(e) => e, - LoadError::Decode(_) => return None, - }) - } -} - -impl std::fmt::Display for LoadError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - LoadError::IO(e) => e.fmt(f), - LoadError::Scan(e) => e.fmt(f), - LoadError::Decode(e) => e.fmt(f), - } - } -} - -impl YamlLoader { - fn insert_new_node(&mut self, node: (Yaml, usize)) { - // valid anchor id starts from 1 - if node.1 > 0 { - self.anchor_map.insert(node.1, node.0.clone()); - } - if self.doc_stack.is_empty() { - self.doc_stack.push(node); - } else { - let parent = self.doc_stack.last_mut().unwrap(); - match *parent { - (Yaml::Array(ref mut v), _) => v.push(node.0), - (Yaml::Hash(ref mut h), _) => { - let cur_key = self.key_stack.last_mut().unwrap(); - // current node is a key - if cur_key.is_badvalue() { - *cur_key = node.0; - // current node is a value - } else { - let mut newkey = Yaml::BadValue; - mem::swap(&mut newkey, cur_key); - h.insert(newkey, node.0); - } - } - _ => unreachable!(), - } - } - } - - /// Load the given string as a set of YAML documents. - /// - /// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only - /// if all documents are parsed successfully. An error in a latter document prevents the former - /// from being returned. 
- /// # Errors - /// Returns `ScanError` when loading fails. - pub fn load_from_str(source: &str) -> Result, ScanError> { - Self::load_from_iter(source.chars()) - } - - /// Load the contents of the given iterator as a set of YAML documents. - /// - /// The `source` is interpreted as YAML documents and is parsed. Parsing succeeds if and only - /// if all documents are parsed successfully. An error in a latter document prevents the former - /// from being returned. - /// # Errors - /// Returns `ScanError` when loading fails. - pub fn load_from_iter>(source: I) -> Result, ScanError> { - let mut parser = Parser::new(source); - Self::load_from_parser(&mut parser) - } - - /// Load the contents from the specified Parser as a set of YAML documents. - /// - /// Parsing succeeds if and only if all documents are parsed successfully. - /// An error in a latter document prevents the former from being returned. - /// # Errors - /// Returns `ScanError` when loading fails. - pub fn load_from_parser>( - parser: &mut Parser, - ) -> Result, ScanError> { - let mut loader = YamlLoader::default(); - parser.load(&mut loader, true)?; - Ok(loader.docs) - } - - /// Return a reference to the parsed Yaml documents. - #[must_use] - pub fn documents(&self) -> &[Yaml] { - &self.docs - } -} - -/// The signature of the function to call when using [`YAMLDecodingTrap::Call`]. -/// -/// The arguments are as follows: -/// * `malformation_length`: The length of the sequence the decoder failed to decode. -/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after -/// the malformation. -/// * `input_at_malformation`: What the input buffer is at the malformation. -/// This is the buffer starting at the malformation. The first `malformation_length` bytes are -/// the problematic sequence. The following `bytes_read_after_malformation` are already stored -/// in the decoder and will not be re-fed. -/// * `output`: The output string. 
-/// -/// The function must modify `output` as it feels is best. For instance, one could recreate the -/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`] -/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning -/// [`ControlFlow::Break`]. -/// -/// # Returns -/// The function must return [`ControlFlow::Continue`] if decoding may continue or -/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied. -#[cfg(feature = "encoding")] -pub type YAMLDecodingTrapFn = fn( - malformation_length: u8, - bytes_read_after_malformation: u8, - input_at_malformation: &[u8], - output: &mut String, -) -> ControlFlow>; - -/// The behavior [`YamlDecoder`] must have when an decoding error occurs. -#[cfg(feature = "encoding")] -#[derive(Copy, Clone, PartialEq, Eq)] -pub enum YAMLDecodingTrap { - /// Ignore the offending bytes, remove them from the output. - Ignore, - /// Error out. - Strict, - /// Replace them with the Unicode REPLACEMENT CHARACTER. - Replace, - /// Call the user-supplied function upon decoding malformation. - Call(YAMLDecodingTrapFn), -} - -/// `YamlDecoder` is a `YamlLoader` builder that allows you to supply your own encoding error trap. -/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the -/// `encoding_trap` to `encoding::DecoderTrap::Ignore`. -/// ```rust -/// use saphyr::{YamlDecoder, YAMLDecodingTrap}; -/// -/// let string = b"--- -/// a\xa9: 1 -/// b: 2.2 -/// c: [1, 2] -/// "; -/// let out = YamlDecoder::read(string as &[u8]) -/// .encoding_trap(YAMLDecodingTrap::Ignore) -/// .decode() -/// .unwrap(); -/// ``` -#[cfg(feature = "encoding")] -pub struct YamlDecoder { - source: T, - trap: YAMLDecodingTrap, -} - -#[cfg(feature = "encoding")] -impl YamlDecoder { - /// Create a `YamlDecoder` decoding the given source. 
- pub fn read(source: T) -> YamlDecoder { - YamlDecoder { - source, - trap: YAMLDecodingTrap::Strict, - } - } - - /// Set the behavior of the decoder when the encoding is invalid. - pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self { - self.trap = trap; - self - } - - /// Run the decode operation with the source and trap the `YamlDecoder` was built with. - /// - /// # Errors - /// Returns `LoadError` when decoding fails. - pub fn decode(&mut self) -> Result, LoadError> { - let mut buffer = Vec::new(); - self.source.read_to_end(&mut buffer)?; - - // Check if the `encoding` library can detect encoding from the BOM, otherwise use - // `detect_utf16_endianness`. - let (encoding, _) = - Encoding::for_bom(&buffer).unwrap_or_else(|| (detect_utf16_endianness(&buffer), 2)); - let mut decoder = encoding.new_decoder(); - let mut output = String::new(); - - // Decode the input buffer. - decode_loop(&buffer, &mut output, &mut decoder, self.trap)?; - - YamlLoader::load_from_str(&output).map_err(LoadError::Scan) - } -} - -/// Perform a loop of [`Decoder::decode_to_string`], reallocating `output` if needed. -#[cfg(feature = "encoding")] -fn decode_loop( - input: &[u8], - output: &mut String, - decoder: &mut Decoder, - trap: YAMLDecodingTrap, -) -> Result<(), LoadError> { - output.reserve(input.len()); - let mut total_bytes_read = 0; - - loop { - match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true) - { - // If the input is empty, we processed the whole input. - (DecoderResult::InputEmpty, _) => break Ok(()), - // If the output is full, we must reallocate. - (DecoderResult::OutputFull, bytes_read) => { - total_bytes_read += bytes_read; - // The output is already reserved to the size of the input. We slowly resize. Here, - // we're expecting that 10% of bytes will double in size when converting to UTF-8. 
- output.reserve(input.len() / 10); - } - (DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => { - total_bytes_read += bytes_read; - match trap { - // Ignore (skip over) malformed character. - YAMLDecodingTrap::Ignore => {} - // Replace them with the Unicode REPLACEMENT CHARACTER. - YAMLDecodingTrap::Replace => { - output.push('\u{FFFD}'); - } - // Otherwise error, getting as much context as possible. - YAMLDecodingTrap::Strict => { - let malformed_len = malformed_len as usize; - let bytes_after_malformed = bytes_after_malformed as usize; - let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed); - let malformed_sequence = &input[byte_idx..byte_idx + malformed_len]; - - break Err(LoadError::Decode(Cow::Owned(format!( - "Invalid character sequence at {byte_idx}: {malformed_sequence:?}", - )))); - } - YAMLDecodingTrap::Call(callback) => { - let byte_idx = - total_bytes_read - ((malformed_len + bytes_after_malformed) as usize); - let malformed_sequence = - &input[byte_idx..byte_idx + malformed_len as usize]; - if let ControlFlow::Break(error) = callback( - malformed_len, - bytes_after_malformed, - &input[byte_idx..], - output, - ) { - if error.is_empty() { - break Err(LoadError::Decode(Cow::Owned(format!( - "Invalid character sequence at {byte_idx}: {malformed_sequence:?}", - )))); - } - break Err(LoadError::Decode(error)); - } - } - } - } - } - } -} - -/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the -/// bytestream starts with BOM codepoint. -/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since -/// in the general case the bytestream could start with a codepoint that uses both bytes. -/// -/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character. -/// This allows the encoding to be deduced by the pattern of null (#x00) characters. 
-// -/// See spec at -#[cfg(feature = "encoding")] -fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding { - if b.len() > 1 && (b[0] != b[1]) { - if b[0] == 0 { - return encoding_rs::UTF_16BE; - } else if b[1] == 0 { - return encoding_rs::UTF_16LE; - } - } - encoding_rs::UTF_8 -} - macro_rules! define_as ( ($name:ident, $t:ident, $yt:ident) => ( /// Get a copy of the inner object in the YAML enum if it is a `$t`. @@ -623,7 +209,7 @@ impl Yaml { } } -#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))] +#[allow(clippy::should_implement_trait)] impl Yaml { /// Convert a string to a [`Yaml`] node. /// @@ -757,96 +343,3 @@ impl Iterator for YamlIter { self.yaml.next() } } - -#[cfg(test)] -mod test { - use super::{YAMLDecodingTrap, Yaml, YamlDecoder}; - - #[test] - fn test_read_bom() { - let s = b"\xef\xbb\xbf--- -a: 1 -b: 2.2 -c: [1, 2] -"; - let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); - let doc = &out[0]; - assert_eq!(doc["a"].as_i64().unwrap(), 1i64); - assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); - assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); - assert!(doc["d"][0].is_badvalue()); - } - - #[test] - fn test_read_utf16le() { - let s = b"\xff\xfe-\x00-\x00-\x00 -\x00a\x00:\x00 \x001\x00 -\x00b\x00:\x00 \x002\x00.\x002\x00 -\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 -\x00"; - let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); - let doc = &out[0]; - println!("GOT: {doc:?}"); - assert_eq!(doc["a"].as_i64().unwrap(), 1i64); - assert!((doc["b"].as_f64().unwrap() - 2.2f64) <= f64::EPSILON); - assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); - assert!(doc["d"][0].is_badvalue()); - } - - #[test] - fn test_read_utf16be() { - let s = b"\xfe\xff\x00-\x00-\x00-\x00 -\x00a\x00:\x00 \x001\x00 -\x00b\x00:\x00 \x002\x00.\x002\x00 -\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 -"; - let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); - let doc = &out[0]; - println!("GOT: {doc:?}"); - 
assert_eq!(doc["a"].as_i64().unwrap(), 1i64); - assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); - assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); - assert!(doc["d"][0].is_badvalue()); - } - - #[test] - fn test_read_utf16le_nobom() { - let s = b"-\x00-\x00-\x00 -\x00a\x00:\x00 \x001\x00 -\x00b\x00:\x00 \x002\x00.\x002\x00 -\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00 -\x00"; - let out = YamlDecoder::read(s as &[u8]).decode().unwrap(); - let doc = &out[0]; - println!("GOT: {doc:?}"); - assert_eq!(doc["a"].as_i64().unwrap(), 1i64); - assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); - assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); - assert!(doc["d"][0].is_badvalue()); - } - - #[test] - fn test_read_trap() { - let s = b"--- -a\xa9: 1 -b: 2.2 -c: [1, 2] -"; - let out = YamlDecoder::read(s as &[u8]) - .encoding_trap(YAMLDecodingTrap::Ignore) - .decode() - .unwrap(); - let doc = &out[0]; - println!("GOT: {doc:?}"); - assert_eq!(doc["a"].as_i64().unwrap(), 1i64); - assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON); - assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64); - assert!(doc["d"][0].is_badvalue()); - } - - #[test] - fn test_or() { - assert_eq!(Yaml::Null.or(Yaml::Integer(3)), Yaml::Integer(3)); - assert_eq!(Yaml::Integer(3).or(Yaml::Integer(7)), Yaml::Integer(3)); - } -}