Implement load_from_bytes
Also helps in some cases with #142: it handles a BOM at the beginning of the file (the common case), but not the corner case where the BOM is at the start of a document that is not the first one. Closes: #155
This commit is contained in:
parent
c962741719
commit
7705e87ff6
2 changed files with 120 additions and 6 deletions
|
@ -15,6 +15,7 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
arraydeque = "0.5.1"
|
arraydeque = "0.5.1"
|
||||||
|
encoding = "0.2"
|
||||||
hashlink = "0.8"
|
hashlink = "0.8"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|
|
@ -67,6 +67,7 @@ fn parse_f64(v: &str) -> Option<f64> {
|
||||||
/// Main structure for quickly parsing YAML.
|
/// Main structure for quickly parsing YAML.
|
||||||
///
|
///
|
||||||
/// See [`YamlLoader::load_from_str`].
|
/// See [`YamlLoader::load_from_str`].
|
||||||
|
#[derive(Default)]
|
||||||
pub struct YamlLoader {
|
pub struct YamlLoader {
|
||||||
docs: Vec<Yaml>,
|
docs: Vec<Yaml>,
|
||||||
// states
|
// states
|
||||||
|
@ -161,6 +162,19 @@ impl MarkedEventReceiver for YamlLoader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum LoadError {
|
||||||
|
IO(std::io::Error),
|
||||||
|
Scan(ScanError),
|
||||||
|
Decode(std::borrow::Cow<'static, str>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<std::io::Error> for LoadError {
|
||||||
|
fn from(error: std::io::Error) -> Self {
|
||||||
|
LoadError::IO(error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl YamlLoader {
|
impl YamlLoader {
|
||||||
fn insert_new_node(&mut self, node: (Yaml, usize)) {
|
fn insert_new_node(&mut self, node: (Yaml, usize)) {
|
||||||
// valid anchor id starts from 1
|
// valid anchor id starts from 1
|
||||||
|
@ -205,16 +219,47 @@ impl YamlLoader {
|
||||||
/// if all documents are parsed successfully. An error in a latter document prevents the former
|
/// if all documents are parsed successfully. An error in a latter document prevents the former
|
||||||
/// from being returned.
|
/// from being returned.
|
||||||
pub fn load_from_iter<I: Iterator<Item = char>>(source: I) -> Result<Vec<Yaml>, ScanError> {
|
pub fn load_from_iter<I: Iterator<Item = char>>(source: I) -> Result<Vec<Yaml>, ScanError> {
|
||||||
let mut loader = YamlLoader {
|
let mut loader = YamlLoader::default();
|
||||||
docs: Vec::new(),
|
|
||||||
doc_stack: Vec::new(),
|
|
||||||
key_stack: Vec::new(),
|
|
||||||
anchor_map: BTreeMap::new(),
|
|
||||||
};
|
|
||||||
let mut parser = Parser::new(source);
|
let mut parser = Parser::new(source);
|
||||||
parser.load(&mut loader, true)?;
|
parser.load(&mut loader, true)?;
|
||||||
Ok(loader.docs)
|
Ok(loader.docs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn load_from_bytes(mut source: impl std::io::Read) -> Result<Vec<Yaml>, LoadError> {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
source.read_to_end(&mut buffer)?;
|
||||||
|
|
||||||
|
// Decodes the input buffer using either UTF-8, UTF-16LE or UTF-16BE depending on the BOM codepoint.
|
||||||
|
// If the buffer doesn't start with a BOM codepoint, it will use a fallback encoding obtained by
|
||||||
|
// detect_utf16_endianness.
|
||||||
|
let (res, _) = encoding::types::decode(
|
||||||
|
&buffer,
|
||||||
|
encoding::DecoderTrap::Strict,
|
||||||
|
detect_utf16_endianness(&buffer),
|
||||||
|
);
|
||||||
|
let s = res.map_err(LoadError::Decode)?;
|
||||||
|
YamlLoader::load_from_str(&s).map_err(LoadError::Scan)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the
|
||||||
|
/// bytestream starts with BOM codepoint.
|
||||||
|
/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since
|
||||||
|
/// in the general case the bytestream could start with a codepoint that uses both bytes.
|
||||||
|
///
|
||||||
|
/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
|
||||||
|
/// This allows the encoding to be deduced by the pattern of null (#x00) characters.
|
||||||
|
//
|
||||||
|
/// See spec at <https://yaml.org/spec/1.2/spec.html#id2771184>
|
||||||
|
fn detect_utf16_endianness(b: &[u8]) -> encoding::types::EncodingRef {
|
||||||
|
if b.len() > 1 && (b[0] != b[1]) {
|
||||||
|
if b[0] == 0 {
|
||||||
|
return encoding::all::UTF_16BE;
|
||||||
|
} else if b[1] == 0 {
|
||||||
|
return encoding::all::UTF_16LE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
encoding::all::UTF_8
|
||||||
}
|
}
|
||||||
|
|
||||||
macro_rules! define_as (
|
macro_rules! define_as (
|
||||||
|
@ -410,3 +455,71 @@ impl Iterator for YamlIter {
|
||||||
self.yaml.next()
|
self.yaml.next()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use crate::YamlLoader;

    /// Loads `input` with `YamlLoader::load_from_bytes` and checks that the
    /// first document is the sample mapping `{a: 1, b: 2.2, c: [1, 2]}`.
    ///
    /// Every encoding test feeds the same document, so the assertions live
    /// here to keep them identical across tests (in particular the `.abs()`
    /// in the float comparison, without which any value below 2.2 passes).
    fn assert_loads_sample(input: &[u8]) {
        let out = YamlLoader::load_from_bytes(input).unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-8 input preceded by a UTF-8 BOM.
    #[test]
    fn test_read_bom() {
        assert_loads_sample(
            b"\xef\xbb\xbf---
a: 1
b: 2.2
c: [1, 2]
",
        );
    }

    /// UTF-16LE input preceded by a little-endian BOM.
    #[test]
    fn test_read_utf16le() {
        assert_loads_sample(
            b"\xff\xfe-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00",
        );
    }

    /// UTF-16BE input preceded by a big-endian BOM.
    #[test]
    fn test_read_utf16be() {
        assert_loads_sample(
            b"\xfe\xff\x00-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
",
        );
    }

    /// UTF-16LE input without a BOM; the encoding must be inferred from the
    /// position of the null byte in the first character.
    #[test]
    fn test_read_utf16le_nobom() {
        assert_loads_sample(
            b"-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00",
        );
    }
}
|
||||||
|
|
Loading…
Reference in a new issue