Add StrInput
.
This commit is contained in:
parent
0e9cee18f2
commit
db4f26da42
4 changed files with 256 additions and 5 deletions
|
@ -24,4 +24,4 @@ ethi_build_dump:
|
||||||
|
|
||||||
ethi_compare: ethi_build_dump
|
ethi_compare: ethi_build_dump
|
||||||
cg_file=`\ls -1t callgrind.out.* | head -n1` && callgrind_annotate $cg_file --auto=no --threshold=99.99 > cg/WORK && rm $cg_file
|
cg_file=`\ls -1t callgrind.out.* | head -n1` && callgrind_annotate $cg_file --auto=no --threshold=99.99 > cg/WORK && rm $cg_file
|
||||||
callgrind_differ cg/00{05,06,07,08,09,10,11,12}* cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
|
callgrind_differ `\ls cg/0*` cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
pub mod buffered;
|
pub mod buffered;
|
||||||
|
pub mod str;
|
||||||
|
|
||||||
#[allow(clippy::module_name_repetitions)]
|
#[allow(clippy::module_name_repetitions)]
|
||||||
pub use buffered::BufferedInput;
|
pub use buffered::BufferedInput;
|
||||||
|
|
251
parser/src/input/str.rs
Normal file
251
parser/src/input/str.rs
Normal file
|
@ -0,0 +1,251 @@
|
||||||
|
use crate::{char_traits::is_blank_or_breakz, input::Input};
|
||||||
|
|
||||||
|
/// An [`Input`] implementation that reads directly from a borrowed `&str`.
///
/// No characters are copied: consuming a character simply shrinks the borrowed slice from the
/// front.
// The type name deliberately repeats the name of the enclosing `str` module.
#[allow(clippy::module_name_repetitions)]
#[derive(Debug)]
pub struct StrInput<'a> {
    /// The part of the input str that has not been consumed yet.
    buffer: &'a str,
    /// The number of characters (**not** bytes) in the buffer.
    n_chars: usize,
    /// The largest number of characters the parser has asked us to look ahead for.
    ///
    /// Looking ahead requires no work since the whole input is already available, but the value
    /// must be remembered so that [`Self::buflen`] can report it back.
    lookahead: usize,
}
|
||||||
|
|
||||||
|
impl<'a> StrInput<'a> {
|
||||||
|
/// Create a new [`StrInput`] with the given str.
|
||||||
|
pub fn new(input: &'a str) -> Self {
|
||||||
|
Self {
|
||||||
|
buffer: input,
|
||||||
|
n_chars: input.chars().count(),
|
||||||
|
lookahead: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Input for StrInput<'a> {
|
||||||
|
#[inline]
|
||||||
|
fn lookahead(&mut self, x: usize) {
|
||||||
|
// We already have all characters that we need.
|
||||||
|
// We cannot add '\0's to the buffer should we prematurely reach EOF.
|
||||||
|
// Returning '\0's befalls the character-retrieving functions.
|
||||||
|
self.lookahead = self.lookahead.max(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn buflen(&self) -> usize {
|
||||||
|
self.lookahead
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn bufmaxlen(&self) -> usize {
|
||||||
|
BUFFER_LEN
|
||||||
|
}
|
||||||
|
|
||||||
|
fn buf_is_empty(&self) -> bool {
|
||||||
|
self.buflen() == 0
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn raw_read_ch(&mut self) -> char {
|
||||||
|
let mut chars = self.buffer.chars();
|
||||||
|
if let Some(c) = chars.next() {
|
||||||
|
self.buffer = chars.as_str();
|
||||||
|
self.n_chars -= 1;
|
||||||
|
c
|
||||||
|
} else {
|
||||||
|
'\0'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn push_back(&mut self, c: char) {
|
||||||
|
let n_bytes = c.len_utf8();
|
||||||
|
|
||||||
|
// SAFETY: The character that gets pushed back is guaranteed to be the one that is
|
||||||
|
// immediately preceding our buffer. We can compute the length of the character and move
|
||||||
|
// our buffer back that many bytes.
|
||||||
|
unsafe {
|
||||||
|
let buffer_byte_len = self.buffer.len();
|
||||||
|
let mut now_ptr = self.buffer.as_ptr();
|
||||||
|
now_ptr = now_ptr.wrapping_sub(n_bytes);
|
||||||
|
self.buffer = std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
||||||
|
now_ptr,
|
||||||
|
buffer_byte_len + n_bytes,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn skip(&mut self) {
|
||||||
|
let mut chars = self.buffer.chars();
|
||||||
|
if chars.next().is_some() {
|
||||||
|
self.buffer = chars.as_str();
|
||||||
|
self.n_chars -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn skip_n(&mut self, count: usize) {
|
||||||
|
let mut chars = self.buffer.chars();
|
||||||
|
for _ in 0..count {
|
||||||
|
if chars.next().is_none() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.buffer = chars.as_str();
|
||||||
|
self.n_chars = self.n_chars.saturating_sub(count);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn peek(&self) -> char {
|
||||||
|
self.buffer.chars().next().unwrap_or('\0')
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn peek_nth(&self, n: usize) -> char {
|
||||||
|
let mut chars = self.buffer.chars();
|
||||||
|
for _ in 0..n {
|
||||||
|
if chars.next().is_none() {
|
||||||
|
return '\0';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
chars.next().unwrap_or('\0')
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn look_ch(&mut self) -> char {
|
||||||
|
self.lookahead(1);
|
||||||
|
self.peek()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_char_is(&self, c: char) -> bool {
|
||||||
|
self.peek() == c
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn nth_char_is(&self, n: usize, c: char) -> bool {
|
||||||
|
self.peek_nth(n) == c
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_2_are(&self, c1: char, c2: char) -> bool {
|
||||||
|
let mut chars = self.buffer.chars();
|
||||||
|
chars.next().is_some_and(|c| c == c1) && chars.next().is_some_and(|c| c == c2)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
|
||||||
|
let mut chars = self.buffer.chars();
|
||||||
|
chars.next().is_some_and(|c| c == c1)
|
||||||
|
&& chars.next().is_some_and(|c| c == c2)
|
||||||
|
&& chars.next().is_some_and(|c| c == c3)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_is_document_indicator(&self) -> bool {
|
||||||
|
if self.buffer.len() < 3 {
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
||||||
|
(if self.buffer.len() == 3 {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
|
||||||
|
}) && (self.buffer.starts_with("...") || self.buffer.starts_with("---"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_is_document_start(&self) -> bool {
|
||||||
|
if self.buffer.len() < 3 {
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
||||||
|
(if self.buffer.len() == 3 {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
|
||||||
|
}) && self.buffer.starts_with("---")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next_is_document_end(&self) -> bool {
|
||||||
|
if self.buffer.len() < 3 {
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
// Since all characters we look for are ascii, we can directly use the byte API of str.
|
||||||
|
(if self.buffer.len() == 3 {
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
|
||||||
|
}) && self.buffer.starts_with("...")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The buffer size we return to the scanner.
|
||||||
|
///
|
||||||
|
/// This does not correspond to any allocated buffer size. In practice, the scanner can withdraw
|
||||||
|
/// any character they want. If it's within the input buffer, the given character is returned,
|
||||||
|
/// otherwise `\0` is returned.
|
||||||
|
///
|
||||||
|
/// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size
|
||||||
|
/// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It
|
||||||
|
/// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return
|
||||||
|
/// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner
|
||||||
|
/// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its
|
||||||
|
/// call.
|
||||||
|
///
|
||||||
|
/// This creates a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is
|
||||||
|
/// called with, which in turn dictates what [`buflen`] returns. In order to avoid breaking any
|
||||||
|
/// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line
|
||||||
|
/// at a time, should fit what we expect to be a good balance between memory consumption and what
|
||||||
|
/// we expect the maximum line length to be.
|
||||||
|
///
|
||||||
|
/// [`lookahead`]: `StrInput::lookahead`
|
||||||
|
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
|
||||||
|
/// [`buflen`]: `StrInput::buflen`
|
||||||
|
const BUFFER_LEN: usize = 128;
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use crate::input::Input;

    use super::StrInput;

    /// `---` is a document start (and a document indicator) only when it is followed by a
    /// blank, a line break or the end of the input.
    #[test]
    pub fn is_document_start() {
        // (input, expected `next_is_document_start`, expected `next_is_document_indicator`)
        let cases = [
            ("---\n", true, true),
            ("---", true, true),
            ("...\n", false, true),
            ("--- ", true, true),
            // Rejected: indicator glued to further content, or not enough input.
            ("---a", false, false),
            ("--", false, false),
            ("", false, false),
        ];
        for (text, start, indicator) in cases {
            let input = StrInput::new(text);
            assert_eq!(
                input.next_is_document_start(),
                start,
                "next_is_document_start for {text:?}"
            );
            assert_eq!(
                input.next_is_document_indicator(),
                indicator,
                "next_is_document_indicator for {text:?}"
            );
        }
    }

    /// `...` is a document end (and a document indicator) only when it is followed by a
    /// blank, a line break or the end of the input.
    #[test]
    pub fn is_document_end() {
        // (input, expected `next_is_document_end`, expected `next_is_document_indicator`)
        let cases = [
            ("...\n", true, true),
            ("...", true, true),
            ("---\n", false, true),
            ("... ", true, true),
            // Rejected: indicator glued to further content, or not enough input.
            ("...a", false, false),
            ("..", false, false),
            ("", false, false),
        ];
        for (text, end, indicator) in cases {
            let input = StrInput::new(text);
            assert_eq!(
                input.next_is_document_end(),
                end,
                "next_is_document_end for {text:?}"
            );
            assert_eq!(
                input.next_is_document_indicator(),
                indicator,
                "next_is_document_indicator for {text:?}"
            );
        }
    }
}
|
|
@ -5,9 +5,8 @@
|
||||||
//! YAML objects.
|
//! YAML objects.
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
input::Input,
|
input::{str::StrInput, Input},
|
||||||
scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
|
scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
|
||||||
BufferedInput,
|
|
||||||
};
|
};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
@ -229,11 +228,11 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
|
||||||
/// A convenience alias for a `Result` of a parser event.
|
/// A convenience alias for a `Result` of a parser event.
|
||||||
pub type ParseResult = Result<(Event, Marker), ScanError>;
|
pub type ParseResult = Result<(Event, Marker), ScanError>;
|
||||||
|
|
||||||
impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
|
impl<'a> Parser<StrInput<'a>> {
|
||||||
/// Create a new instance of a parser from a &str.
|
/// Create a new instance of a parser from a &str.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn new_from_str(value: &'a str) -> Self {
|
pub fn new_from_str(value: &'a str) -> Self {
|
||||||
Parser::new(BufferedInput::new(value.chars()))
|
Parser::new(StrInput::new(value))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue