Add StrInput.

This commit is contained in:
Ethiraric 2024-07-05 21:38:34 +02:00
parent 0e9cee18f2
commit db4f26da42
4 changed files with 256 additions and 5 deletions

View file

@ -24,4 +24,4 @@ ethi_build_dump:
ethi_compare: ethi_build_dump
cg_file=`\ls -1t callgrind.out.* | head -n1` && callgrind_annotate $cg_file --auto=no --threshold=99.99 > cg/WORK && rm $cg_file
callgrind_differ cg/00{05,06,07,08,09,10,11,12}* cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a
callgrind_differ `\ls cg/0*` cg/WORK --show percentagediff,ircount --sort-by=-first-ir -a

View file

@ -1,4 +1,5 @@
pub mod buffered;
pub mod str;
#[allow(clippy::module_name_repetitions)]
pub use buffered::BufferedInput;

251
parser/src/input/str.rs Normal file
View file

@ -0,0 +1,251 @@
use crate::{char_traits::is_blank_or_breakz, input::Input};
#[allow(clippy::module_name_repetitions)]
pub struct StrInput<'a> {
/// The input str buffer.
buffer: &'a str,
/// The number of characters (**not** bytes) in the buffer.
n_chars: usize,
/// The number of characters we have looked ahead.
///
/// We must however keep track of how many characters the parser asked us to look ahead for so
/// that we can return the correct value in [`Self::buflen`].
lookahead: usize,
}
impl<'a> StrInput<'a> {
/// Create a new [`StrInput`] with the given str.
pub fn new(input: &'a str) -> Self {
Self {
buffer: input,
n_chars: input.chars().count(),
lookahead: 0,
}
}
}
impl<'a> Input for StrInput<'a> {
#[inline]
fn lookahead(&mut self, x: usize) {
// We already have all characters that we need.
// We cannot add '\0's to the buffer should we prematurely reach EOF.
// Returning '\0's befalls the character-retrieving functions.
self.lookahead = self.lookahead.max(x);
}
#[inline]
fn buflen(&self) -> usize {
self.lookahead
}
#[inline]
fn bufmaxlen(&self) -> usize {
BUFFER_LEN
}
fn buf_is_empty(&self) -> bool {
self.buflen() == 0
}
#[inline]
fn raw_read_ch(&mut self) -> char {
let mut chars = self.buffer.chars();
if let Some(c) = chars.next() {
self.buffer = chars.as_str();
self.n_chars -= 1;
c
} else {
'\0'
}
}
#[inline]
fn push_back(&mut self, c: char) {
let n_bytes = c.len_utf8();
// SAFETY: The character that gets pushed back is guaranteed to be the one that is
// immediately preceding our buffer. We can compute the length of the character and move
// our buffer back that many bytes.
unsafe {
let buffer_byte_len = self.buffer.len();
let mut now_ptr = self.buffer.as_ptr();
now_ptr = now_ptr.wrapping_sub(n_bytes);
self.buffer = std::str::from_utf8_unchecked(std::slice::from_raw_parts(
now_ptr,
buffer_byte_len + n_bytes,
));
}
}
#[inline]
fn skip(&mut self) {
let mut chars = self.buffer.chars();
if chars.next().is_some() {
self.buffer = chars.as_str();
self.n_chars -= 1;
}
}
#[inline]
fn skip_n(&mut self, count: usize) {
let mut chars = self.buffer.chars();
for _ in 0..count {
if chars.next().is_none() {
break;
}
}
self.buffer = chars.as_str();
self.n_chars = self.n_chars.saturating_sub(count);
}
#[inline]
fn peek(&self) -> char {
self.buffer.chars().next().unwrap_or('\0')
}
#[inline]
fn peek_nth(&self, n: usize) -> char {
let mut chars = self.buffer.chars();
for _ in 0..n {
if chars.next().is_none() {
return '\0';
}
}
chars.next().unwrap_or('\0')
}
#[inline]
fn look_ch(&mut self) -> char {
self.lookahead(1);
self.peek()
}
#[inline]
fn next_char_is(&self, c: char) -> bool {
self.peek() == c
}
#[inline]
fn nth_char_is(&self, n: usize, c: char) -> bool {
self.peek_nth(n) == c
}
#[inline]
fn next_2_are(&self, c1: char, c2: char) -> bool {
let mut chars = self.buffer.chars();
chars.next().is_some_and(|c| c == c1) && chars.next().is_some_and(|c| c == c2)
}
#[inline]
fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
let mut chars = self.buffer.chars();
chars.next().is_some_and(|c| c == c1)
&& chars.next().is_some_and(|c| c == c2)
&& chars.next().is_some_and(|c| c == c3)
}
#[inline]
fn next_is_document_indicator(&self) -> bool {
if self.buffer.len() < 3 {
false
} else {
// Since all characters we look for are ascii, we can directly use the byte API of str.
(if self.buffer.len() == 3 {
true
} else {
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
}) && (self.buffer.starts_with("...") || self.buffer.starts_with("---"))
}
}
#[inline]
fn next_is_document_start(&self) -> bool {
if self.buffer.len() < 3 {
false
} else {
// Since all characters we look for are ascii, we can directly use the byte API of str.
(if self.buffer.len() == 3 {
true
} else {
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
}) && self.buffer.starts_with("---")
}
}
#[inline]
fn next_is_document_end(&self) -> bool {
if self.buffer.len() < 3 {
false
} else {
// Since all characters we look for are ascii, we can directly use the byte API of str.
(if self.buffer.len() == 3 {
true
} else {
is_blank_or_breakz(self.buffer.as_bytes()[3] as char)
}) && self.buffer.starts_with("...")
}
}
}
/// The buffer size we return to the scanner.
///
/// This does not correspond to any allocated buffer size. In practice, the scanner can withdraw
/// any character they want. If it's within the input buffer, the given character is returned,
/// otherwise `\0` is returned.
///
/// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size
/// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It
/// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return
/// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner
/// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its
/// call.
///
/// This create a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is
/// called with, which in turns dictates what [`buflen`] returns. In order to avoid breaking any
/// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line
/// at a time, should fit what we expect to be a good balance between memory consumption and what
/// we expect the maximum line length to be.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;
#[cfg(test)]
mod test {
use crate::input::Input;
use super::StrInput;
#[test]
pub fn is_document_start() {
let input = StrInput::new("---\n");
assert!(input.next_is_document_start());
assert!(input.next_is_document_indicator());
let input = StrInput::new("---");
assert!(input.next_is_document_start());
assert!(input.next_is_document_indicator());
let input = StrInput::new("...\n");
assert!(!input.next_is_document_start());
assert!(input.next_is_document_indicator());
let input = StrInput::new("--- ");
assert!(input.next_is_document_start());
assert!(input.next_is_document_indicator());
}
#[test]
pub fn is_document_end() {
let input = StrInput::new("...\n");
assert!(input.next_is_document_end());
assert!(input.next_is_document_indicator());
let input = StrInput::new("...");
assert!(input.next_is_document_end());
assert!(input.next_is_document_indicator());
let input = StrInput::new("---\n");
assert!(!input.next_is_document_end());
assert!(input.next_is_document_indicator());
let input = StrInput::new("... ");
assert!(input.next_is_document_end());
assert!(input.next_is_document_indicator());
}
}

View file

@ -5,9 +5,8 @@
//! YAML objects.
use crate::{
input::Input,
input::{str::StrInput, Input},
scanner::{Marker, ScanError, Scanner, TScalarStyle, Token, TokenType},
BufferedInput,
};
use std::collections::HashMap;
@ -229,11 +228,11 @@ impl<R: EventReceiver> MarkedEventReceiver for R {
/// A convenience alias for a `Result` of a parser event.
pub type ParseResult = Result<(Event, Marker), ScanError>;
impl<'a> Parser<BufferedInput<std::str::Chars<'a>>> {
impl<'a> Parser<StrInput<'a>> {
/// Create a new instance of a parser from a &str.
#[must_use]
pub fn new_from_str(value: &'a str) -> Self {
Parser::new(BufferedInput::new(value.chars()))
Parser::new(StrInput::new(value))
}
}