Remove all unsafe code.
* Use `str::strip_prefix` to avoid using `str::from_utf8_unchecked`
* Avoid most uses of the unsafe `extend_left` function
* Remove `Input::push_back` and the remaining unsafe code
This commit is contained in:
parent
6c57b5b5e4
commit
e215f546f3
4 changed files with 55 additions and 103 deletions
|
@ -45,16 +45,18 @@ pub trait Input {
|
||||||
|
|
||||||
/// Read a character from the input stream and return it directly.
|
/// Read a character from the input stream and return it directly.
|
||||||
///
|
///
|
||||||
/// The internal buffer (is any) is bypassed.
|
/// The internal buffer (if any) is bypassed.
|
||||||
#[must_use]
|
#[must_use]
|
||||||
fn raw_read_ch(&mut self) -> char;
|
fn raw_read_ch(&mut self) -> char;
|
||||||
|
|
||||||
/// Put a character back in the buffer.
|
/// Read a non-breakz character from the input stream and return it directly.
|
||||||
///
|
///
|
||||||
/// This function is only called when we read one too many characters and the pushed back
|
/// The internal buffer (if any) is bypassed.
|
||||||
/// character is exactly the last character that was read. This function will not be called
|
///
|
||||||
/// multiple times consecutively.
|
/// If the next character is a breakz, it is either not consumed or placed into the buffer (if
|
||||||
fn push_back(&mut self, c: char);
|
/// any).
|
||||||
|
#[must_use]
|
||||||
|
fn raw_read_non_breakz_ch(&mut self) -> Option<char>;
|
||||||
|
|
||||||
/// Consume the next character.
|
/// Consume the next character.
|
||||||
fn skip(&mut self);
|
fn skip(&mut self);
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
use crate::char_traits::is_breakz;
|
||||||
use crate::input::Input;
|
use crate::input::Input;
|
||||||
|
|
||||||
use arraydeque::ArrayDeque;
|
use arraydeque::ArrayDeque;
|
||||||
|
@ -68,8 +69,17 @@ impl<T: Iterator<Item = char>> Input for BufferedInput<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn push_back(&mut self, c: char) {
|
fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
|
||||||
|
if let Some(c) = self.input.next() {
|
||||||
|
if is_breakz(c) {
|
||||||
self.buffer.push_back(c).unwrap();
|
self.buffer.push_back(c).unwrap();
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(c)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
|
|
@ -61,10 +61,17 @@ impl<'a> Input for StrInput<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn push_back(&mut self, c: char) {
|
fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
|
||||||
// SAFETY: The preconditions of this function is that the character we are given is the one
|
if let Some((c, sub_str)) = split_first_char(self.buffer) {
|
||||||
// immediately preceding `self.buffer`.
|
if is_breakz(c) {
|
||||||
self.buffer = unsafe { put_back_in_str(self.buffer, c) };
|
None
|
||||||
|
} else {
|
||||||
|
self.buffer = sub_str;
|
||||||
|
Some(c)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -177,45 +184,34 @@ impl<'a> Input for StrInput<'a> {
|
||||||
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
|
fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
|
||||||
assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
|
assert!(!matches!(skip_tabs, SkipTabs::Result(..)));
|
||||||
|
|
||||||
let mut new_str = self.buffer.as_bytes();
|
let mut new_str = self.buffer;
|
||||||
let mut has_yaml_ws = false;
|
let mut has_yaml_ws = false;
|
||||||
let mut encountered_tab = false;
|
let mut encountered_tab = false;
|
||||||
|
|
||||||
// This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
|
// This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
|
||||||
// while keeping track of whether we encountered spaces and/or tabs.
|
// while keeping track of whether we encountered spaces and/or tabs.
|
||||||
if skip_tabs == SkipTabs::Yes {
|
if skip_tabs == SkipTabs::Yes {
|
||||||
let mut i = 0;
|
loop {
|
||||||
while i < new_str.len() {
|
if let Some(sub_str) = new_str.strip_prefix(' ') {
|
||||||
if new_str[i] == b' ' {
|
|
||||||
has_yaml_ws = true;
|
has_yaml_ws = true;
|
||||||
} else if new_str[i] == b'\t' {
|
new_str = sub_str;
|
||||||
|
} else if let Some(sub_str) = new_str.strip_prefix('\t') {
|
||||||
encountered_tab = true;
|
encountered_tab = true;
|
||||||
|
new_str = sub_str;
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
i += 1;
|
|
||||||
}
|
}
|
||||||
new_str = &new_str[i..];
|
|
||||||
} else {
|
} else {
|
||||||
let mut i = 0;
|
while let Some(sub_str) = new_str.strip_prefix(' ') {
|
||||||
while i < new_str.len() {
|
has_yaml_ws = true;
|
||||||
if new_str[i] != b' ' {
|
new_str = sub_str;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
has_yaml_ws = i != 0;
|
|
||||||
new_str = &new_str[i..];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// All characters consumed were ascii. We can use the byte length difference to count the
|
// All characters consumed were ascii. We can use the byte length difference to count the
|
||||||
// number of whitespace ignored.
|
// number of whitespace ignored.
|
||||||
let mut chars_consumed = self.buffer.len() - new_str.len();
|
let mut chars_consumed = self.buffer.len() - new_str.len();
|
||||||
// SAFETY: We only trimmed spaces and tabs, both of which are bytes. This means we won't
|
|
||||||
// start the string outside of a valid UTF-8 boundary.
|
|
||||||
// It is assumed the input string is valid UTF-8, so the rest of the string is assumed to
|
|
||||||
// be valid UTF-8 as well.
|
|
||||||
let mut new_str = unsafe { std::str::from_utf8_unchecked(new_str) };
|
|
||||||
|
|
||||||
if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
|
if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
|
||||||
if !encountered_tab && !has_yaml_ws {
|
if !encountered_tab && !has_yaml_ws {
|
||||||
|
@ -225,24 +221,14 @@ impl<'a> Input for StrInput<'a> {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut chars = new_str.chars();
|
// Skip remaining characters until we hit a breakz.
|
||||||
let mut found_breakz = false;
|
while let Some((c, sub_str)) = split_first_char(new_str) {
|
||||||
// Iterate over all remaining chars until we hit a breakz.
|
|
||||||
for c in chars.by_ref() {
|
|
||||||
if is_breakz(c) {
|
if is_breakz(c) {
|
||||||
found_breakz = true;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
new_str = sub_str;
|
||||||
chars_consumed += 1;
|
chars_consumed += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
new_str = if found_breakz {
|
|
||||||
// SAFETY: The last character we pulled out of the `chars()` is a breakz, one of
|
|
||||||
// '\0', '\r', '\n'. All 3 of them are 1-byte long.
|
|
||||||
unsafe { extend_left(chars.as_str(), 1) }
|
|
||||||
} else {
|
|
||||||
chars.as_str()
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.buffer = new_str;
|
self.buffer = new_str;
|
||||||
|
@ -325,27 +311,19 @@ impl<'a> Input for StrInput<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn skip_while_non_breakz(&mut self) -> usize {
|
fn skip_while_non_breakz(&mut self) -> usize {
|
||||||
let mut found_breakz = false;
|
let mut new_str = self.buffer;
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
|
|
||||||
// Skip over all non-breaks.
|
// Skip over all non-breaks.
|
||||||
let mut chars = self.buffer.chars();
|
while let Some((c, sub_str)) = split_first_char(new_str) {
|
||||||
for c in chars.by_ref() {
|
|
||||||
if is_breakz(c) {
|
if is_breakz(c) {
|
||||||
found_breakz = true;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
new_str = sub_str;
|
||||||
count += 1;
|
count += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.buffer = if found_breakz {
|
self.buffer = new_str;
|
||||||
// If we read a breakz, we need to put it back to the buffer.
|
|
||||||
// SAFETY: The last character we extracted is either a '\n', '\r' or '\0', all of which
|
|
||||||
// are 1-byte long.
|
|
||||||
unsafe { extend_left(chars.as_str(), 1) }
|
|
||||||
} else {
|
|
||||||
chars.as_str()
|
|
||||||
};
|
|
||||||
|
|
||||||
count
|
count
|
||||||
}
|
}
|
||||||
|
@ -416,40 +394,18 @@ impl<'a> Input for StrInput<'a> {
|
||||||
/// [`buflen`]: `StrInput::buflen`
|
/// [`buflen`]: `StrInput::buflen`
|
||||||
const BUFFER_LEN: usize = 128;
|
const BUFFER_LEN: usize = 128;
|
||||||
|
|
||||||
/// Fake prepending a character to the given string.
|
/// Splits the first character of the given string and returns it along with the rest of the
|
||||||
///
|
/// string.
|
||||||
/// The character given as parameter MUST be the one that precedes the given string.
|
|
||||||
///
|
|
||||||
/// # Exmaple
|
|
||||||
/// ```ignore
|
|
||||||
/// let s1 = "foo";
|
|
||||||
/// let s2 = &s1[1..];
|
|
||||||
/// let s3 = put_back_in_str(s2, 'f'); // OK, 'f' is the character immediately preceding
|
|
||||||
/// // let s3 = put_back_in_str('g'); // Not allowed
|
|
||||||
/// assert_eq!(s1, s3);
|
|
||||||
/// assert_eq!(s1.as_ptr(), s3.as_ptr());
|
|
||||||
/// ```
|
|
||||||
unsafe fn put_back_in_str(s: &str, c: char) -> &str {
|
|
||||||
let n_bytes = c.len_utf8();
|
|
||||||
|
|
||||||
// SAFETY: The character that gets pushed back is guaranteed to be the one that is
|
|
||||||
// immediately preceding our buffer. We can compute the length of the character and move
|
|
||||||
// our buffer back that many bytes.
|
|
||||||
extend_left(s, n_bytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extend the string by moving the start pointer to the left by `n` bytes.
|
|
||||||
#[inline]
|
#[inline]
|
||||||
unsafe fn extend_left(s: &str, n: usize) -> &str {
|
fn split_first_char(s: &str) -> Option<(char, &str)> {
|
||||||
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
|
let mut chars = s.chars();
|
||||||
s.as_ptr().wrapping_sub(n),
|
let c = chars.next()?;
|
||||||
s.len() + n,
|
Some((c, chars.as_str()))
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use crate::input::{str::put_back_in_str, Input};
|
use crate::input::Input;
|
||||||
|
|
||||||
use super::StrInput;
|
use super::StrInput;
|
||||||
|
|
||||||
|
@ -484,13 +440,4 @@ mod test {
|
||||||
assert!(input.next_is_document_end());
|
assert!(input.next_is_document_end());
|
||||||
assert!(input.next_is_document_indicator());
|
assert!(input.next_is_document_indicator());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
pub fn put_back_in_str_example() {
|
|
||||||
let s1 = "foo";
|
|
||||||
let s2 = &s1[1..];
|
|
||||||
let s3 = unsafe { put_back_in_str(s2, 'f') }; // OK, 'f' is the character immediately preceding
|
|
||||||
assert_eq!(s1, s3);
|
|
||||||
assert_eq!(s1.as_ptr(), s3.as_ptr());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1770,17 +1770,10 @@ impl<T: Input> Scanner<T> {
|
||||||
// characters are appended here as their real size (1B for ascii, or up to 4 bytes for
|
// characters are appended here as their real size (1B for ascii, or up to 4 bytes for
|
||||||
// UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
|
// UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
|
||||||
// (using `String::push_str`).
|
// (using `String::push_str`).
|
||||||
let mut c = self.input.raw_read_ch();
|
while let Some(c) = self.input.raw_read_non_breakz_ch() {
|
||||||
while !is_breakz(c) {
|
|
||||||
line_buffer.push(c);
|
line_buffer.push(c);
|
||||||
c = self.input.raw_read_ch();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Our last character read is stored in `c`. It is either an EOF or a break. In any
|
|
||||||
// case, we need to push it back into `self.buffer` so it may be properly read
|
|
||||||
// after. We must not insert it in `string`.
|
|
||||||
self.input.push_back(c);
|
|
||||||
|
|
||||||
// We need to manually update our position; we haven't called a `skip` function.
|
// We need to manually update our position; we haven't called a `skip` function.
|
||||||
self.mark.col += line_buffer.len();
|
self.mark.col += line_buffer.len();
|
||||||
self.mark.index += line_buffer.len();
|
self.mark.index += line_buffer.len();
|
||||||
|
|
Loading…
Reference in a new issue