From cfbf287b3df89693a860ea7e2e7e472ecf1c4be3 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Wed, 24 Jan 2024 21:45:18 +0100 Subject: [PATCH] Buffer block scalar lines. Instead of doing a loop that goes: * fetch from input stream * push char into string Make a loop that fetches characters while they're not a breakz and _then_ extend the string. This avoids a bunch of reallocations. --- saphyr/src/scanner.rs | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/saphyr/src/scanner.rs b/saphyr/src/scanner.rs index c79ee82..900b243 100644 --- a/saphyr/src/scanner.rs +++ b/saphyr/src/scanner.rs @@ -503,6 +503,17 @@ impl<T: Iterator<Item = char>> Scanner<T> { self.ch() } + /// Read a character from the input stream, place it in the buffer and return it. + /// + /// No character is consumed. The character returned is the one at the back of the buffer (the + /// one we just read from the input stream). + #[inline] + fn read_ch(&mut self) -> char { + let c = self.rdr.next().unwrap_or('\0'); + self.buffer.push_back(c); + c + } + /// Return whether the next character is `c`. #[inline] fn ch_is(&self, c: char) -> bool { @@ -1612,24 +1623,25 @@ impl<T: Iterator<Item = char>> Scanner<T> { } } - // We are at the beginning of a non-empty line. + // We are at the first content character of a content line. trailing_blank = is_blank(self.ch()); if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank { + string.push_str(&trailing_breaks); if trailing_breaks.is_empty() { string.push(' '); } - leading_break.clear(); } else { string.push_str(&leading_break); - leading_break.clear(); + string.push_str(&trailing_breaks); } - string.push_str(&trailing_breaks); + leading_break.clear(); trailing_breaks.clear(); leading_blank = is_blank(self.ch()); - while !is_breakz(self.look_ch()) { + // Start by evaluating characters in the buffer. + while !self.buffer.is_empty() && !is_breakz(self.ch()) { string.push(self.ch()); // We may technically skip non-blank characters.
However, the only distinction is // to determine what is leading whitespace and what is not. Here, we read the @@ -1638,6 +1650,25 @@ impl<T: Iterator<Item = char>> Scanner<T> { // This allows us to call a slightly less expensive function. self.skip_blank(); } + + // All characters that were in the buffer were consumed. We need to check if more + // follow. + if self.buffer.is_empty() { + // We will read all consecutive non-breakz characters into `self.buffer` before + // pushing them all in `string` instead of moving them one by one. + while !is_breakz(self.read_ch()) {} + // The last character from the buffer is a breakz. We must not insert it. + let last_char = self.buffer.pop_back().unwrap(); + // We need to manually update our position; we won't call a `skip` function. + self.mark.col += self.buffer.len(); + self.mark.index += self.buffer.len(); + string.reserve(self.buffer.len()); + string.extend(self.buffer.iter()); + // Put back our breakz character, we didn't consume this one. + self.buffer.clear(); + self.buffer.push_back(last_char); + } + // break on EOF if is_z(self.ch()) { break;