From 7f7919748aaab07eb76e9619f86342bad4b2e5f2 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Tue, 23 Jan 2024 00:04:46 +0100 Subject: [PATCH] Fix tag scanning. --- saphyr/src/scanner.rs | 124 +++++++++++++++++++++++--------- saphyr/tests/yaml-test-suite.rs | 1 - 2 files changed, 89 insertions(+), 36 deletions(-) diff --git a/saphyr/src/scanner.rs b/saphyr/src/scanner.rs index 3f1b199..6379234 100644 --- a/saphyr/src/scanner.rs +++ b/saphyr/src/scanner.rs @@ -465,6 +465,24 @@ fn is_anchor_char(c: char) -> bool { is_yaml_non_space(c) && !is_flow(c) && !is_z(c) } +/// Check whether the character is a valid word character (any `is_alpha` char except `_`). +#[inline] +fn is_word_char(c: char) -> bool { + is_alpha(c) && c != '_' +} + +/// Check whether the character is a valid URI character (a word char or URI punctuation). +#[inline] +fn is_uri_char(c: char) -> bool { + is_word_char(c) || "#;/?:@&=+$,_.!~*\'()[]%".contains(c) +} + +/// Check whether the character is a valid tag character (a URI char, neither `is_flow` nor `!`). +#[inline] +fn is_tag_char(c: char) -> bool { + is_uri_char(c) && !is_flow(c) && c != '!' +} + pub type ScanResult = Result<(), ScanError>; impl> Scanner { @@ -1116,8 +1134,7 @@ impl> Scanner { self.skip(); } - let is_secondary = handle == "!!"; - let prefix = self.scan_tag_uri(true, is_secondary, "", mark)?; + let prefix = self.scan_tag_prefix(mark)?; self.lookahead(1); @@ -1149,19 +1166,7 @@ impl> Scanner { self.lookahead(2); if self.buffer[1] == '<' { - // Eat '!<' - self.skip(); - self.skip(); - suffix = self.scan_tag_uri(false, false, "", &start_mark)?; - - if self.ch() != '>' { - return Err(ScanError::new( - start_mark, - "while scanning a tag, did not find the expected '>'", - )); - } - - self.skip(); + suffix = self.scan_verbatim_tag(&start_mark)?; } else { // The tag has either the '!suffix' or the '!handle!suffix' handle = self.scan_tag_handle(false, &start_mark)?; @@ -1169,9 +1174,10 @@ impl> Scanner { if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') { // A tag handle starting with "!!" is a secondary tag handle. 
let is_secondary_handle = handle == "!!"; - suffix = self.scan_tag_uri(false, is_secondary_handle, "", &start_mark)?; + suffix = + self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?; } else { - suffix = self.scan_tag_uri(false, false, &handle, &start_mark)?; + suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?; handle = "!".to_owned(); // A special case: the '!' tag. Set the handle to '' and the // suffix to '!'. @@ -1223,9 +1229,70 @@ impl> Scanner { Ok(string) } - fn scan_tag_uri( + /// Scan for a tag prefix (6.8.2.2). + /// + /// There are 2 kinds of tag prefixes: + /// - Local: Starts with a `!`, followed only by URI chars (`!foo`) + /// - Global: Starts with a tag char, followed by URI chars (`tag:example.com,2000:app/`) + fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> { + let mut string = String::new(); + + if self.look_ch() == '!' { + // If we have a local tag, insert and skip `!`. + string.push(self.ch_skip()); + } else if !is_tag_char(self.ch()) { + // Otherwise, check if the first global tag character is valid. + return Err(ScanError::new(*start_mark, "invalid global tag character")); + } else if self.ch() == '%' { + // If it is valid and starts an escape sequence (`%`), scan the escape. + string.push(self.scan_uri_escapes(start_mark)?); + } else { + // Otherwise, push the first character. + string.push(self.ch_skip()); + } + + while is_uri_char(self.look_ch()) { + if self.ch() == '%' { + string.push(self.scan_uri_escapes(start_mark)?); + } else { + string.push(self.ch_skip()); + } + } + + Ok(string) + } + + /// Scan for a verbatim tag. + /// + /// The prefixing `!<` must _not_ have been skipped. 
+ fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> { + // Eat the leading `!<`. + self.skip(); + self.skip(); + + let mut string = String::new(); + while is_uri_char(self.look_ch()) { + if self.ch() == '%' { + string.push(self.scan_uri_escapes(start_mark)?); + } else { + string.push(self.ch_skip()); + } + } + + if self.ch() != '>' { + return Err(ScanError::new( + *start_mark, + "while scanning a verbatim tag, did not find the expected '>'", + )); + } + self.skip(); + + Ok(string) + } + + fn scan_tag_shorthand_suffix( &mut self, - directive: bool, + _directive: bool, _is_secondary: bool, head: &str, mark: &Marker, @@ -1239,23 +1306,10 @@ impl> Scanner { string.extend(head.chars().skip(1)); } - /* - * The set of characters that may appear in URI is as follows: - * - * '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&', - * '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']', - * '%'. - */ - while match self.look_ch() { - ';' | '/' | '?' | ':' | '@' | '&' => true, - '=' | '+' | '$' | ',' | '.' | '!' | '~' | '*' | '\'' | '(' | ')' | '[' | ']' => true, - '%' => true, - c if is_alpha(c) => true, - _ => false, - } { + while is_tag_char(self.look_ch()) { // Check if it is a URI-escape sequence. 
if self.ch() == '%' { - string.push(self.scan_uri_escapes(directive, mark)?); + string.push(self.scan_uri_escapes(mark)?); } else { string.push(self.ch()); self.skip(); @@ -1274,7 +1328,7 @@ impl> Scanner { Ok(string) } - fn scan_uri_escapes(&mut self, _directive: bool, mark: &Marker) -> Result<char, ScanError> { + fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> { let mut width = 0usize; let mut code = 0u32; loop { diff --git a/saphyr/tests/yaml-test-suite.rs b/saphyr/tests/yaml-test-suite.rs index 0d35790..d6b3f3f 100644 --- a/saphyr/tests/yaml-test-suite.rs +++ b/saphyr/tests/yaml-test-suite.rs @@ -315,6 +315,5 @@ fn expected_events(expected_tree: &str) -> Vec { #[rustfmt::skip] static EXPECTED_FAILURES: &[&str] = &[ // Misc - "U99R", // Comma is not allowed in tags "WZ62", // Empty content ];