Fix tag scanning.

This commit is contained in:
Ethiraric 2024-01-23 00:04:46 +01:00
parent 76b3773ffd
commit 7f7919748a
2 changed files with 89 additions and 36 deletions

View file

@ -465,6 +465,24 @@ fn is_anchor_char(c: char) -> bool {
is_yaml_non_space(c) && !is_flow(c) && !is_z(c) is_yaml_non_space(c) && !is_flow(c) && !is_z(c)
} }
/// Check whether the character is a valid word character.
///
/// A word character is anything accepted by `is_alpha`, with the exception of
/// the underscore, which is explicitly rejected here.
#[inline]
fn is_word_char(c: char) -> bool {
    // The underscore is never a word character, whatever `is_alpha` says.
    if c == '_' {
        return false;
    }
    is_alpha(c)
}
/// Check whether the character is a valid URI character.
///
/// A URI character is either a word character (see `is_word_char`) or one of
/// the punctuation characters listed below. Note that `%` is part of the set:
/// it introduces an escape sequence that callers decode separately.
#[inline]
fn is_uri_char(c: char) -> bool {
    /// Non-word characters that may appear in a URI.
    const URI_PUNCTUATION: &str = "#;/?:@&=+$,_.!~*\'()[]%";

    is_word_char(c) || URI_PUNCTUATION.contains(c)
}
/// Check whether the character is a valid tag character.
///
/// Tag characters are the URI characters, minus `!` and minus the flow
/// indicators (whatever `is_flow` accepts).
#[inline]
fn is_tag_char(c: char) -> bool {
    match c {
        // `!` delimits tag handles, so it can never be part of a tag itself.
        '!' => false,
        _ => is_uri_char(c) && !is_flow(c),
    }
}
pub type ScanResult = Result<(), ScanError>; pub type ScanResult = Result<(), ScanError>;
impl<T: Iterator<Item = char>> Scanner<T> { impl<T: Iterator<Item = char>> Scanner<T> {
@ -1116,8 +1134,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.skip(); self.skip();
} }
let is_secondary = handle == "!!"; let prefix = self.scan_tag_prefix(mark)?;
let prefix = self.scan_tag_uri(true, is_secondary, "", mark)?;
self.lookahead(1); self.lookahead(1);
@ -1149,19 +1166,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
self.lookahead(2); self.lookahead(2);
if self.buffer[1] == '<' { if self.buffer[1] == '<' {
// Eat '!<' suffix = self.scan_verbatim_tag(&start_mark)?;
self.skip();
self.skip();
suffix = self.scan_tag_uri(false, false, "", &start_mark)?;
if self.ch() != '>' {
return Err(ScanError::new(
start_mark,
"while scanning a tag, did not find the expected '>'",
));
}
self.skip();
} else { } else {
// The tag has either the '!suffix' or the '!handle!suffix' // The tag has either the '!suffix' or the '!handle!suffix'
handle = self.scan_tag_handle(false, &start_mark)?; handle = self.scan_tag_handle(false, &start_mark)?;
@ -1169,9 +1174,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') { if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
// A tag handle starting with "!!" is a secondary tag handle. // A tag handle starting with "!!" is a secondary tag handle.
let is_secondary_handle = handle == "!!"; let is_secondary_handle = handle == "!!";
suffix = self.scan_tag_uri(false, is_secondary_handle, "", &start_mark)?; suffix =
self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?;
} else { } else {
suffix = self.scan_tag_uri(false, false, &handle, &start_mark)?; suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?;
handle = "!".to_owned(); handle = "!".to_owned();
// A special case: the '!' tag. Set the handle to '' and the // A special case: the '!' tag. Set the handle to '' and the
// suffix to '!'. // suffix to '!'.
@ -1223,9 +1229,70 @@ impl<T: Iterator<Item = char>> Scanner<T> {
Ok(string) Ok(string)
} }
fn scan_tag_uri( /// Scan for a tag prefix (6.8.2.2).
///
/// There are 2 kinds of tag prefixes:
/// - Local: Starts with a `!`, contains only URI chars (`!foo`)
/// - Global: Starts with a tag char, followed by URI chars (`tag:example.com,2000:app/`)
fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
    let mut prefix = String::new();

    // A leading `!` marks a local prefix; anything else must be a valid tag
    // character for the prefix to be a global one.
    let is_local = self.look_ch() == '!';
    if !is_local && !is_tag_char(self.ch()) {
        return Err(ScanError::new(*start_mark, "invalid global tag character"));
    }

    // First character: a global prefix may open with a `%`-escape, which is
    // decoded; in every other case the character is taken verbatim.
    let head = if !is_local && self.ch() == '%' {
        self.scan_uri_escapes(start_mark)?
    } else {
        self.ch_skip()
    };
    prefix.push(head);

    // Remainder: any run of URI characters, decoding `%`-escapes on the way.
    loop {
        if !is_uri_char(self.look_ch()) {
            break;
        }
        let next = if self.ch() == '%' {
            self.scan_uri_escapes(start_mark)?
        } else {
            self.ch_skip()
        };
        prefix.push(next);
    }

    Ok(prefix)
}
/// Scan for a verbatim tag (`!<uri>`).
///
/// The prefixing `!<` must _not_ have been skipped: this function consumes it
/// itself, then reads URI characters (decoding `%`-escapes) up to the closing
/// `>`, which is consumed as well.
///
/// Returns the text found between `!<` and `>`. Note that nothing here
/// requires that text to be non-empty.
///
/// # Errors
/// Returns a `ScanError` located at `start_mark` if the character following
/// the URI is not `>`. Errors from decoding a `%`-escape are propagated.
fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
    // Consume the two-character `!<` opener.
    for _ in 0..2 {
        self.skip();
    }

    let mut uri = String::new();
    loop {
        if !is_uri_char(self.look_ch()) {
            break;
        }
        let next = if self.ch() == '%' {
            self.scan_uri_escapes(start_mark)?
        } else {
            self.ch_skip()
        };
        uri.push(next);
    }

    // The URI must be terminated by `>`.
    match self.ch() {
        '>' => {
            self.skip();
            Ok(uri)
        }
        _ => Err(ScanError::new(
            *start_mark,
            "while scanning a verbatim tag, did not find the expected '>'",
        )),
    }
}
fn scan_tag_shorthand_suffix(
&mut self, &mut self,
directive: bool, _directive: bool,
_is_secondary: bool, _is_secondary: bool,
head: &str, head: &str,
mark: &Marker, mark: &Marker,
@ -1239,23 +1306,10 @@ impl<T: Iterator<Item = char>> Scanner<T> {
string.extend(head.chars().skip(1)); string.extend(head.chars().skip(1));
} }
/* while is_tag_char(self.look_ch()) {
* The set of characters that may appear in URI is as follows:
*
* '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
* '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
* '%'.
*/
while match self.look_ch() {
';' | '/' | '?' | ':' | '@' | '&' => true,
'=' | '+' | '$' | ',' | '.' | '!' | '~' | '*' | '\'' | '(' | ')' | '[' | ']' => true,
'%' => true,
c if is_alpha(c) => true,
_ => false,
} {
// Check if it is a URI-escape sequence. // Check if it is a URI-escape sequence.
if self.ch() == '%' { if self.ch() == '%' {
string.push(self.scan_uri_escapes(directive, mark)?); string.push(self.scan_uri_escapes(mark)?);
} else { } else {
string.push(self.ch()); string.push(self.ch());
self.skip(); self.skip();
@ -1274,7 +1328,7 @@ impl<T: Iterator<Item = char>> Scanner<T> {
Ok(string) Ok(string)
} }
fn scan_uri_escapes(&mut self, _directive: bool, mark: &Marker) -> Result<char, ScanError> { fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
let mut width = 0usize; let mut width = 0usize;
let mut code = 0u32; let mut code = 0u32;
loop { loop {

View file

@ -315,6 +315,5 @@ fn expected_events(expected_tree: &str) -> Vec<String> {
#[rustfmt::skip] #[rustfmt::skip]
static EXPECTED_FAILURES: &[&str] = &[ static EXPECTED_FAILURES: &[&str] = &[
// Misc // Misc
"U99R", // Comma is not allowed in tags
"WZ62", // Empty content "WZ62", // Empty content
]; ];