From b4f66c457a24c110ab60e0eb96ad5ae58b127cf1 Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Wed, 24 Jan 2024 01:02:20 +0100 Subject: [PATCH] Move char `is_xxx` fn to their own file. --- saphyr/src/char_traits.rs | 111 ++++++++++++++++++++++++++++++++++ saphyr/src/lib.rs | 1 + saphyr/src/scanner.rs | 124 ++++---------------------------------- 3 files changed, 123 insertions(+), 113 deletions(-) create mode 100644 saphyr/src/char_traits.rs diff --git a/saphyr/src/char_traits.rs b/saphyr/src/char_traits.rs new file mode 100644 index 0000000..b95318c --- /dev/null +++ b/saphyr/src/char_traits.rs @@ -0,0 +1,111 @@ +//! Holds functions to determine if a character belongs to a specific character set. + +/// Check whether the character is nil (`\0`). +#[inline] +pub(crate) fn is_z(c: char) -> bool { + c == '\0' +} + +/// Check whether the character is a line break (`\r` or `\n`). +#[inline] +pub(crate) fn is_break(c: char) -> bool { + c == '\n' || c == '\r' +} + +/// Check whether the character is nil or a line break (`\0`, `\r`, `\n`). +#[inline] +pub(crate) fn is_breakz(c: char) -> bool { + is_break(c) || is_z(c) +} + +/// Check whether the character is a whitespace (` ` or `\t`). +#[inline] +pub(crate) fn is_blank(c: char) -> bool { + c == ' ' || c == '\t' +} + +/// Check whether the character is nil, a linebreak or a whitespace. +/// +/// `\0`, ` `, `\t`, `\n`, `\r` +#[inline] +pub(crate) fn is_blankz(c: char) -> bool { + is_blank(c) || is_breakz(c) +} + +/// Check whether the character is an ascii digit. +#[inline] +pub(crate) fn is_digit(c: char) -> bool { + c.is_ascii_digit() +} + +/// Check whether the character is a digit, letter, `_` or `-`. +#[inline] +pub(crate) fn is_alpha(c: char) -> bool { + matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' | '-') +} + +/// Check whether the character is a hexadecimal character (case insensitive). 
+#[inline] +pub(crate) fn is_hex(c: char) -> bool { + c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c) +} + +/// Convert the hexadecimal digit to an integer. +#[inline] +pub(crate) fn as_hex(c: char) -> u32 { + match c { + '0'..='9' => (c as u32) - ('0' as u32), + 'a'..='f' => (c as u32) - ('a' as u32) + 10, + 'A'..='F' => (c as u32) - ('A' as u32) + 10, + _ => unreachable!(), + } +} + +/// Check whether the character is a YAML flow character (one of `,[]{}`). +#[inline] +pub(crate) fn is_flow(c: char) -> bool { + matches!(c, ',' | '[' | ']' | '{' | '}') +} + +/// Check whether the character is the BOM character. +#[inline] +pub(crate) fn is_bom(c: char) -> bool { + c == '\u{FEFF}' +} + +/// Check whether the character is a YAML non-breaking character. +#[inline] +pub(crate) fn is_yaml_non_break(c: char) -> bool { + // TODO(ethiraric, 28/12/2023): is_printable + !is_break(c) && !is_bom(c) +} + +/// Check whether the character is NOT a YAML whitespace (` ` / `\t`). +#[inline] +pub(crate) fn is_yaml_non_space(c: char) -> bool { + is_yaml_non_break(c) && !is_blank(c) +} + +/// Check whether the character is a valid YAML anchor name character. +#[inline] +pub(crate) fn is_anchor_char(c: char) -> bool { + is_yaml_non_space(c) && !is_flow(c) && !is_z(c) +} + +/// Check whether the character is a valid word character. +#[inline] +pub(crate) fn is_word_char(c: char) -> bool { + is_alpha(c) && c != '_' +} + +/// Check whether the character is a valid URI character. +#[inline] +pub(crate) fn is_uri_char(c: char) -> bool { + is_word_char(c) || "#;/?:@&=+$,_.!~*\'()[]%".contains(c) +} + +/// Check whether the character is a valid tag character. +#[inline] +pub(crate) fn is_tag_char(c: char) -> bool { + is_uri_char(c) && !is_flow(c) && c != '!' 
+} diff --git a/saphyr/src/lib.rs b/saphyr/src/lib.rs index d4d9def..5107745 100644 --- a/saphyr/src/lib.rs +++ b/saphyr/src/lib.rs @@ -45,6 +45,7 @@ extern crate linked_hash_map; +pub(crate) mod char_traits; pub mod emitter; pub mod parser; pub mod scanner; diff --git a/saphyr/src/scanner.rs b/saphyr/src/scanner.rs index ebbe35a..391dc28 100644 --- a/saphyr/src/scanner.rs +++ b/saphyr/src/scanner.rs @@ -1,9 +1,12 @@ #![allow(clippy::cast_possible_wrap)] #![allow(clippy::cast_sign_loss)] -use std::collections::VecDeque; -use std::error::Error; -use std::{char, fmt}; +use std::{char, collections::VecDeque, error::Error, fmt}; + +use crate::char_traits::{ + as_hex, is_alpha, is_anchor_char, is_blank, is_blankz, is_break, is_breakz, is_digit, is_flow, + is_hex, is_tag_char, is_uri_char, is_z, +}; #[derive(Clone, Copy, PartialEq, Debug, Eq)] pub enum TEncoding { @@ -24,8 +27,11 @@ pub enum TScalarStyle { /// A location in a yaml document. #[derive(Clone, Copy, PartialEq, Debug, Eq)] pub struct Marker { + /// The index (in chars) in the input string. index: usize, + /// The line (1-indexed). line: usize, + /// The column (1-indexed). col: usize, } @@ -56,7 +62,9 @@ impl Marker { /// An error that occured while scanning. #[derive(Clone, PartialEq, Debug, Eq)] pub struct ScanError { + /// The position at which the error happened in the source. mark: Marker, + /// Human-readable details about the error. info: String, } @@ -373,116 +381,6 @@ impl<T: Iterator<Item = char>> Iterator for Scanner<T> { } } -/// Check whether the character is nil (`\0`). -#[inline] -fn is_z(c: char) -> bool { - c == '\0' -} - -/// Check whether the character is a line break (`\r` or `\n`). -#[inline] -fn is_break(c: char) -> bool { - c == '\n' || c == '\r' -} - -/// Check whether the character is nil or a line break (`\0`, `\r`, `\n`). -#[inline] -fn is_breakz(c: char) -> bool { - is_break(c) || is_z(c) -} - -/// Check whether the character is a whitespace (` ` or `\t`). 
-#[inline] -fn is_blank(c: char) -> bool { - c == ' ' || c == '\t' -} - -/// Check whether the character is nil, a linebreak or a whitespace. -/// -/// `\0`, ` `, `\t`, `\n`, `\r` -#[inline] -fn is_blankz(c: char) -> bool { - is_blank(c) || is_breakz(c) -} - -/// Check whether the character is an ascii digit. -#[inline] -fn is_digit(c: char) -> bool { - c.is_ascii_digit() -} - -/// Check whether the character is a digit, letter, `_` or `-`. -#[inline] -fn is_alpha(c: char) -> bool { - matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' | '-') -} - -/// Check whether the character is a hexadecimal character (case insensitive). -#[inline] -fn is_hex(c: char) -> bool { - c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c) -} - -/// Convert the hexadecimal digit to an integer. -#[inline] -fn as_hex(c: char) -> u32 { - match c { - '0'..='9' => (c as u32) - ('0' as u32), - 'a'..='f' => (c as u32) - ('a' as u32) + 10, - 'A'..='F' => (c as u32) - ('A' as u32) + 10, - _ => unreachable!(), - } -} - -/// Check whether the character is a YAML flow character (one of `,[]{}`). -#[inline] -fn is_flow(c: char) -> bool { - matches!(c, ',' | '[' | ']' | '{' | '}') -} - -/// Check whether the character is the BOM character. -#[inline] -fn is_bom(c: char) -> bool { - c == '\u{FEFF}' -} - -/// Check whether the character is a YAML non-breaking character. -#[inline] -fn is_yaml_non_break(c: char) -> bool { - // TODO(ethiraric, 28/12/2023): is_printable - !is_break(c) && !is_bom(c) -} - -/// Check whether the character is NOT a YAML whitespace (` ` / `\t`). -#[inline] -fn is_yaml_non_space(c: char) -> bool { - is_yaml_non_break(c) && !is_blank(c) -} - -/// Check whether the character is a valid YAML anchor name character. -#[inline] -fn is_anchor_char(c: char) -> bool { - is_yaml_non_space(c) && !is_flow(c) && !is_z(c) -} - -/// Check whether the character is a valid word character. 
-#[inline] -fn is_word_char(c: char) -> bool { - is_alpha(c) && c != '_' -} - -/// Check whether the character is a valid URI character. -#[inline] -fn is_uri_char(c: char) -> bool { - is_word_char(c) || "#;/?:@&=+$,_.!~*\'()[]%".contains(c) -} - -/// Check whether the character is a valid tag character. -#[inline] -fn is_tag_char(c: char) -> bool { - is_uri_char(c) && !is_flow(c) && c != '!' -} - pub type ScanResult = Result<(), ScanError>; impl<T: Iterator<Item = char>> Scanner<T> {