From d03dec21971ab060e9ead4b5c0ffd434d35870e7 Mon Sep 17 00:00:00 2001
From: Yuheng Chen
Date: Sun, 24 May 2015 14:27:42 +0800
Subject: [PATCH] Initial commit

---
 parser/.gitignore     |   2 +
 parser/Cargo.toml     |   4 +
 parser/src/lib.rs     |   6 +
 parser/src/parser.rs  | 475 +++++++++++++++++++++++++++
 parser/src/scanner.rs | 726 ++++++++++++++++++++++++++++++++++++++++++
 parser/src/yaml.rs    |  39 +++
 6 files changed, 1252 insertions(+)
 create mode 100644 parser/.gitignore
 create mode 100644 parser/Cargo.toml
 create mode 100644 parser/src/lib.rs
 create mode 100644 parser/src/parser.rs
 create mode 100644 parser/src/scanner.rs
 create mode 100644 parser/src/yaml.rs

diff --git a/parser/.gitignore b/parser/.gitignore
new file mode 100644
index 0000000..a9d37c5
--- /dev/null
+++ b/parser/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
new file mode 100644
index 0000000..d7dd718
--- /dev/null
+++ b/parser/Cargo.toml
@@ -0,0 +1,4 @@
+[package]
+name = "yaml-rust"
+version = "0.1.0"
+authors = ["Yuheng Chen"]
diff --git a/parser/src/lib.rs b/parser/src/lib.rs
new file mode 100644
index 0000000..a824d7d
--- /dev/null
+++ b/parser/src/lib.rs
@@ -0,0 +1,6 @@
+pub mod yaml;
+pub mod scanner;
+pub mod parser;
+#[test]
+fn it_works() {
+}
diff --git a/parser/src/parser.rs b/parser/src/parser.rs
new file mode 100644
index 0000000..104f6a6
--- /dev/null
+++ b/parser/src/parser.rs
@@ -0,0 +1,475 @@
+use scanner::*;
+
+#[derive(Clone, Copy, PartialEq, Debug, Eq)]
+pub enum State {
+    StreamStart,
+    ImplicitDocumentStart,
+    DocumentStart,
+    DocumentContent,
+    DocumentEnd,
+    BlockNode,
+    BlockNodeOrIndentlessSequence,
+    FlowNode,
+    BlockSequenceFirstEntry,
+    BlockSequenceEntry,
+    IndentlessSequenceEntry,
+    BlockMappingFirstKey,
+    BlockMappingKey,
+    BlockMappingValue,
+    FlowSequenceFirstEntry,
+    FlowSequenceEntry,
+    FlowSequenceEntryMappingKey,
+    FlowSequenceEntryMappingValue,
+    FlowSequenceEntryMappingEnd,
+    FlowMappingFirstKey,
+    FlowMappingKey,
+    FlowMappingValue,
+    FlowMappingEmptyValue,
+    End
+}
+
+#[derive(Clone, PartialEq, Debug, Eq)]
+pub enum Event {
+    NoEvent,
+    StreamStart,
+    StreamEnd,
+    DocumentStart,
+    DocumentEnd,
+    Alias,
+    Scalar(String),
+    SequenceStart,
+    SequenceEnd,
+    MappingStart,
+    MappingEnd
+}
+
+#[derive(Debug)]
+pub struct Parser<T> {
+    scanner: Scanner<T>,
+    states: Vec<State>,
+    state: State,
+    marks: Vec<Marker>,
+    token: Option<Token>,
+}
+
+pub type ParseResult = Result<Event, ScanError>;
+
+impl<T: Iterator<Item=char>> Parser<T> {
+    pub fn new(src: T) -> Parser<T> {
+        Parser {
+            scanner: Scanner::new(src),
+            states: Vec::new(),
+            state: State::StreamStart,
+            marks: Vec::new(),
+            token: None
+        }
+    }
+
+    fn peek(&mut self) -> Result<Token, ScanError> {
+        if self.token.is_none() {
+            self.token = self.scanner.next();
+        }
+        if self.token.is_none() {
+            return Err(ScanError::new(self.scanner.mark(),
+                "unexpected eof"));
+        }
+        // XXX better?
+        Ok(self.token.clone().unwrap())
+    }
+
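+    /// Drops the cached token and immediately fetches the following one,
+    /// keeping one token of lookahead available.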
+    fn skip(&mut self) {
+        self.token = None;
+        let _ = self.peek();
+    }
+    fn pop_state(&mut self) {
+        self.state = self.states.pop().unwrap()
+    }
+    fn push_state(&mut self, state: State) {
+        self.states.push(state);
+    }
+
+    pub fn parse(&mut self) -> ParseResult {
+        if self.scanner.stream_ended()
+            || self.state == State::End {
+            return Ok(Event::NoEvent);
+        }
+        let ev = self.state_machine();
+        println!("EV {:?}", ev);
+        ev
+    }
+
+    pub fn load(&mut self) -> Result<(), ScanError> {
+        if !self.scanner.stream_started() {
+            let ev = try!(self.parse());
+            assert_eq!(ev, Event::StreamStart);
+        }
+
+        if self.scanner.stream_ended() {
+            return Ok(());
+        }
+        let ev = try!(self.parse());
+        if ev == Event::StreamEnd {
+            return Ok(());
+        }
+        try!(self.load_document(&ev));
+        Ok(())
+    }
+
+    fn load_document(&mut self, first_ev: &Event) -> Result<(), ScanError> {
+        assert_eq!(first_ev, &Event::DocumentStart);
+
+        let ev = try!(self.parse());
+        try!(self.load_node(&ev));
+
+        Ok(())
+    }
+
+    fn load_node(&mut self, first_ev: &Event) -> Result<(), ScanError> {
+        match *first_ev {
+            Event::Scalar(_) => {
+                // TODO scalar
+                println!("Scalar: {:?}", first_ev);
+                Ok(())
+            },
+            Event::SequenceStart => {
+                self.load_sequence(first_ev)
+            },
+            Event::MappingStart => {
+                self.load_mapping(first_ev)
+            },
+            _ => { unreachable!(); }
+        }
+    }
+
+    fn load_mapping(&mut self, _first_ev: &Event) -> Result<(), ScanError> {
+        let mut ev = try!(self.parse());
+        while ev != Event::MappingEnd {
+            // key
+            try!(self.load_node(&ev));
+
+            // value
+            ev = try!(self.parse());
+            try!(self.load_node(&ev));
+
+            // next event
+            ev = try!(self.parse());
+        }
+        Ok(())
+    }
+
+    fn load_sequence(&mut self, _first_ev: &Event) -> Result<(), ScanError> {
+        let mut ev = try!(self.parse());
+        while ev != Event::SequenceEnd {
+            try!(self.load_node(&ev));
+
+            // next event
+            ev = try!(self.parse());
+        }
+        Ok(())
+    }
+
+    fn state_machine(&mut self) -> ParseResult {
+        let next_tok = self.peek();
+        println!("cur_state {:?}, next tok: {:?}", self.state, next_tok);
+        match self.state {
+            State::StreamStart => self.stream_start(),
+            State::ImplicitDocumentStart => self.document_start(true),
+            State::DocumentStart => self.document_start(false),
+            State::DocumentContent => self.document_content(),
+
+            State::BlockMappingFirstKey => self.block_mapping_key(true),
+            State::BlockMappingKey => self.block_mapping_key(false),
+            State::BlockMappingValue => self.block_mapping_value(),
+
+            State::BlockSequenceFirstEntry => self.block_sequence_entry(true),
+            State::BlockSequenceEntry => self.block_sequence_entry(false),
+
+            State::FlowSequenceFirstEntry => self.flow_sequence_entry(true),
+            State::FlowSequenceEntry => self.flow_sequence_entry(false),
+
+            _ => unimplemented!()
+        }
+    }
+
+    fn stream_start(&mut self) -> ParseResult {
+        let tok = try!(self.peek());
+
+        match tok.1 {
+            TokenType::StreamStartToken(_) => {
+                self.state = State::ImplicitDocumentStart;
+                self.skip();
+                Ok(Event::StreamStart)
+            },
+            _ => Err(ScanError::new(tok.0,
+                "did not find expected <stream-start>")),
+        }
+    }
+
+    fn document_start(&mut self, implicit: bool) -> ParseResult {
+        let mut tok = try!(self.peek());
+        if !implicit {
+            loop {
+                match tok.1 {
+                    TokenType::DocumentEndToken => {
+                        self.skip();
+                        tok = try!(self.peek());
+                    },
+                    _ => break
+                }
+            }
+        }
+
+        match tok.1 {
+            TokenType::StreamEndToken => {
+                self.state = State::End;
+                self.skip();
+                return Ok(Event::StreamEnd);
+            },
+            TokenType::VersionDirectiveToken
+                | TokenType::TagDirectiveToken
+                | TokenType::DocumentStartToken => {
+                // explicit document
+                self._explicit_document_start()
+            },
+            _ if implicit => {
+                self.push_state(State::DocumentEnd);
+                self.state = State::BlockNode;
+                self.skip();
+                Ok(Event::DocumentStart)
+            },
+            _ => {
+                // explicit document
+                self._explicit_document_start()
+            }
+        }
+    }
+
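+    // An explicit document begins with a `---` marker; `%YAML`/`%TAG`
+    // directive tokens route here too but are rejected for now.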
+    fn _explicit_document_start(&mut self) -> ParseResult {
+        let tok = try!(self.peek());
+        if tok.1 != TokenType::DocumentStartToken {
+            return Err(ScanError::new(tok.0, "did not find expected <document start>"));
+        }
+        self.push_state(State::DocumentEnd);
+        self.state = State::DocumentContent;
+        self.skip();
+        Ok(Event::DocumentStart)
+    }
+
+    fn document_content(&mut self) -> ParseResult {
+        let tok = try!(self.peek());
+        match tok.1 {
+            TokenType::VersionDirectiveToken
+                | TokenType::TagDirectiveToken
+                | TokenType::DocumentStartToken
+                | TokenType::DocumentEndToken
+                | TokenType::StreamEndToken => {
+                self.pop_state();
+                // empty scalar
+                Ok(Event::Scalar(String::new()))
+            },
+            _ => {
+                self.parse_node(true, false)
+            }
+        }
+    }
+
+    fn parse_node(&mut self, block: bool, indentless_sequence: bool) -> ParseResult {
+        let tok = try!(self.peek());
+        match tok.1 {
+            TokenType::AliasToken => unimplemented!(),
+            TokenType::AnchorToken => unimplemented!(),
+            TokenType::BlockEntryToken if indentless_sequence => {
+                self.state = State::IndentlessSequenceEntry;
+                Ok(Event::SequenceStart)
+            },
+            TokenType::ScalarToken(_, v) => {
+                self.pop_state();
+                self.skip();
+                Ok(Event::Scalar(v))
+            },
+            TokenType::FlowSequenceStartToken => {
+                self.state = State::FlowSequenceFirstEntry;
+                Ok(Event::SequenceStart)
+            },
+            TokenType::FlowMappingStartToken => {
+                self.state = State::FlowMappingFirstKey;
+                Ok(Event::MappingStart)
+            },
+            TokenType::BlockSequenceStartToken if block => {
+                self.state = State::BlockSequenceFirstEntry;
+                Ok(Event::SequenceStart)
+            },
+            TokenType::BlockMappingStartToken if block => {
+                self.state = State::BlockMappingFirstKey;
+                Ok(Event::MappingStart)
+            },
+            _ => { unimplemented!(); }
+        }
+    }
+
+    fn block_mapping_key(&mut self, first: bool) -> ParseResult {
+        // skip BlockMappingStartToken
+        if first {
+            let tok = try!(self.peek());
+            //self.marks.push(tok.0);
+            self.skip();
+        }
+        let tok = try!(self.peek());
+        match tok.1 {
+            TokenType::KeyToken => {
+                self.skip();
+                let tok = try!(self.peek());
+                match tok.1 {
+                    TokenType::KeyToken | TokenType::ValueToken | TokenType::BlockEndToken
+                        => {
+                        self.state = State::BlockMappingValue;
+                        // empty scalar
+                        Ok(Event::Scalar(String::new()))
+                    }
+                    _ => {
+                        self.push_state(State::BlockMappingValue);
+                        self.parse_node(true, true)
+                    }
+                }
+            },
+            TokenType::BlockEndToken => {
+                self.pop_state();
+                self.skip();
+                Ok(Event::MappingEnd)
+            },
+            _ => {
+                Err(ScanError::new(tok.0, "while parsing a block mapping, did not find expected key"))
+            }
+        }
+    }
+
+    fn block_mapping_value(&mut self) -> ParseResult {
+        let tok = try!(self.peek());
+        match tok.1 {
+            TokenType::ValueToken => {
+                self.skip();
+                let tok = try!(self.peek());
+                match tok.1 {
+                    TokenType::KeyToken | TokenType::ValueToken | TokenType::BlockEndToken
+                        => {
+                        self.state = State::BlockMappingKey;
+                        // empty scalar
+                        Ok(Event::Scalar(String::new()))
+                    }
+                    _ => {
+                        self.push_state(State::BlockMappingKey);
+                        self.parse_node(true, true)
+                    }
+                }
+            },
+            _ => {
+                self.state = State::BlockMappingKey;
+                // empty scalar
+                Ok(Event::Scalar(String::new()))
+            }
+        }
+    }
+
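+    // Flow sequences are the bracketed form `[a, b, c]`; entries are
+    // separated by ',' and the collection is closed by ']'.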
+    fn flow_sequence_entry(&mut self, first: bool) -> ParseResult {
+        // skip FlowSequenceStartToken
+        if first {
+            let tok = try!(self.peek());
+            //self.marks.push(tok.0);
+            self.skip();
+        }
+        let mut tok = try!(self.peek());
+        match tok.1 {
+            TokenType::FlowSequenceEndToken => {
+                self.pop_state();
+                self.skip();
+                return Ok(Event::SequenceEnd);
+            },
+            TokenType::FlowEntryToken if !first => {
+                self.skip();
+                tok = try!(self.peek());
+            },
+            _ if !first => {
+                return Err(ScanError::new(tok.0,
+                    "while parsing a flow sequence, expected ',' or ']'"));
+            }
+            _ => { /* next */ }
+        }
+        match tok.1 {
+            TokenType::FlowSequenceEndToken => {
+                self.pop_state();
+                self.skip();
+                Ok(Event::SequenceEnd)
+            },
+            TokenType::KeyToken => {
+                self.state = State::FlowSequenceEntryMappingKey;
+                self.skip();
+                Ok(Event::MappingStart)
+            }
+            _ => {
+                self.push_state(State::FlowSequenceEntry);
+                self.parse_node(false, false)
+            }
+        }
+    }
+
+    fn block_sequence_entry(&mut self, first: bool) -> ParseResult {
+        // BLOCK-SEQUENCE-START
+        if first {
+            let tok = try!(self.peek());
+            //self.marks.push(tok.0);
+            self.skip();
+        }
+        let mut tok = try!(self.peek());
+        match tok.1 {
+            TokenType::BlockEndToken => {
+                self.pop_state();
+                self.skip();
+                Ok(Event::SequenceEnd)
+            },
+            TokenType::BlockEntryToken => {
+                self.skip();
+                tok = try!(self.peek());
+                match tok.1 {
+                    TokenType::BlockEntryToken | TokenType::BlockEndToken => {
+                        self.state = State::BlockSequenceEntry;
+                        Ok(Event::Scalar(String::new()))
+                    },
+                    _ => {
+                        self.push_state(State::BlockSequenceEntry);
+                        self.parse_node(true, false)
+                    }
+                }
+            },
+            _ => {
+                Err(ScanError::new(tok.0,
+                    "while parsing a block collection, did not find expected '-' indicator"))
+            }
+        }
+    }
+
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    #[test]
+    fn test_parser() {
+        let s: String = "---
+# comment
+a0 bb: val
+a1:
+    b1: 4
+    b2: d
+a2: 4
+a3: [1, 2, 3]
+a4:
+    - - a1
+      - a2
+    - 2
+".to_string();
+        let mut parser = Parser::new(s.chars());
+        parser.load().unwrap();
+
+    }
+}
diff --git a/parser/src/scanner.rs b/parser/src/scanner.rs
new file mode 100644
index 0000000..2b10542
--- /dev/null
+++ b/parser/src/scanner.rs
@@ -0,0 +1,726 @@
+use std::collections::VecDeque;
+use yaml::*;
+
+#[derive(Clone, Copy, PartialEq, Debug, Eq)]
+pub enum TEncoding {
+    Utf8
+}
+
+#[derive(Clone, Copy, PartialEq, Debug, Eq)]
+pub enum TScalarStyle {
+    Any,
+    Plain,
+    SingleQuoted,
+    DoubleQuoted,
+
+    Literal,
+    Folded
+}
+
+#[derive(Clone, Copy, PartialEq, Debug, Eq)]
+pub struct Marker {
+    index: usize,
+    line: usize,
+    col: usize,
+}
+
+impl Marker {
+    fn new(index: usize, line: usize, col: usize) -> Marker {
+        Marker {
+            index: index,
+            line: line,
+            col: col
+        }
+    }
+}
+
+#[derive(Clone, PartialEq, Debug, Eq)]
+pub struct ScanError {
+    mark: Marker,
+    info: String,
+}
+
+impl ScanError {
+    pub fn new(loc: Marker, info: &str) -> ScanError {
+        ScanError {
+            mark: loc,
+            info: info.to_string()
+        }
+    }
+}
+
+#[derive(Clone, PartialEq, Debug, Eq)]
+pub enum TokenType {
+    NoToken,
+    StreamStartToken(TEncoding),
+    StreamEndToken,
+    VersionDirectiveToken,
+    TagDirectiveToken,
+    DocumentStartToken,
+    DocumentEndToken,
+    BlockSequenceStartToken,
+    BlockMappingStartToken,
+    BlockEndToken,
+    FlowSequenceStartToken,
+    FlowSequenceEndToken,
+    FlowMappingStartToken,
+    FlowMappingEndToken,
+    BlockEntryToken,
+    FlowEntryToken,
+    KeyToken,
+    ValueToken,
+    AliasToken,
+    AnchorToken,
+    TagToken,
+    ScalarToken(TScalarStyle, String)
+}
+
+#[derive(Clone, PartialEq, Debug, Eq)]
+pub struct Token(pub Marker, pub TokenType);
+
+#[derive(Clone, PartialEq, Debug, Eq)]
+struct SimpleKey {
+    possible: bool,
+    required: bool,
+    token_number: usize,
+    mark: Marker,
+}
+
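+// A potential simple key, i.e. a key written without a leading '?'. It is
+// remembered until a following ':' confirms it as a KEY token, or until it
+// goes stale (next line, or more than 1024 characters away).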
+impl SimpleKey {
+    fn new(mark: Marker) -> SimpleKey {
+        SimpleKey {
+            possible: false,
+            required: false,
+            token_number: 0,
+            mark: mark,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct Scanner<T> {
+    rdr: T,
+    mark: Marker,
+    tokens: VecDeque<Token>,
+    buffer: VecDeque<char>,
+
+    stream_start_produced: bool,
+    stream_end_produced: bool,
+    simple_key_allowed: bool,
+    simple_keys: Vec<SimpleKey>,
+    indent: isize,
+    indents: Vec<isize>,
+    flow_level: usize,
+    tokens_parsed: usize,
+    token_available: bool,
+}
+
+impl<T: Iterator<Item=char>> Iterator for Scanner<T> {
+    type Item = Token;
+    fn next(&mut self) -> Option<Token> {
+        match self.next_token() {
+            Ok(tok) => tok,
+            Err(e) => {
+                println!("Error: {:?}", e);
+                None
+            }
+        }
+    }
+}
+
+fn is_z(c: char) -> bool {
+    c == '\0'
+}
+fn is_break(c: char) -> bool {
+    c == '\n' || c == '\r'
+}
+fn is_breakz(c: char) -> bool {
+    is_break(c) || is_z(c)
+}
+fn is_blank(c: char) -> bool {
+    c == ' ' || c == '\t'
+}
+fn is_blankz(c: char) -> bool {
+    is_blank(c) || is_breakz(c)
+}
+
+pub type ScanResult = Result<(), ScanError>;
+
+impl<T: Iterator<Item=char>> Scanner<T> {
+    /// Creates the YAML tokenizer.
+    pub fn new(rdr: T) -> Scanner<T> {
+        Scanner {
+            rdr: rdr,
+            buffer: VecDeque::new(),
+            mark: Marker::new(0, 1, 0),
+            tokens: VecDeque::new(),
+
+            stream_start_produced: false,
+            stream_end_produced: false,
+            simple_key_allowed: true,
+            simple_keys: Vec::new(),
+            indent: -1,
+            indents: Vec::new(),
+            flow_level: 0,
+            tokens_parsed: 0,
+            token_available: false,
+        }
+    }
+
+    fn lookahead(&mut self, count: usize) {
+        if self.buffer.len() >= count {
+            return;
+        }
+        for _ in 0..(count - self.buffer.len()) {
+            self.buffer.push_back(self.rdr.next().unwrap_or('\0'));
+        }
+    }
+    fn skip(&mut self) {
+        let c = self.buffer.pop_front().unwrap();
+
+        self.mark.index += 1;
+        if c == '\n' {
+            self.mark.line += 1;
+            self.mark.col = 0;
+        } else {
+            self.mark.col += 1;
+        }
+    }
+    fn ch(&self) -> char {
+        self.buffer[0]
+    }
+    fn ch_is(&self, c: char) -> bool {
+        self.buffer[0] == c
+    }
+    fn eof(&self) -> bool {
+        self.ch_is('\0')
+    }
+    pub fn stream_started(&self) -> bool {
+        self.stream_start_produced
+    }
+    pub fn stream_ended(&self) -> bool {
+        self.stream_end_produced
+    }
+    pub fn mark(&self) -> Marker {
+        self.mark
+    }
+    fn read_break(&mut self, s: &mut String) {
+        if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
+            s.push('\n');
+            self.skip();
+            self.skip();
+        } else if self.buffer[0] == '\r' || self.buffer[0] == '\n' {
+            s.push('\n');
+            self.skip();
+        } else {
+            unreachable!();
+        }
+    }
+    fn insert_token(&mut self, pos: usize, tok: Token) {
+        let old_len = self.tokens.len();
+        assert!(pos <= old_len);
+        self.tokens.push_back(tok);
+        for i in 0..old_len - pos {
+            self.tokens.swap(old_len - i, old_len - i - 1);
+        }
+    }
+    fn allow_simple_key(&mut self) {
+        self.simple_key_allowed = true;
+    }
+    fn disallow_simple_key(&mut self) {
+        self.simple_key_allowed = false;
+    }
+
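+    /// Scans the input for the next token(s) and appends them to the queue.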
+    pub fn fetch_next_token(&mut self) -> ScanResult {
+        self.lookahead(1);
+        // println!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
+
+        if !self.stream_start_produced {
+            self.fetch_stream_start();
+            return Ok(());
+        }
+        self.skip_to_next_token();
+
+        try!(self.stale_simple_keys());
+
+        let mark = self.mark;
+        self.unroll_indent(mark.col as isize);
+
+        self.lookahead(4);
+
+        if is_z(self.ch()) {
+            try!(self.fetch_stream_end());
+            return Ok(());
+        }
+
+        if self.mark.col == 0 && self.ch_is('%') {
+            unimplemented!();
+        }
+
+        if self.mark.col == 0
+            && self.buffer[0] == '-'
+            && self.buffer[1] == '-'
+            && self.buffer[2] == '-'
+            && is_blankz(self.buffer[3]) {
+            try!(self.fetch_document_indicator(TokenType::DocumentStartToken));
+            return Ok(());
+        }
+
+        if self.mark.col == 0
+            && self.buffer[0] == '.'
+            && self.buffer[1] == '.'
+            && self.buffer[2] == '.'
+            && is_blankz(self.buffer[3]) {
+            try!(self.fetch_document_indicator(TokenType::DocumentEndToken));
+            return Ok(());
+        }
+
+        let c = self.buffer[0];
+        let nc = self.buffer[1];
+        match c {
+            '[' => try!(self.fetch_flow_collection_start(TokenType::FlowSequenceStartToken)),
+            '{' => try!(self.fetch_flow_collection_start(TokenType::FlowMappingStartToken)),
+            ']' => try!(self.fetch_flow_collection_end(TokenType::FlowSequenceEndToken)),
+            '}' => try!(self.fetch_flow_collection_end(TokenType::FlowMappingEndToken)),
+            ',' => try!(self.fetch_flow_entry()),
+            '-' if is_blankz(nc) => try!(self.fetch_block_entry()),
+            '?' if self.flow_level > 0 || is_blankz(nc) => unimplemented!(),
+            ':' if self.flow_level > 0 || is_blankz(nc) => try!(self.fetch_value()),
+            '*' => unimplemented!(),
+            '&' => unimplemented!(),
+            '!' => unimplemented!(),
+            '|' if self.flow_level == 0 => unimplemented!(),
+            '>' if self.flow_level == 0 => unimplemented!(),
+            '\'' => unimplemented!(),
+            '"' => unimplemented!(),
+            // plain scalar
+            '-' if !is_blankz(nc) => try!(self.fetch_plain_scalar()),
+            ':' | '?' if !is_blankz(nc) && self.flow_level == 0 => try!(self.fetch_plain_scalar()),
+            '%' | '@' | '`' => return Err(ScanError::new(self.mark,
+                &format!("unexpected character: `{}'", c))),
+            _ => try!(self.fetch_plain_scalar()),
+        }
+
+        Ok(())
+    }
+
+    pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
+        if self.stream_end_produced {
+            return Ok(None);
+        }
+
+        if !self.token_available {
+            try!(self.fetch_more_tokens());
+        }
+        let t = self.tokens.pop_front().unwrap();
+        self.token_available = false;
+        self.tokens_parsed += 1;
+
+        match t.1 {
+            TokenType::StreamEndToken => self.stream_end_produced = true,
+            _ => {}
+        }
+        Ok(Some(t))
+    }
+
+    pub fn fetch_more_tokens(&mut self) -> ScanResult {
+        loop {
+            let mut need_more = false;
+            if self.tokens.is_empty() {
+                need_more = true;
+            } else {
+                try!(self.stale_simple_keys());
+                for sk in &self.simple_keys {
+                    if sk.possible && sk.token_number == self.tokens_parsed {
+                        need_more = true;
+                        break;
+                    }
+                }
+            }
+
+            if !need_more { break; }
+            try!(self.fetch_next_token());
+        }
+        self.token_available = true;
+
+        Ok(())
+    }
+
+    fn stale_simple_keys(&mut self) -> ScanResult {
+        for sk in &mut self.simple_keys {
+            if sk.possible && (sk.mark.line < self.mark.line
+                || sk.mark.index + 1024 < self.mark.index) {
+                if sk.required {
+                    return Err(ScanError::new(self.mark, "simple key expected ':'"));
+                }
+                sk.possible = false;
+            }
+        }
+        Ok(())
+    }
+
+    fn skip_to_next_token(&mut self) {
+        loop {
+            self.lookahead(1);
+            // TODO(chenyh) BOM
+            match self.ch() {
+                ' ' => self.skip(),
+                '\t' if self.flow_level > 0 || !self.simple_key_allowed => self.skip(),
+                '\n' | '\r' => {
+                    self.skip();
+                    if self.flow_level == 0 {
+                        self.allow_simple_key();
+                    }
+                },
+                '#' => while !is_breakz(self.ch()) { self.skip(); self.lookahead(1); },
+                _ => break
+            }
+        }
+    }
+
+    fn fetch_stream_start(&mut self) {
+        let mark = self.mark;
+        self.indent = -1;
+        self.stream_start_produced = true;
+        self.allow_simple_key();
+        self.tokens.push_back(Token(mark, TokenType::StreamStartToken(TEncoding::Utf8)));
+        self.simple_keys.push(SimpleKey::new(Marker::new(0,0,0)));
+    }
+
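+    // Emits STREAM-END, closing any indentation levels still open.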
+    fn fetch_stream_end(&mut self) -> ScanResult {
+        // force new line
+        if self.mark.col != 0 {
+            self.mark.col = 0;
+            self.mark.line += 1;
+        }
+
+        self.unroll_indent(-1);
+        try!(self.remove_simple_key());
+        self.disallow_simple_key();
+
+        self.tokens.push_back(Token(self.mark, TokenType::StreamEndToken));
+        Ok(())
+    }
+
+    fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
+        // The indicators '[' and '{' may start a simple key.
+        try!(self.save_simple_key());
+
+        self.increase_flow_level();
+
+        self.allow_simple_key();
+
+        let start_mark = self.mark;
+        self.skip();
+
+        self.tokens.push_back(Token(start_mark, tok));
+        Ok(())
+    }
+
+    fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
+        try!(self.remove_simple_key());
+        self.decrease_flow_level();
+
+        self.disallow_simple_key();
+
+        let start_mark = self.mark;
+        self.skip();
+
+        self.tokens.push_back(Token(start_mark, tok));
+        Ok(())
+    }
+
+    fn fetch_flow_entry(&mut self) -> ScanResult {
+        try!(self.remove_simple_key());
+        self.allow_simple_key();
+
+        let start_mark = self.mark;
+        self.skip();
+
+        self.tokens.push_back(Token(start_mark, TokenType::FlowEntryToken));
+        Ok(())
+    }
+
+    fn increase_flow_level(&mut self) {
+        self.simple_keys.push(SimpleKey::new(Marker::new(0,0,0)));
+        self.flow_level += 1;
+    }
+    fn decrease_flow_level(&mut self) {
+        if self.flow_level > 0 {
+            self.flow_level -= 1;
+            self.simple_keys.pop().unwrap();
+        }
+    }
+
+    fn fetch_block_entry(&mut self) -> ScanResult {
+        if self.flow_level == 0 {
+            // Check if we are allowed to start a new entry.
+            if !self.simple_key_allowed {
+                return Err(ScanError::new(self.mark,
+                    "block sequence entries are not allowed in this context"));
+            }
+
+            let mark = self.mark;
+            // generate BLOCK-SEQUENCE-START if indented
+            self.roll_indent(mark.col, None, TokenType::BlockSequenceStartToken, mark);
+        } else {
+            // - * only allowed in block
+            unreachable!();
+        }
+        try!(self.remove_simple_key());
+        self.allow_simple_key();
+
+        let start_mark = self.mark;
+        self.skip();
+
+        self.tokens.push_back(Token(start_mark, TokenType::BlockEntryToken));
+        Ok(())
+    }
+
+    fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
+        self.unroll_indent(-1);
+        try!(self.remove_simple_key());
+        self.disallow_simple_key();
+
+        let mark = self.mark;
+
+        self.skip();
+        self.skip();
+        self.skip();
+
+        self.tokens.push_back(Token(mark, t));
+        Ok(())
+    }
+
+    fn fetch_plain_scalar(&mut self) -> Result<(), ScanError> {
+        try!(self.save_simple_key());
+
+        self.disallow_simple_key();
+
+        let tok = try!(self.scan_plain_scalar());
+
+        self.tokens.push_back(tok);
+
+        Ok(())
+    }
+
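+    /// Scans an unquoted (plain) scalar, folding line breaks according to
+    /// the YAML folding rules.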
+    fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
+        let indent = self.indent + 1;
+        let start_mark = self.mark;
+
+        let mut string = String::new();
+        let mut leading_break = String::new();
+        let mut trailing_breaks = String::new();
+        let mut whitespaces = String::new();
+        let mut leading_blanks = false;
+
+        loop {
+            /* Check for a document indicator. */
+            self.lookahead(4);
+
+            if self.mark.col == 0
+                && (((self.buffer[0] == '-')
+                    && (self.buffer[1] == '-')
+                    && (self.buffer[2] == '-'))
+                 || ((self.buffer[0] == '.')
+                    && (self.buffer[1] == '.')
+                    && (self.buffer[2] == '.')))
+                && is_blankz(self.buffer[3]) {
+                break;
+            }
+
+            if self.ch() == '#' { break; }
+            while !is_blankz(self.ch()) {
+                if self.flow_level > 0 && self.ch() == ':'
+                    && !is_blankz(self.buffer[1]) {
+                    return Err(ScanError::new(start_mark,
+                        "while scanning a plain scalar, found unexpected ':'"));
+                }
+                // indicators end a plain scalar
+                match self.ch() {
+                    ':' if is_blankz(self.buffer[1]) => break,
+                    ',' | ':' | '?' | '[' | ']' | '{' | '}'
+                        if self.flow_level > 0 => break,
+                    _ => {}
+                }
+
+                if leading_blanks || !whitespaces.is_empty() {
+                    if leading_blanks {
+                        if !leading_break.is_empty() {
+                            if trailing_breaks.is_empty() {
+                                string.push(' ');
+                            } else {
+                                string.extend(trailing_breaks.chars());
+                                trailing_breaks.clear();
+                            }
+                            leading_break.clear();
+                        } else {
+                            string.extend(leading_break.chars());
+                            string.extend(trailing_breaks.chars());
+                            trailing_breaks.clear();
+                            leading_break.clear();
+                        }
+                        leading_blanks = false;
+                    } else {
+                        string.extend(whitespaces.chars());
+                        whitespaces.clear();
+                    }
+                }
+
+                string.push(self.ch());
+                self.skip();
+                self.lookahead(2);
+            }
+            // is this the end?
+            if !(is_blank(self.ch()) || is_break(self.ch())) { break; }
+            self.lookahead(1);
+
+            while is_blank(self.ch()) || is_break(self.ch()) {
+                if is_blank(self.ch()) {
+                    if leading_blanks && (self.mark.col as isize) < indent
+                        && self.ch() == '\t' {
+                        return Err(ScanError::new(start_mark,
+                            "while scanning a plain scalar, found a tab"));
+                    }
+
+                    if !leading_blanks {
+                        whitespaces.push(self.ch());
+                        self.skip();
+                    } else {
+                        self.skip();
+                    }
+                } else {
+                    self.lookahead(2);
+                    // Check if it is a first line break
+                    if !leading_blanks {
+                        whitespaces.clear();
+                        self.read_break(&mut leading_break);
+                        leading_blanks = true;
+                    } else {
+                        self.read_break(&mut trailing_breaks);
+                    }
+                }
+                self.lookahead(1);
+            }
+
+            // check indentation level
+            if self.flow_level == 0 && (self.mark.col as isize) < indent {
+                break;
+            }
+        }
+
+        if leading_blanks {
+            self.allow_simple_key();
+        }
+
+        Ok(Token(start_mark, TokenType::ScalarToken(TScalarStyle::Plain, string)))
+    }
+
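+    // A ':' was reached: retroactively insert the KEY token for the saved
+    // simple key (plus a BLOCK-MAPPING-START if this opens a new mapping),
+    // then emit the VALUE token.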
+    fn fetch_value(&mut self) -> ScanResult {
+        let sk = self.simple_keys.last().unwrap().clone();
+        let start_mark = self.mark;
+        if sk.possible {
+            let tok = Token(start_mark, TokenType::KeyToken);
+            let tokens_parsed = self.tokens_parsed;
+            self.insert_token(sk.token_number - tokens_parsed, tok);
+
+            // Add the BLOCK-MAPPING-START token if needed.
+            self.roll_indent(sk.mark.col, Some(sk.token_number),
+                TokenType::BlockMappingStartToken, start_mark);
+
+            self.simple_keys.last_mut().unwrap().possible = false;
+            self.disallow_simple_key();
+        } else {
+            // The ':' indicator follows a complex key.
+            unimplemented!();
+        }
+
+        self.skip();
+        self.tokens.push_back(Token(start_mark, TokenType::ValueToken));
+
+        Ok(())
+    }
+
+    fn roll_indent(&mut self, col: usize, number: Option<usize>,
+                   tok: TokenType, mark: Marker) {
+        if self.flow_level > 0 {
+            return;
+        }
+
+        if self.indent < col as isize {
+            self.indents.push(self.indent);
+            self.indent = col as isize;
+            let tokens_parsed = self.tokens_parsed;
+            match number {
+                Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)),
+                None => self.tokens.push_back(Token(mark, tok))
+            }
+        }
+    }
+
+    fn unroll_indent(&mut self, col: isize) {
+        if self.flow_level > 0 {
+            return;
+        }
+        while self.indent > col {
+            self.tokens.push_back(Token(self.mark, TokenType::BlockEndToken));
+            self.indent = self.indents.pop().unwrap();
+        }
+    }
+
+    fn save_simple_key(&mut self) -> Result<(), ScanError> {
+        let required = self.flow_level > 0 && self.indent == (self.mark.col as isize);
+        if self.simple_key_allowed {
+            let mut sk = SimpleKey::new(self.mark);
+            sk.possible = true;
+            sk.required = required;
+            sk.token_number = self.tokens_parsed + self.tokens.len();
+
+            try!(self.remove_simple_key());
+
+            self.simple_keys.pop();
+            self.simple_keys.push(sk);
+        }
+        Ok(())
+    }
+
+    fn remove_simple_key(&mut self) -> ScanResult {
+        let last = self.simple_keys.last_mut().unwrap();
+        if last.possible && last.required {
+            return Err(ScanError::new(self.mark, "simple key expected"));
+        }
+
+        last.possible = false;
+        Ok(())
+    }
+
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    #[test]
+    fn test_tokenizer() {
+        let s: String = "---
+# comment
+a0 bb: val
+a1:
+    b1: 4
+    b2: d
+a2: 4
+a3: [1, 2, 3]
+a4:
+    - - a1
+      - a2
+    - 2
+".to_string();
+        let p = Scanner::new(s.chars());
+        for t in p {
+            //println!("{:?}", t);
+        }
+    }
+}
diff --git a/parser/src/yaml.rs b/parser/src/yaml.rs
new file mode 100644
index 0000000..d803bdc
--- /dev/null
+++ b/parser/src/yaml.rs
@@ -0,0 +1,39 @@
+use std::collections::BTreeMap;
+use std::string;
+
+#[derive(Clone, PartialEq, PartialOrd, Debug)]
+pub enum Yaml {
+    I64(i64),
+    U64(u64),
+    F64(f64),
+    String(string::String),
+    Boolean(bool),
+    Array(self::Array),
+    Hash(self::Hash),
+    Null,
+}
+
+pub type Array = Vec<Yaml>;
+pub type Hash = BTreeMap<Yaml, Yaml>;
+
+/// The errors that can arise while parsing a YAML stream.
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub enum ErrorCode {
+    InvalidSyntax,
+    InvalidNumber,
+    EOFWhileParsingObject,
+    EOFWhileParsingArray,
+    EOFWhileParsingValue,
+    EOFWhileParsingString,
+    KeyMustBeAString,
+    ExpectedColon,
+    TrailingCharacters,
+    TrailingComma,
+    InvalidEscape,
+    InvalidUnicodeCodePoint,
+    LoneLeadingSurrogateInHexEscape,
+    UnexpectedEndOfHexEscape,
+    UnrecognizedHex,
+    NotFourDigit,
+    NotUtf8,
+}