From 63a29908261169742cb9d5528ba383a504d290f5 Mon Sep 17 00:00:00 2001 From: Hydrostic Date: Fri, 8 May 2026 22:54:46 +0800 Subject: [PATCH] feat(lexer): Add diagnostic and impl error recovering --- Cargo.lock | 81 ++++++++++++ Cargo.toml | 1 + src/ast/mod.rs | 1 + src/ast/types.rs | 75 +++++++++++ src/diagnostic/mod.rs | 94 ++++++++++++++ src/diagnostic/span.rs | 6 + src/err.rs | 9 ++ src/frontend/err.rs | 28 +++++ src/frontend/lexer.rs | 273 +++++++++++++++++++++++++---------------- src/frontend/mod.rs | 4 +- src/frontend/types.rs | 71 +++++++++-- src/main.rs | 2 + 12 files changed, 525 insertions(+), 120 deletions(-) create mode 100644 src/ast/mod.rs create mode 100644 src/ast/types.rs create mode 100644 src/diagnostic/mod.rs create mode 100644 src/diagnostic/span.rs create mode 100644 src/err.rs create mode 100644 src/frontend/err.rs diff --git a/Cargo.lock b/Cargo.lock index 1fd0f62..db97c61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,17 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "codespan-reporting" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681" +dependencies = [ + "serde", + "termcolor", + "unicode-width", +] + [[package]] name = "heck" version = "0.5.0" @@ -153,12 +164,43 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" name = "rusty-minic" version = "0.1.0" dependencies = [ + "codespan-reporting", "num", "regex", "strum", "thiserror", ] +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "strum" version = "0.28.0" @@ -191,6 +233,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + [[package]] name = "thiserror" version = "2.0.18" @@ -216,3 +267,33 @@ name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] diff --git a/Cargo.toml b/Cargo.toml index e2c6a11..61533d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2024" [dependencies] +codespan-reporting = "0.13.1" num = "0.4.3" regex = "1.12.3" strum = { version = "0.28.0", features = ["derive"] } diff --git a/src/ast/mod.rs b/src/ast/mod.rs new file mode 100644 index 0000000..cd40856 --- /dev/null +++ b/src/ast/mod.rs @@ -0,0 +1 @@ +pub mod types; diff --git a/src/ast/types.rs b/src/ast/types.rs new file mode 100644 index 0000000..dbe2293 --- /dev/null +++ b/src/ast/types.rs @@ -0,0 +1,75 @@ +#[derive(Debug, Clone, Copy)] +pub struct Span { + start: usize, + end: usize, +} +pub struct CompileUnit { + pub global_decls: Vec, +} +pub enum GlobalDeclStmt { + VarDecl(VarDeclStmt), + FuncDecl(FuncDeclStmt), +} + +pub struct VarDeclStmt { + pub name: String, + pub var_type: Type, + pub span: Span, +} + +pub struct FuncDeclStmt { + pub name: String, + pub return_type: Type, + pub params: Vec, + pub body: BlockStmt, + pub span: Span, +} +pub struct BlockStmt { + pub statements: Vec, + pub span: Span, +} + +pub enum Statement { + Return(ReturnStmt), + Block(BlockStmt), + Expr(Expr), + VarDecl(VarDeclStmt), +} +pub struct ReturnStmt { + pub value: Option, + pub span: Span, +} +pub struct Expr { + pub value: ExprValue, + pub span: Span, +} +pub enum ExprValue { + IntLit(i64), + Var(String), + BinaryOp { + lhs: Box, + op: BinaryOp, + rhs: Box + }, + FuncCall(String, Vec), + Assign { + lvalue: Box, + rvalue: Box + }, +} + +pub enum BinaryOp { + Add, Sub, Mul, Div, Mod, + Equal, NotEqual, Less, LessEqual, Greater, GreaterEqual, +} + +pub enum Type { + Int, + Void, +} + +pub struct Param { + name: String, + param_type: Type, + span: Span, +} \ No newline at end of file diff --git a/src/diagnostic/mod.rs b/src/diagnostic/mod.rs new file mode 100644 index 0000000..685dbd6 --- /dev/null +++ b/src/diagnostic/mod.rs @@ -0,0 +1,94 @@ +use crate::{diagnostic::span::Span, err::CompileError, frontend::err::FrontendError}; + +pub mod span; + +pub struct Diagnositics { + diagnostics: Vec, +} +pub enum DiagnosticLevel { + Error, + Warning, + Info, +} +pub struct Diagnostic { + level: DiagnosticLevel, + message: String, + span: Span +} +impl Diagnositics { + pub fn new() -> Self { + Self { diagnostics: vec![] } + } + pub fn add(&mut self, diagnostic: Diagnostic) { + self.diagnostics.push(diagnostic); + } + pub fn add_from_error(&mut self, error: impl Into, span: Span) { + self.diagnostics.push(Diagnostic { + level: DiagnosticLevel::Error, + message: Into::::into(error).to_string(), + span, + }); + } + pub fn add_from_frontend_error(&mut self, error: impl Into, span: Span) { + self.diagnostics.push(Diagnostic { + level: DiagnosticLevel::Error, + message: Into::::into(error).to_string(), + span, + }); + } + pub fn is_empty(&self) -> bool { + self.diagnostics.is_empty() + } + pub fn print(&self, name: &str, source: &str) { + use codespan_reporting::diagnostic::Diagnostic as CodespanDiagnostic; + use codespan_reporting::files::SimpleFile; + use codespan_reporting::diagnostic::{Severity, Label}; + use std::io::IsTerminal; + + use codespan_reporting::term::{self, termcolor::{ColorChoice, StandardStream}}; + let mut choice = ColorChoice::Auto; + if !std::io::stdin().is_terminal() { + choice = ColorChoice::Never; + } + let stdout = StandardStream::stdout(choice); + let source_file = SimpleFile::new(name, source); + let output_config = codespan_reporting::term::Config::default(); + for diagnostic in &self.diagnostics { + let output_level = match diagnostic.level { + DiagnosticLevel::Error => Severity::Error, + DiagnosticLevel::Warning => Severity::Warning, + DiagnosticLevel::Info => Severity::Note, + }; + let output_diagnostic = CodespanDiagnostic::new(output_level) + .with_message(&diagnostic.message) + .with_label( + Label::primary((), diagnostic.span.start..diagnostic.span.end) + ); + term::emit_to_write_style(&mut stdout.lock(), &output_config, &source_file, &output_diagnostic); + + } + + } +} +#[cfg(test)] +mod tests { + #[test] + fn test_diagnostics() { + use crate::diagnostic::{Diagnostic, DiagnosticLevel, Diagnositics}; + let mut diagnostics = Diagnositics::new(); + diagnostics.add(Diagnostic { + level: DiagnosticLevel::Error, + message: "test error".to_string(), + span: crate::diagnostic::span::Span { start: 0, end: 3 }, + }); + diagnostics.add(Diagnostic { + level: DiagnosticLevel::Error, + message: "test error".to_string(), + span: crate::diagnostic::span::Span { start: 16, end: 22 }, + }); + diagnostics.print("main.c", +r#"int main(){ + return 1; +}"#); + } +} \ No newline at end of file diff --git a/src/diagnostic/span.rs b/src/diagnostic/span.rs new file mode 100644 index 0000000..4a0820c --- /dev/null +++ b/src/diagnostic/span.rs @@ -0,0 +1,6 @@ + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Span { + pub start: usize, + pub end: usize, +} \ No newline at end of file diff --git a/src/err.rs b/src/err.rs new file mode 100644 index 0000000..9f4869f --- /dev/null +++ b/src/err.rs @@ -0,0 +1,9 @@ +use thiserror::Error; + +use crate::frontend::err::FrontendError; + +#[derive(Debug, Clone, PartialEq, Eq, Error)] +pub enum CompileError { + #[error(transparent)] + Frontend(#[from] FrontendError), +} \ No newline at end of file diff --git a/src/frontend/err.rs b/src/frontend/err.rs new file mode 100644 index 0000000..c733c7b --- /dev/null +++ b/src/frontend/err.rs @@ -0,0 +1,28 @@ +use thiserror::Error; + +// #[derive(Debug, Clone, PartialEq, Eq, Error)] +// pub enum ParseError { +// BlockStmt(#[from] BlockStmtError) +// } +// #[derive(Debug, Clone, PartialEq, Eq, Error)] +// pub enum BlockStmtError { +// MissingLBrace, +// MissingRBrace, +// } +#[derive(Debug, Clone, PartialEq, Eq, Error)] +pub enum LexingError { + #[error("invalid int literal")] + InvalidIntLiteral, + #[error("invalid ident")] + InvalidIdent, + #[error("comment unterminated")] + UnterminatedComment, + #[error("unrecognized token: {0}")] + UnrecognizedToken(String), +} +#[derive(Debug, Clone, PartialEq, Eq, Error)] +pub enum FrontendError { + #[error(transparent)] + Lexing(#[from] LexingError), + +} \ No newline at end of file diff --git a/src/frontend/lexer.rs b/src/frontend/lexer.rs index c7476da..c378ed7 100644 --- a/src/frontend/lexer.rs +++ b/src/frontend/lexer.rs @@ -1,21 +1,32 @@ -use std::{io::BufRead, iter::Peekable, str::FromStr}; +use std::{io::BufRead, str::FromStr}; +use codespan_reporting::diagnostic; use thiserror::Error; -use crate::frontend::types::{Span, TokenValue, TypeIdent}; +use crate::{diagnostic::{Diagnositics, span::{self, Span}}, frontend::{err::LexingError, types::{TokenValue, TypeIdent}}}; use super::types::Token; pub struct Lexer { - tokens: Vec, - errors: Vec, // every entry points to the index of unrecognized tokens + pub tokens: Vec, + pub diagnostics: Diagnositics, + old_char_count: usize, + block_comment_span: Option, + in_skip_line: bool } const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r']; +const DELIMITER_CHARS: &[char] = &[ + '+', '-', '*', '/', '%', '=', '!', '<', '>', '(', ')', ',', ';' +]; struct Cursor { chars: Vec, pos: usize, } +enum LexParseError { + NotMatched, + InvalidInMatch(LexingError) +} impl Cursor { pub fn new(s: &str) -> Self { Self { chars: s.chars().collect(), pos: 0 } @@ -47,20 +58,35 @@ impl Cursor { self.pos } } +/// try parse using the giving function, return whether should continue fn try_parse_as( - f: fn(&mut Cursor) -> Option, + f: fn(&mut Cursor) -> Result, tokens: &mut Vec, str_iter: &mut Cursor, - line: &mut usize, - column: &mut usize, + diagnostics: &mut Diagnositics, + last_char_count: usize, ) -> bool { - let last_pos = str_iter.pos(); - if let Some(token) = f(str_iter) { - let span = Span { line: *line, column: *column, length: str_iter.pos() - last_pos }; - tokens.push(Token { value: token, span }); - return true; + let last_pos = str_iter.pos() + last_char_count; + match f(str_iter) { + Ok(token_value) => { + let span = Span { start: last_pos, end: str_iter.pos() + last_char_count }; + tokens.push(Token { value: token_value, span }); + return true; + } + Err(LexParseError::NotMatched) => false, + Err(LexParseError::InvalidInMatch(err)) => { + // try recover from delimiter char or whitespace char + while let Some(c) = str_iter.peek() { + if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) { + break; + } + str_iter.advance(1); + } + let span = Span { start: last_pos, end: str_iter.pos() + last_char_count }; + diagnostics.add_from_frontend_error(err, span); + return true; + } } - false } macro_rules! if_true_then_continue { ($e: expr) => { @@ -77,99 +103,106 @@ pub enum LexerError { TooManyErrors, } impl Lexer { - pub fn has_errors(&self) -> bool { - !self.errors.is_empty() + pub fn new() -> Self { + Self { tokens: vec![], diagnostics: Diagnositics::new(), old_char_count: 0, block_comment_span: None, in_skip_line: false } } - pub fn parse(reader: &mut impl BufRead) -> Result { - let mut tokens = Vec::new(); - let mut errors = Vec::new(); - let mut line = 1; - let mut column = 1; - let mut in_block_comment = false; - for line_str in reader.lines() { - let line_str = line_str?; - let mut cursor = Cursor::new(&line_str); - loop { - if let Some(c) = cursor.peek() { - // check white space first, if it's white space, skip it and continue to the next character - if WHITESPACE_CHARS.contains(&c) { - column += 1; - cursor.advance(1); - continue; - } - // check comment - match cursor.peek_multiple(2) { - Some(['/', '/']) => { - // skip the rest of the line - line += 1; - column = 1; - break; - } - Some(['/', '*']) => { - in_block_comment = true; - cursor.advance(2); - column += 2; - continue; - } - Some(['*', '/']) => { - in_block_comment = false; - cursor.advance(2); - column += 2; - continue; - } - _ => {} - } - } else { - break; - } - if in_block_comment { - cursor.advance(1); - column += 1; - } - if_true_then_continue!(try_parse_as(parse_litint, &mut tokens, &mut cursor, &mut line, &mut column)); - if_true_then_continue!(try_parse_as(parse_delimiter, &mut tokens, &mut cursor, &mut line, &mut column)); - if_true_then_continue!(try_parse_as(parse_puncuation, &mut tokens, &mut cursor, &mut line, &mut column)); - if_true_then_continue!(try_parse_as(parse_ident, &mut tokens, &mut cursor, &mut line, &mut column)); - // unrecognized token - errors.push(tokens.len()); - let c = cursor.next().unwrap(); - tokens.push(Token { - value: TokenValue::Unrecognized(c), - span: Span { line, column, length: 1 }, - }); - if errors.len() > 20 { - return Err(LexerError::TooManyErrors); - } - column += 1; - } - line += 1; - column = 1; + pub fn finish(mut self) -> (Vec, Diagnositics) { + if let Some(span) = self.block_comment_span.take() { + self.diagnostics.add_from_frontend_error(LexingError::UnterminatedComment, span); } - Ok(Self { tokens, errors }) + (self.tokens, self.diagnostics) + } + /// call `parse_str` will continue to parse the input from current state + /// please also pass the whitespace to ensure the correct char position in diagnostics + pub fn parse_next_str(&mut self, s: &str) { + let mut cursor = Cursor::new(s); + loop { + if let Some(c) = cursor.peek() { + if self.in_skip_line && c != '\n' { + cursor.advance(1); + continue; + } + // check white space first, if it's white space, skip it and continue to the next character + if WHITESPACE_CHARS.contains(&c) { + if c == '\n' { + self.in_skip_line = false; + } + cursor.advance(1); + continue; + } + // check comment + match cursor.peek_multiple(2) { + Some(['/', '/']) => { + // skip the rest of the line + self.in_skip_line = true; + cursor.advance(2); + continue; + } + Some(['/', '*']) => { + let start = cursor.pos() + self.old_char_count; + self.block_comment_span = Some(Span { start, end: start + 2 }); + cursor.advance(2); + continue; + } + Some(['*', '/']) => { + self.block_comment_span = None; + cursor.advance(2); + continue; + } + _ => {} + } + if self.block_comment_span.is_some() { + cursor.advance(1); + continue; + } + } else { + break; + } + if_true_then_continue!(try_parse_as(parse_litint, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count)); + if_true_then_continue!(try_parse_as(parse_delimiter, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count)); + if_true_then_continue!(try_parse_as(parse_puncuation, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count)); + if_true_then_continue!(try_parse_as(parse_ident, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count)); + // unrecognized token + let last_pos = cursor.pos() + self.old_char_count; + let mut unrecognized = Vec::new(); + while let Some(c) = cursor.peek() { + if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) { + break; + } + unrecognized.push(c); + cursor.advance(1); + } + let span = Span { start: last_pos, end: cursor.pos() + self.old_char_count }; + let unrecognized = unrecognized.into_iter().collect::(); + self.diagnostics.add_from_frontend_error(LexingError::UnrecognizedToken(unrecognized), span); + self.tokens.push(Token { value: TokenValue::Unrecognized, span }); + } + self.old_char_count += s.len(); } } fn parse_litint( str_iter: &mut Cursor, -) -> Option { - let mut c1 = str_iter.peek()?; +) -> Result { + let mut c1 = str_iter.peek().ok_or(LexParseError::NotMatched)?; // c1 is the peek value from here let mut sign_base: i64 = 1; let mut base: i64 = 10; if !(c1.is_ascii_digit() || c1 == '-') { - return None; + return Err(LexParseError::NotMatched); } if c1 == '-' { sign_base = -1; str_iter.advance(1); - c1 = str_iter.peek()?; + c1 = str_iter.peek().ok_or(LexParseError::NotMatched)?; if !c1.is_ascii_digit() { // only a minus sign, not a number // back one so cursor still points to the minus sign str_iter.back(1); - return None; + return Err(LexParseError::NotMatched); } } let mut number = 0i64; + let mut has_digits = false; if c1 == '0' { str_iter.advance(1); match str_iter.peek() { @@ -181,12 +214,13 @@ fn parse_litint( base = 8; } _ => { + has_digits = true; // only zero } } } // from here, the cursor points to: - // 0x1234 -> cursor at 'x' + // 0x1234 -> cursor at '1' // 0123 -> cursor at '1' // 0 -> cursor at end // 1234 -> cursor at '1' @@ -199,32 +233,42 @@ fn parse_litint( '0'..='9' if (c as u8 - b'0') < base as u8 => c as i64 - '0' as i64 , 'a'..='f' if base == 16 => c as i64 - 'a' as i64 + 10 , 'A'..='F' if base == 16 => c as i64 - 'A' as i64 + 10, - _ => break, + c => if WHITESPACE_CHARS.contains(&c) || DELIMITER_CHARS.contains(&c) { + break; + } else { + // unrecognized character in number literal + return Err(LexParseError::InvalidInMatch(LexingError::InvalidIntLiteral)); + } }; + has_digits = true; number = number * base + digit; str_iter.advance(1); } + if !has_digits { + // No valid digits found, add a diagnostic + return Err(LexParseError::InvalidInMatch(LexingError::InvalidIntLiteral)); + } number *= sign_base; - Some(TokenValue::IntLit(number)) + Ok(TokenValue::IntLit(number)) } fn parse_delimiter( str_iter: &mut Cursor, -) -> Option { - let c = str_iter.peek()?; +) -> Result { + let c = str_iter.peek().ok_or(LexParseError::NotMatched)?; let token_value = match c { '(' => TokenValue::LParen, ')' => TokenValue::RParen, '{' => TokenValue::LBrace, '}' => TokenValue::RBrace, - _ => return None, + _ => return Err(LexParseError::NotMatched), }; str_iter.advance(1); - Some(token_value) + Ok(token_value) } fn parse_puncuation( str_iter: &mut Cursor, -) -> Option { +) -> Result { let get_value_by_next_char = |str_iter: &mut Cursor, not_equal_value: TokenValue, equal_value: TokenValue| { str_iter.advance(1); @@ -235,7 +279,7 @@ fn parse_puncuation( not_equal_value } }; - let c = str_iter.peek()?; + let c = str_iter.peek().ok_or(LexParseError::NotMatched)?; let token_value = match c { '+' => TokenValue::Plus, '-' => TokenValue::Minus, @@ -249,9 +293,7 @@ fn parse_puncuation( if let Some('=') = str_iter.peek() { TokenValue::NotEqual } else { - // only '!' is not a valid token, back one so cursor still points to '!' - str_iter.back(1); - return None; + TokenValue::Not } }, '<' => get_value_by_next_char(str_iter, TokenValue::Less, TokenValue::LessEqual), @@ -260,33 +302,35 @@ fn parse_puncuation( ',' => TokenValue::Comma, ';' => TokenValue::Semicolon, - _ => return None, + _ => return Err(LexParseError::NotMatched), }; str_iter.advance(1); - Some(token_value) + Ok(token_value) } fn parse_ident( str_iter: &mut Cursor, -) -> Option { - let c = str_iter.peek()?; +) -> Result { + let c = str_iter.peek().ok_or(LexParseError::NotMatched)?; if !c.is_ascii_alphabetic() && c != '_' { - return None; + return Err(LexParseError::NotMatched); } let mut name = Vec::new(); while let Some(c) = str_iter.peek() { if c.is_ascii_alphanumeric() || c == '_' { name.push(c); str_iter.advance(1); - } else { + } else if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) { break; + } else { + return Err(LexParseError::InvalidInMatch(LexingError::InvalidIdent)); } } let name = name.into_iter().collect::(); if let Some(type_ident) = TypeIdent::from_str(&name).ok() { - return Some(TokenValue::TypeIdent(type_ident)); + return Ok(TokenValue::TypeIdent(type_ident)); } - Some(TokenValue::Ident(name)) + Ok(TokenValue::Ident(name)) } #[cfg(test)] mod tests { @@ -303,11 +347,22 @@ mod tests { for case_no in case_sequence { let case_path = case_list.get_case_path(case_no).unwrap(); println!("{}", case_path.display()); - let file = File::open(case_path).unwrap(); + let file = File::open(&case_path).unwrap(); let mut buf_reader = std::io::BufReader::new(file); - let lexer = Lexer::parse(&mut buf_reader).unwrap(); - if lexer.has_errors() { - eprintln!("Case {} has error", case_list.get_case_name(case_no).unwrap()); + let mut lexer = Lexer::new(); + let mut full_text = String::new(); + loop { + let mut line = String::new(); + let bytes_read = buf_reader.read_line(&mut line).unwrap(); + if bytes_read == 0 { + break; + } + full_text.push_str(&line); + lexer.parse_next_str(&line); + } + let (_tokens, diagnostics) = lexer.finish(); + if !diagnostics.is_empty() { + diagnostics.print(&format!("{}", case_path.display()), &full_text); error_case_cnt += 1; } } diff --git a/src/frontend/mod.rs b/src/frontend/mod.rs index c7108a2..a82d1d8 100644 --- a/src/frontend/mod.rs +++ b/src/frontend/mod.rs @@ -1,2 +1,4 @@ pub mod types; -mod lexer; \ No newline at end of file +mod lexer; +// pub mod parser; +pub mod err; \ No newline at end of file diff --git a/src/frontend/types.rs b/src/frontend/types.rs index 1899d24..b82fd6b 100644 --- a/src/frontend/types.rs +++ b/src/frontend/types.rs @@ -1,4 +1,6 @@ -use strum::EnumString; +use strum::{AsRefStr, EnumString}; + +use crate::diagnostic::span::Span; #[derive(Debug, Clone, PartialEq, Eq)] @@ -7,12 +9,6 @@ pub struct Token { pub span: Span, } -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Span { - pub line: usize, - pub column: usize, - pub length: usize, -} #[derive(Debug, Clone, PartialEq, Eq)] pub enum TokenValue { @@ -20,6 +16,61 @@ pub enum TokenValue { Ident(String), TypeIdent(TypeIdent), + Plus, Minus, Star, Slash, Percent, + Equal, DoubleEqual, Not, NotEqual, Less, LessEqual, Greater, GreaterEqual, + + LParen, RParen, + LBrace, RBrace, + Comma, Semicolon, + + If, Else, While, Return, Break, Continue, + + // Eof, + Unrecognized, +} +impl std::fmt::Display for TokenValue { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TokenValue::IntLit(i) => write!(f, "literal int: {}", i), + TokenValue::Ident(s) => write!(f, "identifier: {}", s), + TokenValue::TypeIdent(t) => write!(f, "type {}", t.as_ref()), + TokenValue::Plus => write!(f, "+"), + TokenValue::Minus => write!(f, "-"), + TokenValue::Star => write!(f, "*"), + TokenValue::Slash => write!(f, "/"), + TokenValue::Percent => write!(f, "%"), + TokenValue::Equal => write!(f, "="), + TokenValue::DoubleEqual => write!(f, "=="), + TokenValue::Not => write!(f, "!"), + TokenValue::NotEqual => write!(f, "!="), + TokenValue::Less => write!(f, "<"), + TokenValue::LessEqual => write!(f, "<="), + TokenValue::Greater => write!(f, ">"), + TokenValue::GreaterEqual => write!(f, ">="), + TokenValue::LParen => write!(f, "("), + TokenValue::RParen => write!(f, ")"), + TokenValue::LBrace => write!(f, "{{"), + TokenValue::RBrace => write!(f, "}}"), + TokenValue::Comma => write!(f, ","), + TokenValue::Semicolon => write!(f, ";"), + TokenValue::If => write!(f, "if"), + TokenValue::Else => write!(f, "else"), + TokenValue::While => write!(f, "while"), + TokenValue::Return => write!(f, "return"), + TokenValue::Break => write!(f, "break"), + TokenValue::Continue => write!(f, "continue"), + // TokenValue::Eof => write!(f, ""), + TokenValue::Unrecognized => write!(f, "unrecognized"), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TokenKind { + IntLit, + Ident, + TypeIdent, + Plus, Minus, Star, Slash, Percent, Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual, @@ -29,10 +80,10 @@ pub enum TokenValue { If, Else, While, Return, Break, Continue, - Eof, - Unrecognized(char), + // Eof, + Unrecognized, } -#[derive(Debug, Clone, PartialEq, Eq, EnumString)] +#[derive(Debug, Clone, PartialEq, Eq, EnumString, AsRefStr)] pub enum TypeIdent { #[strum(serialize = "int")] Int, diff --git a/src/main.rs b/src/main.rs index 1f94806..9540caf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,8 @@ mod frontend; mod ast; mod utils; +mod diagnostic; +mod err; fn main() { println!("Hello, world!"); }