feat(lexer): Add diagnostics and implement error recovery
This commit is contained in:
Generated
+81
@@ -17,6 +17,17 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||
|
||||
[[package]]
|
||||
name = "codespan-reporting"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"termcolor",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -153,12 +164,43 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
||||
name = "rusty-minic"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"codespan-reporting",
|
||||
"num",
|
||||
"regex",
|
||||
"strum",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.28.0"
|
||||
@@ -191,6 +233,15 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.18"
|
||||
@@ -216,3 +267,33 @@ name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
|
||||
dependencies = [
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.61.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
@@ -4,6 +4,7 @@ version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
codespan-reporting = "0.13.1"
|
||||
num = "0.4.3"
|
||||
regex = "1.12.3"
|
||||
strum = { version = "0.28.0", features = ["derive"] }
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
pub mod types;
|
||||
@@ -0,0 +1,75 @@
|
||||
/// Source range attached to an AST node, stored as a `start`/`end` offset pair.
///
/// The fields are public so that code outside this module (for example a
/// parser) can build and inspect spans, matching the other AST types whose
/// fields are all public.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset where the range begins.
    pub start: usize,
    /// Offset where the range ends.
    pub end: usize,
}
|
||||
pub struct CompileUnit {
|
||||
pub global_decls: Vec<GlobalDeclStmt>,
|
||||
}
|
||||
pub enum GlobalDeclStmt {
|
||||
VarDecl(VarDeclStmt),
|
||||
FuncDecl(FuncDeclStmt),
|
||||
}
|
||||
|
||||
pub struct VarDeclStmt {
|
||||
pub name: String,
|
||||
pub var_type: Type,
|
||||
pub span: Span,
|
||||
}
|
||||
|
||||
pub struct FuncDeclStmt {
|
||||
pub name: String,
|
||||
pub return_type: Type,
|
||||
pub params: Vec<Param>,
|
||||
pub body: BlockStmt,
|
||||
pub span: Span,
|
||||
}
|
||||
pub struct BlockStmt {
|
||||
pub statements: Vec<Statement>,
|
||||
pub span: Span,
|
||||
}
|
||||
|
||||
pub enum Statement {
|
||||
Return(ReturnStmt),
|
||||
Block(BlockStmt),
|
||||
Expr(Expr),
|
||||
VarDecl(VarDeclStmt),
|
||||
}
|
||||
pub struct ReturnStmt {
|
||||
pub value: Option<Expr>,
|
||||
pub span: Span,
|
||||
}
|
||||
pub struct Expr {
|
||||
pub value: ExprValue,
|
||||
pub span: Span,
|
||||
}
|
||||
pub enum ExprValue {
|
||||
IntLit(i64),
|
||||
Var(String),
|
||||
BinaryOp {
|
||||
lhs: Box<Expr>,
|
||||
op: BinaryOp,
|
||||
rhs: Box<Expr>
|
||||
},
|
||||
FuncCall(String, Vec<Expr>),
|
||||
Assign {
|
||||
lvalue: Box<Expr>,
|
||||
rvalue: Box<Expr>
|
||||
},
|
||||
}
|
||||
|
||||
/// Operators usable in [`ExprValue::BinaryOp`].
///
/// This is a fieldless enum, so it derives the usual value-type traits; that
/// lets operators be printed in debug output, compared, and passed by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp {
    // Arithmetic operators.
    Add,
    Sub,
    Mul,
    Div,
    Mod,
    // Comparison operators.
    Equal,
    NotEqual,
    Less,
    LessEqual,
    Greater,
    GreaterEqual,
}
|
||||
|
||||
/// Types that can appear in declarations (`var_type`, `return_type`, `param_type`).
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so types can be printed and
/// compared. `Copy` is deliberately left out so that variants carrying owned
/// data can be added later without breaking users.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Type {
    /// The `Int` type.
    Int,
    /// The `Void` type.
    Void,
}
|
||||
|
||||
pub struct Param {
|
||||
name: String,
|
||||
param_type: Type,
|
||||
span: Span,
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
use crate::{diagnostic::span::Span, err::CompileError, frontend::err::FrontendError};
|
||||
|
||||
pub mod span;
|
||||
|
||||
pub struct Diagnositics {
|
||||
diagnostics: Vec<Diagnostic>,
|
||||
}
|
||||
/// Severity of a [`Diagnostic`].
///
/// This is a fieldless enum, so it derives the usual value-type traits for
/// debug printing, comparison and cheap copying.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiagnosticLevel {
    /// A hard error.
    Error,
    /// A warning.
    Warning,
    /// An informational note.
    Info,
}
|
||||
pub struct Diagnostic {
|
||||
level: DiagnosticLevel,
|
||||
message: String,
|
||||
span: Span
|
||||
}
|
||||
impl Diagnositics {
|
||||
pub fn new() -> Self {
|
||||
Self { diagnostics: vec![] }
|
||||
}
|
||||
pub fn add(&mut self, diagnostic: Diagnostic) {
|
||||
self.diagnostics.push(diagnostic);
|
||||
}
|
||||
pub fn add_from_error(&mut self, error: impl Into<CompileError>, span: Span) {
|
||||
self.diagnostics.push(Diagnostic {
|
||||
level: DiagnosticLevel::Error,
|
||||
message: Into::<CompileError>::into(error).to_string(),
|
||||
span,
|
||||
});
|
||||
}
|
||||
pub fn add_from_frontend_error(&mut self, error: impl Into<FrontendError>, span: Span) {
|
||||
self.diagnostics.push(Diagnostic {
|
||||
level: DiagnosticLevel::Error,
|
||||
message: Into::<FrontendError>::into(error).to_string(),
|
||||
span,
|
||||
});
|
||||
}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.diagnostics.is_empty()
|
||||
}
|
||||
pub fn print(&self, name: &str, source: &str) {
|
||||
use codespan_reporting::diagnostic::Diagnostic as CodespanDiagnostic;
|
||||
use codespan_reporting::files::SimpleFile;
|
||||
use codespan_reporting::diagnostic::{Severity, Label};
|
||||
use std::io::IsTerminal;
|
||||
|
||||
use codespan_reporting::term::{self, termcolor::{ColorChoice, StandardStream}};
|
||||
let mut choice = ColorChoice::Auto;
|
||||
if !std::io::stdin().is_terminal() {
|
||||
choice = ColorChoice::Never;
|
||||
}
|
||||
let stdout = StandardStream::stdout(choice);
|
||||
let source_file = SimpleFile::new(name, source);
|
||||
let output_config = codespan_reporting::term::Config::default();
|
||||
for diagnostic in &self.diagnostics {
|
||||
let output_level = match diagnostic.level {
|
||||
DiagnosticLevel::Error => Severity::Error,
|
||||
DiagnosticLevel::Warning => Severity::Warning,
|
||||
DiagnosticLevel::Info => Severity::Note,
|
||||
};
|
||||
let output_diagnostic = CodespanDiagnostic::new(output_level)
|
||||
.with_message(&diagnostic.message)
|
||||
.with_label(
|
||||
Label::primary((), diagnostic.span.start..diagnostic.span.end)
|
||||
);
|
||||
term::emit_to_write_style(&mut stdout.lock(), &output_config, &source_file, &output_diagnostic);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
mod tests {
    use crate::diagnostic::span::Span;
    use crate::diagnostic::{Diagnositics, Diagnostic, DiagnosticLevel};

    /// Checks that diagnostics are collected and that rendering two of them
    /// against a small source does not panic.
    #[test]
    fn test_diagnostics() {
        let mut diagnostics = Diagnositics::new();
        assert!(diagnostics.is_empty());

        diagnostics.add(Diagnostic {
            level: DiagnosticLevel::Error,
            message: "test error".to_string(),
            span: Span { start: 0, end: 3 },
        });
        diagnostics.add(Diagnostic {
            level: DiagnosticLevel::Error,
            message: "test error".to_string(),
            span: Span { start: 16, end: 22 },
        });
        assert!(!diagnostics.is_empty());

        // With a four-space indent, 0..3 covers `int` and 16..22 covers `return`.
        diagnostics.print("main.c", "int main(){\n    return 1;\n}");
    }
}
|
||||
@@ -0,0 +1,6 @@
|
||||
|
||||
/// A source range stored as a `start`/`end` pair of offsets.
///
/// `Diagnositics::print` turns it into the range `start..end` when labelling
/// source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset where the range begins.
    pub start: usize,
    /// Offset where the range ends.
    pub end: usize,
}
|
||||
@@ -0,0 +1,9 @@
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::frontend::err::FrontendError;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||
pub enum CompileError {
|
||||
#[error(transparent)]
|
||||
Frontend(#[from] FrontendError),
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
use thiserror::Error;
|
||||
|
||||
// #[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||
// pub enum ParseError {
|
||||
// BlockStmt(#[from] BlockStmtError)
|
||||
// }
|
||||
// #[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||
// pub enum BlockStmtError {
|
||||
// MissingLBrace,
|
||||
// MissingRBrace,
|
||||
// }
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||
pub enum LexingError {
|
||||
#[error("invalid int literal")]
|
||||
InvalidIntLiteral,
|
||||
#[error("invalid ident")]
|
||||
InvalidIdent,
|
||||
#[error("comment unterminated")]
|
||||
UnterminatedComment,
|
||||
#[error("unrecognized token: {0}")]
|
||||
UnrecognizedToken(String),
|
||||
}
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||
pub enum FrontendError {
|
||||
#[error(transparent)]
|
||||
Lexing(#[from] LexingError),
|
||||
|
||||
}
|
||||
+164
-109
@@ -1,21 +1,32 @@
|
||||
use std::{io::BufRead, iter::Peekable, str::FromStr};
|
||||
use std::{io::BufRead, str::FromStr};
|
||||
|
||||
use codespan_reporting::diagnostic;
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::frontend::types::{Span, TokenValue, TypeIdent};
|
||||
use crate::{diagnostic::{Diagnositics, span::{self, Span}}, frontend::{err::LexingError, types::{TokenValue, TypeIdent}}};
|
||||
|
||||
use super::types::Token;
|
||||
|
||||
pub struct Lexer {
|
||||
tokens: Vec<Token>,
|
||||
errors: Vec<usize>, // every entry points to the index of unrecognized tokens
|
||||
pub tokens: Vec<Token>,
|
||||
pub diagnostics: Diagnositics,
|
||||
old_char_count: usize,
|
||||
block_comment_span: Option<Span>,
|
||||
in_skip_line: bool
|
||||
}
|
||||
|
||||
/// Characters the lexer skips between tokens.
const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r'];

/// Characters that end the token before them (an identifier, an int literal,
/// or a run of unrecognized input).
///
/// Braces are included: without them, input such as `else{` or `1}` is
/// rejected as an invalid identifier / int literal because the brace is
/// neither part of the token nor a known terminator.
const DELIMITER_CHARS: &[char] = &[
    '+', '-', '*', '/', '%', '=', '!', '<', '>', '(', ')', '{', '}', ',', ';',
];
|
||||
/// Position-tracking view over the characters of one input chunk.
struct Cursor {
    /// The input, split into individual `char`s.
    chars: Vec<char>,
    /// Current position within the input.
    pos: usize,
}
|
||||
enum LexParseError {
|
||||
NotMatched,
|
||||
InvalidInMatch(LexingError)
|
||||
}
|
||||
impl Cursor {
|
||||
pub fn new(s: &str) -> Self {
|
||||
Self { chars: s.chars().collect(), pos: 0 }
|
||||
@@ -47,20 +58,35 @@ impl Cursor {
|
||||
self.pos
|
||||
}
|
||||
}
|
||||
/// try parse using the giving function, return whether should continue
|
||||
fn try_parse_as(
|
||||
f: fn(&mut Cursor) -> Option<TokenValue>,
|
||||
f: fn(&mut Cursor) -> Result<TokenValue, LexParseError>,
|
||||
tokens: &mut Vec<Token>,
|
||||
str_iter: &mut Cursor,
|
||||
line: &mut usize,
|
||||
column: &mut usize,
|
||||
diagnostics: &mut Diagnositics,
|
||||
last_char_count: usize,
|
||||
) -> bool {
|
||||
let last_pos = str_iter.pos();
|
||||
if let Some(token) = f(str_iter) {
|
||||
let span = Span { line: *line, column: *column, length: str_iter.pos() - last_pos };
|
||||
tokens.push(Token { value: token, span });
|
||||
return true;
|
||||
let last_pos = str_iter.pos() + last_char_count;
|
||||
match f(str_iter) {
|
||||
Ok(token_value) => {
|
||||
let span = Span { start: last_pos, end: str_iter.pos() + last_char_count };
|
||||
tokens.push(Token { value: token_value, span });
|
||||
return true;
|
||||
}
|
||||
Err(LexParseError::NotMatched) => false,
|
||||
Err(LexParseError::InvalidInMatch(err)) => {
|
||||
// try recover from delimiter char or whitespace char
|
||||
while let Some(c) = str_iter.peek() {
|
||||
if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) {
|
||||
break;
|
||||
}
|
||||
str_iter.advance(1);
|
||||
}
|
||||
let span = Span { start: last_pos, end: str_iter.pos() + last_char_count };
|
||||
diagnostics.add_from_frontend_error(err, span);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
macro_rules! if_true_then_continue {
|
||||
($e: expr) => {
|
||||
@@ -77,99 +103,106 @@ pub enum LexerError {
|
||||
TooManyErrors,
|
||||
}
|
||||
impl Lexer {
|
||||
pub fn has_errors(&self) -> bool {
|
||||
!self.errors.is_empty()
|
||||
pub fn new() -> Self {
|
||||
Self { tokens: vec![], diagnostics: Diagnositics::new(), old_char_count: 0, block_comment_span: None, in_skip_line: false }
|
||||
}
|
||||
pub fn parse(reader: &mut impl BufRead) -> Result<Self, LexerError> {
|
||||
let mut tokens = Vec::new();
|
||||
let mut errors = Vec::new();
|
||||
let mut line = 1;
|
||||
let mut column = 1;
|
||||
let mut in_block_comment = false;
|
||||
for line_str in reader.lines() {
|
||||
let line_str = line_str?;
|
||||
let mut cursor = Cursor::new(&line_str);
|
||||
loop {
|
||||
if let Some(c) = cursor.peek() {
|
||||
// check white space first, if it's white space, skip it and continue to the next character
|
||||
if WHITESPACE_CHARS.contains(&c) {
|
||||
column += 1;
|
||||
cursor.advance(1);
|
||||
continue;
|
||||
}
|
||||
// check comment
|
||||
match cursor.peek_multiple(2) {
|
||||
Some(['/', '/']) => {
|
||||
// skip the rest of the line
|
||||
line += 1;
|
||||
column = 1;
|
||||
break;
|
||||
}
|
||||
Some(['/', '*']) => {
|
||||
in_block_comment = true;
|
||||
cursor.advance(2);
|
||||
column += 2;
|
||||
continue;
|
||||
}
|
||||
Some(['*', '/']) => {
|
||||
in_block_comment = false;
|
||||
cursor.advance(2);
|
||||
column += 2;
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
if in_block_comment {
|
||||
cursor.advance(1);
|
||||
column += 1;
|
||||
}
|
||||
if_true_then_continue!(try_parse_as(parse_litint, &mut tokens, &mut cursor, &mut line, &mut column));
|
||||
if_true_then_continue!(try_parse_as(parse_delimiter, &mut tokens, &mut cursor, &mut line, &mut column));
|
||||
if_true_then_continue!(try_parse_as(parse_puncuation, &mut tokens, &mut cursor, &mut line, &mut column));
|
||||
if_true_then_continue!(try_parse_as(parse_ident, &mut tokens, &mut cursor, &mut line, &mut column));
|
||||
// unrecognized token
|
||||
errors.push(tokens.len());
|
||||
let c = cursor.next().unwrap();
|
||||
tokens.push(Token {
|
||||
value: TokenValue::Unrecognized(c),
|
||||
span: Span { line, column, length: 1 },
|
||||
});
|
||||
if errors.len() > 20 {
|
||||
return Err(LexerError::TooManyErrors);
|
||||
}
|
||||
column += 1;
|
||||
}
|
||||
line += 1;
|
||||
column = 1;
|
||||
pub fn finish(mut self) -> (Vec<Token>, Diagnositics) {
|
||||
if let Some(span) = self.block_comment_span.take() {
|
||||
self.diagnostics.add_from_frontend_error(LexingError::UnterminatedComment, span);
|
||||
}
|
||||
Ok(Self { tokens, errors })
|
||||
(self.tokens, self.diagnostics)
|
||||
}
|
||||
/// call `parse_str` will continue to parse the input from current state
|
||||
/// please also pass the whitespace to ensure the correct char position in diagnostics
|
||||
pub fn parse_next_str(&mut self, s: &str) {
|
||||
let mut cursor = Cursor::new(s);
|
||||
loop {
|
||||
if let Some(c) = cursor.peek() {
|
||||
if self.in_skip_line && c != '\n' {
|
||||
cursor.advance(1);
|
||||
continue;
|
||||
}
|
||||
// check white space first, if it's white space, skip it and continue to the next character
|
||||
if WHITESPACE_CHARS.contains(&c) {
|
||||
if c == '\n' {
|
||||
self.in_skip_line = false;
|
||||
}
|
||||
cursor.advance(1);
|
||||
continue;
|
||||
}
|
||||
// check comment
|
||||
match cursor.peek_multiple(2) {
|
||||
Some(['/', '/']) => {
|
||||
// skip the rest of the line
|
||||
self.in_skip_line = true;
|
||||
cursor.advance(2);
|
||||
continue;
|
||||
}
|
||||
Some(['/', '*']) => {
|
||||
let start = cursor.pos() + self.old_char_count;
|
||||
self.block_comment_span = Some(Span { start, end: start + 2 });
|
||||
cursor.advance(2);
|
||||
continue;
|
||||
}
|
||||
Some(['*', '/']) => {
|
||||
self.block_comment_span = None;
|
||||
cursor.advance(2);
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
if self.block_comment_span.is_some() {
|
||||
cursor.advance(1);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
if_true_then_continue!(try_parse_as(parse_litint, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||
if_true_then_continue!(try_parse_as(parse_delimiter, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||
if_true_then_continue!(try_parse_as(parse_puncuation, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||
if_true_then_continue!(try_parse_as(parse_ident, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||
// unrecognized token
|
||||
let last_pos = cursor.pos() + self.old_char_count;
|
||||
let mut unrecognized = Vec::new();
|
||||
while let Some(c) = cursor.peek() {
|
||||
if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) {
|
||||
break;
|
||||
}
|
||||
unrecognized.push(c);
|
||||
cursor.advance(1);
|
||||
}
|
||||
let span = Span { start: last_pos, end: cursor.pos() + self.old_char_count };
|
||||
let unrecognized = unrecognized.into_iter().collect::<String>();
|
||||
self.diagnostics.add_from_frontend_error(LexingError::UnrecognizedToken(unrecognized), span);
|
||||
self.tokens.push(Token { value: TokenValue::Unrecognized, span });
|
||||
}
|
||||
self.old_char_count += s.len();
|
||||
}
|
||||
}
|
||||
fn parse_litint(
|
||||
str_iter: &mut Cursor,
|
||||
) -> Option<TokenValue> {
|
||||
let mut c1 = str_iter.peek()?;
|
||||
) -> Result<TokenValue, LexParseError> {
|
||||
let mut c1 = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||
// c1 is the peek value from here
|
||||
let mut sign_base: i64 = 1;
|
||||
let mut base: i64 = 10;
|
||||
if !(c1.is_ascii_digit() || c1 == '-') {
|
||||
return None;
|
||||
return Err(LexParseError::NotMatched);
|
||||
}
|
||||
if c1 == '-' {
|
||||
sign_base = -1;
|
||||
str_iter.advance(1);
|
||||
c1 = str_iter.peek()?;
|
||||
c1 = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||
if !c1.is_ascii_digit() {
|
||||
// only a minus sign, not a number
|
||||
// back one so cursor still points to the minus sign
|
||||
str_iter.back(1);
|
||||
return None;
|
||||
return Err(LexParseError::NotMatched);
|
||||
}
|
||||
}
|
||||
let mut number = 0i64;
|
||||
let mut has_digits = false;
|
||||
if c1 == '0' {
|
||||
str_iter.advance(1);
|
||||
match str_iter.peek() {
|
||||
@@ -181,12 +214,13 @@ fn parse_litint(
|
||||
base = 8;
|
||||
}
|
||||
_ => {
|
||||
has_digits = true;
|
||||
// only zero
|
||||
}
|
||||
}
|
||||
}
|
||||
// from here, the cursor points to:
|
||||
// 0x1234 -> cursor at 'x'
|
||||
// 0x1234 -> cursor at '1'
|
||||
// 0123 -> cursor at '1'
|
||||
// 0 -> cursor at end
|
||||
// 1234 -> cursor at '1'
|
||||
@@ -199,32 +233,42 @@ fn parse_litint(
|
||||
'0'..='9' if (c as u8 - b'0') < base as u8 => c as i64 - '0' as i64 ,
|
||||
'a'..='f' if base == 16 => c as i64 - 'a' as i64 + 10 ,
|
||||
'A'..='F' if base == 16 => c as i64 - 'A' as i64 + 10,
|
||||
_ => break,
|
||||
c => if WHITESPACE_CHARS.contains(&c) || DELIMITER_CHARS.contains(&c) {
|
||||
break;
|
||||
} else {
|
||||
// unrecognized character in number literal
|
||||
return Err(LexParseError::InvalidInMatch(LexingError::InvalidIntLiteral));
|
||||
}
|
||||
};
|
||||
has_digits = true;
|
||||
number = number * base + digit;
|
||||
str_iter.advance(1);
|
||||
}
|
||||
if !has_digits {
|
||||
// No valid digits found, add a diagnostic
|
||||
return Err(LexParseError::InvalidInMatch(LexingError::InvalidIntLiteral));
|
||||
}
|
||||
number *= sign_base;
|
||||
Some(TokenValue::IntLit(number))
|
||||
Ok(TokenValue::IntLit(number))
|
||||
}
|
||||
|
||||
fn parse_delimiter(
|
||||
str_iter: &mut Cursor,
|
||||
) -> Option<TokenValue> {
|
||||
let c = str_iter.peek()?;
|
||||
) -> Result<TokenValue, LexParseError> {
|
||||
let c = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||
let token_value = match c {
|
||||
'(' => TokenValue::LParen,
|
||||
')' => TokenValue::RParen,
|
||||
'{' => TokenValue::LBrace,
|
||||
'}' => TokenValue::RBrace,
|
||||
_ => return None,
|
||||
_ => return Err(LexParseError::NotMatched),
|
||||
};
|
||||
str_iter.advance(1);
|
||||
Some(token_value)
|
||||
Ok(token_value)
|
||||
}
|
||||
fn parse_puncuation(
|
||||
str_iter: &mut Cursor,
|
||||
) -> Option<TokenValue> {
|
||||
) -> Result<TokenValue, LexParseError> {
|
||||
let get_value_by_next_char =
|
||||
|str_iter: &mut Cursor, not_equal_value: TokenValue, equal_value: TokenValue| {
|
||||
str_iter.advance(1);
|
||||
@@ -235,7 +279,7 @@ fn parse_puncuation(
|
||||
not_equal_value
|
||||
}
|
||||
};
|
||||
let c = str_iter.peek()?;
|
||||
let c = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||
let token_value = match c {
|
||||
'+' => TokenValue::Plus,
|
||||
'-' => TokenValue::Minus,
|
||||
@@ -249,9 +293,7 @@ fn parse_puncuation(
|
||||
if let Some('=') = str_iter.peek() {
|
||||
TokenValue::NotEqual
|
||||
} else {
|
||||
// only '!' is not a valid token, back one so cursor still points to '!'
|
||||
str_iter.back(1);
|
||||
return None;
|
||||
TokenValue::Not
|
||||
}
|
||||
},
|
||||
'<' => get_value_by_next_char(str_iter, TokenValue::Less, TokenValue::LessEqual),
|
||||
@@ -260,33 +302,35 @@ fn parse_puncuation(
|
||||
',' => TokenValue::Comma,
|
||||
';' => TokenValue::Semicolon,
|
||||
|
||||
_ => return None,
|
||||
_ => return Err(LexParseError::NotMatched),
|
||||
};
|
||||
str_iter.advance(1);
|
||||
Some(token_value)
|
||||
Ok(token_value)
|
||||
}
|
||||
|
||||
fn parse_ident(
|
||||
str_iter: &mut Cursor,
|
||||
) -> Option<TokenValue> {
|
||||
let c = str_iter.peek()?;
|
||||
) -> Result<TokenValue, LexParseError> {
|
||||
let c = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||
if !c.is_ascii_alphabetic() && c != '_' {
|
||||
return None;
|
||||
return Err(LexParseError::NotMatched);
|
||||
}
|
||||
let mut name = Vec::new();
|
||||
while let Some(c) = str_iter.peek() {
|
||||
if c.is_ascii_alphanumeric() || c == '_' {
|
||||
name.push(c);
|
||||
str_iter.advance(1);
|
||||
} else {
|
||||
} else if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) {
|
||||
break;
|
||||
} else {
|
||||
return Err(LexParseError::InvalidInMatch(LexingError::InvalidIdent));
|
||||
}
|
||||
}
|
||||
let name = name.into_iter().collect::<String>();
|
||||
if let Some(type_ident) = TypeIdent::from_str(&name).ok() {
|
||||
return Some(TokenValue::TypeIdent(type_ident));
|
||||
return Ok(TokenValue::TypeIdent(type_ident));
|
||||
}
|
||||
Some(TokenValue::Ident(name))
|
||||
Ok(TokenValue::Ident(name))
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -303,11 +347,22 @@ mod tests {
|
||||
for case_no in case_sequence {
|
||||
let case_path = case_list.get_case_path(case_no).unwrap();
|
||||
println!("{}", case_path.display());
|
||||
let file = File::open(case_path).unwrap();
|
||||
let file = File::open(&case_path).unwrap();
|
||||
let mut buf_reader = std::io::BufReader::new(file);
|
||||
let lexer = Lexer::parse(&mut buf_reader).unwrap();
|
||||
if lexer.has_errors() {
|
||||
eprintln!("Case {} has error", case_list.get_case_name(case_no).unwrap());
|
||||
let mut lexer = Lexer::new();
|
||||
let mut full_text = String::new();
|
||||
loop {
|
||||
let mut line = String::new();
|
||||
let bytes_read = buf_reader.read_line(&mut line).unwrap();
|
||||
if bytes_read == 0 {
|
||||
break;
|
||||
}
|
||||
full_text.push_str(&line);
|
||||
lexer.parse_next_str(&line);
|
||||
}
|
||||
let (_tokens, diagnostics) = lexer.finish();
|
||||
if !diagnostics.is_empty() {
|
||||
diagnostics.print(&format!("{}", case_path.display()), &full_text);
|
||||
error_case_cnt += 1;
|
||||
}
|
||||
}
|
||||
|
||||
+3
-1
@@ -1,2 +1,4 @@
|
||||
pub mod types;
|
||||
mod lexer;
|
||||
mod lexer;
|
||||
// pub mod parser;
|
||||
pub mod err;
|
||||
+61
-10
@@ -1,4 +1,6 @@
|
||||
use strum::EnumString;
|
||||
use strum::{AsRefStr, EnumString};
|
||||
|
||||
use crate::diagnostic::span::Span;
|
||||
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -7,12 +9,6 @@ pub struct Token {
|
||||
pub span: Span,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Span {
|
||||
pub line: usize,
|
||||
pub column: usize,
|
||||
pub length: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum TokenValue {
|
||||
@@ -20,6 +16,61 @@ pub enum TokenValue {
|
||||
Ident(String),
|
||||
TypeIdent(TypeIdent),
|
||||
|
||||
Plus, Minus, Star, Slash, Percent,
|
||||
Equal, DoubleEqual, Not, NotEqual, Less, LessEqual, Greater, GreaterEqual,
|
||||
|
||||
LParen, RParen,
|
||||
LBrace, RBrace,
|
||||
Comma, Semicolon,
|
||||
|
||||
If, Else, While, Return, Break, Continue,
|
||||
|
||||
// Eof,
|
||||
Unrecognized,
|
||||
}
|
||||
impl std::fmt::Display for TokenValue {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TokenValue::IntLit(i) => write!(f, "literal int: {}", i),
|
||||
TokenValue::Ident(s) => write!(f, "identifier: {}", s),
|
||||
TokenValue::TypeIdent(t) => write!(f, "type {}", t.as_ref()),
|
||||
TokenValue::Plus => write!(f, "+"),
|
||||
TokenValue::Minus => write!(f, "-"),
|
||||
TokenValue::Star => write!(f, "*"),
|
||||
TokenValue::Slash => write!(f, "/"),
|
||||
TokenValue::Percent => write!(f, "%"),
|
||||
TokenValue::Equal => write!(f, "="),
|
||||
TokenValue::DoubleEqual => write!(f, "=="),
|
||||
TokenValue::Not => write!(f, "!"),
|
||||
TokenValue::NotEqual => write!(f, "!="),
|
||||
TokenValue::Less => write!(f, "<"),
|
||||
TokenValue::LessEqual => write!(f, "<="),
|
||||
TokenValue::Greater => write!(f, ">"),
|
||||
TokenValue::GreaterEqual => write!(f, ">="),
|
||||
TokenValue::LParen => write!(f, "("),
|
||||
TokenValue::RParen => write!(f, ")"),
|
||||
TokenValue::LBrace => write!(f, "{{"),
|
||||
TokenValue::RBrace => write!(f, "}}"),
|
||||
TokenValue::Comma => write!(f, ","),
|
||||
TokenValue::Semicolon => write!(f, ";"),
|
||||
TokenValue::If => write!(f, "if"),
|
||||
TokenValue::Else => write!(f, "else"),
|
||||
TokenValue::While => write!(f, "while"),
|
||||
TokenValue::Return => write!(f, "return"),
|
||||
TokenValue::Break => write!(f, "break"),
|
||||
TokenValue::Continue => write!(f, "continue"),
|
||||
// TokenValue::Eof => write!(f, "<EOF>"),
|
||||
TokenValue::Unrecognized => write!(f, "unrecognized"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TokenKind {
|
||||
IntLit,
|
||||
Ident,
|
||||
TypeIdent,
|
||||
|
||||
Plus, Minus, Star, Slash, Percent,
|
||||
Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual,
|
||||
|
||||
@@ -29,10 +80,10 @@ pub enum TokenValue {
|
||||
|
||||
If, Else, While, Return, Break, Continue,
|
||||
|
||||
Eof,
|
||||
Unrecognized(char),
|
||||
// Eof,
|
||||
Unrecognized,
|
||||
}
|
||||
#[derive(Debug, Clone, PartialEq, Eq, EnumString)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, EnumString, AsRefStr)]
|
||||
pub enum TypeIdent {
|
||||
#[strum(serialize = "int")]
|
||||
Int,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
mod frontend;
|
||||
mod ast;
|
||||
mod utils;
|
||||
mod diagnostic;
|
||||
mod err;
|
||||
/// Program entry point; currently it only prints a greeting.
fn main() {
    println!("Hello, world!");
}
|
||||
|
||||
Reference in New Issue
Block a user