Merge branch 'feat/lexer'
This commit is contained in:
Generated
+81
@@ -17,6 +17,17 @@ version = "1.5.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "codespan-reporting"
|
||||||
|
version = "0.13.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
"termcolor",
|
||||||
|
"unicode-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heck"
|
name = "heck"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
@@ -153,12 +164,43 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
|||||||
name = "rusty-minic"
|
name = "rusty-minic"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"codespan-reporting",
|
||||||
"num",
|
"num",
|
||||||
"regex",
|
"regex",
|
||||||
"strum",
|
"strum",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||||
|
dependencies = [
|
||||||
|
"serde_core",
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_core"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||||
|
dependencies = [
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_derive"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strum"
|
name = "strum"
|
||||||
version = "0.28.0"
|
version = "0.28.0"
|
||||||
@@ -191,6 +233,15 @@ dependencies = [
|
|||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "termcolor"
|
||||||
|
version = "1.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "thiserror"
|
||||||
version = "2.0.18"
|
version = "2.0.18"
|
||||||
@@ -216,3 +267,33 @@ name = "unicode-ident"
|
|||||||
version = "1.0.24"
|
version = "1.0.24"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-width"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-util"
|
||||||
|
version = "0.1.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-link"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.61.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ version = "0.1.0"
|
|||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
codespan-reporting = "0.13.1"
|
||||||
num = "0.4.3"
|
num = "0.4.3"
|
||||||
regex = "1.12.3"
|
regex = "1.12.3"
|
||||||
strum = { version = "0.28.0", features = ["derive"] }
|
strum = { version = "0.28.0", features = ["derive"] }
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
pub mod types;
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
//! AST node definitions produced by the frontend.
//!
//! Every node type derives `Debug`, `Clone`, `PartialEq` and `Eq` so trees can
//! be printed, duplicated and compared in tests.

/// Source location attached to an AST node, as a `start`/`end` pair.
///
/// NOTE(review): `crate::diagnostic::span::Span` has the same two fields;
/// consider re-exporting that type here instead of keeping a second definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Offset at which the node starts.
    pub start: usize,
    /// Offset at which the node ends.
    pub end: usize,
}

/// Root of the AST: the global declarations of one compilation unit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompileUnit {
    pub global_decls: Vec<GlobalDeclStmt>,
}

/// A declaration that may appear at the top level of a compilation unit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GlobalDeclStmt {
    VarDecl(VarDeclStmt),
    FuncDecl(FuncDeclStmt),
}

/// Declaration of a variable with its type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VarDeclStmt {
    pub name: String,
    pub var_type: Type,
    pub span: Span,
}

/// Declaration of a function: signature plus body.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FuncDeclStmt {
    pub name: String,
    pub return_type: Type,
    pub params: Vec<Param>,
    pub body: BlockStmt,
    pub span: Span,
}

/// A sequence of statements.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockStmt {
    pub statements: Vec<Statement>,
    pub span: Span,
}

/// Any statement that may appear inside a [`BlockStmt`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Statement {
    Return(ReturnStmt),
    Block(BlockStmt),
    Expr(Expr),
    VarDecl(VarDeclStmt),
}

/// A `return` statement; `value` is `None` when nothing is returned.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReturnStmt {
    pub value: Option<Expr>,
    pub span: Span,
}

/// An expression together with its source location.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Expr {
    pub value: ExprValue,
    pub span: Span,
}

/// The different shapes an [`Expr`] can take.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExprValue {
    /// Integer literal.
    IntLit(i64),
    /// Reference to a variable by name.
    Var(String),
    /// Binary operation `lhs op rhs`.
    BinaryOp {
        lhs: Box<Expr>,
        op: BinaryOp,
        rhs: Box<Expr>,
    },
    /// Function call: callee name and argument expressions.
    FuncCall(String, Vec<Expr>),
    /// Assignment of `rvalue` to `lvalue`.
    Assign {
        lvalue: Box<Expr>,
        rvalue: Box<Expr>,
    },
}

/// Operators usable in [`ExprValue::BinaryOp`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinaryOp {
    Add,
    Sub,
    Mul,
    Div,
    Mod,
    Equal,
    NotEqual,
    Less,
    LessEqual,
    Greater,
    GreaterEqual,
}

/// Types that can be named in declarations.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Type {
    Int,
    Void,
}

/// A single function parameter.
///
/// The fields are public, like those of every other node, so that code outside
/// this module can build and inspect parameters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Param {
    pub name: String,
    pub param_type: Type,
    pub span: Span,
}
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
use crate::{diagnostic::span::Span, err::CompileError, frontend::err::FrontendError};
|
||||||
|
|
||||||
|
pub mod span;
|
||||||
|
|
||||||
|
pub struct Diagnositics {
|
||||||
|
diagnostics: Vec<Diagnostic>,
|
||||||
|
}
|
||||||
|
pub enum DiagnosticLevel {
|
||||||
|
Error,
|
||||||
|
Warning,
|
||||||
|
Info,
|
||||||
|
}
|
||||||
|
pub struct Diagnostic {
|
||||||
|
level: DiagnosticLevel,
|
||||||
|
message: String,
|
||||||
|
span: Span
|
||||||
|
}
|
||||||
|
impl Diagnositics {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self { diagnostics: vec![] }
|
||||||
|
}
|
||||||
|
pub fn add(&mut self, diagnostic: Diagnostic) {
|
||||||
|
self.diagnostics.push(diagnostic);
|
||||||
|
}
|
||||||
|
pub fn add_from_error(&mut self, error: impl Into<CompileError>, span: Span) {
|
||||||
|
self.diagnostics.push(Diagnostic {
|
||||||
|
level: DiagnosticLevel::Error,
|
||||||
|
message: Into::<CompileError>::into(error).to_string(),
|
||||||
|
span,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
pub fn add_from_frontend_error(&mut self, error: impl Into<FrontendError>, span: Span) {
|
||||||
|
self.diagnostics.push(Diagnostic {
|
||||||
|
level: DiagnosticLevel::Error,
|
||||||
|
message: Into::<FrontendError>::into(error).to_string(),
|
||||||
|
span,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.diagnostics.is_empty()
|
||||||
|
}
|
||||||
|
pub fn print(&self, name: &str, source: &str) {
|
||||||
|
use codespan_reporting::diagnostic::Diagnostic as CodespanDiagnostic;
|
||||||
|
use codespan_reporting::files::SimpleFile;
|
||||||
|
use codespan_reporting::diagnostic::{Severity, Label};
|
||||||
|
use std::io::IsTerminal;
|
||||||
|
|
||||||
|
use codespan_reporting::term::{self, termcolor::{ColorChoice, StandardStream}};
|
||||||
|
let mut choice = ColorChoice::Auto;
|
||||||
|
if !std::io::stdin().is_terminal() {
|
||||||
|
choice = ColorChoice::Never;
|
||||||
|
}
|
||||||
|
let stdout = StandardStream::stdout(choice);
|
||||||
|
let source_file = SimpleFile::new(name, source);
|
||||||
|
let output_config = codespan_reporting::term::Config::default();
|
||||||
|
for diagnostic in &self.diagnostics {
|
||||||
|
let output_level = match diagnostic.level {
|
||||||
|
DiagnosticLevel::Error => Severity::Error,
|
||||||
|
DiagnosticLevel::Warning => Severity::Warning,
|
||||||
|
DiagnosticLevel::Info => Severity::Note,
|
||||||
|
};
|
||||||
|
let output_diagnostic = CodespanDiagnostic::new(output_level)
|
||||||
|
.with_message(&diagnostic.message)
|
||||||
|
.with_label(
|
||||||
|
Label::primary((), diagnostic.span.start..diagnostic.span.end)
|
||||||
|
);
|
||||||
|
term::emit_to_write_style(&mut stdout.lock(), &output_config, &source_file, &output_diagnostic);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#[cfg(test)]
mod tests {
    use crate::diagnostic::span::Span;
    use crate::diagnostic::{Diagnositics, Diagnostic, DiagnosticLevel};

    /// Records two error diagnostics and renders them against a tiny program.
    ///
    /// The fixture keeps the four-space indent before `return`, so the second
    /// span (16..22) covers exactly that keyword.
    #[test]
    fn test_diagnostics() {
        let source = "int main(){\n    return 1;\n}";
        assert_eq!(&source[0..3], "int");
        assert_eq!(&source[16..22], "return");

        let mut diagnostics = Diagnositics::new();
        assert!(diagnostics.is_empty());

        diagnostics.add(Diagnostic {
            level: DiagnosticLevel::Error,
            message: "test error".to_string(),
            span: Span { start: 0, end: 3 },
        });
        diagnostics.add(Diagnostic {
            level: DiagnosticLevel::Error,
            message: "test error".to_string(),
            span: Span { start: 16, end: 22 },
        });
        assert!(!diagnostics.is_empty());

        // Rendering must not panic; the output itself is only inspected by eye.
        diagnostics.print("main.c", source);
    }
}
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
|
||||||
|
/// Location of a piece of source text, given as a `start`/`end` pair of
/// offsets into that text.
///
/// `Diagnositics::print` turns a span into the range `start..end` when it
/// labels the source, so `end` is exclusive there.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct Span {
    /// Offset of the first covered position.
    pub start: usize,
    /// Offset just past the last covered position.
    pub end: usize,
}
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
use crate::frontend::err::FrontendError;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||||
|
pub enum CompileError {
|
||||||
|
#[error(transparent)]
|
||||||
|
Frontend(#[from] FrontendError),
|
||||||
|
}
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
// #[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||||
|
// pub enum ParseError {
|
||||||
|
// BlockStmt(#[from] BlockStmtError)
|
||||||
|
// }
|
||||||
|
// #[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||||
|
// pub enum BlockStmtError {
|
||||||
|
// MissingLBrace,
|
||||||
|
// MissingRBrace,
|
||||||
|
// }
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||||
|
pub enum LexingError {
|
||||||
|
#[error("invalid int literal")]
|
||||||
|
InvalidIntLiteral,
|
||||||
|
#[error("invalid ident")]
|
||||||
|
InvalidIdent,
|
||||||
|
#[error("comment unterminated")]
|
||||||
|
UnterminatedComment,
|
||||||
|
#[error("unrecognized token: {0}")]
|
||||||
|
UnrecognizedToken(String),
|
||||||
|
}
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Error)]
|
||||||
|
pub enum FrontendError {
|
||||||
|
#[error(transparent)]
|
||||||
|
Lexing(#[from] LexingError),
|
||||||
|
|
||||||
|
}
|
||||||
+164
-109
@@ -1,21 +1,32 @@
|
|||||||
use std::{io::BufRead, iter::Peekable, str::FromStr};
|
use std::{io::BufRead, str::FromStr};
|
||||||
|
|
||||||
|
use codespan_reporting::diagnostic;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
use crate::frontend::types::{Span, TokenValue, TypeIdent};
|
use crate::{diagnostic::{Diagnositics, span::{self, Span}}, frontend::{err::LexingError, types::{TokenValue, TypeIdent}}};
|
||||||
|
|
||||||
use super::types::Token;
|
use super::types::Token;
|
||||||
|
|
||||||
pub struct Lexer {
|
pub struct Lexer {
|
||||||
tokens: Vec<Token>,
|
pub tokens: Vec<Token>,
|
||||||
errors: Vec<usize>, // every entry points to the index of unrecognized tokens
|
pub diagnostics: Diagnositics,
|
||||||
|
old_char_count: usize,
|
||||||
|
block_comment_span: Option<Span>,
|
||||||
|
in_skip_line: bool
|
||||||
}
|
}
|
||||||
|
|
||||||
const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r'];
|
const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r'];
|
||||||
|
/// Characters that end an identifier or integer literal and at which error
/// recovery stops skipping input.
///
/// Braces are included because `parse_delimiter` lexes them as tokens; without
/// them, input such as `else{` or `0}` would be reported as an invalid
/// identifier or literal, and recovery would skip over the brace.
const DELIMITER_CHARS: &[char] = &[
    '+', '-', '*', '/', '%', '=', '!', '<', '>', '(', ')', '{', '}', ',', ';',
];
|
||||||
struct Cursor {
|
struct Cursor {
|
||||||
chars: Vec<char>,
|
chars: Vec<char>,
|
||||||
pos: usize,
|
pos: usize,
|
||||||
}
|
}
|
||||||
|
enum LexParseError {
|
||||||
|
NotMatched,
|
||||||
|
InvalidInMatch(LexingError)
|
||||||
|
}
|
||||||
impl Cursor {
|
impl Cursor {
|
||||||
pub fn new(s: &str) -> Self {
|
pub fn new(s: &str) -> Self {
|
||||||
Self { chars: s.chars().collect(), pos: 0 }
|
Self { chars: s.chars().collect(), pos: 0 }
|
||||||
@@ -47,20 +58,35 @@ impl Cursor {
|
|||||||
self.pos
|
self.pos
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
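/// Tries to lex one token with the given function. Returns `true` when a token was pushed or an error was recorded, so the caller should continue with the next character, and `false` when the function did not match.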
|
||||||
fn try_parse_as(
|
fn try_parse_as(
|
||||||
f: fn(&mut Cursor) -> Option<TokenValue>,
|
f: fn(&mut Cursor) -> Result<TokenValue, LexParseError>,
|
||||||
tokens: &mut Vec<Token>,
|
tokens: &mut Vec<Token>,
|
||||||
str_iter: &mut Cursor,
|
str_iter: &mut Cursor,
|
||||||
line: &mut usize,
|
diagnostics: &mut Diagnositics,
|
||||||
column: &mut usize,
|
last_char_count: usize,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
let last_pos = str_iter.pos();
|
let last_pos = str_iter.pos() + last_char_count;
|
||||||
if let Some(token) = f(str_iter) {
|
match f(str_iter) {
|
||||||
let span = Span { line: *line, column: *column, length: str_iter.pos() - last_pos };
|
Ok(token_value) => {
|
||||||
tokens.push(Token { value: token, span });
|
let span = Span { start: last_pos, end: str_iter.pos() + last_char_count };
|
||||||
return true;
|
tokens.push(Token { value: token_value, span });
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Err(LexParseError::NotMatched) => false,
|
||||||
|
Err(LexParseError::InvalidInMatch(err)) => {
|
||||||
|
// try recover from delimiter char or whitespace char
|
||||||
|
while let Some(c) = str_iter.peek() {
|
||||||
|
if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
str_iter.advance(1);
|
||||||
|
}
|
||||||
|
let span = Span { start: last_pos, end: str_iter.pos() + last_char_count };
|
||||||
|
diagnostics.add_from_frontend_error(err, span);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
false
|
|
||||||
}
|
}
|
||||||
macro_rules! if_true_then_continue {
|
macro_rules! if_true_then_continue {
|
||||||
($e: expr) => {
|
($e: expr) => {
|
||||||
@@ -77,99 +103,106 @@ pub enum LexerError {
|
|||||||
TooManyErrors,
|
TooManyErrors,
|
||||||
}
|
}
|
||||||
impl Lexer {
|
impl Lexer {
|
||||||
pub fn has_errors(&self) -> bool {
|
pub fn new() -> Self {
|
||||||
!self.errors.is_empty()
|
Self { tokens: vec![], diagnostics: Diagnositics::new(), old_char_count: 0, block_comment_span: None, in_skip_line: false }
|
||||||
}
|
}
|
||||||
pub fn parse(reader: &mut impl BufRead) -> Result<Self, LexerError> {
|
pub fn finish(mut self) -> (Vec<Token>, Diagnositics) {
|
||||||
let mut tokens = Vec::new();
|
if let Some(span) = self.block_comment_span.take() {
|
||||||
let mut errors = Vec::new();
|
self.diagnostics.add_from_frontend_error(LexingError::UnterminatedComment, span);
|
||||||
let mut line = 1;
|
|
||||||
let mut column = 1;
|
|
||||||
let mut in_block_comment = false;
|
|
||||||
for line_str in reader.lines() {
|
|
||||||
let line_str = line_str?;
|
|
||||||
let mut cursor = Cursor::new(&line_str);
|
|
||||||
loop {
|
|
||||||
if let Some(c) = cursor.peek() {
|
|
||||||
// check white space first, if it's white space, skip it and continue to the next character
|
|
||||||
if WHITESPACE_CHARS.contains(&c) {
|
|
||||||
column += 1;
|
|
||||||
cursor.advance(1);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// check comment
|
|
||||||
match cursor.peek_multiple(2) {
|
|
||||||
Some(['/', '/']) => {
|
|
||||||
// skip the rest of the line
|
|
||||||
line += 1;
|
|
||||||
column = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
Some(['/', '*']) => {
|
|
||||||
in_block_comment = true;
|
|
||||||
cursor.advance(2);
|
|
||||||
column += 2;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Some(['*', '/']) => {
|
|
||||||
in_block_comment = false;
|
|
||||||
cursor.advance(2);
|
|
||||||
column += 2;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if in_block_comment {
|
|
||||||
cursor.advance(1);
|
|
||||||
column += 1;
|
|
||||||
}
|
|
||||||
if_true_then_continue!(try_parse_as(parse_litint, &mut tokens, &mut cursor, &mut line, &mut column));
|
|
||||||
if_true_then_continue!(try_parse_as(parse_delimiter, &mut tokens, &mut cursor, &mut line, &mut column));
|
|
||||||
if_true_then_continue!(try_parse_as(parse_puncuation, &mut tokens, &mut cursor, &mut line, &mut column));
|
|
||||||
if_true_then_continue!(try_parse_as(parse_ident, &mut tokens, &mut cursor, &mut line, &mut column));
|
|
||||||
// unrecognized token
|
|
||||||
errors.push(tokens.len());
|
|
||||||
let c = cursor.next().unwrap();
|
|
||||||
tokens.push(Token {
|
|
||||||
value: TokenValue::Unrecognized(c),
|
|
||||||
span: Span { line, column, length: 1 },
|
|
||||||
});
|
|
||||||
if errors.len() > 20 {
|
|
||||||
return Err(LexerError::TooManyErrors);
|
|
||||||
}
|
|
||||||
column += 1;
|
|
||||||
}
|
|
||||||
line += 1;
|
|
||||||
column = 1;
|
|
||||||
}
|
}
|
||||||
Ok(Self { tokens, errors })
|
(self.tokens, self.diagnostics)
|
||||||
|
}
|
||||||
|
/// call `parse_str` will continue to parse the input from current state
|
||||||
|
/// please also pass the whitespace to ensure the correct char position in diagnostics
|
||||||
|
pub fn parse_next_str(&mut self, s: &str) {
|
||||||
|
let mut cursor = Cursor::new(s);
|
||||||
|
loop {
|
||||||
|
if let Some(c) = cursor.peek() {
|
||||||
|
if self.in_skip_line && c != '\n' {
|
||||||
|
cursor.advance(1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// check white space first, if it's white space, skip it and continue to the next character
|
||||||
|
if WHITESPACE_CHARS.contains(&c) {
|
||||||
|
if c == '\n' {
|
||||||
|
self.in_skip_line = false;
|
||||||
|
}
|
||||||
|
cursor.advance(1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// check comment
|
||||||
|
match cursor.peek_multiple(2) {
|
||||||
|
Some(['/', '/']) => {
|
||||||
|
// skip the rest of the line
|
||||||
|
self.in_skip_line = true;
|
||||||
|
cursor.advance(2);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Some(['/', '*']) => {
|
||||||
|
let start = cursor.pos() + self.old_char_count;
|
||||||
|
self.block_comment_span = Some(Span { start, end: start + 2 });
|
||||||
|
cursor.advance(2);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Some(['*', '/']) => {
|
||||||
|
self.block_comment_span = None;
|
||||||
|
cursor.advance(2);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
if self.block_comment_span.is_some() {
|
||||||
|
cursor.advance(1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if_true_then_continue!(try_parse_as(parse_litint, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||||
|
if_true_then_continue!(try_parse_as(parse_delimiter, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||||
|
if_true_then_continue!(try_parse_as(parse_puncuation, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||||
|
if_true_then_continue!(try_parse_as(parse_ident, &mut self.tokens, &mut cursor, &mut self.diagnostics, self.old_char_count));
|
||||||
|
// unrecognized token
|
||||||
|
let last_pos = cursor.pos() + self.old_char_count;
|
||||||
|
let mut unrecognized = Vec::new();
|
||||||
|
while let Some(c) = cursor.peek() {
|
||||||
|
if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
unrecognized.push(c);
|
||||||
|
cursor.advance(1);
|
||||||
|
}
|
||||||
|
let span = Span { start: last_pos, end: cursor.pos() + self.old_char_count };
|
||||||
|
let unrecognized = unrecognized.into_iter().collect::<String>();
|
||||||
|
self.diagnostics.add_from_frontend_error(LexingError::UnrecognizedToken(unrecognized), span);
|
||||||
|
self.tokens.push(Token { value: TokenValue::Unrecognized, span });
|
||||||
|
}
|
||||||
|
self.old_char_count += s.len();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn parse_litint(
|
fn parse_litint(
|
||||||
str_iter: &mut Cursor,
|
str_iter: &mut Cursor,
|
||||||
) -> Option<TokenValue> {
|
) -> Result<TokenValue, LexParseError> {
|
||||||
let mut c1 = str_iter.peek()?;
|
let mut c1 = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||||
// c1 is the peek value from here
|
// c1 is the peek value from here
|
||||||
let mut sign_base: i64 = 1;
|
let mut sign_base: i64 = 1;
|
||||||
let mut base: i64 = 10;
|
let mut base: i64 = 10;
|
||||||
if !(c1.is_ascii_digit() || c1 == '-') {
|
if !(c1.is_ascii_digit() || c1 == '-') {
|
||||||
return None;
|
return Err(LexParseError::NotMatched);
|
||||||
}
|
}
|
||||||
if c1 == '-' {
|
if c1 == '-' {
|
||||||
sign_base = -1;
|
sign_base = -1;
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
c1 = str_iter.peek()?;
|
c1 = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||||
if !c1.is_ascii_digit() {
|
if !c1.is_ascii_digit() {
|
||||||
// only a minus sign, not a number
|
// only a minus sign, not a number
|
||||||
// back one so cursor still points to the minus sign
|
// back one so cursor still points to the minus sign
|
||||||
str_iter.back(1);
|
str_iter.back(1);
|
||||||
return None;
|
return Err(LexParseError::NotMatched);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut number = 0i64;
|
let mut number = 0i64;
|
||||||
|
let mut has_digits = false;
|
||||||
if c1 == '0' {
|
if c1 == '0' {
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
match str_iter.peek() {
|
match str_iter.peek() {
|
||||||
@@ -181,12 +214,13 @@ fn parse_litint(
|
|||||||
base = 8;
|
base = 8;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
|
has_digits = true;
|
||||||
// only zero
|
// only zero
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// from here, the cursor points to:
|
// from here, the cursor points to:
|
||||||
// 0x1234 -> cursor at 'x'
|
// 0x1234 -> cursor at '1'
|
||||||
// 0123 -> cursor at '1'
|
// 0123 -> cursor at '1'
|
||||||
// 0 -> cursor at end
|
// 0 -> cursor at end
|
||||||
// 1234 -> cursor at '1'
|
// 1234 -> cursor at '1'
|
||||||
@@ -199,32 +233,42 @@ fn parse_litint(
|
|||||||
'0'..='9' if (c as u8 - b'0') < base as u8 => c as i64 - '0' as i64 ,
|
'0'..='9' if (c as u8 - b'0') < base as u8 => c as i64 - '0' as i64 ,
|
||||||
'a'..='f' if base == 16 => c as i64 - 'a' as i64 + 10 ,
|
'a'..='f' if base == 16 => c as i64 - 'a' as i64 + 10 ,
|
||||||
'A'..='F' if base == 16 => c as i64 - 'A' as i64 + 10,
|
'A'..='F' if base == 16 => c as i64 - 'A' as i64 + 10,
|
||||||
_ => break,
|
c => if WHITESPACE_CHARS.contains(&c) || DELIMITER_CHARS.contains(&c) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// unrecognized character in number literal
|
||||||
|
return Err(LexParseError::InvalidInMatch(LexingError::InvalidIntLiteral));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
has_digits = true;
|
||||||
number = number * base + digit;
|
number = number * base + digit;
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
}
|
}
|
||||||
|
if !has_digits {
|
||||||
|
// No valid digits found, add a diagnostic
|
||||||
|
return Err(LexParseError::InvalidInMatch(LexingError::InvalidIntLiteral));
|
||||||
|
}
|
||||||
number *= sign_base;
|
number *= sign_base;
|
||||||
Some(TokenValue::IntLit(number))
|
Ok(TokenValue::IntLit(number))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_delimiter(
|
fn parse_delimiter(
|
||||||
str_iter: &mut Cursor,
|
str_iter: &mut Cursor,
|
||||||
) -> Option<TokenValue> {
|
) -> Result<TokenValue, LexParseError> {
|
||||||
let c = str_iter.peek()?;
|
let c = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||||
let token_value = match c {
|
let token_value = match c {
|
||||||
'(' => TokenValue::LParen,
|
'(' => TokenValue::LParen,
|
||||||
')' => TokenValue::RParen,
|
')' => TokenValue::RParen,
|
||||||
'{' => TokenValue::LBrace,
|
'{' => TokenValue::LBrace,
|
||||||
'}' => TokenValue::RBrace,
|
'}' => TokenValue::RBrace,
|
||||||
_ => return None,
|
_ => return Err(LexParseError::NotMatched),
|
||||||
};
|
};
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
Some(token_value)
|
Ok(token_value)
|
||||||
}
|
}
|
||||||
fn parse_puncuation(
|
fn parse_puncuation(
|
||||||
str_iter: &mut Cursor,
|
str_iter: &mut Cursor,
|
||||||
) -> Option<TokenValue> {
|
) -> Result<TokenValue, LexParseError> {
|
||||||
let get_value_by_next_char =
|
let get_value_by_next_char =
|
||||||
|str_iter: &mut Cursor, not_equal_value: TokenValue, equal_value: TokenValue| {
|
|str_iter: &mut Cursor, not_equal_value: TokenValue, equal_value: TokenValue| {
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
@@ -235,7 +279,7 @@ fn parse_puncuation(
|
|||||||
not_equal_value
|
not_equal_value
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let c = str_iter.peek()?;
|
let c = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||||
let token_value = match c {
|
let token_value = match c {
|
||||||
'+' => TokenValue::Plus,
|
'+' => TokenValue::Plus,
|
||||||
'-' => TokenValue::Minus,
|
'-' => TokenValue::Minus,
|
||||||
@@ -249,9 +293,7 @@ fn parse_puncuation(
|
|||||||
if let Some('=') = str_iter.peek() {
|
if let Some('=') = str_iter.peek() {
|
||||||
TokenValue::NotEqual
|
TokenValue::NotEqual
|
||||||
} else {
|
} else {
|
||||||
// only '!' is not a valid token, back one so cursor still points to '!'
|
TokenValue::Not
|
||||||
str_iter.back(1);
|
|
||||||
return None;
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'<' => get_value_by_next_char(str_iter, TokenValue::Less, TokenValue::LessEqual),
|
'<' => get_value_by_next_char(str_iter, TokenValue::Less, TokenValue::LessEqual),
|
||||||
@@ -260,33 +302,35 @@ fn parse_puncuation(
|
|||||||
',' => TokenValue::Comma,
|
',' => TokenValue::Comma,
|
||||||
';' => TokenValue::Semicolon,
|
';' => TokenValue::Semicolon,
|
||||||
|
|
||||||
_ => return None,
|
_ => return Err(LexParseError::NotMatched),
|
||||||
};
|
};
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
Some(token_value)
|
Ok(token_value)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_ident(
|
fn parse_ident(
|
||||||
str_iter: &mut Cursor,
|
str_iter: &mut Cursor,
|
||||||
) -> Option<TokenValue> {
|
) -> Result<TokenValue, LexParseError> {
|
||||||
let c = str_iter.peek()?;
|
let c = str_iter.peek().ok_or(LexParseError::NotMatched)?;
|
||||||
if !c.is_ascii_alphabetic() && c != '_' {
|
if !c.is_ascii_alphabetic() && c != '_' {
|
||||||
return None;
|
return Err(LexParseError::NotMatched);
|
||||||
}
|
}
|
||||||
let mut name = Vec::new();
|
let mut name = Vec::new();
|
||||||
while let Some(c) = str_iter.peek() {
|
while let Some(c) = str_iter.peek() {
|
||||||
if c.is_ascii_alphanumeric() || c == '_' {
|
if c.is_ascii_alphanumeric() || c == '_' {
|
||||||
name.push(c);
|
name.push(c);
|
||||||
str_iter.advance(1);
|
str_iter.advance(1);
|
||||||
} else {
|
} else if DELIMITER_CHARS.contains(&c) || WHITESPACE_CHARS.contains(&c) {
|
||||||
break;
|
break;
|
||||||
|
} else {
|
||||||
|
return Err(LexParseError::InvalidInMatch(LexingError::InvalidIdent));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let name = name.into_iter().collect::<String>();
|
let name = name.into_iter().collect::<String>();
|
||||||
if let Some(type_ident) = TypeIdent::from_str(&name).ok() {
|
if let Some(type_ident) = TypeIdent::from_str(&name).ok() {
|
||||||
return Some(TokenValue::TypeIdent(type_ident));
|
return Ok(TokenValue::TypeIdent(type_ident));
|
||||||
}
|
}
|
||||||
Some(TokenValue::Ident(name))
|
Ok(TokenValue::Ident(name))
|
||||||
}
|
}
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
@@ -303,11 +347,22 @@ mod tests {
|
|||||||
for case_no in case_sequence {
|
for case_no in case_sequence {
|
||||||
let case_path = case_list.get_case_path(case_no).unwrap();
|
let case_path = case_list.get_case_path(case_no).unwrap();
|
||||||
println!("{}", case_path.display());
|
println!("{}", case_path.display());
|
||||||
let file = File::open(case_path).unwrap();
|
let file = File::open(&case_path).unwrap();
|
||||||
let mut buf_reader = std::io::BufReader::new(file);
|
let mut buf_reader = std::io::BufReader::new(file);
|
||||||
let lexer = Lexer::parse(&mut buf_reader).unwrap();
|
let mut lexer = Lexer::new();
|
||||||
if lexer.has_errors() {
|
let mut full_text = String::new();
|
||||||
eprintln!("Case {} has error", case_list.get_case_name(case_no).unwrap());
|
loop {
|
||||||
|
let mut line = String::new();
|
||||||
|
let bytes_read = buf_reader.read_line(&mut line).unwrap();
|
||||||
|
if bytes_read == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
full_text.push_str(&line);
|
||||||
|
lexer.parse_next_str(&line);
|
||||||
|
}
|
||||||
|
let (_tokens, diagnostics) = lexer.finish();
|
||||||
|
if !diagnostics.is_empty() {
|
||||||
|
diagnostics.print(&format!("{}", case_path.display()), &full_text);
|
||||||
error_case_cnt += 1;
|
error_case_cnt += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+3
-1
@@ -1,2 +1,4 @@
|
|||||||
pub mod types;
|
pub mod types;
|
||||||
mod lexer;
|
mod lexer;
|
||||||
|
// pub mod parser;
|
||||||
|
pub mod err;
|
||||||
+61
-10
@@ -1,4 +1,6 @@
|
|||||||
use strum::EnumString;
|
use strum::{AsRefStr, EnumString};
|
||||||
|
|
||||||
|
use crate::diagnostic::span::Span;
|
||||||
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
@@ -7,12 +9,6 @@ pub struct Token {
|
|||||||
pub span: Span,
|
pub span: Span,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
||||||
pub struct Span {
|
|
||||||
pub line: usize,
|
|
||||||
pub column: usize,
|
|
||||||
pub length: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub enum TokenValue {
|
pub enum TokenValue {
|
||||||
@@ -20,6 +16,61 @@ pub enum TokenValue {
|
|||||||
Ident(String),
|
Ident(String),
|
||||||
TypeIdent(TypeIdent),
|
TypeIdent(TypeIdent),
|
||||||
|
|
||||||
|
Plus, Minus, Star, Slash, Percent,
|
||||||
|
Equal, DoubleEqual, Not, NotEqual, Less, LessEqual, Greater, GreaterEqual,
|
||||||
|
|
||||||
|
LParen, RParen,
|
||||||
|
LBrace, RBrace,
|
||||||
|
Comma, Semicolon,
|
||||||
|
|
||||||
|
If, Else, While, Return, Break, Continue,
|
||||||
|
|
||||||
|
// Eof,
|
||||||
|
Unrecognized,
|
||||||
|
}
|
||||||
|
impl std::fmt::Display for TokenValue {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
TokenValue::IntLit(i) => write!(f, "literal int: {}", i),
|
||||||
|
TokenValue::Ident(s) => write!(f, "identifier: {}", s),
|
||||||
|
TokenValue::TypeIdent(t) => write!(f, "type {}", t.as_ref()),
|
||||||
|
TokenValue::Plus => write!(f, "+"),
|
||||||
|
TokenValue::Minus => write!(f, "-"),
|
||||||
|
TokenValue::Star => write!(f, "*"),
|
||||||
|
TokenValue::Slash => write!(f, "/"),
|
||||||
|
TokenValue::Percent => write!(f, "%"),
|
||||||
|
TokenValue::Equal => write!(f, "="),
|
||||||
|
TokenValue::DoubleEqual => write!(f, "=="),
|
||||||
|
TokenValue::Not => write!(f, "!"),
|
||||||
|
TokenValue::NotEqual => write!(f, "!="),
|
||||||
|
TokenValue::Less => write!(f, "<"),
|
||||||
|
TokenValue::LessEqual => write!(f, "<="),
|
||||||
|
TokenValue::Greater => write!(f, ">"),
|
||||||
|
TokenValue::GreaterEqual => write!(f, ">="),
|
||||||
|
TokenValue::LParen => write!(f, "("),
|
||||||
|
TokenValue::RParen => write!(f, ")"),
|
||||||
|
TokenValue::LBrace => write!(f, "{{"),
|
||||||
|
TokenValue::RBrace => write!(f, "}}"),
|
||||||
|
TokenValue::Comma => write!(f, ","),
|
||||||
|
TokenValue::Semicolon => write!(f, ";"),
|
||||||
|
TokenValue::If => write!(f, "if"),
|
||||||
|
TokenValue::Else => write!(f, "else"),
|
||||||
|
TokenValue::While => write!(f, "while"),
|
||||||
|
TokenValue::Return => write!(f, "return"),
|
||||||
|
TokenValue::Break => write!(f, "break"),
|
||||||
|
TokenValue::Continue => write!(f, "continue"),
|
||||||
|
// TokenValue::Eof => write!(f, "<EOF>"),
|
||||||
|
TokenValue::Unrecognized => write!(f, "unrecognized"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub enum TokenKind {
|
||||||
|
IntLit,
|
||||||
|
Ident,
|
||||||
|
TypeIdent,
|
||||||
|
|
||||||
Plus, Minus, Star, Slash, Percent,
|
Plus, Minus, Star, Slash, Percent,
|
||||||
Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual,
|
Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual,
|
||||||
|
|
||||||
@@ -29,10 +80,10 @@ pub enum TokenValue {
|
|||||||
|
|
||||||
If, Else, While, Return, Break, Continue,
|
If, Else, While, Return, Break, Continue,
|
||||||
|
|
||||||
Eof,
|
// Eof,
|
||||||
Unrecognized(char),
|
Unrecognized,
|
||||||
}
|
}
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, EnumString)]
|
#[derive(Debug, Clone, PartialEq, Eq, EnumString, AsRefStr)]
|
||||||
pub enum TypeIdent {
|
pub enum TypeIdent {
|
||||||
#[strum(serialize = "int")]
|
#[strum(serialize = "int")]
|
||||||
Int,
|
Int,
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
mod frontend;
|
mod frontend;
|
||||||
mod ast;
|
mod ast;
|
||||||
mod utils;
|
mod utils;
|
||||||
|
mod diagnostic;
|
||||||
|
mod err;
|
||||||
fn main() {
|
fn main() {
|
||||||
println!("Hello, world!");
|
println!("Hello, world!");
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user