feat(lexer): Finish lexer parse and add tests

This commit is contained in:
2026-05-06 14:30:32 +08:00
commit e8b50ae0d7
10 changed files with 696 additions and 0 deletions
+2
View File
@@ -0,0 +1,2 @@
/target
/testcases
Generated
+218
View File
@@ -0,0 +1,218 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "memchr"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "num"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
"num-integer",
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-iter"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "proc-macro2"
version = "1.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "rusty-minic"
version = "0.1.0"
dependencies = [
"num",
"regex",
"strum",
"thiserror",
]
[[package]]
name = "strum"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd"
dependencies = [
"strum_macros",
]
[[package]]
name = "strum_macros"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "2.0.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+10
View File
@@ -0,0 +1,10 @@
[package]
name = "rusty-minic"
version = "0.1.0"
edition = "2024"
[dependencies]
num = "0.4.3"
regex = "1.12.3"
strum = { version = "0.28.0", features = ["derive"] }
thiserror = "2.0.18"
+323
View File
@@ -0,0 +1,323 @@
use std::{io::BufRead, iter::Peekable, str::FromStr};
use thiserror::Error;
use crate::frontend::types::{Span, TokenValue, TypeIdent};
use super::types::Token;
/// Output of lexing one input: the token stream plus the positions of the tokens that
/// could not be recognized.
pub struct Lexer {
// All tokens in source order, including `TokenValue::Unrecognized` placeholders.
tokens: Vec<Token>,
errors: Vec<usize>, // every entry points to the index of unrecognized tokens
}
// Characters that separate tokens and are skipped by the lexer.
const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r'];
/// A position-tracking view over the characters of one string.
///
/// The text is stored as a `Vec<char>`, so every position and length is counted in
/// characters rather than bytes.
struct Cursor {
    chars: Vec<char>,
    pos: usize,
}

impl Cursor {
    /// Creates a cursor positioned at the first character of `s`.
    pub fn new(s: &str) -> Self {
        let chars: Vec<char> = s.chars().collect();
        Self { chars, pos: 0 }
    }

    /// Returns the character under the cursor without consuming it, or `None` at the end.
    fn peek(&self) -> Option<char> {
        self.peek_multiple(1).map(|window| window[0])
    }

    /// Returns the next `n` characters without consuming them, or `None` if fewer than
    /// `n` characters remain.
    fn peek_multiple(&self, n: usize) -> Option<&[char]> {
        self.chars.get(self.pos..self.pos + n)
    }

    /// Moves the cursor forward by `n` characters; no bounds check is performed.
    fn advance(&mut self, n: usize) {
        self.pos += n;
    }

    /// Consumes and returns the character under the cursor; at the end the cursor stays
    /// where it is and `None` is returned.
    fn next(&mut self) -> Option<char> {
        let current = self.peek()?;
        self.pos += 1;
        Some(current)
    }

    /// Moves the cursor back by `n` characters, stopping at the start of the text.
    fn back(&mut self, n: usize) {
        self.pos = self.pos.saturating_sub(n);
    }

    /// Returns the current position as a character index from the start of the text.
    fn pos(&self) -> usize {
        self.pos
    }
}
/// Runs the token parser `f` at the cursor and, on success, records the resulting token.
///
/// On success a [`Token`] is pushed onto `tokens` with a span that starts at
/// `*line`/`*column` and covers the characters `f` consumed; `*column` is then advanced
/// by that many characters so the caller's column stays in step with the cursor.
/// (The original never advanced it, so later tokens on a line got a stale column.)
///
/// On failure nothing is pushed and the cursor is restored to where it was before `f`
/// ran, so a parser that bails out part-way cannot silently swallow characters.
///
/// `line` is only read; it is taken by mutable reference to keep the existing signature.
///
/// Returns `true` if a token was produced.
fn try_parse_as(
    f: fn(&mut Cursor) -> Option<TokenValue>,
    tokens: &mut Vec<Token>,
    str_iter: &mut Cursor,
    line: &mut usize,
    column: &mut usize,
) -> bool {
    let start = str_iter.pos();
    let Some(value) = f(str_iter) else {
        // Undo whatever the failed parser consumed.
        str_iter.back(str_iter.pos().saturating_sub(start));
        return false;
    };
    let length = str_iter.pos() - start;
    tokens.push(Token {
        value,
        span: Span { line: *line, column: *column, length },
    });
    *column += length;
    true
}
// Expands to `if <expr> { continue; }`, so it can only be used inside a loop.
// The lexer uses it to start the next iteration as soon as one token parser succeeds.
macro_rules! if_true_then_continue {
($e: expr) => {
if $e {
continue;
}
};
}
/// Errors that make `Lexer::parse` stop and return early.
#[derive(Debug, Error)]
pub enum LexerError {
/// Reading from the input failed; converted from `std::io::Error` through `#[from]`.
#[error("Io error: {0}")]
Io(#[from] std::io::Error),
/// More than 20 unrecognized characters were recorded, so lexing was abandoned.
#[error("Too much errors, stop lexing")]
TooManyErrors,
}
impl Lexer {
    /// Returns `true` if lexing recorded at least one unrecognized character.
    pub fn has_errors(&self) -> bool {
        !self.errors.is_empty()
    }

    /// Lexes everything `reader` yields, one line at a time.
    ///
    /// Whitespace is skipped. A `//` comment runs to the end of its line and a
    /// `/* ... */` comment may span several lines; nothing inside a comment is lexed.
    /// A `*/` that appears outside a block comment is ordinary punctuation.
    ///
    /// A character that no token parser accepts is stored as
    /// `TokenValue::Unrecognized` and its index in the token list is recorded as an
    /// error; lexing then continues with the next character.
    ///
    /// Spans use 1-based lines and columns, both counted in characters. The column is
    /// derived from the cursor position, so it is right for every token on the line.
    ///
    /// # Errors
    ///
    /// * `LexerError::Io` if reading a line fails.
    /// * `LexerError::TooManyErrors` once more than 20 unrecognized characters were seen.
    pub fn parse(reader: &mut impl BufRead) -> Result<Self, LexerError> {
        let mut tokens = Vec::new();
        let mut errors = Vec::new();
        let mut in_block_comment = false;

        for (line_index, line_str) in reader.lines().enumerate() {
            let line_str = line_str?;
            // Exactly one increment per input line, whatever the line contains.
            let mut line = line_index + 1;
            let mut cursor = Cursor::new(&line_str);

            while let Some(c) = cursor.peek() {
                let mut column = cursor.pos() + 1;

                // Inside a block comment only the terminator matters; everything else,
                // including `//` and `/*`, is comment text and must not reach the parsers.
                if in_block_comment {
                    if let Some(['*', '/']) = cursor.peek_multiple(2) {
                        in_block_comment = false;
                        cursor.advance(2);
                    } else {
                        cursor.advance(1);
                    }
                    continue;
                }

                if WHITESPACE_CHARS.contains(&c) {
                    cursor.advance(1);
                    continue;
                }

                match cursor.peek_multiple(2) {
                    // Line comment: the rest of this line is ignored.
                    Some(['/', '/']) => break,
                    Some(['/', '*']) => {
                        in_block_comment = true;
                        cursor.advance(2);
                        continue;
                    }
                    _ => {}
                }

                if_true_then_continue!(try_parse_as(parse_litint, &mut tokens, &mut cursor, &mut line, &mut column));
                if_true_then_continue!(try_parse_as(parse_delimiter, &mut tokens, &mut cursor, &mut line, &mut column));
                if_true_then_continue!(try_parse_as(parse_puncuation, &mut tokens, &mut cursor, &mut line, &mut column));
                if_true_then_continue!(try_parse_as(parse_ident, &mut tokens, &mut cursor, &mut line, &mut column));

                // No parser accepted `c`: record it and move past it. Consuming exactly
                // the peeked character avoids unwrapping on a possibly exhausted cursor.
                errors.push(tokens.len());
                cursor.advance(1);
                tokens.push(Token {
                    value: TokenValue::Unrecognized(c),
                    span: Span { line, column, length: 1 },
                });
                if errors.len() > 20 {
                    return Err(LexerError::TooManyErrors);
                }
            }
        }

        Ok(Self { tokens, errors })
    }
}
/// Parses an integer literal at the cursor.
///
/// Accepted forms, each optionally preceded by `-` directly followed by a digit:
/// decimal (`123`), octal (a leading `0` followed by digits, `017`) and hexadecimal
/// (`0x1F` / `0X1f`). Digits are consumed for as long as they are valid in the
/// detected base, so `09` yields `0` and leaves `9` for the next token.
///
/// Returns `None`, with the cursor restored to where it started, when
/// * the text does not start with a digit or with `-` followed by a digit (this
///   includes a `-` that is the last character of the input), or
/// * the value does not fit in an `i64`.
///
/// `0x` that is not followed by a hex digit is lexed as the literal `0`; the `x` is
/// left for the next token instead of being swallowed.
///
/// NOTE(review): a `-` directly followed by a digit is always folded into the literal,
/// so `a-1` lexes as `a` followed by `-1`; a parser has to account for that.
fn parse_litint(
    str_iter: &mut Cursor,
) -> Option<TokenValue> {
    let start = str_iter.pos();
    let mut sign: i64 = 1;
    let mut first = str_iter.peek()?;

    if first == '-' {
        str_iter.advance(1);
        match str_iter.peek() {
            Some(d) if d.is_ascii_digit() => {
                sign = -1;
                first = d;
            }
            _ => {
                // Just a minus sign (possibly at end of input): leave the cursor on it.
                str_iter.back(1);
                return None;
            }
        }
    } else if !first.is_ascii_digit() {
        return None;
    }

    let mut base: u32 = 10;
    if first == '0' {
        str_iter.advance(1);
        match str_iter.peek() {
            Some('x') | Some('X') => {
                // Only treat this as a hex prefix when a hex digit actually follows.
                let hex_digit_follows =
                    matches!(str_iter.peek_multiple(2), Some([_, d]) if d.is_ascii_hexdigit());
                if hex_digit_follows {
                    base = 16;
                    str_iter.advance(1);
                }
            }
            Some(c) if c.is_ascii_digit() => base = 8,
            _ => {
                // A lone zero.
            }
        }
    }

    // From here the cursor points to:
    // 0x1234 -> '1' (first digit after the prefix)
    // 0123   -> '1'
    // 0      -> whatever follows the zero
    // 1234   -> '1'
    let mut number: i64 = 0;
    while let Some(digit) = str_iter.peek().and_then(|c| c.to_digit(base)) {
        // The sign is applied per digit so that `i64::MIN` is representable.
        let next = number
            .checked_mul(i64::from(base))
            .and_then(|n| n.checked_add(sign * i64::from(digit)));
        let Some(next) = next else {
            // Out of range for i64: reject the literal instead of panicking or wrapping.
            str_iter.back(str_iter.pos() - start);
            return None;
        };
        number = next;
        str_iter.advance(1);
    }

    Some(TokenValue::IntLit(number))
}
/// Parses a single bracket character: `(`, `)`, `{` or `}`.
///
/// Consumes one character on success; returns `None` and leaves the cursor untouched
/// for anything else, including the end of input.
fn parse_delimiter(
    str_iter: &mut Cursor,
) -> Option<TokenValue> {
    let delimiter = match str_iter.peek() {
        Some('(') => TokenValue::LParen,
        Some(')') => TokenValue::RParen,
        Some('{') => TokenValue::LBrace,
        Some('}') => TokenValue::RBrace,
        _ => return None,
    };
    str_iter.advance(1);
    Some(delimiter)
}
/// Parses an operator or separator token.
///
/// `=`, `<` and `>` become their two-character form (`==`, `<=`, `>=`) when directly
/// followed by `=`. `!` is only a token as part of `!=`; a lone `!` yields `None`.
/// On `None` the cursor is left where it was.
fn parse_puncuation(
    str_iter: &mut Cursor,
) -> Option<TokenValue> {
    let first = str_iter.peek()?;
    // Look one character ahead up front instead of advancing and stepping back.
    let eq_follows = matches!(str_iter.peek_multiple(2), Some([_, '=']));

    let (token_value, width) = match (first, eq_follows) {
        ('+', _) => (TokenValue::Plus, 1),
        ('-', _) => (TokenValue::Minus, 1),
        ('*', _) => (TokenValue::Star, 1),
        ('/', _) => (TokenValue::Slash, 1),
        ('%', _) => (TokenValue::Percent, 1),
        ('=', true) => (TokenValue::DoubleEqual, 2),
        ('=', false) => (TokenValue::Equal, 1),
        ('!', true) => (TokenValue::NotEqual, 2),
        ('<', true) => (TokenValue::LessEqual, 2),
        ('<', false) => (TokenValue::Less, 1),
        ('>', true) => (TokenValue::GreaterEqual, 2),
        ('>', false) => (TokenValue::Greater, 1),
        (',', _) => (TokenValue::Comma, 1),
        (';', _) => (TokenValue::Semicolon, 1),
        // Anything else, including a '!' that is not part of "!=".
        _ => return None,
    };

    str_iter.advance(width);
    Some(token_value)
}
/// Parses a word: a keyword, a built-in type name or an identifier.
///
/// A word starts with an ASCII letter or `_` and continues with ASCII letters, digits
/// and `_`. The whole word is consumed and classified in this order:
/// 1. the keywords `if`, `else`, `while`, `return`, `break`, `continue` map to their
///    dedicated `TokenValue` variants (previously they were emitted as `Ident`, which
///    left those variants unused),
/// 2. anything `TypeIdent::from_str` accepts becomes `TokenValue::TypeIdent`,
/// 3. everything else becomes `TokenValue::Ident`.
///
/// Returns `None` without consuming anything if the cursor is not at the start of a word.
fn parse_ident(
    str_iter: &mut Cursor,
) -> Option<TokenValue> {
    let first = str_iter.peek()?;
    if !(first.is_ascii_alphabetic() || first == '_') {
        return None;
    }

    let mut name = String::new();
    while let Some(c) = str_iter.peek() {
        if !(c.is_ascii_alphanumeric() || c == '_') {
            break;
        }
        name.push(c);
        str_iter.advance(1);
    }

    let keyword = match name.as_str() {
        "if" => Some(TokenValue::If),
        "else" => Some(TokenValue::Else),
        "while" => Some(TokenValue::While),
        "return" => Some(TokenValue::Return),
        "break" => Some(TokenValue::Break),
        "continue" => Some(TokenValue::Continue),
        _ => None,
    };
    if let Some(keyword) = keyword {
        return Some(keyword);
    }

    if let Ok(type_ident) = TypeIdent::from_str(&name) {
        return Some(TokenValue::TypeIdent(type_ident));
    }
    Some(TokenValue::Ident(name))
}
#[cfg(test)]
mod tests {
    use std::path::Path;
    use std::fs::File;
    use crate::utils::case_list::CaseList;
    use crate::utils::num_sequence::NumberSequence;
    pub use super::*;

    /// Lexes every case selected by `case_str` (a `NumberSequence` expression) from
    /// `./testcases` and panics if any of them contains unrecognized characters.
    fn test_case(case_str: &str) {
        let selected = NumberSequence::from_str(case_str).unwrap();
        let cases = CaseList::from_dir(&Path::new("./testcases")).unwrap();

        let mut failed = 0;
        for case_no in selected {
            let path = cases.get_case_path(case_no).unwrap();
            println!("{}", path.display());

            let mut reader = std::io::BufReader::new(File::open(path).unwrap());
            let lexed = Lexer::parse(&mut reader).unwrap();
            if !lexed.has_errors() {
                continue;
            }
            eprintln!("Case {} has error", cases.get_case_name(case_no).unwrap());
            failed += 1;
        }

        assert!(failed <= 0, "Found {} cases with errors", failed);
    }

    #[test]
    fn test_expr() {
        test_case("0-3,14-25");
    }
}
+2
View File
@@ -0,0 +1,2 @@
pub mod types;
mod lexer;
+41
View File
@@ -0,0 +1,41 @@
use strum::EnumString;
/// A lexed token: what it is (`value`) and where it was found (`span`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
pub value: TokenValue,
pub span: Span,
}
/// Source location of a token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Span {
/// Line number; the lexer starts counting at 1.
pub line: usize,
/// Column at which the token starts; the lexer starts every line at column 1.
pub column: usize,
/// Number of characters the token covers.
pub length: usize,
}
/// The kind of a token, together with its payload where it has one.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenValue {
/// Integer literal with its value.
IntLit(i64), // TODO: more literal types
/// Identifier with its name.
Ident(String),
/// Built-in type name, see `TypeIdent`.
TypeIdent(TypeIdent),
// + - * / %
Plus, Minus, Star, Slash, Percent,
// = == != < <= > >=
Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual,
// ( )
LParen, RParen,
// { }
LBrace, RBrace,
// , ;
Comma, Semicolon,
// Keyword tokens; the names suggest the C keywords of the same spelling.
If, Else, While, Return, Break, Continue,
// Presumably marks the end of the token stream; TODO confirm where it is emitted.
Eof,
/// A character that did not match any token rule.
Unrecognized(char),
}
/// Built-in type names.
///
/// `EnumString` derives `FromStr`, matching the `serialize` string given on each variant.
#[derive(Debug, Clone, PartialEq, Eq, EnumString)]
pub enum TypeIdent {
/// Parsed from `"int"`.
#[strum(serialize = "int")]
Int,
/// Parsed from `"void"`.
#[strum(serialize = "void")]
Void,
}
+6
View File
@@ -0,0 +1,6 @@
mod frontend;
mod ast;
mod utils;
// Program entry point; for now it only prints a greeting.
fn main() {
println!("Hello, world!");
}
+38
View File
@@ -0,0 +1,38 @@
use std::path::PathBuf;
use std::{collections::BTreeMap, path::Path};
use std::io;
/// Index of the numbered test-case files found in one directory.
///
/// A file is a test case when its name ends in `.c` and the part before the first `_`
/// parses as a `usize`, e.g. `12_while_loop.c` is case 12.
pub struct CaseList {
    index_map: BTreeMap<usize, String>,
    base_path: PathBuf,
}

impl CaseList {
    /// Scans `dir` (non-recursively) and indexes every test-case file in it.
    ///
    /// Entries whose names are not valid UTF-8 or do not follow the naming scheme are
    /// ignored. If two files share a case number, the one read last wins.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the directory or one of its entries cannot be read.
    pub fn from_dir(dir: &Path) -> io::Result<Self> {
        let mut index_map = BTreeMap::new();

        for entry in std::fs::read_dir(dir)? {
            // Skip names that are not valid UTF-8.
            let Ok(file_name) = entry?.file_name().into_string() else {
                continue;
            };
            if !file_name.ends_with(".c") {
                continue;
            }
            let case_no = file_name
                .split_once('_')
                .and_then(|(prefix, _)| prefix.parse::<usize>().ok());
            if let Some(case_no) = case_no {
                index_map.insert(case_no, file_name);
            }
        }

        Ok(Self { index_map, base_path: dir.to_path_buf() })
    }

    /// Returns the file name of case `index`, or `None` if there is no such case.
    pub fn get_case_name(&self, index: usize) -> Option<&String> {
        self.index_map.get(&index)
    }

    /// Returns the path of case `index` (the scanned directory joined with the file
    /// name), or `None` if there is no such case.
    pub fn get_case_path(&self, index: usize) -> Option<PathBuf> {
        let name = self.get_case_name(index)?;
        Some(self.base_path.join(name))
    }
}
+2
View File
@@ -0,0 +1,2 @@
pub mod num_sequence;
pub mod case_list;
+54
View File
@@ -0,0 +1,54 @@
use std::{path::Iter, str::FromStr};
use num::Integer;
/// A set of integers described as a union of half-open ranges `[start, end)`, iterated
/// range by range.
///
/// WARNING: this is intended for use in tests. Overlapping ranges are not detected, and
/// the ranges are visited in the order they were given, which need not be sorted.
pub struct NumberSequence<T: Integer> {
    cur_range_index: usize,
    delta_in_range: T,
    ranges: Vec<(T, T)>,
}

impl<T: Integer + Copy + FromStr> NumberSequence<T> {
    /// Parses a comma-separated list of groups such as `"0-3,7,14-25"`.
    ///
    /// A group that parses as a number `n` selects just `n`. A group of the form
    /// `start-end` selects `start` up to, but not including, `end`. Empty groups are
    /// skipped. Returns `None` if any group fits neither form.
    pub fn from_str(s: &str) -> Option<Self> {
        let mut ranges = Vec::new();

        for group in s.split(',').filter(|group| !group.is_empty()) {
            let range = match group.parse::<T>() {
                Ok(single) => (single, single + T::one()),
                Err(_) => {
                    let (start_str, end_str) = group.split_once('-')?;
                    let start = start_str.parse::<T>().ok()?;
                    let end = end_str.parse::<T>().ok()?;
                    (start, end)
                }
            };
            ranges.push(range);
        }

        Some(Self { cur_range_index: 0, delta_in_range: T::zero(), ranges })
    }
}

impl<T: Integer + Copy> Iterator for NumberSequence<T> {
    type Item = T;

    /// Yields the next number of the current range, moving on to the following range
    /// (and skipping empty ones) once the current range is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(&(start, end)) = self.ranges.get(self.cur_range_index) {
            let candidate = start + self.delta_in_range;
            if candidate < end {
                self.delta_in_range = self.delta_in_range + T::one();
                return Some(candidate);
            }
            // Current range is used up: restart the offset in the next one.
            self.cur_range_index += 1;
            self.delta_in_range = T::zero();
        }
        None
    }
}