commit e8b50ae0d7dba5fc0a2e8f918f961bcac9851672 Author: Hydrostic Date: Wed May 6 14:30:32 2026 +0800 feat(lexer): Finish lexer parse and add tests diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..26ceea5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +/testcases \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..1fd0f62 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,218 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rusty-minic" +version = "0.1.0" +dependencies = [ + "num", + "regex", + "strum", + "thiserror", +] + +[[package]] +name = "strum" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e2c6a11 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "rusty-minic" +version = "0.1.0" +edition = "2024" + +[dependencies] +num = "0.4.3" +regex = "1.12.3" +strum = { version = "0.28.0", features = ["derive"] } +thiserror = "2.0.18" diff --git a/src/frontend/lexer.rs b/src/frontend/lexer.rs new file mode 100644 index 0000000..c7476da --- /dev/null +++ b/src/frontend/lexer.rs @@ -0,0 +1,323 @@ +use std::{io::BufRead, iter::Peekable, str::FromStr}; + +use thiserror::Error; + +use crate::frontend::types::{Span, TokenValue, TypeIdent}; + +use super::types::Token; + +pub struct Lexer { + tokens: Vec, + errors: Vec, // every entry points to the index of unrecognized tokens +} + +const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r']; +struct Cursor { + chars: Vec, + pos: usize, +} +impl Cursor { + pub fn new(s: &str) -> Self { + Self { chars: s.chars().collect(), pos: 0 } + } + fn peek(&self) -> Option { + self.chars.get(self.pos).copied() + } + fn peek_multiple(&self, n: usize) -> Option<&[char]> { + if self.pos + n <= self.chars.len() { + Some(&self.chars[self.pos..self.pos + n]) + } else { + None + } + } + fn advance(&mut self, n: usize) { + self.pos += n; + } + fn next(&mut self) -> Option { + let c = self.chars.get(self.pos).copied(); + if c.is_some() { + self.advance(1); + } + c + } + fn back(&mut self, n: usize) { + self.pos = self.pos.saturating_sub(n); + } + fn pos(&self) -> usize { + self.pos + } +} +fn try_parse_as( + f: fn(&mut Cursor) -> Option, + tokens: &mut Vec, + str_iter: &mut Cursor, + line: &mut usize, + column: &mut usize, +) -> bool { + let last_pos = str_iter.pos(); + if let Some(token) = f(str_iter) { + let span = Span { line: *line, column: *column, length: str_iter.pos() - last_pos }; + tokens.push(Token { value: token, span }); + return true; + } + false +} +macro_rules! if_true_then_continue { + ($e: expr) => { + if $e { + continue; + } + }; +} +#[derive(Debug, Error)] +pub enum LexerError { + #[error("Io error: {0}")] + Io(#[from] std::io::Error), + #[error("Too much errors, stop lexing")] + TooManyErrors, +} +impl Lexer { + pub fn has_errors(&self) -> bool { + !self.errors.is_empty() + } + pub fn parse(reader: &mut impl BufRead) -> Result { + let mut tokens = Vec::new(); + let mut errors = Vec::new(); + let mut line = 1; + let mut column = 1; + let mut in_block_comment = false; + for line_str in reader.lines() { + let line_str = line_str?; + let mut cursor = Cursor::new(&line_str); + loop { + if let Some(c) = cursor.peek() { + // check white space first, if it's white space, skip it and continue to the next character + if WHITESPACE_CHARS.contains(&c) { + column += 1; + cursor.advance(1); + continue; + } + // check comment + match cursor.peek_multiple(2) { + Some(['/', '/']) => { + // skip the rest of the line + line += 1; + column = 1; + break; + } + Some(['/', '*']) => { + in_block_comment = true; + cursor.advance(2); + column += 2; + continue; + } + Some(['*', '/']) => { + in_block_comment = false; + cursor.advance(2); + column += 2; + continue; + } + _ => {} + } + } else { + break; + } + if in_block_comment { + cursor.advance(1); + column += 1; + } + if_true_then_continue!(try_parse_as(parse_litint, &mut tokens, &mut cursor, &mut line, &mut column)); + if_true_then_continue!(try_parse_as(parse_delimiter, &mut tokens, &mut cursor, &mut line, &mut column)); + if_true_then_continue!(try_parse_as(parse_puncuation, &mut tokens, &mut cursor, &mut line, &mut column)); + if_true_then_continue!(try_parse_as(parse_ident, &mut tokens, &mut cursor, &mut line, &mut column)); + // unrecognized token + errors.push(tokens.len()); + let c = cursor.next().unwrap(); + tokens.push(Token { + value: TokenValue::Unrecognized(c), + span: Span { line, column, length: 1 }, + }); + if errors.len() > 20 { + return Err(LexerError::TooManyErrors); + } + column += 1; + } + line += 1; + column = 1; + } + Ok(Self { tokens, errors }) + } +} +fn parse_litint( + str_iter: &mut Cursor, +) -> Option { + let mut c1 = str_iter.peek()?; + // c1 is the peek value from here + let mut sign_base: i64 = 1; + let mut base: i64 = 10; + if !(c1.is_ascii_digit() || c1 == '-') { + return None; + } + if c1 == '-' { + sign_base = -1; + str_iter.advance(1); + c1 = str_iter.peek()?; + if !c1.is_ascii_digit() { + // only a minus sign, not a number + // back one so cursor still points to the minus sign + str_iter.back(1); + return None; + } + } + let mut number = 0i64; + if c1 == '0' { + str_iter.advance(1); + match str_iter.peek() { + Some('x') | Some('X') => { + base = 16; + str_iter.advance(1); + } + Some(c) if c.is_ascii_digit() => { + base = 8; + } + _ => { + // only zero + } + } + } + // from here, the cursor points to: + // 0x1234 -> cursor at 'x' + // 0123 -> cursor at '1' + // 0 -> cursor at end + // 1234 -> cursor at '1' + loop { + let c = match str_iter.peek() { + Some(c) => c, + None => break, + }; + let digit = match c { + '0'..='9' if (c as u8 - b'0') < base as u8 => c as i64 - '0' as i64 , + 'a'..='f' if base == 16 => c as i64 - 'a' as i64 + 10 , + 'A'..='F' if base == 16 => c as i64 - 'A' as i64 + 10, + _ => break, + }; + number = number * base + digit; + str_iter.advance(1); + } + number *= sign_base; + Some(TokenValue::IntLit(number)) +} + +fn parse_delimiter( + str_iter: &mut Cursor, +) -> Option { + let c = str_iter.peek()?; + let token_value = match c { + '(' => TokenValue::LParen, + ')' => TokenValue::RParen, + '{' => TokenValue::LBrace, + '}' => TokenValue::RBrace, + _ => return None, + }; + str_iter.advance(1); + Some(token_value) +} +fn parse_puncuation( + str_iter: &mut Cursor, +) -> Option { + let get_value_by_next_char = + |str_iter: &mut Cursor, not_equal_value: TokenValue, equal_value: TokenValue| { + str_iter.advance(1); + if let Some('=') = str_iter.peek() { + equal_value + } else { + str_iter.back(1); + not_equal_value + } + }; + let c = str_iter.peek()?; + let token_value = match c { + '+' => TokenValue::Plus, + '-' => TokenValue::Minus, + '*' => TokenValue::Star, + '/' => TokenValue::Slash, + '%' => TokenValue::Percent, + + '=' => get_value_by_next_char(str_iter, TokenValue::Equal, TokenValue::DoubleEqual), + '!' => { + str_iter.advance(1); + if let Some('=') = str_iter.peek() { + TokenValue::NotEqual + } else { + // only '!' is not a valid token, back one so cursor still points to '!' + str_iter.back(1); + return None; + } + }, + '<' => get_value_by_next_char(str_iter, TokenValue::Less, TokenValue::LessEqual), + '>' => get_value_by_next_char(str_iter, TokenValue::Greater, TokenValue::GreaterEqual), + + ',' => TokenValue::Comma, + ';' => TokenValue::Semicolon, + + _ => return None, + }; + str_iter.advance(1); + Some(token_value) +} + +fn parse_ident( + str_iter: &mut Cursor, +) -> Option { + let c = str_iter.peek()?; + if !c.is_ascii_alphabetic() && c != '_' { + return None; + } + let mut name = Vec::new(); + while let Some(c) = str_iter.peek() { + if c.is_ascii_alphanumeric() || c == '_' { + name.push(c); + str_iter.advance(1); + } else { + break; + } + } + let name = name.into_iter().collect::(); + if let Some(type_ident) = TypeIdent::from_str(&name).ok() { + return Some(TokenValue::TypeIdent(type_ident)); + } + Some(TokenValue::Ident(name)) +} +#[cfg(test)] +mod tests { + use std::path::Path; + use std::fs::File; + use crate::utils::case_list::CaseList; + use crate::utils::num_sequence::NumberSequence; + + pub use super::*; + fn test_case(case_str: &str) { + let case_sequence = NumberSequence::from_str(case_str).unwrap(); + let case_list = CaseList::from_dir(&Path::new("./testcases")).unwrap(); + let mut error_case_cnt = 0; + for case_no in case_sequence { + let case_path = case_list.get_case_path(case_no).unwrap(); + println!("{}", case_path.display()); + let file = File::open(case_path).unwrap(); + let mut buf_reader = std::io::BufReader::new(file); + let lexer = Lexer::parse(&mut buf_reader).unwrap(); + if lexer.has_errors() { + eprintln!("Case {} has error", case_list.get_case_name(case_no).unwrap()); + error_case_cnt += 1; + } + } + if error_case_cnt > 0 { + panic!("Found {} cases with errors", error_case_cnt); + } + + } + #[test] + fn test_expr() { + test_case("0-3,14-25"); + } +} \ No newline at end of file diff --git a/src/frontend/mod.rs b/src/frontend/mod.rs new file mode 100644 index 0000000..c7108a2 --- /dev/null +++ b/src/frontend/mod.rs @@ -0,0 +1,2 @@ +pub mod types; +mod lexer; \ No newline at end of file diff --git a/src/frontend/types.rs b/src/frontend/types.rs new file mode 100644 index 0000000..1899d24 --- /dev/null +++ b/src/frontend/types.rs @@ -0,0 +1,41 @@ +use strum::EnumString; + + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Token { + pub value: TokenValue, + pub span: Span, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Span { + pub line: usize, + pub column: usize, + pub length: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TokenValue { + IntLit(i64), // TODO: more literal types + Ident(String), + TypeIdent(TypeIdent), + + Plus, Minus, Star, Slash, Percent, + Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual, + + LParen, RParen, + LBrace, RBrace, + Comma, Semicolon, + + If, Else, While, Return, Break, Continue, + + Eof, + Unrecognized(char), +} +#[derive(Debug, Clone, PartialEq, Eq, EnumString)] +pub enum TypeIdent { + #[strum(serialize = "int")] + Int, + #[strum(serialize = "void")] + Void, +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..1f94806 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,6 @@ +mod frontend; +mod ast; +mod utils; +fn main() { + println!("Hello, world!"); +} diff --git a/src/utils/case_list.rs b/src/utils/case_list.rs new file mode 100644 index 0000000..b0be1b3 --- /dev/null +++ b/src/utils/case_list.rs @@ -0,0 +1,38 @@ +use std::path::PathBuf; +use std::{collections::BTreeMap, path::Path}; +use std::io; +pub struct CaseList { + index_map: BTreeMap, + base_path: PathBuf +} + +impl CaseList { + pub fn from_dir(dir: &Path) -> io::Result { + let mut index_map = BTreeMap::new(); + let case_dir = std::fs::read_dir(dir)?; + for case_item in case_dir { + let case_item = case_item?; + let file_name = match case_item.file_name().into_string() { + Ok(name) => name, + Err(_) => continue, // skip non-utf8 file names + }; + if file_name.ends_with(".c") { + if let Some((index_str, _)) = file_name.split_once('_') { + + if let Ok(index) = index_str.parse::() { + index_map.insert(index, file_name); + } + } + } + } + Ok(Self { index_map, base_path: dir.to_path_buf() }) + } + + pub fn get_case_name(&self, index: usize) -> Option<&String> { + self.index_map.get(&index) + } + + pub fn get_case_path(&self, index: usize) -> Option { + self.get_case_name(index).map(|name| self.base_path.join(name)) + } +} \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..78cddd7 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1,2 @@ +pub mod num_sequence; +pub mod case_list; \ No newline at end of file diff --git a/src/utils/num_sequence.rs b/src/utils/num_sequence.rs new file mode 100644 index 0000000..9666b04 --- /dev/null +++ b/src/utils/num_sequence.rs @@ -0,0 +1,54 @@ +use std::{path::Iter, str::FromStr}; + +use num::Integer; +/// Number sequence, represents a set of integers as a union of several ranges +/// WARNING: this is intended for use in tests, so overlapping are not checked, also the ranges are not necessarily sorted +pub struct NumberSequence { + cur_range_index: usize, + delta_in_range: T, + ranges: Vec<(T, T)>, +} + +impl NumberSequence { + pub fn from_str(s: &str) -> Option { + let mut ranges = vec![]; + let groups = s.split(','); + for group in groups { + if group.is_empty() { + continue; + } + if let Ok(num) = group.parse::() { + ranges.push((num, num + T::one())); + } + else if let Some((start_str, end_str)) = group.split_once('-') { + if let (Ok(start), Ok(end)) = (start_str.parse::(), end_str.parse::()) { + ranges.push((start, end)); + } else { + return None; + } + } else { + return None; + } + } + Some(Self { cur_range_index: 0, delta_in_range: T::zero(), ranges }) + } +} + +impl Iterator for NumberSequence { + type Item = T; + fn next(&mut self) -> Option { + if self.cur_range_index >= self.ranges.len() { + return None; + } + let (start, end) = self.ranges[self.cur_range_index]; + let delta = self.delta_in_range; + if start + delta < end { + self.delta_in_range = self.delta_in_range + T::one(); + Some(start + delta) + } else { + self.cur_range_index += 1; + self.delta_in_range = T::zero(); + self.next() + } + } +} \ No newline at end of file