feat(lexer): Finish lexer parse and add tests

2026-05-06 14:30:32 +08:00
commit e8b50ae0d7
10 changed files with 696 additions and 0 deletions
@@ -0,0 +1,323 @@
+use std::{io::BufRead, iter::Peekable, str::FromStr};
+
+use thiserror::Error;
+
+use crate::frontend::types::{Span, TokenValue, TypeIdent};
+
+use super::types::Token;
+
+/// Result of lexing one input: the token stream plus the positions of the
+/// tokens that could not be recognized.
+pub struct Lexer {
+    // All tokens in source order, including `TokenValue::Unrecognized` ones.
+    tokens: Vec<Token>,
+    errors: Vec<usize>, // each entry is the index in `tokens` of an unrecognized token
+}
+
+// Characters that `Lexer::parse` skips without producing a token.
+const WHITESPACE_CHARS: &[char] = &[' ', '\t', '\n', '\r'];
+/// Random-access cursor over the characters of one string.
+///
+/// The text is stored as a `Vec<char>`, so every position and length is
+/// counted in `char`s rather than bytes.
+struct Cursor {
+    chars: Vec<char>,
+    pos: usize,
+}
+impl Cursor {
+    /// Creates a cursor positioned at the first character of `s`.
+    pub fn new(s: &str) -> Self {
+        let chars = s.chars().collect::<Vec<_>>();
+        Self { chars, pos: 0 }
+    }
+    /// Returns the character under the cursor without consuming it, or `None`
+    /// when the cursor is at or past the end.
+    fn peek(&self) -> Option<char> {
+        match self.chars.get(self.pos) {
+            Some(&c) => Some(c),
+            None => None,
+        }
+    }
+    /// Returns the next `n` characters without consuming them, or `None` when
+    /// fewer than `n` characters remain.
+    fn peek_multiple(&self, n: usize) -> Option<&[char]> {
+        let end = self.pos + n;
+        if end > self.chars.len() {
+            return None;
+        }
+        Some(&self.chars[self.pos..end])
+    }
+    /// Moves the cursor forward by `n` characters; no bounds check is done.
+    fn advance(&mut self, n: usize) {
+        self.pos += n;
+    }
+    /// Consumes and returns the character under the cursor; at the end the
+    /// cursor stays put and `None` is returned.
+    fn next(&mut self) -> Option<char> {
+        let current = self.peek()?;
+        self.advance(1);
+        Some(current)
+    }
+    /// Moves the cursor back by `n` characters, stopping at position 0.
+    fn back(&mut self, n: usize) {
+        self.pos = self.pos.checked_sub(n).unwrap_or(0);
+    }
+    /// Current position, counted in characters from the start.
+    fn pos(&self) -> usize {
+        self.pos
+    }
+}
+/// Runs the token parser `f` at the cursor's current position.
+///
+/// On success a `Token` is appended to `tokens` whose span starts at
+/// (`*line`, `*column`) and is as long as the number of characters `f`
+/// consumed; `*column` is then moved past the token and `true` is returned.
+/// On failure nothing is pushed, `*column` is left alone and `false` is
+/// returned. `line` is only read.
+fn try_parse_as(
+    f: fn(&mut Cursor) -> Option<TokenValue>,
+    tokens: &mut Vec<Token>,
+    str_iter: &mut Cursor,
+    line: &mut usize,
+    column: &mut usize,
+) -> bool {
+    let start = str_iter.pos();
+    match f(str_iter) {
+        Some(value) => {
+            let length = str_iter.pos() - start;
+            let span = Span { line: *line, column: *column, length };
+            tokens.push(Token { value, span });
+            // Keep the caller's column in step with the cursor; without this
+            // every later token on the line would report a stale column.
+            *column += length;
+            true
+        }
+        None => false,
+    }
+}
+// Evaluates `$e` and, when it is `true`, jumps to the next iteration of the
+// enclosing loop. Must be invoked inside a loop body.
+macro_rules! if_true_then_continue {
+    ($e: expr) => {
+        if $e {
+            continue;
+        }
+    };
+}
+/// Errors that make `Lexer::parse` give up instead of returning a token list.
+#[derive(Debug, Error)]
+pub enum LexerError {
+    /// Reading a line from the underlying reader failed.
+    #[error("Io error: {0}")]
+    Io(#[from] std::io::Error),
+    /// More unrecognized characters were found than the lexer tolerates.
+    #[error("Too many errors, stop lexing")]
+    TooManyErrors,
+}
+impl Lexer {
+    /// Lexing is aborted with `LexerError::TooManyErrors` once more than this
+    /// many unrecognized characters have been recorded.
+    const MAX_ERRORS: usize = 20;
+    /// Returns `true` if at least one unrecognized token was recorded.
+    pub fn has_errors(&self) -> bool {
+        !self.errors.is_empty()
+    }
+    /// Lexes everything `reader` yields, one line at a time.
+    ///
+    /// Whitespace, `// ...` line comments and `/* ... */` block comments
+    /// (which may span several lines) produce no tokens. Everything inside a
+    /// block comment is ignored except its closing `*/`; a `*/` outside a
+    /// comment is lexed as ordinary tokens. At every other position the token
+    /// parsers are tried in this order: integer literal, delimiter,
+    /// punctuation, identifier. A character none of them accepts becomes a
+    /// `TokenValue::Unrecognized` token whose index is recorded as an error.
+    ///
+    /// Lines and columns are 1-based.
+    ///
+    /// # Errors
+    /// * `LexerError::Io` if reading a line fails.
+    /// * `LexerError::TooManyErrors` once more than `MAX_ERRORS` unrecognized
+    ///   characters have been seen.
+    pub fn parse(reader: &mut impl BufRead) -> Result<Self, LexerError> {
+        let mut tokens = Vec::new();
+        let mut errors = Vec::new();
+        let mut line = 1;
+        let mut column = 1;
+        let mut in_block_comment = false;
+        for line_str in reader.lines() {
+            let line_str = line_str?;
+            let mut cursor = Cursor::new(&line_str);
+            while let Some(c) = cursor.peek() {
+                if in_block_comment {
+                    // Only the terminator matters here; anything else is
+                    // skipped one character at a time and never reaches the
+                    // token parsers.
+                    if let Some(['*', '/']) = cursor.peek_multiple(2) {
+                        in_block_comment = false;
+                        cursor.advance(2);
+                        column += 2;
+                    } else {
+                        cursor.advance(1);
+                        column += 1;
+                    }
+                    continue;
+                }
+                if WHITESPACE_CHARS.contains(&c) {
+                    cursor.advance(1);
+                    column += 1;
+                    continue;
+                }
+                match cursor.peek_multiple(2) {
+                    // Line comment: drop the rest of the line. The line
+                    // counter is bumped once, after the loop, like for any
+                    // other line.
+                    Some(['/', '/']) => break,
+                    Some(['/', '*']) => {
+                        in_block_comment = true;
+                        cursor.advance(2);
+                        column += 2;
+                        continue;
+                    }
+                    _ => {}
+                }
+                if_true_then_continue!(try_parse_as(parse_litint, &mut tokens, &mut cursor, &mut line, &mut column));
+                if_true_then_continue!(try_parse_as(parse_delimiter, &mut tokens, &mut cursor, &mut line, &mut column));
+                if_true_then_continue!(try_parse_as(parse_puncuation, &mut tokens, &mut cursor, &mut line, &mut column));
+                if_true_then_continue!(try_parse_as(parse_ident, &mut tokens, &mut cursor, &mut line, &mut column));
+                // Unrecognized character: record it and keep going so that
+                // several problems can be reported in one run.
+                errors.push(tokens.len());
+                cursor.advance(1);
+                tokens.push(Token {
+                    value: TokenValue::Unrecognized(c),
+                    span: Span { line, column, length: 1 },
+                });
+                if errors.len() > Self::MAX_ERRORS {
+                    return Err(LexerError::TooManyErrors);
+                }
+                column += 1;
+            }
+            line += 1;
+            column = 1;
+        }
+        Ok(Self { tokens, errors })
+    }
+}
+/// Parses an integer literal at the cursor.
+///
+/// Accepted forms are decimal (`123`), octal (a leading `0` followed by
+/// digits, e.g. `017`) and hexadecimal (`0x1F` / `0X1f`), each optionally
+/// preceded by a `-` that is folded into the value. A `0x` that is not
+/// followed by a hex digit is read as the literal `0`, leaving the `x` in
+/// place.
+///
+/// Returns `None` with the cursor back at its starting position when the input
+/// does not start a literal (this includes a lone `-`, even as the last
+/// character of the input) or when the value does not fit in an `i64`.
+// NOTE(review): `Lexer::parse` tries this parser before the punctuation
+// parser, so `a-1` appears to lex as `Ident, IntLit(-1)` with no `Minus`
+// token -- confirm that the parser is prepared for that.
+fn parse_litint(
+    str_iter: &mut Cursor,
+) -> Option<TokenValue> {
+    let start = str_iter.pos();
+    let mut c1 = str_iter.peek()?;
+    let mut sign_base: i64 = 1;
+    let mut base: i64 = 10;
+    if !(c1.is_ascii_digit() || c1 == '-') {
+        return None;
+    }
+    if c1 == '-' {
+        sign_base = -1;
+        str_iter.advance(1);
+        c1 = match str_iter.peek() {
+            Some(c) if c.is_ascii_digit() => c,
+            _ => {
+                // Only a minus sign (possibly at the very end of the input):
+                // step back so the cursor points at the '-' again.
+                str_iter.back(1);
+                return None;
+            }
+        };
+    }
+    if c1 == '0' {
+        str_iter.advance(1);
+        match str_iter.peek() {
+            Some('x') | Some('X') => {
+                // Treat "0x" as a hex prefix only when a hex digit follows.
+                let digit_follows =
+                    matches!(str_iter.peek_multiple(2), Some([_, d]) if d.is_ascii_hexdigit());
+                if digit_follows {
+                    base = 16;
+                    str_iter.advance(1);
+                }
+            }
+            Some(c) if c.is_ascii_digit() => {
+                base = 8;
+            }
+            _ => {
+                // only zero
+            }
+        }
+    }
+    // From here the cursor points to:
+    // 0x1234 -> the '1' right after the prefix
+    // 0123   -> the '1' right after the leading zero
+    // 0      -> whatever follows the zero
+    // 1234   -> the leading '1'
+    let mut number = 0i64;
+    while let Some(c) = str_iter.peek() {
+        let digit = match c.to_digit(base as u32) {
+            Some(d) => i64::from(d),
+            None => break,
+        };
+        // The sign is applied per digit so that i64::MIN is representable,
+        // and checked arithmetic turns overflow into a rejected literal
+        // instead of a panic or a silently wrapped value.
+        let next = number
+            .checked_mul(base)
+            .and_then(|n| n.checked_add(sign_base * digit));
+        number = match next {
+            Some(n) => n,
+            None => {
+                str_iter.back(str_iter.pos() - start);
+                return None;
+            }
+        };
+        str_iter.advance(1);
+    }
+    Some(TokenValue::IntLit(number))
+}
+
+/// Parses one of the delimiters `(`, `)`, `{`, `}` at the cursor.
+///
+/// Consumes exactly one character on success; returns `None` without moving
+/// the cursor otherwise.
+fn parse_delimiter(
+    str_iter: &mut Cursor,
+) -> Option<TokenValue> {
+    let delimiter = match str_iter.peek() {
+        Some('(') => TokenValue::LParen,
+        Some(')') => TokenValue::RParen,
+        Some('{') => TokenValue::LBrace,
+        Some('}') => TokenValue::RBrace,
+        _ => return None,
+    };
+    str_iter.advance(1);
+    Some(delimiter)
+}
+/// Parses an operator or separator at the cursor.
+///
+/// Handles the one-character tokens `+ - * / % , ;`, the tokens `=`, `<`,
+/// `>` and their two-character forms `==`, `<=`, `>=`, and `!=`. A `!` that
+/// is not followed by `=` is rejected. Returns `None` without moving the
+/// cursor when nothing matches.
+fn parse_puncuation(
+    str_iter: &mut Cursor,
+) -> Option<TokenValue> {
+    let first = str_iter.peek()?;
+    // Look one character ahead instead of advancing and stepping back.
+    let followed_by_eq = matches!(str_iter.peek_multiple(2), Some([_, '=']));
+    let (token_value, width) = match (first, followed_by_eq) {
+        ('+', _) => (TokenValue::Plus, 1),
+        ('-', _) => (TokenValue::Minus, 1),
+        ('*', _) => (TokenValue::Star, 1),
+        ('/', _) => (TokenValue::Slash, 1),
+        ('%', _) => (TokenValue::Percent, 1),
+        ('=', true) => (TokenValue::DoubleEqual, 2),
+        ('=', false) => (TokenValue::Equal, 1),
+        ('!', true) => (TokenValue::NotEqual, 2),
+        ('<', true) => (TokenValue::LessEqual, 2),
+        ('<', false) => (TokenValue::Less, 1),
+        ('>', true) => (TokenValue::GreaterEqual, 2),
+        ('>', false) => (TokenValue::Greater, 1),
+        (',', _) => (TokenValue::Comma, 1),
+        (';', _) => (TokenValue::Semicolon, 1),
+        // Also covers a '!' on its own, which is not a valid token.
+        _ => return None,
+    };
+    str_iter.advance(width);
+    Some(token_value)
+}
+
+/// Parses a word at the cursor: an ASCII letter or `_` followed by any number
+/// of ASCII letters, digits or `_`.
+///
+/// The word is classified, in this order, as
+/// 1. a keyword (`if`, `else`, `while`, `return`, `break`, `continue`),
+/// 2. a type name accepted by `TypeIdent::from_str`,
+/// 3. a plain identifier.
+///
+/// Returns `None` without moving the cursor when the current character cannot
+/// start a word.
+fn parse_ident(
+    str_iter: &mut Cursor,
+) -> Option<TokenValue> {
+    let first = str_iter.peek()?;
+    if !(first.is_ascii_alphabetic() || first == '_') {
+        return None;
+    }
+    let mut name = String::new();
+    while let Some(c) = str_iter.peek() {
+        if !(c.is_ascii_alphanumeric() || c == '_') {
+            break;
+        }
+        name.push(c);
+        str_iter.advance(1);
+    }
+    // Keywords have dedicated token variants; emit those instead of `Ident`.
+    let keyword = match name.as_str() {
+        "if" => Some(TokenValue::If),
+        "else" => Some(TokenValue::Else),
+        "while" => Some(TokenValue::While),
+        "return" => Some(TokenValue::Return),
+        "break" => Some(TokenValue::Break),
+        "continue" => Some(TokenValue::Continue),
+        _ => None,
+    };
+    if let Some(keyword) = keyword {
+        return Some(keyword);
+    }
+    if let Ok(type_ident) = TypeIdent::from_str(&name) {
+        return Some(TokenValue::TypeIdent(type_ident));
+    }
+    Some(TokenValue::Ident(name))
+}
+#[cfg(test)]
+mod tests {
+    use std::path::Path;
+    use std::fs::File;
+    use crate::utils::case_list::CaseList;
+    use crate::utils::num_sequence::NumberSequence;
+
+    pub use super::*;
+    /// Lexes every case of `./testcases` selected by `case_str` and panics if
+    /// any of them contains unrecognized tokens. A missing case, an unreadable
+    /// file or a lexer failure panics immediately.
+    fn test_case(case_str: &str) {
+        let case_sequence = NumberSequence::<usize>::from_str(case_str).unwrap();
+        let case_list = CaseList::from_dir(Path::new("./testcases")).unwrap();
+        let error_case_cnt = case_sequence
+            .filter(|&case_no| {
+                let case_path = case_list.get_case_path(case_no).unwrap();
+                println!("{}", case_path.display());
+                let mut buf_reader = std::io::BufReader::new(File::open(case_path).unwrap());
+                let has_errors = Lexer::parse(&mut buf_reader).unwrap().has_errors();
+                if has_errors {
+                    eprintln!("Case {} has error", case_list.get_case_name(case_no).unwrap());
+                }
+                has_errors
+            })
+            .count();
+        if error_case_cnt > 0 {
+            panic!("Found {} cases with errors", error_case_cnt);
+        }
+    }
+    #[test]
+    fn test_expr() {
+        test_case("0-3,14-25");
+    }
+}
@@ -0,0 +1,2 @@
+pub mod types;
+mod lexer;
@@ -0,0 +1,41 @@
+use strum::EnumString;
+
+
+/// A lexed token: what it is (`value`) and where it was found (`span`).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token {
+    pub value: TokenValue,
+    pub span: Span,
+}
+
+/// Location of a token in the source text.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Span {
+    // Line of the token; the lexer starts counting at 1.
+    pub line: usize,
+    // Column of the token's first character; the lexer starts counting at 1.
+    pub column: usize,
+    // Number of characters (`char`s, not bytes) the token covers.
+    pub length: usize,
+}
+
+/// The kind of a token, together with its payload where it has one.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TokenValue {
+    // Literals and names.
+    IntLit(i64), // TODO: more literal types
+    Ident(String),
+    TypeIdent(TypeIdent),
+    
+    // Arithmetic operators: + - * / %
+    Plus, Minus, Star, Slash, Percent,
+    // Assignment and comparison: = == != < <= > >=
+    Equal, DoubleEqual, NotEqual, Less, LessEqual, Greater, GreaterEqual,
+
+    // Delimiters and separators: ( ) { } , ;
+    LParen, RParen,
+    LBrace, RBrace,
+    Comma, Semicolon,
+
+    // Keywords.
+    If, Else, While, Return, Break, Continue,
+
+    // Presumably an end-of-input marker -- TODO confirm where it is emitted.
+    Eof,
+    // A character that no token rule accepted.
+    Unrecognized(char),
+}
+/// Built-in type names. The `EnumString` derive provides `FromStr`, using the
+/// `serialize` string of each variant as its source spelling.
+#[derive(Debug, Clone, PartialEq, Eq, EnumString)]
+pub enum TypeIdent {
+    #[strum(serialize = "int")]
+    Int,
+    #[strum(serialize = "void")]
+    Void,
+}
@@ -0,0 +1,6 @@
+mod frontend;
+mod ast;
+mod utils;
+/// Program entry point; at this stage it only prints a greeting.
+fn main() {
+    println!("Hello, world!");
+}
@@ -0,0 +1,38 @@
+use std::path::PathBuf;
+use std::{collections::BTreeMap, path::Path};
+use std::io;
+/// Index of the numbered `.c` files found directly inside one directory.
+///
+/// A file is indexed when its name ends with `.c` and the text before its
+/// first `_` parses as a `usize` (for example `12_loops.c` gets index 12).
+pub struct CaseList {
+    index_map: BTreeMap<usize, String>,
+    base_path: PathBuf
+}
+
+impl CaseList {
+    /// Scans `dir` (non-recursively) and indexes every matching file.
+    ///
+    /// Entries whose names are not valid UTF-8 or do not follow the naming
+    /// scheme are skipped. When two files share an index, the one visited
+    /// later replaces the earlier one.
+    ///
+    /// # Errors
+    /// Returns the underlying I/O error if the directory or one of its
+    /// entries cannot be read.
+    pub fn from_dir(dir: &Path) -> io::Result<Self> {
+        let mut index_map = BTreeMap::new();
+        for entry in std::fs::read_dir(dir)? {
+            let file_name = match entry?.file_name().into_string() {
+                Ok(name) => name,
+                Err(_) => continue, // skip non-utf8 file names
+            };
+            if !file_name.ends_with(".c") {
+                continue;
+            }
+            let index = file_name
+                .split_once('_')
+                .and_then(|(prefix, _)| prefix.parse::<usize>().ok());
+            if let Some(index) = index {
+                index_map.insert(index, file_name);
+            }
+        }
+        Ok(Self { index_map, base_path: dir.to_path_buf() })
+    }
+
+    /// File name of the case with the given index, if there is one.
+    pub fn get_case_name(&self, index: usize) -> Option<&String> {
+        self.index_map.get(&index)
+    }
+
+    /// Path (scanned directory joined with the file name) of the case with
+    /// the given index, if there is one.
+    pub fn get_case_path(&self, index: usize) -> Option<PathBuf> {
+        let name = self.get_case_name(index)?;
+        Some(self.base_path.join(name))
+    }
+}
@@ -0,0 +1,2 @@
+pub mod num_sequence;
+pub mod case_list;
@@ -0,0 +1,54 @@
+use std::{path::Iter, str::FromStr};
+
+use num::Integer;
+/// A set of integers described as a union of ranges, iterated range by range.
+///
+/// WARNING: this is intended for use in tests. Overlapping ranges are not
+/// detected and the ranges are not necessarily sorted, so values may repeat
+/// or come out of order.
+pub struct NumberSequence<T: Integer> {
+    // Index into `ranges` of the range currently being iterated.
+    cur_range_index: usize,
+    // Offset of the next value from the start of the current range.
+    delta_in_range: T,
+    // Half-open `(start, end)` pairs: `start` is yielded, `end` is not.
+    ranges: Vec<(T, T)>,
+}
+
+impl<T: Integer + Copy + FromStr> NumberSequence<T> {
+    /// Parses a comma-separated list of groups such as `"0-3,7,14-25"`.
+    ///
+    /// Each group is either a single number `n`, which selects just `n`, or
+    /// `a-b`, which is stored as the range from `a` up to but NOT including
+    /// `b`. Empty groups are ignored. Returns `None` if any group cannot be
+    /// parsed.
+    pub fn from_str(s: &str) -> Option<Self> {
+        let mut ranges = Vec::new();
+        for group in s.split(',').filter(|group| !group.is_empty()) {
+            let range = match group.parse::<T>() {
+                Ok(single) => (single, single + T::one()),
+                Err(_) => {
+                    let (start_str, end_str) = group.split_once('-')?;
+                    (start_str.parse::<T>().ok()?, end_str.parse::<T>().ok()?)
+                }
+            };
+            ranges.push(range);
+        }
+        Some(Self { cur_range_index: 0, delta_in_range: T::zero(), ranges })
+    }
+}
+
+impl<T: Integer + Copy> Iterator for NumberSequence<T> {
+    type Item = T;
+    /// Yields the next value of the current range, moving on to the following
+    /// ranges (and skipping empty ones) until every range is exhausted.
+    fn next(&mut self) -> Option<Self::Item> {
+        while let Some(&(start, end)) = self.ranges.get(self.cur_range_index) {
+            let candidate = start + self.delta_in_range;
+            if candidate < end {
+                self.delta_in_range = self.delta_in_range + T::one();
+                return Some(candidate);
+            }
+            // Current range is used up: restart the offset in the next one.
+            self.cur_range_index += 1;
+            self.delta_in_range = T::zero();
+        }
+        None
+    }
+}