From b9ded3ea783fac19e856d003da89629d8f7c3323 Mon Sep 17 00:00:00 2001
From: Jeff
Date: Thu, 7 Nov 2024 17:59:28 -0500
Subject: [PATCH] Begin refactoring the lexer

---
 dust-lang/src/lexer.rs | 269 +++--------------------------------------
 dust-lang/src/lib.rs   |   2 +-
 dust-lang/src/token.rs |  20 ++-
 dust-shell/src/main.rs |  34 ++++--
 4 files changed, 65 insertions(+), 260 deletions(-)

diff --git a/dust-lang/src/lexer.rs b/dust-lang/src/lexer.rs
index 7b16a96..3967827 100644
--- a/dust-lang/src/lexer.rs
+++ b/dust-lang/src/lexer.rs
@@ -87,256 +87,10 @@ impl<'src> Lexer<'src> {
     pub fn next_token(&mut self) -> Result<(Token<'src>, Span), LexError> {
         self.skip_whitespace();
 
-        let (token, span) = if let Some(c) = self.peek_char() {
-            match c {
-                '0'..='9' => self.lex_numeric()?,
-                '-' => {
-                    let second_char = self.peek_second_char();
+        let (token, span) = if let Some(character) = self.peek_char() {
+            let lexer = LexRule::from(&character).lexer;
 
-                    if let Some('=') = second_char {
-                        self.position += 2;
-
-                        (Token::MinusEqual, Span(self.position - 2, self.position))
-                    } else if let Some('>') = second_char {
-                        self.position += 2;
-
-                        (Token::ArrowThin, Span(self.position - 2, self.position))
-                    } else if let Some('0'..='9') = second_char {
-                        self.lex_numeric()?
-                    } else if "-Infinity" == self.peek_chars(9) {
-                        self.position += 9;
-
-                        (
-                            Token::Float("-Infinity"),
-                            Span(self.position - 9, self.position),
-                        )
-                    } else {
-                        self.position += 1;
-
-                        (Token::Minus, Span(self.position - 1, self.position))
-                    }
-                }
-                'a'..='z' | 'A'..='Z' => self.lex_alphanumeric()?,
-                '"' => self.lex_string()?,
-                '\'' => {
-                    self.position += 1;
-
-                    if let Some(c) = self.peek_char() {
-                        self.position += 1;
-
-                        let peek = self.peek_char();
-
-                        if let Some('\'') = peek {
-                            self.position += 1;
-
-                            (Token::Character(c), Span(self.position - 3, self.position))
-                        } else {
-                            return Err(LexError::ExpectedCharacter {
-                                expected: '\'',
-                                actual: peek.unwrap_or('\0'),
-                                position: self.position,
-                            });
-                        }
-                    } else {
-                        return Err(LexError::UnexpectedEndOfFile {
-                            position: self.position,
-                        });
-                    }
-                }
-                '+' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::PlusEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Plus, Span(self.position - 1, self.position))
-                    }
-                }
-                '*' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::StarEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Star, Span(self.position - 1, self.position))
-                    }
-                }
-                '(' => {
-                    self.position += 1;
-
-                    (
-                        Token::LeftParenthesis,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                ')' => {
-                    self.position += 1;
-
-                    (
-                        Token::RightParenthesis,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                '=' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::DoubleEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Equal, Span(self.position - 1, self.position))
-                    }
-                }
-                '[' => {
-                    self.position += 1;
-
-                    (
-                        Token::LeftSquareBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                ']' => {
-                    self.position += 1;
-
-                    (
-                        Token::RightSquareBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                ',' => {
-                    self.position += 1;
-
-                    (Token::Comma, Span(self.position - 1, self.position))
-                }
-                '.' => {
-                    if let Some('.') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::DoubleDot, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Dot, Span(self.position - 1, self.position))
-                    }
-                }
-                '>' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::GreaterEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Greater, Span(self.position - 1, self.position))
-                    }
-                }
-                '<' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::LessEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Less, Span(self.position - 1, self.position))
-                    }
-                }
-                '{' => {
-                    self.position += 1;
-
-                    (
-                        Token::LeftCurlyBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                '}' => {
-                    self.position += 1;
-
-                    (
-                        Token::RightCurlyBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                '/' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::SlashEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        (Token::Slash, Span(self.position - 1, self.position))
-                    }
-                }
-                '%' => {
-                    self.position += 1;
-
-                    (Token::Percent, Span(self.position - 1, self.position))
-                }
-                '&' => {
-                    if let Some('&') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (
-                            Token::DoubleAmpersand,
-                            Span(self.position - 2, self.position),
-                        )
-                    } else {
-                        self.position += 1;
-
-                        return Err(LexError::UnexpectedCharacter {
-                            actual: c,
-                            position: self.position,
-                        });
-                    }
-                }
-                ';' => {
-                    self.position += 1;
-
-                    (Token::Semicolon, Span(self.position - 1, self.position))
-                }
-                '|' => {
-                    if let Some('|') = self.peek_second_char() {
-                        self.position += 2;
-
-                        (Token::DoublePipe, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-
-                        return Err(LexError::UnexpectedCharacter {
-                            actual: c,
-                            position: self.position,
-                        });
-                    }
-                }
-                '!' => {
-                    self.position += 1;
-
-                    if let Some('=') = self.peek_char() {
-                        self.position += 1;
-
-                        (Token::BangEqual, Span(self.position - 2, self.position))
-                    } else {
-                        (Token::Bang, Span(self.position - 1, self.position))
-                    }
-                }
-                ':' => {
-                    self.position += 1;
-
-                    (Token::Colon, Span(self.position - 1, self.position))
-                }
-                _ => {
-                    return Err(LexError::UnexpectedCharacter {
-                        actual: c,
-                        position: self.position,
-                    });
-                }
-            }
+            lexer(self)?
         } else {
             (Token::Eof, Span(self.position, self.position))
         };
@@ -566,6 +320,23 @@ impl<'src> Lexer<'src> {
     }
 }
 
+type LexerFn<'src> = fn(&mut Lexer<'src>) -> Result<(Token<'src>, Span), LexError>;
+
+pub struct LexRule<'src> {
+    lexer: LexerFn<'src>,
+}
+
+impl<'src> From<&char> for LexRule<'src> {
+    fn from(char: &char) -> Self {
+        match char {
+            '0'..='9' => LexRule {
+                lexer: Lexer::lex_numeric,
+            },
+            _ => panic!("Invalid character"),
+        }
+    }
+}
+
 #[derive(Debug, PartialEq, Clone)]
 pub enum LexError {
     ExpectedAsciiHexDigit {
diff --git a/dust-lang/src/lib.rs b/dust-lang/src/lib.rs
index c25a776..6640c84 100644
--- a/dust-lang/src/lib.rs
+++ b/dust-lang/src/lib.rs
@@ -26,7 +26,7 @@ pub use crate::native_function::{NativeFunction, NativeFunctionError};
 pub use crate::operation::Operation;
 pub use crate::optimizer::{optimize, Optimizer};
 pub use crate::r#type::{EnumType, FunctionType, RangeableType, StructType, Type, TypeConflict};
-pub use crate::token::{Token, TokenKind, TokenOwned};
+pub use crate::token::{output_token_list, Token, TokenKind, TokenOwned};
 pub use crate::value::{ConcreteValue, Function, Value, ValueError};
 pub use crate::vm::{run, Vm, VmError};
 
diff --git a/dust-lang/src/token.rs b/dust-lang/src/token.rs
index 8e9b319..4f1333b 100644
--- a/dust-lang/src/token.rs
+++ b/dust-lang/src/token.rs
@@ -1,8 +1,26 @@
 //! Token, TokenOwned and TokenKind types.
-use std::fmt::{self, Display, Formatter};
+use std::{
+    fmt::{self, Display, Formatter},
+    io::Write,
+};
 
 use serde::{Deserialize, Serialize};
 
+use crate::Span;
+
+pub fn output_token_list<W: Write>(tokens: &[(Token, Span)], writer: &mut W) {
+    const HEADER: [&str; 2] = ["TOKEN        POSITION  ", "------------ ----------"];
+
+    writeln!(writer, "{}", HEADER[0]).unwrap();
+    writeln!(writer, "{}", HEADER[1]).unwrap();
+
+    for (token, position) in tokens {
+        let token = token.to_string();
+
+        writeln!(writer, "{token:<12} {position}").unwrap();
+    }
+}
+
 macro_rules! define_tokens {
     ($($variant:ident $(($data_type:ty))?),+ $(,)?) => {
         /// Source token.
diff --git a/dust-shell/src/main.rs b/dust-shell/src/main.rs
index 2bdc3fe..d8f4cd6 100644
--- a/dust-shell/src/main.rs
+++ b/dust-shell/src/main.rs
@@ -1,8 +1,11 @@
-use std::{fs::read_to_string, io::Write};
+use std::{
+    fs::read_to_string,
+    io::{stdout, Write},
+};
 
 use clap::Parser;
 use colored::Colorize;
-use dust_lang::{compile, format, run};
+use dust_lang::{compile, format, lex, output_token_list, run};
 use log::{Level, LevelFilter};
 
 #[derive(Parser)]
@@ -11,7 +14,7 @@ struct Cli {
     #[arg(short, long)]
     command: Option<String>,
 
-    /// Whether to output formatted source code
+    /// Whether to output formatted source code instead of running the program
     #[arg(short, long)]
     format: bool,
 
@@ -23,7 +26,7 @@ struct Cli {
     #[arg(long)]
     format_colored: Option<bool>,
 
-    /// Whether to output the disassembled chunk
+    /// Whether to output the disassembled chunk instead of running the program
     #[arg(short, long)]
     parse: bool,
 
@@ -31,6 +34,10 @@ struct Cli {
     #[arg(long)]
     style_disassembly: Option<bool>,
 
+    /// Whether to tokenize the source code instead of running the program
+    #[arg(short, long)]
+    tokenize: bool,
+
     /// Log level
     #[arg(short, long)]
     log: Option<LevelFilter>,
@@ -78,11 +85,11 @@ fn main() {
     };
 
     if args.format {
+        log::info!("Formatting source");
+
         let line_numbers = args.format_line_numbers.unwrap_or(true);
         let colored = args.format_colored.unwrap_or(true);
 
-        log::info!("Formatting source");
-
         match format(source, line_numbers, colored) {
             Ok(formatted) => println!("{}", formatted),
             Err(error) => {
@@ -91,11 +98,20 @@ fn main() {
         }
     }
 
-    if args.parse {
-        let styled = args.style_disassembly.unwrap_or(true);
-
+    if args.tokenize {
+        log::info!("Tokenizing source");
+
+        match lex(source) {
+            Ok(tokens) => output_token_list(&tokens, &mut stdout()),
+            Err(error) => eprintln!("{}", error.report()),
+        }
+    }
+
+    if args.parse {
         log::info!("Parsing source");
 
+        let styled = args.style_disassembly.unwrap_or(true);
+
         match compile(source) {
             Ok(chunk) => {
                 let disassembly = chunk
@@ -112,7 +128,7 @@ fn main() {
         }
     }
 
-    if args.format || args.parse {
+    if args.format || args.tokenize || args.parse {
         return;
     }
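
As committed, LexRule::from panics for every character except '0'..='9', so
only numeric tokens flow through the new dispatch until the remaining match
arms are ported over. A sketch of where the table seems headed, reusing the
lex_alphanumeric and lex_string methods that already exist in lexer.rs and
surfacing the UnexpectedCharacter error the old match produced instead of
panicking; the extra arms are illustrative, not part of this commit:

impl<'src> From<&char> for LexRule<'src> {
    fn from(character: &char) -> Self {
        match character {
            '0'..='9' => LexRule {
                lexer: Lexer::lex_numeric,
            },
            // lex_alphanumeric and lex_string have the same shape as
            // lex_numeric, so they coerce to LexerFn the same way.
            'a'..='z' | 'A'..='Z' => LexRule {
                lexer: Lexer::lex_alphanumeric,
            },
            '"' => LexRule {
                lexer: Lexer::lex_string,
            },
            // A non-capturing closure also coerces to a fn pointer, which
            // lets the catch-all return an error instead of panicking.
            _ => LexRule {
                lexer: |lexer| {
                    Err(LexError::UnexpectedCharacter {
                        actual: lexer.peek_char().unwrap_or('\0'),
                        position: lexer.position,
                    })
                },
            },
        }
    }
}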
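
The new --tokenize flag funnels through output_token_list, which writes a
two-column table (the token text padded to 12 characters, then its span) to
any Write sink. A usage sketch; it assumes lex returns Vec<(Token, Span)> as
its call site in main.rs suggests, and it sticks to numeric input because
digits are the only characters LexRule accepts as of this commit:

use std::io::stdout;

use dust_lang::{lex, output_token_list};

fn main() {
    // Tokenize a source string, then render the table to stdout.
    let tokens = lex("42").unwrap();

    output_token_list(&tokens, &mut stdout());
}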