
Begin refactoring the lexer

Jeff 2024-11-07 17:59:28 -05:00
parent bb345a7938
commit b9ded3ea78
4 changed files with 65 additions and 260 deletions

View File

@@ -87,256 +87,10 @@ impl<'src> Lexer<'src> {
     pub fn next_token(&mut self) -> Result<(Token<'src>, Span), LexError> {
         self.skip_whitespace();
 
-        let (token, span) = if let Some(c) = self.peek_char() {
-            match c {
-                '0'..='9' => self.lex_numeric()?,
-                '-' => {
-                    let second_char = self.peek_second_char();
-                    if let Some('=') = second_char {
-                        self.position += 2;
-                        (Token::MinusEqual, Span(self.position - 2, self.position))
-                    } else if let Some('>') = second_char {
-                        self.position += 2;
-                        (Token::ArrowThin, Span(self.position - 2, self.position))
-                    } else if let Some('0'..='9') = second_char {
-                        self.lex_numeric()?
-                    } else if "-Infinity" == self.peek_chars(9) {
-                        self.position += 9;
-                        (
-                            Token::Float("-Infinity"),
-                            Span(self.position - 9, self.position),
-                        )
-                    } else {
-                        self.position += 1;
-                        (Token::Minus, Span(self.position - 1, self.position))
-                    }
-                }
-                'a'..='z' | 'A'..='Z' => self.lex_alphanumeric()?,
-                '"' => self.lex_string()?,
-                '\'' => {
-                    self.position += 1;
-                    if let Some(c) = self.peek_char() {
-                        self.position += 1;
-                        let peek = self.peek_char();
-                        if let Some('\'') = peek {
-                            self.position += 1;
-                            (Token::Character(c), Span(self.position - 3, self.position))
-                        } else {
-                            return Err(LexError::ExpectedCharacter {
-                                expected: '\'',
-                                actual: peek.unwrap_or('\0'),
-                                position: self.position,
-                            });
-                        }
-                    } else {
-                        return Err(LexError::UnexpectedEndOfFile {
-                            position: self.position,
-                        });
-                    }
-                }
-                '+' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::PlusEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Plus, Span(self.position - 1, self.position))
-                    }
-                }
-                '*' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::StarEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Star, Span(self.position - 1, self.position))
-                    }
-                }
-                '(' => {
-                    self.position += 1;
-                    (
-                        Token::LeftParenthesis,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                ')' => {
-                    self.position += 1;
-                    (
-                        Token::RightParenthesis,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                '=' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::DoubleEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Equal, Span(self.position - 1, self.position))
-                    }
-                }
-                '[' => {
-                    self.position += 1;
-                    (
-                        Token::LeftSquareBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                ']' => {
-                    self.position += 1;
-                    (
-                        Token::RightSquareBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                ',' => {
-                    self.position += 1;
-                    (Token::Comma, Span(self.position - 1, self.position))
-                }
-                '.' => {
-                    if let Some('.') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::DoubleDot, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Dot, Span(self.position - 1, self.position))
-                    }
-                }
-                '>' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::GreaterEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Greater, Span(self.position - 1, self.position))
-                    }
-                }
-                '<' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::LessEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Less, Span(self.position - 1, self.position))
-                    }
-                }
-                '{' => {
-                    self.position += 1;
-                    (
-                        Token::LeftCurlyBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                '}' => {
-                    self.position += 1;
-                    (
-                        Token::RightCurlyBrace,
-                        Span(self.position - 1, self.position),
-                    )
-                }
-                '/' => {
-                    if let Some('=') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::SlashEqual, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        (Token::Slash, Span(self.position - 1, self.position))
-                    }
-                }
-                '%' => {
-                    self.position += 1;
-                    (Token::Percent, Span(self.position - 1, self.position))
-                }
-                '&' => {
-                    if let Some('&') = self.peek_second_char() {
-                        self.position += 2;
-                        (
-                            Token::DoubleAmpersand,
-                            Span(self.position - 2, self.position),
-                        )
-                    } else {
-                        self.position += 1;
-                        return Err(LexError::UnexpectedCharacter {
-                            actual: c,
-                            position: self.position,
-                        });
-                    }
-                }
-                ';' => {
-                    self.position += 1;
-                    (Token::Semicolon, Span(self.position - 1, self.position))
-                }
-                '|' => {
-                    if let Some('|') = self.peek_second_char() {
-                        self.position += 2;
-                        (Token::DoublePipe, Span(self.position - 2, self.position))
-                    } else {
-                        self.position += 1;
-                        return Err(LexError::UnexpectedCharacter {
-                            actual: c,
-                            position: self.position,
-                        });
-                    }
-                }
-                '!' => {
-                    self.position += 1;
-                    if let Some('=') = self.peek_char() {
-                        self.position += 1;
-                        (Token::BangEqual, Span(self.position - 2, self.position))
-                    } else {
-                        (Token::Bang, Span(self.position - 1, self.position))
-                    }
-                }
-                ':' => {
-                    self.position += 1;
-                    (Token::Colon, Span(self.position - 1, self.position))
-                }
-                _ => {
-                    return Err(LexError::UnexpectedCharacter {
-                        actual: c,
-                        position: self.position,
-                    });
-                }
-            }
+        let (token, span) = if let Some(character) = self.peek_char() {
+            let lexer = LexRule::from(&character).lexer;
+
+            lexer(self)?
         } else {
             (Token::Eof, Span(self.position, self.position))
         };
@@ -566,6 +320,23 @@ impl<'src> Lexer<'src> {
     }
 }
+
+type LexerFn<'src> = fn(&mut Lexer<'src>) -> Result<(Token<'src>, Span), LexError>;
+
+pub struct LexRule<'src> {
+    lexer: LexerFn<'src>,
+}
+
+impl<'src> From<&char> for LexRule<'src> {
+    fn from(char: &char) -> Self {
+        match char {
+            '0'..='9' => LexRule {
+                lexer: Lexer::lex_numeric,
+            },
+            _ => panic!("Invalid character"),
+        }
+    }
+}
 
 #[derive(Debug, PartialEq, Clone)]
 pub enum LexError {
     ExpectedAsciiHexDigit {

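So far only the numeric rule is wired into LexRule::from, with panic! as a placeholder for every other character. Below is a minimal standalone sketch of where this table-driven dispatch can head: a hypothetical one-character rule alongside the numeric one, and a fallible lookup in place of the panic!. The stand-in Token, Span, and LexError types are illustrative, not the crate's real ones.

// Self-contained illustration; all types here are simplified stand-ins.
#[derive(Debug, PartialEq)]
enum Token {
    Integer(String),
    Plus,
}

#[derive(Debug, PartialEq)]
struct Span(usize, usize);

#[derive(Debug)]
struct LexError {
    position: usize,
}

struct Lexer<'src> {
    source: &'src str,
    position: usize,
}

// Same shape as the commit's LexerFn: every rule is a plain function
// from lexer state to one token.
type LexerFn<'src> = fn(&mut Lexer<'src>) -> Result<(Token, Span), LexError>;

struct LexRule<'src> {
    lexer: LexerFn<'src>,
}

impl<'src> Lexer<'src> {
    fn peek_char(&self) -> Option<char> {
        self.source[self.position..].chars().next()
    }

    // Rule body for '0'..='9': consume a run of ASCII digits.
    fn lex_numeric(&mut self) -> Result<(Token, Span), LexError> {
        let start = self.position;
        while matches!(self.peek_char(), Some('0'..='9')) {
            self.position += 1;
        }
        let digits = self.source[start..self.position].to_string();
        Ok((Token::Integer(digits), Span(start, self.position)))
    }

    // Rule body for a one-character operator.
    fn lex_plus(&mut self) -> Result<(Token, Span), LexError> {
        self.position += 1;
        Ok((Token::Plus, Span(self.position - 1, self.position)))
    }
}

// A fallible lookup instead of panic!("Invalid character"), so unknown
// characters surface as a LexError just like in next_token.
fn rule_for<'src>(character: char, position: usize) -> Result<LexRule<'src>, LexError> {
    match character {
        '0'..='9' => Ok(LexRule { lexer: Lexer::lex_numeric }),
        '+' => Ok(LexRule { lexer: Lexer::lex_plus }),
        _ => Err(LexError { position }),
    }
}

fn main() -> Result<(), LexError> {
    let mut lexer = Lexer { source: "12+34", position: 0 };
    while let Some(character) = lexer.peek_char() {
        let rule = rule_for(character, lexer.position)?;
        let (token, span) = (rule.lexer)(&mut lexer)?;
        println!("{token:?} at {span:?}");
    }
    Ok(())
}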
View File

@@ -26,7 +26,7 @@ pub use crate::native_function::{NativeFunction, NativeFunctionError};
 pub use crate::operation::Operation;
 pub use crate::optimizer::{optimize, Optimizer};
 pub use crate::r#type::{EnumType, FunctionType, RangeableType, StructType, Type, TypeConflict};
-pub use crate::token::{Token, TokenKind, TokenOwned};
+pub use crate::token::{output_token_list, Token, TokenKind, TokenOwned};
 pub use crate::value::{ConcreteValue, Function, Value, ValueError};
 pub use crate::vm::{run, Vm, VmError};

View File

@@ -1,8 +1,26 @@
 //! Token, TokenOwned and TokenKind types.
-use std::fmt::{self, Display, Formatter};
+use std::{
+    fmt::{self, Display, Formatter},
+    io::Write,
+};
 
 use serde::{Deserialize, Serialize};
 
 use crate::Span;
 
+pub fn output_token_list<W: Write>(tokens: &[(Token, Span)], writer: &mut W) {
+    const HEADER: [&str; 2] = ["TOKEN        POSITION", "------------ ----------"];
+
+    writeln!(writer, "{}", HEADER[0]).unwrap();
+    writeln!(writer, "{}", HEADER[1]).unwrap();
+
+    for (token, position) in tokens {
+        let token = token.to_string();
+
+        writeln!(writer, "{token:<12} {position}").unwrap();
+    }
+}
+
 macro_rules! define_tokens {
     ($($variant:ident $(($data_type:ty))?),+ $(,)?) => {
         /// Source token.

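Because output_token_list is generic over io::Write, the same table can go to stdout or be captured for inspection in a test. A short usage sketch, assuming lex returns the Ok(Vec<(Token, Span)>) that the CLI's match lex(source) arm below implies:

use dust_lang::{lex, output_token_list};

fn main() {
    // `lex` is assumed to yield token/span pairs on success, as the CLI
    // change in this commit suggests.
    if let Ok(tokens) = lex("1 + 2") {
        // Any io::Write works: capture the table in a buffer instead of
        // writing straight to stdout.
        let mut buffer = Vec::new();
        output_token_list(&tokens, &mut buffer);

        print!("{}", String::from_utf8_lossy(&buffer));
    }
}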
View File

@ -1,8 +1,11 @@
use std::{fs::read_to_string, io::Write};
use std::{
fs::read_to_string,
io::{stdout, Write},
};
use clap::Parser;
use colored::Colorize;
use dust_lang::{compile, format, run};
use dust_lang::{compile, format, lex, output_token_list, run};
use log::{Level, LevelFilter};
#[derive(Parser)]
@@ -11,7 +14,7 @@ struct Cli {
     #[arg(short, long)]
     command: Option<String>,
 
-    /// Whether to output formatted source code
+    /// Whether to output formatted source code instead of running the program
     #[arg(short, long)]
     format: bool,
@@ -23,7 +26,7 @@ struct Cli {
     #[arg(long)]
     format_colored: Option<bool>,
 
-    /// Whether to output the disassembled chunk
+    /// Whether to output the disassembled chunk instead of running the program
     #[arg(short, long)]
     parse: bool,
@@ -31,6 +34,10 @@ struct Cli {
     #[arg(long)]
     style_disassembly: Option<bool>,
 
+    /// Whether to tokenize the source code instead of running the program
+    #[arg(short, long)]
+    tokenize: bool,
+
     /// Log level
     #[arg(short, long)]
     log: Option<LevelFilter>,
@@ -78,11 +85,11 @@ fn main() {
     };
 
     if args.format {
+        log::info!("Formatting source");
+
         let line_numbers = args.format_line_numbers.unwrap_or(true);
         let colored = args.format_colored.unwrap_or(true);
 
-        log::info!("Formatting source");
-
         match format(source, line_numbers, colored) {
             Ok(formatted) => println!("{}", formatted),
             Err(error) => {
@@ -91,11 +98,20 @@ fn main() {
         }
     }
 
-    if args.parse {
-        let styled = args.style_disassembly.unwrap_or(true);
+    if args.tokenize {
+        log::info!("Tokenizing source");
+
+        match lex(source) {
+            Ok(tokens) => output_token_list(&tokens, &mut stdout()),
+            Err(error) => eprintln!("{}", error.report()),
+        }
+    }
+
+    if args.parse {
+        log::info!("Parsing source");
+
+        let styled = args.style_disassembly.unwrap_or(true);
 
         match compile(source) {
             Ok(chunk) => {
                 let disassembly = chunk
@@ -112,7 +128,7 @@ fn main() {
         }
     }
 
-    if args.format || args.parse {
+    if args.format || args.tokenize || args.parse {
         return;
     }
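
Taken together, the CLI now treats tokenizing as one more inspection mode that short-circuits the run step. A compressed, self-contained sketch of that mode-flag pattern follows; the field names mirror the diff, the bodies are stand-ins, and clap's "derive" feature is assumed.

use clap::Parser;

#[derive(Parser)]
struct Cli {
    /// Source code to evaluate (stand-in for the real options)
    #[arg(short, long)]
    command: Option<String>,

    /// Whether to tokenize the source code instead of running the program
    #[arg(short, long)]
    tokenize: bool,

    /// Whether to output the disassembled chunk instead of running the program
    #[arg(short, long)]
    parse: bool,
}

fn main() {
    let args = Cli::parse();
    let source = args.command.unwrap_or_default();

    if args.tokenize {
        // Stand-in for lex(source) + output_token_list(...).
        println!("TOKEN        POSITION");
    }

    if args.parse {
        // Stand-in for compile(source) + disassembly output.
        println!("disassembly of {source:?}");
    }

    // Any inspection mode returns before the program is run, which is why
    // the final check above grows a new `|| args.tokenize` arm.
    if args.tokenize || args.parse {
        return;
    }

    println!("running {source:?}");
}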