1
0
dust/dust-lang/src/lexer.rs
2024-06-20 19:25:46 -04:00

555 lines
16 KiB
Rust

use std::{
f64::{INFINITY, NAN, NEG_INFINITY},
fmt::{self, Display, Formatter},
};
use chumsky::prelude::*;
use crate::error::DustError;
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Token<'src> {
Boolean(bool),
Comment(&'src str),
Integer(i64),
Float(f64),
String(&'src str),
Identifier(&'src str),
Symbol(Symbol),
Keyword(Keyword),
}
impl<'src> Display for Token<'src> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Token::Boolean(boolean) => write!(f, "{boolean}"),
Token::Comment(comment) => write!(f, "// {comment}"),
Token::Integer(integer) => write!(f, "{integer}"),
Token::Float(float) => write!(f, "{float}"),
Token::String(string) => write!(f, "{string}"),
Token::Identifier(string) => write!(f, "{string}"),
Token::Symbol(control) => write!(f, "{control}"),
Token::Keyword(keyword) => write!(f, "{keyword}"),
}
}
}
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Keyword {
Any,
As,
Async,
Bool,
Break,
Else,
Enum,
Float,
Fn,
Int,
If,
JsonParse,
Length,
List,
Map,
None,
Range,
ReadFile,
ReadLine,
Sleep,
Struct,
Str,
Type,
Loop,
While,
WriteLine,
}
impl Display for Keyword {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Keyword::Any => write!(f, "any"),
Keyword::As => write!(f, "as"),
Keyword::Async => write!(f, "async"),
Keyword::Bool => write!(f, "bool"),
Keyword::Break => write!(f, "break"),
Keyword::Else => write!(f, "else"),
Keyword::Enum => write!(f, "enum"),
Keyword::Float => write!(f, "float"),
Keyword::Fn => write!(f, "fn"),
Keyword::Int => write!(f, "int"),
Keyword::If => write!(f, "if"),
Keyword::List => write!(f, "list"),
Keyword::Map => write!(f, "map"),
Keyword::None => write!(f, "none"),
Keyword::Range => write!(f, "range"),
Keyword::Struct => write!(f, "struct"),
Keyword::Str => write!(f, "str"),
Keyword::Loop => write!(f, "loop"),
Keyword::While => write!(f, "while"),
Keyword::Type => write!(f, "type"),
Keyword::JsonParse => write!(f, "JSON_PARSE"),
Keyword::Length => write!(f, "LENGTH"),
Keyword::ReadFile => write!(f, "READ_FILE"),
Keyword::ReadLine => write!(f, "READ_LINE"),
Keyword::Sleep => write!(f, "SLEEP"),
Keyword::WriteLine => write!(f, "WRITE_LINE"),
}
}
}
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Symbol {
Plus,
PlusEquals,
DoubleAmpersand,
Colon,
Comma,
CurlyClose,
CurlyOpen,
Slash,
Dollar,
Dot,
DoubleColon,
DoubleDot,
DoubleEqual,
DoubleUnderscore,
Equal,
FatArrow,
Greater,
GreaterOrEqual,
Less,
LessOrEqual,
Percent,
Asterisk,
Exclamation,
NotEqual,
DoublePipe,
ParenClose,
ParenOpen,
Pipe,
Semicolon,
SkinnyArrow,
SquareClose,
SquareOpen,
MinusEqual,
Minus,
}
impl Display for Symbol {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Symbol::Asterisk => write!(f, "*"),
Symbol::Colon => write!(f, ":"),
Symbol::Comma => write!(f, ","),
Symbol::CurlyClose => write!(f, "}}"),
Symbol::CurlyOpen => write!(f, "{{"),
Symbol::Dollar => write!(f, "$"),
Symbol::Dot => write!(f, "."),
Symbol::DoubleAmpersand => write!(f, "&&"),
Symbol::DoubleColon => write!(f, "::"),
Symbol::DoubleDot => write!(f, ".."),
Symbol::DoubleEqual => write!(f, "=="),
Symbol::DoublePipe => write!(f, "||"),
Symbol::DoubleUnderscore => write!(f, "__"),
Symbol::Equal => write!(f, "="),
Symbol::Exclamation => write!(f, "!"),
Symbol::FatArrow => write!(f, "=>"),
Symbol::Greater => write!(f, ">"),
Symbol::GreaterOrEqual => write!(f, ">="),
Symbol::Less => write!(f, "<"),
Symbol::LessOrEqual => write!(f, "<="),
Symbol::Minus => write!(f, "-"),
Symbol::MinusEqual => write!(f, "-="),
Symbol::NotEqual => write!(f, "!="),
Symbol::ParenClose => write!(f, ")"),
Symbol::ParenOpen => write!(f, "("),
Symbol::Percent => write!(f, "%"),
Symbol::Pipe => write!(f, "|"),
Symbol::Plus => write!(f, "+"),
Symbol::PlusEquals => write!(f, "+="),
Symbol::Semicolon => write!(f, ";"),
Symbol::SkinnyArrow => write!(f, "->"),
Symbol::Slash => write!(f, "/"),
Symbol::SquareClose => write!(f, "]"),
Symbol::SquareOpen => write!(f, "["),
}
}
}
pub fn lex<'src>(source: &'src str) -> Result<Vec<(Token<'src>, SimpleSpan)>, Vec<DustError>> {
lexer()
.parse(source)
.into_result()
.map_err(|errors| errors.into_iter().map(|error| error.into()).collect())
}
pub fn lexer<'src>() -> impl Parser<
'src,
&'src str,
Vec<(Token<'src>, SimpleSpan<usize>)>,
extra::Err<Rich<'src, char, SimpleSpan<usize>>>,
> {
let line_comment = just("//")
.ignore_then(
none_of('\n')
.repeated()
.to_slice()
.map(|text: &str| Token::Comment(text.trim())),
)
.then_ignore(just('\n').or_not());
let multi_line_comment = just("/*")
.ignore_then(
none_of('*')
.repeated()
.to_slice()
.map(|text: &str| Token::Comment(text.trim())),
)
.then_ignore(just("*/"));
let boolean = choice((
just("true").to(Token::Boolean(true)),
just("false").to(Token::Boolean(false)),
));
let float_numeric = just('-')
.or_not()
.then(text::int(10))
.then(just('.').then(text::digits(10)))
.then(just('e').then(text::digits(10)).or_not())
.to_slice()
.map(|text: &str| Token::Float(text.parse().unwrap()));
let float = choice((
float_numeric,
just("Infinity").to(Token::Float(INFINITY)),
just("-Infinity").to(Token::Float(NEG_INFINITY)),
just("NaN").to(Token::Float(NAN)),
));
let integer = just('-')
.or_not()
.then(text::int(10))
.to_slice()
.map(|text: &str| {
let integer = text.parse().unwrap();
Token::Integer(integer)
});
let delimited_string = |delimiter| {
just(delimiter)
.then(none_of(delimiter).repeated())
.then(just(delimiter))
.to_slice()
.map(|text: &str| Token::String(&text[1..text.len() - 1]))
};
let string = choice((
delimited_string('\''),
delimited_string('"'),
delimited_string('`'),
));
let identifier_and_keyword = text::ident().map(|text: &str| match text {
"any" => Token::Keyword(Keyword::Any),
"async" => Token::Keyword(Keyword::Async),
"as" => Token::Keyword(Keyword::As),
"bool" => Token::Keyword(Keyword::Bool),
"break" => Token::Keyword(Keyword::Break),
"enum" => Token::Keyword(Keyword::Enum),
"else" => Token::Keyword(Keyword::Else),
"float" => Token::Keyword(Keyword::Float),
"fn" => Token::Keyword(Keyword::Fn),
"int" => Token::Keyword(Keyword::Int),
"if" => Token::Keyword(Keyword::If),
"list" => Token::Keyword(Keyword::List),
"map" => Token::Keyword(Keyword::Map),
"none" => Token::Keyword(Keyword::None),
"range" => Token::Keyword(Keyword::Range),
"struct" => Token::Keyword(Keyword::Struct),
"str" => Token::Keyword(Keyword::Str),
"type" => Token::Keyword(Keyword::Type),
"loop" => Token::Keyword(Keyword::Loop),
"while" => Token::Keyword(Keyword::While),
"JSON_PARSE" => Token::Keyword(Keyword::JsonParse),
"LENGTH" => Token::Keyword(Keyword::Length),
"READ_FILE" => Token::Keyword(Keyword::ReadFile),
"READ_LINE" => Token::Keyword(Keyword::ReadLine),
"SLEEP" => Token::Keyword(Keyword::Sleep),
"WRITE_LINE" => Token::Keyword(Keyword::WriteLine),
_ => Token::Identifier(text),
});
let symbol = choice([
just("!=").to(Token::Symbol(Symbol::NotEqual)),
just("!").to(Token::Symbol(Symbol::Exclamation)),
just("$").to(Token::Symbol(Symbol::Dollar)),
just("%").to(Token::Symbol(Symbol::Percent)),
just("&&").to(Token::Symbol(Symbol::DoubleAmpersand)),
just("(").to(Token::Symbol(Symbol::ParenOpen)),
just(")").to(Token::Symbol(Symbol::ParenClose)),
just("*").to(Token::Symbol(Symbol::Asterisk)),
just("+=").to(Token::Symbol(Symbol::PlusEquals)),
just("+").to(Token::Symbol(Symbol::Plus)),
just(",").to(Token::Symbol(Symbol::Comma)),
just("->").to(Token::Symbol(Symbol::SkinnyArrow)),
just("-=").to(Token::Symbol(Symbol::MinusEqual)),
just("-").to(Token::Symbol(Symbol::Minus)),
just("..").to(Token::Symbol(Symbol::DoubleDot)),
just(".").to(Token::Symbol(Symbol::Dot)),
just("/").to(Token::Symbol(Symbol::Slash)),
just("::").to(Token::Symbol(Symbol::DoubleColon)),
just(":").to(Token::Symbol(Symbol::Colon)),
just(";").to(Token::Symbol(Symbol::Semicolon)),
just("<=").to(Token::Symbol(Symbol::LessOrEqual)),
just("<").to(Token::Symbol(Symbol::Less)),
just("=>").to(Token::Symbol(Symbol::FatArrow)),
just("==").to(Token::Symbol(Symbol::DoubleEqual)),
just("=").to(Token::Symbol(Symbol::Equal)),
just(">=").to(Token::Symbol(Symbol::GreaterOrEqual)),
just(">").to(Token::Symbol(Symbol::Greater)),
just("[").to(Token::Symbol(Symbol::SquareOpen)),
just("]").to(Token::Symbol(Symbol::SquareClose)),
just("__").to(Token::Symbol(Symbol::DoubleUnderscore)),
just("{").to(Token::Symbol(Symbol::CurlyOpen)),
just("||").to(Token::Symbol(Symbol::DoublePipe)),
just("|").to(Token::Symbol(Symbol::Pipe)),
just("}").to(Token::Symbol(Symbol::CurlyClose)),
]);
choice((
line_comment,
multi_line_comment,
boolean,
float,
integer,
string,
identifier_and_keyword,
symbol,
))
.map_with(|token: Token, state| (token, state.span()))
.padded()
.repeated()
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn line_comment() {
assert_eq!(
lex("// 42").unwrap(),
vec![(Token::Comment("42"), (0..5).into())]
);
assert_eq!(
lex("1// 42//2").unwrap(),
vec![
(Token::Integer(1), (0..1).into()),
(Token::Comment("42//2"), (1..9).into()),
]
);
assert_eq!(
lex("
1
// 42
2
")
.unwrap(),
vec![
(Token::Integer(1), (17..18).into()),
(Token::Comment("42"), (35..41).into()),
(Token::Integer(2), (57..58).into()),
]
);
}
#[test]
fn multi_line_comment() {
assert_eq!(
lex("/* 42 */").unwrap(),
vec![(Token::Comment("42"), (0..8).into())]
);
assert_eq!(
lex("1/* 42//2 */").unwrap(),
vec![
(Token::Integer(1), (0..1).into()),
(Token::Comment("42//2"), (1..12).into()),
]
);
assert_eq!(
lex("
1
/*
42
*/
2
")
.unwrap(),
vec![
(Token::Integer(1), (17..18).into()),
(Token::Comment("42"), (35..79).into()),
(Token::Integer(2), (96..97).into()),
]
);
}
#[test]
fn range() {
assert_eq!(
lex("1..10").unwrap(),
vec![
(Token::Integer(1), (0..1).into()),
(Token::Symbol(Symbol::DoubleDot), (1..3).into()),
(Token::Integer(10), (3..5).into())
]
)
}
#[test]
fn math_operators() {
assert_eq!(
lex("1 + 1").unwrap(),
vec![
(Token::Integer(1), (0..1).into()),
(Token::Symbol(Symbol::Plus), (2..3).into()),
(Token::Integer(1), (4..5).into())
]
)
}
#[test]
fn keywords() {
assert_eq!(lex("int").unwrap()[0].0, Token::Keyword(Keyword::Int))
}
#[test]
fn identifier() {
assert_eq!(lex("x").unwrap()[0].0, Token::Identifier("x"));
assert_eq!(lex("foobar").unwrap()[0].0, Token::Identifier("foobar"));
assert_eq!(lex("HELLO").unwrap()[0].0, Token::Identifier("HELLO"));
}
#[test]
fn r#true() {
assert_eq!(lex("true").unwrap()[0].0, Token::Boolean(true));
}
#[test]
fn r#false() {
assert_eq!(lex("false").unwrap()[0].0, Token::Boolean(false));
}
#[test]
fn positive_float() {
assert_eq!(lex("0.0").unwrap()[0].0, Token::Float(0.0));
assert_eq!(lex("42.0").unwrap()[0].0, Token::Float(42.0));
let max_float = f64::MAX.to_string() + ".0";
assert_eq!(lex(&max_float).unwrap()[0].0, Token::Float(f64::MAX));
let min_positive_float = f64::MIN_POSITIVE.to_string();
assert_eq!(
lex(&min_positive_float).unwrap()[0].0,
Token::Float(f64::MIN_POSITIVE)
);
}
#[test]
fn negative_float() {
assert_eq!(lex("-0.0").unwrap()[0].0, Token::Float(-0.0));
assert_eq!(lex("-42.0").unwrap()[0].0, Token::Float(-42.0));
let min_float = f64::MIN.to_string() + ".0";
assert_eq!(lex(&min_float).unwrap()[0].0, Token::Float(f64::MIN));
let max_negative_float = format!("-{}", f64::MIN_POSITIVE);
assert_eq!(
lex(&max_negative_float).unwrap()[0].0,
Token::Float(-f64::MIN_POSITIVE)
);
}
#[test]
fn other_float() {
assert_eq!(lex("Infinity").unwrap()[0].0, Token::Float(f64::INFINITY));
assert_eq!(
lex("-Infinity").unwrap()[0].0,
Token::Float(f64::NEG_INFINITY)
);
if let Token::Float(float) = &lex("NaN").unwrap()[0].0 {
assert!(float.is_nan());
} else {
panic!("Expected a float.")
}
}
#[test]
fn positive_integer() {
for i in 0..10 {
let source = i.to_string();
let tokens = lex(&source).unwrap();
assert_eq!(tokens[0].0, Token::Integer(i))
}
assert_eq!(lex("42").unwrap()[0].0, Token::Integer(42));
let maximum_integer = i64::MAX.to_string();
assert_eq!(
lex(&maximum_integer).unwrap()[0].0,
Token::Integer(i64::MAX)
);
}
#[test]
fn negative_integer() {
for i in -9..1 {
let source = i.to_string();
let tokens = lex(&source).unwrap();
assert_eq!(tokens[0].0, Token::Integer(i))
}
assert_eq!(lex("-42").unwrap()[0].0, Token::Integer(-42));
let minimum_integer = i64::MIN.to_string();
assert_eq!(
lex(&minimum_integer).unwrap()[0].0,
Token::Integer(i64::MIN)
);
}
#[test]
fn double_quoted_string() {
assert_eq!(lex("\"\"").unwrap()[0].0, Token::String(""));
assert_eq!(lex("\"42\"").unwrap()[0].0, Token::String("42"));
assert_eq!(lex("\"foobar\"").unwrap()[0].0, Token::String("foobar"));
}
#[test]
fn single_quoted_string() {
assert_eq!(lex("''").unwrap()[0].0, Token::String(""));
assert_eq!(lex("'42'").unwrap()[0].0, Token::String("42"));
assert_eq!(lex("'foobar'").unwrap()[0].0, Token::String("foobar"));
}
#[test]
fn grave_quoted_string() {
assert_eq!(lex("``").unwrap()[0].0, Token::String(""));
assert_eq!(lex("`42`").unwrap()[0].0, Token::String("42"));
assert_eq!(lex("`foobar`").unwrap()[0].0, Token::String("foobar"));
}
}