//! Lexer for Dust (`dust-lang/src/lexer.rs`): token types and the chumsky parser that
//! produces them.

use std::{
f64::{INFINITY, NAN, NEG_INFINITY},
fmt::{self, Display, Formatter},
};
use chumsky::prelude::*;
use crate::error::Error;
2024-03-09 01:30:26 +00:00
#[derive(Copy, Clone, Debug, PartialEq)]
2024-02-25 18:49:26 +00:00
pub enum Token<'src> {
Boolean(bool),
BuiltInIdentifier(BuiltInIdentifier),
2024-02-25 18:49:26 +00:00
Integer(i64),
Float(f64),
String(&'src str),
Identifier(&'src str),
2024-03-07 10:37:26 +00:00
Operator(Operator),
2024-03-07 11:57:33 +00:00
Control(Control),
2024-03-20 15:43:47 +00:00
Keyword(Keyword),
}
impl<'src> Display for Token<'src> {
    /// Writes the token by delegating to the `Display` implementation of its payload.
    ///
    /// Strings and identifiers are written verbatim, without any quoting.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        // Pick the payload once, then format it in a single place.
        let payload: &dyn Display = match self {
            Token::Boolean(value) => value,
            Token::BuiltInIdentifier(name) => name,
            Token::Integer(value) => value,
            Token::Float(value) => value,
            Token::String(text) | Token::Identifier(text) => text,
            Token::Operator(symbol) => symbol,
            Token::Control(symbol) => symbol,
            Token::Keyword(word) => word,
        };

        write!(f, "{payload}")
    }
}
/// Names reserved for built-in functionality.
///
/// Each variant is displayed as the exact spelling the lexer recognizes.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum BuiltInIdentifier {
    /// Spelled `__READ_LINE__`.
    ReadLine,
    /// Spelled `__WRITE_LINE__`.
    WriteLine,
}

impl Display for BuiltInIdentifier {
    /// Writes the reserved spelling of the built-in identifier.
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        let spelling = match *self {
            BuiltInIdentifier::ReadLine => "__READ_LINE__",
            BuiltInIdentifier::WriteLine => "__WRITE_LINE__",
        };

        f.write_str(spelling)
    }
}

/// Reserved words of the language.
///
/// Each variant is displayed exactly as the keyword is spelled in source code.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Keyword {
    Any,
    Async,
    Bool,
    Break,
    Else,
    Float,
    Int,
    If,
    List,
    Map,
    None,
    Range,
    Struct,
    Str,
    Loop,
    While,
}

impl Display for Keyword {
    /// Writes the keyword's source spelling (for example `while`).
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        let spelling = match self {
            Keyword::Any => "any",
            Keyword::Async => "async",
            Keyword::Bool => "bool",
            Keyword::Break => "break",
            Keyword::Else => "else",
            Keyword::Float => "float",
            Keyword::Int => "int",
            Keyword::If => "if",
            Keyword::List => "list",
            Keyword::Map => "map",
            Keyword::None => "none",
            Keyword::Range => "range",
            Keyword::Struct => "struct",
            Keyword::Str => "str",
            Keyword::Loop => "loop",
            Keyword::While => "while",
        };

        f.write_str(spelling)
    }
}

/// Logic, assignment and math operators.
///
/// Each variant is displayed as the symbol the lexer recognizes for it.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Operator {
    Add,
    AddAssign,
    And,
    Assign,
    Divide,
    Equal,
    Greater,
    GreaterOrEqual,
    Less,
    LessOrEqual,
    Modulo,
    Multiply,
    Not,
    NotEqual,
    Or,
    SubAssign,
    Subtract,
}

impl Display for Operator {
    /// Writes the operator's source symbol (for example `+=`).
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let symbol = match self {
            Operator::Add => "+",
            Operator::AddAssign => "+=",
            Operator::And => "&&",
            Operator::Assign => "=",
            // Division is lexed from "/"; it must not be displayed as the assignment symbol.
            Operator::Divide => "/",
            Operator::Equal => "==",
            Operator::Greater => ">",
            Operator::GreaterOrEqual => ">=",
            Operator::Less => "<",
            Operator::LessOrEqual => "<=",
            Operator::Modulo => "%",
            Operator::Multiply => "*",
            Operator::Not => "!",
            Operator::NotEqual => "!=",
            Operator::Or => "||",
            Operator::SubAssign => "-=",
            Operator::Subtract => "-",
        };

        f.write_str(symbol)
    }
}

/// Punctuation tokens: brackets, separators and similar structural symbols.
///
/// Each variant is displayed as the symbol the lexer recognizes for it.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Control {
    Arrow,
    CurlyOpen,
    CurlyClose,
    SquareOpen,
    SquareClose,
    ParenOpen,
    ParenClose,
    Comma,
    DoubleColon,
    Colon,
    Dollar,
    Dot,
    DoubleDot,
    Semicolon,
}

impl Display for Control {
    /// Writes the control token's source symbol (for example `::`).
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Arms follow the declaration order of the enum.
        let symbol = match self {
            Control::Arrow => "->",
            Control::CurlyOpen => "{",
            Control::CurlyClose => "}",
            Control::SquareOpen => "[",
            Control::SquareClose => "]",
            Control::ParenOpen => "(",
            Control::ParenClose => ")",
            Control::Comma => ",",
            Control::DoubleColon => "::",
            Control::Colon => ":",
            Control::Dollar => "$",
            Control::Dot => ".",
            Control::DoubleDot => "..",
            Control::Semicolon => ";",
        };

        f.write_str(symbol)
    }
}
2024-03-06 20:36:58 +00:00
pub fn lex<'src>(source: &'src str) -> Result<Vec<(Token<'src>, SimpleSpan)>, Vec<Error>> {
2024-02-25 18:49:26 +00:00
lexer()
.parse(source)
.into_result()
2024-03-06 20:36:58 +00:00
.map_err(|errors| errors.into_iter().map(|error| error.into()).collect())
2024-02-25 18:49:26 +00:00
}
pub fn lexer<'src>() -> impl Parser<
'src,
&'src str,
Vec<(Token<'src>, SimpleSpan<usize>)>,
extra::Err<Rich<'src, char, SimpleSpan<usize>>>,
> {
2024-03-07 00:45:41 +00:00
let boolean = choice((
2024-03-20 15:43:47 +00:00
just("true").to(Token::Boolean(true)),
just("false").to(Token::Boolean(false)),
2024-03-07 00:45:41 +00:00
));
2024-02-25 18:49:26 +00:00
let float_numeric = just('-')
.or_not()
.then(text::int(10))
.then(just('.').then(text::digits(10)))
2024-03-09 00:05:17 +00:00
.then(just('e').then(text::digits(10)).or_not())
2024-02-25 18:49:26 +00:00
.to_slice()
.map(|text: &str| Token::Float(text.parse().unwrap()));
let float = choice((
float_numeric,
just("Infinity").to(Token::Float(INFINITY)),
just("-Infinity").to(Token::Float(NEG_INFINITY)),
just("NaN").to(Token::Float(NAN)),
));
2024-02-25 18:49:26 +00:00
let integer = just('-')
.or_not()
2024-02-28 23:16:25 +00:00
.then(text::int(10))
2024-02-25 18:49:26 +00:00
.to_slice()
.map(|text: &str| {
let integer = text.parse().unwrap();
2024-02-25 18:49:26 +00:00
Token::Integer(integer)
});
let delimited_string = |delimiter| {
just(delimiter)
.then(none_of(delimiter).repeated())
.then(just(delimiter))
.to_slice()
.map(|text: &str| Token::String(&text[1..text.len() - 1]))
};
let string = choice((
delimited_string('\''),
delimited_string('"'),
delimited_string('`'),
));
let identifier = text::ident().map(|text: &str| Token::Identifier(text));
let operator = choice((
2024-03-07 10:37:26 +00:00
// logic
2024-03-20 15:43:47 +00:00
just("&&").to(Operator::And),
just("==").to(Operator::Equal),
just("!=").to(Operator::NotEqual),
just(">=").to(Operator::GreaterOrEqual),
just("<=").to(Operator::LessOrEqual),
just(">").to(Operator::Greater),
just("<").to(Operator::Less),
just("!").to(Operator::Not),
just("!=").to(Operator::NotEqual),
just("||").to(Operator::Or),
2024-03-07 11:33:54 +00:00
// assignment
2024-03-20 15:43:47 +00:00
just("=").to(Operator::Assign),
just("+=").to(Operator::AddAssign),
just("-=").to(Operator::SubAssign),
2024-03-07 10:37:26 +00:00
// math
2024-03-20 15:43:47 +00:00
just("+").to(Operator::Add),
just("-").to(Operator::Subtract),
just("*").to(Operator::Multiply),
just("/").to(Operator::Divide),
just("%").to(Operator::Modulo),
2024-02-25 18:49:26 +00:00
))
.map(Token::Operator);
let control = choice((
2024-03-20 15:43:47 +00:00
just("->").to(Control::Arrow),
just("{").to(Control::CurlyOpen),
just("}").to(Control::CurlyClose),
just("[").to(Control::SquareOpen),
just("]").to(Control::SquareClose),
just("(").to(Control::ParenOpen),
just(")").to(Control::ParenClose),
just(",").to(Control::Comma),
just(";").to(Control::Semicolon),
just("::").to(Control::DoubleColon),
just(":").to(Control::Colon),
just("..").to(Control::DoubleDot),
just(".").to(Control::Dot),
2024-03-23 13:35:24 +00:00
just("$").to(Control::Dollar),
))
.map(Token::Control);
2024-02-25 18:49:26 +00:00
let keyword = choice((
2024-03-20 15:43:47 +00:00
just("any").to(Keyword::Any),
2024-03-20 21:05:37 +00:00
just("async").to(Keyword::Async),
2024-03-20 15:43:47 +00:00
just("bool").to(Keyword::Bool),
just("break").to(Keyword::Break),
just("else").to(Keyword::Else),
just("float").to(Keyword::Float),
just("int").to(Keyword::Int),
just("if").to(Keyword::If),
just("list").to(Keyword::List),
just("map").to(Keyword::Map),
just("none").to(Keyword::None),
just("range").to(Keyword::Range),
just("struct").to(Keyword::Struct),
just("str").to(Keyword::Str),
just("loop").to(Keyword::Loop),
just("while").to(Keyword::While),
))
.map(Token::Keyword);
let built_in_identifier = choice((
just("__READ_LINE__").to(BuiltInIdentifier::ReadLine),
just("__WRITE_LINE__").to(BuiltInIdentifier::WriteLine),
))
.map(Token::BuiltInIdentifier);
2024-02-25 18:49:26 +00:00
choice((
boolean,
float,
integer,
string,
keyword,
identifier,
control,
operator,
built_in_identifier,
2024-02-25 18:49:26 +00:00
))
.map_with(|token, state| (token, state.span()))
.padded()
.repeated()
.collect()
}
/// Unit tests for [`lex`]: each test lexes a small snippet and checks the resulting tokens.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn range() {
        assert_eq!(
            lex("1..10").unwrap(),
            vec![
                (Token::Integer(1), (0..1).into()),
                (Token::Control(Control::DoubleDot), (1..3).into()),
                (Token::Integer(10), (3..5).into())
            ]
        )
    }

    #[test]
    fn math_operators() {
        assert_eq!(
            lex("1 + 1").unwrap(),
            vec![
                (Token::Integer(1), (0..1).into()),
                (Token::Operator(Operator::Add), (2..3).into()),
                (Token::Integer(1), (4..5).into())
            ]
        )
    }

    #[test]
    fn keywords() {
        assert_eq!(lex("int").unwrap()[0].0, Token::Keyword(Keyword::Int))
    }

    #[test]
    fn identifier() {
        assert_eq!(lex("x").unwrap()[0].0, Token::Identifier("x"));
        assert_eq!(lex("foobar").unwrap()[0].0, Token::Identifier("foobar"));
        assert_eq!(lex("HELLO").unwrap()[0].0, Token::Identifier("HELLO"));
    }

    #[test]
    fn r#true() {
        assert_eq!(lex("true").unwrap()[0].0, Token::Boolean(true));
    }

    #[test]
    fn r#false() {
        assert_eq!(lex("false").unwrap()[0].0, Token::Boolean(false));
    }

    #[test]
    fn positive_float() {
        assert_eq!(lex("0.0").unwrap()[0].0, Token::Float(0.0));
        assert_eq!(lex("42.0").unwrap()[0].0, Token::Float(42.0));

        // `f64::MAX` is displayed without a fractional part, so one is appended to make the
        // text a float literal.
        let max_float = f64::MAX.to_string() + ".0";

        assert_eq!(lex(&max_float).unwrap()[0].0, Token::Float(f64::MAX));

        let min_positive_float = f64::MIN_POSITIVE.to_string();

        assert_eq!(
            lex(&min_positive_float).unwrap()[0].0,
            Token::Float(f64::MIN_POSITIVE)
        );
    }

    #[test]
    fn negative_float() {
        assert_eq!(lex("-0.0").unwrap()[0].0, Token::Float(-0.0));
        assert_eq!(lex("-42.0").unwrap()[0].0, Token::Float(-42.0));

        let min_float = f64::MIN.to_string() + ".0";

        assert_eq!(lex(&min_float).unwrap()[0].0, Token::Float(f64::MIN));

        let max_negative_float = format!("-{}", f64::MIN_POSITIVE);

        assert_eq!(
            lex(&max_negative_float).unwrap()[0].0,
            Token::Float(-f64::MIN_POSITIVE)
        );
    }

    #[test]
    fn other_float() {
        assert_eq!(lex("Infinity").unwrap()[0].0, Token::Float(f64::INFINITY));
        assert_eq!(
            lex("-Infinity").unwrap()[0].0,
            Token::Float(f64::NEG_INFINITY)
        );

        // NaN never compares equal to itself, so it cannot be checked with `assert_eq!`.
        if let Token::Float(float) = &lex("NaN").unwrap()[0].0 {
            assert!(float.is_nan());
        } else {
            panic!("Expected a float.")
        }
    }

    #[test]
    fn positive_integer() {
        for i in 0..10 {
            let source = i.to_string();
            let tokens = lex(&source).unwrap();

            assert_eq!(tokens[0].0, Token::Integer(i))
        }

        assert_eq!(lex("42").unwrap()[0].0, Token::Integer(42));

        let maximum_integer = i64::MAX.to_string();

        assert_eq!(
            lex(&maximum_integer).unwrap()[0].0,
            Token::Integer(i64::MAX)
        );
    }

    #[test]
    fn negative_integer() {
        for i in -9..1 {
            let source = i.to_string();
            let tokens = lex(&source).unwrap();

            assert_eq!(tokens[0].0, Token::Integer(i))
        }

        assert_eq!(lex("-42").unwrap()[0].0, Token::Integer(-42));

        let minimum_integer = i64::MIN.to_string();

        assert_eq!(
            lex(&minimum_integer).unwrap()[0].0,
            Token::Integer(i64::MIN)
        );
    }

    #[test]
    fn double_quoted_string() {
        assert_eq!(lex("\"\"").unwrap()[0].0, Token::String(""));
        assert_eq!(lex("\"42\"").unwrap()[0].0, Token::String("42"));
        assert_eq!(lex("\"foobar\"").unwrap()[0].0, Token::String("foobar"));
    }

    #[test]
    fn single_quoted_string() {
        assert_eq!(lex("''").unwrap()[0].0, Token::String(""));
        assert_eq!(lex("'42'").unwrap()[0].0, Token::String("42"));
        assert_eq!(lex("'foobar'").unwrap()[0].0, Token::String("foobar"));
    }

    #[test]
    fn grave_quoted_string() {
        assert_eq!(lex("``").unwrap()[0].0, Token::String(""));
        assert_eq!(lex("`42`").unwrap()[0].0, Token::String("42"));
        assert_eq!(lex("`foobar`").unwrap()[0].0, Token::String("foobar"));
    }
}