// Source file: dust/dust-lang/src/lex.rs
use std::num::{ParseFloatError, ParseIntError};
// Crate-local types used by the lexer.
use crate::{Identifier, Span, Token};
/// Lexes all of `input` and returns every token paired with its span.
///
/// Tokens are pulled from a [`Lexer`] one at a time until `Token::Eof` is produced. The
/// `Token::Eof` entry is kept, so a successful result always ends with it.
///
/// # Errors
///
/// Returns the first [`LexError`] reported by `Lexer::next_token`.
pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
    let mut scanner = Lexer::new(input);
    let mut collected: Vec<(Token, Span)> = Vec::new();

    // Keep pulling tokens until the most recently stored one is the end-of-input marker.
    while !matches!(collected.last(), Some((Token::Eof, _))) {
        collected.push(scanner.next_token()?);
    }

    Ok(collected)
}
/// Cursor over a source string that produces tokens on demand.
///
/// The lexer borrows the text for `'a` and tracks how far it has read.
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// The complete text being lexed.
    source: &'a str,
    /// Byte offset into `source` of the next character to read.
    position: usize,
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a str) -> Self {
2024-08-05 04:40:51 +00:00
Lexer {
source: input,
position: 0,
}
2024-08-04 00:23:52 +00:00
}
fn next_char(&mut self) -> Option<char> {
2024-08-05 04:40:51 +00:00
self.source[self.position..].chars().next().map(|c| {
2024-08-04 00:23:52 +00:00
self.position += c.len_utf8();
c
})
}
pub fn next_token(&mut self) -> Result<(Token, Span), LexError> {
self.skip_whitespace();
let (token, span) = if let Some(c) = self.peek_char() {
match c {
'0'..='9' => self.lex_number()?,
'a'..='z' | 'A'..='Z' => self.lex_identifier()?,
'+' => {
self.position += 1;
(Token::Plus, (self.position - 1, self.position))
}
'*' => {
self.position += 1;
(Token::Star, (self.position - 1, self.position))
}
'(' => {
self.position += 1;
(Token::LeftParenthesis, (self.position - 1, self.position))
}
')' => {
self.position += 1;
(Token::RightParenthesis, (self.position - 1, self.position))
}
'=' => {
self.position += 1;
(Token::Equal, (self.position - 1, self.position))
}
2024-08-05 01:31:18 +00:00
'[' => {
self.position += 1;
(Token::LeftSquareBrace, (self.position - 1, self.position))
}
']' => {
self.position += 1;
(Token::RightSquareBrace, (self.position - 1, self.position))
}
',' => {
self.position += 1;
(Token::Comma, (self.position - 1, self.position))
}
2024-08-04 00:23:52 +00:00
_ => (Token::Eof, (self.position, self.position)),
}
} else {
(Token::Eof, (self.position, self.position))
};
Ok((token, span))
}
fn skip_whitespace(&mut self) {
while let Some(c) = self.peek_char() {
if c.is_whitespace() {
self.next_char();
} else {
break;
}
}
}
fn peek_char(&self) -> Option<char> {
2024-08-05 04:40:51 +00:00
self.source[self.position..].chars().next()
2024-08-04 00:23:52 +00:00
}
fn lex_number(&mut self) -> Result<(Token, Span), LexError> {
let start_pos = self.position;
2024-08-04 23:41:00 +00:00
let mut is_float = false;
2024-08-04 00:23:52 +00:00
while let Some(c) = self.peek_char() {
2024-08-04 23:41:00 +00:00
if c == '.' {
is_float = true;
self.next_char();
while let Some(c) = self.peek_char() {
if c.is_ascii_digit() {
self.next_char();
} else {
break;
}
}
}
2024-08-04 00:23:52 +00:00
if c.is_ascii_digit() {
self.next_char();
} else {
break;
}
}
2024-08-04 23:41:00 +00:00
if is_float {
2024-08-05 04:40:51 +00:00
let float = self.source[start_pos..self.position].parse::<f64>()?;
2024-08-04 00:23:52 +00:00
2024-08-04 23:41:00 +00:00
Ok((Token::Float(float), (start_pos, self.position)))
} else {
2024-08-05 04:40:51 +00:00
let integer = self.source[start_pos..self.position].parse::<i64>()?;
2024-08-04 23:41:00 +00:00
2024-08-05 00:08:43 +00:00
Ok((Token::Integer(integer), (start_pos, self.position)))
2024-08-04 23:41:00 +00:00
}
2024-08-04 00:23:52 +00:00
}
fn lex_identifier(&mut self) -> Result<(Token, Span), LexError> {
let start_pos = self.position;
while let Some(c) = self.peek_char() {
if c.is_ascii_alphanumeric() {
self.next_char();
} else {
break;
}
}
2024-08-05 04:40:51 +00:00
let identifier = &self.source[start_pos..self.position];
2024-08-04 00:23:52 +00:00
let token = Token::Identifier(Identifier::new(identifier));
Ok((token, (start_pos, self.position)))
}
}
// Errors that lexing can return.
/// Error returned when lexing fails.
///
/// Each variant wraps the standard-library error produced while parsing a numeric literal.
#[derive(Debug, PartialEq, Clone)]
pub enum LexError {
    /// A literal containing a decimal point could not be parsed as `f64`.
    FloatError(ParseFloatError),
    /// A literal without a decimal point could not be parsed as `i64`.
    IntegerError(ParseIntError),
}
// Conversions from the standard parse errors, so `?` can be used on `str::parse` results.
impl From<ParseFloatError> for LexError {
fn from(error: std::num::ParseFloatError) -> Self {
Self::FloatError(error)
}
}
impl From<ParseIntError> for LexError {
fn from(error: std::num::ParseIntError) -> Self {
Self::IntegerError(error)
2024-08-04 23:25:44 +00:00
}
}
// Unit tests for `lex`: each case checks the exact token sequence and byte spans, including
// the trailing `Token::Eof`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_input() {
        let input = "";

        assert_eq!(lex(input), Ok(vec![(Token::Eof, (0, 0))]))
    }

    #[test]
    fn square_braces() {
        let input = "[]";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::LeftSquareBrace, (0, 1)),
                (Token::RightSquareBrace, (1, 2)),
                (Token::Eof, (2, 2)),
            ])
        )
    }

    #[test]
    fn parentheses() {
        let input = "()";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::LeftParenthesis, (0, 1)),
                (Token::RightParenthesis, (1, 2)),
                (Token::Eof, (2, 2)),
            ])
        )
    }

    #[test]
    fn list_of_integers() {
        let input = "[1, 22]";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::LeftSquareBrace, (0, 1)),
                (Token::Integer(1), (1, 2)),
                (Token::Comma, (2, 3)),
                (Token::Integer(22), (4, 6)),
                (Token::RightSquareBrace, (6, 7)),
                (Token::Eof, (7, 7)),
            ])
        )
    }

    #[test]
    fn identifier_with_digits() {
        let input = "foo1";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::Identifier(Identifier::new("foo1")), (0, 4)),
                (Token::Eof, (4, 4)),
            ])
        )
    }

    #[test]
    fn small_float() {
        let input = "1.23";

        assert_eq!(
            lex(input),
            Ok(vec![(Token::Float(1.23), (0, 4)), (Token::Eof, (4, 4))])
        )
    }

    #[test]
    // The literal deliberately mirrors the input text digit for digit.
    #[allow(clippy::excessive_precision)]
    fn big_float() {
        let input = "123456789.123456789";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::Float(123456789.123456789), (0, 19)),
                (Token::Eof, (19, 19)),
            ])
        )
    }

    #[test]
    fn add() {
        let input = "1 + 2";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::Integer(1), (0, 1)),
                (Token::Plus, (2, 3)),
                (Token::Integer(2), (4, 5)),
                (Token::Eof, (5, 5)),
            ])
        )
    }

    #[test]
    fn multiply() {
        let input = "1 * 2";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::Integer(1), (0, 1)),
                (Token::Star, (2, 3)),
                (Token::Integer(2), (4, 5)),
                (Token::Eof, (5, 5)),
            ])
        )
    }

    #[test]
    fn add_and_multiply() {
        let input = "1 + 2 * 3";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::Integer(1), (0, 1)),
                (Token::Plus, (2, 3)),
                (Token::Integer(2), (4, 5)),
                (Token::Star, (6, 7)),
                (Token::Integer(3), (8, 9)),
                (Token::Eof, (9, 9)),
            ])
        );
    }

    #[test]
    fn assignment() {
        let input = "a = 1 + 2 * 3";

        assert_eq!(
            lex(input),
            Ok(vec![
                (Token::Identifier(Identifier::new("a")), (0, 1)),
                (Token::Equal, (2, 3)),
                (Token::Integer(1), (4, 5)),
                (Token::Plus, (6, 7)),
                (Token::Integer(2), (8, 9)),
                (Token::Star, (10, 11)),
                (Token::Integer(3), (12, 13)),
                (Token::Eof, (13, 13)),
            ])
        );
    }
}