//! Lexing tools. //! //! This module provides two lexing options: //! - [`lex`], which lexes the entire input and returns a vector of tokens and their positions //! - [`Lexer`], which lexes the input a token at a time use std::{ error::Error, fmt::{self, Display, Formatter}, }; use crate::{Span, Token}; /// Lexes the input and return a vector of tokens and their positions. /// /// # Examples /// ``` /// # use dust_lang::*; /// let input = "x = 1 + 2"; /// let tokens = lex(input).unwrap(); /// /// assert_eq!( /// tokens, /// [ /// (Token::Identifier("x"), (0, 1)), /// (Token::Equal, (2, 3)), /// (Token::Integer("1"), (4, 5)), /// (Token::Plus, (6, 7)), /// (Token::Integer("2"), (8, 9)), /// (Token::Eof, (9, 9)), /// ] /// ); /// ``` pub fn lex<'chars, 'src: 'chars>(input: &'src str) -> Result, Span)>, LexError> { let mut lexer = Lexer::new(); let mut tokens = Vec::new(); loop { let (token, span) = lexer.next_token(input)?; let is_eof = matches!(token, Token::Eof); tokens.push((token, span)); if is_eof { break; } } Ok(tokens) } #[derive(Debug, Clone)] /// Low-level tool for lexing a single token at a time. /// /// **Note**: It is a logic error to call `next_token` with different inputs. /// /// # Examples /// ``` /// # use dust_lang::*; /// let input = "x = 1 + 2"; /// let mut lexer = Lexer::new(); /// let mut tokens = Vec::new(); /// /// loop { /// let (token, span) = lexer.next_token(input).unwrap(); /// let is_eof = matches!(token, Token::Eof); /// /// tokens.push((token, span)); /// /// if is_eof { /// break; /// } /// } /// /// assert_eq!( /// tokens, /// [ /// (Token::Identifier("x"), (0, 1)), /// (Token::Equal, (2, 3)), /// (Token::Integer("1"), (4, 5)), /// (Token::Plus, (6, 7)), /// (Token::Integer("2"), (8, 9)), /// (Token::Eof, (9, 9)), /// ] /// ) /// ``` pub struct Lexer { position: usize, } impl Lexer { /// Create a new lexer for the given input. pub fn new() -> Self { Lexer { position: 0 } } /// Produce the next token. /// /// It is a logic error to call this method with different inputs. pub fn next_token<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> { self.skip_whitespace(source); let (token, span) = if let Some(c) = self.peek_char(source) { match c { '0'..='9' => self.lex_number(source)?, '-' => { if let Some('0'..='9') = self.peek_second_char(source) { self.lex_number(source)? } else if "-Infinity" == self.peek_chars(source, 9) { self.position += 9; ( Token::Float("-Infinity"), (self.position - 9, self.position), ) } else { self.position += 1; (Token::Minus, (self.position - 1, self.position)) } } 'a'..='z' | 'A'..='Z' => self.lex_alphanumeric(source)?, '"' => self.lex_string('"', source)?, '\'' => self.lex_string('\'', source)?, '+' => { if let Some('=') = self.peek_second_char(source) { self.position += 2; (Token::PlusEqual, (self.position - 2, self.position)) } else { self.position += 1; (Token::Plus, (self.position - 1, self.position)) } } '*' => { self.position += 1; (Token::Star, (self.position - 1, self.position)) } '(' => { self.position += 1; (Token::LeftParenthesis, (self.position - 1, self.position)) } ')' => { self.position += 1; (Token::RightParenthesis, (self.position - 1, self.position)) } '=' => { if let Some('=') = self.peek_second_char(source) { self.position += 2; (Token::DoubleEqual, (self.position - 2, self.position)) } else { self.position += 1; (Token::Equal, (self.position - 1, self.position)) } } '[' => { self.position += 1; (Token::LeftSquareBrace, (self.position - 1, self.position)) } ']' => { self.position += 1; (Token::RightSquareBrace, (self.position - 1, self.position)) } ',' => { self.position += 1; (Token::Comma, (self.position - 1, self.position)) } '.' => { self.position += 1; (Token::Dot, (self.position - 1, self.position)) } '>' => { if let Some('=') = self.peek_second_char(source) { self.position += 2; (Token::GreaterEqual, (self.position - 2, self.position)) } else { self.position += 1; (Token::Greater, (self.position - 1, self.position)) } } '<' => { if let Some('=') = self.peek_second_char(source) { self.position += 2; (Token::LessEqual, (self.position - 2, self.position)) } else { self.position += 1; (Token::Less, (self.position - 1, self.position)) } } '{' => { self.position += 1; (Token::LeftCurlyBrace, (self.position - 1, self.position)) } '}' => { self.position += 1; (Token::RightCurlyBrace, (self.position - 1, self.position)) } '/' => { self.position += 1; (Token::Slash, (self.position - 1, self.position)) } '%' => { self.position += 1; (Token::Percent, (self.position - 1, self.position)) } '&' => { if let Some('&') = self.peek_second_char(source) { self.position += 2; (Token::DoubleAmpersand, (self.position - 2, self.position)) } else { self.position += 1; return Err(LexError::UnexpectedCharacter { character: c, position: self.position, }); } } ';' => { self.position += 1; (Token::Semicolon, (self.position - 1, self.position)) } '|' => { if let Some('|') = self.peek_second_char(source) { self.position += 2; (Token::DoublePipe, (self.position - 2, self.position)) } else { self.position += 1; return Err(LexError::UnexpectedCharacter { character: c, position: self.position, }); } } _ => { self.position += 1; return Err(LexError::UnexpectedCharacter { character: c, position: self.position, }); } } } else { (Token::Eof, (self.position, self.position)) }; Ok((token, span)) } /// Peek at the next token without consuming the source. pub fn peek_token<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> { let token = self.next_token(source)?; self.position -= token.0.as_str().len(); Ok(token) } /// Progress to the next character. fn next_char(&mut self, source: &str) -> Option { if let Some(c) = source[self.position..].chars().next() { self.position += c.len_utf8(); Some(c) } else { None } } /// Skip whitespace characters. fn skip_whitespace(&mut self, source: &str) { while let Some(c) = self.peek_char(source) { if c.is_whitespace() { self.next_char(source); } else { break; } } } /// Peek at the next character without consuming it. fn peek_char(&self, source: &str) -> Option { source[self.position..].chars().next() } /// Peek at the second-to-next character without consuming it. fn peek_second_char(&self, source: &str) -> Option { source[self.position..].chars().nth(1) } /// Peek the next `n` characters without consuming them. fn peek_chars<'src>(&self, source: &'src str, n: usize) -> &'src str { let remaining_source = &source[self.position..]; if remaining_source.len() < n { remaining_source } else { &remaining_source[..n] } } /// Lex an integer or float token. fn lex_number<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> { let start_pos = self.position; let mut is_float = false; if let Some('-') = self.peek_char(source) { self.next_char(source); } while let Some(c) = self.peek_char(source) { if c == '.' { if let Some('0'..='9') = self.peek_second_char(source) { if !is_float { self.next_char(source); } self.next_char(source); loop { let peek_char = self.peek_char(source); if let Some('0'..='9') = peek_char { self.next_char(source); } else if let Some('e') = peek_char { if let Some('0'..='9') = self.peek_second_char(source) { self.next_char(source); self.next_char(source); } else { break; } } else { break; } } is_float = true; } else { break; } } if c.is_ascii_digit() { self.next_char(source); } else { break; } } let text = &source[start_pos..self.position]; if is_float { Ok((Token::Float(text), (start_pos, self.position))) } else { Ok((Token::Integer(text), (start_pos, self.position))) } } /// Lex an identifier token. fn lex_alphanumeric<'src>( &mut self, source: &'src str, ) -> Result<(Token<'src>, Span), LexError> { let start_pos = self.position; while let Some(c) = self.peek_char(source) { if c.is_ascii_alphanumeric() || c == '_' { self.next_char(source); } else { break; } } let string = &source[start_pos..self.position]; let token = match string { "true" => Token::Boolean("true"), "false" => Token::Boolean("false"), "Infinity" => Token::Float("Infinity"), "is_even" => Token::IsEven, "is_odd" => Token::IsOdd, "length" => Token::Length, "NaN" => Token::Float("NaN"), "read_line" => Token::ReadLine, "while" => Token::While, "write_line" => Token::WriteLine, _ => Token::Identifier(string), }; Ok((token, (start_pos, self.position))) } fn lex_string<'src>( &mut self, delimiter: char, source: &'src str, ) -> Result<(Token<'src>, Span), LexError> { let start_pos = self.position; self.next_char(source); while let Some(c) = self.peek_char(source) { if c == delimiter { self.next_char(source); break; } else { self.next_char(source); } } let text = &source[start_pos + 1..self.position - 1]; Ok((Token::String(text), (start_pos, self.position))) } } impl Default for Lexer { fn default() -> Self { Self::new() } } #[derive(Debug, PartialEq, Clone)] pub enum LexError { UnexpectedCharacter { character: char, position: usize }, } impl LexError { pub fn position(&self) -> Span { match self { Self::UnexpectedCharacter { position, .. } => (*position, *position), } } } impl Error for LexError { fn source(&self) -> Option<&(dyn Error + 'static)> { match self { Self::UnexpectedCharacter { .. } => None, } } } impl Display for LexError { fn fmt(&self, f: &mut Formatter) -> fmt::Result { match self { Self::UnexpectedCharacter { character, .. } => { write!(f, "Unexpected character: '{}'", character) } } } } #[cfg(test)] mod tests { use super::*; #[test] fn while_loop() { let input = "while x < 10 { x += 1 }"; assert_eq!( lex(input), Ok(vec![ (Token::While, (0, 5)), (Token::Identifier("x"), (6, 7)), (Token::Less, (8, 9)), (Token::Integer("10"), (10, 12)), (Token::LeftCurlyBrace, (13, 14)), (Token::Identifier("x"), (15, 16)), (Token::PlusEqual, (17, 19)), (Token::Integer("1"), (20, 21)), (Token::RightCurlyBrace, (22, 23)), (Token::Eof, (23, 23)), ]) ) } #[test] fn add_assign() { let input = "x += 42"; assert_eq!( lex(input), Ok(vec![ (Token::Identifier("x"), (0, 1)), (Token::PlusEqual, (2, 4)), (Token::Integer("42"), (5, 7)), (Token::Eof, (7, 7)), ]) ) } #[test] fn or() { let input = "true || false"; assert_eq!( lex(input), Ok(vec![ (Token::Boolean("true"), (0, 4)), (Token::DoublePipe, (5, 7)), (Token::Boolean("false"), (8, 13)), (Token::Eof, (13, 13)), ]) ) } #[test] fn block() { let input = "{ x = 42; y = 'foobar' }"; assert_eq!( lex(input), Ok(vec![ (Token::LeftCurlyBrace, (0, 1)), (Token::Identifier("x"), (2, 3)), (Token::Equal, (4, 5)), (Token::Integer("42"), (6, 8)), (Token::Semicolon, (8, 9)), (Token::Identifier("y"), (10, 11)), (Token::Equal, (12, 13)), (Token::String("foobar"), (14, 22)), (Token::RightCurlyBrace, (23, 24)), (Token::Eof, (24, 24)), ]) ) } #[test] fn equal() { let input = "42 == 42"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("42"), (0, 2)), (Token::DoubleEqual, (3, 5)), (Token::Integer("42"), (6, 8)), (Token::Eof, (8, 8)), ]) ) } #[test] fn modulo() { let input = "42 % 2"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("42"), (0, 2)), (Token::Percent, (3, 4)), (Token::Integer("2"), (5, 6)), (Token::Eof, (6, 6)), ]) ) } #[test] fn divide() { let input = "42 / 2"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("42"), (0, 2)), (Token::Slash, (3, 4)), (Token::Integer("2"), (5, 6)), (Token::Eof, (6, 6)), ]) ) } #[test] fn map() { let input = "{ x = 42, y = 'foobar' }"; assert_eq!( lex(input), Ok(vec![ (Token::LeftCurlyBrace, (0, 1)), (Token::Identifier("x"), (2, 3)), (Token::Equal, (4, 5)), (Token::Integer("42"), (6, 8)), (Token::Comma, (8, 9)), (Token::Identifier("y"), (10, 11)), (Token::Equal, (12, 13)), (Token::String("foobar"), (14, 22)), (Token::RightCurlyBrace, (23, 24)), (Token::Eof, (24, 24)), ]) ) } #[test] fn greater_than() { let input = ">"; assert_eq!( lex(input), Ok(vec![(Token::Greater, (0, 1)), (Token::Eof, (1, 1))]) ) } #[test] fn greater_than_or_equal() { let input = ">="; assert_eq!( lex(input), Ok(vec![(Token::GreaterEqual, (0, 2)), (Token::Eof, (2, 2))]) ) } #[test] fn less_than() { let input = "<"; assert_eq!( lex(input), Ok(vec![(Token::Less, (0, 1)), (Token::Eof, (1, 1))]) ) } #[test] fn less_than_or_equal() { let input = "<="; assert_eq!( lex(input), Ok(vec![(Token::LessEqual, (0, 2)), (Token::Eof, (2, 2))]) ) } #[test] fn infinity() { let input = "Infinity"; assert_eq!( lex(input), Ok(vec![ (Token::Float("Infinity"), (0, 8)), (Token::Eof, (8, 8)), ]) ) } #[test] fn negative_infinity() { let input = "-Infinity"; assert_eq!( lex(input), Ok(vec![ (Token::Float("-Infinity"), (0, 9)), (Token::Eof, (9, 9)), ]) ) } #[test] fn nan() { let input = "NaN"; assert!(lex(input).is_ok_and(|tokens| tokens[0].0 == Token::Float("NaN"))); } #[test] fn complex_float() { let input = "42.42e42"; assert_eq!( lex(input), Ok(vec![ (Token::Float("42.42e42"), (0, 8)), (Token::Eof, (8, 8)), ]) ) } #[test] fn max_integer() { let input = "9223372036854775807"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("9223372036854775807"), (0, 19)), (Token::Eof, (19, 19)), ]) ) } #[test] fn min_integer() { let input = "-9223372036854775808"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("-9223372036854775808"), (0, 20)), (Token::Eof, (20, 20)), ]) ) } #[test] fn subtract_negative_integers() { let input = "-42 - -42"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("-42"), (0, 3)), (Token::Minus, (4, 5)), (Token::Integer("-42"), (6, 9)), (Token::Eof, (9, 9)), ]) ) } #[test] fn negative_integer() { let input = "-42"; assert_eq!( lex(input), Ok(vec![(Token::Integer("-42"), (0, 3)), (Token::Eof, (3, 3))]) ) } #[test] fn read_line() { let input = "read_line()"; assert_eq!( lex(input), Ok(vec![ (Token::ReadLine, (0, 9)), (Token::LeftParenthesis, (9, 10)), (Token::RightParenthesis, (10, 11)), (Token::Eof, (11, 11)), ]) ) } #[test] fn write_line() { let input = "write_line('Hello, world!')"; assert_eq!( lex(input), Ok(vec![ (Token::WriteLine, (0, 10)), (Token::LeftParenthesis, (10, 11)), (Token::String("Hello, world!"), (11, 26)), (Token::RightParenthesis, (26, 27)), (Token::Eof, (27, 27)), ]) ) } #[test] fn string_concatenation() { let input = "'Hello, ' + 'world!'"; assert_eq!( lex(input), Ok(vec![ (Token::String("Hello, "), (0, 9)), (Token::Plus, (10, 11)), (Token::String("world!"), (12, 20)), (Token::Eof, (20, 20)), ]) ) } #[test] fn string() { let input = "'Hello, world!'"; assert_eq!( lex(input), Ok(vec![ (Token::String("Hello, world!"), (0, 15)), (Token::Eof, (15, 15)), ]) ) } #[test] fn r#true() { let input = "true"; assert_eq!( lex(input), Ok(vec![(Token::Boolean("true"), (0, 4)), (Token::Eof, (4, 4)),]) ) } #[test] fn r#false() { let input = "false"; assert_eq!( lex(input), Ok(vec![ (Token::Boolean("false"), (0, 5)), (Token::Eof, (5, 5)) ]) ) } #[test] fn property_access_function_call() { let input = "42.is_even()"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("42"), (0, 2)), (Token::Dot, (2, 3)), (Token::IsEven, (3, 10)), (Token::LeftParenthesis, (10, 11)), (Token::RightParenthesis, (11, 12)), (Token::Eof, (12, 12)), ]) ) } #[test] fn empty() { let input = ""; assert_eq!(lex(input), Ok(vec![(Token::Eof, (0, 0))])) } #[test] fn reserved_identifier() { let input = "length"; assert_eq!( lex(input), Ok(vec![(Token::Length, (0, 6)), (Token::Eof, (6, 6)),]) ) } #[test] fn square_braces() { let input = "[]"; assert_eq!( lex(input), Ok(vec![ (Token::LeftSquareBrace, (0, 1)), (Token::RightSquareBrace, (1, 2)), (Token::Eof, (2, 2)), ]) ) } #[test] fn small_float() { let input = "1.23"; assert_eq!( lex(input), Ok(vec![(Token::Float("1.23"), (0, 4)), (Token::Eof, (4, 4)),]) ) } #[test] #[allow(clippy::excessive_precision)] fn big_float() { let input = "123456789.123456789"; assert_eq!( lex(input), Ok(vec![ (Token::Float("123456789.123456789"), (0, 19)), (Token::Eof, (19, 19)), ]) ) } #[test] fn add() { let input = "1 + 2"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("1"), (0, 1)), (Token::Plus, (2, 3)), (Token::Integer("2"), (4, 5)), (Token::Eof, (5, 5)), ]) ) } #[test] fn multiply() { let input = "1 * 2"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("1"), (0, 1)), (Token::Star, (2, 3)), (Token::Integer("2"), (4, 5)), (Token::Eof, (5, 5)), ]) ) } #[test] fn add_and_multiply() { let input = "1 + 2 * 3"; assert_eq!( lex(input), Ok(vec![ (Token::Integer("1"), (0, 1)), (Token::Plus, (2, 3)), (Token::Integer("2"), (4, 5)), (Token::Star, (6, 7)), (Token::Integer("3"), (8, 9)), (Token::Eof, (9, 9)), ]) ); } #[test] fn assignment() { let input = "a = 1 + 2 * 3"; assert_eq!( lex(input,), Ok(vec![ (Token::Identifier("a"), (0, 1)), (Token::Equal, (2, 3)), (Token::Integer("1"), (4, 5)), (Token::Plus, (6, 7)), (Token::Integer("2"), (8, 9)), (Token::Star, (10, 11)), (Token::Integer("3"), (12, 13)), (Token::Eof, (13, 13)), ]) ); } }