2024-08-05 04:54:12 +00:00
|
|
|
//! Lexing tools.
|
|
|
|
//!
|
|
|
|
//! This module provides two lexing options:
|
|
|
|
//! - [`lex`], which lexes the entire input and returns a vector of tokens and their positions
|
|
|
|
//! - [`Lexer`], which lexes the input a token at a time
|
2024-08-04 23:41:00 +00:00
|
|
|
use std::num::{ParseFloatError, ParseIntError};
|
|
|
|
|
2024-08-07 22:24:25 +00:00
|
|
|
use crate::{Identifier, Span, Token};
|
2024-08-04 00:23:52 +00:00
|
|
|
|
2024-08-07 16:32:18 +00:00
|
|
|
/// Lexes the input and return a vector of tokens and their positions.
|
2024-08-07 16:13:49 +00:00
|
|
|
///
|
|
|
|
/// # Examples
|
|
|
|
/// ```
|
|
|
|
/// # use dust_lang::*;
|
|
|
|
/// let input = "x = 1 + 2";
|
|
|
|
/// let tokens = lex(input).unwrap();
|
|
|
|
///
|
|
|
|
/// assert_eq!(
|
|
|
|
/// tokens,
|
|
|
|
/// [
|
|
|
|
/// (Token::Identifier(Identifier::new("x")), (0, 1)),
|
|
|
|
/// (Token::Equal, (2, 3)),
|
|
|
|
/// (Token::Integer(1), (4, 5)),
|
|
|
|
/// (Token::Plus, (6, 7)),
|
|
|
|
/// (Token::Integer(2), (8, 9)),
|
|
|
|
/// (Token::Eof, (9, 9)),
|
|
|
|
/// ]
|
|
|
|
/// );
|
|
|
|
/// ```
|
2024-08-04 00:23:52 +00:00
|
|
|
pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
|
|
|
|
let mut lexer = Lexer::new(input);
|
|
|
|
let mut tokens = Vec::new();
|
|
|
|
|
|
|
|
loop {
|
|
|
|
let (token, span) = lexer.next_token()?;
|
|
|
|
let is_eof = matches!(token, Token::Eof);
|
|
|
|
|
|
|
|
tokens.push((token, span));
|
|
|
|
|
|
|
|
if is_eof {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(tokens)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Low-level tool for lexing a single token at a time.
///
/// The lexer borrows the source text and keeps a byte offset into it; each
/// call to `next_token` advances that offset past the token it returns.
///
/// # Examples
/// ```
/// # use dust_lang::*;
/// let input = "x = 1 + 2";
/// let mut lexer = Lexer::new(input);
/// let mut tokens = Vec::new();
///
/// loop {
///     let (token, span) = lexer.next_token().unwrap();
///     let is_eof = matches!(token, Token::Eof);
///
///     tokens.push((token, span));
///
///     if is_eof {
///         break;
///     }
/// }
///
/// assert_eq!(
///     tokens,
///     [
///         (Token::Identifier(Identifier::new("x")), (0, 1)),
///         (Token::Equal, (2, 3)),
///         (Token::Integer(1), (4, 5)),
///         (Token::Plus, (6, 7)),
///         (Token::Integer(2), (8, 9)),
///         (Token::Eof, (9, 9)),
///     ]
/// );
/// ```
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// The complete text being lexed.
    source: &'a str,

    /// Byte offset into `source` of the next character to read.
    position: usize,
}
|
|
|
|
|
|
|
|
impl<'a> Lexer<'a> {
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Create a new lexer for the given input.
|
2024-08-04 00:23:52 +00:00
|
|
|
pub fn new(input: &'a str) -> Self {
|
2024-08-05 04:40:51 +00:00
|
|
|
Lexer {
|
|
|
|
source: input,
|
|
|
|
position: 0,
|
|
|
|
}
|
2024-08-04 00:23:52 +00:00
|
|
|
}
|
|
|
|
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Progress to the next character.
|
2024-08-04 00:23:52 +00:00
|
|
|
fn next_char(&mut self) -> Option<char> {
|
2024-08-05 04:40:51 +00:00
|
|
|
self.source[self.position..].chars().next().map(|c| {
|
2024-08-04 00:23:52 +00:00
|
|
|
self.position += c.len_utf8();
|
|
|
|
c
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Produce the next token.
|
2024-08-04 00:23:52 +00:00
|
|
|
pub fn next_token(&mut self) -> Result<(Token, Span), LexError> {
|
|
|
|
self.skip_whitespace();
|
|
|
|
|
|
|
|
let (token, span) = if let Some(c) = self.peek_char() {
|
|
|
|
match c {
|
|
|
|
'0'..='9' => self.lex_number()?,
|
2024-08-07 14:41:27 +00:00
|
|
|
'a'..='z' | 'A'..='Z' => self.lex_alphabetical()?,
|
2024-08-04 00:23:52 +00:00
|
|
|
'+' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::Plus, (self.position - 1, self.position))
|
|
|
|
}
|
|
|
|
'*' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::Star, (self.position - 1, self.position))
|
|
|
|
}
|
|
|
|
'(' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::LeftParenthesis, (self.position - 1, self.position))
|
|
|
|
}
|
|
|
|
')' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::RightParenthesis, (self.position - 1, self.position))
|
|
|
|
}
|
|
|
|
'=' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::Equal, (self.position - 1, self.position))
|
|
|
|
}
|
2024-08-05 01:31:18 +00:00
|
|
|
'[' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::LeftSquareBrace, (self.position - 1, self.position))
|
|
|
|
}
|
|
|
|
']' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::RightSquareBrace, (self.position - 1, self.position))
|
|
|
|
}
|
|
|
|
',' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::Comma, (self.position - 1, self.position))
|
|
|
|
}
|
2024-08-05 18:31:08 +00:00
|
|
|
'.' => {
|
|
|
|
self.position += 1;
|
|
|
|
(Token::Dot, (self.position - 1, self.position))
|
|
|
|
}
|
2024-08-04 00:23:52 +00:00
|
|
|
_ => (Token::Eof, (self.position, self.position)),
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
(Token::Eof, (self.position, self.position))
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok((token, span))
|
|
|
|
}
|
|
|
|
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Skip whitespace characters.
|
2024-08-04 00:23:52 +00:00
|
|
|
fn skip_whitespace(&mut self) {
|
|
|
|
while let Some(c) = self.peek_char() {
|
|
|
|
if c.is_whitespace() {
|
|
|
|
self.next_char();
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Peek at the next character without consuming it.
|
2024-08-04 00:23:52 +00:00
|
|
|
fn peek_char(&self) -> Option<char> {
|
2024-08-05 04:40:51 +00:00
|
|
|
self.source[self.position..].chars().next()
|
2024-08-04 00:23:52 +00:00
|
|
|
}
|
|
|
|
|
2024-08-05 22:34:20 +00:00
|
|
|
/// Peek at the second-to-next character without consuming it.
|
|
|
|
fn peek_second_char(&self) -> Option<char> {
|
|
|
|
self.source[self.position..].chars().nth(1)
|
|
|
|
}
|
|
|
|
|
2024-08-07 14:50:19 +00:00
|
|
|
fn _peek_until_whitespace(&self) -> Option<&str> {
|
2024-08-07 14:41:27 +00:00
|
|
|
let start = self.position;
|
|
|
|
let end = self.source[self.position..]
|
|
|
|
.find(char::is_whitespace)
|
|
|
|
.map(|i| i + start);
|
|
|
|
|
|
|
|
if let Some(end) = end {
|
|
|
|
Some(&self.source[start..end])
|
|
|
|
} else {
|
|
|
|
None
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Lex an integer or float token.
|
2024-08-04 00:23:52 +00:00
|
|
|
fn lex_number(&mut self) -> Result<(Token, Span), LexError> {
|
|
|
|
let start_pos = self.position;
|
2024-08-04 23:41:00 +00:00
|
|
|
let mut is_float = false;
|
2024-08-04 00:23:52 +00:00
|
|
|
|
|
|
|
while let Some(c) = self.peek_char() {
|
2024-08-04 23:41:00 +00:00
|
|
|
if c == '.' {
|
2024-08-05 22:34:20 +00:00
|
|
|
if let Some('0'..='9') = self.peek_second_char() {
|
|
|
|
if !is_float {
|
|
|
|
self.next_char();
|
|
|
|
}
|
2024-08-04 23:41:00 +00:00
|
|
|
|
2024-08-05 22:34:20 +00:00
|
|
|
self.next_char();
|
2024-08-05 18:31:08 +00:00
|
|
|
|
2024-08-05 22:34:20 +00:00
|
|
|
while let Some('0'..='9') = self.peek_char() {
|
2024-08-04 23:41:00 +00:00
|
|
|
self.next_char();
|
|
|
|
}
|
2024-08-05 22:34:20 +00:00
|
|
|
|
|
|
|
is_float = true;
|
|
|
|
} else {
|
|
|
|
break;
|
2024-08-04 23:41:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-04 00:23:52 +00:00
|
|
|
if c.is_ascii_digit() {
|
|
|
|
self.next_char();
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-04 23:41:00 +00:00
|
|
|
if is_float {
|
2024-08-05 04:40:51 +00:00
|
|
|
let float = self.source[start_pos..self.position].parse::<f64>()?;
|
2024-08-04 00:23:52 +00:00
|
|
|
|
2024-08-04 23:41:00 +00:00
|
|
|
Ok((Token::Float(float), (start_pos, self.position)))
|
|
|
|
} else {
|
2024-08-05 04:40:51 +00:00
|
|
|
let integer = self.source[start_pos..self.position].parse::<i64>()?;
|
2024-08-04 23:41:00 +00:00
|
|
|
|
2024-08-05 00:08:43 +00:00
|
|
|
Ok((Token::Integer(integer), (start_pos, self.position)))
|
2024-08-04 23:41:00 +00:00
|
|
|
}
|
2024-08-04 00:23:52 +00:00
|
|
|
}
|
|
|
|
|
2024-08-05 04:54:12 +00:00
|
|
|
/// Lex an identifier token.
|
2024-08-07 14:41:27 +00:00
|
|
|
fn lex_alphabetical(&mut self) -> Result<(Token, Span), LexError> {
|
2024-08-04 00:23:52 +00:00
|
|
|
let start_pos = self.position;
|
|
|
|
|
|
|
|
while let Some(c) = self.peek_char() {
|
2024-08-05 22:34:20 +00:00
|
|
|
if c.is_ascii_alphanumeric() || c == '_' {
|
2024-08-04 00:23:52 +00:00
|
|
|
self.next_char();
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-05 19:54:48 +00:00
|
|
|
let string = &self.source[start_pos..self.position];
|
|
|
|
let token = match string {
|
2024-08-07 14:41:27 +00:00
|
|
|
"true" => Token::Boolean(true),
|
|
|
|
"false" => Token::Boolean(false),
|
2024-08-07 22:24:25 +00:00
|
|
|
"is_even" => Token::IsEven,
|
|
|
|
"is_odd" => Token::IsOdd,
|
|
|
|
"length" => Token::Length,
|
2024-08-05 19:54:48 +00:00
|
|
|
_ => Token::Identifier(Identifier::new(string)),
|
|
|
|
};
|
2024-08-04 00:23:52 +00:00
|
|
|
|
|
|
|
Ok((token, (start_pos, self.position)))
|
|
|
|
}
|
|
|
|
}
|
2024-08-04 23:25:44 +00:00
|
|
|
|
|
|
|
#[derive(Debug, PartialEq, Clone)]
|
|
|
|
pub enum LexError {
|
2024-08-04 23:41:00 +00:00
|
|
|
FloatError(ParseFloatError),
|
|
|
|
IntegerError(ParseIntError),
|
2024-08-04 23:25:44 +00:00
|
|
|
}
|
|
|
|
|
2024-08-04 23:41:00 +00:00
|
|
|
impl From<ParseFloatError> for LexError {
|
|
|
|
fn from(error: std::num::ParseFloatError) -> Self {
|
|
|
|
Self::FloatError(error)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl From<ParseIntError> for LexError {
|
|
|
|
fn from(error: std::num::ParseIntError) -> Self {
|
|
|
|
Self::IntegerError(error)
|
2024-08-04 23:25:44 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // Each test lexes a small input and compares the full token/span list,
    // including the trailing `Eof`, against a hand-written expectation.

    #[test]
    fn r#true() {
        let expected = vec![(Token::Boolean(true), (0, 4)), (Token::Eof, (4, 4))];

        assert_eq!(lex("true"), Ok(expected));
    }

    #[test]
    fn r#false() {
        let expected = vec![(Token::Boolean(false), (0, 5)), (Token::Eof, (5, 5))];

        assert_eq!(lex("false"), Ok(expected));
    }

    #[test]
    fn property_access_function_call() {
        // The dot after `42` is not followed by a digit, so it must lex as a
        // separate `Dot` rather than as part of a float.
        let expected = vec![
            (Token::Integer(42), (0, 2)),
            (Token::Dot, (2, 3)),
            (Token::IsEven, (3, 10)),
            (Token::LeftParenthesis, (10, 11)),
            (Token::RightParenthesis, (11, 12)),
            (Token::Eof, (12, 12)),
        ];

        assert_eq!(lex("42.is_even()"), Ok(expected));
    }

    #[test]
    fn empty() {
        let expected = vec![(Token::Eof, (0, 0))];

        assert_eq!(lex(""), Ok(expected));
    }

    #[test]
    fn reserved_identifier() {
        let expected = vec![(Token::Length, (0, 6)), (Token::Eof, (6, 6))];

        assert_eq!(lex("length"), Ok(expected));
    }

    #[test]
    fn square_braces() {
        let expected = vec![
            (Token::LeftSquareBrace, (0, 1)),
            (Token::RightSquareBrace, (1, 2)),
            (Token::Eof, (2, 2)),
        ];

        assert_eq!(lex("[]"), Ok(expected));
    }

    #[test]
    fn small_float() {
        let expected = vec![(Token::Float(1.23), (0, 4)), (Token::Eof, (4, 4))];

        assert_eq!(lex("1.23"), Ok(expected));
    }

    #[test]
    // The literal below deliberately mirrors the input text digit for digit.
    #[allow(clippy::excessive_precision)]
    fn big_float() {
        let expected = vec![
            (Token::Float(123456789.123456789), (0, 19)),
            (Token::Eof, (19, 19)),
        ];

        assert_eq!(lex("123456789.123456789"), Ok(expected));
    }

    #[test]
    fn add() {
        let expected = vec![
            (Token::Integer(1), (0, 1)),
            (Token::Plus, (2, 3)),
            (Token::Integer(2), (4, 5)),
            (Token::Eof, (5, 5)),
        ];

        assert_eq!(lex("1 + 2"), Ok(expected));
    }

    #[test]
    fn multiply() {
        let expected = vec![
            (Token::Integer(1), (0, 1)),
            (Token::Star, (2, 3)),
            (Token::Integer(2), (4, 5)),
            (Token::Eof, (5, 5)),
        ];

        assert_eq!(lex("1 * 2"), Ok(expected));
    }

    #[test]
    fn add_and_multiply() {
        let expected = vec![
            (Token::Integer(1), (0, 1)),
            (Token::Plus, (2, 3)),
            (Token::Integer(2), (4, 5)),
            (Token::Star, (6, 7)),
            (Token::Integer(3), (8, 9)),
            (Token::Eof, (9, 9)),
        ];

        assert_eq!(lex("1 + 2 * 3"), Ok(expected));
    }

    #[test]
    fn assignment() {
        let expected = vec![
            (Token::Identifier(Identifier::new("a")), (0, 1)),
            (Token::Equal, (2, 3)),
            (Token::Integer(1), (4, 5)),
            (Token::Plus, (6, 7)),
            (Token::Integer(2), (8, 9)),
            (Token::Star, (10, 11)),
            (Token::Integer(3), (12, 13)),
            (Token::Eof, (13, 13)),
        ];

        assert_eq!(lex("a = 1 + 2 * 3"), Ok(expected));
    }
}
|