Borrow string slices instead of copying them in lexing

Jeff 2024-08-08 16:19:14 -04:00
parent 1c24286696
commit bf519ec087
3 changed files with 194 additions and 133 deletions
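The pattern in miniature, before the diffs: token variants that used to own heap data (`Identifier`, `String`) now hold `&str` slices borrowed from the source, so lexing allocates nothing. A minimal standalone sketch of the idea, with simplified, hypothetical names rather than the crate's actual API:

// Hypothetical, simplified token type: it borrows slices of the source
// text instead of copying them into owned values.
#[derive(Debug, PartialEq)]
enum Token<'src> {
    Identifier(&'src str), // a view into the input; no allocation
    Integer(i64),
}

// Hypothetical helper: slice out the alphanumeric run starting at `start`.
fn lex_word(source: &str, start: usize) -> (Token<'_>, (usize, usize)) {
    let end = source[start..]
        .find(|c: char| !c.is_ascii_alphanumeric())
        .map_or(source.len(), |i| start + i);

    (Token::Identifier(&source[start..end]), (start, end))
}

fn main() {
    let (token, span) = lex_word("answer = 42", 0);

    // The token is a borrowed view; nothing was copied out of the input.
    assert_eq!(token, Token::Identifier("answer"));
    assert_eq!(span, (0, 6));
}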

View File

@@ -5,7 +5,7 @@
 //! - [`Lexer`], which lexes the input a token at a time

 use std::num::{ParseFloatError, ParseIntError};

-use crate::{Identifier, Span, Token};
+use crate::{Span, Token};

 /// Lexes the input and returns a vector of tokens and their positions.
 ///
@@ -18,7 +18,7 @@ use crate::{Identifier, Span, Token};
 /// assert_eq!(
 ///     tokens,
 ///     [
-///         (Token::Identifier(Identifier::new("x")), (0, 1)),
+///         (Token::Identifier("x"), (0, 1)),
 ///         (Token::Equal, (2, 3)),
 ///         (Token::Integer(1), (4, 5)),
 ///         (Token::Plus, (6, 7)),
@@ -27,12 +27,12 @@ use crate::{Identifier, Span, Token};
 ///     ]
 /// );
 /// ```
-pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
-    let mut lexer = Lexer::new(input);
+pub fn lex<'chars, 'src: 'chars>(input: &'src str) -> Result<Vec<(Token<'chars>, Span)>, LexError> {
+    let mut lexer = Lexer::new();
     let mut tokens = Vec::new();

     loop {
-        let (token, span) = lexer.next_token()?;
+        let (token, span) = lexer.next_token(input)?;
         let is_eof = matches!(token, Token::Eof);

         tokens.push((token, span));
@@ -52,11 +52,11 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 /// ```
 /// # use dust_lang::*;
 /// let input = "x = 1 + 2";
-/// let mut lexer = Lexer::new(input);
+/// let mut lexer = Lexer::new();
 /// let mut tokens = Vec::new();
 ///
 /// loop {
-///     let (token, span) = lexer.next_token().unwrap();
+///     let (token, span) = lexer.next_token(input).unwrap();
 ///     let is_eof = matches!(token, Token::Eof);
 ///
 ///     tokens.push((token, span));
@@ -69,7 +69,7 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 /// assert_eq!(
 ///     tokens,
 ///     [
-///         (Token::Identifier(Identifier::new("x")), (0, 1)),
+///         (Token::Identifier("x"), (0, 1)),
 ///         (Token::Equal, (2, 3)),
 ///         (Token::Integer(1), (4, 5)),
 ///         (Token::Plus, (6, 7)),
@@ -78,38 +78,26 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 ///     ]
 /// )
 /// ```
-pub struct Lexer<'a> {
-    source: &'a str,
+pub struct Lexer {
     position: usize,
 }

-impl<'a> Lexer<'a> {
+impl Lexer {
     /// Create a new lexer for the given input.
-    pub fn new(input: &'a str) -> Self {
-        Lexer {
-            source: input,
-            position: 0,
-        }
-    }
-
-    /// Progress to the next character.
-    fn next_char(&mut self) -> Option<char> {
-        self.source[self.position..].chars().next().map(|c| {
-            self.position += c.len_utf8();
-            c
-        })
+    pub fn new() -> Self {
+        Lexer { position: 0 }
     }

     /// Produce the next token.
-    pub fn next_token(&mut self) -> Result<(Token, Span), LexError> {
-        self.skip_whitespace();
+    pub fn next_token<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
+        self.skip_whitespace(source);

-        let (token, span) = if let Some(c) = self.peek_char() {
+        let (token, span) = if let Some(c) = self.peek_char(source) {
             match c {
-                '0'..='9' => self.lex_number()?,
-                'a'..='z' | 'A'..='Z' => self.lex_alphabetical()?,
-                '"' => self.lex_string('"')?,
-                '\'' => self.lex_string('\'')?,
+                '0'..='9' => self.lex_number(source)?,
+                'a'..='z' | 'A'..='Z' => self.lex_alphabetical(source)?,
+                '"' => self.lex_string('"', source)?,
+                '\'' => self.lex_string('\'', source)?,
                 '+' => {
                     self.position += 1;
                     (Token::Plus, (self.position - 1, self.position))
@@ -155,11 +143,19 @@ impl<'a> Lexer<'a> {
         Ok((token, span))
     }

+    /// Progress to the next character.
+    fn next_char(&mut self, source: &str) -> Option<char> {
+        source[self.position..].chars().next().map(|c| {
+            self.position += c.len_utf8();
+            c
+        })
+    }
+
     /// Skip whitespace characters.
-    fn skip_whitespace(&mut self) {
-        while let Some(c) = self.peek_char() {
+    fn skip_whitespace(&mut self, source: &str) {
+        while let Some(c) = self.peek_char(source) {
             if c.is_whitespace() {
-                self.next_char();
+                self.next_char(source);
             } else {
                 break;
             }
@@ -167,44 +163,31 @@ impl<'a> Lexer<'a> {
     }

     /// Peek at the next character without consuming it.
-    fn peek_char(&self) -> Option<char> {
-        self.source[self.position..].chars().next()
+    fn peek_char(&self, source: &str) -> Option<char> {
+        source[self.position..].chars().next()
     }

     /// Peek at the second-to-next character without consuming it.
-    fn peek_second_char(&self) -> Option<char> {
-        self.source[self.position..].chars().nth(1)
-    }
-
-    fn _peek_until_whitespace(&self) -> Option<&str> {
-        let start = self.position;
-        let end = self.source[self.position..]
-            .find(char::is_whitespace)
-            .map(|i| i + start);
-
-        if let Some(end) = end {
-            Some(&self.source[start..end])
-        } else {
-            None
-        }
+    fn peek_second_char(&self, source: &str) -> Option<char> {
+        source[self.position..].chars().nth(1)
     }

     /// Lex an integer or float token.
-    fn lex_number(&mut self) -> Result<(Token, Span), LexError> {
+    fn lex_number<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
         let start_pos = self.position;
         let mut is_float = false;

-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
             if c == '.' {
-                if let Some('0'..='9') = self.peek_second_char() {
+                if let Some('0'..='9') = self.peek_second_char(source) {
                     if !is_float {
-                        self.next_char();
+                        self.next_char(source);
                     }

-                    self.next_char();
+                    self.next_char(source);

-                    while let Some('0'..='9') = self.peek_char() {
-                        self.next_char();
+                    while let Some('0'..='9') = self.peek_char(source) {
+                        self.next_char(source);
                     }

                     is_float = true;
@@ -214,36 +197,39 @@ impl<'a> Lexer<'a> {
             }

             if c.is_ascii_digit() {
-                self.next_char();
+                self.next_char(source);
             } else {
                 break;
             }
         }

         if is_float {
-            let float = self.source[start_pos..self.position].parse::<f64>()?;
+            let float = source[start_pos..self.position].parse::<f64>()?;

             Ok((Token::Float(float), (start_pos, self.position)))
         } else {
-            let integer = self.source[start_pos..self.position].parse::<i64>()?;
+            let integer = source[start_pos..self.position].parse::<i64>()?;

             Ok((Token::Integer(integer), (start_pos, self.position)))
         }
     }

     /// Lex an identifier token.
-    fn lex_alphabetical(&mut self) -> Result<(Token, Span), LexError> {
+    fn lex_alphabetical<'src>(
+        &mut self,
+        source: &'src str,
+    ) -> Result<(Token<'src>, Span), LexError> {
         let start_pos = self.position;

-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
             if c.is_ascii_alphanumeric() || c == '_' {
-                self.next_char();
+                self.next_char(source);
             } else {
                 break;
             }
         }

-        let string = &self.source[start_pos..self.position];
+        let string = &source[start_pos..self.position];
         let token = match string {
             "true" => Token::Boolean(true),
             "false" => Token::Boolean(false),
@@ -252,31 +238,39 @@ impl<'a> Lexer<'a> {
             "length" => Token::Length,
             "read_line" => Token::ReadLine,
             "write_line" => Token::WriteLine,
-            _ => Token::Identifier(Identifier::new(string)),
+            _ => Token::Identifier(string),
         };

         Ok((token, (start_pos, self.position)))
     }

-    fn lex_string(&mut self, delimiter: char) -> Result<(Token, Span), LexError> {
+    fn lex_string<'src>(
+        &mut self,
+        delimiter: char,
+        source: &'src str,
+    ) -> Result<(Token<'src>, Span), LexError> {
         let start_pos = self.position;

-        self.next_char();
+        self.next_char(source);

-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
             if c == delimiter {
-                self.next_char();
+                self.next_char(source);
                 break;
             } else {
-                self.next_char();
+                self.next_char(source);
             }
         }

-        let string = &self.source[start_pos + 1..self.position - 1];
-
-        Ok((
-            Token::String(string.to_string()),
-            (start_pos, self.position),
-        ))
+        let text = &source[start_pos + 1..self.position - 1];
+
+        Ok((Token::String(text), (start_pos, self.position)))
     }
 }
+
+impl Default for Lexer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
@@ -326,7 +320,7 @@ mod tests {
         Ok(vec![
             (Token::WriteLine, (0, 10)),
             (Token::LeftParenthesis, (10, 11)),
-            (Token::String("Hello, world!".to_string()), (11, 26)),
+            (Token::String("Hello, world!"), (11, 26)),
             (Token::RightParenthesis, (26, 27)),
             (Token::Eof, (27, 27)),
         ])
@@ -340,9 +334,9 @@ mod tests {
         assert_eq!(
             lex(input),
             Ok(vec![
-                (Token::String("Hello, ".to_string()), (0, 9)),
+                (Token::String("Hello, "), (0, 9)),
                 (Token::Plus, (10, 11)),
-                (Token::String("world!".to_string()), (12, 20)),
+                (Token::String("world!"), (12, 20)),
                 (Token::Eof, (20, 20)),
             ])
         )
@@ -355,7 +349,7 @@ mod tests {
         assert_eq!(
            lex(input),
            Ok(vec![
-                (Token::String("Hello, world!".to_string()), (0, 15)),
+                (Token::String("Hello, world!"), (0, 15)),
                (Token::Eof, (15, 15)),
            ])
        )
@@ -507,7 +501,7 @@ mod tests {
        assert_eq!(
            lex(input,),
            Ok(vec![
-                (Token::Identifier(Identifier::new("a")), (0, 1)),
+                (Token::Identifier("a"), (0, 1)),
                (Token::Equal, (2, 3)),
                (Token::Integer(1), (4, 5)),
                (Token::Plus, (6, 7)),
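A note on the reworked `lex` signature above: the bound `'src: 'chars` only says that the source must outlive the borrowed tokens. A hedged illustration of the same bound on a hypothetical helper, not part of the crate:

// Hypothetical function, for illustration: the `'src: 'chars` bound
// means the source reference must live at least as long as the result.
fn first_word<'chars, 'src: 'chars>(source: &'src str) -> &'chars str {
    source.split_whitespace().next().unwrap_or("")
}

fn main() {
    let source = String::from("x = 1 + 2");
    let word = first_word(&source);

    assert_eq!(word, "x");
    // Dropping `source` while `word` is still alive would be rejected by
    // the borrow checker; that is exactly the relationship the bound encodes.
}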

View File

@@ -6,8 +6,8 @@
 use std::collections::VecDeque;

 use crate::{
-    built_in_function::BuiltInFunction, AbstractSyntaxTree, LexError, Lexer, Node, Span, Statement,
-    Token, Value,
+    built_in_function::BuiltInFunction, token::TokenOwned, AbstractSyntaxTree, Identifier,
+    LexError, Lexer, Node, Span, Statement, Token, Value,
 };

 /// Parses the input into an abstract syntax tree.
@@ -40,8 +40,8 @@ use crate::{
 /// );
 /// ```
 pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
-    let lexer = Lexer::new(input);
-    let mut parser = Parser::new(lexer);
+    let lexer = Lexer::new();
+    let mut parser = Parser::new(input, lexer);
     let mut nodes = VecDeque::new();

     loop {
@@ -64,8 +64,8 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
 /// # use std::collections::VecDeque;
 /// # use dust_lang::*;
 /// let input = "x = 42";
-/// let lexer = Lexer::new(input);
-/// let mut parser = Parser::new(lexer);
+/// let lexer = Lexer::new();
+/// let mut parser = Parser::new(input, lexer);
 /// let mut nodes = VecDeque::new();
 ///
 /// loop {
@@ -98,16 +98,21 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
 /// );
 /// ```
 pub struct Parser<'src> {
-    lexer: Lexer<'src>,
-    current: (Token, Span),
+    source: &'src str,
+    lexer: Lexer,
+    current: (Token<'src>, Span),
 }

 impl<'src> Parser<'src> {
-    pub fn new(lexer: Lexer<'src>) -> Self {
+    pub fn new(source: &'src str, lexer: Lexer) -> Self {
         let mut lexer = lexer;
-        let current = lexer.next_token().unwrap_or((Token::Eof, (0, 0)));
+        let current = lexer.next_token(source).unwrap_or((Token::Eof, (0, 0)));

-        Parser { lexer, current }
+        Parser {
+            source,
+            lexer,
+            current,
+        }
     }

     pub fn parse(&mut self) -> Result<Node, ParseError> {
@@ -119,7 +124,7 @@ impl<'src> Parser<'src> {
     }

     fn next_token(&mut self) -> Result<(), ParseError> {
-        self.current = self.lexer.next_token()?;
+        self.current = self.lexer.next_token(self.source)?;

         Ok(())
     }
@@ -182,7 +187,7 @@ impl<'src> Parser<'src> {
     }

     fn parse_primary(&mut self) -> Result<Node, ParseError> {
-        match self.current.clone() {
+        match self.current {
             (Token::Boolean(boolean), span) => {
                 self.next_token()?;
@@ -201,10 +206,13 @@ impl<'src> Parser<'src> {
                 Ok(Node::new(Statement::Constant(Value::integer(int)), span))
             }
-            (Token::Identifier(identifier), span) => {
+            (Token::Identifier(text), span) => {
                 self.next_token()?;

-                Ok(Node::new(Statement::Identifier(identifier), span))
+                Ok(Node::new(
+                    Statement::Identifier(Identifier::new(text)),
+                    span,
+                ))
             }
             (Token::String(string), span) => {
                 self.next_token()?;
@@ -222,7 +230,7 @@ impl<'src> Parser<'src> {
                 Ok(Node::new(node.statement, (left_span.0, right_span.1)))
             } else {
                 Err(ParseError::ExpectedClosingParenthesis {
-                    actual: self.current.0.clone(),
+                    actual: TokenOwned::from(self.current.0),
                     span: self.current.1,
                 })
             }
@@ -252,7 +260,7 @@ impl<'src> Parser<'src> {
                     nodes.push(instruction);
                 } else {
                     return Err(ParseError::ExpectedClosingSquareBrace {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                         span: self.current.1,
                     });
                 }
@@ -277,7 +285,7 @@ impl<'src> Parser<'src> {
                     self.next_token()?;
                 } else {
                     return Err(ParseError::ExpectedOpeningParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                         span: self.current.1,
                     });
                 }
@@ -303,7 +311,7 @@ impl<'src> Parser<'src> {
                     }
                 } else {
                     return Err(ParseError::ExpectedClosingParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                         span: self.current.1,
                     });
                 }
@@ -318,7 +326,9 @@ impl<'src> Parser<'src> {
                     left_span,
                 ))
             }
-            _ => Err(ParseError::UnexpectedToken(self.current.0.clone())),
+            _ => Err(ParseError::UnexpectedToken(TokenOwned::from(
+                self.current.0,
+            ))),
         }
     }
@@ -335,11 +345,12 @@ impl<'src> Parser<'src> {
 #[derive(Debug, PartialEq, Clone)]
 pub enum ParseError {
-    ExpectedClosingParenthesis { actual: Token, span: Span },
-    ExpectedClosingSquareBrace { actual: Token, span: Span },
-    ExpectedOpeningParenthesis { actual: Token, span: Span },
     LexError(LexError),
-    UnexpectedToken(Token),
+    ExpectedClosingParenthesis { actual: TokenOwned, span: Span },
+    ExpectedClosingSquareBrace { actual: TokenOwned, span: Span },
+    ExpectedOpeningParenthesis { actual: TokenOwned, span: Span },
+    UnexpectedToken(TokenOwned),
 }

 impl From<LexError> for ParseError {
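The error variants move from `Token` to `TokenOwned` because a `ParseError` has to be able to outlive the source text it was produced from; a variant holding `Token<'src>` would drag the source lifetime into the error type. A sketch of that constraint with simplified stand-in types, not the crate's definitions:

// Simplified stand-ins: an error that owns its token text can be
// returned after the source string has gone out of scope.
#[derive(Debug)]
enum TokenOwned {
    Identifier(String),
}

#[derive(Debug)]
enum ParseError {
    UnexpectedToken(TokenOwned),
}

fn parse_temporary_source() -> Result<(), ParseError> {
    let source = String::from("oops");
    let text: &str = &source; // stands in for a borrowed Token<'src>

    // Copying the text into an owned token frees the error from the
    // source's lifetime; returning `text` itself would not compile.
    Err(ParseError::UnexpectedToken(TokenOwned::Identifier(
        text.to_string(),
    )))
}

fn main() {
    assert!(parse_temporary_source().is_err());
}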

View File

@@ -2,13 +2,69 @@ use std::fmt::{self, Display, Formatter};
 use serde::{Deserialize, Serialize};

-use crate::Identifier;
-
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
-pub enum Token {
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+pub enum Token<'src> {
     Eof,
-    Identifier(Identifier),
+    Identifier(&'src str),
+
+    // Hard-coded values
+    Boolean(bool),
+    Float(f64),
+    Integer(i64),
+    String(&'src str),
+
+    // Keywords
+    IsEven,
+    IsOdd,
+    Length,
+    ReadLine,
+    WriteLine,
+
+    // Symbols
+    Comma,
+    Dot,
+    Equal,
+    LeftParenthesis,
+    LeftSquareBrace,
+    Plus,
+    RightParenthesis,
+    RightSquareBrace,
+    Star,
+}
+
+impl<'src> Display for Token<'src> {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        match self {
+            Token::Eof => write!(f, "EOF"),
+            Token::Identifier(text) => write!(f, "{text}"),
+            Token::Boolean(boolean) => write!(f, "{boolean}"),
+            Token::Float(float) => write!(f, "{float}"),
+            Token::Integer(integer) => write!(f, "{integer}"),
+            Token::String(string) => write!(f, "{string}"),
+            Token::IsEven => write!(f, "is_even"),
+            Token::IsOdd => write!(f, "is_odd"),
+            Token::Length => write!(f, "length"),
+            Token::ReadLine => write!(f, "read_line"),
+            Token::WriteLine => write!(f, "write_line"),
+            Token::Comma => write!(f, ","),
+            Token::Dot => write!(f, "."),
+            Token::Equal => write!(f, "="),
+            Token::Plus => write!(f, "+"),
+            Token::Star => write!(f, "*"),
+            Token::LeftParenthesis => write!(f, "("),
+            Token::RightParenthesis => write!(f, ")"),
+            Token::LeftSquareBrace => write!(f, "["),
+            Token::RightSquareBrace => write!(f, "]"),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
+pub enum TokenOwned {
+    Eof,
+    Identifier(String),
     // Hard-coded values
     Boolean(bool),
@@ -35,29 +91,29 @@ pub enum Token {
     Star,
 }

-impl Display for Token {
-    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        match self {
-            Token::Eof => write!(f, "EOF"),
-            Token::Identifier(identifier) => write!(f, "{identifier}"),
-            Token::Boolean(boolean) => write!(f, "{boolean}"),
-            Token::Float(float) => write!(f, "{float}"),
-            Token::Integer(integer) => write!(f, "{integer}"),
-            Token::String(string) => write!(f, "{string}"),
-            Token::IsEven => write!(f, "is_even"),
-            Token::IsOdd => write!(f, "is_odd"),
-            Token::Length => write!(f, "length"),
-            Token::ReadLine => write!(f, "read_line"),
-            Token::WriteLine => write!(f, "write_line"),
-            Token::Comma => write!(f, ","),
-            Token::Dot => write!(f, "."),
-            Token::Equal => write!(f, "="),
-            Token::Plus => write!(f, "+"),
-            Token::Star => write!(f, "*"),
-            Token::LeftParenthesis => write!(f, "("),
-            Token::RightParenthesis => write!(f, ")"),
-            Token::LeftSquareBrace => write!(f, "["),
-            Token::RightSquareBrace => write!(f, "]"),
+impl<'str> From<Token<'str>> for TokenOwned {
+    fn from(token: Token<'str>) -> Self {
+        match token {
+            Token::Eof => TokenOwned::Eof,
+            Token::Identifier(text) => TokenOwned::Identifier(text.to_string()),
+            Token::Boolean(boolean) => TokenOwned::Boolean(boolean),
+            Token::Float(float) => TokenOwned::Float(float),
+            Token::Integer(integer) => TokenOwned::Integer(integer),
+            Token::String(text) => TokenOwned::String(text.to_string()),
+            Token::IsEven => TokenOwned::IsEven,
+            Token::IsOdd => TokenOwned::IsOdd,
+            Token::Length => TokenOwned::Length,
+            Token::ReadLine => TokenOwned::ReadLine,
+            Token::WriteLine => TokenOwned::WriteLine,
+            Token::Comma => TokenOwned::Comma,
+            Token::Dot => TokenOwned::Dot,
+            Token::Equal => TokenOwned::Equal,
+            Token::Plus => TokenOwned::Plus,
+            Token::Star => TokenOwned::Star,
+            Token::LeftParenthesis => TokenOwned::LeftParenthesis,
+            Token::RightParenthesis => TokenOwned::RightParenthesis,
+            Token::LeftSquareBrace => TokenOwned::LeftSquareBrace,
+            Token::RightSquareBrace => TokenOwned::RightSquareBrace,
         }
     }
 }
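The conversion above is mechanical: each borrowed variant maps to its owned twin, and only the text-carrying variants allocate. A trimmed, self-contained usage sketch of the same shape, with two variants standing in for the full set:

#[derive(Debug, PartialEq)]
enum Token<'src> {
    Identifier(&'src str),
    Plus,
}

#[derive(Debug, PartialEq)]
enum TokenOwned {
    Identifier(String),
    Plus,
}

impl<'src> From<Token<'src>> for TokenOwned {
    fn from(token: Token<'src>) -> Self {
        match token {
            // Only the variant carrying borrowed text allocates here.
            Token::Identifier(text) => TokenOwned::Identifier(text.to_string()),
            Token::Plus => TokenOwned::Plus,
        }
    }
}

fn main() {
    assert_eq!(
        TokenOwned::from(Token::Identifier("x")),
        TokenOwned::Identifier("x".to_string())
    );
    assert_eq!(TokenOwned::from(Token::Plus), TokenOwned::Plus);
}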