From bf519ec087fc198a5b5b4f76ff841bb11a41d6a1 Mon Sep 17 00:00:00 2001
From: Jeff
Date: Thu, 8 Aug 2024 16:19:14 -0400
Subject: [PATCH] Borrow string slices instead of copying them in lexing

---
 dust-lang/src/lex.rs   | 156 ++++++++++++++++++++---------------------
 dust-lang/src/parse.rs |  59 +++++++++-------
 dust-lang/src/token.rs | 112 +++++++++++++++++++++--------
 3 files changed, 194 insertions(+), 133 deletions(-)

diff --git a/dust-lang/src/lex.rs b/dust-lang/src/lex.rs
index 83d1fbf..66538b5 100644
--- a/dust-lang/src/lex.rs
+++ b/dust-lang/src/lex.rs
@@ -5,7 +5,7 @@
 //! - [`Lexer`], which lexes the input a token at a time
 use std::num::{ParseFloatError, ParseIntError};
 
-use crate::{Identifier, Span, Token};
+use crate::{Span, Token};
 
 /// Lexes the input and returns a vector of tokens and their positions.
 ///
@@ -18,7 +18,7 @@ use crate::{Identifier, Span, Token};
 /// assert_eq!(
 ///     tokens,
 ///     [
-///         (Token::Identifier(Identifier::new("x")), (0, 1)),
+///         (Token::Identifier("x"), (0, 1)),
 ///         (Token::Equal, (2, 3)),
 ///         (Token::Integer(1), (4, 5)),
 ///         (Token::Plus, (6, 7)),
@@ -27,12 +27,12 @@ use crate::{Identifier, Span, Token};
 ///         (Token::Integer(2), (8, 9)),
 ///         (Token::Eof, (9, 9)),
 ///     ]
 /// );
 /// ```
-pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
-    let mut lexer = Lexer::new(input);
+pub fn lex<'chars, 'src: 'chars>(input: &'src str) -> Result<Vec<(Token<'src>, Span)>, LexError> {
+    let mut lexer = Lexer::new();
     let mut tokens = Vec::new();
 
     loop {
-        let (token, span) = lexer.next_token()?;
+        let (token, span) = lexer.next_token(input)?;
         let is_eof = matches!(token, Token::Eof);
 
         tokens.push((token, span));
@@ -52,11 +52,11 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 /// ```
 /// # use dust_lang::*;
 /// let input = "x = 1 + 2";
-/// let mut lexer = Lexer::new(input);
+/// let mut lexer = Lexer::new();
 /// let mut tokens = Vec::new();
 ///
 /// loop {
-///     let (token, span) = lexer.next_token().unwrap();
+///     let (token, span) = lexer.next_token(input).unwrap();
 ///     let is_eof = matches!(token, Token::Eof);
 ///
 ///     tokens.push((token, span));
@@ -69,7 +69,7 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 /// assert_eq!(
 ///     tokens,
 ///     [
-///         (Token::Identifier(Identifier::new("x")), (0, 1)),
+///         (Token::Identifier("x"), (0, 1)),
 ///         (Token::Equal, (2, 3)),
 ///         (Token::Integer(1), (4, 5)),
 ///         (Token::Plus, (6, 7)),
@@ -78,38 +78,26 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 ///         (Token::Integer(2), (8, 9)),
 ///         (Token::Eof, (9, 9)),
 ///     ]
 /// )
 /// ```
-pub struct Lexer<'a> {
-    source: &'a str,
+pub struct Lexer {
     position: usize,
 }
 
-impl<'a> Lexer<'a> {
+impl Lexer {
     /// Create a new lexer for the given input.
-    pub fn new(input: &'a str) -> Self {
-        Lexer {
-            source: input,
-            position: 0,
-        }
-    }
-
-    /// Progress to the next character.
-    fn next_char(&mut self) -> Option<char> {
-        self.source[self.position..].chars().next().map(|c| {
-            self.position += c.len_utf8();
-            c
-        })
+    pub fn new() -> Self {
+        Lexer { position: 0 }
     }
 
     /// Produce the next token.
-    pub fn next_token(&mut self) -> Result<(Token, Span), LexError> {
-        self.skip_whitespace();
+    pub fn next_token<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
+        self.skip_whitespace(source);
 
-        let (token, span) = if let Some(c) = self.peek_char() {
+        let (token, span) = if let Some(c) = self.peek_char(source) {
             match c {
-                '0'..='9' => self.lex_number()?,
-                'a'..='z' | 'A'..='Z' => self.lex_alphabetical()?,
-                '"' => self.lex_string('"')?,
-                '\'' => self.lex_string('\'')?,
+                '0'..='9' => self.lex_number(source)?,
+                'a'..='z' | 'A'..='Z' => self.lex_alphabetical(source)?,
+                '"' => self.lex_string('"', source)?,
+                '\'' => self.lex_string('\'', source)?,
                 '+' => {
                     self.position += 1;
                     (Token::Plus, (self.position - 1, self.position))
@@ -155,11 +143,19 @@ impl<'a> Lexer<'a> {
         Ok((token, span))
     }
 
+    /// Progress to the next character.
+    fn next_char(&mut self, source: &str) -> Option<char> {
+        source[self.position..].chars().next().map(|c| {
+            self.position += c.len_utf8();
+            c
+        })
+    }
+
     /// Skip whitespace characters.
-    fn skip_whitespace(&mut self) {
-        while let Some(c) = self.peek_char() {
+    fn skip_whitespace(&mut self, source: &str) {
+        while let Some(c) = self.peek_char(source) {
             if c.is_whitespace() {
-                self.next_char();
+                self.next_char(source);
             } else {
                 break;
             }
@@ -167,44 +163,31 @@ impl<'a> Lexer<'a> {
     }
 
     /// Peek at the next character without consuming it.
-    fn peek_char(&self) -> Option<char> {
-        self.source[self.position..].chars().next()
+    fn peek_char(&self, source: &str) -> Option<char> {
+        source[self.position..].chars().next()
     }
 
     /// Peek at the second-to-next character without consuming it.
-    fn peek_second_char(&self) -> Option<char> {
-        self.source[self.position..].chars().nth(1)
-    }
-
-    fn _peek_until_whitespace(&self) -> Option<&str> {
-        let start = self.position;
-        let end = self.source[self.position..]
-            .find(char::is_whitespace)
-            .map(|i| i + start);
-
-        if let Some(end) = end {
-            Some(&self.source[start..end])
-        } else {
-            None
-        }
+    fn peek_second_char(&self, source: &str) -> Option<char> {
+        source[self.position..].chars().nth(1)
     }
 
     /// Lex an integer or float token.
-    fn lex_number(&mut self) -> Result<(Token, Span), LexError> {
+    fn lex_number<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
         let start_pos = self.position;
         let mut is_float = false;
 
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
             if c == '.' {
-                if let Some('0'..='9') = self.peek_second_char() {
+                if let Some('0'..='9') = self.peek_second_char(source) {
                     if !is_float {
-                        self.next_char();
+                        self.next_char(source);
                     }
 
-                    self.next_char();
+                    self.next_char(source);
 
-                    while let Some('0'..='9') = self.peek_char() {
-                        self.next_char();
+                    while let Some('0'..='9') = self.peek_char(source) {
+                        self.next_char(source);
                     }
 
                     is_float = true;
@@ -214,36 +197,39 @@ impl<'a> Lexer<'a> {
             }
 
             if c.is_ascii_digit() {
-                self.next_char();
+                self.next_char(source);
             } else {
                 break;
            }
        }

        if is_float {
-            let float = self.source[start_pos..self.position].parse::<f64>()?;
+            let float = source[start_pos..self.position].parse::<f64>()?;
 
             Ok((Token::Float(float), (start_pos, self.position)))
         } else {
-            let integer = self.source[start_pos..self.position].parse::<i64>()?;
+            let integer = source[start_pos..self.position].parse::<i64>()?;
 
             Ok((Token::Integer(integer), (start_pos, self.position)))
         }
     }
 
     /// Lex an identifier token.
-    fn lex_alphabetical(&mut self) -> Result<(Token, Span), LexError> {
+    fn lex_alphabetical<'src>(
+        &mut self,
+        source: &'src str,
+    ) -> Result<(Token<'src>, Span), LexError> {
         let start_pos = self.position;
 
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
             if c.is_ascii_alphanumeric() || c == '_' {
-                self.next_char();
+                self.next_char(source);
             } else {
                 break;
             }
         }
 
-        let string = &self.source[start_pos..self.position];
+        let string = &source[start_pos..self.position];
         let token = match string {
             "true" => Token::Boolean(true),
             "false" => Token::Boolean(false),
@@ -252,31 +238,39 @@ impl<'a> Lexer<'a> {
             "length" => Token::Length,
             "read_line" => Token::ReadLine,
             "write_line" => Token::WriteLine,
-            _ => Token::Identifier(Identifier::new(string)),
+            _ => Token::Identifier(string),
         };
 
         Ok((token, (start_pos, self.position)))
     }
 
-    fn lex_string(&mut self, delimiter: char) -> Result<(Token, Span), LexError> {
+    fn lex_string<'src>(
+        &mut self,
+        delimiter: char,
+        source: &'src str,
+    ) -> Result<(Token<'src>, Span), LexError> {
         let start_pos = self.position;
 
-        self.next_char();
+        self.next_char(source);
 
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
             if c == delimiter {
-                self.next_char();
+                self.next_char(source);
 
                 break;
             } else {
-                self.next_char();
+                self.next_char(source);
             }
         }
 
-        let string = &self.source[start_pos + 1..self.position - 1];
-        Ok((
-            Token::String(string.to_string()),
-            (start_pos, self.position),
-        ))
+        let text = &source[start_pos + 1..self.position - 1];
+
+        Ok((Token::String(text), (start_pos, self.position)))
+    }
+}
+
+impl Default for Lexer {
+    fn default() -> Self {
+        Self::new()
     }
 }
 
@@ -326,7 +320,7 @@ mod tests {
         Ok(vec![
             (Token::WriteLine, (0, 10)),
             (Token::LeftParenthesis, (10, 11)),
-            (Token::String("Hello, world!".to_string()), (11, 26)),
+            (Token::String("Hello, world!"), (11, 26)),
             (Token::RightParenthesis, (26, 27)),
             (Token::Eof, (27, 27)),
         ])
@@ -340,9 +334,9 @@ mod tests {
         assert_eq!(
             lex(input),
             Ok(vec![
-                (Token::String("Hello, ".to_string()), (0, 9)),
+                (Token::String("Hello, "), (0, 9)),
                 (Token::Plus, (10, 11)),
-                (Token::String("world!".to_string()), (12, 20)),
+                (Token::String("world!"), (12, 20)),
                 (Token::Eof, (20, 20)),
             ])
         )
@@ -355,7 +349,7 @@ mod tests {
         assert_eq!(
            lex(input),
            Ok(vec![
-                (Token::String("Hello, world!".to_string()), (0, 15)),
+                (Token::String("Hello, world!"), (0, 15)),
                 (Token::Eof, (15, 15)),
             ])
         )
@@ -507,7 +501,7 @@ mod tests {
         assert_eq!(
             lex(input,),
             Ok(vec![
-                (Token::Identifier(Identifier::new("a")), (0, 1)),
+                (Token::Identifier("a"), (0, 1)),
                 (Token::Equal, (2, 3)),
                 (Token::Integer(1), (4, 5)),
                 (Token::Plus, (6, 7)),
diff --git a/dust-lang/src/parse.rs b/dust-lang/src/parse.rs
index fc9e949..d15e8ca 100644
--- a/dust-lang/src/parse.rs
+++ b/dust-lang/src/parse.rs
@@ -6,8 +6,8 @@
 use std::collections::VecDeque;
 
 use crate::{
-    built_in_function::BuiltInFunction, AbstractSyntaxTree, LexError, Lexer, Node, Span, Statement,
-    Token, Value,
+    built_in_function::BuiltInFunction, token::TokenOwned, AbstractSyntaxTree, Identifier,
+    LexError, Lexer, Node, Span, Statement, Token, Value,
 };
 
 /// Parses the input into an abstract syntax tree.
@@ -40,8 +40,8 @@ use crate::{
 /// );
 /// ```
 pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
-    let lexer = Lexer::new(input);
-    let mut parser = Parser::new(lexer);
+    let lexer = Lexer::new();
+    let mut parser = Parser::new(input, lexer);
     let mut nodes = VecDeque::new();
 
     loop {
@@ -64,8 +64,8 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
 /// # use std::collections::VecDeque;
 /// # use dust_lang::*;
 /// let input = "x = 42";
-/// let lexer = Lexer::new(input);
-/// let mut parser = Parser::new(lexer);
+/// let lexer = Lexer::new();
+/// let mut parser = Parser::new(input, lexer);
 /// let mut nodes = VecDeque::new();
 ///
 /// loop {
@@ -98,16 +98,21 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
 /// );
 /// ```
 pub struct Parser<'src> {
-    lexer: Lexer<'src>,
-    current: (Token, Span),
+    source: &'src str,
+    lexer: Lexer,
+    current: (Token<'src>, Span),
 }
 
 impl<'src> Parser<'src> {
-    pub fn new(lexer: Lexer<'src>) -> Self {
+    pub fn new(source: &'src str, lexer: Lexer) -> Self {
         let mut lexer = lexer;
-        let current = lexer.next_token().unwrap_or((Token::Eof, (0, 0)));
+        let current = lexer.next_token(source).unwrap_or((Token::Eof, (0, 0)));
 
-        Parser { lexer, current }
+        Parser {
+            source,
+            lexer,
+            current,
+        }
     }
 
     pub fn parse(&mut self) -> Result<Node, ParseError> {
@@ -119,7 +124,7 @@ impl<'src> Parser<'src> {
     }
 
     fn next_token(&mut self) -> Result<(), ParseError> {
-        self.current = self.lexer.next_token()?;
+        self.current = self.lexer.next_token(self.source)?;
 
         Ok(())
     }
@@ -182,7 +187,7 @@ impl<'src> Parser<'src> {
     }
 
     fn parse_primary(&mut self) -> Result<Node, ParseError> {
-        match self.current.clone() {
+        match self.current {
             (Token::Boolean(boolean), span) => {
                 self.next_token()?;
 
                 Ok(Node::new(Statement::Constant(Value::boolean(boolean)), span))
             }
@@ -201,10 +206,13 @@ impl<'src> Parser<'src> {
                 Ok(Node::new(Statement::Constant(Value::integer(int)), span))
             }
-            (Token::Identifier(identifier), span) => {
+            (Token::Identifier(text), span) => {
                 self.next_token()?;
 
-                Ok(Node::new(Statement::Identifier(identifier), span))
+                Ok(Node::new(
+                    Statement::Identifier(Identifier::new(text)),
+                    span,
+                ))
             }
             (Token::String(string), span) => {
                 self.next_token()?;
@@ -222,7 +230,7 @@ impl<'src> Parser<'src> {
                     Ok(Node::new(node.statement, (left_span.0, right_span.1)))
                 } else {
                     Err(ParseError::ExpectedClosingParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                         span: self.current.1,
                     })
                 }
@@ -252,7 +260,7 @@ impl<'src> Parser<'src> {
                         nodes.push(instruction);
                     } else {
                         return Err(ParseError::ExpectedClosingSquareBrace {
-                            actual: self.current.0.clone(),
+                            actual: TokenOwned::from(self.current.0),
                             span: self.current.1,
                         });
                     }
@@ -277,7 +285,7 @@ impl<'src> Parser<'src> {
                     self.next_token()?;
                 } else {
                     return Err(ParseError::ExpectedOpeningParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                         span: self.current.1,
                     });
                 }
@@ -303,7 +311,7 @@ impl<'src> Parser<'src> {
                     }
                 } else {
                     return Err(ParseError::ExpectedClosingParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                         span: self.current.1,
                     });
                 }
@@ -318,7 +326,9 @@ impl<'src> Parser<'src> {
                     left_span,
                 ))
             }
-            _ => Err(ParseError::UnexpectedToken(self.current.0.clone())),
+            _ => Err(ParseError::UnexpectedToken(TokenOwned::from(
+                self.current.0,
+            ))),
         }
     }
 
@@ -335,11 +345,12 @@ impl<'src> Parser<'src> {
 
 #[derive(Debug, PartialEq, Clone)]
 pub enum ParseError {
-    ExpectedClosingParenthesis { actual: Token, span: Span },
-    ExpectedClosingSquareBrace { actual: Token, span: Span },
-    ExpectedOpeningParenthesis { actual: Token, span: Span },
     LexError(LexError),
-    UnexpectedToken(Token),
+
+    ExpectedClosingParenthesis { actual: TokenOwned, span: Span },
+    ExpectedClosingSquareBrace { actual: TokenOwned, span: Span },
+    ExpectedOpeningParenthesis { actual: TokenOwned, span: Span },
+    UnexpectedToken(TokenOwned),
 }
 
 impl From<LexError> for ParseError {
diff --git a/dust-lang/src/token.rs b/dust-lang/src/token.rs
index 9a897ee..25ea17a 100644
--- a/dust-lang/src/token.rs
+++ b/dust-lang/src/token.rs
@@ -2,13 +2,69 @@
 use std::fmt::{self, Display, Formatter};
 
 use serde::{Deserialize, Serialize};
 
-use crate::Identifier;
-
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
-pub enum Token {
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+pub enum Token<'src> {
     Eof,
 
-    Identifier(Identifier),
+    Identifier(&'src str),
+
+    // Hard-coded values
+    Boolean(bool),
+    Float(f64),
+    Integer(i64),
+    String(&'src str),
+
+    // Keywords
+    IsEven,
+    IsOdd,
+    Length,
+    ReadLine,
+    WriteLine,
+
+    // Symbols
+    Comma,
+    Dot,
+    Equal,
+    LeftParenthesis,
+    LeftSquareBrace,
+    Plus,
+    RightParenthesis,
+    RightSquareBrace,
+    Star,
+}
+
+impl<'src> Display for Token<'src> {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        match self {
+            Token::Eof => write!(f, "EOF"),
+            Token::Identifier(text) => write!(f, "{text}"),
+            Token::Boolean(boolean) => write!(f, "{boolean}"),
+            Token::Float(float) => write!(f, "{float}"),
+            Token::Integer(integer) => write!(f, "{integer}"),
+            Token::String(string) => write!(f, "{string}"),
+            Token::IsEven => write!(f, "is_even"),
+            Token::IsOdd => write!(f, "is_odd"),
+            Token::Length => write!(f, "length"),
+            Token::ReadLine => write!(f, "read_line"),
+            Token::WriteLine => write!(f, "write_line"),
+            Token::Comma => write!(f, ","),
+            Token::Dot => write!(f, "."),
+            Token::Equal => write!(f, "="),
+            Token::Plus => write!(f, "+"),
+            Token::Star => write!(f, "*"),
+            Token::LeftParenthesis => write!(f, "("),
+            Token::RightParenthesis => write!(f, ")"),
+            Token::LeftSquareBrace => write!(f, "["),
+            Token::RightSquareBrace => write!(f, "]"),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
+pub enum TokenOwned {
+    Eof,
+
+    Identifier(String),
 
     // Hard-coded values
     Boolean(bool),
     Float(f64),
     Integer(i64),
     String(String),
 
     // Keywords
     IsEven,
     IsOdd,
     Length,
     ReadLine,
     WriteLine,
 
     // Symbols
     Comma,
     Dot,
     Equal,
     LeftParenthesis,
     LeftSquareBrace,
     Plus,
     RightParenthesis,
     RightSquareBrace,
     Star,
 }
 
-impl Display for Token {
-    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        match self {
-            Token::Eof => write!(f, "EOF"),
-            Token::Identifier(identifier) => write!(f, "{identifier}"),
-            Token::Boolean(boolean) => write!(f, "{boolean}"),
-            Token::Float(float) => write!(f, "{float}"),
-            Token::Integer(integer) => write!(f, "{integer}"),
-            Token::String(string) => write!(f, "{string}"),
-            Token::IsEven => write!(f, "is_even"),
-            Token::IsOdd => write!(f, "is_odd"),
-            Token::Length => write!(f, "length"),
-            Token::ReadLine => write!(f, "read_line"),
-            Token::WriteLine => write!(f, "write_line"),
-            Token::Comma => write!(f, ","),
-            Token::Dot => write!(f, "."),
-            Token::Equal => write!(f, "="),
-            Token::Plus => write!(f, "+"),
-            Token::Star => write!(f, "*"),
-            Token::LeftParenthesis => write!(f, "("),
-            Token::RightParenthesis => write!(f, ")"),
-            Token::LeftSquareBrace => write!(f, "["),
-            Token::RightSquareBrace => write!(f, "]"),
+impl<'str> From<Token<'str>> for TokenOwned {
+    fn from(token: Token<'str>) -> Self {
+        match token {
+            Token::Eof => TokenOwned::Eof,
+            Token::Identifier(text) => TokenOwned::Identifier(text.to_string()),
+            Token::Boolean(boolean) => TokenOwned::Boolean(boolean),
+            Token::Float(float) => TokenOwned::Float(float),
+            Token::Integer(integer) => TokenOwned::Integer(integer),
+            Token::String(text) => TokenOwned::String(text.to_string()),
+            Token::IsEven => TokenOwned::IsEven,
+            Token::IsOdd => TokenOwned::IsOdd,
+            Token::Length => TokenOwned::Length,
+            Token::ReadLine => TokenOwned::ReadLine,
+            Token::WriteLine => TokenOwned::WriteLine,
+            Token::Comma => TokenOwned::Comma,
+            Token::Dot => TokenOwned::Dot,
+            Token::Equal => TokenOwned::Equal,
+            Token::Plus => TokenOwned::Plus,
+            Token::Star => TokenOwned::Star,
+            Token::LeftParenthesis => TokenOwned::LeftParenthesis,
+            Token::RightParenthesis => TokenOwned::RightParenthesis,
+            Token::LeftSquareBrace => TokenOwned::LeftSquareBrace,
+            Token::RightSquareBrace => TokenOwned::RightSquareBrace,
         }
     }
 }
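
A short usage sketch of the pattern this patch lands (illustrative only, not
code from the diff): `Token<'src>` borrows its text from the source and is
`Copy`, so the lexer never allocates, while `ParseError` stores a `TokenOwned`
built via `TokenOwned::from`, so errors can outlive the source they were
produced from. The function below is hypothetical, written against the
signatures added above, and assumes `lex`, `parse`, `Token`, and `ParseError`
are re-exported at the crate root as the doctests suggest:

    use dust_lang::{lex, parse, ParseError, Token};

    // Hypothetical helper: returns a ParseError that outlives `source`.
    fn error_outlives_source() -> Option<ParseError> {
        // The source string lives only inside this function.
        let source = String::from("write_line(");

        // Borrowed tokens: `Token<'_>` holds &str slices into `source`,
        // so this Vec cannot be returned from the function...
        let tokens: Vec<(Token<'_>, _)> = lex(&source).ok()?;
        assert!(matches!(tokens.first(), Some((Token::WriteLine, _))));

        // ...but the parser converts the offending borrowed token into a
        // TokenOwned inside its error, which is free to escape this scope.
        parse(&source).err()
    }

The split mirrors the usual `&str`/`String` pairing: the hot path borrows,
and only the error path pays for an allocation.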