Borrow string slices instead of copying them in lexing

2024-08-08 16:19:14 -04:00 · 2024-08-08 16:19:14 -04:00 · bf519ec087
commit bf519ec087
parent 1c24286696
3 changed files with 194 additions and 133 deletions
--- a/dust-lang/src/lex.rs
+++ b/dust-lang/src/lex.rs
@ -5,7 +5,7 @@
 //! - [`Lexer`], which lexes the input a token at a time
 use std::num::{ParseFloatError, ParseIntError};
-use crate::{Identifier, Span, Token};
+use crate::{Span, Token};
 /// Lexes the input and returns a vector of tokens and their positions.
 ///
@ -18,7 +18,7 @@ use crate::{Identifier, Span, Token};
 /// assert_eq!(
 ///     tokens,
 ///     [
-///         (Token::Identifier(Identifier::new("x")), (0, 1)),
+///         (Token::Identifier("x"), (0, 1)),
 ///         (Token::Equal, (2, 3)),
 ///         (Token::Integer(1), (4, 5)),
 ///         (Token::Plus, (6, 7)),
@ -27,12 +27,12 @@ use crate::{Identifier, Span, Token};
 ///     ]
 /// );
 /// ```
-pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
+pub fn lex<'chars, 'src: 'chars>(input: &'src str) -> Result<Vec<(Token<'chars>, Span)>, LexError> {
-    let mut lexer = Lexer::new(input);
+    let mut lexer = Lexer::new();
    let mut tokens = Vec::new();
    loop {
-        let (token, span) = lexer.next_token()?;
+        let (token, span) = lexer.next_token(input)?;
        let is_eof = matches!(token, Token::Eof);
        tokens.push((token, span));
@ -52,11 +52,11 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 /// ```
 /// # use dust_lang::*;
 /// let input = "x = 1 + 2";
-/// let mut lexer = Lexer::new(input);
+/// let mut lexer = Lexer::new();
 /// let mut tokens = Vec::new();
 ///
 /// loop {
-///     let (token, span) = lexer.next_token().unwrap();
+///     let (token, span) = lexer.next_token(input).unwrap();
 ///     let is_eof = matches!(token, Token::Eof);
 ///
 ///     tokens.push((token, span));
@ -69,7 +69,7 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 /// assert_eq!(
 ///     tokens,
 ///     [
-///         (Token::Identifier(Identifier::new("x")), (0, 1)),
+///         (Token::Identifier("x"), (0, 1)),
 ///         (Token::Equal, (2, 3)),
 ///         (Token::Integer(1), (4, 5)),
 ///         (Token::Plus, (6, 7)),
@ -78,38 +78,26 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
 ///     ]
 /// )
 /// ```
-pub struct Lexer<'a> {
+pub struct Lexer {
    source: &'a str,
    position: usize,
 }
-impl<'a> Lexer<'a> {
+impl Lexer {
    /// Create a new lexer for the given input.
-    pub fn new(input: &'a str) -> Self {
+    pub fn new() -> Self {
-        Lexer {
+        Lexer { position: 0 }
            source: input,
            position: 0,
        }
    }
    /// Progress to the next character.
    fn next_char(&mut self) -> Option<char> {
        self.source[self.position..].chars().next().map(|c| {
            self.position += c.len_utf8();
            c
        })
    }
    /// Produce the next token.
-    pub fn next_token(&mut self) -> Result<(Token, Span), LexError> {
+    pub fn next_token<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
-        self.skip_whitespace();
+        self.skip_whitespace(source);
-        let (token, span) = if let Some(c) = self.peek_char() {
+        let (token, span) = if let Some(c) = self.peek_char(source) {
            match c {
-                '0'..='9' => self.lex_number()?,
+                '0'..='9' => self.lex_number(source)?,
-                'a'..='z' | 'A'..='Z' => self.lex_alphabetical()?,
+                'a'..='z' | 'A'..='Z' => self.lex_alphabetical(source)?,
-                '"' => self.lex_string('"')?,
+                '"' => self.lex_string('"', source)?,
-                '\'' => self.lex_string('\'')?,
+                '\'' => self.lex_string('\'', source)?,
                '+' => {
                    self.position += 1;
                    (Token::Plus, (self.position - 1, self.position))
@ -155,11 +143,19 @@ impl<'a> Lexer<'a> {
        Ok((token, span))
    }
    /// Progress to the next character.
    fn next_char(&mut self, source: &str) -> Option<char> {
        source[self.position..].chars().next().map(|c| {
            self.position += c.len_utf8();
            c
        })
    }
    /// Skip whitespace characters.
-    fn skip_whitespace(&mut self) {
+    fn skip_whitespace(&mut self, source: &str) {
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
            if c.is_whitespace() {
-                self.next_char();
+                self.next_char(source);
            } else {
                break;
            }
@ -167,44 +163,31 @@ impl<'a> Lexer<'a> {
    }
    /// Peek at the next character without consuming it.
-    fn peek_char(&self) -> Option<char> {
+    fn peek_char(&self, source: &str) -> Option<char> {
-        self.source[self.position..].chars().next()
+        source[self.position..].chars().next()
    }
    /// Peek at the second-to-next character without consuming it.
-    fn peek_second_char(&self) -> Option<char> {
+    fn peek_second_char(&self, source: &str) -> Option<char> {
-        self.source[self.position..].chars().nth(1)
+        source[self.position..].chars().nth(1)
    }
    fn _peek_until_whitespace(&self) -> Option<&str> {
        let start = self.position;
        let end = self.source[self.position..]
            .find(char::is_whitespace)
            .map(|i| i + start);
        if let Some(end) = end {
            Some(&self.source[start..end])
        } else {
            None
        }
    }
    /// Lex an integer or float token.
-    fn lex_number(&mut self) -> Result<(Token, Span), LexError> {
+    fn lex_number<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
        let start_pos = self.position;
        let mut is_float = false;
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
            if c == '.' {
-                if let Some('0'..='9') = self.peek_second_char() {
+                if let Some('0'..='9') = self.peek_second_char(source) {
                    if !is_float {
-                        self.next_char();
+                        self.next_char(source);
                    }
-                    self.next_char();
+                    self.next_char(source);
-                    while let Some('0'..='9') = self.peek_char() {
+                    while let Some('0'..='9') = self.peek_char(source) {
-                        self.next_char();
+                        self.next_char(source);
                    }
                    is_float = true;
@ -214,36 +197,39 @@ impl<'a> Lexer<'a> {
            }
            if c.is_ascii_digit() {
-                self.next_char();
+                self.next_char(source);
            } else {
                break;
            }
        }
        if is_float {
-            let float = self.source[start_pos..self.position].parse::<f64>()?;
+            let float = source[start_pos..self.position].parse::<f64>()?;
            Ok((Token::Float(float), (start_pos, self.position)))
        } else {
-            let integer = self.source[start_pos..self.position].parse::<i64>()?;
+            let integer = source[start_pos..self.position].parse::<i64>()?;
            Ok((Token::Integer(integer), (start_pos, self.position)))
        }
    }
    /// Lex an identifier token.
-    fn lex_alphabetical(&mut self) -> Result<(Token, Span), LexError> {
+    fn lex_alphabetical<'src>(
        &mut self,
        source: &'src str,
    ) -> Result<(Token<'src>, Span), LexError> {
        let start_pos = self.position;
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
            if c.is_ascii_alphanumeric() || c == '_' {
-                self.next_char();
+                self.next_char(source);
            } else {
                break;
            }
        }
-        let string = &self.source[start_pos..self.position];
+        let string = &source[start_pos..self.position];
        let token = match string {
            "true" => Token::Boolean(true),
            "false" => Token::Boolean(false),
@ -252,31 +238,39 @@ impl<'a> Lexer<'a> {
            "length" => Token::Length,
            "read_line" => Token::ReadLine,
            "write_line" => Token::WriteLine,
-            _ => Token::Identifier(Identifier::new(string)),
+            _ => Token::Identifier(string),
        };
        Ok((token, (start_pos, self.position)))
    }
-    fn lex_string(&mut self, delimiter: char) -> Result<(Token, Span), LexError> {
+    fn lex_string<'src>(
        &mut self,
        delimiter: char,
        source: &'src str,
    ) -> Result<(Token<'src>, Span), LexError> {
        let start_pos = self.position;
-        self.next_char();
+        self.next_char(source);
-        while let Some(c) = self.peek_char() {
+        while let Some(c) = self.peek_char(source) {
            if c == delimiter {
-                self.next_char();
+                self.next_char(source);
                break;
            } else {
-                self.next_char();
+                self.next_char(source);
            }
        }
-        let string = &self.source[start_pos + 1..self.position - 1];
+        let text = &source[start_pos + 1..self.position - 1];
-        Ok((
+
-            Token::String(string.to_string()),
+        Ok((Token::String(text), (start_pos, self.position)))
-            (start_pos, self.position),
+    }
-        ))
+}
 impl Default for Lexer {
    fn default() -> Self {
        Self::new()
    }
 }
@ -326,7 +320,7 @@ mod tests {
            Ok(vec![
                (Token::WriteLine, (0, 10)),
                (Token::LeftParenthesis, (10, 11)),
-                (Token::String("Hello, world!".to_string()), (11, 26)),
+                (Token::String("Hello, world!"), (11, 26)),
                (Token::RightParenthesis, (26, 27)),
                (Token::Eof, (27, 27)),
            ])
@ -340,9 +334,9 @@ mod tests {
        assert_eq!(
            lex(input),
            Ok(vec![
-                (Token::String("Hello, ".to_string()), (0, 9)),
+                (Token::String("Hello, "), (0, 9)),
                (Token::Plus, (10, 11)),
-                (Token::String("world!".to_string()), (12, 20)),
+                (Token::String("world!"), (12, 20)),
                (Token::Eof, (20, 20)),
            ])
        )
@ -355,7 +349,7 @@ mod tests {
        assert_eq!(
            lex(input),
            Ok(vec![
-                (Token::String("Hello, world!".to_string()), (0, 15)),
+                (Token::String("Hello, world!"), (0, 15)),
                (Token::Eof, (15, 15)),
            ])
        )
@ -507,7 +501,7 @@ mod tests {
        assert_eq!(
            lex(input,),
            Ok(vec![
-                (Token::Identifier(Identifier::new("a")), (0, 1)),
+                (Token::Identifier("a"), (0, 1)),
                (Token::Equal, (2, 3)),
                (Token::Integer(1), (4, 5)),
                (Token::Plus, (6, 7)),
--- a/dust-lang/src/parse.rs
+++ b/dust-lang/src/parse.rs
@ -6,8 +6,8 @@
 use std::collections::VecDeque;
 use crate::{
-    built_in_function::BuiltInFunction, AbstractSyntaxTree, LexError, Lexer, Node, Span, Statement,
+    built_in_function::BuiltInFunction, token::TokenOwned, AbstractSyntaxTree, Identifier,
-    Token, Value,
+    LexError, Lexer, Node, Span, Statement, Token, Value,
 };
 /// Parses the input into an abstract syntax tree.
@ -40,8 +40,8 @@ use crate::{
 /// );
 /// ```
 pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
-    let lexer = Lexer::new(input);
+    let lexer = Lexer::new();
-    let mut parser = Parser::new(lexer);
+    let mut parser = Parser::new(input, lexer);
    let mut nodes = VecDeque::new();
    loop {
@ -64,8 +64,8 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
 /// # use std::collections::VecDeque;
 /// # use dust_lang::*;
 /// let input = "x = 42";
-/// let lexer = Lexer::new(input);
+/// let lexer = Lexer::new();
-/// let mut parser = Parser::new(lexer);
+/// let mut parser = Parser::new(input, lexer);
 /// let mut nodes = VecDeque::new();
 ///
 /// loop {
@ -98,16 +98,21 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
 /// );
 /// ```
 pub struct Parser<'src> {
-    lexer: Lexer<'src>,
+    source: &'src str,
-    current: (Token, Span),
+    lexer: Lexer,
    current: (Token<'src>, Span),
 }
 impl<'src> Parser<'src> {
-    pub fn new(lexer: Lexer<'src>) -> Self {
+    pub fn new(source: &'src str, lexer: Lexer) -> Self {
        let mut lexer = lexer;
-        let current = lexer.next_token().unwrap_or((Token::Eof, (0, 0)));
+        let current = lexer.next_token(source).unwrap_or((Token::Eof, (0, 0)));
-        Parser { lexer, current }
+        Parser {
            source,
            lexer,
            current,
        }
    }
    pub fn parse(&mut self) -> Result<Node, ParseError> {
@ -119,7 +124,7 @@ impl<'src> Parser<'src> {
    }
    fn next_token(&mut self) -> Result<(), ParseError> {
-        self.current = self.lexer.next_token()?;
+        self.current = self.lexer.next_token(self.source)?;
        Ok(())
    }
@ -182,7 +187,7 @@ impl<'src> Parser<'src> {
    }
    fn parse_primary(&mut self) -> Result<Node, ParseError> {
-        match self.current.clone() {
+        match self.current {
            (Token::Boolean(boolean), span) => {
                self.next_token()?;
@ -201,10 +206,13 @@ impl<'src> Parser<'src> {
                Ok(Node::new(Statement::Constant(Value::integer(int)), span))
            }
-            (Token::Identifier(identifier), span) => {
+            (Token::Identifier(text), span) => {
                self.next_token()?;
-                Ok(Node::new(Statement::Identifier(identifier), span))
+                Ok(Node::new(
                    Statement::Identifier(Identifier::new(text)),
                    span,
                ))
            }
            (Token::String(string), span) => {
                self.next_token()?;
@ -222,7 +230,7 @@ impl<'src> Parser<'src> {
                    Ok(Node::new(node.statement, (left_span.0, right_span.1)))
                } else {
                    Err(ParseError::ExpectedClosingParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                        span: self.current.1,
                    })
                }
@ -252,7 +260,7 @@ impl<'src> Parser<'src> {
                        nodes.push(instruction);
                    } else {
                        return Err(ParseError::ExpectedClosingSquareBrace {
-                            actual: self.current.0.clone(),
+                            actual: TokenOwned::from(self.current.0),
                            span: self.current.1,
                        });
                    }
@ -277,7 +285,7 @@ impl<'src> Parser<'src> {
                    self.next_token()?;
                } else {
                    return Err(ParseError::ExpectedOpeningParenthesis {
-                        actual: self.current.0.clone(),
+                        actual: TokenOwned::from(self.current.0),
                        span: self.current.1,
                    });
                }
@ -303,7 +311,7 @@ impl<'src> Parser<'src> {
                        }
                    } else {
                        return Err(ParseError::ExpectedClosingParenthesis {
-                            actual: self.current.0.clone(),
+                            actual: TokenOwned::from(self.current.0),
                            span: self.current.1,
                        });
                    }
@ -318,7 +326,9 @@ impl<'src> Parser<'src> {
                    left_span,
                ))
            }
-            _ => Err(ParseError::UnexpectedToken(self.current.0.clone())),
+            _ => Err(ParseError::UnexpectedToken(TokenOwned::from(
                self.current.0,
            ))),
        }
    }
@ -335,11 +345,12 @@ impl<'src> Parser<'src> {
 #[derive(Debug, PartialEq, Clone)]
 pub enum ParseError {
    ExpectedClosingParenthesis { actual: Token, span: Span },
    ExpectedClosingSquareBrace { actual: Token, span: Span },
    ExpectedOpeningParenthesis { actual: Token, span: Span },
    LexError(LexError),
-    UnexpectedToken(Token),
+
    ExpectedClosingParenthesis { actual: TokenOwned, span: Span },
    ExpectedClosingSquareBrace { actual: TokenOwned, span: Span },
    ExpectedOpeningParenthesis { actual: TokenOwned, span: Span },
    UnexpectedToken(TokenOwned),
 }
 impl From<LexError> for ParseError {
--- a/dust-lang/src/token.rs
+++ b/dust-lang/src/token.rs
@ -2,13 +2,69 @@ use std::fmt::{self, Display, Formatter};
 use serde::{Deserialize, Serialize};
-use crate::Identifier;
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-
+pub enum Token<'src> {
 #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
 pub enum Token {
    Eof,
-    Identifier(Identifier),
+    Identifier(&'src str),
    // Hard-coded values
    Boolean(bool),
    Float(f64),
    Integer(i64),
    String(&'src str),
    // Keywords
    IsEven,
    IsOdd,
    Length,
    ReadLine,
    WriteLine,
    // Symbols
    Comma,
    Dot,
    Equal,
    LeftParenthesis,
    LeftSquareBrace,
    Plus,
    RightParenthesis,
    RightSquareBrace,
    Star,
 }
 impl<'src> Display for Token<'src> {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        match self {
            Token::Eof => write!(f, "EOF"),
            Token::Identifier(text) => write!(f, "{text}"),
            Token::Boolean(boolean) => write!(f, "{boolean}"),
            Token::Float(float) => write!(f, "{float}"),
            Token::Integer(integer) => write!(f, "{integer}"),
            Token::String(string) => write!(f, "{string}"),
            Token::IsEven => write!(f, "is_even"),
            Token::IsOdd => write!(f, "is_odd"),
            Token::Length => write!(f, "length"),
            Token::ReadLine => write!(f, "read_line"),
            Token::WriteLine => write!(f, "write_line"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::Equal => write!(f, "="),
            Token::Plus => write!(f, "+"),
            Token::Star => write!(f, "*"),
            Token::LeftParenthesis => write!(f, "("),
            Token::RightParenthesis => write!(f, ")"),
            Token::LeftSquareBrace => write!(f, "["),
            Token::RightSquareBrace => write!(f, "]"),
        }
    }
 }
 #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
 pub enum TokenOwned {
    Eof,
    Identifier(String),
    // Hard-coded values
    Boolean(bool),
@ -35,29 +91,29 @@ pub enum Token {
    Star,
 }
-impl Display for Token {
+impl<'str> From<Token<'str>> for TokenOwned {
-    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+    fn from(token: Token<'str>) -> Self {
-        match self {
+        match token {
-            Token::Eof => write!(f, "EOF"),
+            Token::Eof => TokenOwned::Eof,
-            Token::Identifier(identifier) => write!(f, "{identifier}"),
+            Token::Identifier(text) => TokenOwned::Identifier(text.to_string()),
-            Token::Boolean(boolean) => write!(f, "{boolean}"),
+            Token::Boolean(boolean) => TokenOwned::Boolean(boolean),
-            Token::Float(float) => write!(f, "{float}"),
+            Token::Float(float) => TokenOwned::Float(float),
-            Token::Integer(integer) => write!(f, "{integer}"),
+            Token::Integer(integer) => TokenOwned::Integer(integer),
-            Token::String(string) => write!(f, "{string}"),
+            Token::String(text) => TokenOwned::String(text.to_string()),
-            Token::IsEven => write!(f, "is_even"),
+            Token::IsEven => TokenOwned::IsEven,
-            Token::IsOdd => write!(f, "is_odd"),
+            Token::IsOdd => TokenOwned::IsOdd,
-            Token::Length => write!(f, "length"),
+            Token::Length => TokenOwned::Length,
-            Token::ReadLine => write!(f, "read_line"),
+            Token::ReadLine => TokenOwned::ReadLine,
-            Token::WriteLine => write!(f, "write_line"),
+            Token::WriteLine => TokenOwned::WriteLine,
-            Token::Comma => write!(f, ","),
+            Token::Comma => TokenOwned::Comma,
-            Token::Dot => write!(f, "."),
+            Token::Dot => TokenOwned::Dot,
-            Token::Equal => write!(f, "="),
+            Token::Equal => TokenOwned::Equal,
-            Token::Plus => write!(f, "+"),
+            Token::Plus => TokenOwned::Plus,
-            Token::Star => write!(f, "*"),
+            Token::Star => TokenOwned::Star,
-            Token::LeftParenthesis => write!(f, "("),
+            Token::LeftParenthesis => TokenOwned::LeftParenthesis,
-            Token::RightParenthesis => write!(f, ")"),
+            Token::RightParenthesis => TokenOwned::RightParenthesis,
-            Token::LeftSquareBrace => write!(f, "["),
+            Token::LeftSquareBrace => TokenOwned::LeftSquareBrace,
-            Token::RightSquareBrace => write!(f, "]"),
+            Token::RightSquareBrace => TokenOwned::RightSquareBrace,
        }
    }
 }