Borrow string slices instead of copying them in lexing

This commit is contained in:
Jeff 2024-08-08 16:19:14 -04:00
parent 1c24286696
commit bf519ec087
3 changed files with 194 additions and 133 deletions

View File

@ -5,7 +5,7 @@
//! - [`Lexer`], which lexes the input a token at a time
use std::num::{ParseFloatError, ParseIntError};
use crate::{Identifier, Span, Token};
use crate::{Span, Token};
/// Lexes the input and returns a vector of tokens and their positions.
///
@ -18,7 +18,7 @@ use crate::{Identifier, Span, Token};
/// assert_eq!(
/// tokens,
/// [
/// (Token::Identifier(Identifier::new("x")), (0, 1)),
/// (Token::Identifier("x"), (0, 1)),
/// (Token::Equal, (2, 3)),
/// (Token::Integer(1), (4, 5)),
/// (Token::Plus, (6, 7)),
@ -27,12 +27,12 @@ use crate::{Identifier, Span, Token};
/// ]
/// );
/// ```
pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
let mut lexer = Lexer::new(input);
pub fn lex<'chars, 'src: 'chars>(input: &'src str) -> Result<Vec<(Token<'chars>, Span)>, LexError> {
let mut lexer = Lexer::new();
let mut tokens = Vec::new();
loop {
let (token, span) = lexer.next_token()?;
let (token, span) = lexer.next_token(input)?;
let is_eof = matches!(token, Token::Eof);
tokens.push((token, span));
@ -52,11 +52,11 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
/// ```
/// # use dust_lang::*;
/// let input = "x = 1 + 2";
/// let mut lexer = Lexer::new(input);
/// let mut lexer = Lexer::new();
/// let mut tokens = Vec::new();
///
/// loop {
/// let (token, span) = lexer.next_token().unwrap();
/// let (token, span) = lexer.next_token(input).unwrap();
/// let is_eof = matches!(token, Token::Eof);
///
/// tokens.push((token, span));
@ -69,7 +69,7 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
/// assert_eq!(
/// tokens,
/// [
/// (Token::Identifier(Identifier::new("x")), (0, 1)),
/// (Token::Identifier("x"), (0, 1)),
/// (Token::Equal, (2, 3)),
/// (Token::Integer(1), (4, 5)),
/// (Token::Plus, (6, 7)),
@ -78,38 +78,26 @@ pub fn lex(input: &str) -> Result<Vec<(Token, Span)>, LexError> {
/// ]
/// )
/// ```
pub struct Lexer<'a> {
source: &'a str,
pub struct Lexer {
/// Byte offset into the source text at which the next character will be read.
position: usize,
}
impl<'a> Lexer<'a> {
impl Lexer {
/// Create a new lexer whose position starts at the beginning of the input (byte offset 0).
pub fn new(input: &'a str) -> Self {
Lexer {
source: input,
position: 0,
}
}
/// Progress to the next character, returning the character that was consumed.
fn next_char(&mut self) -> Option<char> {
self.source[self.position..].chars().next().map(|c| {
self.position += c.len_utf8();
c
})
pub fn new() -> Self {
Lexer { position: 0 }
}
/// Produce the next token.
pub fn next_token(&mut self) -> Result<(Token, Span), LexError> {
self.skip_whitespace();
pub fn next_token<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
self.skip_whitespace(source);
let (token, span) = if let Some(c) = self.peek_char() {
let (token, span) = if let Some(c) = self.peek_char(source) {
match c {
'0'..='9' => self.lex_number()?,
'a'..='z' | 'A'..='Z' => self.lex_alphabetical()?,
'"' => self.lex_string('"')?,
'\'' => self.lex_string('\'')?,
'0'..='9' => self.lex_number(source)?,
'a'..='z' | 'A'..='Z' => self.lex_alphabetical(source)?,
'"' => self.lex_string('"', source)?,
'\'' => self.lex_string('\'', source)?,
'+' => {
self.position += 1;
(Token::Plus, (self.position - 1, self.position))
@ -155,11 +143,19 @@ impl<'a> Lexer<'a> {
Ok((token, span))
}
/// Consume the character at the current position and return it.
///
/// `position` is advanced by the UTF-8 length of the consumed character. When nothing
/// remains in `source` past `position`, `None` is returned and `position` is unchanged.
fn next_char(&mut self, source: &str) -> Option<char> {
    let consumed = source[self.position..].chars().next()?;
    self.position += consumed.len_utf8();

    Some(consumed)
}
/// Skip whitespace characters.
fn skip_whitespace(&mut self) {
while let Some(c) = self.peek_char() {
fn skip_whitespace(&mut self, source: &str) {
while let Some(c) = self.peek_char(source) {
if c.is_whitespace() {
self.next_char();
self.next_char(source);
} else {
break;
}
@ -167,44 +163,31 @@ impl<'a> Lexer<'a> {
}
/// Peek at the next character without consuming it.
///
/// Returns `None` when no characters remain at or after the current position.
fn peek_char(&self) -> Option<char> {
self.source[self.position..].chars().next()
fn peek_char(&self, source: &str) -> Option<char> {
source[self.position..].chars().next()
}
/// Peek at the second-to-next character without consuming it.
///
/// Returns `None` when fewer than two characters remain from the current position.
fn peek_second_char(&self) -> Option<char> {
self.source[self.position..].chars().nth(1)
}
// Returns the text from the current position up to (not including) the next
// whitespace character, or `None` when no whitespace follows.
fn _peek_until_whitespace(&self) -> Option<&str> {
let start = self.position;
let end = self.source[self.position..]
.find(char::is_whitespace)
.map(|i| i + start);
if let Some(end) = end {
Some(&self.source[start..end])
} else {
None
}
fn peek_second_char(&self, source: &str) -> Option<char> {
source[self.position..].chars().nth(1)
}
/// Lex an integer or float token.
fn lex_number(&mut self) -> Result<(Token, Span), LexError> {
fn lex_number<'src>(&mut self, source: &'src str) -> Result<(Token<'src>, Span), LexError> {
let start_pos = self.position;
let mut is_float = false;
while let Some(c) = self.peek_char() {
while let Some(c) = self.peek_char(source) {
if c == '.' {
if let Some('0'..='9') = self.peek_second_char() {
if let Some('0'..='9') = self.peek_second_char(source) {
if !is_float {
self.next_char();
self.next_char(source);
}
self.next_char();
self.next_char(source);
while let Some('0'..='9') = self.peek_char() {
self.next_char();
while let Some('0'..='9') = self.peek_char(source) {
self.next_char(source);
}
is_float = true;
@ -214,36 +197,39 @@ impl<'a> Lexer<'a> {
}
if c.is_ascii_digit() {
self.next_char();
self.next_char(source);
} else {
break;
}
}
if is_float {
let float = self.source[start_pos..self.position].parse::<f64>()?;
let float = source[start_pos..self.position].parse::<f64>()?;
Ok((Token::Float(float), (start_pos, self.position)))
} else {
let integer = self.source[start_pos..self.position].parse::<i64>()?;
let integer = source[start_pos..self.position].parse::<i64>()?;
Ok((Token::Integer(integer), (start_pos, self.position)))
}
}
/// Lex an identifier token.
fn lex_alphabetical(&mut self) -> Result<(Token, Span), LexError> {
fn lex_alphabetical<'src>(
&mut self,
source: &'src str,
) -> Result<(Token<'src>, Span), LexError> {
let start_pos = self.position;
while let Some(c) = self.peek_char() {
while let Some(c) = self.peek_char(source) {
if c.is_ascii_alphanumeric() || c == '_' {
self.next_char();
self.next_char(source);
} else {
break;
}
}
let string = &self.source[start_pos..self.position];
let string = &source[start_pos..self.position];
let token = match string {
"true" => Token::Boolean(true),
"false" => Token::Boolean(false),
@ -252,31 +238,39 @@ impl<'a> Lexer<'a> {
"length" => Token::Length,
"read_line" => Token::ReadLine,
"write_line" => Token::WriteLine,
_ => Token::Identifier(Identifier::new(string)),
_ => Token::Identifier(string),
};
Ok((token, (start_pos, self.position)))
}
/// Lex a string token enclosed by `delimiter`.
///
/// The opening delimiter is consumed first, then characters are consumed up to and
/// including the next occurrence of `delimiter`. The token text excludes both
/// delimiters, while the returned span includes them.
///
/// NOTE(review): the loop also ends when the input runs out before a closing delimiter
/// is seen. The final slice still stops one byte before the current position, so an
/// unterminated string silently loses its last character, and an input consisting of
/// only the opening delimiter slices `1..0`, which panics. Consider returning a
/// `LexError` for an unterminated string instead.
fn lex_string(&mut self, delimiter: char) -> Result<(Token, Span), LexError> {
fn lex_string<'src>(
&mut self,
delimiter: char,
source: &'src str,
) -> Result<(Token<'src>, Span), LexError> {
let start_pos = self.position;
// Step over the opening delimiter.
self.next_char();
self.next_char(source);
while let Some(c) = self.peek_char() {
while let Some(c) = self.peek_char(source) {
if c == delimiter {
self.next_char();
self.next_char(source);
break;
} else {
self.next_char();
self.next_char(source);
}
}
// Strip the delimiter on each side to get the string's contents.
let string = &self.source[start_pos + 1..self.position - 1];
Ok((
Token::String(string.to_string()),
(start_pos, self.position),
))
let text = &source[start_pos + 1..self.position - 1];
Ok((Token::String(text), (start_pos, self.position)))
}
}
impl Default for Lexer {
fn default() -> Self {
Self::new()
}
}
@ -326,7 +320,7 @@ mod tests {
Ok(vec![
(Token::WriteLine, (0, 10)),
(Token::LeftParenthesis, (10, 11)),
(Token::String("Hello, world!".to_string()), (11, 26)),
(Token::String("Hello, world!"), (11, 26)),
(Token::RightParenthesis, (26, 27)),
(Token::Eof, (27, 27)),
])
@ -340,9 +334,9 @@ mod tests {
assert_eq!(
lex(input),
Ok(vec![
(Token::String("Hello, ".to_string()), (0, 9)),
(Token::String("Hello, "), (0, 9)),
(Token::Plus, (10, 11)),
(Token::String("world!".to_string()), (12, 20)),
(Token::String("world!"), (12, 20)),
(Token::Eof, (20, 20)),
])
)
@ -355,7 +349,7 @@ mod tests {
assert_eq!(
lex(input),
Ok(vec![
(Token::String("Hello, world!".to_string()), (0, 15)),
(Token::String("Hello, world!"), (0, 15)),
(Token::Eof, (15, 15)),
])
)
@ -507,7 +501,7 @@ mod tests {
assert_eq!(
lex(input,),
Ok(vec![
(Token::Identifier(Identifier::new("a")), (0, 1)),
(Token::Identifier("a"), (0, 1)),
(Token::Equal, (2, 3)),
(Token::Integer(1), (4, 5)),
(Token::Plus, (6, 7)),

View File

@ -6,8 +6,8 @@
use std::collections::VecDeque;
use crate::{
built_in_function::BuiltInFunction, AbstractSyntaxTree, LexError, Lexer, Node, Span, Statement,
Token, Value,
built_in_function::BuiltInFunction, token::TokenOwned, AbstractSyntaxTree, Identifier,
LexError, Lexer, Node, Span, Statement, Token, Value,
};
/// Parses the input into an abstract syntax tree.
@ -40,8 +40,8 @@ use crate::{
/// );
/// ```
pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
let lexer = Lexer::new(input);
let mut parser = Parser::new(lexer);
let lexer = Lexer::new();
let mut parser = Parser::new(input, lexer);
let mut nodes = VecDeque::new();
loop {
@ -64,8 +64,8 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
/// # use std::collections::VecDeque;
/// # use dust_lang::*;
/// let input = "x = 42";
/// let lexer = Lexer::new(input);
/// let mut parser = Parser::new(lexer);
/// let lexer = Lexer::new();
/// let mut parser = Parser::new(input, lexer);
/// let mut nodes = VecDeque::new();
///
/// loop {
@ -98,16 +98,21 @@ pub fn parse(input: &str) -> Result<AbstractSyntaxTree, ParseError> {
/// );
/// ```
pub struct Parser<'src> {
lexer: Lexer<'src>,
current: (Token, Span),
/// Source text passed to the lexer each time a token is requested.
source: &'src str,
/// Lexer that tracks the read position within `source`.
lexer: Lexer,
/// The most recently lexed token and its span.
current: (Token<'src>, Span),
}
impl<'src> Parser<'src> {
/// Create a parser and prime it with the first token from the lexer.
///
/// NOTE(review): if lexing the first token fails, the error is discarded and the
/// parser starts with `Token::Eof` at span `(0, 0)` instead of reporting it.
pub fn new(lexer: Lexer<'src>) -> Self {
pub fn new(source: &'src str, lexer: Lexer) -> Self {
let mut lexer = lexer;
let current = lexer.next_token().unwrap_or((Token::Eof, (0, 0)));
let current = lexer.next_token(source).unwrap_or((Token::Eof, (0, 0)));
Parser { lexer, current }
Parser {
source,
lexer,
current,
}
}
pub fn parse(&mut self) -> Result<Node, ParseError> {
@ -119,7 +124,7 @@ impl<'src> Parser<'src> {
}
/// Replace `current` with the next token produced by the lexer.
///
/// A lexing failure is propagated with `?`, which converts it into a `ParseError`.
fn next_token(&mut self) -> Result<(), ParseError> {
self.current = self.lexer.next_token()?;
self.current = self.lexer.next_token(self.source)?;
Ok(())
}
@ -182,7 +187,7 @@ impl<'src> Parser<'src> {
}
fn parse_primary(&mut self) -> Result<Node, ParseError> {
match self.current.clone() {
match self.current {
(Token::Boolean(boolean), span) => {
self.next_token()?;
@ -201,10 +206,13 @@ impl<'src> Parser<'src> {
Ok(Node::new(Statement::Constant(Value::integer(int)), span))
}
(Token::Identifier(identifier), span) => {
(Token::Identifier(text), span) => {
self.next_token()?;
Ok(Node::new(Statement::Identifier(identifier), span))
Ok(Node::new(
Statement::Identifier(Identifier::new(text)),
span,
))
}
(Token::String(string), span) => {
self.next_token()?;
@ -222,7 +230,7 @@ impl<'src> Parser<'src> {
Ok(Node::new(node.statement, (left_span.0, right_span.1)))
} else {
Err(ParseError::ExpectedClosingParenthesis {
actual: self.current.0.clone(),
actual: TokenOwned::from(self.current.0),
span: self.current.1,
})
}
@ -252,7 +260,7 @@ impl<'src> Parser<'src> {
nodes.push(instruction);
} else {
return Err(ParseError::ExpectedClosingSquareBrace {
actual: self.current.0.clone(),
actual: TokenOwned::from(self.current.0),
span: self.current.1,
});
}
@ -277,7 +285,7 @@ impl<'src> Parser<'src> {
self.next_token()?;
} else {
return Err(ParseError::ExpectedOpeningParenthesis {
actual: self.current.0.clone(),
actual: TokenOwned::from(self.current.0),
span: self.current.1,
});
}
@ -303,7 +311,7 @@ impl<'src> Parser<'src> {
}
} else {
return Err(ParseError::ExpectedClosingParenthesis {
actual: self.current.0.clone(),
actual: TokenOwned::from(self.current.0),
span: self.current.1,
});
}
@ -318,7 +326,9 @@ impl<'src> Parser<'src> {
left_span,
))
}
_ => Err(ParseError::UnexpectedToken(self.current.0.clone())),
_ => Err(ParseError::UnexpectedToken(TokenOwned::from(
self.current.0,
))),
}
}
@ -335,11 +345,12 @@ impl<'src> Parser<'src> {
/// Errors that can occur while parsing.
///
/// The `actual` fields hold the token the parser was looking at when the error was
/// raised, and `span` holds that token's position.
#[derive(Debug, PartialEq, Clone)]
pub enum ParseError {
ExpectedClosingParenthesis { actual: Token, span: Span },
ExpectedClosingSquareBrace { actual: Token, span: Span },
ExpectedOpeningParenthesis { actual: Token, span: Span },
/// The lexer failed while producing a token.
LexError(LexError),
UnexpectedToken(Token),
/// A closing parenthesis was required but `actual` was found.
ExpectedClosingParenthesis { actual: TokenOwned, span: Span },
/// A closing square brace was required but `actual` was found.
ExpectedClosingSquareBrace { actual: TokenOwned, span: Span },
/// An opening parenthesis was required but `actual` was found.
ExpectedOpeningParenthesis { actual: TokenOwned, span: Span },
/// The current token cannot begin a primary expression.
UnexpectedToken(TokenOwned),
}
impl From<LexError> for ParseError {

View File

@ -2,13 +2,69 @@ use std::fmt::{self, Display, Formatter};
use serde::{Deserialize, Serialize};
use crate::Identifier;
/// A single lexical token.
///
/// In the borrowed form (`Token<'src>`), identifier and string payloads are slices of
/// the source text rather than owned copies.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub enum Token {
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub enum Token<'src> {
/// End of input.
Eof,
Identifier(Identifier),
/// An identifier, borrowed from the source text.
Identifier(&'src str),
// Hard-coded values
Boolean(bool),
Float(f64),
Integer(i64),
/// The contents of a string literal, without its delimiters, borrowed from the source text.
String(&'src str),
// Keywords
IsEven,
IsOdd,
Length,
ReadLine,
WriteLine,
// Symbols
Comma,
Dot,
Equal,
LeftParenthesis,
LeftSquareBrace,
Plus,
RightParenthesis,
RightSquareBrace,
Star,
}
/// Renders a token as text.
///
/// Identifiers, strings, and literal values print their payload (strings are printed
/// without delimiters); keywords and symbols print their fixed spelling; `Eof` prints
/// as `EOF`.
impl<'src> Display for Token<'src> {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        // Variants carrying a payload are written straight away; every other variant
        // maps to a fixed spelling that is written once after the match.
        let spelling = match self {
            Token::Identifier(text) | Token::String(text) => return write!(f, "{text}"),
            Token::Boolean(boolean) => return write!(f, "{boolean}"),
            Token::Float(float) => return write!(f, "{float}"),
            Token::Integer(integer) => return write!(f, "{integer}"),
            Token::Eof => "EOF",
            Token::IsEven => "is_even",
            Token::IsOdd => "is_odd",
            Token::Length => "length",
            Token::ReadLine => "read_line",
            Token::WriteLine => "write_line",
            Token::Comma => ",",
            Token::Dot => ".",
            Token::Equal => "=",
            Token::Plus => "+",
            Token::Star => "*",
            Token::LeftParenthesis => "(",
            Token::RightParenthesis => ")",
            Token::LeftSquareBrace => "[",
            Token::RightSquareBrace => "]",
        };

        f.write_str(spelling)
    }
}
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub enum TokenOwned {
Eof,
Identifier(String),
// Hard-coded values
Boolean(bool),
@ -35,29 +91,29 @@ pub enum Token {
Star,
}
impl Display for Token {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Token::Eof => write!(f, "EOF"),
Token::Identifier(identifier) => write!(f, "{identifier}"),
Token::Boolean(boolean) => write!(f, "{boolean}"),
Token::Float(float) => write!(f, "{float}"),
Token::Integer(integer) => write!(f, "{integer}"),
Token::String(string) => write!(f, "{string}"),
Token::IsEven => write!(f, "is_even"),
Token::IsOdd => write!(f, "is_odd"),
Token::Length => write!(f, "length"),
Token::ReadLine => write!(f, "read_line"),
Token::WriteLine => write!(f, "write_line"),
Token::Comma => write!(f, ","),
Token::Dot => write!(f, "."),
Token::Equal => write!(f, "="),
Token::Plus => write!(f, "+"),
Token::Star => write!(f, "*"),
Token::LeftParenthesis => write!(f, "("),
Token::RightParenthesis => write!(f, ")"),
Token::LeftSquareBrace => write!(f, "["),
Token::RightSquareBrace => write!(f, "]"),
impl<'str> From<Token<'str>> for TokenOwned {
fn from(token: Token<'str>) -> Self {
match token {
Token::Eof => TokenOwned::Eof,
Token::Identifier(text) => TokenOwned::Identifier(text.to_string()),
Token::Boolean(boolean) => TokenOwned::Boolean(boolean),
Token::Float(float) => TokenOwned::Float(float),
Token::Integer(integer) => TokenOwned::Integer(integer),
Token::String(text) => TokenOwned::String(text.to_string()),
Token::IsEven => TokenOwned::IsEven,
Token::IsOdd => TokenOwned::IsOdd,
Token::Length => TokenOwned::Length,
Token::ReadLine => TokenOwned::ReadLine,
Token::WriteLine => TokenOwned::WriteLine,
Token::Comma => TokenOwned::Comma,
Token::Dot => TokenOwned::Dot,
Token::Equal => TokenOwned::Equal,
Token::Plus => TokenOwned::Plus,
Token::Star => TokenOwned::Star,
Token::LeftParenthesis => TokenOwned::LeftParenthesis,
Token::RightParenthesis => TokenOwned::RightParenthesis,
Token::LeftSquareBrace => TokenOwned::LeftSquareBrace,
Token::RightSquareBrace => TokenOwned::RightSquareBrace,
}
}
}