lex: advancement in lexer implementation #13

Merged · 5 commits · Aug 16, 2024

3 changes: 1 addition & 2 deletions examples/first.fs
@@ -7,6 +7,5 @@ _x_int: int = 0
 _x_float: float = 0.1
 _x_bool: bool = true
 _x_bool2: bool = false
-c_char: char = 'c'
-c_char2: char = 'c'
 x_str: str = "hello"
+x_list: [int] = [1, 2, 3]
2 changes: 1 addition & 1 deletion examples/test.fs
@@ -1 +1 @@
-x: str = "hello"
+x_list: [int] = [1, 2, 3]
2 changes: 1 addition & 1 deletion src/lexer/mod.rs
@@ -129,7 +129,7 @@ mod tests {
     #[test]
     fn identifier() {
         let fs_files = collect_fs_files("./testdata/identifier", true);
-        assert_eq!(fs_files.len(), 16);
+        assert_eq!(fs_files.len(), 17);

         for path in fs_files {
             info!("file -> {:?}", path);
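The count goes from 16 to 17 because of the new `id_list_assign` fixture added under `testdata/identifier` below. The comparison harness itself is outside this hunk; as a rough sketch of the snapshot pattern the testdata implies, each `.fs` source sits beside a `.tokens.json` file that can be deserialized and checked against the lexer's output. The struct and helper below are illustrative assumptions, not this crate's real types:

use serde::Deserialize;
use std::path::Path;

// Hypothetical stand-in for the expected-token records in *.tokens.json.
// Unknown JSON fields (like "location") are ignored by serde's default.
#[derive(Debug, PartialEq, Deserialize)]
struct ExpectedToken {
    kind: serde_json::Value, // either a plain string or { "TokenLiteral": "Int" }
    lexeme: String,
}

// Loads the snapshot sitting next to a source file:
// `id_list_assign.fs` -> `id_list_assign.tokens.json`.
fn load_snapshot(fs_path: &Path) -> Vec<ExpectedToken> {
    let json_path = fs_path.with_extension("tokens.json");
    let json = std::fs::read_to_string(json_path).expect("snapshot file should exist");
    serde_json::from_str(&json).expect("snapshot should be a JSON token array")
}

With that in place, the test only has to lex each collected `.fs` file and `assert_eq!` the produced tokens against the deserialized snapshot.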
34 changes: 25 additions & 9 deletions src/lexer/states.rs
@@ -4,6 +4,9 @@ use super::Lexer;
 use super::LexerError;
 use crate::lexer::token::Token;
 use crate::lexer::token::TokenKind;
+use crate::lexer::token::TokenKind::TokenCloseBrace;
+use crate::lexer::token::TokenKind::TokenCloseBracket;
+use crate::lexer::token::TokenKind::TokenCloseParen;
 use crate::lexer::token::TokenKind::TokenDoubleQuote;
 use crate::lexer::token::TokenKind::TokenSingleQuote;
 use std::fmt::Debug;
@@ -98,7 +101,7 @@ impl State for StateStart {
             )),
             Some(_) => Err(LexerError::UnexpectedToken(Token::new(
                 TokenKind::TokenUnknown,
-                "".to_string(),
+                cursor.source().content()[cursor.index()..cursor.offset()].to_string(),
                 cursor.location().clone(),
             ))),
             None => Ok(Lexer::proceed(Box::new(StateEOF), TransitionKind::Consume)),
@@ -112,6 +115,11 @@ pub struct StateString;
 impl State for StateString {
     fn visit(&self, cursor: &mut Cursor) -> Result<Transition, LexerError> {
         match cursor.peek() {
+            Some(c) if c.eq(&'\n') => Err(LexerError::UnexpectedToken(Token::new(
+                TokenKind::TokenUnknown,
+                "\\n".to_string(),
+                cursor.location().clone(),
+            ))),
             Some(c) if c.ne(&'"') => Ok(Lexer::proceed(
                 Box::new(StateString),
                 TransitionKind::AdvanceOffset,
@@ -172,12 +180,10 @@ impl State for StateNumber {
             let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string();
             let location = cursor.location().clone();
             let token_kind = TokenKind::from(&lexeme);
-            Ok(Transition {
-                state: Box::new(StateStart),
-                transition_kind: TransitionKind::EmitToken(Token::new(
-                    token_kind, lexeme, location,
-                )),
-            })
+            Ok(Lexer::proceed(
+                Box::new(StateStart),
+                TransitionKind::EmitToken(Token::new(token_kind, lexeme, location)),
+            ))
         }
     }
 }
@@ -214,7 +220,10 @@ pub struct StateSymbol;

 impl StateSymbol {
     fn is_symbol(c: char) -> bool {
-        matches!(c, ':' | '=' | '\n')
+        matches!(
+            c,
+            ':' | '=' | '\n' | '(' | ')' | '{' | '}' | '[' | ']' | ','
+        )
     }
 }

@@ -227,7 +236,14 @@ impl State for StateSymbol {

         // NOTE: if a '\n' is found while another "symbol" token was being scanned, the previous token would be mangled and only the '\n' emitted,
         // so we need to handle the previous token here, since it can sit at the end of the line
-        if [TokenSingleQuote, TokenDoubleQuote].contains(&token_kind) {
+        let valid_last_token = vec![
+            TokenCloseBracket,
+            TokenCloseParen,
+            TokenCloseBrace,
+            TokenDoubleQuote,
+            TokenSingleQuote,
+        ];
+        if valid_last_token.contains(&token_kind) {
             return Ok(Lexer::proceed(
                 Box::new(StateStart),
                 TransitionKind::EmitToken(Token::new(
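Two things change in this file: `is_symbol` now recognizes the paren, brace, bracket, and comma characters, and the end-of-line handling treats any pending closing delimiter (not just quotes) as a valid last token before '\n'. A standalone illustration of why that matters, using a toy `split_symbols` helper that is not part of the crate:

// A line like `x_list: [int] = [1, 2, 3]` ends with `]` immediately
// followed by '\n', so the symbol state sees two symbols back to back
// and must emit the pending `]` before the newline token.
fn is_symbol(c: char) -> bool {
    matches!(c, ':' | '=' | '\n' | '(' | ')' | '{' | '}' | '[' | ']' | ',')
}

// Toy helper: pull out each symbol as its own one-character token,
// rather than letting `]` and '\n' collapse into one mangled lexeme.
fn split_symbols(line: &str) -> Vec<String> {
    line.chars()
        .filter(|c| is_symbol(*c))
        .map(|c| c.to_string())
        .collect()
}

fn main() {
    assert_eq!(
        split_symbols("x_list: [int] = [1, 2, 3]\n"),
        vec![":", "[", "]", "=", "[", ",", ",", "]", "\n"]
    );
}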
44 changes: 36 additions & 8 deletions src/lexer/token.rs
@@ -12,6 +12,13 @@ const COLON: &str = ":";
 const ASSIGN: &str = "=";
 const SINGLE_QUOTE: &str = "'";
 const DOUBLE_QUOTE: &str = "\"";
+const OPEN_PAREN: &str = "(";
+const CLOSE_PAREN: &str = ")";
+const OPEN_BRACKET: &str = "{";
+const CLOSE_BRACKET: &str = "}";
+const OPEN_BRACE: &str = "[";
+const CLOSE_BRACE: &str = "]";
+const COMMA: &str = ",";

 #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
 pub enum Literal {
@@ -28,14 +35,21 @@ pub enum TokenKind {
     TokenKeyword,
     TokenType,
     TokenComment,
-    TokenSpace,       // ' '
-    TokenTab,         // \t
-    TokenNewLine,     // \n
-    TokenColon,       // :
-    TokenAssign,      // =
-    TokenSingleQuote, // '
-    TokenDoubleQuote, // "
-    TokenEOF,         // End of file
+    TokenSpace,        // ' '
+    TokenTab,          // \t
+    TokenNewLine,      // \n
+    TokenColon,        // :
+    TokenAssign,       // =
+    TokenSingleQuote,  // '
+    TokenDoubleQuote,  // "
+    TokenOpenParen,    // (
+    TokenCloseParen,   // )
+    TokenOpenBrace,    // {
+    TokenCloseBrace,   // }
+    TokenOpenBracket,  // [
+    TokenCloseBracket, // ]
+    TokenComma,        // ,
+    TokenEOF,          // End of file
     TokenUnknown,
 }

@@ -70,6 +84,13 @@ impl TokenKind {
             ASSIGN => Some(TokenKind::TokenAssign),
             SINGLE_QUOTE => Some(TokenKind::TokenSingleQuote),
             DOUBLE_QUOTE => Some(TokenKind::TokenDoubleQuote),
+            OPEN_PAREN => Some(TokenKind::TokenOpenParen),
+            CLOSE_PAREN => Some(TokenKind::TokenCloseParen),
+            OPEN_BRACE => Some(TokenKind::TokenOpenBrace),
+            CLOSE_BRACE => Some(TokenKind::TokenCloseBrace),
+            OPEN_BRACKET => Some(TokenKind::TokenOpenBracket),
+            CLOSE_BRACKET => Some(TokenKind::TokenCloseBracket),
+            COMMA => Some(TokenKind::TokenComma),
             _ => None,
         }
     }
@@ -264,6 +285,13 @@ impl std::fmt::Display for TokenKind {
             TokenKind::TokenAssign => write!(f, "TokenAssign"),
             TokenKind::TokenSingleQuote => write!(f, "TokenTick"),
             TokenKind::TokenDoubleQuote => write!(f, "TokenDoubleTick"),
+            TokenKind::TokenOpenParen => write!(f, "TokenOpenParen"),
+            TokenKind::TokenCloseParen => write!(f, "TokenCloseParen"),
+            TokenKind::TokenOpenBrace => write!(f, "TokenOpenBrace"),
+            TokenKind::TokenCloseBrace => write!(f, "TokenCloseBrace"),
+            TokenKind::TokenOpenBracket => write!(f, "TokenOpenBracket"),
+            TokenKind::TokenCloseBracket => write!(f, "TokenCloseBracket"),
+            TokenKind::TokenComma => write!(f, "TokenComma"),
             TokenKind::TokenEOF => write!(f, "TokenEOF"),
             TokenKind::TokenUnknown => write!(f, "TokenUnknown"),
         }
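Worth noting when reading the new constants: the PR maps "["/"]" to OPEN_BRACE/CLOSE_BRACE and "{"/"}" to OPEN_BRACKET/CLOSE_BRACKET, the reverse of the more common naming, and the testdata below follows the same convention ("[" lexes as TokenOpenBrace). A self-contained sketch of the lookup that mirrors the diff's naming, with simplified enum variants rather than the crate's own:

#[derive(Debug, PartialEq)]
enum Kind {
    OpenParen,    // (
    CloseParen,   // )
    OpenBracket,  // { (per the PR's naming)
    CloseBracket, // } (per the PR's naming)
    OpenBrace,    // [ (per the PR's naming)
    CloseBrace,   // ] (per the PR's naming)
    Comma,        // ,
}

// Mirrors the new match arms: a one-character lexeme either maps to a
// symbol kind or falls through to None.
fn symbol_kind(lexeme: &str) -> Option<Kind> {
    match lexeme {
        "(" => Some(Kind::OpenParen),
        ")" => Some(Kind::CloseParen),
        "{" => Some(Kind::OpenBracket),
        "}" => Some(Kind::CloseBracket),
        "[" => Some(Kind::OpenBrace),
        "]" => Some(Kind::CloseBrace),
        "," => Some(Kind::Comma),
        _ => None,
    }
}

fn main() {
    // The delimiters of `[int]` in `x_list: [int] = [1, 2, 3]` come out
    // as OpenBrace/CloseBrace under this naming.
    assert_eq!(symbol_kind("["), Some(Kind::OpenBrace));
    assert_eq!(symbol_kind("@"), None);
}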
1 change: 1 addition & 0 deletions testdata/identifier/id_list_assign.fs
@@ -0,0 +1 @@
+x_list: [int] = [1, 2, 3]
22 changes: 22 additions & 0 deletions testdata/identifier/id_list_assign.tokens.json
@@ -0,0 +1,22 @@
+[
+{ "kind": "TokenIdentifier", "lexeme": "x_list", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } },
+{ "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } },
+{ "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } },
+{ "kind": "TokenOpenBrace", "lexeme": "[", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } },
+{ "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } },
+{ "kind": "TokenCloseBrace", "lexeme": "]", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } },
+{ "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } },
+{ "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 15 } },
+{ "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 15, "column_end": 16 } },
+{ "kind": "TokenOpenBrace", "lexeme": "[", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } },
+{ "kind": { "TokenLiteral": "Int" }, "lexeme": "1", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 18 } },
+{ "kind": "TokenComma", "lexeme": ",", "location": { "file_path": "", "line": 0, "column_start": 18, "column_end": 19 } },
+{ "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 19, "column_end": 20 } },
+{ "kind": { "TokenLiteral": "Int" }, "lexeme": "2", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } },
+{ "kind": "TokenComma", "lexeme": ",", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } },
+{ "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } },
+{ "kind": { "TokenLiteral": "Int" }, "lexeme": "3", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } },
+{ "kind": "TokenCloseBrace", "lexeme": "]", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } },
+{ "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 25 } },
+{ "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } }
+]
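As the snapshot shows, locations use 0-indexed lines and end-exclusive columns: `x_list` spans columns 0 to 6, the newline token is recorded with zero width at column 25, and the EOF token lands at the start of the following line.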