From 63a045cc47bea8224ae3d22e785300cf8e262f2c Mon Sep 17 00:00:00 2001
From: FedericoBruzzone
Date: Wed, 14 Aug 2024 15:34:27 +0200
Subject: [PATCH 1/5] chore: fix the documentation

Signed-off-by: FedericoBruzzone
---
 dev_doc/syntax.md            | 12 ++++++------
 dev_doc/syntax_definition.sh | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/dev_doc/syntax.md b/dev_doc/syntax.md
index b76b056..414fa14 100644
--- a/dev_doc/syntax.md
+++ b/dev_doc/syntax.md
@@ -20,36 +20,36 @@ imp * of test # import all
 
 # Native Types
 
+- NOT USED: `char` -- Unicode character
 - `()` -- Unit
 - `int` -- 32 bits
 - `float` -- 64 bits
 - `str` -- Unicode string
-- `char` -- Unicode character
 - `bool` -- `True` or `False`
 - `[T]` -- List of `T`
 - `(T, U)` -- Tuple with two elements of type `T` and `U`
-- `option` -- `Just` or `Nil`
+- `opt` -- `Just` or `Nil`
 
 ```python
 # With type annotation
+# x_char: char = '👾' # unicode
 x_int: int = 1
 x_float: float = 1.0
 x_bool: bool = True # False
 x_str: str = "hello 👾"
-x_char: char = '👾' # unicode
-x_list: [int]= [1, 2, 3]
+x_list: [int] = [1, 2, 3]
 x_tuple: (int, str) = (1, "hello")
-x_option: Option = Just(1) # Nil
+x_option: opt = Just(1) # Nil
 x_f1: () -> unit = () -> print "hello" ;
 x_f2: () -> int = () -> 1 ;
 x_f3: (T) -> T = (x) -> x ; # Generic
 
 # Without type annotation
+# x_char = 'a' # unicode
 x_int = 1
 x_float = 1.0
 x_bool = True # False
 x_str = "hello"
-x_char = 'a' # unicode
 x_list = [1, 2, 3]
 x_tuple = (1, "hello")
 x_option = Just(1) # Nil

diff --git a/dev_doc/syntax_definition.sh b/dev_doc/syntax_definition.sh
index 973da7c..f629bc5 100644
--- a/dev_doc/syntax_definition.sh
+++ b/dev_doc/syntax_definition.sh
@@ -13,24 +13,24 @@ imp { .. } of test # import all
 
 # Native Types
 # With type annotation (no type inference)
+# x_char: char = '👾' # unicode
 x_int: int = 1
 x_float: float = 1.0
-x_bool: bool = True # False
+x_bool: bool = true # false
 x_str: str = "hello 👾"
-x_char: char = '👾' # unicode
 x_list: [int] = [1, 2, 3]
 x_tuple: (int, str) = (1, "hello")
-x_option: option = Just(1) # Nil
+x_option: opt = Just(1) # Nil
 x_f1: () -> unit = () -> print "hello"
 x_f2: () -> int = () -> 1
 x_f3: (T) -> T = (x) -> x # Generic
 
 # Without type annotation (type inference)
+# x_char = '👾' # unicode
 x_int = 1
 x_float = 1.0
-x_bool = True # False
+x_bool = true # false
 x_str = "hello 👾"
-x_char = '👾' # unicode
 x_list = [1, 2, 3]
 x_tuple = (1, "hello")
 x_option = Just(1) # Nil
From fffb17dda1ac929711107265c38f67d64181c04e Mon Sep 17 00:00:00 2001
From: FedericoBruzzone
Date: Wed, 14 Aug 2024 15:48:27 +0200
Subject: [PATCH 2/5] lex: handle the `list` initialization

Signed-off-by: FedericoBruzzone
Co-authored-by: Federico Guerinoni
---
 examples/first.fs                         |  3 +-
 examples/test.fs                          |  2 +-
 src/lexer/mod.rs                          |  2 +-
 src/lexer/states.rs                       | 34 ++++++++++----
 src/lexer/token.rs                        | 44 +++++++++++++++----
 testdata/identifier/id_list_assign.fs     |  1 +
 .../identifier/id_list_assign.tokens.json | 22 ++++++++++
 7 files changed, 87 insertions(+), 21 deletions(-)
 create mode 100644 testdata/identifier/id_list_assign.fs
 create mode 100644 testdata/identifier/id_list_assign.tokens.json

diff --git a/examples/first.fs b/examples/first.fs
index ed00ec9..30ee1ff 100644
--- a/examples/first.fs
+++ b/examples/first.fs
@@ -7,6 +7,5 @@ _x_int: int = 0
 _x_float: float = 0.1
 _x_bool: bool = true
 _x_bool2: bool = false
-c_char: char = 'c'
- c_char2: char = 'c'
 x_str: str = "hello"
+x_list: [int] = [1, 2, 3]

diff --git a/examples/test.fs b/examples/test.fs
index 0d5a7e4..9d65423 100644
--- a/examples/test.fs
+++ b/examples/test.fs
@@ -1 +1 @@
-x: str = "hello"
+x_list: [int] = [1, 2, 3]

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 5cdd050..7d83701 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -129,7 +129,7 @@ mod tests {
     #[test]
     fn identifier() {
         let fs_files = collect_fs_files("./testdata/identifier", true);
-        assert_eq!(fs_files.len(), 16);
+        assert_eq!(fs_files.len(), 17);
 
         for path in fs_files {
             info!("file -> {:?}", path);

diff --git a/src/lexer/states.rs b/src/lexer/states.rs
index a542068..10cd55f 100644
--- a/src/lexer/states.rs
+++ b/src/lexer/states.rs
@@ -4,6 +4,9 @@ use super::Lexer;
 use super::LexerError;
 use crate::lexer::token::Token;
 use crate::lexer::token::TokenKind;
+use crate::lexer::token::TokenKind::TokenCloseBrace;
+use crate::lexer::token::TokenKind::TokenCloseBracket;
+use crate::lexer::token::TokenKind::TokenCloseParen;
 use crate::lexer::token::TokenKind::TokenDoubleQuote;
 use crate::lexer::token::TokenKind::TokenSingleQuote;
 use std::fmt::Debug;
@@ -98,7 +101,7 @@ impl State for StateStart {
             )),
             Some(_) => Err(LexerError::UnexpectedToken(Token::new(
                 TokenKind::TokenUnknown,
-                "".to_string(),
+                cursor.source().content()[cursor.index()..cursor.offset()].to_string(),
                 cursor.location().clone(),
             ))),
             None => Ok(Lexer::proceed(Box::new(StateEOF), TransitionKind::Consume)),
@@ -112,6 +115,11 @@ pub struct StateString;
 
 impl State for StateString {
     fn visit(&self, cursor: &mut Cursor) -> Result<Transition, LexerError> {
        match cursor.peek() {
+            Some(c) if c.eq(&'\n') => Err(LexerError::UnexpectedToken(Token::new(
+                TokenKind::TokenUnknown,
+                "\\n".to_string(),
+                cursor.location().clone(),
+            ))),
             Some(c) if c.ne(&'"') => Ok(Lexer::proceed(
                 Box::new(StateString),
                 TransitionKind::AdvanceOffset,
@@ -172,12 +180,10 @@ impl State for StateNumber {
             let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string();
             let location = cursor.location().clone();
             let token_kind = TokenKind::from(&lexeme);
-            Ok(Transition {
-                state: Box::new(StateStart),
-                transition_kind: TransitionKind::EmitToken(Token::new(
-                    token_kind, lexeme, location,
-                )),
-            })
+            Ok(Lexer::proceed(
+                Box::new(StateStart),
+                TransitionKind::EmitToken(Token::new(token_kind, lexeme, location)),
+            ))
         }
     }
 }
@@ -214,7 +220,10 @@ pub struct StateSymbol;
 
 impl StateSymbol {
     fn is_symbol(c: char) -> bool {
-        matches!(c, ':' | '=' | '\n')
+        matches!(
+            c,
+            ':' | '=' | '\n' | '(' | ')' | '{' | '}' | '[' | ']' | ','
+        )
     }
 }
@@ -227,7 +236,14 @@ impl State for StateSymbol {
 
             // NOTE: if a '\n' is found and it was scanning another "symbol" token, the previous was mangled, and only the '\n' is emitted,
             // we need to handle the previous token since it can be at the end of the line
-            if [TokenSingleQuote, TokenDoubleQuote].contains(&token_kind) {
+            let valid_last_token = vec![
+                TokenCloseBracket,
+                TokenCloseParen,
+                TokenCloseBrace,
+                TokenDoubleQuote,
+                TokenSingleQuote,
+            ];
+            if valid_last_token.contains(&token_kind) {
                 return Ok(Lexer::proceed(
                     Box::new(StateStart),
                     TransitionKind::EmitToken(Token::new(
diff --git a/src/lexer/token.rs b/src/lexer/token.rs
index e829957..be8e60a 100644
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@@ -12,6 +12,13 @@ const COLON: &str = ":";
 const ASSIGN: &str = "=";
 const SINGLE_QUOTE: &str = "'";
 const DOUBLE_QUOTE: &str = "\"";
+const OPEN_PAREN: &str = "(";
+const CLOSE_PAREN: &str = ")";
+const OPEN_BRACKET: &str = "{";
+const CLOSE_BRACKET: &str = "}";
+const OPEN_BRACE: &str = "[";
+const CLOSE_BRACE: &str = "]";
+const COMMA: &str = ",";
 
 #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
 pub enum Literal {
@@ -28,14 +35,21 @@ pub enum TokenKind {
     TokenKeyword,
     TokenType,
     TokenComment,
-    TokenSpace,       // ' '
-    TokenTab,         // \t
-    TokenNewLine,     // \n
-    TokenColon,       // :
-    TokenAssign,      // =
-    TokenSingleQuote, // '
-    TokenDoubleQuote, // "
-    TokenEOF,         // End of file
+    TokenSpace,        // ' '
+    TokenTab,          // \t
+    TokenNewLine,      // \n
+    TokenColon,        // :
+    TokenAssign,       // =
+    TokenSingleQuote,  // '
+    TokenDoubleQuote,  // "
+    TokenOpenParen,    // (
+    TokenCloseParen,   // )
+    TokenOpenBrace,    // {
+    TokenCloseBrace,   // }
+    TokenOpenBracket,  // [
+    TokenCloseBracket, // ]
+    TokenComma,        // ,
+    TokenEOF,          // End of file
     TokenUnknown,
 }
@@ -70,6 +84,13 @@ impl TokenKind {
             ASSIGN => Some(TokenKind::TokenAssign),
             SINGLE_QUOTE => Some(TokenKind::TokenSingleQuote),
             DOUBLE_QUOTE => Some(TokenKind::TokenDoubleQuote),
+            OPEN_PAREN => Some(TokenKind::TokenOpenParen),
+            CLOSE_PAREN => Some(TokenKind::TokenCloseParen),
+            OPEN_BRACE => Some(TokenKind::TokenOpenBrace),
+            CLOSE_BRACE => Some(TokenKind::TokenCloseBrace),
+            OPEN_BRACKET => Some(TokenKind::TokenOpenBracket),
+            CLOSE_BRACKET => Some(TokenKind::TokenCloseBracket),
+            COMMA => Some(TokenKind::TokenComma),
             _ => None,
         }
     }
@@ -264,6 +285,13 @@ impl std::fmt::Display for TokenKind {
             TokenKind::TokenAssign => write!(f, "TokenAssign"),
             TokenKind::TokenSingleQuote => write!(f, "TokenTick"),
             TokenKind::TokenDoubleQuote => write!(f, "TokenDoubleTick"),
+            TokenKind::TokenOpenParen => write!(f, "TokenOpenParen"),
+            TokenKind::TokenCloseParen => write!(f, "TokenCloseParen"),
+            TokenKind::TokenOpenBrace => write!(f, "TokenOpenBrace"),
+            TokenKind::TokenCloseBrace => write!(f, "TokenCloseBrace"),
+            TokenKind::TokenOpenBracket => write!(f, "TokenOpenBracket"),
+            TokenKind::TokenCloseBracket => write!(f, "TokenCloseBracket"),
+            TokenKind::TokenComma => write!(f, "TokenComma"),
             TokenKind::TokenEOF => write!(f, "TokenEOF"),
             TokenKind::TokenUnknown => write!(f, "TokenUnknown"),
         }
"column_start": 15, "column_end": 16 } }, + { "kind": "TokenOpenBrace", "lexeme": "[", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } }, + { "kind": { "TokenLiteral": "Int" }, "lexeme": "1", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 18 } }, + { "kind": "TokenComma", "lexeme": ",", "location": { "file_path": "", "line": 0, "column_start": 18, "column_end": 19 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 19, "column_end": 20 } }, + { "kind": { "TokenLiteral": "Int" }, "lexeme": "2", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenComma", "lexeme": ",", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": { "TokenLiteral": "Int" }, "lexeme": "3", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenCloseBrace", "lexeme": "]", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } }, + { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 25 } }, + { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } +] From 8be64a2bc073353398337f8fd948833a0e60ec95 Mon Sep 17 00:00:00 2001 From: FedericoBruzzone Date: Thu, 15 Aug 2024 10:19:55 +0200 Subject: [PATCH 3/5] lex: handle the `tuple` initialization Signed-off-by: FedericoBruzzone --- dev_doc/syntax.md | 2 +- examples/test.fs | 2 +- src/lexer/mod.rs | 2 +- testdata/identifier/id_tuple_assign.fs | 1 + .../identifier/id_tuple_assign.tokens.json | 22 +++++++++++++++++++ 5 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 testdata/identifier/id_tuple_assign.fs create mode 100644 testdata/identifier/id_tuple_assign.tokens.json diff --git a/dev_doc/syntax.md b/dev_doc/syntax.md index 414fa14..1eaa4d2 100644 --- a/dev_doc/syntax.md +++ b/dev_doc/syntax.md @@ -21,7 +21,7 @@ imp * of test # import all # Native Types - NOT USED: `char` -- Unicode character -- `()` -- Unit +- `unit` -- Unit - `int` -- 32 bits - `float` -- 64 bits - `str` -- Unicode string diff --git a/examples/test.fs b/examples/test.fs index 9d65423..c564a73 100644 --- a/examples/test.fs +++ b/examples/test.fs @@ -1 +1 @@ -x_list: [int] = [1, 2, 3] +x_tuple: (int, str) = (1, "hello") diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 7d83701..dd3f08e 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -129,7 +129,7 @@ mod tests { #[test] fn identifier() { let fs_files = collect_fs_files("./testdata/identifier", true); - assert_eq!(fs_files.len(), 17); + assert_eq!(fs_files.len(), 18); for path in fs_files { info!("file -> {:?}", path); diff --git a/testdata/identifier/id_tuple_assign.fs b/testdata/identifier/id_tuple_assign.fs new file mode 100644 index 0000000..c564a73 --- /dev/null +++ b/testdata/identifier/id_tuple_assign.fs @@ -0,0 +1 @@ +x_tuple: (int, str) = (1, "hello") diff --git a/testdata/identifier/id_tuple_assign.tokens.json b/testdata/identifier/id_tuple_assign.tokens.json new file mode 100644 index 0000000..afc82c7 --- /dev/null +++ b/testdata/identifier/id_tuple_assign.tokens.json @@ -0,0 +1,22 @@ +[ + { "kind": "TokenIdentifier", "lexeme": "x_tuple", "location": { "file_path": "", "line": 0, "column_start": 0, 
"column_end": 7 } }, + { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 10 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 10, "column_end": 13 } }, + { "kind": "TokenComma", "lexeme": ",", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 15 } }, + { "kind": "TokenType", "lexeme": "str", "location": { "file_path": "", "line": 0, "column_start": 15, "column_end": 18 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 18, "column_end": 19 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 19, "column_end": 20 } }, + { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": { "TokenLiteral" : "Int" }, "lexeme": "1", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenComma", "lexeme": ",", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } }, + { "kind": { "TokenLiteral": "Str" }, "lexeme": "\"hello\"", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 33 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 34 } }, + { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 34, "column_end": 34 } }, + { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } +] From f72e5d6013861f996e7e5647acf0f1f65e687125 Mon Sep 17 00:00:00 2001 From: FedericoBruzzone Date: Thu, 15 Aug 2024 11:24:17 +0200 Subject: [PATCH 4/5] lex: handle the `func` initialization Signed-off-by: FedericoBruzzone --- examples/first.fs | 3 ++ examples/test.fs | 2 +- src/lexer/mod.rs | 2 +- src/lexer/states.rs | 32 +++++---------- src/lexer/token.rs | 40 ++++++++++++++++++- testdata/identifier/id_function_assign_int.fs | 1 + .../id_function_assign_int.tokens.json | 26 ++++++++++++ .../identifier/id_function_assign_unit.fs | 1 + .../id_function_assign_unit.tokens.json | 26 ++++++++++++ 9 files changed, 108 insertions(+), 25 deletions(-) create mode 100644 testdata/identifier/id_function_assign_int.fs create mode 100644 testdata/identifier/id_function_assign_int.tokens.json create mode 100644 testdata/identifier/id_function_assign_unit.fs create mode 100644 testdata/identifier/id_function_assign_unit.tokens.json diff --git a/examples/first.fs b/examples/first.fs index 30ee1ff..0ebcc1a 100644 --- a/examples/first.fs +++ b/examples/first.fs @@ -9,3 +9,6 @@ _x_bool: bool = true 
From f72e5d6013861f996e7e5647acf0f1f65e687125 Mon Sep 17 00:00:00 2001
From: FedericoBruzzone
Date: Thu, 15 Aug 2024 11:24:17 +0200
Subject: [PATCH 4/5] lex: handle the `func` initialization

Signed-off-by: FedericoBruzzone
---
 examples/first.fs                             |  3 ++
 examples/test.fs                              |  2 +-
 src/lexer/mod.rs                              |  2 +-
 src/lexer/states.rs                           | 32 +++++----------
 src/lexer/token.rs                            | 40 ++++++++++++++++++-
 testdata/identifier/id_function_assign_int.fs |  1 +
 .../id_function_assign_int.tokens.json        | 26 ++++++++++++
 .../identifier/id_function_assign_unit.fs     |  1 +
 .../id_function_assign_unit.tokens.json       | 26 ++++++++++++
 9 files changed, 108 insertions(+), 25 deletions(-)
 create mode 100644 testdata/identifier/id_function_assign_int.fs
 create mode 100644 testdata/identifier/id_function_assign_int.tokens.json
 create mode 100644 testdata/identifier/id_function_assign_unit.fs
 create mode 100644 testdata/identifier/id_function_assign_unit.tokens.json

diff --git a/examples/first.fs b/examples/first.fs
index 30ee1ff..0ebcc1a 100644
--- a/examples/first.fs
+++ b/examples/first.fs
@@ -9,3 +9,6 @@ _x_bool: bool = true
 _x_bool2: bool = false
 x_str: str = "hello"
 x_list: [int] = [1, 2, 3]
+x_tuple: (int, int) = (1, 2)
+x_func: () -> unit = () -> print "hello" ;
+x_func: (int) -> int = (x) -> x ;

diff --git a/examples/test.fs b/examples/test.fs
index c564a73..04ba125 100644
--- a/examples/test.fs
+++ b/examples/test.fs
@@ -1 +1 @@
-x_tuple: (int, str) = (1, "hello")
+x_func: (int) -> int = (x) -> x ;

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index dd3f08e..f019d6b 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -129,7 +129,7 @@ mod tests {
     #[test]
     fn identifier() {
         let fs_files = collect_fs_files("./testdata/identifier", true);
-        assert_eq!(fs_files.len(), 18);
+        assert_eq!(fs_files.len(), 20);
 
         for path in fs_files {
             info!("file -> {:?}", path);

diff --git a/src/lexer/states.rs b/src/lexer/states.rs
index 10cd55f..d06c881 100644
--- a/src/lexer/states.rs
+++ b/src/lexer/states.rs
@@ -84,7 +84,7 @@ impl State for StateStart {
                 Box::new(StateString),
                 TransitionKind::AdvanceOffset,
             )),
-            Some(c) if StateSymbol::is_symbol(c) => {
+            Some(c) if TokenKind::is_start_of_symbol(c.to_string().as_str()) => {
                 Ok(Lexer::proceed(Box::new(StateSymbol), TransitionKind::Empty))
             }
             Some('#') => Ok(Lexer::proceed(
@@ -218,15 +218,6 @@ impl State for StateWord {
 #[derive(Debug)]
 pub struct StateSymbol;
 
-impl StateSymbol {
-    fn is_symbol(c: char) -> bool {
-        matches!(
-            c,
-            ':' | '=' | '\n' | '(' | ')' | '{' | '}' | '[' | ']' | ','
-        )
-    }
-}
-
 impl State for StateSymbol {
     fn visit(&self, cursor: &mut Cursor) -> Result<Transition, LexerError> {
         match cursor.peek() {
@@ -265,20 +256,19 @@ impl State for StateSymbol {
                 cursor.new_line();
                 Ok(transition)
             }
-            Some(c) if StateSymbol::is_symbol(c) => Ok(Lexer::proceed(
-                Box::new(StateSymbol),
-                TransitionKind::AdvanceOffset,
-            )),
+            Some(c) if TokenKind::can_be_followed_by_symbol(c.to_string().as_str()) => Ok(
+                Lexer::proceed(Box::new(StateSymbol), TransitionKind::AdvanceOffset),
+            ),
             _ => {
-                let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string();
+                let lexeme =
+                    cursor.source().content()[cursor.index()..cursor.offset() + 1].to_string();
                 let token_kind = TokenKind::from(&lexeme);
+                cursor.advance_offset();
                 let location = cursor.location().clone();
-                Ok(Transition {
-                    state: Box::new(StateStart),
-                    transition_kind: TransitionKind::EmitToken(Token::new(
-                        token_kind, lexeme, location,
-                    )),
-                })
+                Ok(Lexer::proceed(
+                    Box::new(StateStart),
+                    TransitionKind::EmitToken(Token::new(token_kind, lexeme, location)),
+                ))
             }
         }
     }
diff --git a/src/lexer/token.rs b/src/lexer/token.rs
index be8e60a..4d330ca 100644
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@@ -2,14 +2,18 @@ use crate::utils::color;
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 
+const KEYWORD_UNIT: &str = "unit";
 const KEYWORD_INT: &str = "int";
 const KEYWORD_FLOAT: &str = "float";
 const KEYWORD_BOOL: &str = "bool";
 const KEYWORD_STR: &str = "str";
 const KEYWORD_BOOL_TRUE: &str = "true";
 const KEYWORD_BOOL_FALSE: &str = "false";
+
 const COLON: &str = ":";
+const SEMICOLON: &str = ";";
 const ASSIGN: &str = "=";
+const NEW_LINE: &str = "\n";
 const SINGLE_QUOTE: &str = "'";
 const DOUBLE_QUOTE: &str = "\"";
 const OPEN_PAREN: &str = "(";
@@ -19,6 +23,8 @@ const CLOSE_BRACKET: &str = "}";
 const OPEN_BRACE: &str = "[";
 const CLOSE_BRACE: &str = "]";
 const COMMA: &str = ",";
+const MINUS: &str = "-";
+const RIGHT_ARROW: &str = "->";
 
 #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
 pub enum Literal {
@@ -33,12 +39,12 @@ pub enum TokenKind {
     TokenLiteral(Literal),
     TokenIdentifier,
-    TokenKeyword,
     TokenType,
     TokenComment,
     TokenSpace,        // ' '
     TokenTab,          // \t
     TokenNewLine,      // \n
     TokenColon,        // :
+    TokenSemicolon,    // ;
     TokenAssign,       // =
     TokenSingleQuote,  // '
     TokenDoubleQuote,  // "
@@ -49,13 +55,40 @@ pub enum TokenKind {
     TokenOpenBracket,  // [
     TokenCloseBracket, // ]
     TokenComma,        // ,
+    TokenRightArrow,   // ->
     TokenEOF,          // End of file
     TokenUnknown,
 }
 
 impl TokenKind {
+    pub fn is_start_of_symbol(c: &str) -> bool {
+        matches!(
+            c,
+            COLON
+                | SEMICOLON
+                | ASSIGN
+                | SINGLE_QUOTE
+                | DOUBLE_QUOTE
+                | OPEN_PAREN
+                | CLOSE_PAREN
+                | OPEN_BRACE
+                | CLOSE_BRACE
+                | OPEN_BRACKET
+                | CLOSE_BRACKET
+                | COMMA
+                | MINUS
+                | RIGHT_ARROW
+                | NEW_LINE
+        )
+    }
+
+    pub fn can_be_followed_by_symbol(c: &str) -> bool {
+        matches!(c, MINUS)
+    }
+
     fn match_keyword(lexeme: &str) -> Option<TokenKind> {
         match lexeme {
+            KEYWORD_UNIT => Some(TokenKind::TokenType),
             KEYWORD_INT => Some(TokenKind::TokenType),
             KEYWORD_FLOAT => Some(TokenKind::TokenType),
             KEYWORD_BOOL => Some(TokenKind::TokenType),
@@ -81,6 +114,7 @@ impl TokenKind {
     fn match_separator(lexeme: &str) -> Option<TokenKind> {
         match lexeme {
             COLON => Some(TokenKind::TokenColon),
+            SEMICOLON => Some(TokenKind::TokenSemicolon),
             ASSIGN => Some(TokenKind::TokenAssign),
             SINGLE_QUOTE => Some(TokenKind::TokenSingleQuote),
             DOUBLE_QUOTE => Some(TokenKind::TokenDoubleQuote),
@@ -92,6 +125,7 @@ impl TokenKind {
             OPEN_BRACKET => Some(TokenKind::TokenOpenBracket),
             CLOSE_BRACKET => Some(TokenKind::TokenCloseBracket),
             COMMA => Some(TokenKind::TokenComma),
+            RIGHT_ARROW => Some(TokenKind::TokenRightArrow),
             _ => None,
         }
     }
@@ -275,13 +310,13 @@ impl std::fmt::Display for TokenKind {
         match self {
             TokenKind::TokenLiteral(literal) => write!(f, "TokenLiteral({})", literal),
             TokenKind::TokenIdentifier => write!(f, "TokenIdentifier"),
-            TokenKind::TokenKeyword => write!(f, "TokenKeyword"),
             TokenKind::TokenType => write!(f, "TokenType"),
             TokenKind::TokenComment => write!(f, "TokenComment"),
             TokenKind::TokenSpace => write!(f, "TokenSpace"),
             TokenKind::TokenTab => write!(f, "TokenTab"),
             TokenKind::TokenNewLine => write!(f, "TokenNewLine"),
+            TokenKind::TokenSemicolon => write!(f, "TokenSemicolon"),
             TokenKind::TokenColon => write!(f, "TokenColon"),
             TokenKind::TokenAssign => write!(f, "TokenAssign"),
             TokenKind::TokenSingleQuote => write!(f, "TokenTick"),
             TokenKind::TokenDoubleQuote => write!(f, "TokenDoubleTick"),
@@ -292,6 +327,7 @@ impl std::fmt::Display for TokenKind {
             TokenKind::TokenOpenBracket => write!(f, "TokenOpenBracket"),
             TokenKind::TokenCloseBracket => write!(f, "TokenCloseBracket"),
             TokenKind::TokenComma => write!(f, "TokenComma"),
+            TokenKind::TokenRightArrow => write!(f, "TokenRightArrow"),
             TokenKind::TokenEOF => write!(f, "TokenEOF"),
             TokenKind::TokenUnknown => write!(f, "TokenUnknown"),
         }
"file_path": "", "line": 0, "column_start": 7, "column_end": 8 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 16 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 20 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 27 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 27, "column_end": 29 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 29, "column_end": 30 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 30, "column_end": 31 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 31, "column_end": 32 } }, + { "kind": "TokenSemicolon", "lexeme": ";", "location": { "file_path": "", "line": 0, "column_start": 32, "column_end": 33 } }, + { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 33 } }, + { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } +] diff --git a/testdata/identifier/id_function_assign_unit.fs b/testdata/identifier/id_function_assign_unit.fs new file mode 100644 index 0000000..ea2ebd4 --- /dev/null +++ b/testdata/identifier/id_function_assign_unit.fs @@ -0,0 +1 @@ +x_func: () -> unit = () -> print "hello" ; diff --git a/testdata/identifier/id_function_assign_unit.tokens.json b/testdata/identifier/id_function_assign_unit.tokens.json new file mode 100644 index 0000000..6789fd2 --- /dev/null +++ b/testdata/identifier/id_function_assign_unit.tokens.json @@ -0,0 +1,26 @@ +[ + { "kind": "TokenIdentifier", "lexeme": "x_func", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } }, + { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } }, + { "kind": 
"TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 10 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 10, "column_end": 11 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 11, "column_end": 13 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } }, + { "kind": "TokenType", "lexeme": "unit", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 18 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 18, "column_end": 19 } }, + { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 19, "column_end": 20 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 26 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 27 } }, + { "kind": "TokenIdentifier", "lexeme": "print", "location": { "file_path": "", "line": 0, "column_start": 27, "column_end": 32 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 32, "column_end": 33 } }, + { "kind": { "TokenLiteral": "Str" }, "lexeme": "\"hello\"", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 40 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 40, "column_end": 41 } }, + { "kind": "TokenSemicolon", "lexeme": ";", "location": { "file_path": "", "line": 0, "column_start": 41, "column_end": 42 } }, + { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 42, "column_end": 42 } }, + { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } + ] From 70ba10120d86225928f8f07894a6245b31d07c0d Mon Sep 17 00:00:00 2001 From: FedericoBruzzone Date: Thu, 15 Aug 2024 12:39:49 +0200 Subject: [PATCH 5/5] lex: add basic operators in functions Signed-off-by: FedericoBruzzone --- dev_doc/syntax.md | 15 +- dev_doc/syntax_definition.sh | 190 --------------------- examples/test.fs | 2 +- src/lexer/mod.rs | 2 +- src/lexer/states.rs | 48 +++--- src/lexer/token.rs | 47 ++++- testdata/identifier/id_fun_div.fs | 1 + testdata/identifier/id_fun_div.tokens.json | 33 ++++ testdata/identifier/id_fun_mul.fs | 1 + testdata/identifier/id_fun_mul.tokens.json | 32 ++++ testdata/identifier/id_fun_sub.fs | 1 + testdata/identifier/id_fun_sub.tokens.json | 31 ++++ testdata/identifier/id_fun_sum.fs | 1 + 
From 70ba10120d86225928f8f07894a6245b31d07c0d Mon Sep 17 00:00:00 2001
From: FedericoBruzzone
Date: Thu, 15 Aug 2024 12:39:49 +0200
Subject: [PATCH 5/5] lex: add basic operators in functions

Signed-off-by: FedericoBruzzone
---
 dev_doc/syntax.md                          |  15 +-
 dev_doc/syntax_definition.sh               | 190 ---------------------
 examples/test.fs                           |   2 +-
 src/lexer/mod.rs                           |   2 +-
 src/lexer/states.rs                        |  48 +++---
 src/lexer/token.rs                         |  47 ++++++-
 testdata/identifier/id_fun_div.fs          |   1 +
 testdata/identifier/id_fun_div.tokens.json |  33 ++++
 testdata/identifier/id_fun_mul.fs          |   1 +
 testdata/identifier/id_fun_mul.tokens.json |  32 ++++
 testdata/identifier/id_fun_sub.fs          |   1 +
 testdata/identifier/id_fun_sub.tokens.json |  31 ++++
 testdata/identifier/id_fun_sum.fs          |   1 +
 testdata/identifier/id_fun_sum.tokens.json |  30 ++++
 14 files changed, 210 insertions(+), 224 deletions(-)
 delete mode 100644 dev_doc/syntax_definition.sh
 create mode 100644 testdata/identifier/id_fun_div.fs
 create mode 100644 testdata/identifier/id_fun_div.tokens.json
 create mode 100644 testdata/identifier/id_fun_mul.fs
 create mode 100644 testdata/identifier/id_fun_mul.tokens.json
 create mode 100644 testdata/identifier/id_fun_sub.fs
 create mode 100644 testdata/identifier/id_fun_sub.tokens.json
 create mode 100644 testdata/identifier/id_fun_sum.fs
 create mode 100644 testdata/identifier/id_fun_sum.tokens.json
diff --git a/dev_doc/syntax.md b/dev_doc/syntax.md
index 1eaa4d2..2b48bd9 100644
--- a/dev_doc/syntax.md
+++ b/dev_doc/syntax.md
@@ -4,6 +4,19 @@
 - the `;` character is the function/match terminator
 - the difference between "variables" and "functions" is the `(args) ->` part
 
+# Stdlib
+
+- `print: str -> unit` -- print a string to the console
+
+# List
+- `hd: [T] -> T` -- get the head of a list
+- `tl: [T] -> [T]` -- get the tail of a list
+
+## Option
+- `Opt` -- a type that can be `Just` or `Nil`
+- `Just: T -> Opt` -- create a `Just` value
+- `Nil: Opt` -- create a `Nil` value
+
 # Imports
 ```python
 imp test # use: test.
@@ -28,7 +41,6 @@ imp * of test # import all
 - `bool` -- `True` or `False`
 - `[T]` -- List of `T`
 - `(T, U)` -- Tuple with two elements of type `T` and `U`
-- `opt` -- `Just` or `Nil`
 
 ```python
 # With type annotation
@@ -39,7 +51,6 @@ x_bool: bool = True # False
 x_str: str = "hello 👾"
 x_list: [int] = [1, 2, 3]
 x_tuple: (int, str) = (1, "hello")
-x_option: opt = Just(1) # Nil
 x_f1: () -> unit = () -> print "hello" ;
 x_f2: () -> int = () -> 1 ;
 x_f3: (T) -> T = (x) -> x ; # Generic

diff --git a/dev_doc/syntax_definition.sh b/dev_doc/syntax_definition.sh
deleted file mode 100644
index f629bc5..0000000
--- a/dev_doc/syntax_definition.sh
+++ /dev/null
@@ -1,190 +0,0 @@
-# It is an sh file because it is easier to use the syntax highlighting of the editor
-
-# Imports
-imp test # use: test.
-imp test as t # use: t.
-imp (
-    test as t
-    test2
-    test3
-)
-imp f1 of test # use: f1
-imp { .. } of test # import all
-
-# Native Types
-# With type annotation (no type inference)
-# x_char: char = '👾' # unicode
-x_int: int = 1
-x_float: float = 1.0
-x_bool: bool = true # false
-x_str: str = "hello 👾"
-x_list: [int] = [1, 2, 3]
-x_tuple: (int, str) = (1, "hello")
-x_option: opt = Just(1) # Nil
-x_f1: () -> unit = () -> print "hello"
-x_f2: () -> int = () -> 1
-x_f3: (T) -> T = (x) -> x # Generic
-
-# Without type annotation (type inference)
-# x_char = '👾' # unicode
-x_int = 1
-x_float = 1.0
-x_bool = true # false
-x_str = "hello 👾"
-x_list = [1, 2, 3]
-x_tuple = (1, "hello")
-x_option = Just(1) # Nil
-x_f1 = () -> print "hello" ;
-x_f2 = () -> 1 ;
-x_f3 = (x) -> x ; # Generic
-
-# Record
-data MyRecord = {
-    a: int
-    b: mut str
-    c: int
-    d: mut str
-}
-record: MyRecord = {a: 1, b: "a", c: 2, d: "b"}
-a = record.a
-b = record.b
-c = record.c
-d = record.d
-{a, b, c, d} = record
-record.a = 3 # Error: record.a is immutable
-record.b = "c" # OK
-record.c = 4 # Error: record.c is immutable
-record.d = "d" # OK
-
-# Variant
-data MyVariant =
-| First
-| Second
-| Third(Int)
-;
-first = First
-second = Second
-third = Third(1)
-match_variant: MyVariant -> str = (v) ->
-    match v
-    | First => "first"
-    | Second => "second"
-    | Third(_) => "third"
-    ;
-;
-
-
-# Functions Overview
-f_base: (int, int) -> int = (a, b) ->
-    square_a: int = a * a
-    square_b: int = b * b
-    square_a + square_b # return
-;
-
-f_match: (int) -> str = (a) ->
-    match a
-    | 0 => "zero"
-    | 1 => "one"
-    | _ => "other"
-    ;
-;
-
-f2 = (a, b) ->
-    if gt a b then # `gt` (greater than) will be imported from the standard library
-        a_square = a * a
-        a_square # return
-    else
-        b * b # return
-;
-
-f3 = (a, b) ->
-    f_inner = (a, b) ->
-        if gt a b then
-            a_square = a * a
-            a_square # return
-        else
-            b * b # return
-    ;
-    f_inner a b
-;
-
-f4 = (a, b) ->
-    f_inner = (..) -> # `..` inherits all arguments of the parent function
-        if gt a b then
-            a_square \ # `\` is the line continuation character
-                = a * a
-            a_square # return
-        else
-            b * b # return
-    ;
-    f_inner
-;
-
-print_test = () -> print "test" ;
-
-## Recursive functions
-fact = (n) -> if eq n 0 then 1 else n * fact (n - 1) ;
-
-## Higher order functions
-apply: ((T) -> U, T) -> U = (f, x) -> f x ;
-
-## Partial application
-add: (T, T) -> T = (a, b) -> a + b ;
-add_1: (Int) -> Int = add 1 ;
-
-# Function call
-result = add 1 2
-
-# Function Composition
-f1 = (x) -> x + 1
-f2 = (x) -> x * 2
-f3 = f3 = f1 (f2 x) # f1 . f2
-result = f3 1 # 3
-
-# Lists Overviews
-l: [int] = [1, 2, 3, 4, 5] # mutable with cons and concat
-head = hd l # 1
-tail = tl l # [2, 3, 4, 5]
-head, tail = l
-first, second, tail = l
-new_list = 0 : l # [0, 1, 2, 3, 4, 5] cons operator
-new_list2 = l ++ [6, 7, 8, 9] # [1, 2, 3, 4, 5, 6, 7, 8, 9] concat operator
-match_list: [Int] -> Str = (l) ->
-    match l
-    | [] => "empty"
-    | [single] => match single
-        | 0 => "zero"
-        | _ => "other"
-        ;
-    | [first, second] => "two"
-    | first : second : tail => "first, second and tail"
-    | head : tail => "head and tail"
-    ;
-;
-
-# Some List Functions
-map: ((T) -> U, [T]) -> [U] = (f, l) ->
-    match l
-    | [] => []
-    | head : tail => f head : map f tail
-    ;
-;
-
-filter: ((T) -> Bool, [T]) -> [T] = (f, l) ->
-    match l
-    | [] => []
-    | head : tail =>
-        if f head then
-            head : filter f tail
-        else
-            filter f tail
-    ;
-;
-
-# Tuples Overview
-tuple: (int, str, int, str) = (1, "a", 2, "b") # immutable
-first = tuple.0
-second = tuple.1
-third = tuple.2
-fourth = tuple.3
-(one, a, two, b) = tuple

diff --git a/examples/test.fs b/examples/test.fs
index 04ba125..c13a8bb 100644
--- a/examples/test.fs
+++ b/examples/test.fs
@@ -1 +1 @@
-x_func: (int) -> int = (x) -> x ;
+x_func: (int) -> int = (x) -> x - 1 ;

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index f019d6b..6ae4940 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -129,7 +129,7 @@ mod tests {
     #[test]
     fn identifier() {
         let fs_files = collect_fs_files("./testdata/identifier", true);
-        assert_eq!(fs_files.len(), 20);
+        assert_eq!(fs_files.len(), 24);
 
         for path in fs_files {
             info!("file -> {:?}", path);

diff --git a/src/lexer/states.rs b/src/lexer/states.rs
index d06c881..ddd9044 100644
--- a/src/lexer/states.rs
+++ b/src/lexer/states.rs
@@ -4,11 +4,6 @@ use super::Lexer;
 use super::LexerError;
 use crate::lexer::token::Token;
 use crate::lexer::token::TokenKind;
-use crate::lexer::token::TokenKind::TokenCloseBrace;
-use crate::lexer::token::TokenKind::TokenCloseBracket;
-use crate::lexer::token::TokenKind::TokenCloseParen;
-use crate::lexer::token::TokenKind::TokenDoubleQuote;
-use crate::lexer::token::TokenKind::TokenSingleQuote;
 use std::fmt::Debug;
 
 pub trait State: Debug {
@@ -222,29 +217,6 @@ impl State for StateSymbol {
     fn visit(&self, cursor: &mut Cursor) -> Result<Transition, LexerError> {
         match cursor.peek() {
             Some('\n') => {
-                let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string();
-                let token_kind = TokenKind::from(&lexeme);
-
-                // NOTE: if a '\n' is found and it was scanning another "symbol" token, the previous was mangled, and only the '\n' is emitted,
-                // we need to handle the previous token since it can be at the end of the line
-                let valid_last_token = vec![
-                    TokenCloseBracket,
-                    TokenCloseParen,
-                    TokenCloseBrace,
-                    TokenDoubleQuote,
-                    TokenSingleQuote,
-                ];
-                if valid_last_token.contains(&token_kind) {
-                    return Ok(Lexer::proceed(
-                        Box::new(StateStart),
-                        TransitionKind::EmitToken(Token::new(
-                            token_kind,
-                            lexeme,
-                            cursor.location().clone(),
-                        )),
-                    ));
-                }
-
                 let transition = Lexer::proceed(
                     Box::new(StateStart),
                     TransitionKind::EmitToken(Token::new(
@@ -259,7 +231,7 @@ impl State for StateSymbol {
             Some(c) if TokenKind::can_be_followed_by_symbol(c.to_string().as_str()) => Ok(
                 Lexer::proceed(Box::new(StateSymbol), TransitionKind::AdvanceOffset),
             ),
-            _ => {
+            Some(_) if TokenKind::is_symbol(cursor.peek().unwrap().to_string().as_str()) => {
                 let lexeme =
                     cursor.source().content()[cursor.index()..cursor.offset() + 1].to_string();
                 let token_kind = TokenKind::from(&lexeme);
@@ -270,6 +242,24 @@ impl State for StateSymbol {
                     TransitionKind::EmitToken(Token::new(token_kind, lexeme, location)),
                 ))
             }
+            Some(_) if !TokenKind::is_symbol(cursor.peek().unwrap().to_string().as_str()) => {
+                let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string();
+                let token_kind = TokenKind::from(&lexeme);
+                let location = cursor.location().clone();
+                Ok(Lexer::proceed(
+                    Box::new(StateStart),
+                    TransitionKind::EmitToken(Token::new(token_kind, lexeme, location)),
+                ))
+            }
+            Some(c) => Ok(Lexer::proceed(
+                Box::new(StateStart),
+                TransitionKind::EmitToken(Token::new(
+                    TokenKind::TokenUnknown,
+                    c.to_string(),
+                    cursor.location().clone(),
+                )),
+            )),
+            None => Ok(Lexer::proceed(Box::new(StateEOF), TransitionKind::Consume)),
         }
     }
 }

diff --git a/src/lexer/token.rs b/src/lexer/token.rs
index 4d330ca..c30da28 100644
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@@ -24,6 +24,10 @@ const OPEN_BRACE: &str = "[";
 const CLOSE_BRACE: &str = "]";
 const COMMA: &str = ",";
 const MINUS: &str = "-";
+const PLUS: &str = "+";
+const MULTIPLY: &str = "*";
+const DIVIDE: &str = "/";
+const GREATER: &str = ">";
 const RIGHT_ARROW: &str = "->";
 
 #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
@@ -55,12 +59,41 @@ pub enum TokenKind {
     TokenOpenBracket,  // [
     TokenCloseBracket, // ]
     TokenComma,        // ,
+    TokenGreater,      // >
     TokenRightArrow,   // ->
     TokenEOF,          // End of file
+    // Operators
+    TokenPlus,     // +
+    TokenMinus,    // -
+    TokenMultiply, // *
+    TokenDivide,   // /
     TokenUnknown,
 }
 
 impl TokenKind {
+    pub fn is_symbol(c: &str) -> bool {
+        matches!(
+            c,
+            COLON
+                | SEMICOLON
+                | ASSIGN
+                | SINGLE_QUOTE
+                | DOUBLE_QUOTE
+                | OPEN_PAREN
+                | CLOSE_PAREN
+                | OPEN_BRACE
+                | CLOSE_BRACE
+                | OPEN_BRACKET
+                | CLOSE_BRACKET
+                | COMMA
+                | MINUS
+                | PLUS
+                | MULTIPLY
+                | DIVIDE
+                | GREATER
+        )
+    }
+
     pub fn is_start_of_symbol(c: &str) -> bool {
         matches!(
             c,
@@ -77,8 +110,11 @@ impl TokenKind {
                 | CLOSE_BRACKET
                 | COMMA
                 | MINUS
-                | RIGHT_ARROW
+                | PLUS
+                | MULTIPLY
+                | DIVIDE
                 | NEW_LINE
+                | RIGHT_ARROW
         )
     }
 
@@ -125,6 +161,10 @@ impl TokenKind {
             OPEN_BRACKET => Some(TokenKind::TokenOpenBracket),
             CLOSE_BRACKET => Some(TokenKind::TokenCloseBracket),
             COMMA => Some(TokenKind::TokenComma),
+            PLUS => Some(TokenKind::TokenPlus),
+            MINUS => Some(TokenKind::TokenMinus),
+            MULTIPLY => Some(TokenKind::TokenMultiply),
+            DIVIDE => Some(TokenKind::TokenDivide),
             RIGHT_ARROW => Some(TokenKind::TokenRightArrow),
             _ => None,
         }
@@ -326,9 +366,14 @@ impl std::fmt::Display for TokenKind {
             TokenKind::TokenCloseBrace => write!(f, "TokenCloseBrace"),
             TokenKind::TokenOpenBracket => write!(f, "TokenOpenBracket"),
             TokenKind::TokenCloseBracket => write!(f, "TokenCloseBracket"),
+            TokenKind::TokenGreater => write!(f, "TokenGreater"),
             TokenKind::TokenComma => write!(f, "TokenComma"),
             TokenKind::TokenRightArrow => write!(f, "TokenRightArrow"),
             TokenKind::TokenEOF => write!(f, "TokenEOF"),
+            TokenKind::TokenPlus => write!(f, "TokenPlus"),
+            TokenKind::TokenMinus => write!(f, "TokenMinus"),
+            TokenKind::TokenMultiply => write!(f, "TokenMultiply"),
+            TokenKind::TokenDivide => write!(f, "TokenDivide"),
             TokenKind::TokenUnknown => write!(f, "TokenUnknown"),
         }
     }

diff --git a/testdata/identifier/id_fun_div.fs b/testdata/identifier/id_fun_div.fs
new file mode 100644
index 0000000..e042ad4
--- /dev/null
+++ b/testdata/identifier/id_fun_div.fs
@@ -0,0 +1 @@
+x_func: (int) -> int = (x) -> x / 1 ;

diff --git a/testdata/identifier/id_fun_div.tokens.json b/testdata/identifier/id_fun_div.tokens.json
new file mode 100644
index 0000000..f175b13
--- /dev/null
+++ b/testdata/identifier/id_fun_div.tokens.json
@@ -0,0 +1,33 @@
+[
+  { "kind": "TokenIdentifier", "lexeme": "x_func", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } },
+  { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } },
+  { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } },
+  { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } },
+  { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } },
+  { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 16 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } },
+  { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 20 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } },
+  { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } },
+  { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } },
+  { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } },
+  { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 27 } },
+  { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 27, "column_end": 29 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 29, "column_end": 30 } },
+  { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 30, "column_end": 31 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 31, "column_end": 32 } },
+  { "kind": "TokenDivide", "lexeme": "/", "location": { "file_path": "", "line": 0, "column_start": 32, "column_end": 33 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 34 } },
+  { "kind": { "TokenLiteral": "Int" }, "lexeme": "1", "location": { "file_path": "", "line": 0, "column_start": 34, "column_end": 35 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 35, "column_end": 36 } },
+  { "kind": "TokenSemicolon", "lexeme": ";", "location": { "file_path": "", "line": 0, "column_start": 36, "column_end": 37 } },
+  { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 37, "column_end": 37 } },
"", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } +] + + + diff --git a/testdata/identifier/id_fun_mul.fs b/testdata/identifier/id_fun_mul.fs new file mode 100644 index 0000000..27ccd9a --- /dev/null +++ b/testdata/identifier/id_fun_mul.fs @@ -0,0 +1 @@ +x_func: (int) -> int = (x) -> x * 1 ; diff --git a/testdata/identifier/id_fun_mul.tokens.json b/testdata/identifier/id_fun_mul.tokens.json new file mode 100644 index 0000000..57936d2 --- /dev/null +++ b/testdata/identifier/id_fun_mul.tokens.json @@ -0,0 +1,32 @@ +[ + { "kind": "TokenIdentifier", "lexeme": "x_func", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } }, + { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 16 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 20 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 27 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 27, "column_end": 29 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 29, "column_end": 30 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 30, "column_end": 31 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 31, "column_end": 32 } }, + { "kind": "TokenMultiply", "lexeme": "*", "location": { "file_path": "", "line": 0, "column_start": 32, "column_end": 33 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 34 } }, + { "kind": { "TokenLiteral": "Int" }, "lexeme": "1", "location": { 
"file_path": "", "line": 0, "column_start": 34, "column_end": 35 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 35, "column_end": 36 } }, + { "kind": "TokenSemicolon", "lexeme": ";", "location": { "file_path": "", "line": 0, "column_start": 36, "column_end": 37 } }, + { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 37, "column_end": 37 } }, + { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } +] + + diff --git a/testdata/identifier/id_fun_sub.fs b/testdata/identifier/id_fun_sub.fs new file mode 100644 index 0000000..c13a8bb --- /dev/null +++ b/testdata/identifier/id_fun_sub.fs @@ -0,0 +1 @@ +x_func: (int) -> int = (x) -> x - 1 ; diff --git a/testdata/identifier/id_fun_sub.tokens.json b/testdata/identifier/id_fun_sub.tokens.json new file mode 100644 index 0000000..9872b34 --- /dev/null +++ b/testdata/identifier/id_fun_sub.tokens.json @@ -0,0 +1,31 @@ +[ + { "kind": "TokenIdentifier", "lexeme": "x_func", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } }, + { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 16 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 20 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 27 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 27, "column_end": 29 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 29, "column_end": 30 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, 
"column_start": 30, "column_end": 31 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 31, "column_end": 32 } }, + { "kind": "TokenMinus", "lexeme": "-", "location": { "file_path": "", "line": 0, "column_start": 32, "column_end": 33 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 34 } }, + { "kind": { "TokenLiteral": "Int" }, "lexeme": "1", "location": { "file_path": "", "line": 0, "column_start": 34, "column_end": 35 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 35, "column_end": 36 } }, + { "kind": "TokenSemicolon", "lexeme": ";", "location": { "file_path": "", "line": 0, "column_start": 36, "column_end": 37 } }, + { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 37, "column_end": 37 } }, + { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } } +] + diff --git a/testdata/identifier/id_fun_sum.fs b/testdata/identifier/id_fun_sum.fs new file mode 100644 index 0000000..aa7aa3f --- /dev/null +++ b/testdata/identifier/id_fun_sum.fs @@ -0,0 +1 @@ +x_func: (int) -> int = (x) -> x + 1 ; diff --git a/testdata/identifier/id_fun_sum.tokens.json b/testdata/identifier/id_fun_sum.tokens.json new file mode 100644 index 0000000..f34c587 --- /dev/null +++ b/testdata/identifier/id_fun_sum.tokens.json @@ -0,0 +1,30 @@ +[ + { "kind": "TokenIdentifier", "lexeme": "x_func", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } }, + { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } }, + { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 16 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } }, + { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 20 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } }, + { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } }, + { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } }, + { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } }, + { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } }, + { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } 
diff --git a/testdata/identifier/id_fun_sum.tokens.json b/testdata/identifier/id_fun_sum.tokens.json
new file mode 100644
index 0000000..f34c587
--- /dev/null
+++ b/testdata/identifier/id_fun_sum.tokens.json
@@ -0,0 +1,30 @@
+[
+  { "kind": "TokenIdentifier", "lexeme": "x_func", "location": { "file_path": "", "line": 0, "column_start": 0, "column_end": 6 } },
+  { "kind": "TokenColon", "lexeme": ":", "location": { "file_path": "", "line": 0, "column_start": 6, "column_end": 7 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 7, "column_end": 8 } },
+  { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 8, "column_end": 9 } },
+  { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 9, "column_end": 12 } },
+  { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 12, "column_end": 13 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 13, "column_end": 14 } },
+  { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 14, "column_end": 16 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 16, "column_end": 17 } },
+  { "kind": "TokenType", "lexeme": "int", "location": { "file_path": "", "line": 0, "column_start": 17, "column_end": 20 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 20, "column_end": 21 } },
+  { "kind": "TokenAssign", "lexeme": "=", "location": { "file_path": "", "line": 0, "column_start": 21, "column_end": 22 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 22, "column_end": 23 } },
+  { "kind": "TokenOpenParen", "lexeme": "(", "location": { "file_path": "", "line": 0, "column_start": 23, "column_end": 24 } },
+  { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 24, "column_end": 25 } },
+  { "kind": "TokenCloseParen", "lexeme": ")", "location": { "file_path": "", "line": 0, "column_start": 25, "column_end": 26 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 26, "column_end": 27 } },
+  { "kind": "TokenRightArrow", "lexeme": "->", "location": { "file_path": "", "line": 0, "column_start": 27, "column_end": 29 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 29, "column_end": 30 } },
+  { "kind": "TokenIdentifier", "lexeme": "x", "location": { "file_path": "", "line": 0, "column_start": 30, "column_end": 31 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 31, "column_end": 32 } },
+  { "kind": "TokenPlus", "lexeme": "+", "location": { "file_path": "", "line": 0, "column_start": 32, "column_end": 33 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 33, "column_end": 34 } },
+  { "kind": { "TokenLiteral": "Int" }, "lexeme": "1", "location": { "file_path": "", "line": 0, "column_start": 34, "column_end": 35 } },
+  { "kind": "TokenSpace", "lexeme": " ", "location": { "file_path": "", "line": 0, "column_start": 35, "column_end": 36 } },
+  { "kind": "TokenSemicolon", "lexeme": ";", "location": { "file_path": "", "line": 0, "column_start": 36, "column_end": 37 } },
+  { "kind": "TokenNewLine", "lexeme": "\\n", "location": { "file_path": "", "line": 0, "column_start": 37, "column_end": 37 } },
+  { "kind": "TokenEOF", "lexeme": "", "location": { "file_path": "", "line": 1, "column_start": 0, "column_end": 0 } }
+]
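With the operators in place, classification of a completed lexeme follows a fixed order: keywords and types first, then separators and operators, then literals, falling back to identifier. That is the shape of `TokenKind::from` in `src/lexer/token.rs` (`match_keyword`, then `match_separator`, and so on). A simplified, illustrative version of that lookup, not the crate's actual code:

```rust
#[derive(Debug)]
enum Kind {
    Type,
    Symbol,
    Int,
    Identifier,
}

// Order matters: a lexeme like "int" must resolve as a type before the
// identifier fallback is tried, and "->" before "-" would ever be consulted.
fn classify(lexeme: &str) -> Kind {
    match lexeme {
        "unit" | "int" | "float" | "bool" | "str" => Kind::Type,
        ":" | ";" | "=" | "->" | "+" | "-" | "*" | "/" | "(" | ")" | "[" | "]" | "{" | "}"
        | "," => Kind::Symbol,
        _ if !lexeme.is_empty() && lexeme.chars().all(|c| c.is_ascii_digit()) => Kind::Int,
        _ => Kind::Identifier,
    }
}

fn main() {
    // The lexemes of `x_func: (int) -> int = (x) -> x - 1 ;` from examples/test.fs:
    for lx in ["x_func", ":", "(", "int", ")", "->", "=", "x", "-", "1", ";"] {
        println!("{lx:>6} -> {:?}", classify(lx));
    }
}
```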