From 9716e88dcb220dac62fb46d9669188ce554581d0 Mon Sep 17 00:00:00 2001
From: Sky Johnson
Date: Wed, 7 May 2025 08:38:33 -0500
Subject: [PATCH] scanner 1

---
 go.mod                  |   2 +
 go.sum                  |   4 +-
 scanner/scanner.go      | 317 ++++++++++++++++++++++++++++++++++++++++
 scanner/scanner_test.go | 234 +++++++++++++++++++++++++++++
 4 files changed, 555 insertions(+), 2 deletions(-)
 create mode 100644 scanner/scanner.go
 create mode 100644 scanner/scanner_test.go

diff --git a/go.mod b/go.mod
index 9130d44..ba8bdda 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module git.sharkk.net/Sharkk/Mako
 
 go 1.24.1
+
+require git.sharkk.net/Go/Assert v1.1.0
diff --git a/go.sum b/go.sum
index 115793b..54a850a 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,2 @@
-git.sharkk.net/Go/Assert v0.0.0-20250426205601-1b0e5ea6e7ac h1:B6iLK3nv2ubDfk5Ve9Z2sRPqpTgPWgsm7PyaWlwr3NY=
-git.sharkk.net/Go/Assert v0.0.0-20250426205601-1b0e5ea6e7ac/go.mod h1:7AMVm0RCtLlQfWsnKs6h/IdSfzj52/o0nR03rCW68gM=
+git.sharkk.net/Go/Assert v1.1.0 h1:1Nbu8C9vmv3gXaLR4S+NBXfQ01gnh3IHHD7PQRIVIe8=
+git.sharkk.net/Go/Assert v1.1.0/go.mod h1:7AMVm0RCtLlQfWsnKs6h/IdSfzj52/o0nR03rCW68gM=
diff --git a/scanner/scanner.go b/scanner/scanner.go
new file mode 100644
index 0000000..88ef921
--- /dev/null
+++ b/scanner/scanner.go
@@ -0,0 +1,317 @@
+package scanner
+
+import (
+	"strconv"
+
+	"git.sharkk.net/Sharkk/Mako/types"
+)
+
+// Scanner holds the state needed for scanning
+type Scanner struct {
+	source  string
+	start   int // start of the current lexeme
+	current int // current position in the source
+	line    int // current line number
+	column  int // current column number
+}
+
+// New creates a new scanner for the given source
+func New(source string) *Scanner {
+	return &Scanner{
+		source: source,
+		line:   1,
+		column: 1,
+	}
+}
+
+// NextToken returns the next token from the source
+func (s *Scanner) NextToken() types.Token {
+	for {
+		s.skipWhitespace()
+
+		s.start = s.current
+
+		if s.isAtEnd() {
+			return s.makeToken(types.EOF)
+		}
+
+		c := s.advance()
+
+		if isAlpha(c) {
+			return s.identifier()
+		}
+
+		if isDigit(c) {
+			return s.number()
+		}
+
+		switch c {
+		case '(':
+			return s.makeToken(types.LEFT_PAREN)
+		case ')':
+			return s.makeToken(types.RIGHT_PAREN)
+		case ',':
+			return s.makeToken(types.COMMA)
+		case '+':
+			return s.makeToken(types.PLUS)
+		case '-':
+			return s.makeToken(types.MINUS)
+		case '*':
+			return s.makeToken(types.STAR)
+		case '/':
+			if s.match('/') {
+				// Comment runs to end of line; loop back around rather
+				// than recursing so long comment runs cannot grow the stack.
+				for s.peek() != '\n' && !s.isAtEnd() {
+					s.advance()
+				}
+				continue
+			}
+			return s.makeToken(types.SLASH)
+		case '.':
+			if s.match('.') {
+				if s.match('.') {
+					return s.makeToken(types.ELLIPSIS)
+				}
+				// Error for '..' without the third '.'
+				return s.errorToken("Expected '...' (ellipsis).")
+			}
+			// Handle single '.' later (likely part of a number)
+			// For now, error
+			return s.errorToken("Unexpected '.'.")
+		case '=':
+			if s.match('=') {
+				return s.makeToken(types.EQUAL_EQUAL)
+			}
+			return s.makeToken(types.EQUAL)
+		case '!':
+			if s.match('=') {
+				return s.makeToken(types.BANG_EQUAL)
+			}
+			return s.errorToken("Unexpected character.")
+		case '<':
+			if s.match('=') {
+				return s.makeToken(types.LESS_EQUAL)
+			}
+			return s.makeToken(types.LESS)
+		case '>':
+			if s.match('=') {
+				return s.makeToken(types.GREATER_EQUAL)
+			}
+			return s.makeToken(types.GREATER)
+		case '"':
+			return s.string()
+		}
+
+		return s.errorToken("Unexpected character.")
+	}
+}
+
+// ScanTokens scans all tokens in the source and returns them
+func (s *Scanner) ScanTokens() []types.Token {
+	var tokens []types.Token
+
+	for {
+		token := s.NextToken()
+		tokens = append(tokens, token)
+
+		if token.Type == types.EOF {
+			break
+		}
+	}
+
+	return tokens
+}
+
+// Helper methods for scanning
+func (s *Scanner) isAtEnd() bool {
+	return s.current >= len(s.source)
+}
+
+func (s *Scanner) advance() byte {
+	c := s.source[s.current]
+	s.current++
+	s.column++
+	return c
+}
+
+func (s *Scanner) peek() byte {
+	if s.isAtEnd() {
+		return 0
+	}
+	return s.source[s.current]
+}
+
+func (s *Scanner) peekNext() byte {
+	if s.current+1 >= len(s.source) {
+		return 0
+	}
+	return s.source[s.current+1]
+}
+
+func (s *Scanner) match(expected byte) bool {
+	if s.isAtEnd() || s.source[s.current] != expected {
+		return false
+	}
+
+	s.current++
+	s.column++
+	return true
+}
+
+func (s *Scanner) makeToken(tokenType types.TokenType) types.Token {
+	return s.makeTokenWithLiteral(tokenType, nil)
+}
+
+func (s *Scanner) makeTokenWithLiteral(tokenType types.TokenType, literal any) types.Token {
+	lexeme := s.source[s.start:s.current]
+	return types.Token{
+		Type:    tokenType,
+		Lexeme:  lexeme,
+		Literal: literal,
+		Line:    s.line,
+		Column:  s.column - len(lexeme),
+	}
+}
+
+func (s *Scanner) errorToken(message string) types.Token {
+	return types.Token{
+		Type:   types.ERROR,
+		Lexeme: message,
+		Line:   s.line,
+		Column: s.column,
+	}
+}
+
+func (s *Scanner) skipWhitespace() {
+	for {
+		c := s.peek()
+		switch c {
+		case ' ', '\r', '\t':
+			s.advance()
+		case '\n':
+			s.line++
+			s.column = 0 // Reset column for new line
+			s.advance()
+		default:
+			return
+		}
+	}
+}
+
+func (s *Scanner) string() types.Token {
+	// Scan until closing quote
+	// NOTE(review): column resets at '\n' here, so Column on a multi-line
+	// string token can underflow in makeTokenWithLiteral — confirm intent.
+	for s.peek() != '"' && !s.isAtEnd() {
+		if s.peek() == '\n' {
+			s.line++
+			s.column = 0
+		}
+		s.advance()
+	}
+
+	if s.isAtEnd() {
+		return s.errorToken("Unterminated string.")
+	}
+
+	// Consume the closing "
+	s.advance()
+
+	// Get the string value (without the quotes)
+	value := s.source[s.start+1 : s.current-1]
+	return s.makeTokenWithLiteral(types.STRING, value)
+}
+
+func (s *Scanner) number() types.Token {
+	// Scan integer part
+	for isDigit(s.peek()) {
+		s.advance()
+	}
+
+	// Look for a decimal part
+	if s.peek() == '.' && isDigit(s.peekNext()) {
+		// Consume the .
+		s.advance()
+
+		// Consume decimal digits
+		for isDigit(s.peek()) {
+			s.advance()
+		}
+	}
+
+	// Parse the number
+	value, err := strconv.ParseFloat(s.source[s.start:s.current], 64)
+	if err != nil {
+		return s.errorToken("Invalid number.")
+	}
+
+	return s.makeTokenWithLiteral(types.NUMBER, value)
+}
+
+func (s *Scanner) identifier() types.Token {
+	for isAlphaNumeric(s.peek()) {
+		s.advance()
+	}
+
+	// Check if the identifier is actually a keyword
+	text := s.source[s.start:s.current]
+	tokenType := s.keywordType(text)
+
+	var literal any
+	if tokenType == types.TRUE {
+		literal = true
+	} else if tokenType == types.FALSE {
+		literal = false
+	} else if tokenType == types.NIL {
+		literal = nil
+	}
+
+	return s.makeTokenWithLiteral(tokenType, literal)
+}
+
+func (s *Scanner) keywordType(text string) types.TokenType {
+	switch text {
+	case "and":
+		return types.AND
+	case "or":
+		return types.OR
+	case "if":
+		return types.IF
+	case "elseif":
+		return types.ELSEIF
+	case "else":
+		return types.ELSE
+	case "then":
+		return types.THEN
+	case "end":
+		return types.END
+	case "fn":
+		return types.FN
+	case "return":
+		return types.RETURN
+	case "echo":
+		return types.ECHO
+	case "true":
+		return types.TRUE
+	case "false":
+		return types.FALSE
+	case "nil":
+		return types.NIL
+	default:
+		return types.IDENTIFIER
+	}
+}
+
+// Helper functions
+func isDigit(c byte) bool {
+	return c >= '0' && c <= '9'
+}
+
+func isAlpha(c byte) bool {
+	return (c >= 'a' && c <= 'z') ||
+		(c >= 'A' && c <= 'Z') ||
+		c == '_'
+}
+
+func isAlphaNumeric(c byte) bool {
+	return isAlpha(c) || isDigit(c)
+}
diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go
new file mode 100644
index 0000000..6439f6a
--- /dev/null
+++ b/scanner/scanner_test.go
@@ -0,0 +1,234 @@
+package scanner_test
+
+import (
+	"testing"
+
+	assert "git.sharkk.net/Go/Assert"
+	"git.sharkk.net/Sharkk/Mako/scanner"
+	"git.sharkk.net/Sharkk/Mako/types"
+)
+
+// Helper function to check token equality
+func checkToken(t *testing.T, token types.Token, expectedType types.TokenType, expectedLexeme string, expectedLine int, expectedColumn int) {
+	assert.Equal(t, expectedType, token.Type)
+	assert.Equal(t, expectedLexeme, token.Lexeme)
+	assert.Equal(t, expectedLine, token.Line)
+	assert.Equal(t, expectedColumn, token.Column)
+}
+
+func TestSingleTokens(t *testing.T) {
+	tests := []struct {
+		source  string
+		tokType types.TokenType
+		lexeme  string
+		line    int
+		column  int
+	}{
+		{"(", types.LEFT_PAREN, "(", 1, 1},
+		{")", types.RIGHT_PAREN, ")", 1, 1},
+		{",", types.COMMA, ",", 1, 1},
+		{"+", types.PLUS, "+", 1, 1},
+		{"-", types.MINUS, "-", 1, 1},
+		{"*", types.STAR, "*", 1, 1},
+		{"/", types.SLASH, "/", 1, 1},
+		{"=", types.EQUAL, "=", 1, 1},
+		{"==", types.EQUAL_EQUAL, "==", 1, 1},
+		{"!=", types.BANG_EQUAL, "!=", 1, 1},
+		{"<", types.LESS, "<", 1, 1},
+		{"<=", types.LESS_EQUAL, "<=", 1, 1},
+		{">", types.GREATER, ">", 1, 1},
+		{">=", types.GREATER_EQUAL, ">=", 1, 1},
+		{"if", types.IF, "if", 1, 1},
+		{"then", types.THEN, "then", 1, 1},
+		{"elseif", types.ELSEIF, "elseif", 1, 1},
+		{"else", types.ELSE, "else", 1, 1},
+		{"end", types.END, "end", 1, 1},
+		{"fn", types.FN, "fn", 1, 1},
+		{"return", types.RETURN, "return", 1, 1},
+		{"echo", types.ECHO, "echo", 1, 1},
+		{"true", types.TRUE, "true", 1, 1},
+		{"false", types.FALSE, "false", 1, 1},
+		{"nil", types.NIL, "nil", 1, 1},
+		{"and", types.AND, "and", 1, 1},
+		{"or", types.OR, "or", 1, 1},
+		{"identifier", types.IDENTIFIER, "identifier", 1, 1},
+		{"...", types.ELLIPSIS, "...", 1, 1},
+	}
+
+	for _, test := range tests {
+		s := scanner.New(test.source)
+		token := s.NextToken()
+
+		checkToken(t, token, test.tokType, test.lexeme, test.line, test.column)
+
+		// Next token should be EOF
+		token = s.NextToken()
+		checkToken(t, token, types.EOF, "", test.line, test.column+len(test.lexeme))
+	}
+}
+
+func TestNumbers(t *testing.T) {
+	tests := []struct {
+		source string
+		lexeme string
+		value  float64
+	}{
+		{"123", "123", 123.0},
+		{"123.456", "123.456", 123.456},
+		{"0.123", "0.123", 0.123},
+		{"0", "0", 0.0},
+	}
+
+	for _, test := range tests {
+		s := scanner.New(test.source)
+		token := s.NextToken()
+
+		assert.Equal(t, types.NUMBER, token.Type)
+		assert.Equal(t, test.lexeme, token.Lexeme)
+		assert.Equal(t, test.value, token.Literal.(float64))
+	}
+}
+
+func TestStrings(t *testing.T) {
+	tests := []struct {
+		source string
+		lexeme string
+		value  string
+	}{
+		{"\"hello\"", "\"hello\"", "hello"},
+		{"\"\"", "\"\"", ""},
+		{"\"hello world\"", "\"hello world\"", "hello world"},
+	}
+
+	for _, test := range tests {
+		s := scanner.New(test.source)
+		token := s.NextToken()
+
+		assert.Equal(t, types.STRING, token.Type)
+		assert.Equal(t, test.lexeme, token.Lexeme)
+		assert.Equal(t, test.value, token.Literal.(string))
+	}
+}
+
+func TestComments(t *testing.T) {
+	s := scanner.New("// This is a comment\nx = 5")
+
+	token := s.NextToken()
+	checkToken(t, token, types.IDENTIFIER, "x", 2, 1)
+
+	token = s.NextToken()
+	checkToken(t, token, types.EQUAL, "=", 2, 3)
+
+	token = s.NextToken()
+	checkToken(t, token, types.NUMBER, "5", 2, 5)
+
+	token = s.NextToken()
+	checkToken(t, token, types.EOF, "", 2, 6)
+}
+
+func TestMultipleTokens(t *testing.T) {
+	source := "fn add(a, b) return a + b end"
+	s := scanner.New(source)
+
+	expected := []struct {
+		tokType types.TokenType
+		lexeme  string
+	}{
+		{types.FN, "fn"},
+		{types.IDENTIFIER, "add"},
+		{types.LEFT_PAREN, "("},
+		{types.IDENTIFIER, "a"},
+		{types.COMMA, ","},
+		{types.IDENTIFIER, "b"},
+		{types.RIGHT_PAREN, ")"},
+		{types.RETURN, "return"},
+		{types.IDENTIFIER, "a"},
+		{types.PLUS, "+"},
+		{types.IDENTIFIER, "b"},
+		{types.END, "end"},
+		{types.EOF, ""},
+	}
+
+	for _, exp := range expected {
+		token := s.NextToken()
+		assert.Equal(t, exp.tokType, token.Type)
+		assert.Equal(t, exp.lexeme, token.Lexeme)
+	}
+}
+
+func TestScanTokens(t *testing.T) {
+	source := "fn add(a, b) return a + b end"
+	s := scanner.New(source)
+
+	tokens := s.ScanTokens()
+
+	assert.Equal(t, 13, len(tokens))
+	assert.Equal(t, types.FN, tokens[0].Type)
+	assert.Equal(t, types.EOF, tokens[12].Type)
+}
+
+func TestLineAndColumn(t *testing.T) {
+	source := "x = 1\ny = 2"
+	s := scanner.New(source)
+
+	token := s.NextToken() // x
+	checkToken(t, token, types.IDENTIFIER, "x", 1, 1)
+
+	token = s.NextToken() // =
+	checkToken(t, token, types.EQUAL, "=", 1, 3)
+
+	token = s.NextToken() // 1
+	checkToken(t, token, types.NUMBER, "1", 1, 5)
+
+	token = s.NextToken() // y
+	checkToken(t, token, types.IDENTIFIER, "y", 2, 1)
+
+	token = s.NextToken() // =
+	checkToken(t, token, types.EQUAL, "=", 2, 3)
+
+	token = s.NextToken() // 2
+	checkToken(t, token, types.NUMBER, "2", 2, 5)
+}
+
+func TestErrors(t *testing.T) {
+	// Unterminated string
+	s := scanner.New("\"unterminated")
+	token := s.NextToken()
+	assert.Equal(t, types.ERROR, token.Type)
+
+	// Invalid character
+	s = scanner.New("@")
+	token = s.NextToken()
+	assert.Equal(t, types.ERROR, token.Type)
+
+	// Standalone ! without =
+	s = scanner.New("!")
+	token = s.NextToken()
+	assert.Equal(t, types.ERROR, token.Type)
+}
+
+func TestLiterals(t *testing.T) {
+	// Test true literal
+	s := scanner.New("true")
+	token := s.NextToken()
+	assert.Equal(t, types.TRUE, token.Type)
+	assert.Equal(t, true, token.Literal.(bool))
+
+	// Test false literal
+	s = scanner.New("false")
+	token = s.NextToken()
+	assert.Equal(t, types.FALSE, token.Type)
+	assert.Equal(t, false, token.Literal.(bool))
+
+	// Test nil literal
+	s = scanner.New("nil")
+	token = s.NextToken()
+	assert.Equal(t, types.NIL, token.Type)
+	assert.Nil(t, token.Literal)
+}
+
+func TestWhitespace(t *testing.T) {
+	s := scanner.New(" \t \r\n x")
+	token := s.NextToken()
+	checkToken(t, token, types.IDENTIFIER, "x", 2, 3)
+}