scanner 1
This commit is contained in:
parent
d4aa7cc282
commit
9716e88dcb
2
go.mod
2
go.mod
@ -1,3 +1,5 @@
|
||||
module git.sharkk.net/Sharkk/Mako
|
||||
|
||||
go 1.24.1
|
||||
|
||||
require git.sharkk.net/Go/Assert v1.1.0
|
||||
|
4
go.sum
4
go.sum
@ -1,2 +1,2 @@
|
||||
git.sharkk.net/Go/Assert v0.0.0-20250426205601-1b0e5ea6e7ac h1:B6iLK3nv2ubDfk5Ve9Z2sRPqpTgPWgsm7PyaWlwr3NY=
|
||||
git.sharkk.net/Go/Assert v0.0.0-20250426205601-1b0e5ea6e7ac/go.mod h1:7AMVm0RCtLlQfWsnKs6h/IdSfzj52/o0nR03rCW68gM=
|
||||
git.sharkk.net/Go/Assert v1.1.0 h1:1Nbu8C9vmv3gXaLR4S+NBXfQ01gnh3IHHD7PQRIVIe8=
|
||||
git.sharkk.net/Go/Assert v1.1.0/go.mod h1:7AMVm0RCtLlQfWsnKs6h/IdSfzj52/o0nR03rCW68gM=
|
||||
|
315
scanner/scanner.go
Normal file
315
scanner/scanner.go
Normal file
@ -0,0 +1,315 @@
|
||||
package scanner
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
|
||||
"git.sharkk.net/Sharkk/Mako/types"
|
||||
)
|
||||
|
||||
// Scanner holds the state needed for scanning.
//
// A Scanner is created with New and hands out tokens one at a time via
// NextToken (or all at once via ScanTokens). Positions are tracked as
// 1-based line/column numbers; start/current are byte offsets into source.
type Scanner struct {
	source  string // full input text being scanned
	start   int    // start of the current lexeme (byte offset)
	current int    // current position in the source (byte offset)
	line    int    // current line number (1-based)
	column  int    // current column number (1-based)
}
|
||||
|
||||
// New creates a new scanner for the given source
|
||||
func New(source string) *Scanner {
|
||||
return &Scanner{
|
||||
source: source,
|
||||
line: 1,
|
||||
column: 1,
|
||||
}
|
||||
}
|
||||
|
||||
// NextToken returns the next token from the source
|
||||
func (s *Scanner) NextToken() types.Token {
|
||||
s.skipWhitespace()
|
||||
|
||||
s.start = s.current
|
||||
|
||||
if s.isAtEnd() {
|
||||
return s.makeToken(types.EOF)
|
||||
}
|
||||
|
||||
c := s.advance()
|
||||
|
||||
if isAlpha(c) {
|
||||
return s.identifier()
|
||||
}
|
||||
|
||||
if isDigit(c) {
|
||||
return s.number()
|
||||
}
|
||||
|
||||
switch c {
|
||||
case '(':
|
||||
return s.makeToken(types.LEFT_PAREN)
|
||||
case ')':
|
||||
return s.makeToken(types.RIGHT_PAREN)
|
||||
case ',':
|
||||
return s.makeToken(types.COMMA)
|
||||
case '+':
|
||||
return s.makeToken(types.PLUS)
|
||||
case '-':
|
||||
return s.makeToken(types.MINUS)
|
||||
case '*':
|
||||
return s.makeToken(types.STAR)
|
||||
case '/':
|
||||
if s.match('/') {
|
||||
// Comment goes until end of line
|
||||
for s.peek() != '\n' && !s.isAtEnd() {
|
||||
s.advance()
|
||||
}
|
||||
// Recursive call to get the next non-comment token
|
||||
return s.NextToken()
|
||||
}
|
||||
return s.makeToken(types.SLASH)
|
||||
case '.':
|
||||
if s.match('.') {
|
||||
if s.match('.') {
|
||||
return s.makeToken(types.ELLIPSIS)
|
||||
}
|
||||
// Error for '..' without the third '.'
|
||||
return s.errorToken("Expected '...' (ellipsis).")
|
||||
}
|
||||
// Handle single '.' later (likely part of a number)
|
||||
// For now, error
|
||||
return s.errorToken("Unexpected '.'.")
|
||||
case '=':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.EQUAL_EQUAL)
|
||||
}
|
||||
return s.makeToken(types.EQUAL)
|
||||
case '!':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.BANG_EQUAL)
|
||||
}
|
||||
return s.errorToken("Unexpected character.")
|
||||
case '<':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.LESS_EQUAL)
|
||||
}
|
||||
return s.makeToken(types.LESS)
|
||||
case '>':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.GREATER_EQUAL)
|
||||
}
|
||||
return s.makeToken(types.GREATER)
|
||||
case '"':
|
||||
return s.string()
|
||||
}
|
||||
|
||||
return s.errorToken("Unexpected character.")
|
||||
}
|
||||
|
||||
// ScanTokens scans all tokens in the source and returns them
|
||||
func (s *Scanner) ScanTokens() []types.Token {
|
||||
var tokens []types.Token
|
||||
|
||||
for {
|
||||
token := s.NextToken()
|
||||
tokens = append(tokens, token)
|
||||
|
||||
if token.Type == types.EOF {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
// Helper methods for scanning

// isAtEnd reports whether the scanner has consumed all of the source.
func (s *Scanner) isAtEnd() bool {
	return s.current >= len(s.source)
}

// advance consumes and returns the current byte, moving the cursor and
// column forward by one. Callers must check isAtEnd first; advance
// itself does no bounds check.
func (s *Scanner) advance() byte {
	c := s.source[s.current]
	s.current++
	s.column++
	return c
}

// peek returns the current byte without consuming it, or 0 at the end
// of the source.
func (s *Scanner) peek() byte {
	if s.isAtEnd() {
		return 0
	}
	return s.source[s.current]
}

// peekNext returns the byte after the current one without consuming
// anything, or 0 when no such byte exists.
func (s *Scanner) peekNext() byte {
	if s.current+1 >= len(s.source) {
		return 0
	}
	return s.source[s.current+1]
}

// match conditionally consumes the current byte: if it equals expected,
// the cursor advances and match reports true; otherwise nothing is
// consumed and match reports false.
func (s *Scanner) match(expected byte) bool {
	if s.isAtEnd() || s.source[s.current] != expected {
		return false
	}

	s.current++
	s.column++
	return true
}
|
||||
|
||||
// makeToken builds a token of the given type with no literal value,
// using the current lexeme boundaries [s.start, s.current).
func (s *Scanner) makeToken(tokenType types.TokenType) types.Token {
	return s.makeTokenWithLiteral(tokenType, nil)
}

// makeTokenWithLiteral builds a token for the lexeme between s.start
// and s.current, attaching the given literal value.
func (s *Scanner) makeTokenWithLiteral(tokenType types.TokenType, literal any) types.Token {
	lexeme := s.source[s.start:s.current]
	return types.Token{
		Type:    tokenType,
		Lexeme:  lexeme,
		Literal: literal,
		// Column points at the lexeme's first character. This assumes
		// the lexeme sits on a single line — a multi-line string
		// literal would skew it; TODO confirm that is acceptable.
		Line:   s.line,
		Column: s.column - len(lexeme),
	}
}

// errorToken builds an ERROR token carrying message as its lexeme,
// positioned at the scanner's current line and column.
func (s *Scanner) errorToken(message string) types.Token {
	return types.Token{
		Type:   types.ERROR,
		Lexeme: message,
		Line:   s.line,
		Column: s.column,
	}
}
|
||||
|
||||
func (s *Scanner) skipWhitespace() {
|
||||
for {
|
||||
c := s.peek()
|
||||
switch c {
|
||||
case ' ', '\r', '\t':
|
||||
s.advance()
|
||||
case '\n':
|
||||
s.line++
|
||||
s.column = 0 // Reset column for new line
|
||||
s.advance()
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// string scans a double-quoted string literal; the opening quote has
// already been consumed. Newlines are permitted inside the literal and
// update the line/column counters. Returns an ERROR token when the
// closing quote is missing, otherwise a STRING token whose literal is
// the unquoted text.
func (s *Scanner) string() types.Token {
	// Scan until closing quote
	for s.peek() != '"' && !s.isAtEnd() {
		if s.peek() == '\n' {
			s.line++
			s.column = 0 // advance() below brings this back to 1
		}
		s.advance()
	}

	if s.isAtEnd() {
		return s.errorToken("Unterminated string.")
	}

	// Consume the closing "
	s.advance()

	// Get the string value (without the quotes)
	value := s.source[s.start+1 : s.current-1]
	return s.makeTokenWithLiteral(types.STRING, value)
}
|
||||
|
||||
func (s *Scanner) number() types.Token {
|
||||
// Scan integer part
|
||||
for isDigit(s.peek()) {
|
||||
s.advance()
|
||||
}
|
||||
|
||||
// Look for a decimal part
|
||||
if s.peek() == '.' && isDigit(s.peekNext()) {
|
||||
// Consume the .
|
||||
s.advance()
|
||||
|
||||
// Consume decimal digits
|
||||
for isDigit(s.peek()) {
|
||||
s.advance()
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the number
|
||||
value, err := strconv.ParseFloat(s.source[s.start:s.current], 64)
|
||||
if err != nil {
|
||||
return s.errorToken("Invalid number.")
|
||||
}
|
||||
|
||||
return s.makeTokenWithLiteral(types.NUMBER, value)
|
||||
}
|
||||
|
||||
func (s *Scanner) identifier() types.Token {
|
||||
for isAlphaNumeric(s.peek()) {
|
||||
s.advance()
|
||||
}
|
||||
|
||||
// Check if the identifier is actually a keyword
|
||||
text := s.source[s.start:s.current]
|
||||
tokenType := s.keywordType(text)
|
||||
|
||||
var literal any
|
||||
if tokenType == types.TRUE {
|
||||
literal = true
|
||||
} else if tokenType == types.FALSE {
|
||||
literal = false
|
||||
} else if tokenType == types.NIL {
|
||||
literal = nil
|
||||
}
|
||||
|
||||
return s.makeTokenWithLiteral(tokenType, literal)
|
||||
}
|
||||
|
||||
func (s *Scanner) keywordType(text string) types.TokenType {
|
||||
switch text {
|
||||
case "and":
|
||||
return types.AND
|
||||
case "or":
|
||||
return types.OR
|
||||
case "if":
|
||||
return types.IF
|
||||
case "elseif":
|
||||
return types.ELSEIF
|
||||
case "else":
|
||||
return types.ELSE
|
||||
case "then":
|
||||
return types.THEN
|
||||
case "end":
|
||||
return types.END
|
||||
case "fn":
|
||||
return types.FN
|
||||
case "return":
|
||||
return types.RETURN
|
||||
case "echo":
|
||||
return types.ECHO
|
||||
case "true":
|
||||
return types.TRUE
|
||||
case "false":
|
||||
return types.FALSE
|
||||
case "nil":
|
||||
return types.NIL
|
||||
default:
|
||||
return types.IDENTIFIER
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions

// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c byte) bool {
	return '0' <= c && c <= '9'
}

// isAlpha reports whether c is an ASCII letter or underscore (the
// characters that may start an identifier).
func isAlpha(c byte) bool {
	if c == '_' {
		return true
	}
	folded := c | 0x20 // fold ASCII upper case to lower
	return 'a' <= folded && folded <= 'z'
}

// isAlphaNumeric reports whether c may appear inside an identifier.
func isAlphaNumeric(c byte) bool {
	return isAlpha(c) || isDigit(c)
}
|
234
scanner/scanner_test.go
Normal file
234
scanner/scanner_test.go
Normal file
@ -0,0 +1,234 @@
|
||||
package scanner_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
assert "git.sharkk.net/Go/Assert"
|
||||
"git.sharkk.net/Sharkk/Mako/scanner"
|
||||
"git.sharkk.net/Sharkk/Mako/types"
|
||||
)
|
||||
|
||||
// checkToken asserts that a token's type, lexeme, line, and column all
// match the expected values. Shared by the position-sensitive tests.
func checkToken(t *testing.T, token types.Token, expectedType types.TokenType, expectedLexeme string, expectedLine int, expectedColumn int) {
	assert.Equal(t, expectedType, token.Type)
	assert.Equal(t, expectedLexeme, token.Lexeme)
	assert.Equal(t, expectedLine, token.Line)
	assert.Equal(t, expectedColumn, token.Column)
}
|
||||
|
||||
// TestSingleTokens scans each single-token source in isolation and
// checks the token's type, lexeme, and 1-based position, then verifies
// the scanner follows up with EOF at the column just past the lexeme.
func TestSingleTokens(t *testing.T) {
	tests := []struct {
		source  string
		tokType types.TokenType
		lexeme  string
		line    int
		column  int
	}{
		{"(", types.LEFT_PAREN, "(", 1, 1},
		{")", types.RIGHT_PAREN, ")", 1, 1},
		{",", types.COMMA, ",", 1, 1},
		{"+", types.PLUS, "+", 1, 1},
		{"-", types.MINUS, "-", 1, 1},
		{"*", types.STAR, "*", 1, 1},
		{"/", types.SLASH, "/", 1, 1},
		{"=", types.EQUAL, "=", 1, 1},
		{"==", types.EQUAL_EQUAL, "==", 1, 1},
		{"!=", types.BANG_EQUAL, "!=", 1, 1},
		{"<", types.LESS, "<", 1, 1},
		{"<=", types.LESS_EQUAL, "<=", 1, 1},
		{">", types.GREATER, ">", 1, 1},
		{">=", types.GREATER_EQUAL, ">=", 1, 1},
		{"if", types.IF, "if", 1, 1},
		{"then", types.THEN, "then", 1, 1},
		{"elseif", types.ELSEIF, "elseif", 1, 1},
		{"else", types.ELSE, "else", 1, 1},
		{"end", types.END, "end", 1, 1},
		{"fn", types.FN, "fn", 1, 1},
		{"return", types.RETURN, "return", 1, 1},
		{"echo", types.ECHO, "echo", 1, 1},
		{"true", types.TRUE, "true", 1, 1},
		{"false", types.FALSE, "false", 1, 1},
		{"nil", types.NIL, "nil", 1, 1},
		{"and", types.AND, "and", 1, 1},
		{"or", types.OR, "or", 1, 1},
		{"identifier", types.IDENTIFIER, "identifier", 1, 1},
		{"...", types.ELLIPSIS, "...", 1, 1},
	}

	for _, test := range tests {
		s := scanner.New(test.source)
		token := s.NextToken()

		checkToken(t, token, test.tokType, test.lexeme, test.line, test.column)

		// Next token should be EOF, positioned right after the lexeme.
		token = s.NextToken()
		checkToken(t, token, types.EOF, "", test.line, test.column+len(test.lexeme))
	}
}
|
||||
|
||||
// TestNumbers verifies that integer and decimal literals produce NUMBER
// tokens whose Literal is the parsed float64 value.
func TestNumbers(t *testing.T) {
	tests := []struct {
		source string
		lexeme string
		value  float64
	}{
		{"123", "123", 123.0},
		{"123.456", "123.456", 123.456},
		{"0.123", "0.123", 0.123},
		{"0", "0", 0.0},
	}

	for _, test := range tests {
		s := scanner.New(test.source)
		token := s.NextToken()

		assert.Equal(t, types.NUMBER, token.Type)
		assert.Equal(t, test.lexeme, token.Lexeme)
		assert.Equal(t, test.value, token.Literal.(float64))
	}
}

// TestStrings verifies that quoted literals produce STRING tokens: the
// lexeme keeps the surrounding quotes while the Literal is the bare
// string value (including the empty string).
func TestStrings(t *testing.T) {
	tests := []struct {
		source string
		lexeme string
		value  string
	}{
		{"\"hello\"", "\"hello\"", "hello"},
		{"\"\"", "\"\"", ""},
		{"\"hello world\"", "\"hello world\"", "hello world"},
	}

	for _, test := range tests {
		s := scanner.New(test.source)
		token := s.NextToken()

		assert.Equal(t, types.STRING, token.Type)
		assert.Equal(t, test.lexeme, token.Lexeme)
		assert.Equal(t, test.value, token.Literal.(string))
	}
}

// TestComments verifies that a // comment is skipped entirely and the
// tokens on the following line carry correct line/column positions.
func TestComments(t *testing.T) {
	s := scanner.New("// This is a comment\nx = 5")

	token := s.NextToken()
	checkToken(t, token, types.IDENTIFIER, "x", 2, 1)

	token = s.NextToken()
	checkToken(t, token, types.EQUAL, "=", 2, 3)

	token = s.NextToken()
	checkToken(t, token, types.NUMBER, "5", 2, 5)

	token = s.NextToken()
	checkToken(t, token, types.EOF, "", 2, 6)
}
|
||||
|
||||
// TestMultipleTokens scans a whole function definition and checks the
// type and lexeme of every token in sequence, ending with EOF.
func TestMultipleTokens(t *testing.T) {
	source := "fn add(a, b) return a + b end"
	s := scanner.New(source)

	expected := []struct {
		tokType types.TokenType
		lexeme  string
	}{
		{types.FN, "fn"},
		{types.IDENTIFIER, "add"},
		{types.LEFT_PAREN, "("},
		{types.IDENTIFIER, "a"},
		{types.COMMA, ","},
		{types.IDENTIFIER, "b"},
		{types.RIGHT_PAREN, ")"},
		{types.RETURN, "return"},
		{types.IDENTIFIER, "a"},
		{types.PLUS, "+"},
		{types.IDENTIFIER, "b"},
		{types.END, "end"},
		{types.EOF, ""},
	}

	for _, exp := range expected {
		token := s.NextToken()
		assert.Equal(t, exp.tokType, token.Type)
		assert.Equal(t, exp.lexeme, token.Lexeme)
	}
}

// TestScanTokens verifies the batch API: ScanTokens returns every token
// including the trailing EOF (12 real tokens + EOF = 13).
func TestScanTokens(t *testing.T) {
	source := "fn add(a, b) return a + b end"
	s := scanner.New(source)

	tokens := s.ScanTokens()

	assert.Equal(t, 13, len(tokens))
	assert.Equal(t, types.FN, tokens[0].Type)
	assert.Equal(t, types.EOF, tokens[12].Type)
}

// TestLineAndColumn verifies that positions advance across a newline:
// line increments and column restarts at 1 on the second line.
func TestLineAndColumn(t *testing.T) {
	source := "x = 1\ny = 2"
	s := scanner.New(source)

	token := s.NextToken() // x
	checkToken(t, token, types.IDENTIFIER, "x", 1, 1)

	token = s.NextToken() // =
	checkToken(t, token, types.EQUAL, "=", 1, 3)

	token = s.NextToken() // 1
	checkToken(t, token, types.NUMBER, "1", 1, 5)

	token = s.NextToken() // y
	checkToken(t, token, types.IDENTIFIER, "y", 2, 1)

	token = s.NextToken() // =
	checkToken(t, token, types.EQUAL, "=", 2, 3)

	token = s.NextToken() // 2
	checkToken(t, token, types.NUMBER, "2", 2, 5)
}
|
||||
|
||||
// TestErrors verifies that malformed input yields ERROR tokens rather
// than panicking: unterminated strings, unknown characters, and a bare
// '!' (only valid as part of '!=').
func TestErrors(t *testing.T) {
	// Unterminated string
	s := scanner.New("\"unterminated")
	token := s.NextToken()
	assert.Equal(t, types.ERROR, token.Type)

	// Invalid character
	s = scanner.New("@")
	token = s.NextToken()
	assert.Equal(t, types.ERROR, token.Type)

	// Standalone ! without =
	s = scanner.New("!")
	token = s.NextToken()
	assert.Equal(t, types.ERROR, token.Type)
}

// TestLiterals verifies the literal payloads of the keyword tokens:
// true/false carry their bool value, nil carries a nil literal.
func TestLiterals(t *testing.T) {
	// Test true literal
	s := scanner.New("true")
	token := s.NextToken()
	assert.Equal(t, types.TRUE, token.Type)
	assert.Equal(t, true, token.Literal.(bool))

	// Test false literal
	s = scanner.New("false")
	token = s.NextToken()
	assert.Equal(t, types.FALSE, token.Type)
	assert.Equal(t, false, token.Literal.(bool))

	// Test nil literal
	s = scanner.New("nil")
	token = s.NextToken()
	assert.Equal(t, types.NIL, token.Type)
	assert.Nil(t, token.Literal)
}

// TestWhitespace verifies that mixed spaces, tabs, CR, and a newline
// are skipped, and the first real token is positioned correctly on
// line 2 (two leading spaces -> column 3).
func TestWhitespace(t *testing.T) {
	s := scanner.New("  \t  \r\n  x")
	token := s.NextToken()
	checkToken(t, token, types.IDENTIFIER, "x", 2, 3)
}
|
Loading…
x
Reference in New Issue
Block a user