scanner 1
This commit is contained in:
parent
d4aa7cc282
commit
9716e88dcb
2
go.mod
2
go.mod
@ -1,3 +1,5 @@
|
||||
module git.sharkk.net/Sharkk/Mako
|
||||
|
||||
go 1.24.1
|
||||
|
||||
require git.sharkk.net/Go/Assert v1.1.0
|
||||
|
4
go.sum
4
go.sum
@ -1,2 +1,2 @@
|
||||
git.sharkk.net/Go/Assert v0.0.0-20250426205601-1b0e5ea6e7ac h1:B6iLK3nv2ubDfk5Ve9Z2sRPqpTgPWgsm7PyaWlwr3NY=
|
||||
git.sharkk.net/Go/Assert v0.0.0-20250426205601-1b0e5ea6e7ac/go.mod h1:7AMVm0RCtLlQfWsnKs6h/IdSfzj52/o0nR03rCW68gM=
|
||||
git.sharkk.net/Go/Assert v1.1.0 h1:1Nbu8C9vmv3gXaLR4S+NBXfQ01gnh3IHHD7PQRIVIe8=
|
||||
git.sharkk.net/Go/Assert v1.1.0/go.mod h1:7AMVm0RCtLlQfWsnKs6h/IdSfzj52/o0nR03rCW68gM=
|
||||
|
315
scanner/scanner.go
Normal file
315
scanner/scanner.go
Normal file
@ -0,0 +1,315 @@
|
||||
package scanner
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
|
||||
"git.sharkk.net/Sharkk/Mako/types"
|
||||
)
|
||||
|
||||
// Scanner holds the state needed for scanning.
//
// A Scanner is created with New and hands out tokens one at a time via
// NextToken (or all at once via ScanTokens). Positions are tracked as
// 1-based line/column numbers; start/current are byte offsets into source.
type Scanner struct {
	source  string // full input text being scanned
	start   int    // start of the current lexeme (byte offset)
	current int    // current position in the source (byte offset)
	line    int    // current line number (1-based)
	column  int    // current column number (1-based)
}
|
||||
|
||||
// New creates a new scanner for the given source
|
||||
func New(source string) *Scanner {
|
||||
return &Scanner{
|
||||
source: source,
|
||||
line: 1,
|
||||
column: 1,
|
||||
}
|
||||
}
|
||||
|
||||
// NextToken returns the next token from the source
|
||||
func (s *Scanner) NextToken() types.Token {
|
||||
s.skipWhitespace()
|
||||
|
||||
s.start = s.current
|
||||
|
||||
if s.isAtEnd() {
|
||||
return s.makeToken(types.EOF)
|
||||
}
|
||||
|
||||
c := s.advance()
|
||||
|
||||
if isAlpha(c) {
|
||||
return s.identifier()
|
||||
}
|
||||
|
||||
if isDigit(c) {
|
||||
return s.number()
|
||||
}
|
||||
|
||||
switch c {
|
||||
case '(':
|
||||
return s.makeToken(types.LEFT_PAREN)
|
||||
case ')':
|
||||
return s.makeToken(types.RIGHT_PAREN)
|
||||
case ',':
|
||||
return s.makeToken(types.COMMA)
|
||||
case '+':
|
||||
return s.makeToken(types.PLUS)
|
||||
case '-':
|
||||
return s.makeToken(types.MINUS)
|
||||
case '*':
|
||||
return s.makeToken(types.STAR)
|
||||
case '/':
|
||||
if s.match('/') {
|
||||
// Comment goes until end of line
|
||||
for s.peek() != '\n' && !s.isAtEnd() {
|
||||
s.advance()
|
||||
}
|
||||
// Recursive call to get the next non-comment token
|
||||
return s.NextToken()
|
||||
}
|
||||
return s.makeToken(types.SLASH)
|
||||
case '.':
|
||||
if s.match('.') {
|
||||
if s.match('.') {
|
||||
return s.makeToken(types.ELLIPSIS)
|
||||
}
|
||||
// Error for '..' without the third '.'
|
||||
return s.errorToken("Expected '...' (ellipsis).")
|
||||
}
|
||||
// Handle single '.' later (likely part of a number)
|
||||
// For now, error
|
||||
return s.errorToken("Unexpected '.'.")
|
||||
case '=':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.EQUAL_EQUAL)
|
||||
}
|
||||
return s.makeToken(types.EQUAL)
|
||||
case '!':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.BANG_EQUAL)
|
||||
}
|
||||
return s.errorToken("Unexpected character.")
|
||||
case '<':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.LESS_EQUAL)
|
||||
}
|
||||
return s.makeToken(types.LESS)
|
||||
case '>':
|
||||
if s.match('=') {
|
||||
return s.makeToken(types.GREATER_EQUAL)
|
||||
}
|
||||
return s.makeToken(types.GREATER)
|
||||
case '"':
|
||||
return s.string()
|
||||
}
|
||||
|
||||
return s.errorToken("Unexpected character.")
|
||||
}
|
||||
|
||||
// ScanTokens scans all tokens in the source and returns them
|
||||
func (s *Scanner) ScanTokens() []types.Token {
|
||||
var tokens []types.Token
|
||||
|
||||
for {
|
||||
token := s.NextToken()
|
||||
tokens = append(tokens, token)
|
||||
|
||||
if token.Type == types.EOF {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
// Helper methods for scanning

// isAtEnd reports whether the scanner has consumed all of the source.
func (s *Scanner) isAtEnd() bool {
	return s.current >= len(s.source)
}

// advance consumes and returns the current byte, moving the cursor and
// column forward by one. Callers must check isAtEnd first; advance
// itself does no bounds check.
func (s *Scanner) advance() byte {
	c := s.source[s.current]
	s.current++
	s.column++
	return c
}

// peek returns the current byte without consuming it, or 0 at the end
// of the source.
func (s *Scanner) peek() byte {
	if s.isAtEnd() {
		return 0
	}
	return s.source[s.current]
}

// peekNext returns the byte after the current one without consuming
// anything, or 0 when no such byte exists.
func (s *Scanner) peekNext() byte {
	if s.current+1 >= len(s.source) {
		return 0
	}
	return s.source[s.current+1]
}

// match conditionally consumes the current byte: if it equals expected,
// the cursor advances and match reports true; otherwise nothing is
// consumed and match reports false.
func (s *Scanner) match(expected byte) bool {
	if s.isAtEnd() || s.source[s.current] != expected {
		return false
	}

	s.current++
	s.column++
	return true
}
|
||||
|
||||
// makeToken builds a token of the given type with no literal value,
// using the current lexeme boundaries [s.start, s.current).
func (s *Scanner) makeToken(tokenType types.TokenType) types.Token {
	return s.makeTokenWithLiteral(tokenType, nil)
}

// makeTokenWithLiteral builds a token for the lexeme between s.start
// and s.current, attaching the given literal value.
func (s *Scanner) makeTokenWithLiteral(tokenType types.TokenType, literal any) types.Token {
	lexeme := s.source[s.start:s.current]
	return types.Token{
		Type:    tokenType,
		Lexeme:  lexeme,
		Literal: literal,
		// Column points at the lexeme's first character. This assumes
		// the lexeme sits on a single line — a multi-line string
		// literal would skew it; TODO confirm that is acceptable.
		Line:   s.line,
		Column: s.column - len(lexeme),
	}
}

// errorToken builds an ERROR token carrying message as its lexeme,
// positioned at the scanner's current line and column.
func (s *Scanner) errorToken(message string) types.Token {
	return types.Token{
		Type:   types.ERROR,
		Lexeme: message,
		Line:   s.line,
		Column: s.column,
	}
}
|
||||
|
||||
func (s *Scanner) skipWhitespace() {
|
||||
for {
|
||||
c := s.peek()
|
||||
switch c {
|
||||
case ' ', '\r', '\t':
|
||||
s.advance()
|
||||
case '\n':
|
||||
s.line++
|
||||
s.column = 0 // Reset column for new line
|
||||
s.advance()
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// string scans a double-quoted string literal; the opening quote has
// already been consumed. Newlines are permitted inside the literal and
// update the line/column counters. Returns an ERROR token when the
// closing quote is missing, otherwise a STRING token whose literal is
// the unquoted text.
func (s *Scanner) string() types.Token {
	// Scan until closing quote
	for s.peek() != '"' && !s.isAtEnd() {
		if s.peek() == '\n' {
			s.line++
			s.column = 0 // advance() below brings this back to 1
		}
		s.advance()
	}

	if s.isAtEnd() {
		return s.errorToken("Unterminated string.")
	}

	// Consume the closing "
	s.advance()

	// Get the string value (without the quotes)
	value := s.source[s.start+1 : s.current-1]
	return s.makeTokenWithLiteral(types.STRING, value)
}
|
||||
|
||||
func (s *Scanner) number() types.Token {
|
||||
// Scan integer part
|
||||
for isDigit(s.peek()) {
|
||||
s.advance()
|
||||
}
|
||||
|
||||
// Look for a decimal part
|
||||
if s.peek() == '.' && isDigit(s.peekNext()) {
|
||||
// Consume the .
|
||||
s.advance()
|
||||
|
||||
// Consume decimal digits
|
||||
for isDigit(s.peek()) {
|
||||
s.advance()
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the number
|
||||
value, err := strconv.ParseFloat(s.source[s.start:s.current], 64)
|
||||
if err != nil {
|
||||
return s.errorToken("Invalid number.")
|
||||
}
|
||||
|
||||
return s.makeTokenWithLiteral(types.NUMBER, value)
|
||||
}
|
||||
|
||||
func (s *Scanner) identifier() types.Token {
|
||||
for isAlphaNumeric(s.peek()) {
|
||||
s.advance()
|
||||
}
|
||||
|
||||
// Check if the identifier is actually a keyword
|
||||
text := s.source[s.start:s.current]
|
||||
tokenType := s.keywordType(text)
|
||||
|
||||
var literal any
|
||||
if tokenType == types.TRUE {
|
||||
literal = true
|
||||
} else if tokenType == types.FALSE {
|
||||
literal = false
|
||||
} else if tokenType == types.NIL {
|
||||
literal = nil
|
||||
}
|
||||
|
||||
return s.makeTokenWithLiteral(tokenType, literal)
|
||||
}
|
||||
|
||||
func (s *Scanner) keywordType(text string) types.TokenType {
|
||||
switch text {
|
||||
case "and":
|
||||
return types.AND
|
||||
case "or":
|
||||
return types.OR
|
||||
case "if":
|
||||
return types.IF
|
||||
case "elseif":
|
||||
return types.ELSEIF
|
||||
case "else":
|
||||
return types.ELSE
|
||||
case "then":
|
||||
return types.THEN
|
||||
case "end":
|
||||
return types.END
|
||||
case "fn":
|
||||
return types.FN
|
||||
case "return":
|
||||
return types.RETURN
|
||||
case "echo":
|
||||
return types.ECHO
|
||||
case "true":
|
||||
return types.TRUE
|
||||
case "false":
|
||||
return types.FALSE
|
||||
case "nil":
|
||||
return types.NIL
|
||||
default:
|
||||
return types.IDENTIFIER
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions

// isDigit reports whether c is an ASCII decimal digit.
func isDigit(c byte) bool {
	return '0' <= c && c <= '9'
}

// isAlpha reports whether c is an ASCII letter or underscore (the
// characters that may start an identifier).
func isAlpha(c byte) bool {
	if c == '_' {
		return true
	}
	folded := c | 0x20 // fold ASCII upper case to lower
	return 'a' <= folded && folded <= 'z'
}

// isAlphaNumeric reports whether c may appear inside an identifier.
func isAlphaNumeric(c byte) bool {
	return isAlpha(c) || isDigit(c)
}
|
234
scanner/scanner_test.go
Normal file
234
scanner/scanner_test.go
Normal file
@ -0,0 +1,234 @@
|
||||
package scanner_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
assert "git.sharkk.net/Go/Assert"
|
||||
"git.sharkk.net/Sharkk/Mako/scanner"
|
||||
"git.sharkk.net/Sharkk/Mako/types"
|
||||
)
|
||||
|
||||
// checkToken asserts that a token's type, lexeme, line, and column all
// match the expected values. Shared by the position-sensitive tests.
func checkToken(t *testing.T, token types.Token, expectedType types.TokenType, expectedLexeme string, expectedLine int, expectedColumn int) {
	assert.Equal(t, expectedType, token.Type)
	assert.Equal(t, expectedLexeme, token.Lexeme)
	assert.Equal(t, expectedLine, token.Line)
	assert.Equal(t, expectedColumn, token.Column)
}
|
||||
|
||||
// TestSingleTokens scans each single-token source in isolation and
// checks the token's type, lexeme, and 1-based position, then verifies
// the scanner follows up with EOF at the column just past the lexeme.
func TestSingleTokens(t *testing.T) {
	tests := []struct {
		source  string
		tokType types.TokenType
		lexeme  string
		line    int
		column  int
	}{
		{"(", types.LEFT_PAREN, "(", 1, 1},
		{")", types.RIGHT_PAREN, ")", 1, 1},
		{",", types.COMMA, ",", 1, 1},
		{"+", types.PLUS, "+", 1, 1},
		{"-", types.MINUS, "-", 1, 1},
		{"*", types.STAR, "*", 1, 1},
		{"/", types.SLASH, "/", 1, 1},
		{"=", types.EQUAL, "=", 1, 1},
		{"==", types.EQUAL_EQUAL, "==", 1, 1},
		{"!=", types.BANG_EQUAL, "!=", 1, 1},
		{"<", types.LESS, "<", 1, 1},
		{"<=", types.LESS_EQUAL, "<=", 1, 1},
		{">", types.GREATER, ">", 1, 1},
		{">=", types.GREATER_EQUAL, ">=", 1, 1},
		{"if", types.IF, "if", 1, 1},
		{"then", types.THEN, "then", 1, 1},
		{"elseif", types.ELSEIF, "elseif", 1, 1},
		{"else", types.ELSE, "else", 1, 1},
		{"end", types.END, "end", 1, 1},
		{"fn", types.FN, "fn", 1, 1},
		{"return", types.RETURN, "return", 1, 1},
		{"echo", types.ECHO, "echo", 1, 1},
		{"true", types.TRUE, "true", 1, 1},
		{"false", types.FALSE, "false", 1, 1},
		{"nil", types.NIL, "nil", 1, 1},
		{"and", types.AND, "and", 1, 1},
		{"or", types.OR, "or", 1, 1},
		{"identifier", types.IDENTIFIER, "identifier", 1, 1},
		{"...", types.ELLIPSIS, "...", 1, 1},
	}

	for _, test := range tests {
		s := scanner.New(test.source)
		token := s.NextToken()

		checkToken(t, token, test.tokType, test.lexeme, test.line, test.column)

		// Next token should be EOF, positioned right after the lexeme.
		token = s.NextToken()
		checkToken(t, token, types.EOF, "", test.line, test.column+len(test.lexeme))
	}
}
|
||||
|
||||
// TestNumbers verifies that integer and decimal literals produce NUMBER
// tokens whose Literal is the parsed float64 value.
func TestNumbers(t *testing.T) {
	tests := []struct {
		source string
		lexeme string
		value  float64
	}{
		{"123", "123", 123.0},
		{"123.456", "123.456", 123.456},
		{"0.123", "0.123", 0.123},
		{"0", "0", 0.0},
	}

	for _, test := range tests {
		s := scanner.New(test.source)
		token := s.NextToken()

		assert.Equal(t, types.NUMBER, token.Type)
		assert.Equal(t, test.lexeme, token.Lexeme)
		assert.Equal(t, test.value, token.Literal.(float64))
	}
}

// TestStrings verifies that quoted literals produce STRING tokens: the
// lexeme keeps the surrounding quotes while the Literal is the bare
// string value (including the empty string).
func TestStrings(t *testing.T) {
	tests := []struct {
		source string
		lexeme string
		value  string
	}{
		{"\"hello\"", "\"hello\"", "hello"},
		{"\"\"", "\"\"", ""},
		{"\"hello world\"", "\"hello world\"", "hello world"},
	}

	for _, test := range tests {
		s := scanner.New(test.source)
		token := s.NextToken()

		assert.Equal(t, types.STRING, token.Type)
		assert.Equal(t, test.lexeme, token.Lexeme)
		assert.Equal(t, test.value, token.Literal.(string))
	}
}

// TestComments verifies that a // comment is skipped entirely and the
// tokens on the following line carry correct line/column positions.
func TestComments(t *testing.T) {
	s := scanner.New("// This is a comment\nx = 5")

	token := s.NextToken()
	checkToken(t, token, types.IDENTIFIER, "x", 2, 1)

	token = s.NextToken()
	checkToken(t, token, types.EQUAL, "=", 2, 3)

	token = s.NextToken()
	checkToken(t, token, types.NUMBER, "5", 2, 5)

	token = s.NextToken()
	checkToken(t, token, types.EOF, "", 2, 6)
}
|
||||
|
||||
// TestMultipleTokens scans a whole function definition and checks the
// type and lexeme of every token in sequence, ending with EOF.
func TestMultipleTokens(t *testing.T) {
	source := "fn add(a, b) return a + b end"
	s := scanner.New(source)

	expected := []struct {
		tokType types.TokenType
		lexeme  string
	}{
		{types.FN, "fn"},
		{types.IDENTIFIER, "add"},
		{types.LEFT_PAREN, "("},
		{types.IDENTIFIER, "a"},
		{types.COMMA, ","},
		{types.IDENTIFIER, "b"},
		{types.RIGHT_PAREN, ")"},
		{types.RETURN, "return"},
		{types.IDENTIFIER, "a"},
		{types.PLUS, "+"},
		{types.IDENTIFIER, "b"},
		{types.END, "end"},
		{types.EOF, ""},
	}

	for _, exp := range expected {
		token := s.NextToken()
		assert.Equal(t, exp.tokType, token.Type)
		assert.Equal(t, exp.lexeme, token.Lexeme)
	}
}

// TestScanTokens verifies the batch API: ScanTokens returns every token
// including the trailing EOF (12 real tokens + EOF = 13).
func TestScanTokens(t *testing.T) {
	source := "fn add(a, b) return a + b end"
	s := scanner.New(source)

	tokens := s.ScanTokens()

	assert.Equal(t, 13, len(tokens))
	assert.Equal(t, types.FN, tokens[0].Type)
	assert.Equal(t, types.EOF, tokens[12].Type)
}

// TestLineAndColumn verifies that positions advance across a newline:
// line increments and column restarts at 1 on the second line.
func TestLineAndColumn(t *testing.T) {
	source := "x = 1\ny = 2"
	s := scanner.New(source)

	token := s.NextToken() // x
	checkToken(t, token, types.IDENTIFIER, "x", 1, 1)

	token = s.NextToken() // =
	checkToken(t, token, types.EQUAL, "=", 1, 3)

	token = s.NextToken() // 1
	checkToken(t, token, types.NUMBER, "1", 1, 5)

	token = s.NextToken() // y
	checkToken(t, token, types.IDENTIFIER, "y", 2, 1)

	token = s.NextToken() // =
	checkToken(t, token, types.EQUAL, "=", 2, 3)

	token = s.NextToken() // 2
	checkToken(t, token, types.NUMBER, "2", 2, 5)
}
|
||||
|
||||
// TestErrors verifies that malformed input yields ERROR tokens rather
// than panicking: unterminated strings, unknown characters, and a bare
// '!' (only valid as part of '!=').
func TestErrors(t *testing.T) {
	// Unterminated string
	s := scanner.New("\"unterminated")
	token := s.NextToken()
	assert.Equal(t, types.ERROR, token.Type)

	// Invalid character
	s = scanner.New("@")
	token = s.NextToken()
	assert.Equal(t, types.ERROR, token.Type)

	// Standalone ! without =
	s = scanner.New("!")
	token = s.NextToken()
	assert.Equal(t, types.ERROR, token.Type)
}

// TestLiterals verifies the literal payloads of the keyword tokens:
// true/false carry their bool value, nil carries a nil literal.
func TestLiterals(t *testing.T) {
	// Test true literal
	s := scanner.New("true")
	token := s.NextToken()
	assert.Equal(t, types.TRUE, token.Type)
	assert.Equal(t, true, token.Literal.(bool))

	// Test false literal
	s = scanner.New("false")
	token = s.NextToken()
	assert.Equal(t, types.FALSE, token.Type)
	assert.Equal(t, false, token.Literal.(bool))

	// Test nil literal
	s = scanner.New("nil")
	token = s.NextToken()
	assert.Equal(t, types.NIL, token.Type)
	assert.Nil(t, token.Literal)
}

// TestWhitespace verifies that mixed spaces, tabs, CR, and a newline
// are skipped, and the first real token is positioned correctly on
// line 2 (two leading spaces -> column 3).
func TestWhitespace(t *testing.T) {
	s := scanner.New("  \t  \r\n  x")
	token := s.NextToken()
	checkToken(t, token, types.IDENTIFIER, "x", 2, 3)
}
|
Loading…
x
Reference in New Issue
Block a user