rewrite packet parser as string-based recursive descent

This commit is contained in:
Sky Johnson 2025-08-07 15:13:56 -05:00
parent 46121dcfc6
commit d24ec376a8
9 changed files with 1202 additions and 1145 deletions

View File

@ -78,7 +78,7 @@ func processDirectory(dirPath string, packets map[string]*parser.PacketDef) erro
err := processXMLFile(entryPath, packets)
if err != nil {
log.Printf("Warning: failed to process %s: %v", entryPath, err)
log.Printf("Warning: %s: %v", entryPath, err)
}
}
@ -93,7 +93,7 @@ func processXMLFile(filePath string, packets map[string]*parser.PacketDef) error
parsedPackets, err := parser.Parse(string(content))
if err != nil {
return fmt.Errorf("failed to parse XML: %w", err)
return fmt.Errorf("failed to parse packet def: %w", err)
}
for name, packet := range parsedPackets {

View File

@ -1,358 +0,0 @@
package parser
import (
"fmt"
"sync"
"unicode"
)
// Object pools for heavy reuse
//
// tokenPool recycles Token values between NextToken and ReleaseToken so
// steady-state lexing does not allocate one Token per token scanned.
// Index fields start at -1, the "unset range" sentinel used by Token.Tag
// and Token.Text; the attribute map is pre-sized for a typical tag.
var tokenPool = sync.Pool{
	New: func() any {
		return &Token{
			Attributes: make(map[string]string, 8), // typical attribute count per tag
			TagStart:   -1,
			TagEnd:     -1,
			TextStart:  -1,
			TextEnd:    -1,
		}
	},
}
// More efficient lexer using byte operations and minimal allocations.
//
// Lexer scans PML input byte-by-byte and hands out index ranges into the
// input rather than substring copies. line/col are 1-based and maintained
// purely for error reporting.
type Lexer struct {
	input []byte // raw input; []byte for cheap indexing and range slicing
	pos   int    // current offset into input
	line  int    // 1-based line number of pos
	col   int    // 1-based column number of pos
}
// NewLexer constructs a lexer over input, positioned at line 1, column 1.
func NewLexer(input string) *Lexer {
	l := &Lexer{input: []byte(input)}
	l.line, l.col = 1, 1
	return l
}
// peek returns the byte at the current position without consuming it,
// or 0 when the input is exhausted.
func (l *Lexer) peek() byte {
	if l.pos < len(l.input) {
		return l.input[l.pos]
	}
	return 0
}
// next consumes and returns the current byte, updating the line/column
// counters; it returns 0 at end of input.
func (l *Lexer) next() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	b := l.input[l.pos]
	l.pos++
	if b != '\n' {
		l.col++
	} else {
		l.line++
		l.col = 1
	}
	return b
}
// isSelfClosingTag reports whether the tag name at input[start:end] is one
// of the primitive PML field types that never carry a closing tag.
//
// The string(l.input[start:end]) conversion inside a switch is a pattern the
// Go compiler recognizes and compiles without allocating, so this is as
// cheap as the original hand-rolled per-byte comparisons while being far
// easier to audit and extend. The accepted set is identical to the original.
func (l *Lexer) isSelfClosingTag(start, end int) bool {
	switch string(l.input[start:end]) {
	case "i8", "i16", "i32", "i64",
		"si8", "si16", "si32", "si64",
		"f2", "f32", "f64", "double",
		"char", "str8", "str16", "str32",
		"color", "equip":
		return true
	}
	return false
}
// skipWhitespace advances past spaces, tabs, carriage returns, and
// newlines, keeping the line/column counters in sync.
func (l *Lexer) skipWhitespace() {
	for l.pos < len(l.input) {
		switch l.input[l.pos] {
		case '\n':
			l.line++
			l.col = 1
		case ' ', '\t', '\r':
			l.col++
		default:
			return
		}
		l.pos++
	}
}
// parseAttributes reads name="value" pairs into attrs until it reaches '>'
// or '/>'. attrs is cleared in place first so pooled tokens never carry
// attributes over from a previous tag. Values must be quoted with '"' or
// '\''; unquoted values and a missing '=' are reported as errors.
//
// Fix over the previous version: the name-terminator set now includes
// '\r', so input with CRLF line endings no longer ends up with a stray
// carriage return inside attribute names.
func (l *Lexer) parseAttributes(attrs map[string]string) error {
	// Reset without reallocating; the compiler lowers this to a map clear.
	for k := range attrs {
		delete(attrs, k)
	}
	for {
		l.skipWhitespace()
		if l.pos >= len(l.input) || l.peek() == '>' ||
			(l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>') {
			break
		}
		// Attribute name: run of bytes up to '=', whitespace, or '>'.
		nameStart := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '=' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '>' {
				break
			}
			// ch can never be '\n' here (it breaks above), so col always advances.
			l.pos++
			l.col++
		}
		nameEnd := l.pos
		if nameStart == nameEnd {
			break
		}
		l.skipWhitespace()
		if l.peek() != '=' {
			return fmt.Errorf("expected '=' after attribute name")
		}
		l.next() // consume '='
		l.skipWhitespace()
		// Attribute value: everything between a matched pair of quotes.
		quote := l.peek()
		if quote != '"' && quote != '\'' {
			return fmt.Errorf("attribute value must be quoted")
		}
		l.next() // consume opening quote
		valueStart := l.pos
		for l.pos < len(l.input) && l.input[l.pos] != quote {
			if l.input[l.pos] == '\n' {
				l.line++
				l.col = 1
			} else {
				l.col++
			}
			l.pos++
		}
		if l.pos >= len(l.input) {
			return fmt.Errorf("unclosed attribute value")
		}
		attrs[string(l.input[nameStart:nameEnd])] = string(l.input[valueStart:l.pos])
		l.next() // consume closing quote
	}
	return nil
}
// NextToken scans and returns the next token. Tokens come from tokenPool
// and should be handed back via ReleaseToken when the caller is done.
//
// Fix over the previous version: runs of whitespace-only text used to be
// skipped by tail-recursing into l.NextToken(), which abandoned the current
// pooled token (a pool leak that defeats the pooling) and could recurse
// deeply on pathological input. The skip is now a loop that reuses the same
// token; all scanning behavior is otherwise unchanged.
func (l *Lexer) NextToken() *Token {
	token := tokenPool.Get().(*Token)
	for {
		// Reset pooled state for this scan.
		token.Type = TokenError
		token.TagStart, token.TagEnd = -1, -1
		token.TextStart, token.TextEnd = -1, -1
		// Position is recorded before leading whitespace is skipped,
		// matching the original behavior.
		token.Line = l.line
		token.Col = l.col

		l.skipWhitespace()
		if l.pos >= len(l.input) {
			token.Type = TokenEOF
			return token
		}

		if l.peek() == '<' {
			l.next() // consume '<'

			// Comment: <!-- ... -->
			if l.pos+2 < len(l.input) &&
				l.input[l.pos] == '!' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '-' {
				l.pos += 3
				start := l.pos
				for l.pos+2 < len(l.input) {
					if l.input[l.pos] == '-' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '>' {
						token.Type = TokenComment
						token.TextStart = start
						token.TextEnd = l.pos
						l.pos += 3
						return token
					}
					if l.input[l.pos] == '\n' {
						l.line++
						l.col = 1
					} else {
						l.col++
					}
					l.pos++
				}
				token.Type = TokenError // unterminated comment
				return token
			}

			// Closing tag: </name>
			if l.peek() == '/' {
				l.next() // consume '/'
				start := l.pos
				for l.pos < len(l.input) && l.input[l.pos] != '>' {
					l.pos++
					l.col++
				}
				if l.pos >= len(l.input) {
					token.Type = TokenError // unterminated closing tag
					return token
				}
				token.Type = TokenCloseTag
				token.TagStart = start
				token.TagEnd = l.pos
				l.next() // consume '>'
				return token
			}

			// Opening or self-closing tag: read the name.
			start := l.pos
			for l.pos < len(l.input) {
				ch := l.input[l.pos]
				if ch == '>' || ch == '/' || ch == ' ' || ch == '\t' || ch == '\n' {
					break
				}
				l.pos++
				l.col++
			}
			if start == l.pos {
				token.Type = TokenError // empty tag name
				return token
			}
			token.TagStart = start
			token.TagEnd = l.pos

			if err := l.parseAttributes(token.Attributes); err != nil {
				token.Type = TokenError
				return token
			}
			l.skipWhitespace()
			if l.pos >= len(l.input) {
				token.Type = TokenError // tag never closed
				return token
			}

			if l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>' {
				token.Type = TokenSelfCloseTag
				l.pos += 2
			} else {
				// Primitive field types count as self-closing even without "/>".
				if l.isSelfClosingTag(token.TagStart, token.TagEnd) {
					token.Type = TokenSelfCloseTag
				} else {
					token.Type = TokenOpenTag
				}
				if l.peek() != '>' {
					token.Type = TokenError
					return token
				}
				l.next()
			}
			return token
		}

		// Text content: record the range without copying.
		start := l.pos
		for l.pos < len(l.input) && l.input[l.pos] != '<' {
			if l.input[l.pos] == '\n' {
				l.line++
				l.col = 1
			} else {
				l.col++
			}
			l.pos++
		}
		// Trim surrounding whitespace by shrinking the range.
		end := l.pos
		for start < end && unicode.IsSpace(rune(l.input[start])) {
			start++
		}
		for end > start && unicode.IsSpace(rune(l.input[end-1])) {
			end--
		}
		if start < end {
			token.Type = TokenText
			token.TextStart = start
			token.TextEnd = end
			return token
		}
		// Whitespace-only text: reuse this token and scan again.
	}
}
// ReleaseToken hands a token back to the shared pool for reuse.
// Passing nil is a no-op.
func (l *Lexer) ReleaseToken(token *Token) {
	if token == nil {
		return
	}
	tokenPool.Put(token)
}

File diff suppressed because it is too large Load Diff

View File

@ -377,8 +377,7 @@ func TestSubstructReference(t *testing.T) {
</version>
</packet>`
parser := NewParser(pml)
packets, err := parser.Parse()
packets, err := Parse(pml)
if err != nil {
t.Fatalf("Parse failed: %v", err)
}

View File

@ -1,42 +0,0 @@
package parser
// TokenType identifies the kind of a lexed PML token.
type TokenType int

const (
	TokenError TokenType = iota
	TokenOpenTag
	TokenCloseTag
	TokenSelfCloseTag
	TokenText
	TokenComment
	TokenEOF
)

// Token is a parsed token. Instead of copying substrings out of the input,
// it records [start, end) index ranges; Tag and Text slice the original
// input on demand, so no string is allocated until a caller needs it.
// A value of -1 marks an unset range.
type Token struct {
	Type       TokenType
	TagStart   int // start index of the tag name in the input, -1 if none
	TagEnd     int // end index (exclusive) of the tag name, -1 if none
	TextStart  int // start index of the text content, -1 if none
	TextEnd    int // end index (exclusive) of the text content, -1 if none
	Attributes map[string]string
	Line       int // 1-based line where the token begins
	Col        int // 1-based column where the token begins
}

// Tag returns the tag name sliced from input, or "" when the range is unset
// or does not fit within input. The upper-bound check is new: it guards
// against a panic when a stale token is read against a different (shorter)
// input string than the one it was lexed from.
func (t *Token) Tag(input string) string {
	if t.TagStart >= 0 && t.TagEnd > t.TagStart && t.TagEnd <= len(input) {
		return input[t.TagStart:t.TagEnd]
	}
	return ""
}

// Text returns the text content sliced from input, with the same lazy
// slicing and bounds guarding as Tag.
func (t *Token) Text(input string) string {
	if t.TextStart >= 0 && t.TextEnd > t.TextStart && t.TextEnd <= len(input) {
		return input[t.TextStart:t.TextEnd]
	}
	return ""
}

View File

@ -10,7 +10,7 @@
<u32 name="unknown2" size="2">
<u32 name="technique">
<u32 name="knowledge">
<u8 name="level" size="1 ">
<u8 name="level" size="1">
<u32 name="unknown3">
<char name="recipe_book" size="200">
<char name="device" size="40">

View File

@ -19,13 +19,13 @@
<i32 name="item_id2">
<u16 name="stack_size2">
<u8 name="unknown7" size="4">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str16 name="description">
</array>
@ -122,13 +122,13 @@
<i32 name="item_id2">
<u16 name="stack_size2">
<u8 name="unknown7" size="4">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str16 name="description">
</array>
@ -156,13 +156,13 @@
<u32 name="status2">
<u32 name="station_cash">
<u8 name="unknown7" size="2">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str16 name="description">
</array>
@ -191,13 +191,13 @@
<u32 name="status2">
<u32 name="station_cash">
<u8 name="unknown7" size="2">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str16 name="description">
</array>
@ -226,13 +226,13 @@
<u32 name="status2">
<u32 name="station_cash">
<u8 name="unknown7" size="2">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str8 name="description">
<u8 name="unknown" size="3">
@ -263,13 +263,13 @@
<u32 name="status">
<u32 name="station_cash">
<u8 name="unknown7" size="4">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str16 name="description">
</array>
@ -300,13 +300,13 @@
<u32 name="status2">
<u32 name="station_cash">
<u8 name="unknown7" size="2">
<u8 name="num_tokens" size=" 1">
<u8 name="num_tokens" size="1">
<array name="token_array" count="var:num_tokens">
<u16 name="token_icon" size=" 1">
<u16 name="token_qty" size=" 1">
<i32 name="token_id" size=" 1">
<i32 name="token_id2" size=" 1">
<str16 name="token_name" size=" 1">
<u16 name="token_icon" size="1">
<u16 name="token_qty" size="1">
<i32 name="token_id" size="1">
<i32 name="token_id2" size="1">
<str16 name="token_name" size="1">
</array>
<str16 name="description">
</array>

33
test_empty_packets.go Normal file
View File

@ -0,0 +1,33 @@
package main
import (
"fmt"
"log"
"eq2emu/internal/packets"
)
// main loads every packet definition (the packets package does this in its
// init) and reports any definition that parsed to zero fields.
func main() {
	// Include file/line in log output so loader warnings are traceable.
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	// Referencing the packets package triggers its init() loader.
	total := packets.GetPacketCount()
	fmt.Printf("Loaded %d packet definitions\n", total)

	names := packets.GetPacketNames()
	if len(names) == 0 {
		fmt.Println("No packets loaded!")
		return
	}

	fmt.Println("Checking for potentially problematic packets...")
	for _, name := range names {
		packet, ok := packets.GetPacket(name)
		if !ok {
			continue
		}
		if len(packet.Fields) == 0 {
			fmt.Printf("Empty packet found: %s\n", name)
		}
	}
}

39
test_specific_empty.go Normal file
View File

@ -0,0 +1,39 @@
package main
import (
"fmt"
"eq2emu/internal/packets/parser"
)
// main probes the parser with two degenerate packet definitions: one with
// an empty <version> body and one that is entirely self-closing.
func main() {
	emptyPML := `<packet name="EmptyTest">
<version number="1">
</version>
</packet>`

	fmt.Println("Testing empty packet parsing...")
	if parsed, err := parser.Parse(emptyPML); err != nil {
		fmt.Printf("ERROR parsing empty packet: %v\n", err)
	} else {
		fmt.Printf("SUCCESS: Parsed %d packets\n", len(parsed))
		if def, ok := parsed["EmptyTest"]; ok {
			fmt.Printf("EmptyTest packet has %d fields\n", len(def.Fields))
		}
	}

	selfClosingPML := `<packet name="SelfClosingTest" />`

	fmt.Println("\nTesting self-closing packet parsing...")
	if parsed, err := parser.Parse(selfClosingPML); err != nil {
		fmt.Printf("ERROR parsing self-closing packet: %v\n", err)
	} else {
		fmt.Printf("SUCCESS: Parsed %d packets\n", len(parsed))
		if def, ok := parsed["SelfClosingTest"]; ok {
			fmt.Printf("SelfClosingTest packet has %d fields\n", len(def.Fields))
		}
	}
}