package parser

import (
	"fmt"
	"sync"
	"unicode"
)

// tokenPool recycles Token values so a hot lexing loop does not allocate
// a fresh Token (and attribute map) per token.
var tokenPool = sync.Pool{
	New: func() any {
		return &Token{
			Attributes: make(map[string]string, 8),
			TagStart:   -1,
			TagEnd:     -1,
			TextStart:  -1,
			TextEnd:    -1,
		}
	},
}
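
// A minimal sketch of the intended lifecycle (NextToken and ReleaseToken
// below wrap exactly this pattern):
//
//	tok := tokenPool.Get().(*Token) // reuse a recycled Token, or allocate via New
//	// ... populate tok and hand it to the caller ...
//	tokenPool.Put(tok) // make it available to a later Get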

// Lexer scans its input with byte operations and minimal allocations.
type Lexer struct {
	input []byte // byte slice avoids per-rune decoding on the hot path
	pos   int
	line  int
	col   int
}

// NewLexer creates a lexer positioned at line 1, column 1 of input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input: []byte(input),
		line:  1,
		col:   1,
	}
}

// peek returns the next byte without advancing, or 0 at end of input.
func (l *Lexer) peek() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	return l.input[l.pos]
}

// next consumes and returns the next byte, keeping the line and column
// counters in sync; it returns 0 at end of input.
func (l *Lexer) next() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	ch := l.input[l.pos]
	l.pos++
	if ch == '\n' {
		l.line++
		l.col = 1
	} else {
		l.col++
	}
	return ch
}

// isSelfClosingTag reports whether input[start:end] names a field type
// that is implicitly self-closing even without a trailing "/>".
func (l *Lexer) isSelfClosingTag(start, end int) bool {
	// A []byte-to-string conversion used only for a switch comparison does
	// not allocate, so this stays allocation-free while replacing the
	// byte-by-byte comparisons it is equivalent to.
	switch string(l.input[start:end]) {
	case "i8", "f2",
		"i16", "i32", "i64", "si8", "f32", "f64",
		"si16", "si32", "si64", "char", "str8",
		"color", "equip", "str16", "str32",
		"double":
		return true
	}
	return false
}

// skipWhitespace advances past spaces, tabs, newlines, and carriage
// returns, keeping the line and column counters in sync.
func (l *Lexer) skipWhitespace() {
	for l.pos < len(l.input) {
		ch := l.input[l.pos]
		if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
			break
		}
		if ch == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		l.pos++
	}
}

// parseAttributes reads name="value" pairs into attrs, reusing the map to
// avoid reallocating it for every token. Values may be quoted with either
// double or single quotes.
func (l *Lexer) parseAttributes(attrs map[string]string) error {
	// Clear existing entries without discarding the map's storage.
	for k := range attrs {
		delete(attrs, k)
	}
	for {
		l.skipWhitespace()
		if l.pos >= len(l.input) || l.peek() == '>' ||
			(l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>') {
			break
		}
		// Read the attribute name.
		nameStart := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '=' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
				ch == '/' || ch == '>' {
				break
			}
			l.pos++
			l.col++
		}
		nameEnd := l.pos // record the end of the name before skipping whitespace
		if nameStart == nameEnd {
			break
		}
		l.skipWhitespace()
		if l.peek() != '=' {
			return fmt.Errorf("line %d, col %d: expected '=' after attribute name", l.line, l.col)
		}
		l.next() // skip '='
		l.skipWhitespace()
		// Read the attribute value.
		quote := l.peek()
		if quote != '"' && quote != '\'' {
			return fmt.Errorf("line %d, col %d: attribute value must be quoted", l.line, l.col)
		}
		l.next() // skip opening quote
		valueStart := l.pos
		for l.pos < len(l.input) && l.input[l.pos] != quote {
			if l.input[l.pos] == '\n' {
				l.line++
				l.col = 1
			} else {
				l.col++
			}
			l.pos++
		}
		if l.pos >= len(l.input) {
			return fmt.Errorf("line %d, col %d: unclosed attribute value", l.line, l.col)
		}
		// Name and value are sliced straight from the input; these string
		// conversions are the only per-attribute allocations.
		attrs[string(l.input[nameStart:nameEnd])] = string(l.input[valueStart:l.pos])
		l.next() // skip closing quote
	}
	return nil
}

// NextToken scans and returns the next token, drawing Token values from
// tokenPool; callers should hand tokens back via ReleaseToken when done.
func (l *Lexer) NextToken() *Token {
	l.skipWhitespace() // skip first so the token reports where it actually starts
	token := tokenPool.Get().(*Token)
	token.Type = TokenError
	token.TagStart = -1
	token.TagEnd = -1
	token.TextStart = -1
	token.TextEnd = -1
	token.Line = l.line
	token.Col = l.col
	// Drop attributes left over from a previous use of this pooled token.
	for k := range token.Attributes {
		delete(token.Attributes, k)
	}
	if l.pos >= len(l.input) {
		token.Type = TokenEOF
		return token
	}
	if l.peek() == '<' {
		l.next() // skip '<'
		// Comment: "<!--" ... "-->".
		if l.pos+2 < len(l.input) &&
			l.input[l.pos] == '!' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '-' {
			l.pos += 3
			l.col += 3
			start := l.pos
			// Scan for the closing "-->" without copying the body.
			for l.pos+2 < len(l.input) {
				if l.input[l.pos] == '-' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '>' {
					token.Type = TokenComment
					token.TextStart = start
					token.TextEnd = l.pos
					l.pos += 3
					l.col += 3
					return token
				}
				if l.input[l.pos] == '\n' {
					l.line++
					l.col = 1
				} else {
					l.col++
				}
				l.pos++
			}
			token.Type = TokenError // unterminated comment
			return token
		}
		// Closing tag: "</name>".
		if l.peek() == '/' {
			l.next() // skip '/'
			start := l.pos
			for l.pos < len(l.input) && l.input[l.pos] != '>' {
				l.pos++
				l.col++
			}
			if l.pos >= len(l.input) {
				token.Type = TokenError // unterminated closing tag
				return token
			}
			token.Type = TokenCloseTag
			token.TagStart = start
			token.TagEnd = l.pos
			l.next() // skip '>'
			return token
		}
		// Opening or explicitly self-closing tag.
		start := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '>' || ch == '/' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
				break
			}
			l.pos++
			l.col++
		}
		if start == l.pos {
			token.Type = TokenError // "<" with no tag name
			return token
		}
		token.TagStart = start
		token.TagEnd = l.pos
		if err := l.parseAttributes(token.Attributes); err != nil {
			token.Type = TokenError
			return token
		}
		l.skipWhitespace()
		if l.pos >= len(l.input) {
			token.Type = TokenError // unterminated tag
			return token
		}
		if l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>' {
			token.Type = TokenSelfCloseTag
			l.pos += 2
			l.col += 2
		} else {
			// Field types like "<i32 ...>" are treated as self-closing even
			// without an explicit "/>".
			if l.isSelfClosingTag(token.TagStart, token.TagEnd) {
				token.Type = TokenSelfCloseTag
			} else {
				token.Type = TokenOpenTag
			}
			if l.peek() != '>' {
				token.Type = TokenError
				return token
			}
			l.next() // skip '>'
		}
		return token
	}
	// Text content: record the byte range without copying.
	start := l.pos
	for l.pos < len(l.input) && l.input[l.pos] != '<' {
		if l.input[l.pos] == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		l.pos++
	}
	// Trim surrounding whitespace by shrinking the range.
	for start < l.pos && unicode.IsSpace(rune(l.input[start])) {
		start++
	}
	end := l.pos
	for end > start && unicode.IsSpace(rune(l.input[end-1])) {
		end--
	}
	if start < end {
		token.Type = TokenText
		token.TextStart = start
		token.TextEnd = end
		return token
	}
	// The text was all whitespace: return the token to the pool instead of
	// leaking it, then move on to the next token.
	l.ReleaseToken(token)
	return l.NextToken()
}

// ReleaseToken returns a token to the pool for reuse.
func (l *Lexer) ReleaseToken(token *Token) {
	if token != nil {
		tokenPool.Put(token)
	}
}
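
// lexAll is a minimal usage sketch, not part of the lexer itself: it drains
// input token by token and returns every Token to the pool. It assumes the
// Token struct and the TokenType constants (TokenEOF, TokenError, ...) used
// above are defined elsewhere in this package.
func lexAll(input string) error {
	l := NewLexer(input)
	for {
		tok := l.NextToken()
		switch tok.Type {
		case TokenEOF:
			l.ReleaseToken(tok)
			return nil
		case TokenError:
			err := fmt.Errorf("lex error at line %d, col %d", tok.Line, tok.Col)
			l.ReleaseToken(tok)
			return err
		default:
			// Token offsets index into the original input; slice it to
			// recover tag names or text without extra copies.
			if tok.TextStart >= 0 {
				_ = input[tok.TextStart:tok.TextEnd]
			}
			l.ReleaseToken(tok)
		}
	}
}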