package parser

import (
	"fmt"
	"sync"
	"unicode"
)

// tokenPool recycles Token values to avoid per-token allocations on hot paths.
var tokenPool = sync.Pool{
	New: func() any {
		return &Token{
			Attributes: make(map[string]string, 8),
			TagStart:   -1,
			TagEnd:     -1,
			TextStart:  -1,
			TextEnd:    -1,
		}
	},
}

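// Note: sync.Pool may hand back a previously released Token, so NextToken
// resets every positional field before use, and parseAttributes clears the
// recycled Attributes map. Callers must not retain a Token after passing it
// to ReleaseToken.
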
// Lexer is a byte-oriented lexer that minimizes allocations by scanning a
// byte slice and reporting token contents as index ranges into the input.
type Lexer struct {
	input []byte // byte slice for fast, allocation-free scanning
	pos   int
	line  int
	col   int
}

// NewLexer creates a lexer positioned at the start of input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input: []byte(input),
		line:  1,
		col:   1,
	}
}

// peek returns the byte at the current position without advancing,
// or 0 at end of input.
func (l *Lexer) peek() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	return l.input[l.pos]
}

// next consumes and returns the byte at the current position, keeping the
// line/column counters in sync; it returns 0 at end of input.
func (l *Lexer) next() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	ch := l.input[l.pos]
	l.pos++
	if ch == '\n' {
		l.line++
		l.col = 1
	} else {
		l.col++
	}
	return ch
}

// isSelfClosingTag reports whether the tag name at input[start:end] is one of
// the self-closing field types. The byte-to-string conversion inside the
// switch does not allocate: the Go compiler recognizes this comparison
// pattern and avoids the copy, so this matches the original hand-rolled
// byte comparisons without their verbosity.
func (l *Lexer) isSelfClosingTag(start, end int) bool {
	switch string(l.input[start:end]) {
	case "i8", "f2",
		"i16", "i32", "i64", "si8", "f32", "f64",
		"si16", "si32", "si64", "char", "str8",
		"color", "equip", "str16", "str32":
		return true
	}
	return false
}

// skipWhitespace advances past spaces, tabs, carriage returns, and newlines,
// keeping the line/column counters in sync.
func (l *Lexer) skipWhitespace() {
	for l.pos < len(l.input) {
		ch := l.input[l.pos]
		if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
			break
		}
		if ch == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		l.pos++
	}
}

// parseAttributes parses name="value" pairs into attrs, reusing the map to
// avoid allocations. Values may be single- or double-quoted; an unquoted
// value or a missing '=' is an error.
func (l *Lexer) parseAttributes(attrs map[string]string) error {
	// Clear entries left over from a recycled token without reallocating.
	for k := range attrs {
		delete(attrs, k)
	}

	for {
		l.skipWhitespace()
		if l.pos >= len(l.input) || l.peek() == '>' ||
			(l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>') {
			break
		}

		// Read the attribute name. The loop breaks before consuming a
		// newline, so only the column counter needs updating here.
		nameStart := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '=' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '>' {
				break
			}
			l.pos++
			l.col++
		}
		nameEnd := l.pos

		if nameStart == nameEnd {
			break
		}

		l.skipWhitespace()
		if l.peek() != '=' {
			return fmt.Errorf("expected '=' after attribute name at line %d, col %d", l.line, l.col)
		}
		l.next() // skip '='
		l.skipWhitespace()

		// Read the quoted attribute value.
		quote := l.peek()
		if quote != '"' && quote != '\'' {
			return fmt.Errorf("attribute value must be quoted at line %d, col %d", l.line, l.col)
		}
		l.next() // skip opening quote

		valueStart := l.pos
		for l.pos < len(l.input) && l.input[l.pos] != quote {
			if l.input[l.pos] == '\n' {
				l.line++
				l.col = 1
			} else {
				l.col++
			}
			l.pos++
		}

		if l.pos >= len(l.input) {
			return fmt.Errorf("unclosed attribute value at line %d, col %d", l.line, l.col)
		}

		attrs[string(l.input[nameStart:nameEnd])] = string(l.input[valueStart:l.pos])

		l.next() // skip closing quote
	}

	return nil
}

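// For illustration, parseAttributes accepts input shaped like
//
//	name="player" id='42'
//
// and rejects an unquoted value (id=42) or a missing '=' (name "player").
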
// NextToken scans and returns the next token, drawing the Token from the
// pool. Tag and text contents are reported as index ranges into the input
// rather than copied strings.
func (l *Lexer) NextToken() *Token {
	token := tokenPool.Get().(*Token)
	token.Type = TokenError
	token.TagStart = -1
	token.TagEnd = -1
	token.TextStart = -1
	token.TextEnd = -1

	// Record the position after leading whitespace so the token's
	// coordinates point at the token itself.
	l.skipWhitespace()
	token.Line = l.line
	token.Col = l.col
	if l.pos >= len(l.input) {
		token.Type = TokenEOF
		return token
	}

	if l.peek() == '<' {
		l.next() // skip '<'

		// Comment: "<!--" ... "-->".
		if l.pos+2 < len(l.input) &&
			l.input[l.pos] == '!' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '-' {
			l.pos += 3
			l.col += 3
			start := l.pos
			for l.pos+2 < len(l.input) {
				if l.input[l.pos] == '-' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '>' {
					token.Type = TokenComment
					token.TextStart = start
					token.TextEnd = l.pos
					l.pos += 3
					l.col += 3
					return token
				}
				if l.input[l.pos] == '\n' {
					l.line++
					l.col = 1
				} else {
					l.col++
				}
				l.pos++
			}
			token.Type = TokenError // unterminated comment
			return token
		}

		// Closing tag: "</name>".
		if l.peek() == '/' {
			l.next() // skip '/'
			start := l.pos
			for l.pos < len(l.input) && l.input[l.pos] != '>' {
				l.pos++
				l.col++
			}
			if l.pos >= len(l.input) {
				token.Type = TokenError // unterminated closing tag
				return token
			}
			token.Type = TokenCloseTag
			token.TagStart = start
			token.TagEnd = l.pos
			l.next() // skip '>'
			return token
		}

		// Opening or self-closing tag.
		start := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '>' || ch == '/' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
				break
			}
			l.pos++
			l.col++
		}

		if start == l.pos {
			token.Type = TokenError // empty tag name
			return token
		}

		token.TagStart = start
		token.TagEnd = l.pos

		if err := l.parseAttributes(token.Attributes); err != nil {
			token.Type = TokenError
			return token
		}

		l.skipWhitespace()
		if l.pos >= len(l.input) {
			token.Type = TokenError // unterminated tag
			return token
		}

		if l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>' {
			token.Type = TokenSelfCloseTag
			l.pos += 2
			l.col += 2
		} else {
			// Field types such as i8 or color are treated as self-closing
			// even without the trailing "/".
			if l.isSelfClosingTag(token.TagStart, token.TagEnd) {
				token.Type = TokenSelfCloseTag
			} else {
				token.Type = TokenOpenTag
			}
			if l.peek() == '>' {
				l.next()
			} else {
				token.Type = TokenError
				return token
			}
		}

		return token
	}

	// Text content: record the range without copying.
	start := l.pos
	for l.pos < len(l.input) && l.input[l.pos] != '<' {
		if l.input[l.pos] == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		l.pos++
	}

	// Trim surrounding whitespace from the range.
	for start < l.pos && unicode.IsSpace(rune(l.input[start])) {
		start++
	}
	end := l.pos
	for end > start && unicode.IsSpace(rune(l.input[end-1])) {
		end--
	}

	if start < end {
		token.Type = TokenText
		token.TextStart = start
		token.TextEnd = end
		return token
	}

	// The text was all whitespace: recycle the token and scan again rather
	// than abandoning a pooled Token to the garbage collector.
	tokenPool.Put(token)
	return l.NextToken()
}

// ReleaseToken returns a token to the pool for reuse.
func (l *Lexer) ReleaseToken(token *Token) {
	if token != nil {
		tokenPool.Put(token)
	}
}

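// lexTagNames is a usage sketch, not part of the original API: it drains a
// lexer and collects tag names, returning every token to the pool as it goes.
// TokenEOF, TokenError, TokenOpenTag, and TokenSelfCloseTag are assumed to be
// defined alongside Token elsewhere in this package, as referenced above.
func lexTagNames(input string) []string {
	l := NewLexer(input)
	var names []string
	for {
		tok := l.NextToken()
		if tok.Type == TokenEOF || tok.Type == TokenError {
			l.ReleaseToken(tok)
			return names
		}
		if tok.Type == TokenOpenTag || tok.Type == TokenSelfCloseTag {
			// Copy the name out before the token is recycled.
			names = append(names, string(l.input[tok.TagStart:tok.TagEnd]))
		}
		l.ReleaseToken(tok)
	}
}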