package parser

import (
	"fmt"
	"sync"
	"unicode"
)

// tokenPool recycles Token values to avoid per-token allocations.
var tokenPool = sync.Pool{
	New: func() any {
		return &Token{
			Attributes: make(map[string]string, 8),
			TagStart:   -1,
			TagEnd:     -1,
			TextStart:  -1,
			TextEnd:    -1,
		}
	},
}

// Lexer scans input using byte operations and minimal allocations.
type Lexer struct {
	input []byte // byte slice avoids per-rune decoding
	pos   int
	line  int
	col   int
}

// NewLexer creates a lexer positioned at the start of input.
func NewLexer(input string) *Lexer {
	return &Lexer{
		input: []byte(input),
		line:  1,
		col:   1,
	}
}

// peek returns the next byte without advancing, or 0 at end of input.
func (l *Lexer) peek() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	return l.input[l.pos]
}

// next advances and returns the next byte, or 0 at end of input.
func (l *Lexer) next() byte {
	if l.pos >= len(l.input) {
		return 0
	}
	ch := l.input[l.pos]
	l.pos++
	if ch == '\n' {
		l.line++
		l.col = 1
	} else {
		l.col++
	}
	return ch
}

// isSelfClosingTag reports whether the tag name in input[start:end] is a
// field type that is always treated as self-closing. The string conversion
// inside the switch does not allocate: the Go compiler recognizes this
// pattern, so it is as fast as the hand-written byte comparisons it replaces.
func (l *Lexer) isSelfClosingTag(start, end int) bool {
	switch string(l.input[start:end]) {
	case "i8", "i16", "i32", "i64",
		"si8", "si16", "si32", "si64",
		"f2", "f32", "f64", "double",
		"char", "str8", "str16", "str32",
		"color", "equip":
		return true
	}
	return false
}

// skipWhitespace advances past spaces, tabs, and newlines, keeping
// line/column bookkeeping in sync.
func (l *Lexer) skipWhitespace() {
	for l.pos < len(l.input) {
		ch := l.input[l.pos]
		if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
			break
		}
		if ch == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		l.pos++
	}
}

// parseAttributes fills attrs with the name="value" pairs before the end of
// the current tag. The map is cleared in place (no reallocation) so pooled
// tokens never carry attributes over from a previous use.
func (l *Lexer) parseAttributes(attrs map[string]string) error {
	for k := range attrs {
		delete(attrs, k)
	}

	for {
		l.skipWhitespace()
		if l.pos >= len(l.input) || l.peek() == '>' ||
			(l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>') {
			break
		}

		// Read the attribute name.
		nameStart := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '=' || ch == ' ' || ch == '\t' || ch == '\n' || ch == '>' {
				break
			}
			l.pos++
			l.col++
		}
		nameEnd := l.pos
		if nameStart == nameEnd {
			break
		}

		l.skipWhitespace()
		if l.peek() != '=' {
			return fmt.Errorf("expected '=' after attribute name")
		}
		l.next() // skip '='
		l.skipWhitespace()

		// Read the attribute value; it must be quoted.
		quote := l.peek()
		if quote != '"' && quote != '\'' {
			return fmt.Errorf("attribute value must be quoted")
		}
		l.next() // skip opening quote
		valueStart := l.pos
		for l.pos < len(l.input) && l.input[l.pos] != quote {
			if l.input[l.pos] == '\n' {
				l.line++
				l.col = 1
			} else {
				l.col++
			}
			l.pos++
		}
		if l.pos >= len(l.input) {
			return fmt.Errorf("unclosed attribute value")
		}

		// The name ends before the whitespace and '='; the value ends at
		// the closing quote.
		name := string(l.input[nameStart:nameEnd])
		value := string(l.input[valueStart:l.pos])
		attrs[name] = value
		l.next() // skip closing quote
	}
	return nil
}
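
// NOTE: The Token type and TokenType constants referenced throughout this
// file are assumed to be declared in another file of this package; they are
// not part of this one. A minimal sketch consistent with how they are used
// here (offset-based ranges into Lexer.input, so no text is copied during
// lexing) would be:
//
//	type TokenType int
//
//	const (
//		TokenError TokenType = iota
//		TokenEOF
//		TokenOpenTag
//		TokenCloseTag
//		TokenSelfCloseTag
//		TokenText
//		TokenComment
//	)
//
//	type Token struct {
//		Type               TokenType
//		Attributes         map[string]string
//		TagStart, TagEnd   int // byte range of the tag name in Lexer.input
//		TextStart, TextEnd int // byte range of text/comment content
//		Line, Col          int // position where the token starts
//	}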

// NextToken scans and returns the next token, drawing from the pool.
// Callers should hand tokens back via ReleaseToken when done with them.
func (l *Lexer) NextToken() *Token {
	token := tokenPool.Get().(*Token)
	token.Type = TokenError
	token.TagStart = -1
	token.TagEnd = -1
	token.TextStart = -1
	token.TextEnd = -1
	// Attributes is cleared lazily by parseAttributes, so only tag tokens
	// carry meaningful attributes.

	l.skipWhitespace()
	// Record the position after leading whitespace so the token reports
	// where it actually starts.
	token.Line = l.line
	token.Col = l.col

	if l.pos >= len(l.input) {
		token.Type = TokenEOF
		return token
	}

	if l.peek() == '<' {
		l.next() // skip '<'

		// Comment: <!-- ... -->
		if l.pos+2 < len(l.input) && l.input[l.pos] == '!' &&
			l.input[l.pos+1] == '-' && l.input[l.pos+2] == '-' {
			l.pos += 3
			l.col += 3
			start := l.pos
			for l.pos+2 < len(l.input) {
				if l.input[l.pos] == '-' && l.input[l.pos+1] == '-' && l.input[l.pos+2] == '>' {
					token.Type = TokenComment
					token.TextStart = start
					token.TextEnd = l.pos
					l.pos += 3
					l.col += 3
					return token
				}
				if l.input[l.pos] == '\n' {
					l.line++
					l.col = 1
				} else {
					l.col++
				}
				l.pos++
			}
			token.Type = TokenError // unterminated comment
			return token
		}

		// Closing tag: </name>
		if l.peek() == '/' {
			l.next() // skip '/'
			start := l.pos
			for l.pos < len(l.input) && l.input[l.pos] != '>' {
				l.pos++
				l.col++
			}
			if l.pos >= len(l.input) {
				token.Type = TokenError // unterminated closing tag
				return token
			}
			token.Type = TokenCloseTag
			token.TagStart = start
			token.TagEnd = l.pos
			l.next() // skip '>'
			return token
		}

		// Opening or self-closing tag.
		start := l.pos
		for l.pos < len(l.input) {
			ch := l.input[l.pos]
			if ch == '>' || ch == '/' || ch == ' ' || ch == '\t' || ch == '\n' {
				break
			}
			l.pos++
			l.col++
		}
		if start == l.pos {
			token.Type = TokenError // empty tag name
			return token
		}
		token.TagStart = start
		token.TagEnd = l.pos

		if err := l.parseAttributes(token.Attributes); err != nil {
			token.Type = TokenError
			return token
		}

		l.skipWhitespace()
		if l.pos >= len(l.input) {
			token.Type = TokenError // unterminated tag
			return token
		}

		if l.peek() == '/' && l.pos+1 < len(l.input) && l.input[l.pos+1] == '>' {
			token.Type = TokenSelfCloseTag
			l.pos += 2
			l.col += 2
		} else {
			// Field types such as i32 are treated as self-closing even
			// when written without the trailing slash.
			if l.isSelfClosingTag(token.TagStart, token.TagEnd) {
				token.Type = TokenSelfCloseTag
			} else {
				token.Type = TokenOpenTag
			}
			if l.peek() != '>' {
				token.Type = TokenError
				return token
			}
			l.next() // skip '>'
		}
		return token
	}

	// Text content: record the byte range without copying.
	start := l.pos
	for l.pos < len(l.input) && l.input[l.pos] != '<' {
		if l.input[l.pos] == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		l.pos++
	}

	// Trim surrounding whitespace by narrowing the range.
	for start < l.pos && unicode.IsSpace(rune(l.input[start])) {
		start++
	}
	end := l.pos
	for end > start && unicode.IsSpace(rune(l.input[end-1])) {
		end--
	}

	if start < end {
		token.Type = TokenText
		token.TextStart = start
		token.TextEnd = end
		return token
	}

	// The text was all whitespace: recycle this token before recursing so
	// it is not leaked from the pool, then fetch the next token.
	tokenPool.Put(token)
	return l.NextToken()
}
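
// TagName and Text are illustrative accessors, not part of the original
// API: because tokens store offsets into Lexer.input rather than strings,
// a caller that wants the actual text has to materialize it, and these
// conversions are the only copies made.
func (l *Lexer) TagName(t *Token) string {
	if t.TagStart < 0 {
		return ""
	}
	return string(l.input[t.TagStart:t.TagEnd])
}

func (l *Lexer) Text(t *Token) string {
	if t.TextStart < 0 {
		return ""
	}
	return string(l.input[t.TextStart:t.TextEnd])
}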

// ReleaseToken returns a token to the pool for reuse. Tokens may still hold
// attributes when released; parseAttributes clears the map on the next use.
func (l *Lexer) ReleaseToken(token *Token) {
	if token != nil {
		tokenPool.Put(token)
	}
}
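
// lexAll is a usage sketch (hypothetical, not part of the original file):
// it drains every token from an input, invoking visit on each one and
// handing it back to the pool. Because tokens are pooled, visit must not
// retain a token or its Attributes map after it returns.
func lexAll(input string, visit func(*Token)) error {
	l := NewLexer(input)
	for {
		tok := l.NextToken()
		switch tok.Type {
		case TokenEOF:
			l.ReleaseToken(tok)
			return nil
		case TokenError:
			line, col := tok.Line, tok.Col
			l.ReleaseToken(tok)
			return fmt.Errorf("lex error at line %d, col %d", line, col)
		default:
			visit(tok)
			l.ReleaseToken(tok)
		}
	}
}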