// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//! This file implements the tokenization step defined in the CSS Syntax Module Level 3 specification.
//!
//! The algorithm accepts a valid UTF-8 string and returns a stream of tokens.
//! The tokenization step never fails, even for complete gibberish.
//! Validity must then be checked by the parser.
//!
//! NOTE: The tokenizer is not thread-safe, does not own any memory, and does not validate UTF-8.
//!
//! See spec for more info: https://drafts.csswg.org/css-syntax/#tokenization
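//!
//! A minimal usage sketch (hypothetical input; `next` returns null at EOF):
//!
//!     var tokenizer = Tokenizer{ .input = "a { color: red }" };
//!     while (tokenizer.next()) |token| {
//!         // `token` is the tagged union below; switch on it as needed.
//!         std.debug.print("{s}\n", .{@tagName(token)});
//!     }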
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const Tokenizer = @This();
pub const Token = union(enum) {
/// A `<ident-token>`
ident: []const u8,
/// A `<function-token>`
///
/// The value (name) does not include the `(` marker.
function: []const u8,
/// A `<at-keyword-token>`
///
/// The value does not include the `@` marker.
at_keyword: []const u8,
/// A `<hash-token>` with the type flag set to "id"
///
/// The value does not include the `#` marker.
id_hash: []const u8, // Hash that is a valid ID selector.
/// A `<hash-token>` with the type flag set to "unrestricted"
///
/// The value does not include the `#` marker.
unrestricted_hash: []const u8,
/// A `<string-token>`
///
/// The value does not include the quotes.
string: []const u8,
/// A `<bad-string-token>`
///
/// This token always indicates a parse error.
bad_string: []const u8,
/// A `<url-token>`
///
/// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
/// `Function` token.
url: []const u8,
/// A `<bad-url-token>`
///
/// This token always indicates a parse error.
bad_url: []const u8,
/// A `<delim-token>`
delim: u8,
/// A `<number-token>`
number: struct {
/// Whether the number had a `+` or `-` sign.
///
/// This is used in some cases, such as the <An+B> microsyntax. (See the `parse_nth` function.)
has_sign: bool,
/// If the original source did not include a fractional part, the value as an integer.
int_value: ?i32,
/// The value as a float
value: f32,
},
/// A `<percentage-token>`
percentage: struct {
/// Whether the number had a `+` or `-` sign.
has_sign: bool,
/// If the original source did not include a fractional part, the value as an integer.
/// It is **not** divided by 100.
int_value: ?i32,
/// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
unit_value: f32,
},
/// A `<dimension-token>`
dimension: struct {
/// Whether the number had a `+` or `-` sign.
///
/// This is used in some cases, such as the <An+B> microsyntax. (See the `parse_nth` function.)
has_sign: bool,
/// If the original source did not include a fractional part, the value as an integer.
int_value: ?i32,
/// The value as a float
value: f32,
/// The unit, e.g. "px" in `12px`
unit: []const u8,
},
/// A `<unicode-range-token>`
unicode_range: struct { bgn: u32, end: u32 },
/// A `<whitespace-token>`
white_space: []const u8,
/// A `<!--` `<CDO-token>`
cdo,
/// A `-->` `<CDC-token>`
cdc,
/// A `:` `<colon-token>`
colon,
/// A `;` `<semicolon-token>`
semicolon,
/// A `,` `<comma-token>`
comma,
/// A `<[-token>`
square_bracket_block,
/// A `<]-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
close_square_bracket,
/// A `<(-token>`
parenthesis_block,
/// A `<)-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
close_parenthesis,
/// A `<{-token>`
curly_bracket_block,
/// A `<}-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
close_curly_bracket,
/// A comment.
///
/// The CSS Syntax spec does not generate tokens for comments,
/// but we do for simplicity of the interface.
///
/// The value does not include the `/*` `*/` markers.
comment: []const u8,
};
input: []const u8,
/// Counted in bytes, not code points. From 0.
position: usize = 0,
// Returns true if the input has at least `n` bytes left *after* the current one.
// That is, `Tokenizer.byteAt(n)` will not panic.
fn hasAtLeast(self: *const Tokenizer, n: usize) bool {
return self.position + n < self.input.len;
}
fn isEof(self: *const Tokenizer) bool {
return !self.hasAtLeast(0);
}
fn byteAt(self: *const Tokenizer, offset: usize) u8 {
return self.input[self.position + offset];
}
// Assumes non-EOF
fn nextByteUnchecked(self: *const Tokenizer) u8 {
return self.byteAt(0);
}
fn nextByte(self: *const Tokenizer) ?u8 {
return if (self.isEof())
null
else
self.input[self.position];
}
fn startsWith(self: *const Tokenizer, needle: []const u8) bool {
return std.mem.startsWith(u8, self.input[self.position..], needle);
}
fn slice(self: *const Tokenizer, start: usize, end: usize) []const u8 {
return self.input[start..end];
}
fn sliceFrom(self: *const Tokenizer, start_pos: usize) []const u8 {
return self.slice(start_pos, self.position);
}
// Advance over N bytes in the input. In debug builds, asserts that each
// byte stepped over is an ASCII byte (excluding newlines) or a UTF-8
// sequence leader (excluding continuation bytes and leaders of 4-byte
// sequences); see consumeName for how multi-byte sequences are stepped over.
fn advance(self: *Tokenizer, n: usize) void {
if (builtin.mode == .Debug) {
// Each byte must either be an ASCII byte or a sequence leader,
// but not a 4-byte leader; also newlines are rejected.
for (0..n) |i| {
const b = self.byteAt(i);
assert(b != '\r' and b != '\n' and b != '\x0C');
assert(b <= 0x7F or (b & 0xF0 != 0xF0 and b & 0xC0 != 0x80));
}
}
self.position += n;
}
fn hasNewlineAt(self: *const Tokenizer, offset: usize) bool {
if (!self.hasAtLeast(offset)) return false;
return switch (self.byteAt(offset)) {
'\n', '\r', '\x0C' => true,
else => false,
};
}
fn hasNonAsciiAt(self: *const Tokenizer, offset: usize) bool {
if (!self.hasAtLeast(offset)) return false;
const byte = self.byteAt(offset);
const len_utf8 = std.unicode.utf8ByteSequenceLength(byte) catch return false;
if (!self.hasAtLeast(offset + len_utf8 - 1)) return false;
const start = self.position + offset;
const bytes = self.slice(start, start + len_utf8);
const codepoint = std.unicode.utf8Decode(bytes) catch return false;
// https://drafts.csswg.org/css-syntax/#non-ascii-ident-code-point
return switch (codepoint) {
'\u{00B7}', '\u{200C}', '\u{200D}', '\u{203F}', '\u{2040}' => true,
'\u{00C0}'...'\u{00D6}' => true,
'\u{00D8}'...'\u{00F6}' => true,
'\u{00F8}'...'\u{037D}' => true,
'\u{037F}'...'\u{1FFF}' => true,
'\u{2070}'...'\u{218F}' => true,
'\u{2C00}'...'\u{2FEF}' => true,
'\u{3001}'...'\u{D7FF}' => true,
'\u{F900}'...'\u{FDCF}' => true,
'\u{FDF0}'...'\u{FFFD}' => true,
else => codepoint >= '\u{10000}',
};
}
fn isIdentStart(self: *Tokenizer) bool {
if (self.isEof()) return false;
var b = self.nextByteUnchecked();
if (b == '-') {
b = if (self.hasAtLeast(1)) self.byteAt(1) else return false;
}
return switch (b) {
'a'...'z', 'A'...'Z', '_', 0x0 => true,
'\\' => !self.hasNewlineAt(1),
else => b > 0x7F, // non-ASCII
};
}
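test "tokenizer: non-ASCII ident start" {
// A hedged sketch: a byte above 0x7F starts an ident-like token (see
// isIdentStart above and the `else` arm of `next`).
try expectTokensEqual("écran", &.{.{ .ident = "écran" }});
}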
fn consumeChar(self: *Tokenizer) void {
const byte = self.nextByteUnchecked();
const len_utf8 = std.unicode.utf8ByteSequenceLength(byte) catch 1;
self.position += len_utf8;
}
// Given that a newline has been seen, advance over the newline
// and update the state.
fn consumeNewline(self: *Tokenizer) void {
const byte = self.nextByteUnchecked();
assert(byte == '\r' or byte == '\n' or byte == '\x0C');
self.position += 1;
if (byte == '\r' and self.nextByte() == '\n') {
self.position += 1;
}
}
fn consumeWhiteSpace(self: *Tokenizer, newline: bool) Token {
const start_position = self.position;
if (newline) {
self.consumeNewline();
} else {
self.advance(1);
}
while (!self.isEof()) {
const b = self.nextByteUnchecked();
switch (b) {
' ', '\t' => {
self.advance(1);
},
'\n', '\x0C', '\r' => {
self.consumeNewline();
},
else => break,
}
}
return .{ .white_space = self.sliceFrom(start_position) };
}
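test "tokenizer: whitespace runs collapse into one token" {
// A hedged sketch for consumeWhiteSpace above: spaces, tabs, and newlines
// (including a CRLF pair) are folded into a single <whitespace-token>.
try expectTokensEqual(" \t\r\nx", &.{
.{ .white_space = " \t\r\n" },
.{ .ident = "x" },
});
}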
fn consumeComment(self: *Tokenizer) []const u8 {
self.advance(2); // consume "/*"
const start_position = self.position;
while (!self.isEof()) {
switch (self.nextByteUnchecked()) {
'*' => {
const end_position = self.position;
self.advance(1);
if (self.nextByte() == '/') {
self.advance(1);
return self.slice(start_position, end_position);
}
},
'\n', '\x0C', '\r' => {
self.consumeNewline();
},
0x0 => self.advance(1),
else => self.consumeChar(),
}
}
return self.sliceFrom(start_position);
}
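test "tokenizer: comment tokens exclude the markers" {
// A hedged sketch for consumeComment above; an unterminated comment runs
// to EOF and still yields a token.
try expectTokensEqual("/* hi */a", &.{
.{ .comment = " hi " },
.{ .ident = "a" },
});
try expectTokensEqual("/*x", &.{.{ .comment = "x" }});
}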
fn byteToHexDigit(b: u8) ?u32 {
return switch (b) {
'0'...'9' => b - '0',
'a'...'f' => b - 'a' + 10,
'A'...'F' => b - 'A' + 10,
else => null,
};
}
fn byteToDecimalDigit(b: u8) ?u32 {
return if (std.ascii.isDigit(b)) b - '0' else null;
}
// Consumes up to 6 hex digits. The numeric value is computed but currently
// discarded, since escape decoding is left to the parser.
fn consumeHexDigits(self: *Tokenizer) void {
var value: u32 = 0;
var digits: u32 = 0;
while (digits < 6 and !self.isEof()) {
if (byteToHexDigit(self.nextByteUnchecked())) |digit| {
value = value * 16 + digit;
digits += 1;
self.advance(1);
} else {
break;
}
}
_ = &value;
}
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
fn consumeEscape(self: *Tokenizer) void {
if (self.isEof())
return; // Escaped EOF
switch (self.nextByteUnchecked()) {
'0'...'9', 'A'...'F', 'a'...'f' => {
consumeHexDigits(self);
if (!self.isEof()) {
switch (self.nextByteUnchecked()) {
' ', '\t' => {
self.advance(1);
},
'\n', '\x0C', '\r' => {
self.consumeNewline();
},
else => {},
}
}
},
else => self.consumeChar(),
}
}
/// https://drafts.csswg.org/css-syntax/#consume-string-token
fn consumeString(self: *Tokenizer, single_quote: bool) Token {
self.advance(1); // Skip the initial quote
// start_pos is at code point boundary, after " or '
const start_pos = self.position;
while (!self.isEof()) {
switch (self.nextByteUnchecked()) {
'"' => {
if (!single_quote) {
const value = self.sliceFrom(start_pos);
self.advance(1);
return .{ .string = value };
}
self.advance(1);
},
'\'' => {
if (single_quote) {
const value = self.sliceFrom(start_pos);
self.advance(1);
return .{ .string = value };
}
self.advance(1);
},
'\n', '\r', '\x0C' => {
return .{ .bad_string = self.sliceFrom(start_pos) };
},
'\\' => {
self.advance(1);
if (self.isEof())
continue; // escaped EOF, do nothing.
switch (self.nextByteUnchecked()) {
// Escaped newline
'\n', '\x0C', '\r' => self.consumeNewline(),
// Spec calls for replacing escape sequences with characters,
// but this would require allocating a new string.
// Therefore, we leave it as is and let the parser handle the escaping.
else => self.consumeEscape(),
}
},
else => self.consumeChar(),
}
}
return .{ .string = self.sliceFrom(start_pos) };
}
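test "tokenizer: string tokens exclude quotes and keep escapes verbatim" {
// A hedged sketch for consumeString above: the token is the raw slice
// between the quotes, with escape sequences left in place for the parser.
try expectTokensEqual(
\\"a\"b" 'c'
, &.{
.{ .string = "a\\\"b" },
.{ .white_space = " " },
.{ .string = "c" },
});
}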
fn consumeName(self: *Tokenizer) []const u8 {
// start_pos is the end of the previous token, therefore at a code point boundary
const start_pos = self.position;
while (!self.isEof()) {
switch (self.nextByteUnchecked()) {
'a'...'z', 'A'...'Z', '0'...'9', '_', '-' => self.advance(1),
'\\' => {
if (self.hasNewlineAt(1)) {
break;
}
self.advance(1);
self.consumeEscape();
},
0x0 => self.advance(1),
'\x80'...'\xBF', '\xC0'...'\xEF', '\xF0'...'\xFF' => {
// This byte *is* part of a multi-byte code point; we'll end up
// consuming the whole code point, one byte at a time, before this
// loop does anything else. Step over it directly: `advance` would
// trip its debug assertions on continuation bytes and 4-byte
// sequence leaders.
self.position += 1;
},
else => {
if (self.hasNonAsciiAt(0)) {
self.consumeChar();
} else {
break; // ASCII
}
},
}
}
return self.sliceFrom(start_pos);
}
fn consumeMark(self: *Tokenizer) Token {
const byte = self.nextByteUnchecked();
self.advance(1);
return switch (byte) {
',' => .comma,
':' => .colon,
';' => .semicolon,
'(' => .parenthesis_block,
')' => .close_parenthesis,
'{' => .curly_bracket_block,
'}' => .close_curly_bracket,
'[' => .square_bracket_block,
']' => .close_square_bracket,
else => unreachable,
};
}
fn consumeNumeric(self: *Tokenizer) Token {
// Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
// This is only called when there is at least one digit in \d*(\.\d+)?
// Do all the math in f64 so that large numbers overflow to +/-inf
// and the full i32 range (minInt/maxInt) stays representable.
var sign: f64 = 1.0;
var has_sign = false;
switch (self.nextByteUnchecked()) {
'+' => {
has_sign = true;
},
'-' => {
has_sign = true;
sign = -1.0;
},
else => {},
}
if (has_sign) {
self.advance(1);
}
var is_integer = true;
var integral_part: f64 = 0.0;
var fractional_part: f64 = 0.0;
while (!self.isEof()) {
if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
integral_part = integral_part * 10.0 + @as(f64, @floatFromInt(digit));
self.advance(1);
} else {
break;
}
}
if (self.hasAtLeast(1) and self.nextByteUnchecked() == '.' and std.ascii.isDigit(self.byteAt(1))) {
is_integer = false;
self.advance(1); // Consume '.'
var factor: f64 = 0.1;
while (!self.isEof()) {
if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
fractional_part += @as(f64, @floatFromInt(digit)) * factor;
factor *= 0.1;
self.advance(1);
} else {
break;
}
}
}
var value = sign * (integral_part + fractional_part);
blk: {
const e = self.nextByte() orelse break :blk;
if (e != 'e' and e != 'E') break :blk;
var mul: f64 = 1.0;
if (self.hasAtLeast(2) and (self.byteAt(1) == '+' or self.byteAt(1) == '-') and std.ascii.isDigit(self.byteAt(2))) {
mul = switch (self.byteAt(1)) {
'-' => -1.0,
'+' => 1.0,
else => unreachable,
};
self.advance(2);
} else if (self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) {
// Exponent without a sign: the first digit is at offset 1, right after
// the 'e'. (Checking offset 2 here would skip valid exponents like "1e5".)
self.advance(1);
} else {
break :blk;
}
is_integer = false;
var exponent: f64 = 0.0;
while (!self.isEof()) {
if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
exponent = exponent * 10.0 + @as(f64, @floatFromInt(digit));
self.advance(1);
} else {
break;
}
}
value *= std.math.pow(f64, 10.0, mul * exponent);
}
const int_value: ?i32 = if (is_integer) blk: {
if (value >= std.math.maxInt(i32)) {
break :blk std.math.maxInt(i32);
}
if (value <= std.math.minInt(i32)) {
break :blk std.math.minInt(i32);
}
break :blk @as(i32, @intFromFloat(value));
} else null;
if (!self.isEof() and self.nextByteUnchecked() == '%') {
self.advance(1);
return .{ .percentage = .{
.has_sign = has_sign,
.int_value = int_value,
.unit_value = @as(f32, @floatCast(value / 100.0)),
} };
}
if (isIdentStart(self)) {
return .{ .dimension = .{
.has_sign = has_sign,
.int_value = int_value,
.value = @as(f32, @floatCast(value)),
.unit = consumeName(self),
} };
}
return .{ .number = .{
.has_sign = has_sign,
.int_value = int_value,
.value = @as(f32, @floatCast(value)),
} };
}
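test "tokenizer: number, dimension, percentage, and exponent forms" {
// A hedged sketch for consumeNumeric above. A fractional or exponent part
// clears int_value; a leading sign sets has_sign.
try expectTokensEqual("12px 50% -3 1.5e2", &.{
.{ .dimension = .{ .has_sign = false, .int_value = 12, .value = 12.0, .unit = "px" } },
.{ .white_space = " " },
.{ .percentage = .{ .has_sign = false, .int_value = 50, .unit_value = 0.5 } },
.{ .white_space = " " },
.{ .number = .{ .has_sign = true, .int_value = -3, .value = -3.0 } },
.{ .white_space = " " },
.{ .number = .{ .has_sign = false, .int_value = null, .value = 150.0 } },
});
}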
fn consumeUnquotedUrl(self: *Tokenizer) ?Token {
// TODO: implement a true unquoted-url consumer. For now we fall back to
// consumeString, which only handles the quoted url("...") and url('...') forms correctly.
if (self.nextByte()) |it| {
return self.consumeString(it == '\'');
}
return null;
}
fn consumeIdentLike(self: *Tokenizer) Token {
const value = self.consumeName();
if (!self.isEof() and self.nextByteUnchecked() == '(') {
self.advance(1);
if (std.ascii.eqlIgnoreCase(value, "url")) {
if (self.consumeUnquotedUrl()) |result| {
return result;
}
}
return .{ .function = value };
}
return .{ .ident = value };
}
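test "tokenizer: ident vs function" {
// A hedged sketch for consumeIdentLike above: a name directly followed by
// `(` becomes a <function-token>, and the `(` itself is consumed.
try expectTokensEqual("rgb(0)", &.{
.{ .function = "rgb" },
.{ .number = .{ .has_sign = false, .int_value = 0, .value = 0.0 } },
.close_parenthesis,
});
}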
pub fn next(self: *Tokenizer) ?Token {
if (self.isEof()) {
return null;
}
const b = self.nextByteUnchecked();
return switch (b) {
// Consume comments
'/' => {
if (self.startsWith("/*")) {
return .{ .comment = self.consumeComment() };
} else {
self.advance(1);
return .{ .delim = '/' };
}
},
// Consume marks
'(', ')', '{', '}', '[', ']', ',', ':', ';' => {
return self.consumeMark();
},
// Consume as much whitespace as possible. Return a <whitespace-token>.
' ', '\t' => self.consumeWhiteSpace(false),
'\n', '\x0C', '\r' => self.consumeWhiteSpace(true),
// Consume a string token and return it.
'"' => self.consumeString(false),
'\'' => self.consumeString(true),
'0'...'9' => self.consumeNumeric(),
'a'...'z', 'A'...'Z', '_', 0x0 => self.consumeIdentLike(),
'+' => {
if ((self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) or
(self.hasAtLeast(2) and self.byteAt(1) == '.' and std.ascii.isDigit(self.byteAt(2))))
{
return self.consumeNumeric();
}
self.advance(1);
return .{ .delim = '+' };
},
'-' => {
if ((self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) or
(self.hasAtLeast(2) and self.byteAt(1) == '.' and std.ascii.isDigit(self.byteAt(2))))
{
return self.consumeNumeric();
}
if (self.startsWith("-->")) {
self.advance(3);
return .cdc;
}
if (isIdentStart(self)) {
return self.consumeIdentLike();
}
self.advance(1);
return .{ .delim = '-' };
},
'.' => {
if (self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) {
return self.consumeNumeric();
}
self.advance(1);
return .{ .delim = '.' };
},
// Consume hash token
'#' => {
self.advance(1);
if (self.isIdentStart()) {
return .{ .id_hash = self.consumeName() };
}
if (self.nextByte()) |it| {
switch (it) {
// Any other valid case was already handled by the id_hash branch above.
'0'...'9', '-' => return .{ .unrestricted_hash = self.consumeName() },
else => {},
}
}
return .{ .delim = '#' };
},
// Consume at-rules
'@' => {
self.advance(1);
return if (isIdentStart(self))
.{ .at_keyword = consumeName(self) }
else
.{ .delim = '@' };
},
'<' => {
if (self.startsWith("<!--")) {
self.advance(4);
return .cdo;
} else {
self.advance(1);
return .{ .delim = '<' };
}
},
'\\' => {
if (!self.hasNewlineAt(1)) {
return self.consumeIdentLike();
}
self.advance(1);
return .{ .delim = '\\' };
},
else => {
if (b > 0x7F) { // non-ASCII
return self.consumeIdentLike();
}
self.advance(1);
return .{ .delim = b };
},
};
}
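test "tokenizer: hash, at-keyword, CDO/CDC, and delim fallback" {
// A hedged sketch of the dispatch in `next` above: `#` + ident start gives
// an id-hash, `#` + digit an unrestricted hash, and an unmatched byte
// falls through to a <delim-token>.
try expectTokensEqual("#id #2 @media <!-- --> /x", &.{
.{ .id_hash = "id" },
.{ .white_space = " " },
.{ .unrestricted_hash = "2" },
.{ .white_space = " " },
.{ .at_keyword = "media" },
.{ .white_space = " " },
.cdo,
.{ .white_space = " " },
.cdc,
.{ .white_space = " " },
.{ .delim = '/' },
.{ .ident = "x" },
});
}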
const testing = std.testing;
fn expectTokensEqual(input: []const u8, tokens: []const Token) !void {
var lexer = Tokenizer{ .input = input };
var i: usize = 0;
while (lexer.next()) |token| : (i += 1) {
assert(i < tokens.len);
try testing.expectEqualDeep(tokens[i], token);
}
try testing.expectEqual(i, tokens.len);
try testing.expectEqualDeep(null, lexer.next());
}
test "smoke" {
try expectTokensEqual(
\\.lightpanda {color:red;}
, &.{
.{ .delim = '.' },
.{ .ident = "lightpanda" },
.{ .white_space = " " },
.curly_bracket_block,
.{ .ident = "color" },
.colon,
.{ .ident = "red" },
.semicolon,
.close_curly_bracket,
});
}