// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//! This file implements the tokenization step defined in the CSS Syntax Module Level 3 specification.
//!
//! The algorithm accepts a valid UTF-8 string and returns a stream of tokens.
//! The tokenization step never fails, even for complete gibberish.
//! Validity must then be checked by the parser.
//!
//! NOTE: The tokenizer is not thread-safe, does not own any memory, and does not validate UTF-8.
//!
//! See spec for more info: https://drafts.csswg.org/css-syntax/#tokenization
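//!
//! A minimal usage sketch (hypothetical input; `next` returns null at EOF):
//!
//!     var tokenizer = Tokenizer{ .input = "a { color: red }" };
//!     while (tokenizer.next()) |token| {
//!         // `token` is the tagged union below; switch on it as needed.
//!         std.debug.print("{s}\n", .{@tagName(token)});
//!     }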
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const Tokenizer = @This();
pub const Token = union(enum) {
/// A `<ident-token>`
ident: []const u8,
/// A `<function-token>`
///
/// The value (name) does not include the `(` marker.
function: []const u8,
/// A `<at-keyword-token>`
///
/// The value does not include the `@` marker.
at_keyword: []const u8,
/// A `<hash-token>` with the type flag set to "id"
///
/// The value does not include the `#` marker.
id_hash: []const u8, // Hash that is a valid ID selector.
/// A `<hash-token>` with the type flag set to "unrestricted"
///
/// The value does not include the `#` marker.
unrestricted_hash: []const u8,
/// A `<string-token>`
///
/// The value does not include the quotes.
string: []const u8,
/// A `<bad-string-token>`
///
/// This token always indicates a parse error.
bad_string: []const u8,
/// A `<url-token>`
///
/// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
/// `Function` token.
url: []const u8,
/// A `<bad-url-token>`
///
/// This token always indicates a parse error.
bad_url: []const u8,
/// A `<delim-token>`
delim: u8,
/// A `<number-token>`
number: struct {
/// Whether the number had a `+` or `-` sign.
///
/// This is used in some cases, such as the <An+B> microsyntax. (See the `parse_nth` function.)
has_sign: bool,
/// If the original source did not include a fractional part, the value as an integer.
int_value: ?i32,
/// The value as a float
value: f32,
},
/// A `<percentage-token>`
percentage: struct {
/// Whether the number had a `+` or `-` sign.
has_sign: bool,
/// If the original source did not include a fractional part, the value as an integer.
/// It is **not** divided by 100.
int_value: ?i32,
/// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
unit_value: f32,
},
/// A `<dimension-token>`
dimension: struct {
/// Whether the number had a `+` or `-` sign.
///
/// This is used in some cases, such as the <An+B> microsyntax. (See the `parse_nth` function.)
has_sign: bool,
/// If the original source did not include a fractional part, the value as an integer.
int_value: ?i32,
/// The value as a float
value: f32,
/// The unit, e.g. "px" in `12px`
unit: []const u8,
},
/// A `<unicode-range-token>`
unicode_range: struct { bgn: u32, end: u32 },
/// A `<whitespace-token>`
white_space: []const u8,
/// A `<!--` `<CDO-token>`
cdo,
/// A `-->` `<CDC-token>`
cdc,
/// A `:` `<colon-token>`
colon,
/// A `;` `<semicolon-token>`
semicolon,
/// A `,` `<comma-token>`
comma,
/// A `<[-token>`
square_bracket_block,
/// A `<]-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
close_square_bracket,
/// A `<(-token>`
parenthesis_block,
/// A `<)-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
close_parenthesis,
/// A `<{-token>`
curly_bracket_block,
/// A `<}-token>`
///
/// When obtained from one of the `Parser::next*` methods,
/// this token is always unmatched and indicates a parse error.
close_curly_bracket,
/// A comment.
///
/// The CSS Syntax spec does not generate tokens for comments,
/// but we do for simplicity of the interface.
///
/// The value does not include the `/*` `*/` markers.
comment: []const u8,
};
input: []const u8,
/// Counted in bytes, not code points. From 0.
position: usize = 0,
// Returns true if the input has at least `n` bytes left *after* the current one.
// That is, `Tokenizer.byteAt(n)` will not panic.
fn hasAtLeast(self: *const Tokenizer, n: usize) bool {
return self.position + n < self.input.len;
}
fn isEof(self: *const Tokenizer) bool {
return !self.hasAtLeast(0);
}
fn byteAt(self: *const Tokenizer, offset: usize) u8 {
return self.input[self.position + offset];
}
// Assumes non-EOF
fn nextByteUnchecked(self: *const Tokenizer) u8 {
return self.byteAt(0);
}
fn nextByte(self: *const Tokenizer) ?u8 {
return if (self.isEof())
null
else
self.input[self.position];
}
fn startsWith(self: *const Tokenizer, needle: []const u8) bool {
return std.mem.startsWith(u8, self.input[self.position..], needle);
}
fn slice(self: *const Tokenizer, start: usize, end: usize) []const u8 {
return self.input[start..end];
}
fn sliceFrom(self: *const Tokenizer, start_pos: usize) []const u8 {
return self.slice(start_pos, self.position);
}
// Advance over N bytes in the input. In debug builds, asserts that each
// byte stepped over is an ASCII byte (excluding newlines) or a UTF-8
// sequence leader (excluding continuation bytes and leaders of 4-byte
// sequences); see consumeName for how multi-byte sequences are stepped over.
fn advance(self: *Tokenizer, n: usize) void {
if (builtin.mode == .Debug) {
// Each byte must either be an ASCII byte or a sequence leader,
// but not a 4-byte leader; also newlines are rejected.
for (0..n) |i| {
const b = self.byteAt(i);
assert(b != '\r' and b != '\n' and b != '\x0C');
assert(b <= 0x7F or (b & 0xF0 != 0xF0 and b & 0xC0 != 0x80));
}
}
self.position += n;
}
fn hasNewlineAt(self: *const Tokenizer, offset: usize) bool {
if (!self.hasAtLeast(offset)) return false;
return switch (self.byteAt(offset)) {
'\n', '\r', '\x0C' => true,
else => false,
};
}
fn hasNonAsciiAt(self: *const Tokenizer, offset: usize) bool {
if (!self.hasAtLeast(offset)) return false;
const byte = self.byteAt(offset);
const len_utf8 = std.unicode.utf8ByteSequenceLength(byte) catch return false;
if (!self.hasAtLeast(offset + len_utf8 - 1)) return false;
const start = self.position + offset;
const bytes = self.slice(start, start + len_utf8);
const codepoint = std.unicode.utf8Decode(bytes) catch return false;
// https://drafts.csswg.org/css-syntax/#non-ascii-ident-code-point
return switch (codepoint) {
'\u{00B7}', '\u{200C}', '\u{200D}', '\u{203F}', '\u{2040}' => true,
'\u{00C0}'...'\u{00D6}' => true,
'\u{00D8}'...'\u{00F6}' => true,
'\u{00F8}'...'\u{037D}' => true,
'\u{037F}'...'\u{1FFF}' => true,
'\u{2070}'...'\u{218F}' => true,
'\u{2C00}'...'\u{2FEF}' => true,
'\u{3001}'...'\u{D7FF}' => true,
'\u{F900}'...'\u{FDCF}' => true,
'\u{FDF0}'...'\u{FFFD}' => true,
else => codepoint >= '\u{10000}',
};
}
fn isIdentStart(self: *Tokenizer) bool {
if (self.isEof()) return false;
var b = self.nextByteUnchecked();
if (b == '-') {
b = if (self.hasAtLeast(1)) self.byteAt(1) else return false;
}
return switch (b) {
'a'...'z', 'A'...'Z', '_', 0x0 => true,
'\\' => !self.hasNewlineAt(1),
else => b > 0x7F, // non-ASCII
};
}
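test "tokenizer: non-ASCII ident start" {
// A hedged sketch: a byte above 0x7F starts an ident-like token (see
// isIdentStart above and the `else` arm of `next`).
try expectTokensEqual("écran", &.{.{ .ident = "écran" }});
}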
fn consumeChar(self: *Tokenizer) void {
const byte = self.nextByteUnchecked();
const len_utf8 = std.unicode.utf8ByteSequenceLength(byte) catch 1;
self.position += len_utf8;
}
// Given that a newline has been seen, advance over the newline
// and update the state.
fn consumeNewline(self: *Tokenizer) void {
const byte = self.nextByteUnchecked();
assert(byte == '\r' or byte == '\n' or byte == '\x0C');
self.position += 1;
if (byte == '\r' and self.nextByte() == '\n') {
self.position += 1;
}
}
fn consumeWhiteSpace(self: *Tokenizer, newline: bool) Token {
const start_position = self.position;
if (newline) {
self.consumeNewline();
} else {
self.advance(1);
}
while (!self.isEof()) {
const b = self.nextByteUnchecked();
switch (b) {
' ', '\t' => {
self.advance(1);
},
'\n', '\x0C', '\r' => {
self.consumeNewline();
},
else => break,
}
}
return .{ .white_space = self.sliceFrom(start_position) };
}
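test "tokenizer: whitespace runs collapse into one token" {
// A hedged sketch for consumeWhiteSpace above: spaces, tabs, and newlines
// (including a CRLF pair) are folded into a single <whitespace-token>.
try expectTokensEqual(" \t\r\nx", &.{
.{ .white_space = " \t\r\n" },
.{ .ident = "x" },
});
}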
fn consumeComment(self: *Tokenizer) []const u8 {
self.advance(2); // consume "/*"
const start_position = self.position;
while (!self.isEof()) {
switch (self.nextByteUnchecked()) {
'*' => {
const end_position = self.position;
self.advance(1);
if (self.nextByte() == '/') {
self.advance(1);
return self.slice(start_position, end_position);
}
},
'\n', '\x0C', '\r' => {
self.consumeNewline();
},
0x0 => self.advance(1),
else => self.consumeChar(),
}
}
return self.sliceFrom(start_position);
}
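test "tokenizer: comment tokens exclude the markers" {
// A hedged sketch for consumeComment above; an unterminated comment runs
// to EOF and still yields a token.
try expectTokensEqual("/* hi */a", &.{
.{ .comment = " hi " },
.{ .ident = "a" },
});
try expectTokensEqual("/*x", &.{.{ .comment = "x" }});
}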
fn byteToHexDigit(b: u8) ?u32 {
return switch (b) {
'0'...'9' => b - '0',
'a'...'f' => b - 'a' + 10,
'A'...'F' => b - 'A' + 10,
else => null,
};
}
fn byteToDecimalDigit(b: u8) ?u32 {
return if (std.ascii.isDigit(b)) b - '0' else null;
}
// Consumes up to 6 hex digits. The numeric value is computed but currently
// discarded, since escape decoding is left to the parser.
fn consumeHexDigits(self: *Tokenizer) void {
var value: u32 = 0;
var digits: u32 = 0;
while (digits < 6 and !self.isEof()) {
if (byteToHexDigit(self.nextByteUnchecked())) |digit| {
value = value * 16 + digit;
digits += 1;
self.advance(1);
} else {
break;
}
}
_ = &value;
}
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
fn consumeEscape(self: *Tokenizer) void {
if (self.isEof())
return; // Escaped EOF
switch (self.nextByteUnchecked()) {
'0'...'9', 'A'...'F', 'a'...'f' => {
consumeHexDigits(self);
if (!self.isEof()) {
switch (self.nextByteUnchecked()) {
' ', '\t' => {
self.advance(1);
},
'\n', '\x0C', '\r' => {
self.consumeNewline();
},
else => {},
}
}
},
else => self.consumeChar(),
}
}
/// https://drafts.csswg.org/css-syntax/#consume-string-token
fn consumeString(self: *Tokenizer, single_quote: bool) Token {
self.advance(1); // Skip the initial quote
// start_pos is at code point boundary, after " or '
const start_pos = self.position;
while (!self.isEof()) {
switch (self.nextByteUnchecked()) {
'"' => {
if (!single_quote) {
const value = self.sliceFrom(start_pos);
self.advance(1);
return .{ .string = value };
}
self.advance(1);
},
'\'' => {
if (single_quote) {
const value = self.sliceFrom(start_pos);
self.advance(1);
return .{ .string = value };
}
self.advance(1);
},
'\n', '\r', '\x0C' => {
return .{ .bad_string = self.sliceFrom(start_pos) };
},
'\\' => {
self.advance(1);
if (self.isEof())
continue; // escaped EOF, do nothing.
switch (self.nextByteUnchecked()) {
// Escaped newline
'\n', '\x0C', '\r' => self.consumeNewline(),
// Spec calls for replacing escape sequences with characters,
// but this would require allocating a new string.
// Therefore, we leave it as is and let the parser handle the escaping.
else => self.consumeEscape(),
}
},
else => self.consumeChar(),
}
}
return .{ .string = self.sliceFrom(start_pos) };
}
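test "tokenizer: string tokens exclude quotes and keep escapes verbatim" {
// A hedged sketch for consumeString above: the token is the raw slice
// between the quotes, with escape sequences left in place for the parser.
try expectTokensEqual(
\\"a\"b" 'c'
, &.{
.{ .string = "a\\\"b" },
.{ .white_space = " " },
.{ .string = "c" },
});
}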
fn consumeName(self: *Tokenizer) []const u8 {
// start_pos is the end of the previous token, therefore at a code point boundary
const start_pos = self.position;
while (!self.isEof()) {
switch (self.nextByteUnchecked()) {
'a'...'z', 'A'...'Z', '0'...'9', '_', '-' => self.advance(1),
'\\' => {
if (self.hasNewlineAt(1)) {
break;
}
self.advance(1);
self.consumeEscape();
},
0x0 => self.advance(1),
'\x80'...'\xBF', '\xC0'...'\xEF', '\xF0'...'\xFF' => {
// This byte *is* part of a multi-byte code point; we'll end up
// consuming the whole code point, one byte at a time, before this
// loop does anything else. Step over it directly: `advance` would
// trip its debug assertions on continuation bytes and 4-byte
// sequence leaders.
self.position += 1;
},
else => {
if (self.hasNonAsciiAt(0)) {
self.consumeChar();
} else {
break; // ASCII
}
},
}
}
return self.sliceFrom(start_pos);
}
fn consumeMark(self: *Tokenizer) Token {
const byte = self.nextByteUnchecked();
self.advance(1);
return switch (byte) {
',' => .comma,
':' => .colon,
';' => .semicolon,
'(' => .parenthesis_block,
')' => .close_parenthesis,
'{' => .curly_bracket_block,
'}' => .close_curly_bracket,
'[' => .square_bracket_block,
']' => .close_square_bracket,
else => unreachable,
};
}
fn consumeNumeric(self: *Tokenizer) Token {
// Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
// This is only called when there is at least one digit in \d*(\.\d+)?
// Do all the math in f64 so that large numbers overflow to +/-inf
// and the full i32 range (minInt/maxInt) stays representable.
var sign: f64 = 1.0;
var has_sign = false;
switch (self.nextByteUnchecked()) {
'+' => {
has_sign = true;
},
'-' => {
has_sign = true;
sign = -1.0;
},
else => {},
}
if (has_sign) {
self.advance(1);
}
var is_integer = true;
var integral_part: f64 = 0.0;
var fractional_part: f64 = 0.0;
while (!self.isEof()) {
if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
integral_part = integral_part * 10.0 + @as(f64, @floatFromInt(digit));
self.advance(1);
} else {
break;
}
}
if (self.hasAtLeast(1) and self.nextByteUnchecked() == '.' and std.ascii.isDigit(self.byteAt(1))) {
is_integer = false;
self.advance(1); // Consume '.'
var factor: f64 = 0.1;
while (!self.isEof()) {
if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
fractional_part += @as(f64, @floatFromInt(digit)) * factor;
factor *= 0.1;
self.advance(1);
} else {
break;
}
}
}
var value = sign * (integral_part + fractional_part);
blk: {
const e = self.nextByte() orelse break :blk;
if (e != 'e' and e != 'E') break :blk;
var mul: f64 = 1.0;
if (self.hasAtLeast(2) and (self.byteAt(1) == '+' or self.byteAt(1) == '-') and std.ascii.isDigit(self.byteAt(2))) {
mul = switch (self.byteAt(1)) {
'-' => -1.0,
'+' => 1.0,
else => unreachable,
};
self.advance(2);
} else if (self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) {
// Exponent without a sign: the first digit is at offset 1, right after
// the 'e'. (Checking offset 2 here would skip valid exponents like "1e5".)
self.advance(1);
} else {
break :blk;
}
is_integer = false;
var exponent: f64 = 0.0;
while (!self.isEof()) {
if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
exponent = exponent * 10.0 + @as(f64, @floatFromInt(digit));
self.advance(1);
} else {
break;
}
}
value *= std.math.pow(f64, 10.0, mul * exponent);
}
const int_value: ?i32 = if (is_integer) blk: {
if (value >= std.math.maxInt(i32)) {
break :blk std.math.maxInt(i32);
}
if (value <= std.math.minInt(i32)) {
break :blk std.math.minInt(i32);
}
break :blk @as(i32, @intFromFloat(value));
} else null;
if (!self.isEof() and self.nextByteUnchecked() == '%') {
self.advance(1);
return .{ .percentage = .{
.has_sign = has_sign,
.int_value = int_value,
.unit_value = @as(f32, @floatCast(value / 100.0)),
} };
}
if (isIdentStart(self)) {
return .{ .dimension = .{
.has_sign = has_sign,
.int_value = int_value,
.value = @as(f32, @floatCast(value)),
.unit = consumeName(self),
} };
}
return .{ .number = .{
.has_sign = has_sign,
.int_value = int_value,
.value = @as(f32, @floatCast(value)),
} };
}
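test "tokenizer: number, dimension, percentage, and exponent forms" {
// A hedged sketch for consumeNumeric above. A fractional or exponent part
// clears int_value; a leading sign sets has_sign.
try expectTokensEqual("12px 50% -3 1.5e2", &.{
.{ .dimension = .{ .has_sign = false, .int_value = 12, .value = 12.0, .unit = "px" } },
.{ .white_space = " " },
.{ .percentage = .{ .has_sign = false, .int_value = 50, .unit_value = 0.5 } },
.{ .white_space = " " },
.{ .number = .{ .has_sign = true, .int_value = -3, .value = -3.0 } },
.{ .white_space = " " },
.{ .number = .{ .has_sign = false, .int_value = null, .value = 150.0 } },
});
}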
fn consumeUnquotedUrl(self: *Tokenizer) ?Token {
// TODO: implement a true unquoted-url consumer. For now we fall back to
// consumeString, which only handles the quoted url("...") and url('...') forms correctly.
if (self.nextByte()) |it| {
return self.consumeString(it == '\'');
}
return null;
}
fn consumeIdentLike(self: *Tokenizer) Token {
const value = self.consumeName();
if (!self.isEof() and self.nextByteUnchecked() == '(') {
self.advance(1);
if (std.ascii.eqlIgnoreCase(value, "url")) {
if (self.consumeUnquotedUrl()) |result| {
return result;
}
}
return .{ .function = value };
}
return .{ .ident = value };
}
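test "tokenizer: ident vs function" {
// A hedged sketch for consumeIdentLike above: a name directly followed by
// `(` becomes a <function-token>, and the `(` itself is consumed.
try expectTokensEqual("rgb(0)", &.{
.{ .function = "rgb" },
.{ .number = .{ .has_sign = false, .int_value = 0, .value = 0.0 } },
.close_parenthesis,
});
}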
pub fn next(self: *Tokenizer) ?Token {
if (self.isEof()) {
return null;
}
const b = self.nextByteUnchecked();
return switch (b) {
// Consume comments
'/' => {
if (self.startsWith("/*")) {
return .{ .comment = self.consumeComment() };
} else {
self.advance(1);
return .{ .delim = '/' };
}
},
// Consume marks
'(', ')', '{', '}', '[', ']', ',', ':', ';' => {
return self.consumeMark();
},
// Consume as much whitespace as possible. Return a <whitespace-token>.
' ', '\t' => self.consumeWhiteSpace(false),
'\n', '\x0C', '\r' => self.consumeWhiteSpace(true),
// Consume a string token and return it.
'"' => self.consumeString(false),
'\'' => self.consumeString(true),
'0'...'9' => self.consumeNumeric(),
'a'...'z', 'A'...'Z', '_', 0x0 => self.consumeIdentLike(),
'+' => {
if ((self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) or
(self.hasAtLeast(2) and self.byteAt(1) == '.' and std.ascii.isDigit(self.byteAt(2))))
{
return self.consumeNumeric();
}
self.advance(1);
return .{ .delim = '+' };
},
'-' => {
if ((self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) or
(self.hasAtLeast(2) and self.byteAt(1) == '.' and std.ascii.isDigit(self.byteAt(2))))
{
return self.consumeNumeric();
}
if (self.startsWith("-->")) {
self.advance(3);
return .cdc;
}
if (isIdentStart(self)) {
return self.consumeIdentLike();
}
self.advance(1);
return .{ .delim = '-' };
},
'.' => {
if (self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) {
return self.consumeNumeric();
}
self.advance(1);
return .{ .delim = '.' };
},
// Consume hash token
'#' => {
self.advance(1);
if (self.isIdentStart()) {
return .{ .id_hash = self.consumeName() };
}
if (self.nextByte()) |it| {
switch (it) {
// Any other valid case was already handled by the id_hash branch above.
'0'...'9', '-' => return .{ .unrestricted_hash = self.consumeName() },
else => {},
}
}
return .{ .delim = '#' };
},
// Consume at-rules
'@' => {
self.advance(1);
return if (isIdentStart(self))
.{ .at_keyword = consumeName(self) }
else
.{ .delim = '@' };
},
'<' => {
if (self.startsWith("<!--")) {
self.advance(4);
return .cdo;
} else {
self.advance(1);
return .{ .delim = '<' };
}
},
'\\' => {
if (!self.hasNewlineAt(1)) {
return self.consumeIdentLike();
}
self.advance(1);
return .{ .delim = '\\' };
},
else => {
if (b > 0x7F) { // non-ASCII
return self.consumeIdentLike();
}
self.advance(1);
return .{ .delim = b };
},
};
}
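test "tokenizer: hash, at-keyword, CDO/CDC, and delim fallback" {
// A hedged sketch of the dispatch in `next` above: `#` + ident start gives
// an id-hash, `#` + digit an unrestricted hash, and an unmatched byte
// falls through to a <delim-token>.
try expectTokensEqual("#id #2 @media <!-- --> /x", &.{
.{ .id_hash = "id" },
.{ .white_space = " " },
.{ .unrestricted_hash = "2" },
.{ .white_space = " " },
.{ .at_keyword = "media" },
.{ .white_space = " " },
.cdo,
.{ .white_space = " " },
.cdc,
.{ .white_space = " " },
.{ .delim = '/' },
.{ .ident = "x" },
});
}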
const testing = std.testing;
fn expectTokensEqual(input: []const u8, tokens: []const Token) !void {
var lexer = Tokenizer{ .input = input };
var i: usize = 0;
while (lexer.next()) |token| : (i += 1) {
assert(i < tokens.len);
try testing.expectEqualDeep(tokens[i], token);
}
try testing.expectEqual(i, tokens.len);
try testing.expectEqualDeep(null, lexer.next());
}
test "smoke" {
try expectTokensEqual(
\\.lightpanda {color:red;}
, &.{
.{ .delim = '.' },
.{ .ident = "lightpanda" },
.{ .white_space = " " },
.curly_bracket_block,
.{ .ident = "color" },
.colon,
.{ .ident = "red" },
.semicolon,
.close_curly_bracket,
});
}