Merge pull request #1350 from lightpanda-io/css_selector_escape_sequence

Support escape sequences in CSS selector for id and class selectors
This commit is contained in:
Pierre Tachoire
2026-01-09 11:40:33 +01:00
committed by GitHub

View File

@@ -657,198 +657,126 @@ fn parseNthPattern(self: *Parser) !Selector.NthPattern {
} }
pub fn id(self: *Parser, arena: Allocator) ![]const u8 { pub fn id(self: *Parser, arena: Allocator) ![]const u8 {
// Must be called when we're at a '#'
std.debug.assert(self.peek() == '#'); std.debug.assert(self.peek() == '#');
self.input = self.input[1..]; // Skip '#'
// trim the leading # return self.parseIdentifier(arena, error.InvalidIDSelector);
var input = self.input[1..];
if (input.len == 0) {
@branchHint(.cold);
return error.InvalidIDSelector;
}
// First pass: find the end of the id and check if there are escape sequences
var i: usize = 0;
var has_escape = false;
var first_char_validated = false;
while (i < input.len) {
const b = input[i];
if (b == '\\') {
// Escape sequence
if (i + 1 >= input.len) {
@branchHint(.cold);
return error.InvalidIDSelector;
}
has_escape = true;
i += 2; // Skip backslash and escaped char
first_char_validated = true;
continue;
}
// Validate first character if not yet validated
if (!first_char_validated) {
if (b == '-') {
if (i + 1 >= input.len) {
@branchHint(.cold);
return error.InvalidIDSelector;
}
const second = input[i + 1];
if (second == '-' or std.ascii.isDigit(second)) {
@branchHint(.cold);
return error.InvalidIDSelector;
}
} else if (!std.ascii.isAlphabetic(b) and b != '_' and b < 0x80) {
@branchHint(.cold);
return error.InvalidIDSelector;
}
first_char_validated = true;
}
// Check if this is a valid id character
switch (b) {
'a'...'z', 'A'...'Z', '0'...'9', '-', '_' => {},
0x80...0xFF => {}, // non-ASCII characters
' ', '\t', '\n', '\r' => break,
// Stop at selector delimiters
'.', '#', '>', '+', '~', '[', ':', ')', ']' => break,
else => {
@branchHint(.cold);
return error.InvalidIDSelector;
},
}
i += 1;
}
if (i == 0) {
@branchHint(.cold);
return error.InvalidIDSelector;
}
const raw = input[0..i];
self.input = input[i..];
// If no escape sequences, return the slice as-is
if (!has_escape) {
return raw;
}
// Build unescaped string
var result = try std.ArrayList(u8).initCapacity(arena, raw.len);
var j: usize = 0;
while (j < raw.len) {
if (raw[j] == '\\') {
j += 1; // Skip backslash
if (j < raw.len) {
try result.append(arena, raw[j]); // Add escaped char
j += 1;
}
} else {
try result.append(arena, raw[j]);
j += 1;
}
}
return result.items;
} }
fn class(self: *Parser, arena: Allocator) ![]const u8 { fn class(self: *Parser, arena: Allocator) ![]const u8 {
// Must be called when we're at a '.'
std.debug.assert(self.peek() == '.'); std.debug.assert(self.peek() == '.');
self.input = self.input[1..]; // Skip '.'
return self.parseIdentifier(arena, error.InvalidClassSelector);
}
// trim the leading . // Parse a CSS identifier (used by id and class selectors)
var input = self.input[1..]; fn parseIdentifier(self: *Parser, arena: Allocator, err: ParseError) ParseError![]const u8 {
const input = self.input;
if (input.len == 0) { if (input.len == 0) {
@branchHint(.cold); @branchHint(.cold);
return error.InvalidClassSelector; return err;
} }
// First pass: find the end of the class name and check if there are escape sequences
var i: usize = 0; var i: usize = 0;
var has_escape = false; const first = input[0];
var first_char_validated = false;
if (first == '\\' or first == 0) {
// First char needs special processing - go straight to slow path
} else if (first >= 0x80 or std.ascii.isAlphabetic(first) or first == '_') {
// Valid first char
i = 1;
} else if (first == '-') {
// Dash must be followed by dash, letter, underscore, escape, or non-ASCII
if (input.len < 2) {
@branchHint(.cold);
return err;
}
const second = input[1];
if (second == '-' or second == '\\' or std.ascii.isAlphabetic(second) or second == '_' or second >= 0x80) {
i = 1; // First char validated, start scanning from position 1
} else {
@branchHint(.cold);
return err;
}
} else {
@branchHint(.cold);
return err;
}
// Fast scan remaining characters (no escapes/nulls)
while (i < input.len) { while (i < input.len) {
const b = input[i]; const b = input[i];
if (b == '\\') { if (b == '\\' or b == 0) {
// Escape sequence // Stop at escape or null - need slow path
if (i + 1 >= input.len) { break;
@branchHint(.cold);
return error.InvalidClassSelector;
}
has_escape = true;
i += 2; // Skip backslash and escaped char
first_char_validated = true;
continue;
} }
// Validate first character if not yet validated // Check if valid identifier character
if (!first_char_validated) {
if (b == '-') {
if (i + 1 >= input.len) {
@branchHint(.cold);
return error.InvalidClassSelector;
}
const second = input[i + 1];
if (second == '-' or std.ascii.isDigit(second)) {
@branchHint(.cold);
return error.InvalidClassSelector;
}
} else if (!std.ascii.isAlphabetic(b) and b != '_' and b < 0x80) {
@branchHint(.cold);
return error.InvalidClassSelector;
}
first_char_validated = true;
}
// Check if this is a valid class name character
switch (b) { switch (b) {
'a'...'z', 'A'...'Z', '0'...'9', '-', '_' => {}, 'a'...'z', 'A'...'Z', '0'...'9', '-', '_' => {},
0x80...0xFF => {}, // non-ASCII characters 0x80...0xFF => {},
' ', '\t', '\n', '\r' => break, ' ', '\t', '\n', '\r', '.', '#', '>', '+', '~', '[', ':', ')', ']' => break,
// Stop at selector delimiters
'.', '#', '>', '+', '~', '[', ':', ')', ']' => break,
else => { else => {
@branchHint(.cold); @branchHint(.cold);
return error.InvalidClassSelector; return err;
}, },
} }
i += 1; i += 1;
} }
if (i == 0) { // Fast path: no escapes/nulls found
@branchHint(.cold); if (i == input.len or (i > 0 and input[i] != '\\' and input[i] != 0)) {
return error.InvalidClassSelector; if (i == 0) {
} @branchHint(.cold);
return err;
const raw = input[0..i];
self.input = input[i..];
// If no escape sequences, return the slice as-is
if (!has_escape) {
return raw;
}
// Build unescaped string
var result = try std.ArrayList(u8).initCapacity(arena, raw.len);
var j: usize = 0;
while (j < raw.len) {
if (raw[j] == '\\') {
j += 1; // Skip backslash
if (j < raw.len) {
try result.append(arena, raw[j]); // Add escaped char
j += 1;
}
} else {
try result.append(arena, raw[j]);
j += 1;
} }
self.input = input[i..];
return input[0..i];
} }
// Slow path: has escapes or nulls
var result = try std.ArrayList(u8).initCapacity(arena, input.len);
try result.appendSlice(arena, input[0..i]);
var j = i;
while (j < input.len) {
const b = input[j];
if (b == '\\') {
j += 1;
const escape_result = try parseEscape(input[j..], arena);
try result.appendSlice(arena, escape_result.bytes);
j += escape_result.consumed;
continue;
}
if (b == 0) {
try result.appendSlice(arena, "\u{FFFD}");
j += 1;
continue;
}
const is_ident_char = switch (b) {
'a'...'z', 'A'...'Z', '0'...'9', '-', '_' => true,
0x80...0xFF => true,
else => false,
};
if (!is_ident_char) {
break;
}
try result.append(arena, b);
j += 1;
}
if (result.items.len == 0) {
@branchHint(.cold);
return err;
}
self.input = input[j..];
return result.items; return result.items;
} }
@@ -1036,6 +964,74 @@ fn fastEql(a: []const u8, comptime b: []const u8) bool {
return true; return true;
} }
/// Outcome of decoding a single CSS escape sequence.
const EscapeResult = struct {
    /// UTF-8 encoding of the escaped code point. Either a static string
    /// (replacement character) or memory allocated from the caller's arena.
    bytes: []const u8,
    /// Number of bytes of the input (counted from the byte after the
    /// backslash) that belong to the escape sequence.
    consumed: usize,
};

/// Decodes a CSS escape sequence.
///
/// `input` must start at the byte immediately after the backslash. Two forms
/// are recognised:
///   * hex escape: 1 to 6 hex digits, optionally followed by one whitespace
///     (space, tab, LF, FF, CR, or a CR LF pair, which counts as one newline);
///   * simple escape: the next byte stands for itself.
///
/// An empty input, an escaped NUL byte, and hex values that are zero, above
/// U+10FFFF, or inside the surrogate range all decode to U+FFFD.
///
/// Returns `error.OutOfMemory` if the arena cannot provide the output buffer.
fn parseEscape(input: []const u8, arena: Allocator) !EscapeResult {
    const replacement = "\u{FFFD}";

    if (input.len == 0) {
        // EOF right after the backslash -> replacement character.
        return .{ .bytes = replacement, .consumed = 0 };
    }

    const first = input[0];

    if (!std.ascii.isHex(first)) {
        if (first == 0) {
            // A NUL byte is never emitted verbatim; it maps to U+FFFD just
            // like an unescaped NUL does in the identifier parser.
            return .{ .bytes = replacement, .consumed = 1 };
        }

        // Simple escape: the byte represents itself. Trailing bytes of a
        // multi-byte UTF-8 sequence are left for the caller to copy.
        const buf = try arena.alloc(u8, 1);
        buf[0] = first;
        return .{ .bytes = buf, .consumed = 1 };
    }

    // Hex escape: accumulate up to 6 hex digits (max value 0xFFFFFF fits u32).
    var code_point: u32 = 0;
    var consumed: usize = 0;
    while (consumed < 6 and consumed < input.len) : (consumed += 1) {
        const digit = std.fmt.charToDigit(input[consumed], 16) catch break;
        code_point = code_point * 16 + digit;
    }

    // Swallow one optional trailing whitespace. CR LF is a single newline, so
    // both bytes are consumed together.
    if (consumed < input.len) {
        switch (input[consumed]) {
            ' ', '\t', '\n', '\x0C' => consumed += 1,
            '\r' => {
                consumed += 1;
                if (consumed < input.len and input[consumed] == '\n') {
                    consumed += 1;
                }
            },
            else => {},
        }
    }

    // Invalid code points: 0, beyond U+10FFFF, or a surrogate (U+D800-U+DFFF).
    const is_surrogate = code_point >= 0xD800 and code_point <= 0xDFFF;
    if (code_point == 0 or code_point > 0x10FFFF or is_surrogate) {
        return .{ .bytes = replacement, .consumed = consumed };
    }

    // Encode as UTF-8; 4 bytes is the longest possible encoding.
    const buf = try arena.alloc(u8, 4);
    const len = std.unicode.utf8Encode(@intCast(code_point), buf) catch {
        return .{ .bytes = replacement, .consumed = consumed };
    };
    return .{ .bytes = buf[0..len], .consumed = consumed };
}
const testing = @import("../../../testing.zig"); const testing = @import("../../../testing.zig");
test "Selector: Parser.ID" { test "Selector: Parser.ID" {
const arena = testing.allocator; const arena = testing.allocator;
@@ -1072,12 +1068,14 @@ test "Selector: Parser.ID" {
{ {
var parser = Parser{ .input = "#--" }; var parser = Parser{ .input = "#--" };
try testing.expectError(error.InvalidIDSelector, parser.id(arena)); try testing.expectEqual("--", try parser.id(arena));
try testing.expectEqual("", parser.input);
} }
{ {
var parser = Parser{ .input = "#--test" }; var parser = Parser{ .input = "#--test" };
try testing.expectError(error.InvalidIDSelector, parser.id(arena)); try testing.expectEqual("--test", try parser.id(arena));
try testing.expectEqual("", parser.input);
} }
{ {
@@ -1187,12 +1185,14 @@ test "Selector: Parser.class" {
{ {
var parser = Parser{ .input = ".--" }; var parser = Parser{ .input = ".--" };
try testing.expectError(error.InvalidClassSelector, parser.class(arena)); try testing.expectEqual("--", try parser.class(arena));
try testing.expectEqual("", parser.input);
} }
{ {
var parser = Parser{ .input = ".--test" }; var parser = Parser{ .input = ".--test" };
try testing.expectError(error.InvalidClassSelector, parser.class(arena)); try testing.expectEqual("--test", try parser.class(arena));
try testing.expectEqual("", parser.input);
} }
{ {