From 27ffea905243772d432a571f9a19688cc1021315 Mon Sep 17 00:00:00 2001 From: nikneym Date: Mon, 15 Sep 2025 11:15:09 +0300 Subject: [PATCH 1/5] add vectorized `parseCharset` impl --- src/browser/mime.zig | 89 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/src/browser/mime.zig b/src/browser/mime.zig index 0561cab5..ab3582f0 100644 --- a/src/browser/mime.zig +++ b/src/browser/mime.zig @@ -52,6 +52,95 @@ pub const Mime = struct { other: struct { type: []const u8, sub_type: []const u8 }, }; + /// Removes quotes of value if quotes are given. + /// + /// Currently we don't validate the charset. + /// See section 2.3 Naming Requirements: + /// https://datatracker.ietf.org/doc/rfc2978/ + fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 { + // Cannot be larger than 40. + // https://datatracker.ietf.org/doc/rfc2978/ + if (value.len > 40) return error.CharsetTooBig; + + // If the first char is not a quote, value can be used directly. + // Whitespace is not allowed. + if (value[0] != '"') { + return value; + } + + // Search for second quote begins. + // Skip the first character. + var offset: usize = 1; + + // Charset values are not so large; 128-bit registers should be + // more than enough. + const vec_size = 16; + const Vec = @Vector(vec_size, u8); + const UInt = std.meta.Int(.unsigned, vec_size); + const block_size = @sizeOf(u64); + + const charset = blk: { + // Vector search. + while (value.len - offset >= vec_size) : (offset += vec_size) { + // Fill a vector with quotes. + const quotes: Vec = @splat('"'); + const chunk: Vec = value[offset..][0..vec_size].*; + + // Check if chunk has double quote byte. + const match = @intFromBool(chunk == quotes); + // Create an integer out of match and count how much to skip. + const skip_by = @ctz(@as(UInt, @bitCast(match))); + + // Found a match. + if (skip_by != vec_size) { + break :blk value[1 .. offset + skip_by]; + } + } + + // SWAR search. 
+ while (value.len - offset >= block_size) : (offset += block_size) { + // Magic number for integer filled with double quote. + // [8]u8{ '"', '"', '"', '"', '"', '"', '"', '"' }. + const quotes: u64 = 0x2222222222222222; + // Load the next chunk as unsigned 64-bit integer. + const chunk: u64 = @bitCast(value[offset..][0..block_size].*); + + // XOR with the pattern - bytes equal to quote become 0. + const xor_result = chunk ^ quotes; + + const magic: u64 = 0x8080808080808080; // High bit mask for each byte. + const sub_result = xor_result -% 0x0101010101010101; // Subtract 1 from each byte. + const and_result = sub_result & (~xor_result); // AND with inverted original. + const zero_mask = and_result & magic; // Extract high bits (indicates zero bytes). + + // Found a match. + if (zero_mask != 0) { + // * Count trailing zeroes. + // * Dividing by byte size (>> 3) converts the bit position to byte index. + const skip_by = @ctz(zero_mask) >> 3; + break :blk value[1 .. offset + skip_by]; + } + } + + // Fallback to scalar search. + for (value[offset..], 0..) |c, i| { + if (c == '"') { + break :blk value[1 .. offset + i]; + } + } + + // No quote pairs, something is wrong. + return error.Invalid; + }; + + // Make sure we don't end up w/ empty buffer. 
+ if (charset.len == 0) { + return error.Invalid; + } + + return charset; + } + pub fn parse(input: []u8) !Mime { if (input.len > 255) { return error.TooBig; From 974f350f27130dddd206845a4e7ee152a72c706e Mon Sep 17 00:00:00 2001 From: nikneym Date: Mon, 15 Sep 2025 15:15:08 +0300 Subject: [PATCH 2/5] store charset value directly in `Mime` --- src/browser/mime.zig | 50 +++++++++++++++-------------------------- src/browser/page.zig | 4 ++-- src/browser/xhr/xhr.zig | 2 +- 3 files changed, 21 insertions(+), 35 deletions(-) diff --git a/src/browser/mime.zig b/src/browser/mime.zig index ab3582f0..7723231a 100644 --- a/src/browser/mime.zig +++ b/src/browser/mime.zig @@ -22,13 +22,15 @@ const Allocator = std.mem.Allocator; pub const Mime = struct { content_type: ContentType, params: []const u8 = "", - charset: ?[:0]const u8 = null, + // IANA defines max. charset value length as 40. + // We keep 41 for null-termination since HTML parser expects in this format. + charset: [41]u8 = default_charset, - pub const unknown = Mime{ - .params = "", - .charset = null, - .content_type = .{ .unknown = {} }, - }; + /// String "UTF-8" continued by null characters. + pub const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36; + + /// Mime with unknown Content-Type, empty params and empty charset. + pub const unknown = Mime{ .content_type = .{ .unknown = {} } }; pub const ContentTypeEnum = enum { text_xml, @@ -52,6 +54,11 @@ pub const Mime = struct { other: struct { type: []const u8, sub_type: []const u8 }, }; + /// Returns the null-terminated charset value. + pub inline fn charsetString(mime: *const Mime) [:0]const u8 { + return @ptrCast(&mime.charset); + } + /// Removes quotes of value if quotes are given. /// /// Currently we don't validate the charset. 
@@ -158,7 +165,7 @@ pub const Mime = struct { const params = trimLeft(normalized[type_len..]); - var charset: ?[:0]const u8 = null; + var charset: [41]u8 = undefined; var it = std.mem.splitScalar(u8, params, ';'); while (it.next()) |attr| { @@ -176,35 +183,14 @@ pub const Mime = struct { switch (attribute_name) { .charset => { - // We used to have a proper value parser, but we currently - // only care about the charset attribute, plus only about - // the UTF-8 value. It's a lot easier to do it this way, - // and it doesn't require an allocation to (a) unescape the - // value or (b) ensure the correct lifetime. if (value.len == 0) { break; } - var attribute_value = value; - if (value[0] == '"') { - if (value.len < 3 or value[value.len - 1] != '"') { - return error.Invalid; - } - attribute_value = value[1 .. value.len - 1]; - } - if (std.ascii.eqlIgnoreCase(attribute_value, "utf-8")) { - charset = "UTF-8"; - } else if (std.ascii.eqlIgnoreCase(attribute_value, "iso-8859-1")) { - charset = "ISO-8859-1"; - } else { - // we only care about null (which we default to UTF-8) - // or UTF-8. If this is actually set (i.e. not null) - // and isn't UTF-8, we'll just put a dummy value. If - // we want to capture the actual value, we'll need to - // dupe/allocate it. Since, for now, we don't need that - // we can avoid the allocation. - charset = "lightpanda:UNSUPPORTED"; - } + const attribute_value = try parseCharset(value); + @memcpy(charset[0..attribute_value.len], attribute_value); + // Fill the rest with zeroes. 
+ @memset(charset[attribute_value.len..], 0); }, } } diff --git a/src/browser/page.zig b/src/browser/page.zig index 870d0fe0..4f2aa88a 100644 --- a/src/browser/page.zig +++ b/src/browser/page.zig @@ -672,14 +672,14 @@ pub const Page = struct { log.debug(.http, "navigate first chunk", .{ .content_type = mime.content_type, .len = data.len }); self.mode = switch (mime.content_type) { - .text_html => .{ .html = try parser.Parser.init(mime.charset orelse "UTF-8") }, + .text_html => .{ .html = try parser.Parser.init(mime.charsetString()) }, .application_json, .text_javascript, .text_css, .text_plain, => blk: { - var p = try parser.Parser.init(mime.charset orelse "UTF-8"); + var p = try parser.Parser.init(mime.charsetString()); try p.process("
");
                     break :blk .{ .text = p };
                 },
diff --git a/src/browser/xhr/xhr.zig b/src/browser/xhr/xhr.zig
index 93f07b7c..97d18536 100644
--- a/src/browser/xhr/xhr.zig
+++ b/src/browser/xhr/xhr.zig
@@ -679,7 +679,7 @@ pub const XMLHttpRequest = struct {
         }
 
         var fbs = std.io.fixedBufferStream(self.response_bytes.items);
-        const doc = parser.documentHTMLParse(fbs.reader(), mime.charset orelse "UTF-8") catch {
+        const doc = parser.documentHTMLParse(fbs.reader(), mime.charsetString()) catch {
             self.response_obj = .{ .Failure = {} };
             return;
         };

From 2e68407fbe155f46f2fb5c8337673a45ad6f239d Mon Sep 17 00:00:00 2001
From: nikneym 
Date: Mon, 15 Sep 2025 15:15:29 +0300
Subject: [PATCH 3/5] update `Mime` tests

---
 src/browser/mime.zig | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/browser/mime.zig b/src/browser/mime.zig
index 7723231a..babf7072 100644
--- a/src/browser/mime.zig
+++ b/src/browser/mime.zig
@@ -438,21 +438,33 @@ test "Mime: parse charset" {
 
     try expect(.{
         .content_type = .{ .text_xml = {} },
-        .charset = "UTF-8",
+        .charset = "utf-8",
         .params = "charset=utf-8",
     }, "text/xml; charset=utf-8");
 
     try expect(.{
         .content_type = .{ .text_xml = {} },
-        .charset = "UTF-8",
+        .charset = "utf-8",
         .params = "charset=\"utf-8\"",
-    }, "text/xml;charset=\"utf-8\"");
+    }, "text/xml;charset=\"UTF-8\"");
+
+    try expect(.{
+        .content_type = .{ .text_html = {} },
+        .charset = "iso-8859-1",
+        .params = "charset=\"iso-8859-1\"",
+    }, "text/html; charset=\"iso-8859-1\"");
+
+    try expect(.{
+        .content_type = .{ .text_html = {} },
+        .charset = "iso-8859-1",
+        .params = "charset=\"iso-8859-1\"",
+    }, "text/html; charset=\"ISO-8859-1\"");
 
     try expect(.{
         .content_type = .{ .text_xml = {} },
-        .charset = "lightpanda:UNSUPPORTED",
-        .params = "charset=\"\\\\ \\\" \"",
-    }, "text/xml;charset=\"\\\\ \\\" \"   ");
+        .charset = "custom-non-standard-charset-value",
+        .params = "charset=\"custom-non-standard-charset-value\"",
+    }, "text/xml;charset=\"custom-non-standard-charset-value\"");
 }
 
 test "Mime: isHTML" {
@@ -565,8 +577,10 @@ fn expect(expected: Expectation, input: []const u8) !void {
     try testing.expectEqual(expected.params, actual.params);
 
     if (expected.charset) |ec| {
-        try testing.expectEqual(ec, actual.charset.?);
+        // Compare only the first `ec.len` bytes; anything after them is not checked.
+        try testing.expectEqual(ec, actual.charsetString()[0..ec.len]);
     } else {
-        try testing.expectEqual(null, actual.charset);
+        const m: Mime = .unknown;
+        try testing.expectEqual(m.charsetString(), actual.charsetString());
     }
 }

From c05470515f8b20a1fa31ffacf0c44debb24b8f29 Mon Sep 17 00:00:00 2001
From: nikneym 
Date: Tue, 16 Sep 2025 10:40:38 +0300
Subject: [PATCH 4/5] double quotes must be the first and last characters of
 the slice if provided

---
 src/browser/mime.zig | 84 +++++---------------------------------------
 1 file changed, 9 insertions(+), 75 deletions(-)

diff --git a/src/browser/mime.zig b/src/browser/mime.zig
index babf7072..f1860b1f 100644
--- a/src/browser/mime.zig
+++ b/src/browser/mime.zig
@@ -69,83 +69,17 @@ pub const Mime = struct {
         // https://datatracker.ietf.org/doc/rfc2978/
         if (value.len > 40) return error.CharsetTooBig;
 
-        // If the first char is not a quote, value can be used directly.
-        // Whitespace is not allowed.
-        if (value[0] != '"') {
-            return value;
+        // If the first char is a quote, the value must also end with one.
+        if (value[0] == '"') {
+            if (value.len < 3 or value[value.len - 1] != '"') {
+                return error.Invalid;
+            }
+
+            return value[1 .. value.len - 1];
         }
 
-        // Search for second quote begins.
-        // Skip the first character.
-        var offset: usize = 1;
-
-        // Charset values are not so large; 128-bit registers should be
-        // more than enough.
-        const vec_size = 16;
-        const Vec = @Vector(vec_size, u8);
-        const UInt = std.meta.Int(.unsigned, vec_size);
-        const block_size = @sizeOf(u64);
-
-        const charset = blk: {
-            // Vector search.
-            while (value.len - offset >= vec_size) : (offset += vec_size) {
-                // Fill a vector with quotes.
-                const quotes: Vec = @splat('"');
-                const chunk: Vec = value[offset..][0..vec_size].*;
-
-                // Check if chunk has double quote byte.
-                const match = @intFromBool(chunk == quotes);
-                // Create an integer out of match and count how much to skip.
-                const skip_by = @ctz(@as(UInt, @bitCast(match)));
-
-                // Found a match.
-                if (skip_by != vec_size) {
-                    break :blk value[1 .. offset + skip_by];
-                }
-            }
-
-            // SWAR search.
-            while (value.len - offset >= block_size) : (offset += block_size) {
-                // Magic number for integer filled with double quote.
-                // [8]u8{ '"', '"', '"', '"', '"', '"', '"', '"' }.
-                const quotes: u64 = 0x2222222222222222;
-                // Load the next chunk as unsigned 64-bit integer.
-                const chunk: u64 = @bitCast(value[offset..][0..block_size].*);
-
-                // XOR with the pattern - bytes equal to quote become 0.
-                const xor_result = chunk ^ quotes;
-
-                const magic: u64 = 0x8080808080808080; // High bit mask for each byte.
-                const sub_result = xor_result -% 0x0101010101010101; // Subtract 1 from each byte.
-                const and_result = sub_result & (~xor_result); // AND with inverted original.
-                const zero_mask = and_result & magic; // Extract high bits (indicates zero bytes).
-
-                // Found a match.
-                if (zero_mask != 0) {
-                    // * Count trailing zeroes.
-                    // * Dividing by byte size (>> 3) converts the bit position to byte index.
-                    const skip_by = @ctz(zero_mask) >> 3;
-                    break :blk value[1 .. offset + skip_by];
-                }
-            }
-
-            // Fallback to scalar search.
-            for (value[offset..], 0..) |c, i| {
-                if (c == '"') {
-                    break :blk value[1 .. offset + i];
-                }
-            }
-
-            // No quote pairs, something is wrong.
-            return error.Invalid;
-        };
-
-        // Make sure we don't end up w/ empty buffer.
-        if (charset.len == 0) {
-            return error.Invalid;
-        }
-
-        return charset;
+        // No quotes.
+        return value;
     }
 
     pub fn parse(input: []u8) !Mime {

From 90a96fd8a79cfe45f22d5ac4eb65f2da86a82a64 Mon Sep 17 00:00:00 2001
From: nikneym 
Date: Tue, 16 Sep 2025 10:41:49 +0300
Subject: [PATCH 5/5] set a zero char right after attrib value instead of
 memset

---
 src/browser/mime.zig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/browser/mime.zig b/src/browser/mime.zig
index f1860b1f..33ab9958 100644
--- a/src/browser/mime.zig
+++ b/src/browser/mime.zig
@@ -123,8 +123,8 @@ pub const Mime = struct {
 
                     const attribute_value = try parseCharset(value);
                     @memcpy(charset[0..attribute_value.len], attribute_value);
-                    // Fill the rest with zeroes.
-                    @memset(charset[attribute_value.len..], 0);
+                    // Null-terminate right after the value; later bytes are left untouched.
+                    charset[attribute_value.len] = 0;
                 },
             }
         }