From 3dcdaa0a9bf7c4a138f70dab4672a48d79276c5a Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Sat, 14 Mar 2026 14:15:40 -0700 Subject: [PATCH 1/2] Implement charset detection from first 1024 bytes of HTML Per the HTML spec, browsers should detect charset from <meta> tags in the first 1024 bytes of a document when the HTTP Content-Type header doesn't specify one. Adds Mime.prescanCharset() which scans for: - <meta charset="X"> - <meta http-equiv="Content-Type" content="...;charset=X"> Integrates into the page loading flow to set the detected charset on the Mime when no explicit HTTP charset was provided. Fixes #531 --- src/browser/Mime.zig | 160 +++++++++++++++++++++++++++++++++++++++++++ src/browser/Page.zig | 14 +++- 2 files changed, 173 insertions(+), 1 deletion(-) diff --git a/src/browser/Mime.zig b/src/browser/Mime.zig index 43ca3632..beef2177 100644 --- a/src/browser/Mime.zig +++ b/src/browser/Mime.zig @@ -168,6 +168,137 @@ pub fn parse(input: []u8) !Mime { }; } +/// Prescan the first 1024 bytes of an HTML document for a charset declaration. +/// Looks for `<meta charset="X">` and `<meta http-equiv="Content-Type" content="...;charset=X">`. +/// Returns the charset value or null if none found. 
+/// See: https://www.w3.org/International/questions/qa-html-encoding-declarations +pub fn prescanCharset(html: []const u8) ?[]const u8 { + const limit = @min(html.len, 1024); + const data = html[0..limit]; + + // Scan for = data.len) return null; + + // Check for "meta" (case-insensitive) + if (pos + 4 >= data.len) return null; + var tag_buf: [4]u8 = undefined; + _ = std.ascii.lowerString(&tag_buf, data[pos..][0..4]); + if (!std.mem.eql(u8, &tag_buf, "meta")) { + continue; + } + pos += 4; + + // Must be followed by whitespace or end of tag + if (pos >= data.len) return null; + if (data[pos] != ' ' and data[pos] != '\t' and data[pos] != '\n' and + data[pos] != '\r' and data[pos] != '/') + { + continue; + } + + // Scan attributes within this meta tag + const tag_end = std.mem.indexOfScalarPos(u8, data, pos, '>') orelse return null; + const attrs = data[pos..tag_end]; + + // Look for charset= attribute directly + if (findAttrValue(attrs, "charset")) |charset| { + if (charset.len > 0 and charset.len <= 40) return charset; + } + + // Look for http-equiv="content-type" with content="...;charset=X" + if (findAttrValue(attrs, "http-equiv")) |he| { + if (asciiEqlIgnoreCase(he, "content-type")) { + if (findAttrValue(attrs, "content")) |content| { + if (extractCharsetFromContentType(content)) |charset| { + return charset; + } + } + } + } + + pos = tag_end + 1; + } + return null; +} + +fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 { + var pos: usize = 0; + while (pos < attrs.len) { + // Skip whitespace + while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t' or + attrs[pos] == '\n' or attrs[pos] == '\r')) + { + pos += 1; + } + if (pos >= attrs.len) return null; + + // Read attribute name + const attr_start = pos; + while (pos < attrs.len and attrs[pos] != '=' and attrs[pos] != ' ' and + attrs[pos] != '\t' and attrs[pos] != '>' and attrs[pos] != '/') + { + pos += 1; + } + const attr_name = attrs[attr_start..pos]; + + // Skip whitespace around = 
+ while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1; + if (pos >= attrs.len or attrs[pos] != '=') continue; + pos += 1; // skip '=' + while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1; + if (pos >= attrs.len) return null; + + // Read attribute value + const value = blk: { + if (attrs[pos] == '"' or attrs[pos] == '\'') { + const quote = attrs[pos]; + pos += 1; + const val_start = pos; + while (pos < attrs.len and attrs[pos] != quote) pos += 1; + const val = attrs[val_start..pos]; + if (pos < attrs.len) pos += 1; // skip closing quote + break :blk val; + } else { + const val_start = pos; + while (pos < attrs.len and attrs[pos] != ' ' and attrs[pos] != '\t' and + attrs[pos] != '>' and attrs[pos] != '/') + { + pos += 1; + } + break :blk attrs[val_start..pos]; + } + }; + + if (asciiEqlIgnoreCase(attr_name, name)) return value; + } + return null; +} + +fn extractCharsetFromContentType(content: []const u8) ?[]const u8 { + var it = std.mem.splitScalar(u8, content, ';'); + while (it.next()) |part| { + const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' }); + if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) { + const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' }); + if (val.len > 0 and val.len <= 40) return val; + } + } + return null; +} + +fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool { + if (a.len != b.len) return false; + for (a, b) |ca, cb| { + if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false; + } + return true; +} + pub fn sniff(body: []const u8) ?Mime { // 0x0C is form feed const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C }); @@ -576,3 +707,32 @@ fn expect(expected: Expectation, input: []const u8) !void { try testing.expectEqual(m.charsetStringZ(), actual.charsetStringZ()); } } + +test "Mime: prescanCharset" { + // + try testing.expectEqual("utf-8", Mime.prescanCharset("").?); + try testing.expectEqual("iso-8859-1", 
Mime.prescanCharset("").?); + try testing.expectEqual("shift_jis", Mime.prescanCharset("").?); + + // Case-insensitive tag matching + try testing.expectEqual("utf-8", Mime.prescanCharset("").?); + try testing.expectEqual("utf-8", Mime.prescanCharset("").?); + + // + try testing.expectEqual( + "iso-8859-1", + Mime.prescanCharset("").?, + ); + + // No charset found + try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("Test")); + try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("")); + try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here")); + + // Charset after 1024 bytes should not be found + var long_html: [1100]u8 = undefined; + @memset(&long_html, ' '); + const suffix = ""; + @memcpy(long_html[1050 .. 1050 + suffix.len], suffix); + try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html)); +} diff --git a/src/browser/Page.zig b/src/browser/Page.zig index cb62cb31..ab291bc6 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -848,13 +848,25 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void { if (self._parse_state == .pre) { // we lazily do this, because we might need the first chunk of data // to sniff the content type - const mime: Mime = blk: { + var mime: Mime = blk: { if (transfer.response_header.?.contentType()) |ct| { break :blk try Mime.parse(ct); } break :blk Mime.sniff(data); } orelse .unknown; + // If the HTTP header didn't specify a charset and this is HTML, + // prescan the first 1024 bytes for a declaration. 
+ if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) { + if (Mime.prescanCharset(data)) |charset| { + if (charset.len <= 40) { + @memcpy(mime.charset[0..charset.len], charset); + mime.charset[charset.len] = 0; + mime.charset_len = charset.len; + } + } + } + if (comptime IS_DEBUG) { log.debug(.page, "navigate first chunk", .{ .content_type = mime.content_type, From b373fb4a424119c314da7c8a62aa3811c6a4698c Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Sun, 15 Mar 2026 21:20:45 -0700 Subject: [PATCH 2/2] Address review feedback: fix endless loop, use stdlib, add charset flag - Use std.ascii.eqlIgnoreCase instead of custom asciiEqlIgnoreCase - Fix infinite loop in findAttrValue when attribute has no '=' sign (e.g. self-closing <meta />) - Add is_default_charset flag to Mime struct so prescan only overrides charset when Content-Type header didn't set one explicitly - Add regression test for the self-closing meta loop case Co-Authored-By: Claude Opus 4.6 --- src/browser/Mime.zig | 35 +++++++++++++++++++---------------- src/browser/Page.zig | 4 ++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/browser/Mime.zig b/src/browser/Mime.zig index beef2177..13951259 100644 --- a/src/browser/Mime.zig +++ b/src/browser/Mime.zig @@ -25,6 +25,7 @@ params: []const u8 = "", // We keep 41 for null-termination since HTML parser expects in this format. charset: [41]u8 = default_charset, charset_len: usize = default_charset_len, +is_default_charset: bool = true, /// String "UTF-8" continued by null characters. 
const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36; @@ -130,6 +131,7 @@ pub fn parse(input: []u8) !Mime { var charset: [41]u8 = default_charset; var charset_len: usize = default_charset_len; + var has_explicit_charset = false; var it = std.mem.splitScalar(u8, params, ';'); while (it.next()) |attr| { @@ -156,6 +158,7 @@ pub fn parse(input: []u8) !Mime { // Null-terminate right after attribute value. charset[attribute_value.len] = 0; charset_len = attribute_value.len; + has_explicit_charset = true; }, } } @@ -165,6 +168,7 @@ pub fn parse(input: []u8) !Mime { .charset = charset, .charset_len = charset_len, .content_type = content_type, + .is_default_charset = !has_explicit_charset, }; } @@ -212,7 +216,7 @@ pub fn prescanCharset(html: []const u8) ?[]const u8 { // Look for http-equiv="content-type" with content="...;charset=X" if (findAttrValue(attrs, "http-equiv")) |he| { - if (asciiEqlIgnoreCase(he, "content-type")) { + if (std.ascii.eqlIgnoreCase(he, "content-type")) { if (findAttrValue(attrs, "content")) |content| { if (extractCharsetFromContentType(content)) |charset| { return charset; @@ -248,7 +252,11 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 { // Skip whitespace around = while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1; - if (pos >= attrs.len or attrs[pos] != '=') continue; + if (pos >= attrs.len or attrs[pos] != '=') { + // No '=' found - skip this token. Advance at least one byte to avoid infinite loop. 
+ if (pos == attr_start) pos += 1; + continue; + } pos += 1; // skip '=' while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1; if (pos >= attrs.len) return null; @@ -274,7 +282,7 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 { } }; - if (asciiEqlIgnoreCase(attr_name, name)) return value; + if (std.ascii.eqlIgnoreCase(attr_name, name)) return value; } return null; } @@ -283,7 +291,7 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 { var it = std.mem.splitScalar(u8, content, ';'); while (it.next()) |part| { const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' }); - if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) { + if (trimmed.len > 8 and std.ascii.eqlIgnoreCase(trimmed[0..8], "charset=")) { const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' }); if (val.len > 0 and val.len <= 40) return val; } @@ -291,14 +299,6 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 { return null; } -fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool { - if (a.len != b.len) return false; - for (a, b) |ca, cb| { - if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false; - } - return true; -} - pub fn sniff(body: []const u8) ?Mime { // 0x0C is form feed const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C }); @@ -725,14 +725,17 @@ test "Mime: prescanCharset" { ); // No charset found - try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("Test")); - try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("")); - try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here")); + try testing.expectEqual(null, Mime.prescanCharset("Test")); + try testing.expectEqual(null, Mime.prescanCharset("")); + try testing.expectEqual(null, Mime.prescanCharset("no html here")); + + // Self-closing meta without charset must not loop forever + try testing.expectEqual(null, 
Mime.prescanCharset("")); // Charset after 1024 bytes should not be found var long_html: [1100]u8 = undefined; @memset(&long_html, ' '); const suffix = ""; @memcpy(long_html[1050 .. 1050 + suffix.len], suffix); - try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html)); + try testing.expectEqual(null, Mime.prescanCharset(&long_html)); } diff --git a/src/browser/Page.zig b/src/browser/Page.zig index ab291bc6..9f7a22a1 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -855,9 +855,9 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void { break :blk Mime.sniff(data); } orelse .unknown; - // If the HTTP header didn't specify a charset and this is HTML, + // If the HTTP Content-Type header didn't specify a charset and this is HTML, // prescan the first 1024 bytes for a declaration. - if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) { + if (mime.content_type == .text_html and mime.is_default_charset) { if (Mime.prescanCharset(data)) |charset| { if (charset.len <= 40) { @memcpy(mime.charset[0..charset.len], charset);