From 3dcdaa0a9bf7c4a138f70dab4672a48d79276c5a Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Sat, 14 Mar 2026 14:15:40 -0700
Subject: [PATCH 1/2] Implement charset detection from first 1024 bytes of HTML
Per the HTML spec, browsers should detect charset from <meta> tags
in the first 1024 bytes of a document when the HTTP Content-Type
header doesn't specify one.
Adds Mime.prescanCharset() which scans for:
- <meta charset="X">
- <meta http-equiv="Content-Type" content="...;charset=X">
Integrates into the page loading flow to set the detected charset
on the Mime when no explicit HTTP charset was provided.
Fixes #531
---
src/browser/Mime.zig | 160 +++++++++++++++++++++++++++++++++++++++++++
src/browser/Page.zig | 14 +++-
2 files changed, 173 insertions(+), 1 deletion(-)
diff --git a/src/browser/Mime.zig b/src/browser/Mime.zig
index 43ca3632..beef2177 100644
--- a/src/browser/Mime.zig
+++ b/src/browser/Mime.zig
@@ -168,6 +168,137 @@ pub fn parse(input: []u8) !Mime {
};
}
+/// Prescan the first 1024 bytes of an HTML document for a charset declaration.
+/// Looks for `<meta charset="X">` and `<meta http-equiv="Content-Type" content="...;charset=X">`.
+/// Returns the charset value or null if none found.
+/// See: https://www.w3.org/International/questions/qa-html-encoding-declarations
+pub fn prescanCharset(html: []const u8) ?[]const u8 {
+ const limit = @min(html.len, 1024);
+ const data = html[0..limit];
+
+ // Scan for = data.len) return null;
+
+ // Check for "meta" (case-insensitive)
+ if (pos + 4 >= data.len) return null;
+ var tag_buf: [4]u8 = undefined;
+ _ = std.ascii.lowerString(&tag_buf, data[pos..][0..4]);
+ if (!std.mem.eql(u8, &tag_buf, "meta")) {
+ continue;
+ }
+ pos += 4;
+
+ // Must be followed by whitespace or end of tag
+ if (pos >= data.len) return null;
+ if (data[pos] != ' ' and data[pos] != '\t' and data[pos] != '\n' and
+ data[pos] != '\r' and data[pos] != '/')
+ {
+ continue;
+ }
+
+ // Scan attributes within this meta tag
+ const tag_end = std.mem.indexOfScalarPos(u8, data, pos, '>') orelse return null;
+ const attrs = data[pos..tag_end];
+
+ // Look for charset= attribute directly
+ if (findAttrValue(attrs, "charset")) |charset| {
+ if (charset.len > 0 and charset.len <= 40) return charset;
+ }
+
+ // Look for http-equiv="content-type" with content="...;charset=X"
+ if (findAttrValue(attrs, "http-equiv")) |he| {
+ if (asciiEqlIgnoreCase(he, "content-type")) {
+ if (findAttrValue(attrs, "content")) |content| {
+ if (extractCharsetFromContentType(content)) |charset| {
+ return charset;
+ }
+ }
+ }
+ }
+
+ pos = tag_end + 1;
+ }
+ return null;
+}
+
+fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
+ var pos: usize = 0;
+ while (pos < attrs.len) {
+ // Skip whitespace
+ while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t' or
+ attrs[pos] == '\n' or attrs[pos] == '\r'))
+ {
+ pos += 1;
+ }
+ if (pos >= attrs.len) return null;
+
+ // Read attribute name
+ const attr_start = pos;
+ while (pos < attrs.len and attrs[pos] != '=' and attrs[pos] != ' ' and
+ attrs[pos] != '\t' and attrs[pos] != '>' and attrs[pos] != '/')
+ {
+ pos += 1;
+ }
+ const attr_name = attrs[attr_start..pos];
+
+ // Skip whitespace around =
+ while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
+ if (pos >= attrs.len or attrs[pos] != '=') continue;
+ pos += 1; // skip '='
+ while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
+ if (pos >= attrs.len) return null;
+
+ // Read attribute value
+ const value = blk: {
+ if (attrs[pos] == '"' or attrs[pos] == '\'') {
+ const quote = attrs[pos];
+ pos += 1;
+ const val_start = pos;
+ while (pos < attrs.len and attrs[pos] != quote) pos += 1;
+ const val = attrs[val_start..pos];
+ if (pos < attrs.len) pos += 1; // skip closing quote
+ break :blk val;
+ } else {
+ const val_start = pos;
+ while (pos < attrs.len and attrs[pos] != ' ' and attrs[pos] != '\t' and
+ attrs[pos] != '>' and attrs[pos] != '/')
+ {
+ pos += 1;
+ }
+ break :blk attrs[val_start..pos];
+ }
+ };
+
+ if (asciiEqlIgnoreCase(attr_name, name)) return value;
+ }
+ return null;
+}
+
+fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
+ var it = std.mem.splitScalar(u8, content, ';');
+ while (it.next()) |part| {
+ const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
+ if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) {
+ const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
+ if (val.len > 0 and val.len <= 40) return val;
+ }
+ }
+ return null;
+}
+
+fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool {
+ if (a.len != b.len) return false;
+ for (a, b) |ca, cb| {
+ if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
+ }
+ return true;
+}
+
pub fn sniff(body: []const u8) ?Mime {
// 0x0C is form feed
const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
@@ -576,3 +707,32 @@ fn expect(expected: Expectation, input: []const u8) !void {
try testing.expectEqual(m.charsetStringZ(), actual.charsetStringZ());
}
}
+
+test "Mime: prescanCharset" {
+ // <meta charset="X">
+ try testing.expectEqual("utf-8", Mime.prescanCharset("
").?);
+ try testing.expectEqual("iso-8859-1", Mime.prescanCharset("").?);
+ try testing.expectEqual("shift_jis", Mime.prescanCharset("").?);
+
+ // Case-insensitive tag matching
+ try testing.expectEqual("utf-8", Mime.prescanCharset("").?);
+ try testing.expectEqual("utf-8", Mime.prescanCharset("").?);
+
+ // <meta http-equiv="Content-Type" content="...;charset=X">
+ try testing.expectEqual(
+ "iso-8859-1",
+ Mime.prescanCharset("").?,
+ );
+
+ // No charset found
+ try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("Test"));
+ try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(""));
+ try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here"));
+
+ // Charset after 1024 bytes should not be found
+ var long_html: [1100]u8 = undefined;
+ @memset(&long_html, ' ');
+ const suffix = "";
+ @memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
+ try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html));
+}
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index cb62cb31..ab291bc6 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -848,13 +848,25 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
if (self._parse_state == .pre) {
// we lazily do this, because we might need the first chunk of data
// to sniff the content type
- const mime: Mime = blk: {
+ var mime: Mime = blk: {
if (transfer.response_header.?.contentType()) |ct| {
break :blk try Mime.parse(ct);
}
break :blk Mime.sniff(data);
} orelse .unknown;
+ // If the HTTP header didn't specify a charset and this is HTML,
+ // prescan the first 1024 bytes for a declaration.
+ if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) {
+ if (Mime.prescanCharset(data)) |charset| {
+ if (charset.len <= 40) {
+ @memcpy(mime.charset[0..charset.len], charset);
+ mime.charset[charset.len] = 0;
+ mime.charset_len = charset.len;
+ }
+ }
+ }
+
if (comptime IS_DEBUG) {
log.debug(.page, "navigate first chunk", .{
.content_type = mime.content_type,
From b373fb4a424119c314da7c8a62aa3811c6a4698c Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Sun, 15 Mar 2026 21:20:45 -0700
Subject: [PATCH 2/2] Address review feedback: fix endless loop, use stdlib,
add charset flag
- Use std.ascii.eqlIgnoreCase instead of custom asciiEqlIgnoreCase
- Fix infinite loop in findAttrValue when attribute has no '=' sign
(e.g. self-closing <meta ... />)
- Add is_default_charset flag to Mime struct so prescan only overrides
charset when Content-Type header didn't set one explicitly
- Add regression test for the self-closing meta loop case
Co-Authored-By: Claude Opus 4.6
---
src/browser/Mime.zig | 35 +++++++++++++++++++----------------
src/browser/Page.zig | 4 ++--
2 files changed, 21 insertions(+), 18 deletions(-)
diff --git a/src/browser/Mime.zig b/src/browser/Mime.zig
index beef2177..13951259 100644
--- a/src/browser/Mime.zig
+++ b/src/browser/Mime.zig
@@ -25,6 +25,7 @@ params: []const u8 = "",
// We keep 41 for null-termination since HTML parser expects in this format.
charset: [41]u8 = default_charset,
charset_len: usize = default_charset_len,
+is_default_charset: bool = true,
/// String "UTF-8" continued by null characters.
const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
@@ -130,6 +131,7 @@ pub fn parse(input: []u8) !Mime {
var charset: [41]u8 = default_charset;
var charset_len: usize = default_charset_len;
+ var has_explicit_charset = false;
var it = std.mem.splitScalar(u8, params, ';');
while (it.next()) |attr| {
@@ -156,6 +158,7 @@ pub fn parse(input: []u8) !Mime {
// Null-terminate right after attribute value.
charset[attribute_value.len] = 0;
charset_len = attribute_value.len;
+ has_explicit_charset = true;
},
}
}
@@ -165,6 +168,7 @@ pub fn parse(input: []u8) !Mime {
.charset = charset,
.charset_len = charset_len,
.content_type = content_type,
+ .is_default_charset = !has_explicit_charset,
};
}
@@ -212,7 +216,7 @@ pub fn prescanCharset(html: []const u8) ?[]const u8 {
// Look for http-equiv="content-type" with content="...;charset=X"
if (findAttrValue(attrs, "http-equiv")) |he| {
- if (asciiEqlIgnoreCase(he, "content-type")) {
+ if (std.ascii.eqlIgnoreCase(he, "content-type")) {
if (findAttrValue(attrs, "content")) |content| {
if (extractCharsetFromContentType(content)) |charset| {
return charset;
@@ -248,7 +252,11 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
// Skip whitespace around =
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
- if (pos >= attrs.len or attrs[pos] != '=') continue;
+ if (pos >= attrs.len or attrs[pos] != '=') {
+ // No '=' found - skip this token. Advance at least one byte to avoid infinite loop.
+ if (pos == attr_start) pos += 1;
+ continue;
+ }
pos += 1; // skip '='
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
if (pos >= attrs.len) return null;
@@ -274,7 +282,7 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
}
};
- if (asciiEqlIgnoreCase(attr_name, name)) return value;
+ if (std.ascii.eqlIgnoreCase(attr_name, name)) return value;
}
return null;
}
@@ -283,7 +291,7 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
var it = std.mem.splitScalar(u8, content, ';');
while (it.next()) |part| {
const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
- if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) {
+ if (trimmed.len > 8 and std.ascii.eqlIgnoreCase(trimmed[0..8], "charset=")) {
const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
if (val.len > 0 and val.len <= 40) return val;
}
@@ -291,14 +299,6 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
return null;
}
-fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool {
- if (a.len != b.len) return false;
- for (a, b) |ca, cb| {
- if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
- }
- return true;
-}
-
pub fn sniff(body: []const u8) ?Mime {
// 0x0C is form feed
const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
@@ -725,14 +725,17 @@ test "Mime: prescanCharset" {
);
// No charset found
- try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("Test"));
- try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(""));
- try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here"));
+ try testing.expectEqual(null, Mime.prescanCharset("Test"));
+ try testing.expectEqual(null, Mime.prescanCharset(""));
+ try testing.expectEqual(null, Mime.prescanCharset("no html here"));
+
+ // Self-closing meta without charset must not loop forever
+ try testing.expectEqual(null, Mime.prescanCharset(""));
// Charset after 1024 bytes should not be found
var long_html: [1100]u8 = undefined;
@memset(&long_html, ' ');
const suffix = "";
@memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
- try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html));
+ try testing.expectEqual(null, Mime.prescanCharset(&long_html));
}
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index ab291bc6..9f7a22a1 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -855,9 +855,9 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
break :blk Mime.sniff(data);
} orelse .unknown;
- // If the HTTP header didn't specify a charset and this is HTML,
+ // If the HTTP Content-Type header didn't specify a charset and this is HTML,
// prescan the first 1024 bytes for a declaration.
- if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) {
+ if (mime.content_type == .text_html and mime.is_default_charset) {
if (Mime.prescanCharset(data)) |charset| {
if (charset.len <= 40) {
@memcpy(mime.charset[0..charset.len], charset);