From 3dcdaa0a9bf7c4a138f70dab4672a48d79276c5a Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Sat, 14 Mar 2026 14:15:40 -0700
Subject: [PATCH 1/2] Implement charset detection from first 1024 bytes of HTML

Per the HTML spec, browsers should detect charset from <meta> tags
in the first 1024 bytes of a document when the HTTP Content-Type
header doesn't specify one.

Adds Mime.prescanCharset() which scans for:
- <meta charset="X">
- <meta http-equiv="Content-Type" content="...;charset=X">

Integrates into the page loading flow to set the detected charset
on the Mime when no explicit HTTP charset was provided.

Fixes #531
---
 src/browser/Mime.zig | 160 +++++++++++++++++++++++++++++++++++++++++++
 src/browser/Page.zig |  14 +++-
 2 files changed, 173 insertions(+), 1 deletion(-)
diff --git a/src/browser/Mime.zig b/src/browser/Mime.zig
index 43ca3632..beef2177 100644
--- a/src/browser/Mime.zig
+++ b/src/browser/Mime.zig
@@ -168,6 +168,137 @@ pub fn parse(input: []u8) !Mime {
     };
 }
 
+/// Prescan the first 1024 bytes of an HTML document for a charset declaration.
+/// Looks for `<meta charset="X">` and `<meta http-equiv="Content-Type" content="...;charset=X">`.
+/// `<meta>` tags inside `<!-- ... -->` comments are ignored. Returns the charset
+/// value (a slice of `html`, at most 40 bytes long) or null if none is found.
+/// See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+pub fn prescanCharset(html: []const u8) ?[]const u8 {
+    const limit = @min(html.len, 1024);
+    const data = html[0..limit];
+
+    // Scan for <meta tags
+    var pos: usize = 0;
+    while (pos < data.len) {
+        // Find next '<'
+        pos = std.mem.indexOfScalarPos(u8, data, pos, '<') orelse return null;
+        pos += 1;
+
+        // Skip comments entirely so a commented-out <meta> is not honored.
+        // The closing "-->" may share its dashes with the opening "<!--".
+        if (std.mem.startsWith(u8, data[pos..], "!--")) {
+            const close = std.mem.indexOfPos(u8, data, pos + 1, "-->") orelse return null;
+            pos = close + 3;
+            continue;
+        }
+
+        // Check for "meta" (case-insensitive). At least one more byte must
+        // follow the name, otherwise no complete tag fits in the window.
+        if (pos + 4 >= data.len) return null;
+        if (!std.ascii.eqlIgnoreCase(data[pos..][0..4], "meta")) continue;
+        pos += 4;
+
+        // Must be followed by whitespace or '/'
+        if (std.mem.indexOfScalar(u8, " \t\n\r/", data[pos]) == null) continue;
+
+        // Scan attributes within this meta tag
+        const tag_end = std.mem.indexOfScalarPos(u8, data, pos, '>') orelse return null;
+        const attrs = data[pos..tag_end];
+
+        // Look for charset= attribute directly
+        if (findAttrValue(attrs, "charset")) |charset| {
+            if (charset.len > 0 and charset.len <= 40) return charset;
+        }
+
+        // Look for http-equiv="content-type" with content="...;charset=X"
+        if (findAttrValue(attrs, "http-equiv")) |he| {
+            if (asciiEqlIgnoreCase(he, "content-type")) {
+                if (findAttrValue(attrs, "content")) |content| {
+                    if (extractCharsetFromContentType(content)) |charset| {
+                        return charset;
+                    }
+                }
+            }
+        }
+
+        pos = tag_end + 1;
+    }
+    return null;
+}
+
+/// Return the value of the first attribute in `attrs` whose name equals `name`
+/// (ASCII case-insensitive), or null when no such `name=value` pair exists.
+/// `attrs` is the text between the tag name and the closing '>'. Values may be
+/// double-quoted, single-quoted or unquoted; the returned slice aliases `attrs`
+/// and excludes the quotes.
+fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
+    // Characters that separate attributes: space, tab, LF and CR.
+    const whitespace = " \t\n\r";
+    var pos: usize = 0;
+    while (pos < attrs.len) {
+        // Skip leading whitespace; nothing but whitespace left means no match.
+        pos = std.mem.indexOfNonePos(u8, attrs, pos, whitespace) orelse return null;
+
+        // Read the attribute name. It ends at '=', '/', '>' or any whitespace,
+        // including a line break, so that tags whose attributes are spread over
+        // several lines are tokenized correctly.
+        const attr_start = pos;
+        pos = std.mem.indexOfAnyPos(u8, attrs, pos, whitespace ++ "=/>") orelse attrs.len;
+        const attr_name = attrs[attr_start..pos];
+
+        // Skip whitespace around =
+        while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
+        if (pos >= attrs.len or attrs[pos] != '=') continue;
+        pos += 1; // skip '='
+        while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
+        if (pos >= attrs.len) return null;
+
+        // Read the attribute value. A quoted value runs to the matching quote
+        // (or to the end of `attrs` when the quote is never closed).
+        const value = blk: {
+            if (attrs[pos] == '"' or attrs[pos] == '\'') {
+                const quote = attrs[pos];
+                pos += 1;
+                const val_start = pos;
+                pos = std.mem.indexOfScalarPos(u8, attrs, pos, quote) orelse attrs.len;
+                const val = attrs[val_start..pos];
+                if (pos < attrs.len) pos += 1; // skip closing quote
+                break :blk val;
+            } else {
+                // An unquoted value ends at whitespace (line breaks included,
+                // otherwise `charset=utf-8` followed by a newline would keep
+                // the newline in the returned value), at '/' or at '>'.
+                const val_start = pos;
+                pos = std.mem.indexOfAnyPos(u8, attrs, pos, whitespace ++ "/>") orelse attrs.len;
+                break :blk attrs[val_start..pos];
+            }
+        };
+
+        if (asciiEqlIgnoreCase(attr_name, name)) return value;
+    }
+    return null;
+}
+
+fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
+    var it = std.mem.splitScalar(u8, content, ';');
+    while (it.next()) |part| {
+        const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
+        if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) {
+            const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
+            if (val.len > 0 and val.len <= 40) return val;
+        }
+    }
+    return null;
+}
+
+fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool {
+    if (a.len != b.len) return false;
+    for (a, b) |ca, cb| {
+        if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
+    }
+    return true;
+}
+
 pub fn sniff(body: []const u8) ?Mime {
     // 0x0C is form feed
     const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
@@ -576,3 +707,32 @@ fn expect(expected: Expectation, input: []const u8) !void {
         try testing.expectEqual(m.charsetStringZ(), actual.charsetStringZ());
     }
 }
+
+test "Mime: prescanCharset" {
+    // <meta charset="X">
+    try testing.expectEqual("utf-8", Mime.prescanCharset("<html><head><meta charset=\"utf-8\">").?);
+    try testing.expectEqual("iso-8859-1", Mime.prescanCharset("<html><head><meta charset=\"iso-8859-1\">").?);
+    try testing.expectEqual("shift_jis", Mime.prescanCharset("<meta charset='shift_jis'>").?);
+
+    // Case-insensitive tag matching
+    try testing.expectEqual("utf-8", Mime.prescanCharset("<META charset=\"utf-8\">").?);
+    try testing.expectEqual("utf-8", Mime.prescanCharset("<Meta charset=\"utf-8\">").?);
+
+    // <meta http-equiv="Content-Type" content="text/html; charset=X">
+    try testing.expectEqual(
+        "iso-8859-1",
+        Mime.prescanCharset("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">").?,
+    );
+
+    // No charset found
+    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("<html><head><title>Test</title>"));
+    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(""));
+    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here"));
+
+    // Charset after 1024 bytes should not be found
+    var long_html: [1100]u8 = undefined;
+    @memset(&long_html, ' ');
+    const suffix = "<meta charset=\"windows-1252\">";
+    @memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
+    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html));
+}
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index cb62cb31..ab291bc6 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -848,13 +848,25 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
     if (self._parse_state == .pre) {
         // we lazily do this, because we might need the first chunk of data
         // to sniff the content type
-        const mime: Mime = blk: {
+        var mime: Mime = blk: {
             if (transfer.response_header.?.contentType()) |ct| {
                 break :blk try Mime.parse(ct);
             }
             break :blk Mime.sniff(data);
         } orelse .unknown;
 
+        // If the HTTP header didn't specify a charset and this is HTML,
+        // prescan the first 1024 bytes for a <meta charset> declaration.
+        if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) {
+            if (Mime.prescanCharset(data)) |charset| {
+                if (charset.len <= 40) {
+                    @memcpy(mime.charset[0..charset.len], charset);
+                    mime.charset[charset.len] = 0;
+                    mime.charset_len = charset.len;
+                }
+            }
+        }
+
         if (comptime IS_DEBUG) {
             log.debug(.page, "navigate first chunk", .{
                 .content_type = mime.content_type,

From b373fb4a424119c314da7c8a62aa3811c6a4698c Mon Sep 17 00:00:00 2001
From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Date: Sun, 15 Mar 2026 21:20:45 -0700
Subject: [PATCH 2/2] Address review feedback: fix endless loop, use stdlib,
 add charset flag

- Use std.ascii.eqlIgnoreCase instead of custom asciiEqlIgnoreCase
- Fix infinite loop in findAttrValue when attribute has no '=' sign
  (e.g. self-closing <meta foo="bar"/>)
- Add is_default_charset flag to Mime struct so prescan only overrides
  charset when Content-Type header didn't set one explicitly
- Add regression test for the self-closing meta loop case

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/browser/Mime.zig | 35 +++++++++++++++++++----------------
 src/browser/Page.zig |  4 ++--
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/browser/Mime.zig b/src/browser/Mime.zig
index beef2177..13951259 100644
--- a/src/browser/Mime.zig
+++ b/src/browser/Mime.zig
@@ -25,6 +25,7 @@ params: []const u8 = "",
 // We keep 41 for null-termination since HTML parser expects in this format.
 charset: [41]u8 = default_charset,
 charset_len: usize = default_charset_len,
+is_default_charset: bool = true,
 
 /// String "UTF-8" continued by null characters.
 const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
@@ -130,6 +131,7 @@ pub fn parse(input: []u8) !Mime {
 
     var charset: [41]u8 = default_charset;
     var charset_len: usize = default_charset_len;
+    var has_explicit_charset = false;
 
     var it = std.mem.splitScalar(u8, params, ';');
     while (it.next()) |attr| {
@@ -156,6 +158,7 @@ pub fn parse(input: []u8) !Mime {
                 // Null-terminate right after attribute value.
                 charset[attribute_value.len] = 0;
                 charset_len = attribute_value.len;
+                has_explicit_charset = true;
             },
         }
     }
@@ -165,6 +168,7 @@ pub fn parse(input: []u8) !Mime {
         .charset = charset,
         .charset_len = charset_len,
         .content_type = content_type,
+        .is_default_charset = !has_explicit_charset,
     };
 }
 
@@ -212,7 +216,7 @@ pub fn prescanCharset(html: []const u8) ?[]const u8 {
 
         // Look for http-equiv="content-type" with content="...;charset=X"
         if (findAttrValue(attrs, "http-equiv")) |he| {
-            if (asciiEqlIgnoreCase(he, "content-type")) {
+            if (std.ascii.eqlIgnoreCase(he, "content-type")) {
                 if (findAttrValue(attrs, "content")) |content| {
                     if (extractCharsetFromContentType(content)) |charset| {
                         return charset;
@@ -248,7 +252,11 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
 
         // Skip whitespace around =
         while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
-        if (pos >= attrs.len or attrs[pos] != '=') continue;
+        if (pos >= attrs.len or attrs[pos] != '=') {
+            // No '=' found - skip this token. Advance at least one byte to avoid infinite loop.
+            if (pos == attr_start) pos += 1;
+            continue;
+        }
         pos += 1; // skip '='
         while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
         if (pos >= attrs.len) return null;
@@ -274,7 +282,7 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
             }
         };
 
-        if (asciiEqlIgnoreCase(attr_name, name)) return value;
+        if (std.ascii.eqlIgnoreCase(attr_name, name)) return value;
     }
     return null;
 }
@@ -283,7 +291,7 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
     var it = std.mem.splitScalar(u8, content, ';');
     while (it.next()) |part| {
         const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
-        if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) {
+        if (trimmed.len > 8 and std.ascii.eqlIgnoreCase(trimmed[0..8], "charset=")) {
             const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
             if (val.len > 0 and val.len <= 40) return val;
         }
@@ -291,14 +299,6 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
     return null;
 }
 
-fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool {
-    if (a.len != b.len) return false;
-    for (a, b) |ca, cb| {
-        if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
-    }
-    return true;
-}
-
 pub fn sniff(body: []const u8) ?Mime {
     // 0x0C is form feed
     const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
@@ -725,14 +725,17 @@ test "Mime: prescanCharset" {
     );
 
     // No charset found
-    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("<html><head><title>Test</title>"));
-    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(""));
-    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here"));
+    try testing.expectEqual(null, Mime.prescanCharset("<html><head><title>Test</title>"));
+    try testing.expectEqual(null, Mime.prescanCharset(""));
+    try testing.expectEqual(null, Mime.prescanCharset("no html here"));
+
+    // Self-closing meta without charset must not loop forever
+    try testing.expectEqual(null, Mime.prescanCharset("<meta foo=\"bar\"/>"));
 
     // Charset after 1024 bytes should not be found
     var long_html: [1100]u8 = undefined;
     @memset(&long_html, ' ');
     const suffix = "<meta charset=\"windows-1252\">";
     @memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
-    try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html));
+    try testing.expectEqual(null, Mime.prescanCharset(&long_html));
 }
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index ab291bc6..9f7a22a1 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -855,9 +855,9 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
             break :blk Mime.sniff(data);
         } orelse .unknown;
 
-        // If the HTTP header didn't specify a charset and this is HTML,
+        // If the HTTP Content-Type header didn't specify a charset and this is HTML,
         // prescan the first 1024 bytes for a <meta charset> declaration.
-        if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) {
+        if (mime.content_type == .text_html and mime.is_default_charset) {
             if (Mime.prescanCharset(data)) |charset| {
                 if (charset.len <= 40) {
                     @memcpy(mime.charset[0..charset.len], charset);