Merge pull request #1837 from mvanhorn/osc/531-charset-prescan
Some checks failed
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / wba-demo-scripts (push) Has been cancelled
e2e-test / wba-test (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled

Implement charset detection from first 1024 bytes of HTML
This commit is contained in:
Karl Seguin
2026-03-16 17:39:58 +08:00
committed by GitHub
2 changed files with 176 additions and 1 deletions

View File

@@ -25,6 +25,7 @@ params: []const u8 = "",
// We keep 41 for null-termination since HTML parser expects in this format. // We keep 41 for null-termination since HTML parser expects in this format.
charset: [41]u8 = default_charset, charset: [41]u8 = default_charset,
charset_len: usize = default_charset_len, charset_len: usize = default_charset_len,
is_default_charset: bool = true,
/// String "UTF-8" continued by null characters. /// String "UTF-8" continued by null characters.
const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36; const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
@@ -130,6 +131,7 @@ pub fn parse(input: []u8) !Mime {
var charset: [41]u8 = default_charset; var charset: [41]u8 = default_charset;
var charset_len: usize = default_charset_len; var charset_len: usize = default_charset_len;
var has_explicit_charset = false;
var it = std.mem.splitScalar(u8, params, ';'); var it = std.mem.splitScalar(u8, params, ';');
while (it.next()) |attr| { while (it.next()) |attr| {
@@ -156,6 +158,7 @@ pub fn parse(input: []u8) !Mime {
// Null-terminate right after attribute value. // Null-terminate right after attribute value.
charset[attribute_value.len] = 0; charset[attribute_value.len] = 0;
charset_len = attribute_value.len; charset_len = attribute_value.len;
has_explicit_charset = true;
}, },
} }
} }
@@ -165,9 +168,137 @@ pub fn parse(input: []u8) !Mime {
.charset = charset, .charset = charset,
.charset_len = charset_len, .charset_len = charset_len,
.content_type = content_type, .content_type = content_type,
.is_default_charset = !has_explicit_charset,
}; };
} }
/// Prescan the first 1024 bytes of an HTML document for a charset declaration.
///
/// Recognizes `<meta charset="X">` and
/// `<meta http-equiv="Content-Type" content="...;charset=X">`. Tag and
/// attribute names are matched case-insensitively. `<!-- ... -->` comments are
/// skipped, so a commented-out `<meta>` is never reported.
///
/// Returns the charset label (1 to 40 bytes) or null if none is found within
/// the scanned prefix. The returned slice borrows from `html`; it is not copied.
/// See: https://www.w3.org/International/questions/qa-html-encoding-declarations
pub fn prescanCharset(html: []const u8) ?[]const u8 {
    // Only the head of the document is inspected.
    const prescan_limit = 1024;
    // Longest label accepted; longer values are ignored rather than truncated.
    const max_charset_len = 40;

    const data = html[0..@min(html.len, prescan_limit)];

    var pos: usize = 0;
    while (std.mem.indexOfScalarPos(u8, data, pos, '<')) |lt| {
        pos = lt + 1;
        const rest = data[pos..];

        // "meta" plus at least one delimiter byte must still fit; otherwise
        // nothing useful can follow inside the scanned prefix.
        if (rest.len <= 4) return null;

        // Skip comments wholesale so markup inside them is not interpreted.
        // The search starts at the first '-' of "<!--" because those two
        // dashes may double as the dashes of the closing "-->" (e.g. "<!-->").
        if (std.mem.startsWith(u8, rest, "!--")) {
            const close = std.mem.indexOfPos(u8, data, pos + 1, "-->") orelse return null;
            pos = close + 3;
            continue;
        }

        if (!std.ascii.eqlIgnoreCase(rest[0..4], "meta")) continue;
        pos += 4;

        // The tag name must end here; this rejects e.g. "<metadata".
        switch (data[pos]) {
            ' ', '\t', '\n', '\r', 0x0C, '/' => {},
            else => continue,
        }

        // An unterminated tag inside the scanned prefix means there is
        // nothing more to find.
        const tag_end = std.mem.indexOfScalarPos(u8, data, pos, '>') orelse return null;
        const attrs = data[pos..tag_end];

        // Form 1: <meta charset="X">
        if (findAttrValue(attrs, "charset")) |charset| {
            if (charset.len > 0 and charset.len <= max_charset_len) return charset;
        }

        // Form 2: <meta http-equiv="Content-Type" content="...;charset=X">
        if (findAttrValue(attrs, "http-equiv")) |http_equiv| {
            if (std.ascii.eqlIgnoreCase(http_equiv, "content-type")) {
                if (findAttrValue(attrs, "content")) |content| {
                    if (extractCharsetFromContentType(content)) |charset| return charset;
                }
            }
        }

        pos = tag_end + 1;
    }
    return null;
}
/// Returns the value of the first attribute in `attrs` whose name equals
/// `name` (ASCII case-insensitive), or null if there is no such attribute.
///
/// `attrs` is the text between a tag name and its closing `>`. Values may be
/// double-quoted, single-quoted, or unquoted. Attributes without a value are
/// skipped. The returned slice borrows from `attrs` and excludes the quotes.
/// An unterminated quoted value extends to the end of `attrs`.
fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
    // HTML whitespace: space, tab, LF, CR and form feed. The same set is used
    // for every delimiter decision so attributes split across lines parse the
    // same way as attributes separated by spaces.
    const whitespace = " \t\n\r\x0C";
    const name_terminators = whitespace ++ "=>/";
    const unquoted_terminators = whitespace ++ ">/";

    var pos: usize = 0;
    while (std.mem.indexOfNonePos(u8, attrs, pos, whitespace)) |attr_start| {
        // Read the attribute name.
        const name_end = std.mem.indexOfAnyPos(u8, attrs, attr_start, name_terminators) orelse attrs.len;
        const attr_name = attrs[attr_start..name_end];

        // Whitespace is allowed between the name and '='.
        pos = std.mem.indexOfNonePos(u8, attrs, name_end, whitespace) orelse attrs.len;
        if (pos >= attrs.len or attrs[pos] != '=') {
            // Valueless attribute or a stray '/' or '>'. Advance at least one
            // byte so the loop always makes progress.
            if (pos == attr_start) pos += 1;
            continue;
        }

        // Skip '=' and any whitespace before the value.
        pos = std.mem.indexOfNonePos(u8, attrs, pos + 1, whitespace) orelse return null;

        const value = blk: {
            const first = attrs[pos];
            if (first == '"' or first == '\'') {
                const val_start = pos + 1;
                const val_end = std.mem.indexOfScalarPos(u8, attrs, val_start, first) orelse attrs.len;
                // Step past the closing quote when there is one.
                pos = @min(val_end + 1, attrs.len);
                break :blk attrs[val_start..val_end];
            }
            const val_start = pos;
            pos = std.mem.indexOfAnyPos(u8, attrs, val_start, unquoted_terminators) orelse attrs.len;
            break :blk attrs[val_start..pos];
        };

        if (std.ascii.eqlIgnoreCase(attr_name, name)) return value;
    }
    return null;
}
/// Extracts the `charset` parameter from a Content-Type style value such as
/// `text/html; charset=utf-8`.
///
/// Parameters are separated by ';'. The parameter name is matched
/// case-insensitively, and whitespace is tolerated around the parameter and
/// around '='. Surrounding quotes and whitespace are stripped from the value.
/// Returns null when no charset parameter with a 1 to 40 byte value exists.
/// The returned slice borrows from `content`.
fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
    // HTML whitespace: space, tab, LF, CR and form feed.
    const whitespace = " \t\n\r\x0C";
    const keyword = "charset";

    var it = std.mem.splitScalar(u8, content, ';');
    while (it.next()) |part| {
        const param = std.mem.trimLeft(u8, part, whitespace);
        if (param.len <= keyword.len) continue;
        if (!std.ascii.eqlIgnoreCase(param[0..keyword.len], keyword)) continue;

        // Allow "charset = X" as well as "charset=X".
        const after_name = std.mem.trimLeft(u8, param[keyword.len..], whitespace);
        if (after_name.len == 0 or after_name[0] != '=') continue;

        const val = std.mem.trim(u8, after_name[1..], whitespace ++ "\"'");
        if (val.len > 0 and val.len <= 40) return val;
    }
    return null;
}
pub fn sniff(body: []const u8) ?Mime { pub fn sniff(body: []const u8) ?Mime {
// 0x0C is form feed // 0x0C is form feed
const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C }); const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
@@ -576,3 +707,35 @@ fn expect(expected: Expectation, input: []const u8) !void {
try testing.expectEqual(m.charsetStringZ(), actual.charsetStringZ()); try testing.expectEqual(m.charsetStringZ(), actual.charsetStringZ());
} }
} }
test "Mime: prescanCharset" {
    // Documents that declare a charset, each paired with the label expected
    // back. In order: <meta charset> with double and single quotes, mixed-case
    // tag names, and the http-equiv="Content-Type" form.
    inline for (.{
        .{ "utf-8", "<html><head><meta charset=\"utf-8\">" },
        .{ "iso-8859-1", "<html><head><meta charset=\"iso-8859-1\">" },
        .{ "shift_jis", "<meta charset='shift_jis'>" },
        .{ "utf-8", "<META charset=\"utf-8\">" },
        .{ "utf-8", "<Meta charset=\"utf-8\">" },
        .{ "iso-8859-1", "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" },
    }) |case| {
        try testing.expectEqual(case[0], Mime.prescanCharset(case[1]).?);
    }

    // Documents without a declaration. The last entry is a self-closing meta
    // lacking a charset, which must terminate rather than loop forever.
    inline for (.{
        "<html><head><title>Test</title>",
        "",
        "no html here",
        "<meta foo=\"bar\"/>",
    }) |document| {
        try testing.expectEqual(null, Mime.prescanCharset(document));
    }

    // A declaration placed beyond the first 1024 bytes must be ignored.
    const declaration = "<meta charset=\"windows-1252\">";
    var padded = [_]u8{' '} ** 1100;
    @memcpy(padded[1050..][0..declaration.len], declaration);
    try testing.expectEqual(null, Mime.prescanCharset(&padded));
}

View File

@@ -848,13 +848,25 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
if (self._parse_state == .pre) { if (self._parse_state == .pre) {
// we lazily do this, because we might need the first chunk of data // we lazily do this, because we might need the first chunk of data
// to sniff the content type // to sniff the content type
const mime: Mime = blk: { var mime: Mime = blk: {
if (transfer.response_header.?.contentType()) |ct| { if (transfer.response_header.?.contentType()) |ct| {
break :blk try Mime.parse(ct); break :blk try Mime.parse(ct);
} }
break :blk Mime.sniff(data); break :blk Mime.sniff(data);
} orelse .unknown; } orelse .unknown;
// If the HTTP Content-Type header didn't specify a charset and this is HTML,
// prescan the first 1024 bytes for a <meta charset> declaration.
if (mime.content_type == .text_html and mime.is_default_charset) {
if (Mime.prescanCharset(data)) |charset| {
if (charset.len <= 40) {
@memcpy(mime.charset[0..charset.len], charset);
mime.charset[charset.len] = 0;
mime.charset_len = charset.len;
}
}
}
if (comptime IS_DEBUG) { if (comptime IS_DEBUG) {
log.debug(.page, "navigate first chunk", .{ log.debug(.page, "navigate first chunk", .{
.content_type = mime.content_type, .content_type = mime.content_type,