mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-21 20:24:42 +00:00
Implement charset detection from first 1024 bytes of HTML
Per the HTML spec, browsers should detect charset from <meta> tags in the first 1024 bytes of a document when the HTTP Content-Type header doesn't specify one. Adds Mime.prescanCharset() which scans for: - <meta charset="X"> - <meta http-equiv="Content-Type" content="...;charset=X"> Integrates into the page loading flow to set the detected charset on the Mime when no explicit HTTP charset was provided. Fixes #531
This commit is contained in:
@@ -168,6 +168,137 @@ pub fn parse(input: []u8) !Mime {
|
||||
};
|
||||
}
|
||||
|
||||
/// Prescan the first 1024 bytes of an HTML document for a charset declaration.
|
||||
/// Looks for `<meta charset="X">` and `<meta http-equiv="Content-Type" content="...;charset=X">`.
|
||||
/// Returns the charset value or null if none found.
|
||||
/// See: https://www.w3.org/International/questions/qa-html-encoding-declarations
|
||||
pub fn prescanCharset(html: []const u8) ?[]const u8 {
|
||||
const limit = @min(html.len, 1024);
|
||||
const data = html[0..limit];
|
||||
|
||||
// Scan for <meta tags
|
||||
var pos: usize = 0;
|
||||
while (pos < data.len) {
|
||||
// Find next '<'
|
||||
pos = std.mem.indexOfScalarPos(u8, data, pos, '<') orelse return null;
|
||||
pos += 1;
|
||||
if (pos >= data.len) return null;
|
||||
|
||||
// Check for "meta" (case-insensitive)
|
||||
if (pos + 4 >= data.len) return null;
|
||||
var tag_buf: [4]u8 = undefined;
|
||||
_ = std.ascii.lowerString(&tag_buf, data[pos..][0..4]);
|
||||
if (!std.mem.eql(u8, &tag_buf, "meta")) {
|
||||
continue;
|
||||
}
|
||||
pos += 4;
|
||||
|
||||
// Must be followed by whitespace or end of tag
|
||||
if (pos >= data.len) return null;
|
||||
if (data[pos] != ' ' and data[pos] != '\t' and data[pos] != '\n' and
|
||||
data[pos] != '\r' and data[pos] != '/')
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Scan attributes within this meta tag
|
||||
const tag_end = std.mem.indexOfScalarPos(u8, data, pos, '>') orelse return null;
|
||||
const attrs = data[pos..tag_end];
|
||||
|
||||
// Look for charset= attribute directly
|
||||
if (findAttrValue(attrs, "charset")) |charset| {
|
||||
if (charset.len > 0 and charset.len <= 40) return charset;
|
||||
}
|
||||
|
||||
// Look for http-equiv="content-type" with content="...;charset=X"
|
||||
if (findAttrValue(attrs, "http-equiv")) |he| {
|
||||
if (asciiEqlIgnoreCase(he, "content-type")) {
|
||||
if (findAttrValue(attrs, "content")) |content| {
|
||||
if (extractCharsetFromContentType(content)) |charset| {
|
||||
return charset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pos = tag_end + 1;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
|
||||
var pos: usize = 0;
|
||||
while (pos < attrs.len) {
|
||||
// Skip whitespace
|
||||
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t' or
|
||||
attrs[pos] == '\n' or attrs[pos] == '\r'))
|
||||
{
|
||||
pos += 1;
|
||||
}
|
||||
if (pos >= attrs.len) return null;
|
||||
|
||||
// Read attribute name
|
||||
const attr_start = pos;
|
||||
while (pos < attrs.len and attrs[pos] != '=' and attrs[pos] != ' ' and
|
||||
attrs[pos] != '\t' and attrs[pos] != '>' and attrs[pos] != '/')
|
||||
{
|
||||
pos += 1;
|
||||
}
|
||||
const attr_name = attrs[attr_start..pos];
|
||||
|
||||
// Skip whitespace around =
|
||||
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
|
||||
if (pos >= attrs.len or attrs[pos] != '=') continue;
|
||||
pos += 1; // skip '='
|
||||
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
|
||||
if (pos >= attrs.len) return null;
|
||||
|
||||
// Read attribute value
|
||||
const value = blk: {
|
||||
if (attrs[pos] == '"' or attrs[pos] == '\'') {
|
||||
const quote = attrs[pos];
|
||||
pos += 1;
|
||||
const val_start = pos;
|
||||
while (pos < attrs.len and attrs[pos] != quote) pos += 1;
|
||||
const val = attrs[val_start..pos];
|
||||
if (pos < attrs.len) pos += 1; // skip closing quote
|
||||
break :blk val;
|
||||
} else {
|
||||
const val_start = pos;
|
||||
while (pos < attrs.len and attrs[pos] != ' ' and attrs[pos] != '\t' and
|
||||
attrs[pos] != '>' and attrs[pos] != '/')
|
||||
{
|
||||
pos += 1;
|
||||
}
|
||||
break :blk attrs[val_start..pos];
|
||||
}
|
||||
};
|
||||
|
||||
if (asciiEqlIgnoreCase(attr_name, name)) return value;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
|
||||
var it = std.mem.splitScalar(u8, content, ';');
|
||||
while (it.next()) |part| {
|
||||
const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
|
||||
if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) {
|
||||
const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
|
||||
if (val.len > 0 and val.len <= 40) return val;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool {
|
||||
if (a.len != b.len) return false;
|
||||
for (a, b) |ca, cb| {
|
||||
if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
pub fn sniff(body: []const u8) ?Mime {
|
||||
// 0x0C is form feed
|
||||
const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
|
||||
@@ -576,3 +707,32 @@ fn expect(expected: Expectation, input: []const u8) !void {
|
||||
try testing.expectEqual(m.charsetStringZ(), actual.charsetStringZ());
|
||||
}
|
||||
}
|
||||
|
||||
test "Mime: prescanCharset" {
|
||||
// <meta charset="X">
|
||||
try testing.expectEqual("utf-8", Mime.prescanCharset("<html><head><meta charset=\"utf-8\">").?);
|
||||
try testing.expectEqual("iso-8859-1", Mime.prescanCharset("<html><head><meta charset=\"iso-8859-1\">").?);
|
||||
try testing.expectEqual("shift_jis", Mime.prescanCharset("<meta charset='shift_jis'>").?);
|
||||
|
||||
// Case-insensitive tag matching
|
||||
try testing.expectEqual("utf-8", Mime.prescanCharset("<META charset=\"utf-8\">").?);
|
||||
try testing.expectEqual("utf-8", Mime.prescanCharset("<Meta charset=\"utf-8\">").?);
|
||||
|
||||
// <meta http-equiv="Content-Type" content="text/html; charset=X">
|
||||
try testing.expectEqual(
|
||||
"iso-8859-1",
|
||||
Mime.prescanCharset("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">").?,
|
||||
);
|
||||
|
||||
// No charset found
|
||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("<html><head><title>Test</title>"));
|
||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(""));
|
||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here"));
|
||||
|
||||
// Charset after 1024 bytes should not be found
|
||||
var long_html: [1100]u8 = undefined;
|
||||
@memset(&long_html, ' ');
|
||||
const suffix = "<meta charset=\"windows-1252\">";
|
||||
@memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
|
||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html));
|
||||
}
|
||||
|
||||
@@ -848,13 +848,25 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
|
||||
if (self._parse_state == .pre) {
|
||||
// we lazily do this, because we might need the first chunk of data
|
||||
// to sniff the content type
|
||||
const mime: Mime = blk: {
|
||||
var mime: Mime = blk: {
|
||||
if (transfer.response_header.?.contentType()) |ct| {
|
||||
break :blk try Mime.parse(ct);
|
||||
}
|
||||
break :blk Mime.sniff(data);
|
||||
} orelse .unknown;
|
||||
|
||||
// If the HTTP header didn't specify a charset and this is HTML,
|
||||
// prescan the first 1024 bytes for a <meta charset> declaration.
|
||||
if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) {
|
||||
if (Mime.prescanCharset(data)) |charset| {
|
||||
if (charset.len <= 40) {
|
||||
@memcpy(mime.charset[0..charset.len], charset);
|
||||
mime.charset[charset.len] = 0;
|
||||
mime.charset_len = charset.len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (comptime IS_DEBUG) {
|
||||
log.debug(.page, "navigate first chunk", .{
|
||||
.content_type = mime.content_type,
|
||||
|
||||
Reference in New Issue
Block a user