mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-22 04:34:44 +00:00
Address review feedback: fix endless loop, use stdlib, add charset flag
- Use std.ascii.eqlIgnoreCase instead of custom asciiEqlIgnoreCase - Fix infinite loop in findAttrValue when attribute has no '=' sign (e.g. self-closing <meta foo="bar"/>) - Add is_default_charset flag to Mime struct so prescan only overrides charset when Content-Type header didn't set one explicitly - Add regression test for the self-closing meta loop case Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ params: []const u8 = "",
|
|||||||
// We keep 41 for null-termination since HTML parser expects in this format.
|
// We keep 41 for null-termination since HTML parser expects in this format.
|
||||||
charset: [41]u8 = default_charset,
|
charset: [41]u8 = default_charset,
|
||||||
charset_len: usize = default_charset_len,
|
charset_len: usize = default_charset_len,
|
||||||
|
is_default_charset: bool = true,
|
||||||
|
|
||||||
/// String "UTF-8" continued by null characters.
|
/// String "UTF-8" continued by null characters.
|
||||||
const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
|
const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
|
||||||
@@ -130,6 +131,7 @@ pub fn parse(input: []u8) !Mime {
|
|||||||
|
|
||||||
var charset: [41]u8 = default_charset;
|
var charset: [41]u8 = default_charset;
|
||||||
var charset_len: usize = default_charset_len;
|
var charset_len: usize = default_charset_len;
|
||||||
|
var has_explicit_charset = false;
|
||||||
|
|
||||||
var it = std.mem.splitScalar(u8, params, ';');
|
var it = std.mem.splitScalar(u8, params, ';');
|
||||||
while (it.next()) |attr| {
|
while (it.next()) |attr| {
|
||||||
@@ -156,6 +158,7 @@ pub fn parse(input: []u8) !Mime {
|
|||||||
// Null-terminate right after attribute value.
|
// Null-terminate right after attribute value.
|
||||||
charset[attribute_value.len] = 0;
|
charset[attribute_value.len] = 0;
|
||||||
charset_len = attribute_value.len;
|
charset_len = attribute_value.len;
|
||||||
|
has_explicit_charset = true;
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -165,6 +168,7 @@ pub fn parse(input: []u8) !Mime {
|
|||||||
.charset = charset,
|
.charset = charset,
|
||||||
.charset_len = charset_len,
|
.charset_len = charset_len,
|
||||||
.content_type = content_type,
|
.content_type = content_type,
|
||||||
|
.is_default_charset = !has_explicit_charset,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,7 +216,7 @@ pub fn prescanCharset(html: []const u8) ?[]const u8 {
|
|||||||
|
|
||||||
// Look for http-equiv="content-type" with content="...;charset=X"
|
// Look for http-equiv="content-type" with content="...;charset=X"
|
||||||
if (findAttrValue(attrs, "http-equiv")) |he| {
|
if (findAttrValue(attrs, "http-equiv")) |he| {
|
||||||
if (asciiEqlIgnoreCase(he, "content-type")) {
|
if (std.ascii.eqlIgnoreCase(he, "content-type")) {
|
||||||
if (findAttrValue(attrs, "content")) |content| {
|
if (findAttrValue(attrs, "content")) |content| {
|
||||||
if (extractCharsetFromContentType(content)) |charset| {
|
if (extractCharsetFromContentType(content)) |charset| {
|
||||||
return charset;
|
return charset;
|
||||||
@@ -248,7 +252,11 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
|
|||||||
|
|
||||||
// Skip whitespace around =
|
// Skip whitespace around =
|
||||||
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
|
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
|
||||||
if (pos >= attrs.len or attrs[pos] != '=') continue;
|
if (pos >= attrs.len or attrs[pos] != '=') {
|
||||||
|
// No '=' found - skip this token. Advance at least one byte to avoid infinite loop.
|
||||||
|
if (pos == attr_start) pos += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
pos += 1; // skip '='
|
pos += 1; // skip '='
|
||||||
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
|
while (pos < attrs.len and (attrs[pos] == ' ' or attrs[pos] == '\t')) pos += 1;
|
||||||
if (pos >= attrs.len) return null;
|
if (pos >= attrs.len) return null;
|
||||||
@@ -274,7 +282,7 @@ fn findAttrValue(attrs: []const u8, name: []const u8) ?[]const u8 {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if (asciiEqlIgnoreCase(attr_name, name)) return value;
|
if (std.ascii.eqlIgnoreCase(attr_name, name)) return value;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -283,7 +291,7 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
|
|||||||
var it = std.mem.splitScalar(u8, content, ';');
|
var it = std.mem.splitScalar(u8, content, ';');
|
||||||
while (it.next()) |part| {
|
while (it.next()) |part| {
|
||||||
const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
|
const trimmed = std.mem.trimLeft(u8, part, &.{ ' ', '\t' });
|
||||||
if (trimmed.len > 8 and asciiEqlIgnoreCase(trimmed[0..8], "charset=")) {
|
if (trimmed.len > 8 and std.ascii.eqlIgnoreCase(trimmed[0..8], "charset=")) {
|
||||||
const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
|
const val = std.mem.trim(u8, trimmed[8..], &.{ ' ', '\t', '"', '\'' });
|
||||||
if (val.len > 0 and val.len <= 40) return val;
|
if (val.len > 0 and val.len <= 40) return val;
|
||||||
}
|
}
|
||||||
@@ -291,14 +299,6 @@ fn extractCharsetFromContentType(content: []const u8) ?[]const u8 {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn asciiEqlIgnoreCase(a: []const u8, b: []const u8) bool {
|
|
||||||
if (a.len != b.len) return false;
|
|
||||||
for (a, b) |ca, cb| {
|
|
||||||
if (std.ascii.toLower(ca) != std.ascii.toLower(cb)) return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn sniff(body: []const u8) ?Mime {
|
pub fn sniff(body: []const u8) ?Mime {
|
||||||
// 0x0C is form feed
|
// 0x0C is form feed
|
||||||
const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
|
const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
|
||||||
@@ -725,14 +725,17 @@ test "Mime: prescanCharset" {
|
|||||||
);
|
);
|
||||||
|
|
||||||
// No charset found
|
// No charset found
|
||||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("<html><head><title>Test</title>"));
|
try testing.expectEqual(null, Mime.prescanCharset("<html><head><title>Test</title>"));
|
||||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(""));
|
try testing.expectEqual(null, Mime.prescanCharset(""));
|
||||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset("no html here"));
|
try testing.expectEqual(null, Mime.prescanCharset("no html here"));
|
||||||
|
|
||||||
|
// Self-closing meta without charset must not loop forever
|
||||||
|
try testing.expectEqual(null, Mime.prescanCharset("<meta foo=\"bar\"/>"));
|
||||||
|
|
||||||
// Charset after 1024 bytes should not be found
|
// Charset after 1024 bytes should not be found
|
||||||
var long_html: [1100]u8 = undefined;
|
var long_html: [1100]u8 = undefined;
|
||||||
@memset(&long_html, ' ');
|
@memset(&long_html, ' ');
|
||||||
const suffix = "<meta charset=\"windows-1252\">";
|
const suffix = "<meta charset=\"windows-1252\">";
|
||||||
@memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
|
@memcpy(long_html[1050 .. 1050 + suffix.len], suffix);
|
||||||
try testing.expectEqual(@as(?[]const u8, null), Mime.prescanCharset(&long_html));
|
try testing.expectEqual(null, Mime.prescanCharset(&long_html));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -855,9 +855,9 @@ fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
|
|||||||
break :blk Mime.sniff(data);
|
break :blk Mime.sniff(data);
|
||||||
} orelse .unknown;
|
} orelse .unknown;
|
||||||
|
|
||||||
// If the HTTP header didn't specify a charset and this is HTML,
|
// If the HTTP Content-Type header didn't specify a charset and this is HTML,
|
||||||
// prescan the first 1024 bytes for a <meta charset> declaration.
|
// prescan the first 1024 bytes for a <meta charset> declaration.
|
||||||
if (mime.content_type == .text_html and std.mem.eql(u8, mime.charsetString(), "UTF-8")) {
|
if (mime.content_type == .text_html and mime.is_default_charset) {
|
||||||
if (Mime.prescanCharset(data)) |charset| {
|
if (Mime.prescanCharset(data)) |charset| {
|
||||||
if (charset.len <= 40) {
|
if (charset.len <= 40) {
|
||||||
@memcpy(mime.charset[0..charset.len], charset);
|
@memcpy(mime.charset[0..charset.len], charset);
|
||||||
|
|||||||
Reference in New Issue
Block a user