mirror of
https://github.com/lightpanda-io/browser.git
synced 2025-10-29 23:23:28 +00:00
Merge pull request #1048 from lightpanda-io/nikneym/mime-changes
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
zig-test / zig build dev (push) Has been cancelled
zig-test / browser fetch (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
zig-test / zig build dev (push) Has been cancelled
zig-test / browser fetch (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
Mime: charset identification changes
This commit is contained in:
@@ -22,13 +22,15 @@ const Allocator = std.mem.Allocator;
|
||||
pub const Mime = struct {
|
||||
content_type: ContentType,
|
||||
params: []const u8 = "",
|
||||
charset: ?[:0]const u8 = null,
|
||||
// IANA defines max. charset value length as 40.
|
||||
// We keep 41 bytes to leave room for null-termination, since the HTML parser expects this format.
|
||||
charset: [41]u8 = default_charset,
|
||||
|
||||
pub const unknown = Mime{
|
||||
.params = "",
|
||||
.charset = null,
|
||||
.content_type = .{ .unknown = {} },
|
||||
};
|
||||
/// The string "UTF-8" followed by null characters, padded to 41 bytes.
|
||||
pub const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
|
||||
|
||||
/// Mime with unknown Content-Type, empty params and the default charset ("UTF-8").
|
||||
pub const unknown = Mime{ .content_type = .{ .unknown = {} } };
|
||||
|
||||
pub const ContentTypeEnum = enum {
|
||||
text_xml,
|
||||
@@ -52,6 +54,34 @@ pub const Mime = struct {
|
||||
other: struct { type: []const u8, sub_type: []const u8 },
|
||||
};
|
||||
|
||||
/// Returns the null-terminated charset value.
///
/// The result is a view over the `charset` array stored inside `mime`; no
/// copy is made, so the returned slice refers to memory owned by `mime`.
///
/// NOTE(review): the length of the returned slice comes from the pointer
/// cast, not from the position of the first null byte. Confirm that callers
/// rely only on the null termination and not on `.len`.
|
||||
pub inline fn charsetString(mime: *const Mime) [:0]const u8 {
|
||||
// Reinterpret the address of the fixed-size array as a sentinel-terminated slice.
return @ptrCast(&mime.charset);
|
||||
}
|
||||
|
||||
/// Extracts the charset from a raw `charset` attribute value.
///
/// If `value` starts with a double quote it must also end with one and have
/// at least one byte in between; the surrounding quotes are stripped.
/// Otherwise `value` is returned as is. The result is a sub-slice of `value`;
/// nothing is copied or allocated.
///
/// Errors:
/// - `error.Invalid`: `value` is empty, or it opens with a quote but has no
///   matching closing quote or nothing between the quotes.
/// - `error.CharsetTooBig`: the charset (quotes excluded) is longer than
///   40 bytes.
///
/// Currently we don't validate the characters of the charset.
/// See section 2.3 Naming Requirements:
/// https://datatracker.ietf.org/doc/rfc2978/
fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 {
    // An empty value has no first byte to inspect; reject it up front instead
    // of indexing out of bounds below.
    if (value.len == 0) return error.Invalid;

    const charset = blk: {
        // No opening quote: the value is the charset itself.
        if (value[0] != '"') break :blk value;

        // An opening quote needs a closing pair and a non-empty payload.
        if (value.len < 3 or value[value.len - 1] != '"') {
            return error.Invalid;
        }

        break :blk value[1 .. value.len - 1];
    };

    // The charset itself cannot be larger than 40 bytes. The limit is checked
    // after removing the quotes so that they do not count against it.
    // https://datatracker.ietf.org/doc/rfc2978/
    if (charset.len > 40) return error.CharsetTooBig;

    return charset;
}
|
||||
|
||||
pub fn parse(input: []u8) !Mime {
|
||||
if (input.len > 255) {
|
||||
return error.TooBig;
|
||||
@@ -69,7 +99,7 @@ pub const Mime = struct {
|
||||
|
||||
const params = trimLeft(normalized[type_len..]);
|
||||
|
||||
var charset: ?[:0]const u8 = null;
|
||||
var charset: [41]u8 = undefined;
|
||||
|
||||
var it = std.mem.splitScalar(u8, params, ';');
|
||||
while (it.next()) |attr| {
|
||||
@@ -87,35 +117,14 @@ pub const Mime = struct {
|
||||
|
||||
switch (attribute_name) {
|
||||
.charset => {
|
||||
// We used to have a proper value parser, but we currently
|
||||
// only care about the charset attribute, plus only about
|
||||
// the UTF-8 value. It's a lot easier to do it this way,
|
||||
// and it doesn't require an allocation to (a) unescape the
|
||||
// value or (b) ensure the correct lifetime.
|
||||
if (value.len == 0) {
|
||||
break;
|
||||
}
|
||||
var attribute_value = value;
|
||||
if (value[0] == '"') {
|
||||
if (value.len < 3 or value[value.len - 1] != '"') {
|
||||
return error.Invalid;
|
||||
}
|
||||
attribute_value = value[1 .. value.len - 1];
|
||||
}
|
||||
|
||||
if (std.ascii.eqlIgnoreCase(attribute_value, "utf-8")) {
|
||||
charset = "UTF-8";
|
||||
} else if (std.ascii.eqlIgnoreCase(attribute_value, "iso-8859-1")) {
|
||||
charset = "ISO-8859-1";
|
||||
} else {
|
||||
// we only care about null (which we default to UTF-8)
|
||||
// or UTF-8. If this is actually set (i.e. not null)
|
||||
// and isn't UTF-8, we'll just put a dummy value. If
|
||||
// we want to capture the actual value, we'll need to
|
||||
// dupe/allocate it. Since, for now, we don't need that
|
||||
// we can avoid the allocation.
|
||||
charset = "lightpanda:UNSUPPORTED";
|
||||
}
|
||||
const attribute_value = try parseCharset(value);
|
||||
@memcpy(charset[0..attribute_value.len], attribute_value);
|
||||
// Null-terminate right after attribute value.
|
||||
charset[attribute_value.len] = 0;
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -363,21 +372,33 @@ test "Mime: parse charset" {
|
||||
|
||||
try expect(.{
|
||||
.content_type = .{ .text_xml = {} },
|
||||
.charset = "UTF-8",
|
||||
.charset = "utf-8",
|
||||
.params = "charset=utf-8",
|
||||
}, "text/xml; charset=utf-8");
|
||||
|
||||
try expect(.{
|
||||
.content_type = .{ .text_xml = {} },
|
||||
.charset = "UTF-8",
|
||||
.charset = "utf-8",
|
||||
.params = "charset=\"utf-8\"",
|
||||
}, "text/xml;charset=\"utf-8\"");
|
||||
}, "text/xml;charset=\"UTF-8\"");
|
||||
|
||||
try expect(.{
|
||||
.content_type = .{ .text_html = {} },
|
||||
.charset = "iso-8859-1",
|
||||
.params = "charset=\"iso-8859-1\"",
|
||||
}, "text/html; charset=\"iso-8859-1\"");
|
||||
|
||||
try expect(.{
|
||||
.content_type = .{ .text_html = {} },
|
||||
.charset = "iso-8859-1",
|
||||
.params = "charset=\"iso-8859-1\"",
|
||||
}, "text/html; charset=\"ISO-8859-1\"");
|
||||
|
||||
try expect(.{
|
||||
.content_type = .{ .text_xml = {} },
|
||||
.charset = "lightpanda:UNSUPPORTED",
|
||||
.params = "charset=\"\\\\ \\\" \"",
|
||||
}, "text/xml;charset=\"\\\\ \\\" \" ");
|
||||
.charset = "custom-non-standard-charset-value",
|
||||
.params = "charset=\"custom-non-standard-charset-value\"",
|
||||
}, "text/xml;charset=\"custom-non-standard-charset-value\"");
|
||||
}
|
||||
|
||||
test "Mime: isHTML" {
|
||||
@@ -490,8 +511,10 @@ fn expect(expected: Expectation, input: []const u8) !void {
|
||||
try testing.expectEqual(expected.params, actual.params);
|
||||
|
||||
if (expected.charset) |ec| {
|
||||
try testing.expectEqual(ec, actual.charset.?);
|
||||
// We remove the null characters for testing purposes here.
|
||||
try testing.expectEqual(ec, actual.charsetString()[0..ec.len]);
|
||||
} else {
|
||||
try testing.expectEqual(null, actual.charset);
|
||||
const m: Mime = .unknown;
|
||||
try testing.expectEqual(m.charsetString(), actual.charsetString());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -672,14 +672,14 @@ pub const Page = struct {
|
||||
log.debug(.http, "navigate first chunk", .{ .content_type = mime.content_type, .len = data.len });
|
||||
|
||||
self.mode = switch (mime.content_type) {
|
||||
.text_html => .{ .html = try parser.Parser.init(mime.charset orelse "UTF-8") },
|
||||
.text_html => .{ .html = try parser.Parser.init(mime.charsetString()) },
|
||||
|
||||
.application_json,
|
||||
.text_javascript,
|
||||
.text_css,
|
||||
.text_plain,
|
||||
=> blk: {
|
||||
var p = try parser.Parser.init(mime.charset orelse "UTF-8");
|
||||
var p = try parser.Parser.init(mime.charsetString());
|
||||
try p.process("<html><head><meta charset=\"utf-8\"></head><body><pre>");
|
||||
break :blk .{ .text = p };
|
||||
},
|
||||
|
||||
@@ -679,7 +679,7 @@ pub const XMLHttpRequest = struct {
|
||||
}
|
||||
|
||||
var fbs = std.io.fixedBufferStream(self.response_bytes.items);
|
||||
const doc = parser.documentHTMLParse(fbs.reader(), mime.charset orelse "UTF-8") catch {
|
||||
const doc = parser.documentHTMLParse(fbs.reader(), mime.charsetString()) catch {
|
||||
self.response_obj = .{ .Failure = {} };
|
||||
return;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user