Merge pull request #413 from karlseguin/mime

Improve performance & compliance of MIME parsing
This commit is contained in:
Pierre Tachoire
2025-02-10 08:59:00 +01:00
committed by GitHub
4 changed files with 381 additions and 140 deletions

View File

@@ -24,7 +24,7 @@ const Types = @import("root").Types;
const parser = @import("netsurf"); const parser = @import("netsurf");
const Loader = @import("loader.zig").Loader; const Loader = @import("loader.zig").Loader;
const Dump = @import("dump.zig"); const Dump = @import("dump.zig");
const Mime = @import("mime.zig"); const Mime = @import("mime.zig").Mime;
const jsruntime = @import("jsruntime"); const jsruntime = @import("jsruntime");
const Loop = jsruntime.Loop; const Loop = jsruntime.Loop;
@@ -375,8 +375,10 @@ pub const Page = struct {
defer alloc.free(ct.?); defer alloc.free(ct.?);
log.debug("header content-type: {s}", .{ct.?}); log.debug("header content-type: {s}", .{ct.?});
const mime = try Mime.parse(ct.?); var mime = try Mime.parse(alloc, ct.?);
if (mime.eql(Mime.HTML)) { defer mime.deinit();
if (mime.isHTML()) {
try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", auxData); try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", auxData);
} else { } else {
log.info("non-HTML document: {s}", .{ct.?}); log.info("non-HTML document: {s}", .{ct.?});

View File

@@ -17,143 +17,375 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>. // along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std"); const std = @import("std");
const testing = std.testing; const Allocator = std.mem.Allocator;
const Reader = @import("../str/parser.zig").Reader; pub const Mime = struct {
content_type: ContentType,
params: []const u8 = "",
charset: ?[]const u8 = null,
arena: std.heap.ArenaAllocator,
const Self = @This(); pub const ContentTypeEnum = enum {
text_xml,
const MimeError = error{ text_html,
Empty, text_plain,
TooBig, other,
Invalid,
InvalidChar,
}; };
mtype: []const u8, pub const ContentType = union(ContentTypeEnum) {
msubtype: []const u8, text_xml: void,
params: []const u8 = "", text_html: void,
text_plain: void,
other: struct { type: []const u8, sub_type: []const u8 },
};
charset: ?[]const u8 = null, pub fn parse(allocator: Allocator, input: []const u8) !Mime {
boundary: ?[]const u8 = null, if (input.len > 255) {
return error.TooBig;
}
pub const Empty = Self{ .mtype = "", .msubtype = "" }; var arena = std.heap.ArenaAllocator.init(allocator);
pub const HTML = Self{ .mtype = "text", .msubtype = "html" }; errdefer arena.deinit();
pub const Javascript = Self{ .mtype = "application", .msubtype = "javascript" };
// https://mimesniff.spec.whatwg.org/#http-token-code-point var trimmed = trim(input);
fn isHTTPCodePoint(c: u8) bool {
return switch (c) { const content_type, const type_len = try parseContentType(trimmed);
'!', '#', '$', '%', '&', '\'', '*', '+', '-', '.', '^' => return true, if (type_len >= trimmed.len) {
'_', '`', '|', '~' => return true, return .{ .arena = arena, .content_type = content_type };
else => std.ascii.isAlphanumeric(c), }
const params = trimLeft(trimmed[type_len..]);
var charset: ?[]const u8 = null;
var it = std.mem.splitScalar(u8, params, ';');
while (it.next()) |attr| {
const i = std.mem.indexOfScalarPos(u8, attr, 0, '=') orelse return error.Invalid;
const name = trimLeft(attr[0..i]);
const value = trimRight(attr[i + 1 ..]);
if (value.len == 0) {
return error.Invalid;
}
switch (name.len) {
7 => if (isCaseEqual("charset", name)) {
charset = try parseValue(arena.allocator(), value);
},
else => {},
}
}
return .{
.arena = arena,
.params = params,
.charset = charset,
.content_type = content_type,
}; };
} }
fn valid(s: []const u8) bool { pub fn deinit(self: *Mime) void {
const ln = s.len; self.arena.deinit();
var i: usize = 0; }
while (i < ln) {
if (!isHTTPCodePoint(s[i])) return false; pub fn isHTML(self: *const Mime) bool {
i += 1; return self.content_type == .text_html;
}
/// Splits the leading `type/subtype` portion off a MIME string.
///
/// Returns the parsed content type together with the offset just past the
/// `;` that ends it. When `value` contains no `;` that offset is
/// `value.len + 1`, so callers must compare it against `value.len` before
/// slicing with it.
///
/// The `.other` variant borrows `type` and `sub_type` from `value`; nothing
/// is copied.
///
/// Returns `error.Invalid` when `value` has no `/`, when either half is
/// empty, or when either half contains a byte rejected by `validType`.
fn parseContentType(value: []const u8) !struct { ContentType, usize } {
    const slash = std.mem.indexOfScalarPos(u8, value, 0, '/') orelse return error.Invalid;
    const semicolon = std.mem.indexOfScalarPos(u8, value, slash, ';') orelse value.len;
    const consumed = semicolon + 1;

    const main_type = value[0..slash];
    // Whitespace between the subtype and the `;` (or the end) is tolerated.
    const sub_type = trimRight(value[slash + 1 .. semicolon]);

    // The well-known types are recognised before any byte-level validation.
    if (parseCommonContentType(main_type, sub_type)) |common| {
        return .{ common, consumed };
    }

    if (main_type.len == 0 or !validType(main_type)) return error.Invalid;
    if (sub_type.len == 0 or !validType(sub_type)) return error.Invalid;

    return .{
        ContentType{ .other = .{ .type = main_type, .sub_type = sub_type } },
        consumed,
    };
}
/// Recognises the content types that have a dedicated `ContentType` variant:
/// `text/xml`, `text/html` and `text/plain`, compared case-insensitively.
///
/// Returns `null` for anything else. Each length is checked before the
/// comparison because `isCaseEqual` reads a fixed number of bytes from its
/// second argument.
fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType {
    const is_text = main_type.len == 4 and isCaseEqual("text", main_type);
    if (!is_text) return null;

    if (sub_type.len == 3 and isCaseEqual("xml", sub_type)) {
        return ContentType{ .text_xml = {} };
    }
    if (sub_type.len == 4 and isCaseEqual("html", sub_type)) {
        return ContentType{ .text_html = {} };
    }
    if (sub_type.len == 5 and isCaseEqual("plain", sub_type)) {
        return ContentType{ .text_plain = {} };
    }
    return null;
}
/// Byte-indexed lookup table that is `true` for each of
/// `( ) < > @ , ; : \ " / [ ] ? =` and `false` for every other byte.
/// `parseValue` accepts a backslash escape only when the escaped byte is
/// marked in this table.
const T_SPECIAL = blk: {
    const specials = "()<>@,;:\\\"/[]?=";
    var table = [_]bool{false} ** 256;
    var i: usize = 0;
    while (i < specials.len) : (i += 1) {
        table[specials[i]] = true;
    }
    break :blk table;
};
/// Decodes a single parameter value.
///
/// A value that does not begin with `"` is returned unchanged (a slice of
/// `value`, no allocation). A quoted value is unescaped into a buffer
/// obtained from `allocator`: `\x` becomes `x`, and decoding stops at the
/// first unescaped `"` or at the end of `value`, whichever comes first.
///
/// `value` must not be empty; its first byte is read unconditionally.
///
/// Returns `error.Invalid` when a backslash is the last byte, when a
/// backslash is followed by a byte not marked in `T_SPECIAL`, or when the
/// quoted content is empty. Allocation failures are propagated.
fn parseValue(allocator: Allocator, value: []const u8) ![]const u8 {
    if (value[0] != '"') return value;

    // Pass 1: validate the escapes and count how many bytes the decoded
    // value needs. The cursor starts at 1 to skip the opening quote.
    var cursor: usize = 1;
    var decoded_len: usize = 0;
    while (cursor < value.len) : (decoded_len += 1) {
        const byte = value[cursor];
        if (byte == '"') break;
        if (byte != '\\') {
            cursor += 1;
            continue;
        }
        // A backslash needs a following byte, and only special bytes may
        // be escaped.
        if (cursor + 1 == value.len) return error.Invalid;
        if (!T_SPECIAL[value[cursor + 1]]) return error.Invalid;
        cursor += 2;
    }

    if (decoded_len == 0) return error.Invalid;

    // Pass 2: copy into the exactly-sized buffer, dropping the backslashes.
    const decoded = try allocator.alloc(u8, decoded_len);
    cursor = 1;
    var out: usize = 0;
    while (out < decoded_len) : (out += 1) {
        const byte = value[cursor];
        if (byte == '"') break;
        const escaped = byte == '\\';
        const step: usize = if (escaped) 2 else 1;
        decoded[out] = if (escaped) value[cursor + 1] else byte;
        cursor += step;
    }
    return decoded;
}
/// Byte-indexed lookup table of the bytes accepted in a MIME type or
/// subtype: ASCII alphanumerics plus the punctuation
/// ``! # $ % & ' * + - . ^ _ ` | ~``
/// (the HTTP token code points,
/// https://mimesniff.spec.whatwg.org/#http-token-code-point).
const VALID_CODEPOINTS = blk: {
    var v: [256]bool = undefined;
    for (0..256) |i| {
        v[i] = std.ascii.isAlphanumeric(i);
    }
    // Backslash is intentionally absent: it is not an HTTP token code point,
    // so a type such as `te\xt/html` must be rejected.
    for ("!#$%&'*+-.^_`|~") |b| {
        v[b] = true;
    }
    break :blk v;
};
fn validType(value: []const u8) bool {
for (value) |b| {
if (VALID_CODEPOINTS[b] == false) {
return false;
}
} }
return true; return true;
} }
// https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
pub fn parse(s: []const u8) Self.MimeError!Self {
const ln = s.len;
if (ln == 0) return MimeError.Empty;
// limit input size
if (ln > 255) return MimeError.TooBig;
var res = Self{ .mtype = "", .msubtype = "" };
var r = Reader{ .data = s };
res.mtype = trim(r.until('/'));
if (res.mtype.len == 0) return MimeError.Invalid;
if (!valid(res.mtype)) return MimeError.InvalidChar;
if (!r.skip()) return MimeError.Invalid;
res.msubtype = trim(r.until(';'));
if (res.msubtype.len == 0) return MimeError.Invalid;
if (!valid(res.msubtype)) return MimeError.InvalidChar;
if (!r.skip()) return res;
res.params = trim(r.tail());
if (res.params.len == 0) return MimeError.Invalid;
// parse well known parameters.
// don't check invalid parameter format.
var rp = Reader{ .data = res.params };
while (true) {
const name = trim(rp.until('='));
if (!rp.skip()) return res;
const value = trim(rp.until(';'));
if (std.ascii.eqlIgnoreCase(name, "charset")) {
res.charset = value;
}
if (std.ascii.eqlIgnoreCase(name, "boundary")) {
res.boundary = value;
}
if (!rp.skip()) return res;
}
return res;
}
fn trim(s: []const u8) []const u8 { fn trim(s: []const u8) []const u8 {
return std.mem.trim(u8, s, &std.ascii.whitespace); return std.mem.trim(u8, s, &std.ascii.whitespace);
} }
test "parse valid" { fn trimLeft(s: []const u8) []const u8 {
for ([_][]const u8{ return std.mem.trimLeft(u8, s, &std.ascii.whitespace);
"text/html",
" \ttext/html",
"text \t/html",
"text/ \thtml",
"text/html \t",
}) |tc| {
const m = try Self.parse(tc);
try testing.expectEqualStrings("text", m.mtype);
try testing.expectEqualStrings("html", m.msubtype);
}
const m2 = try Self.parse("text/javascript1.5");
try testing.expectEqualStrings("text", m2.mtype);
try testing.expectEqualStrings("javascript1.5", m2.msubtype);
const m3 = try Self.parse("text/html; charset=utf-8");
try testing.expectEqualStrings("text", m3.mtype);
try testing.expectEqualStrings("html", m3.msubtype);
try testing.expectEqualStrings("charset=utf-8", m3.params);
try testing.expectEqualStrings("utf-8", m3.charset.?);
const m4 = try Self.parse("text/html; boundary=----");
try testing.expectEqualStrings("text", m4.mtype);
try testing.expectEqualStrings("html", m4.msubtype);
try testing.expectEqualStrings("boundary=----", m4.params);
try testing.expectEqualStrings("----", m4.boundary.?);
} }
test "parse invalid" { fn trimRight(s: []const u8) []const u8 {
for ([_][]const u8{ return std.mem.trimRight(u8, s, &std.ascii.whitespace);
}
/// Reports whether `value` equals the comptime string literal `target`,
/// ignoring ASCII case.
///
/// `target` is expected to be a string literal (a pointer to a
/// 0-terminated `u8` array). A `value` whose length differs from the
/// literal's length is never equal.
///
/// Fast path: both strings are reinterpreted as one unsigned integer of the
/// literal's width and compared in a single operation, which succeeds when
/// the case already matches exactly. Otherwise the comparison falls back to
/// `std.ascii.eqlIgnoreCase`.
fn isCaseEqual(comptime target: anytype, value: []const u8) bool {
    // `len` of the literal excludes the sentinel, which we don't care about.
    const byte_len = target.len;

    // Guards the fixed-width read below against short input and stops a
    // longer value with a matching prefix from being reported as equal.
    if (value.len != byte_len) {
        return false;
    }

    const T = std.meta.Int(.unsigned, byte_len * 8);
    const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*);
    if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) {
        return true;
    }
    return std.ascii.eqlIgnoreCase(value, target);
}
};
const testing = std.testing;
test "Mime: invalid " {
const invalids = [_][]const u8{
"", "",
"te xt/html;", "text",
"te@xt/html;", "text /html",
"text/ht@ml;", "text/ html",
"text/html;", "text / html",
"/text/html", "text/html other",
"/html", "text/html; x",
}) |tc| { "text/html; x=",
_ = Self.parse(tc) catch continue; "text/html; x= ",
try testing.expect(false); "text/html; = ",
"text/html;=",
"text/html; charset=\"\"",
"text/html; charset=\"",
"text/html; charset=\"\\",
"text/html; charset=\"\\a\"", // invalid to escape non special characters
};
for (invalids) |invalid| {
try testing.expectError(error.Invalid, Mime.parse(undefined, invalid));
} }
} }
// Compare type and subtype. test "Mime: parse common" {
pub fn eql(self: Self, b: Self) bool { try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml");
if (!std.mem.eql(u8, self.mtype, b.mtype)) return false; try expect(.{ .content_type = .{ .text_html = {} } }, "text/html");
return std.mem.eql(u8, self.msubtype, b.msubtype); try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain");
try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml;");
try expect(.{ .content_type = .{ .text_html = {} } }, "text/html;");
try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain;");
try expect(.{ .content_type = .{ .text_xml = {} } }, " \ttext/xml");
try expect(.{ .content_type = .{ .text_html = {} } }, "text/html ");
try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain \t\t");
try expect(.{ .content_type = .{ .text_xml = {} } }, "TEXT/xml");
try expect(.{ .content_type = .{ .text_html = {} } }, "text/Html");
try expect(.{ .content_type = .{ .text_plain = {} } }, "TEXT/PLAIN");
try expect(.{ .content_type = .{ .text_xml = {} } }, " TeXT/xml");
try expect(.{ .content_type = .{ .text_html = {} } }, "teXt/HtML ;");
try expect(.{ .content_type = .{ .text_plain = {} } }, "tExT/PlAiN;");
}
test "Mime: parse uncommon" {
    // Types without a dedicated variant are expected to come back as
    // `.other`, regardless of surrounding whitespace or a trailing `;`.
    const lowercase: Expectation = .{
        .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } },
    };
    const inputs = [_][]const u8{
        "text/javascript",
        "text/javascript;",
        " text/javascript\t ",
        " text/javascript\t ;",
    };
    for (inputs) |input| {
        try expect(lowercase, input);
    }

    // The original spelling of the type and subtype is expected to be kept.
    const mixed_case: Expectation = .{
        .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } },
    };
    try expect(mixed_case, "Text/Javascript");
}
test "Mime: parse charset" {
    // Every case uses text/xml; only the charset parameter's form varies:
    // bare, quoted, and quoted with escaped `\` and `"`.
    const Case = struct {
        input: []const u8,
        charset: []const u8,
        params: []const u8,
    };
    const cases = [_]Case{
        .{
            .input = "text/xml; charset=utf-8",
            .charset = "utf-8",
            .params = "charset=utf-8",
        },
        .{
            .input = "text/xml;charset=\"utf-8\"",
            .charset = "utf-8",
            .params = "charset=\"utf-8\"",
        },
        .{
            .input = "text/xml;charset=\"\\\\ \\\" \" ",
            .charset = "\\ \" ",
            .params = "charset=\"\\\\ \\\" \"",
        },
    };
    for (cases) |tc| {
        try expect(.{
            .content_type = .{ .text_xml = {} },
            .charset = tc.charset,
            .params = tc.params,
        }, tc.input);
    }
}
test "Mime: isHTML" {
    // Each input is parsed and `isHTML()` is compared with the expected flag.
    const Case = struct { expected: bool, input: []const u8 };
    const cases = [_]Case{
        .{ .expected = true, .input = "text/html" },
        .{ .expected = true, .input = "text/html;" },
        .{ .expected = true, .input = "text/html; charset=utf-8" },
        .{ .expected = false, .input = "text/htm" }, // htm not html
        .{ .expected = false, .input = "text/plain" },
        .{ .expected = false, .input = "over/9000" },
    };
    for (cases) |tc| {
        var mime = try Mime.parse(testing.allocator, tc.input);
        defer mime.deinit();
        try testing.expectEqual(tc.expected, mime.isHTML());
    }
}
/// Expected outcome of parsing one input, as checked by the `expect` test
/// helper. Unset fields default to "no parameters" and "no charset".
const Expectation = struct {
    // Expected variant; for `.other` the type and sub_type strings are
    // compared as well.
    content_type: Mime.ContentType,
    // Expected value of the parsed `params` field.
    params: []const u8 = "",
    // Expected charset, or null when the parsed charset must be null.
    charset: ?[]const u8 = null,
};
fn expect(expected: Expectation, input: []const u8) !void {
var actual = try Mime.parse(testing.allocator, input);
defer actual.deinit();
try testing.expectEqual(
std.meta.activeTag(expected.content_type),
std.meta.activeTag(actual.content_type),
);
switch (expected.content_type) {
.other => |e| {
const a = actual.content_type.other;
try testing.expectEqualStrings(e.type, a.type);
try testing.expectEqualStrings(e.sub_type, a.sub_type);
},
else => {}, // already asserted above
}
try testing.expectEqualStrings(expected.params, actual.params);
if (expected.charset) |ec| {
try testing.expectEqualStrings(ec, actual.charset.?);
} else {
try testing.expectEqual(null, actual.charset);
}
} }

View File

@@ -28,7 +28,7 @@ const DOMException = @import("../dom/exceptions.zig").DOMException;
const ProgressEvent = @import("progress_event.zig").ProgressEvent; const ProgressEvent = @import("progress_event.zig").ProgressEvent;
const XMLHttpRequestEventTarget = @import("event_target.zig").XMLHttpRequestEventTarget; const XMLHttpRequestEventTarget = @import("event_target.zig").XMLHttpRequestEventTarget;
const Mime = @import("../browser/mime.zig"); const Mime = @import("../browser/mime.zig").Mime;
const Loop = jsruntime.Loop; const Loop = jsruntime.Loop;
const Client = @import("asyncio").Client; const Client = @import("asyncio").Client;
@@ -141,7 +141,7 @@ pub const XMLHttpRequest = struct {
// https://lightpanda.slack.com/archives/C05TRU6RBM1/p1707819010681019 // https://lightpanda.slack.com/archives/C05TRU6RBM1/p1707819010681019
// response_override_mime_type: ?[]const u8 = null, // response_override_mime_type: ?[]const u8 = null,
response_mime: Mime = undefined, response_mime: ?Mime = null,
response_obj: ?ResponseObj = null, response_obj: ?ResponseObj = null,
send_flag: bool = false, send_flag: bool = false,
@@ -313,8 +313,11 @@ pub const XMLHttpRequest = struct {
if (self.response_obj) |v| v.deinit(); if (self.response_obj) |v| v.deinit();
self.response_obj = null; self.response_obj = null;
self.response_mime = Mime.Empty;
self.response_type = .Empty; self.response_type = .Empty;
if (self.response_mime) |*mime| {
mime.deinit();
self.response_mime = null;
}
// TODO should we clearRetainingCapacity instead? // TODO should we clearRetainingCapacity instead?
self.headers.clearAndFree(); self.headers.clearAndFree();
@@ -336,6 +339,9 @@ pub const XMLHttpRequest = struct {
self.reset(); self.reset();
self.headers.deinit(); self.headers.deinit();
self.response_headers.deinit(); self.response_headers.deinit();
if (self.response_mime) |*mime| {
mime.deinit();
}
self.proto.deinit(alloc); self.proto.deinit(alloc);
} }
@@ -544,7 +550,7 @@ pub const XMLHttpRequest = struct {
// extract a mime type from headers. // extract a mime type from headers.
const ct = self.response_headers.getFirstValue("Content-Type") orelse "text/xml"; const ct = self.response_headers.getFirstValue("Content-Type") orelse "text/xml";
self.response_mime = Mime.parse(ct) catch |e| return self.onErr(e); self.response_mime = Mime.parse(self.alloc, ct) catch |e| return self.onErr(e);
// TODO handle override mime type // TODO handle override mime type
@@ -820,13 +826,14 @@ pub const XMLHttpRequest = struct {
// TODO parse XML. // TODO parse XML.
// https://xhr.spec.whatwg.org/#response-object // https://xhr.spec.whatwg.org/#response-object
fn setResponseObjDocument(self: *XMLHttpRequest, alloc: std.mem.Allocator) void { fn setResponseObjDocument(self: *XMLHttpRequest, alloc: std.mem.Allocator) void {
const isHTML = self.response_mime.eql(Mime.HTML); const response_mime = &self.response_mime.?;
const isHTML = response_mime.isHTML();
// TODO If finalMIME is not an HTML MIME type or an XML MIME type, then // TODO If finalMIME is not an HTML MIME type or an XML MIME type, then
// return. // return.
if (!isHTML) return; if (!isHTML) return;
const ccharset = alloc.dupeZ(u8, self.response_mime.charset orelse "utf-8") catch { const ccharset = alloc.dupeZ(u8, response_mime.charset orelse "utf-8") catch {
self.response_obj = .{ .Failure = true }; self.response_obj = .{ .Failure = true };
return; return;
}; };