Improve performance & compliance of MIME parsing

The common cases (text/html, text/xml and text/plain) parse about 2x faster.
Other cases are about 30% faster.

Support quoted attributes, i.e. charset="utf-8" & valid escape sequences. This
potentially requires allocation, thus Mime.parse now takes an allocator.

Stricter validation around type/subtype based on RFC.

More tests.

Replace Mime.eql with isHTML(). Equality is complicated and was previously
incorrect (it was case sensitive, but it should not be). Since we currently only
need isHTML-like behavior, this adds a (faster) method specifically for that.
This commit is contained in:
Karl Seguin
2025-02-10 11:07:55 +08:00
parent 0c1a486ed9
commit 6863f3227f
4 changed files with 380 additions and 139 deletions

View File

@@ -24,7 +24,7 @@ const Types = @import("root").Types;
const parser = @import("netsurf");
const Loader = @import("loader.zig").Loader;
const Dump = @import("dump.zig");
const Mime = @import("mime.zig");
const Mime = @import("mime.zig").Mime;
const jsruntime = @import("jsruntime");
const Loop = jsruntime.Loop;
@@ -376,7 +376,9 @@ pub const Page = struct {
log.debug("header content-type: {s}", .{ct.?});
const mime = try Mime.parse(ct.?);
if (mime.eql(Mime.HTML)) {
defer mime.deinit();
if (mime.isHTML()) {
try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", auxData);
} else {
log.info("non-HTML document: {s}", .{ct.?});

View File

@@ -17,143 +17,375 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const testing = std.testing;
const Allocator = std.mem.Allocator;
const Reader = @import("../str/parser.zig").Reader;
pub const Mime = struct {
content_type: ContentType,
params: []const u8 = "",
charset: ?[]const u8 = null,
arena: std.heap.ArenaAllocator,
const Self = @This();
const MimeError = error{
Empty,
TooBig,
Invalid,
InvalidChar,
};
mtype: []const u8,
msubtype: []const u8,
params: []const u8 = "",
charset: ?[]const u8 = null,
boundary: ?[]const u8 = null,
pub const Empty = Self{ .mtype = "", .msubtype = "" };
pub const HTML = Self{ .mtype = "text", .msubtype = "html" };
pub const Javascript = Self{ .mtype = "application", .msubtype = "javascript" };
// https://mimesniff.spec.whatwg.org/#http-token-code-point
fn isHTTPCodePoint(c: u8) bool {
return switch (c) {
'!', '#', '$', '%', '&', '\'', '*', '+', '-', '.', '^' => return true,
'_', '`', '|', '~' => return true,
else => std.ascii.isAlphanumeric(c),
/// Tag type of `ContentType`: one tag for each of the three text types that
/// have a dedicated variant, plus `other` for every remaining type/subtype.
pub const ContentTypeEnum = enum {
text_xml,
text_html,
text_plain,
// any type/subtype pair without a dedicated tag
other,
};
}
fn valid(s: []const u8) bool {
const ln = s.len;
var i: usize = 0;
while (i < ln) {
if (!isHTTPCodePoint(s[i])) return false;
i += 1;
/// Parsed type/subtype of a MIME value, tagged by `ContentTypeEnum`.
/// The three dedicated variants carry no payload; `other` carries the type
/// and subtype strings.
pub const ContentType = union(ContentTypeEnum) {
text_xml: void,
text_html: void,
text_plain: void,
// type and subtype text for anything without a dedicated variant
other: struct { type: []const u8, sub_type: []const u8 },
};
/// Parses a Content-Type style value such as `text/html; charset="utf-8"`.
///
/// `allocator` backs an arena that is only used when a quoted charset value
/// has to be unescaped; release it with `deinit`.
///
/// The returned `params`, an unquoted `charset`, and the strings of a
/// `.other` content type are slices of `input`, so `input` must outlive the
/// returned Mime.
///
/// Errors:
///   - `error.TooBig` when `input` is longer than 255 bytes.
///   - `error.Invalid` when the type/subtype or a parameter is malformed.
///   - any allocation error raised while unescaping a quoted charset.
pub fn parse(allocator: Allocator, input: []const u8) !Mime {
    if (input.len > 255) {
        return error.TooBig;
    }

    var arena = std.heap.ArenaAllocator.init(allocator);
    errdefer arena.deinit();

    // Never reassigned, so it must be `const` (an unmutated `var` is a compile error).
    const trimmed = trim(input);

    const content_type, const type_len = try parseContentType(trimmed);
    if (type_len >= trimmed.len) {
        // Nothing follows the type/subtype (or only a bare ';').
        return .{ .arena = arena, .content_type = content_type };
    }

    const params = trimLeft(trimmed[type_len..]);

    var charset: ?[]const u8 = null;

    var it = std.mem.splitScalar(u8, params, ';');
    while (it.next()) |attr| {
        // The HTTP parameter grammar allows empty segments, e.g. the one
        // produced by a trailing ';' in "text/html; charset=utf-8;".
        if (trim(attr).len == 0) {
            continue;
        }

        const i = std.mem.indexOfScalarPos(u8, attr, 0, '=') orelse return error.Invalid;
        const name = trimLeft(attr[0..i]);

        const value = trimRight(attr[i + 1 ..]);
        if (value.len == 0) {
            return error.Invalid;
        }

        // Dispatch on length first so isCaseEqual only ever sees a name of
        // exactly the expected size.
        switch (name.len) {
            7 => if (isCaseEqual("charset", name)) {
                charset = try parseValue(arena.allocator(), value);
            },
            else => {},
        }
    }

    return .{
        .arena = arena,
        .params = params,
        .charset = charset,
        .content_type = content_type,
    };
}
/// Releases the arena owned by this Mime, including any charset string that
/// `parse` allocated from it. The Mime must not be used afterwards.
pub fn deinit(self: *Mime) void {
self.arena.deinit();
}
/// Reports whether the parsed content type is `text/html`.
pub fn isHTML(self: *const Mime) bool {
    return switch (self.content_type) {
        .text_html => true,
        else => false,
    };
}
/// Splits `value` into its type and subtype and classifies the pair.
///
/// Returns the content type together with the offset just past the ';' that
/// ends the subtype. When there is no ';' that offset is `value.len + 1`, so
/// callers must bounds-check it before slicing.
///
/// Fails with `error.Invalid` when there is no '/', or when a type that is not
/// one of the common text types is empty or contains a disallowed character.
fn parseContentType(value: []const u8) !struct { ContentType, usize } {
    const slash = std.mem.indexOfScalar(u8, value, '/') orelse return error.Invalid;
    const stop = std.mem.indexOfScalarPos(u8, value, slash, ';') orelse value.len;
    const consumed = stop + 1;

    const type_part = value[0..slash];
    // Only the subtype is trimmed: whitespace may precede the ';'.
    const sub_part = trimRight(value[slash + 1 .. stop]);

    // Fast path: the common text types need no further validation.
    if (parseCommonContentType(type_part, sub_part)) |common| {
        return .{ common, consumed };
    }

    if (type_part.len == 0 or !validType(type_part)) {
        return error.Invalid;
    }
    if (sub_part.len == 0 or !validType(sub_part)) {
        return error.Invalid;
    }

    const other = ContentType{ .other = .{
        .type = type_part,
        .sub_type = sub_part,
    } };
    return .{ other, consumed };
}
/// Recognises text/xml, text/html and text/plain (case-insensitively).
/// Returns null for anything else.
fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType {
    // Each length test runs before isCaseEqual, which expects an input of
    // exactly the target's length.
    if (main_type.len != 4 or !isCaseEqual("text", main_type)) {
        return null;
    }
    if (sub_type.len == 3 and isCaseEqual("xml", sub_type)) {
        return .{ .text_xml = {} };
    }
    if (sub_type.len == 4 and isCaseEqual("html", sub_type)) {
        return .{ .text_html = {} };
    }
    if (sub_type.len == 5 and isCaseEqual("plain", sub_type)) {
        return .{ .text_plain = {} };
    }
    return null;
}
// Lookup table indexed by byte value: true for the characters that
// parseValue accepts after a backslash inside a quoted string.
const T_SPECIAL = blk: {
    const specials = "()<>@,;:\\\"/[]?=";
    var table = [_]bool{false} ** 256;
    var i: usize = 0;
    while (i < specials.len) : (i += 1) {
        table[specials[i]] = true;
    }
    break :blk table;
};
/// Returns the value of a parameter, unescaping it when it is quoted.
///
/// `value` must not be empty. An unquoted value is returned as-is (no
/// allocation). A quoted value is copied into memory from `allocator` with
/// the surrounding quote removed and backslash escapes resolved; reading
/// stops at the first unescaped '"' or at the end of `value`.
///
/// Fails with `error.Invalid` when a backslash is the final byte, when a
/// backslash precedes a byte that is not in T_SPECIAL, or when the quoted
/// content is empty.
fn parseValue(allocator: Allocator, value: []const u8) ![]const u8 {
    if (value[0] != '"') {
        return value;
    }

    const final_index = value.len - 1;

    // Pass 1: validate escapes and count how many bytes the result needs.
    // The cursor starts at 1 to skip the opening quote.
    var out_len: usize = 0;
    var cursor: usize = 1;
    while (cursor < value.len) : (out_len += 1) {
        const c = value[cursor];
        if (c == '"') break;
        if (c != '\\') {
            cursor += 1;
            continue;
        }
        if (cursor == final_index) {
            return error.Invalid;
        }
        if (!T_SPECIAL[value[cursor + 1]]) {
            return error.Invalid;
        }
        cursor += 2;
    }

    if (out_len == 0) {
        return error.Invalid;
    }

    // Pass 2: copy, dropping the backslash of every escape. Pass 1 already
    // guaranteed that each backslash is followed by another byte.
    const unescaped = try allocator.alloc(u8, out_len);
    var src: usize = 1;
    for (unescaped) |*dst| {
        if (value[src] == '\\') {
            src += 1;
        }
        dst.* = value[src];
        src += 1;
    }
    return unescaped;
}
// Lookup table indexed by byte value: true for the bytes allowed in a MIME
// type or subtype, i.e. ASCII alphanumerics plus the token symbols
// ! # $ % & ' * + - . ^ _ ` | ~
// Backslash is deliberately absent: it is not an HTTP token code point.
const VALID_CODEPOINTS = blk: {
    var v: [256]bool = undefined;
    for (0..256) |i| {
        v[i] = std.ascii.isAlphanumeric(i);
    }
    for ("!#$%&'*+-.^_`|~") |b| {
        v[b] = true;
    }
    break :blk v;
};
/// Reports whether every byte of `value` is allowed by VALID_CODEPOINTS.
/// An empty `value` is reported as valid; callers check the length themselves.
fn validType(value: []const u8) bool {
    var i: usize = 0;
    while (i < value.len) : (i += 1) {
        if (!VALID_CODEPOINTS[value[i]]) {
            return false;
        }
    }
    return true;
}
// https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
/// Parses `s` into type, subtype and raw parameters, and extracts the
/// `charset` and `boundary` parameters when present. No allocation is
/// performed: every returned field is a slice of `s`.
///
/// Errors: `Empty` for a zero-length input, `TooBig` above 255 bytes,
/// `Invalid` for a missing type, subtype, '/' or an empty parameter list
/// after ';', and `InvalidChar` when type or subtype contains a byte rejected
/// by `valid`.
///
/// NOTE(review): presumably `Reader.until(c)` returns the text up to the next
/// `c` and `Reader.skip()` steps over it, returning false at end of input —
/// confirm against ../str/parser.zig.
pub fn parse(s: []const u8) Self.MimeError!Self {
const ln = s.len;
if (ln == 0) return MimeError.Empty;
// limit input size
if (ln > 255) return MimeError.TooBig;
var res = Self{ .mtype = "", .msubtype = "" };
var r = Reader{ .data = s };
res.mtype = trim(r.until('/'));
if (res.mtype.len == 0) return MimeError.Invalid;
if (!valid(res.mtype)) return MimeError.InvalidChar;
if (!r.skip()) return MimeError.Invalid;
res.msubtype = trim(r.until(';'));
if (res.msubtype.len == 0) return MimeError.Invalid;
if (!valid(res.msubtype)) return MimeError.InvalidChar;
// no ';' after the subtype: there are no parameters to read
if (!r.skip()) return res;
res.params = trim(r.tail());
if (res.params.len == 0) return MimeError.Invalid;
// parse well known parameters.
// don't check invalid parameter format.
var rp = Reader{ .data = res.params };
while (true) {
const name = trim(rp.until('='));
if (!rp.skip()) return res;
const value = trim(rp.until(';'));
// parameter names are matched case-insensitively
if (std.ascii.eqlIgnoreCase(name, "charset")) {
res.charset = value;
}
if (std.ascii.eqlIgnoreCase(name, "boundary")) {
res.boundary = value;
}
if (!rp.skip()) return res;
}
// never reached: the loop above only exits through `return`
return res;
}
fn trim(s: []const u8) []const u8 {
/// Returns `s` without leading and trailing ASCII whitespace.
/// The result is a sub-slice of `s`.
fn trim(s: []const u8) []const u8 {
return std.mem.trim(u8, s, &std.ascii.whitespace);
}
test "parse valid" {
for ([_][]const u8{
"text/html",
" \ttext/html",
"text \t/html",
"text/ \thtml",
"text/html \t",
}) |tc| {
const m = try Self.parse(tc);
try testing.expectEqualStrings("text", m.mtype);
try testing.expectEqualStrings("html", m.msubtype);
}
const m2 = try Self.parse("text/javascript1.5");
try testing.expectEqualStrings("text", m2.mtype);
try testing.expectEqualStrings("javascript1.5", m2.msubtype);
const m3 = try Self.parse("text/html; charset=utf-8");
try testing.expectEqualStrings("text", m3.mtype);
try testing.expectEqualStrings("html", m3.msubtype);
try testing.expectEqualStrings("charset=utf-8", m3.params);
try testing.expectEqualStrings("utf-8", m3.charset.?);
/// Returns `s` without leading ASCII whitespace.
/// The result is a sub-slice of `s`.
fn trimLeft(s: []const u8) []const u8 {
return std.mem.trimLeft(u8, s, &std.ascii.whitespace);
}
const m4 = try Self.parse("text/html; boundary=----");
try testing.expectEqualStrings("text", m4.mtype);
try testing.expectEqualStrings("html", m4.msubtype);
try testing.expectEqualStrings("boundary=----", m4.params);
try testing.expectEqualStrings("----", m4.boundary.?);
}
/// Returns `s` without trailing ASCII whitespace.
/// The result is a sub-slice of `s`.
fn trimRight(s: []const u8) []const u8 {
return std.mem.trimRight(u8, s, &std.ascii.whitespace);
}
test "parse invalid" {
for ([_][]const u8{
/// Case-insensitive comparison of `value` against the comptime string
/// literal `target`.
///
/// Fast path: both strings are reinterpreted as one unsigned integer of
/// `target.len` bytes and compared exactly, which settles the usual case of
/// an already-lowercase input in a single comparison. Only on a mismatch does
/// it fall back to the byte-wise `std.ascii.eqlIgnoreCase`.
///
/// Inputs of a different length are rejected up front, so the fixed-size
/// slice below can never read out of bounds or match on a mere prefix.
fn isCaseEqual(comptime target: anytype, value: []const u8) bool {
    // `target` is a pointer to a sentinel-terminated array; `.len` excludes
    // the sentinel, which we don't care about.
    const byte_len = target.len;
    if (value.len != byte_len) {
        return false;
    }

    const T = std.meta.Int(.unsigned, byte_len * 8);
    const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*);
    if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) {
        return true;
    }
    return std.ascii.eqlIgnoreCase(value, target);
}
};
const testing = std.testing;
test "Mime: invalid " {
const invalids = [_][]const u8{
"",
"te xt/html;",
"te@xt/html;",
"text/ht@ml;",
"text/html;",
"/text/html",
"/html",
}) |tc| {
_ = Self.parse(tc) catch continue;
try testing.expect(false);
"text",
"text /html",
"text/ html",
"text / html",
"text/html other",
"text/html; x",
"text/html; x=",
"text/html; x= ",
"text/html; = ",
"text/html;=",
"text/html; charset=\"\"",
"text/html; charset=\"",
"text/html; charset=\"\\",
"text/html; charset=\"\\a\"", // invalid to escape non special characters
};
for (invalids) |invalid| {
try testing.expectError(error.Invalid, Mime.parse(undefined, invalid));
}
}
// Compare type and subtype.
pub fn eql(self: Self, b: Self) bool {
if (!std.mem.eql(u8, self.mtype, b.mtype)) return false;
return std.mem.eql(u8, self.msubtype, b.msubtype);
// The three dedicated content types must be recognised regardless of case,
// surrounding whitespace or a trailing ';'.
test "Mime: parse common" {
    const Case = struct { content_type: Mime.ContentType, input: []const u8 };

    const xml = Mime.ContentType{ .text_xml = {} };
    const html = Mime.ContentType{ .text_html = {} };
    const plain = Mime.ContentType{ .text_plain = {} };

    const cases = [_]Case{
        .{ .content_type = xml, .input = "text/xml" },
        .{ .content_type = html, .input = "text/html" },
        .{ .content_type = plain, .input = "text/plain" },

        .{ .content_type = xml, .input = "text/xml;" },
        .{ .content_type = html, .input = "text/html;" },
        .{ .content_type = plain, .input = "text/plain;" },

        .{ .content_type = xml, .input = " \ttext/xml" },
        .{ .content_type = html, .input = "text/html " },
        .{ .content_type = plain, .input = "text/plain \t\t" },

        .{ .content_type = xml, .input = "TEXT/xml" },
        .{ .content_type = html, .input = "text/Html" },
        .{ .content_type = plain, .input = "TEXT/PLAIN" },

        .{ .content_type = xml, .input = " TeXT/xml" },
        .{ .content_type = html, .input = "teXt/HtML ;" },
        .{ .content_type = plain, .input = "tExT/PlAiN;" },
    };

    for (cases) |case| {
        try expect(.{ .content_type = case.content_type }, case.input);
    }
}
// Types without a dedicated variant land in `.other`, with the type and
// subtype text kept exactly as written in the input.
test "Mime: parse uncommon" {
    const text_javascript = Expectation{
        .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } },
    };
    const javascript_inputs = [_][]const u8{
        "text/javascript",
        "text/javascript;",
        " text/javascript\t ",
        " text/javascript\t ;",
    };
    for (javascript_inputs) |input| {
        try expect(text_javascript, input);
    }

    const mixed_case = Expectation{
        .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } },
    };
    try expect(mixed_case, "Text/Javascript");
}
// charset is extracted from the parameters: plain, quoted, and quoted with
// escape sequences. `params` always keeps the raw parameter text.
test "Mime: parse charset" {
    const Case = struct { expected: Expectation, input: []const u8 };

    const cases = [_]Case{
        .{
            .expected = .{
                .content_type = .{ .text_xml = {} },
                .charset = "utf-8",
                .params = "charset=utf-8",
            },
            .input = "text/xml; charset=utf-8",
        },
        .{
            .expected = .{
                .content_type = .{ .text_xml = {} },
                .charset = "utf-8",
                .params = "charset=\"utf-8\"",
            },
            .input = "text/xml;charset=\"utf-8\"",
        },
        .{
            .expected = .{
                .content_type = .{ .text_xml = {} },
                .charset = "\\ \" ",
                .params = "charset=\"\\\\ \\\" \"",
            },
            .input = "text/xml;charset=\"\\\\ \\\" \" ",
        },
    };

    for (cases) |case| {
        try expect(case.expected, case.input);
    }
}
// isHTML is true only for text/html, with or without parameters.
test "Mime: isHTML" {
    const Case = struct { expected: bool, input: []const u8 };

    const cases = [_]Case{
        .{ .expected = true, .input = "text/html" },
        .{ .expected = true, .input = "text/html;" },
        .{ .expected = true, .input = "text/html; charset=utf-8" },
        .{ .expected = false, .input = "text/htm" }, // htm not html
        .{ .expected = false, .input = "text/plain" },
        .{ .expected = false, .input = "over/9000" },
    };

    for (cases) |case| {
        var mime = try Mime.parse(testing.allocator, case.input);
        defer mime.deinit();
        try testing.expectEqual(case.expected, mime.isHTML());
    }
}
// The fields of a parsed Mime that the `expect` test helper compares.
// `params` and `charset` default to what a value without parameters yields.
const Expectation = struct {
content_type: Mime.ContentType,
params: []const u8 = "",
charset: ?[]const u8 = null,
};
// Test helper: parses `input` with the testing allocator and checks the
// content type, the raw params and the charset against `expected`.
fn expect(expected: Expectation, input: []const u8) !void {
    var parsed = try Mime.parse(testing.allocator, input);
    defer parsed.deinit();

    const want_tag = std.meta.activeTag(expected.content_type);
    const got_tag = std.meta.activeTag(parsed.content_type);
    try testing.expectEqual(want_tag, got_tag);

    // Only `.other` carries a payload; for the rest the tag check suffices.
    if (want_tag == .other) {
        const want = expected.content_type.other;
        const got = parsed.content_type.other;
        try testing.expectEqualStrings(want.type, got.type);
        try testing.expectEqualStrings(want.sub_type, got.sub_type);
    }

    try testing.expectEqualStrings(expected.params, parsed.params);

    const want_charset = expected.charset orelse {
        try testing.expectEqual(null, parsed.charset);
        return;
    };
    try testing.expectEqualStrings(want_charset, parsed.charset.?);
}

View File

@@ -28,7 +28,7 @@ const DOMException = @import("../dom/exceptions.zig").DOMException;
const ProgressEvent = @import("progress_event.zig").ProgressEvent;
const XMLHttpRequestEventTarget = @import("event_target.zig").XMLHttpRequestEventTarget;
const Mime = @import("../browser/mime.zig");
const Mime = @import("../browser/mime.zig").Mime;
const Loop = jsruntime.Loop;
const Client = @import("asyncio").Client;
@@ -141,7 +141,7 @@ pub const XMLHttpRequest = struct {
// https://lightpanda.slack.com/archives/C05TRU6RBM1/p1707819010681019
// response_override_mime_type: ?[]const u8 = null,
response_mime: Mime = undefined,
response_mime: ?Mime = null,
response_obj: ?ResponseObj = null,
send_flag: bool = false,
@@ -313,8 +313,11 @@ pub const XMLHttpRequest = struct {
if (self.response_obj) |v| v.deinit();
self.response_obj = null;
self.response_mime = Mime.Empty;
self.response_type = .Empty;
if (self.response_mime) |*mime| {
mime.deinit();
self.response_mime = null;
}
// TODO should we clearRetainingCapacity instead?
self.headers.clearAndFree();
@@ -336,6 +339,9 @@ pub const XMLHttpRequest = struct {
self.reset();
self.headers.deinit();
self.response_headers.deinit();
if (self.response_mime) |*mime| {
mime.deinit();
}
self.proto.deinit(alloc);
}
@@ -544,7 +550,7 @@ pub const XMLHttpRequest = struct {
// extract a mime type from headers.
const ct = self.response_headers.getFirstValue("Content-Type") orelse "text/xml";
self.response_mime = Mime.parse(ct) catch |e| return self.onErr(e);
self.response_mime = Mime.parse(self.alloc, ct) catch |e| return self.onErr(e);
// TODO handle override mime type
@@ -820,13 +826,14 @@ pub const XMLHttpRequest = struct {
// TODO parse XML.
// https://xhr.spec.whatwg.org/#response-object
fn setResponseObjDocument(self: *XMLHttpRequest, alloc: std.mem.Allocator) void {
const isHTML = self.response_mime.eql(Mime.HTML);
const response_mime = &self.response_mime.?;
const isHTML = response_mime.isHTML();
// TODO If finalMIME is not an HTML MIME type or an XML MIME type, then
// return.
if (!isHTML) return;
const ccharset = alloc.dupeZ(u8, self.response_mime.charset orelse "utf-8") catch {
const ccharset = alloc.dupeZ(u8, response_mime.charset orelse "utf-8") catch {
self.response_obj = .{ .Failure = true };
return;
};