Merge pull request #413 from karlseguin/mime

Improve performance & compliance of MIME parsing
2026-03-22 20:54:43 +00:00 · 2025-02-10 08:59:00 +01:00
parent c78b582d71 4ab02fab1c
commit 842760255b
4 changed files with 381 additions and 140 deletions
--- a/src/browser/browser.zig
+++ b/src/browser/browser.zig
@@ -24,7 +24,7 @@ const Types = @import("root").Types;
 const parser = @import("netsurf");
 const Loader = @import("loader.zig").Loader;
 const Dump = @import("dump.zig");
-const Mime = @import("mime.zig");
+const Mime = @import("mime.zig").Mime;
 const jsruntime = @import("jsruntime");
 const Loop = jsruntime.Loop;
@@ -375,8 +375,10 @@ pub const Page = struct {
        defer alloc.free(ct.?);
        log.debug("header content-type: {s}", .{ct.?});
-        const mime = try Mime.parse(ct.?);
+        var mime = try Mime.parse(alloc, ct.?);
-        if (mime.eql(Mime.HTML)) {
+        defer mime.deinit();
        if (mime.isHTML()) {
            try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", auxData);
        } else {
            log.info("non-HTML document: {s}", .{ct.?});
--- a/src/browser/mime.zig
+++ b/src/browser/mime.zig
@@ -17,143 +17,375 @@
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 const std = @import("std");
-const testing = std.testing;
+const Allocator = std.mem.Allocator;
-const Reader = @import("../str/parser.zig").Reader;
+pub const Mime = struct {
    content_type: ContentType,
    params: []const u8 = "",
    charset: ?[]const u8 = null,
    arena: std.heap.ArenaAllocator,
-const Self = @This();
+    pub const ContentTypeEnum = enum {
-
+        text_xml,
-const MimeError = error{
+        text_html,
-    Empty,
+        text_plain,
-    TooBig,
+        other,
    Invalid,
    InvalidChar,
    };
-mtype: []const u8,
+    pub const ContentType = union(ContentTypeEnum) {
-msubtype: []const u8,
+        text_xml: void,
-params: []const u8 = "",
+        text_html: void,
        text_plain: void,
        other: struct { type: []const u8, sub_type: []const u8 },
    };
-charset: ?[]const u8 = null,
+    pub fn parse(allocator: Allocator, input: []const u8) !Mime {
-boundary: ?[]const u8 = null,
+        if (input.len > 255) {
            return error.TooBig;
        }
-pub const Empty = Self{ .mtype = "", .msubtype = "" };
+        var arena = std.heap.ArenaAllocator.init(allocator);
-pub const HTML = Self{ .mtype = "text", .msubtype = "html" };
+        errdefer arena.deinit();
 pub const Javascript = Self{ .mtype = "application", .msubtype = "javascript" };
-// https://mimesniff.spec.whatwg.org/#http-token-code-point
+        var trimmed = trim(input);
-fn isHTTPCodePoint(c: u8) bool {
+
-    return switch (c) {
+        const content_type, const type_len = try parseContentType(trimmed);
-        '!', '#', '$', '%', '&', '\'', '*', '+', '-', '.', '^' => return true,
+        if (type_len >= trimmed.len) {
-        '_', '`', '|', '~' => return true,
+            return .{ .arena = arena, .content_type = content_type };
-        else => std.ascii.isAlphanumeric(c),
+        }
        const params = trimLeft(trimmed[type_len..]);
        var charset: ?[]const u8 = null;
        var it = std.mem.splitScalar(u8, params, ';');
        while (it.next()) |attr| {
            const i = std.mem.indexOfScalarPos(u8, attr, 0, '=') orelse return error.Invalid;
            const name = trimLeft(attr[0..i]);
            const value = trimRight(attr[i + 1 ..]);
            if (value.len == 0) {
                return error.Invalid;
            }
            switch (name.len) {
                7 => if (isCaseEqual("charset", name)) {
                    charset = try parseValue(arena.allocator(), value);
                },
                else => {},
            }
        }
        return .{
            .arena = arena,
            .params = params,
            .charset = charset,
            .content_type = content_type,
        };
    }
-fn valid(s: []const u8) bool {
+    pub fn deinit(self: *Mime) void {
-    const ln = s.len;
+        self.arena.deinit();
-    var i: usize = 0;
+    }
-    while (i < ln) {
+
-        if (!isHTTPCodePoint(s[i])) return false;
+    pub fn isHTML(self: *const Mime) bool {
-        i += 1;
+        return self.content_type == .text_html;
    }
    /// Splits the leading `type/subtype` portion off `value`.
    ///
    /// Returns a tuple of the parsed content type and an offset into `value`:
    /// one past the ';' that terminates the type, or `value.len + 1` when
    /// there is no ';'. The offset may therefore exceed `value.len`, so
    /// callers must bounds-check it before slicing.
    ///
    /// The well-known `text/*` types are matched first, ignoring case. Any
    /// other type is returned as `.other`, whose slices point into `value`
    /// (no copy is made). Fails with `error.Invalid` when there is no '/',
    /// when either half is empty, or when either half fails `validType`.
    /// Only the right side of the sub-type is trimmed, so whitespace around
    /// the '/' is rejected by `validType`.
    fn parseContentType(value: []const u8) !struct { ContentType, usize } {
        const slash = std.mem.indexOfScalar(u8, value, '/') orelse return error.Invalid;
        const stop = std.mem.indexOfScalarPos(u8, value, slash, ';') orelse value.len;

        const main_type = value[0..slash];
        const sub_type = trimRight(value[slash + 1 .. stop]);

        // Where the parameters (if any) would start: just past the ';'.
        const params_start = stop + 1;

        if (parseCommonContentType(main_type, sub_type)) |common| {
            return .{ common, params_start };
        }

        if (main_type.len == 0 or !validType(main_type)) {
            return error.Invalid;
        }
        if (sub_type.len == 0 or !validType(sub_type)) {
            return error.Invalid;
        }

        return .{
            ContentType{ .other = .{ .type = main_type, .sub_type = sub_type } },
            params_start,
        };
    }
    /// Recognizes the content types that have a dedicated `ContentType` tag:
    /// `text/xml`, `text/html` and `text/plain`, compared without regard to
    /// ASCII case. Returns null for everything else.
    ///
    /// Lengths are checked before every `isCaseEqual` call, so the helper only
    /// ever sees a candidate of exactly the target's length.
    fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType {
        if (main_type.len != 4 or !isCaseEqual("text", main_type)) {
            return null;
        }

        return switch (sub_type.len) {
            3 => if (isCaseEqual("xml", sub_type)) ContentType{ .text_xml = {} } else null,
            4 => if (isCaseEqual("html", sub_type)) ContentType{ .text_html = {} } else null,
            5 => if (isCaseEqual("plain", sub_type)) ContentType{ .text_plain = {} } else null,
            else => null,
        };
    }
    /// Comptime lookup table indexed by byte value. An entry is true for the
    /// bytes that `parseValue` accepts after a backslash inside a quoted
    /// parameter value; escaping any other byte is rejected there.
    const T_SPECIAL = blk: {
        const specials = "()<>@,;:\\\"/[]?=";
        var table = [_]bool{false} ** 256;
        for (specials) |c| {
            table[c] = true;
        }
        break :blk table;
    };
    /// Returns the value of a parameter, unquoting it when necessary.
    ///
    /// `value` must be non-empty (its first byte is read unconditionally).
    /// A value that does not start with '"' is returned as-is, without
    /// allocating. A quoted value is unescaped into a new buffer obtained
    /// from `allocator`: the content runs from just after the opening quote
    /// to the next unescaped '"' or, if there is none, to the end of `value`.
    ///
    /// Fails with `error.Invalid` when the quoted content is empty, when a
    /// backslash is the last byte, or when a backslash is followed by a byte
    /// that is not in `T_SPECIAL`. Allocation failure is propagated.
    fn parseValue(allocator: Allocator, value: []const u8) ![]const u8 {
        if (value[0] != '"') return value;

        // Pass 1: validate every escape and count the bytes of the unescaped
        // result. `pos` starts at 1 to skip the opening quote.
        var out_len: usize = 0;
        var pos: usize = 1;
        while (pos < value.len) : (out_len += 1) {
            const c = value[pos];
            if (c == '"') break;
            if (c != '\\') {
                pos += 1;
                continue;
            }
            // A trailing backslash has nothing to escape.
            if (pos + 1 == value.len) return error.Invalid;
            if (!T_SPECIAL[value[pos + 1]]) return error.Invalid;
            pos += 2;
        }

        if (out_len == 0) return error.Invalid;

        // Pass 2: copy, dropping the backslash of each escape. Pass 1 counted
        // exactly the units before the closing quote, so no quote or
        // out-of-range read can occur within `out_len` iterations.
        const owned = try allocator.alloc(u8, out_len);
        pos = 1;
        for (owned) |*dst| {
            if (value[pos] == '\\') {
                dst.* = value[pos + 1];
                pos += 2;
            } else {
                dst.* = value[pos];
                pos += 1;
            }
        }
        return owned;
    }
    /// Comptime lookup table indexed by byte value. An entry is true for the
    /// bytes allowed in a MIME type or sub-type (HTTP token code points):
    /// ASCII alphanumerics plus ! # $ % & ' * + - . ^ _ ` | ~
    ///
    /// Backslash is deliberately absent: it is not a token code point, so a
    /// type such as `text/ht\ml` must be rejected.
    /// https://mimesniff.spec.whatwg.org/#http-token-code-point
    const VALID_CODEPOINTS = blk: {
        var v: [256]bool = undefined;
        for (0..256) |i| {
            v[i] = std.ascii.isAlphanumeric(i);
        }
        for ("!#$%&'*+-.^_`|~") |b| {
            v[b] = true;
        }
        break :blk v;
    };
    /// Reports whether every byte of `value` is marked valid in
    /// `VALID_CODEPOINTS`. An empty `value` yields true; callers check for
    /// emptiness separately.
    fn validType(value: []const u8) bool {
        for (value) |byte| {
            if (!VALID_CODEPOINTS[byte]) return false;
        }
        return true;
    }
 // https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
 pub fn parse(s: []const u8) Self.MimeError!Self {
    const ln = s.len;
    if (ln == 0) return MimeError.Empty;
    // limit input size
    if (ln > 255) return MimeError.TooBig;
    var res = Self{ .mtype = "", .msubtype = "" };
    var r = Reader{ .data = s };
    res.mtype = trim(r.until('/'));
    if (res.mtype.len == 0) return MimeError.Invalid;
    if (!valid(res.mtype)) return MimeError.InvalidChar;
    if (!r.skip()) return MimeError.Invalid;
    res.msubtype = trim(r.until(';'));
    if (res.msubtype.len == 0) return MimeError.Invalid;
    if (!valid(res.msubtype)) return MimeError.InvalidChar;
    if (!r.skip()) return res;
    res.params = trim(r.tail());
    if (res.params.len == 0) return MimeError.Invalid;
    // parse well known parameters.
    // don't check invalid parameter format.
    var rp = Reader{ .data = res.params };
    while (true) {
        const name = trim(rp.until('='));
        if (!rp.skip()) return res;
        const value = trim(rp.until(';'));
        if (std.ascii.eqlIgnoreCase(name, "charset")) {
            res.charset = value;
        }
        if (std.ascii.eqlIgnoreCase(name, "boundary")) {
            res.boundary = value;
        }
        if (!rp.skip()) return res;
    }
    return res;
 }
    /// Returns `s` with ASCII whitespace (`std.ascii.whitespace`) removed from
    /// both ends. The result is a sub-slice of `s`; nothing is copied.
    fn trim(s: []const u8) []const u8 {
        const whitespace = &std.ascii.whitespace;
        const without_leading = std.mem.trimLeft(u8, s, whitespace);
        return std.mem.trimRight(u8, without_leading, whitespace);
    }
-test "parse valid" {
+    fn trimLeft(s: []const u8) []const u8 {
-    for ([_][]const u8{
+        return std.mem.trimLeft(u8, s, &std.ascii.whitespace);
        "text/html",
        " \ttext/html",
        "text \t/html",
        "text/ \thtml",
        "text/html \t",
    }) |tc| {
        const m = try Self.parse(tc);
        try testing.expectEqualStrings("text", m.mtype);
        try testing.expectEqualStrings("html", m.msubtype);
    }
    const m2 = try Self.parse("text/javascript1.5");
    try testing.expectEqualStrings("text", m2.mtype);
    try testing.expectEqualStrings("javascript1.5", m2.msubtype);
    const m3 = try Self.parse("text/html; charset=utf-8");
    try testing.expectEqualStrings("text", m3.mtype);
    try testing.expectEqualStrings("html", m3.msubtype);
    try testing.expectEqualStrings("charset=utf-8", m3.params);
    try testing.expectEqualStrings("utf-8", m3.charset.?);
    const m4 = try Self.parse("text/html; boundary=----");
    try testing.expectEqualStrings("text", m4.mtype);
    try testing.expectEqualStrings("html", m4.msubtype);
    try testing.expectEqualStrings("boundary=----", m4.params);
    try testing.expectEqualStrings("----", m4.boundary.?);
    }
-test "parse invalid" {
+    fn trimRight(s: []const u8) []const u8 {
-    for ([_][]const u8{
+        return std.mem.trimRight(u8, s, &std.ascii.whitespace);
    }
    /// Reports whether `value` equals the comptime string literal `target`,
    /// ignoring ASCII case.
    ///
    /// Fast path: the bytes of `target` and `value` are each reinterpreted as
    /// one unsigned integer and compared in a single operation, which catches
    /// the common exact-case match. Anything else falls back to
    /// `std.ascii.eqlIgnoreCase`.
    ///
    /// `target` must be a pointer to a sentinel-terminated byte array, i.e. a
    /// string literal.
    fn isCaseEqual(comptime target: anytype, value: []const u8) bool {
        // - 8 because we don't care about the sentinel
        const bit_len = @bitSizeOf(@TypeOf(target.*)) - 8;
        const byte_len = bit_len / 8;

        // The fast path reads exactly `byte_len` bytes from `value`. Without
        // this guard a shorter `value` is an out-of-bounds slice, and a longer
        // one that merely starts with `target` would compare equal.
        if (value.len != byte_len) {
            return false;
        }

        const T = @Type(.{ .Int = .{
            .bits = bit_len,
            .signedness = .unsigned,
        } });

        const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*);
        if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) {
            return true;
        }
        return std.ascii.eqlIgnoreCase(value, target);
    }
 };
 const testing = std.testing;
 test "Mime: invalid " {
    const invalids = [_][]const u8{
        "",
-        "te xt/html;",
+        "text",
-        "te@xt/html;",
+        "text /html",
-        "text/ht@ml;",
+        "text/ html",
-        "text/html;",
+        "text / html",
-        "/text/html",
+        "text/html other",
-        "/html",
+        "text/html; x",
-    }) |tc| {
+        "text/html; x=",
-        _ = Self.parse(tc) catch continue;
+        "text/html; x=  ",
-        try testing.expect(false);
+        "text/html; = ",
        "text/html;=",
        "text/html; charset=\"\"",
        "text/html; charset=\"",
        "text/html; charset=\"\\",
        "text/html; charset=\"\\a\"", // invalid to escape non special characters
    };
    for (invalids) |invalid| {
        try testing.expectError(error.Invalid, Mime.parse(undefined, invalid));
    }
 }
-// Compare type and subtype.
+test "Mime: parse common" {
-pub fn eql(self: Self, b: Self) bool {
+    try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml");
-    if (!std.mem.eql(u8, self.mtype, b.mtype)) return false;
+    try expect(.{ .content_type = .{ .text_html = {} } }, "text/html");
-    return std.mem.eql(u8, self.msubtype, b.msubtype);
+    try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain");
    try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml;");
    try expect(.{ .content_type = .{ .text_html = {} } }, "text/html;");
    try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain;");
    try expect(.{ .content_type = .{ .text_xml = {} } }, "  \ttext/xml");
    try expect(.{ .content_type = .{ .text_html = {} } }, "text/html   ");
    try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain \t\t");
    try expect(.{ .content_type = .{ .text_xml = {} } }, "TEXT/xml");
    try expect(.{ .content_type = .{ .text_html = {} } }, "text/Html");
    try expect(.{ .content_type = .{ .text_plain = {} } }, "TEXT/PLAIN");
    try expect(.{ .content_type = .{ .text_xml = {} } }, " TeXT/xml");
    try expect(.{ .content_type = .{ .text_html = {} } }, "teXt/HtML  ;");
    try expect(.{ .content_type = .{ .text_plain = {} } }, "tExT/PlAiN;");
 }
 test "Mime: parse uncommon" {
    // Surrounding whitespace and a trailing ';' must not change the result:
    // each input yields the same `.other` type/sub-type pair.
    const inputs = [_][]const u8{
        "text/javascript",
        "text/javascript;",
        "  text/javascript\t  ",
        "  text/javascript\t  ;",
    };
    for (inputs) |input| {
        try expect(.{
            .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } },
        }, input);
    }

    // Types without a dedicated tag keep the casing they were written with.
    try expect(.{
        .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } },
    }, "Text/Javascript");
 }
 test "Mime: parse charset" {
    // All cases are `text/xml`; only the parameter section differs.
    const cases = [_]struct {
        input: []const u8,
        params: []const u8,
        charset: []const u8,
    }{
        // Bare token value.
        .{
            .input = "text/xml; charset=utf-8",
            .params = "charset=utf-8",
            .charset = "utf-8",
        },
        // Quoted value: quotes stay in `params` but are stripped from `charset`.
        .{
            .input = "text/xml;charset=\"utf-8\"",
            .params = "charset=\"utf-8\"",
            .charset = "utf-8",
        },
        // Escaped backslash and quote inside a quoted value, trailing whitespace.
        .{
            .input = "text/xml;charset=\"\\\\ \\\" \"   ",
            .params = "charset=\"\\\\ \\\" \"",
            .charset = "\\ \" ",
        },
    };

    for (cases) |case| {
        try expect(.{
            .content_type = .{ .text_xml = {} },
            .charset = case.charset,
            .params = case.params,
        }, case.input);
    }
 }
 test "Mime: isHTML" {
    // Each input is parsed and `isHTML()` is compared with `expected`.
    const cases = [_]struct { expected: bool, input: []const u8 }{
        .{ .expected = true, .input = "text/html" },
        .{ .expected = true, .input = "text/html;" },
        .{ .expected = true, .input = "text/html; charset=utf-8" },
        .{ .expected = false, .input = "text/htm" }, // htm not html
        .{ .expected = false, .input = "text/plain" },
        .{ .expected = false, .input = "over/9000" },
    };

    for (cases) |case| {
        var mime = try Mime.parse(testing.allocator, case.input);
        defer mime.deinit();
        try testing.expectEqual(case.expected, mime.isHTML());
    }
 }
 // Test-only description of what parsing an input should produce. It has the
 // same fields as `Mime` minus the arena, so a test can spell out the expected
 // result as a plain literal and hand it to `expect`.
 const Expectation = struct {
    // Expected content type; for `.other`, the type and sub-type strings too.
    content_type: Mime.ContentType,
    // Expected raw parameter section; defaults to "" (no parameters).
    params: []const u8 = "",
    // Expected charset value, or null when no charset should be reported.
    charset: ?[]const u8 = null,
 };
 // Parses `input` with the testing allocator and checks the result against
 // `expected`: the content-type tag (plus type and sub-type strings for
 // `.other`), the raw params string, and the charset (or its absence).
 fn expect(expected: Expectation, input: []const u8) !void {
    var parsed = try Mime.parse(testing.allocator, input);
    defer parsed.deinit();

    const want_tag = std.meta.activeTag(expected.content_type);
    const got_tag = std.meta.activeTag(parsed.content_type);
    try testing.expectEqual(want_tag, got_tag);

    // Only `.other` carries a payload; for the rest the tag check suffices.
    if (expected.content_type == .other) {
        const want = expected.content_type.other;
        const got = parsed.content_type.other;
        try testing.expectEqualStrings(want.type, got.type);
        try testing.expectEqualStrings(want.sub_type, got.sub_type);
    }

    try testing.expectEqualStrings(expected.params, parsed.params);

    const want_charset = expected.charset orelse {
        try testing.expectEqual(null, parsed.charset);
        return;
    };
    try testing.expectEqualStrings(want_charset, parsed.charset.?);
 }
--- a/src/xhr/xhr.zig
+++ b/src/xhr/xhr.zig
@@ -28,7 +28,7 @@ const DOMException = @import("../dom/exceptions.zig").DOMException;
 const ProgressEvent = @import("progress_event.zig").ProgressEvent;
 const XMLHttpRequestEventTarget = @import("event_target.zig").XMLHttpRequestEventTarget;
-const Mime = @import("../browser/mime.zig");
+const Mime = @import("../browser/mime.zig").Mime;
 const Loop = jsruntime.Loop;
 const Client = @import("asyncio").Client;
@@ -141,7 +141,7 @@ pub const XMLHttpRequest = struct {
    // https://lightpanda.slack.com/archives/C05TRU6RBM1/p1707819010681019
    // response_override_mime_type: ?[]const u8 = null,
-    response_mime: Mime = undefined,
+    response_mime: ?Mime = null,
    response_obj: ?ResponseObj = null,
    send_flag: bool = false,
@@ -313,8 +313,11 @@ pub const XMLHttpRequest = struct {
        if (self.response_obj) |v| v.deinit();
        self.response_obj = null;
        self.response_mime = Mime.Empty;
        self.response_type = .Empty;
        if (self.response_mime) |*mime| {
            mime.deinit();
            self.response_mime = null;
        }
        // TODO should we clearRetainingCapacity instead?
        self.headers.clearAndFree();
@@ -336,6 +339,9 @@ pub const XMLHttpRequest = struct {
        self.reset();
        self.headers.deinit();
        self.response_headers.deinit();
        if (self.response_mime) |*mime| {
            mime.deinit();
        }
        self.proto.deinit(alloc);
    }
@@ -544,7 +550,7 @@ pub const XMLHttpRequest = struct {
        // extract a mime type from headers.
        const ct = self.response_headers.getFirstValue("Content-Type") orelse "text/xml";
-        self.response_mime = Mime.parse(ct) catch |e| return self.onErr(e);
+        self.response_mime = Mime.parse(self.alloc, ct) catch |e| return self.onErr(e);
        // TODO handle override mime type
@@ -820,13 +826,14 @@ pub const XMLHttpRequest = struct {
    // TODO parse XML.
    // https://xhr.spec.whatwg.org/#response-object
    fn setResponseObjDocument(self: *XMLHttpRequest, alloc: std.mem.Allocator) void {
-        const isHTML = self.response_mime.eql(Mime.HTML);
+        const response_mime = &self.response_mime.?;
        const isHTML = response_mime.isHTML();
        // TODO If finalMIME is not an HTML MIME type or an XML MIME type, then
        // return.
        if (!isHTML) return;
-        const ccharset = alloc.dupeZ(u8, self.response_mime.charset orelse "utf-8") catch {
+        const ccharset = alloc.dupeZ(u8, response_mime.charset orelse "utf-8") catch {
            self.response_obj = .{ .Failure = true };
            return;
        };