diff --git a/src/browser/browser.zig b/src/browser/browser.zig
index e361838e..e7e2445c 100644
--- a/src/browser/browser.zig
+++ b/src/browser/browser.zig
@@ -24,7 +24,7 @@ const Types = @import("root").Types;
 const parser = @import("netsurf");
 const Loader = @import("loader.zig").Loader;
 const Dump = @import("dump.zig");
-const Mime = @import("mime.zig");
+const Mime = @import("mime.zig").Mime;

 const jsruntime = @import("jsruntime");
 const Loop = jsruntime.Loop;
@@ -375,8 +375,10 @@ pub const Page = struct {
         defer alloc.free(ct.?);

         log.debug("header content-type: {s}", .{ct.?});
-        const mime = try Mime.parse(ct.?);
-        if (mime.eql(Mime.HTML)) {
+        var mime = try Mime.parse(alloc, ct.?);
+        defer mime.deinit();
+
+        if (mime.isHTML()) {
             try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", auxData);
         } else {
             log.info("non-HTML document: {s}", .{ct.?});
diff --git a/src/browser/mime.zig b/src/browser/mime.zig
index da8ac710..3716939f 100644
--- a/src/browser/mime.zig
+++ b/src/browser/mime.zig
@@ -17,143 +17,408 @@
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.

 const std = @import("std");
-const testing = std.testing;
+const Allocator = std.mem.Allocator;

-const Reader = @import("../str/parser.zig").Reader;
+/// A parsed `Content-Type` header value.
+///
+/// `params`, an unquoted `charset` and the strings of an `other` content type
+/// are slices of the `input` passed to `parse`; only a quoted charset is
+/// copied into `arena`. `input` must therefore outlive the returned Mime.
+pub const Mime = struct {
+    content_type: ContentType,
+    params: []const u8 = "",
+    charset: ?[]const u8 = null,
+    arena: std.heap.ArenaAllocator,

-const Self = @This();
+    pub const ContentTypeEnum = enum {
+        text_xml,
+        text_html,
+        text_plain,
+        other,
+    };

-const MimeError = error{
-    Empty,
-    TooBig,
-    Invalid,
-    InvalidChar,
+    pub const ContentType = union(ContentTypeEnum) {
+        text_xml: void,
+        text_html: void,
+        text_plain: void,
+        other: struct { type: []const u8, sub_type: []const u8 },
+    };
+
+    /// Parses a Content-Type header value such as `text/html; charset=utf-8`.
+    ///
+    /// Fails with error.TooBig when `input` is longer than 255 bytes, with
+    /// error.Invalid when the type or a parameter is malformed, and with an
+    /// allocation error when a quoted charset cannot be copied. On success
+    /// the caller must release the result with `deinit`.
+    pub fn parse(allocator: Allocator, input: []const u8) !Mime {
+        if (input.len > 255) {
+            return error.TooBig;
+        }
+
+        var arena = std.heap.ArenaAllocator.init(allocator);
+        errdefer arena.deinit();
+
+        const trimmed = trim(input);
+
+        const content_type, const type_len = try parseContentType(trimmed);
+        // no `;`, or nothing after it: there are no parameters to parse
+        if (type_len >= trimmed.len) {
+            return .{ .arena = arena, .content_type = content_type };
+        }
+
+        const params = trimLeft(trimmed[type_len..]);
+
+        var charset: ?[]const u8 = null;
+
+        var it = std.mem.splitScalar(u8, params, ';');
+        while (it.next()) |attr| {
+            const i = std.mem.indexOfScalarPos(u8, attr, 0, '=') orelse return error.Invalid;
+            const name = trimLeft(attr[0..i]);
+
+            const value = trimRight(attr[i + 1 ..]);
+            if (value.len == 0) {
+                return error.Invalid;
+            }
+
+            // isCaseEqual requires name.len to equal the literal's length
+            switch (name.len) {
+                7 => if (isCaseEqual("charset", name)) {
+                    charset = try parseValue(arena.allocator(), value);
+                },
+                else => {},
+            }
+        }
+
+        return .{
+            .arena = arena,
+            .params = params,
+            .charset = charset,
+            .content_type = content_type,
+        };
+    }
+
+    /// Frees the memory owned by `arena`.
+    pub fn deinit(self: *Mime) void {
+        self.arena.deinit();
+    }
+
+    /// True when the content type is text/html.
+    pub fn isHTML(self: *const Mime) bool {
+        return self.content_type == .text_html;
+    }
+
+    /// Splits the leading `type/subtype` off `value`. Returns the content type
+    /// and the offset just past the `;` that terminates it, which is
+    /// `value.len + 1` when `value` contains no `;`.
+    fn parseContentType(value: []const u8) !struct { ContentType, usize } {
+        const separator = std.mem.indexOfScalarPos(u8, value, 0, '/') orelse {
+            return error.Invalid;
+        };
+        const end = std.mem.indexOfScalarPos(u8, value, separator, ';') orelse blk: {
+            break :blk value.len;
+        };
+
+        const main_type = value[0..separator];
+        const sub_type = trimRight(value[separator + 1 .. end]);
+
+        if (parseCommonContentType(main_type, sub_type)) |content_type| {
+            return .{ content_type, end + 1 };
+        }
+
+        if (main_type.len == 0) {
+            return error.Invalid;
+        }
+        if (validType(main_type) == false) {
+            return error.Invalid;
+        }
+
+        if (sub_type.len == 0) {
+            return error.Invalid;
+        }
+        if (validType(sub_type) == false) {
+            return error.Invalid;
+        }
+
+        const content_type = ContentType{ .other = .{
+            .type = main_type,
+            .sub_type = sub_type,
+        } };
+
+        return .{ content_type, end + 1 };
+    }
+
+    /// Returns the tag for text/xml, text/html or text/plain (compared
+    /// case-insensitively), or null for anything else.
+    fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType {
+        switch (main_type.len) {
+            4 => if (isCaseEqual("text", main_type)) {
+                switch (sub_type.len) {
+                    3 => if (isCaseEqual("xml", sub_type)) {
+                        return .{ .text_xml = {} };
+                    },
+                    4 => if (isCaseEqual("html", sub_type)) {
+                        return .{ .text_html = {} };
+                    },
+                    5 => if (isCaseEqual("plain", sub_type)) {
+                        return .{ .text_plain = {} };
+                    },
+                    else => {},
+                }
+            },
+            else => {},
+        }
+        return null;
+    }
+
+    /// Bytes that may be backslash-escaped inside a quoted parameter value.
+    const T_SPECIAL = blk: {
+        var v = [_]bool{false} ** 256;
+        for ("()<>@,;:\\\"/[]?=") |b| {
+            v[b] = true;
+        }
+        break :blk v;
+    };
+
+    /// Returns `value` itself when it does not start with a quote. Otherwise
+    /// returns the unescaped content of the quoted string, allocated with
+    /// `allocator`. `value` must not be empty.
+    fn parseValue(allocator: Allocator, value: []const u8) ![]const u8 {
+        if (value[0] != '"') {
+            return value;
+        }
+
+        // 1 to skip the opening quote
+        var value_pos: usize = 1;
+        var unescaped_len: usize = 0;
+        const last = value.len - 1;
+
+        // first pass: validate the escapes and count the unescaped bytes
+        while (value_pos < value.len) {
+            switch (value[value_pos]) {
+                '"' => break,
+                '\\' => {
+                    if (value_pos == last) {
+                        return error.Invalid;
+                    }
+                    const next = value[value_pos + 1];
+                    if (T_SPECIAL[next] == false) {
+                        return error.Invalid;
+                    }
+                    value_pos += 2;
+                },
+                else => value_pos += 1,
+            }
+            unescaped_len += 1;
+        }
+
+        if (unescaped_len == 0) {
+            return error.Invalid;
+        }
+
+        // second pass: copy the bytes without the escaping backslashes
+        value_pos = 1;
+        const owned = try allocator.alloc(u8, unescaped_len);
+        for (0..unescaped_len) |i| {
+            switch (value[value_pos]) {
+                '"' => break,
+                '\\' => {
+                    owned[i] = value[value_pos + 1];
+                    value_pos += 2;
+                },
+                else => |c| {
+                    owned[i] = c;
+                    value_pos += 1;
+                },
+            }
+        }
+        return owned;
+    }
+
+    /// HTTP token code points, the only bytes allowed in a type or subtype.
+    /// https://mimesniff.spec.whatwg.org/#http-token-code-point
+    const VALID_CODEPOINTS = blk: {
+        var v: [256]bool = undefined;
+        for (0..256) |i| {
+            v[i] = std.ascii.isAlphanumeric(i);
+        }
+        for ("!#$%&'*+-.^_`|~") |b| {
+            v[b] = true;
+        }
+        break :blk v;
+    };
+
+    /// True when every byte of `value` is an HTTP token code point.
+    fn validType(value: []const u8) bool {
+        for (value) |b| {
+            if (VALID_CODEPOINTS[b] == false) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    fn trim(s: []const u8) []const u8 {
+        return std.mem.trim(u8, s, &std.ascii.whitespace);
+    }
+
+    fn trimLeft(s: []const u8) []const u8 {
+        return std.mem.trimLeft(u8, s, &std.ascii.whitespace);
+    }
+
+    fn trimRight(s: []const u8) []const u8 {
+        return std.mem.trimRight(u8, s, &std.ascii.whitespace);
+    }
+
+    /// Compares `value` to the string literal `target`, ignoring ASCII case.
+    /// Callers must ensure `value.len == target.len`: the exact-match fast
+    /// path slices the first `target.len` bytes of `value`.
+    fn isCaseEqual(comptime target: anytype, value: []const u8) bool {
+        // - 8 because we don't care about the sentinel
+        const bit_len = @bitSizeOf(@TypeOf(target.*)) - 8;
+        const byte_len = bit_len / 8;
+
+        const T = @Type(.{ .Int = .{
+            .bits = bit_len,
+            .signedness = .unsigned,
+        } });
+
+        const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*);
+
+        if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) {
+            return true;
+        }
+        return std.ascii.eqlIgnoreCase(value, target);
+    }
 };

-mtype: []const u8,
-msubtype: []const u8,
-params: []const u8 = "",
-
-charset: ?[]const u8 = null,
-boundary: ?[]const u8 = null,
-
-pub const Empty = Self{ .mtype = "", .msubtype = "" };
-pub const HTML = Self{ .mtype = "text", .msubtype = "html" };
-pub const Javascript = Self{ .mtype = "application", .msubtype = "javascript" };
-
-// https://mimesniff.spec.whatwg.org/#http-token-code-point
-fn isHTTPCodePoint(c: u8) bool {
-    return switch (c) {
-        '!', '#', '$', '%', '&', '\'', '*', '+', '-', '.', '^' => return true,
-        '_', '`', '|', '~' => return true,
-        else => std.ascii.isAlphanumeric(c),
-    };
-}
-
-fn valid(s: []const u8) bool {
-    const ln = s.len;
-    var i: usize = 0;
-    while (i < ln) {
-        if (!isHTTPCodePoint(s[i])) return false;
-        i += 1;
-    }
-    return true;
-}
-
-// https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
-pub fn parse(s: []const u8) Self.MimeError!Self {
-    const ln = s.len;
-    if (ln == 0) return MimeError.Empty;
-    // limit input size
-    if (ln > 255) return MimeError.TooBig;
-
-    var res = Self{ .mtype = "", .msubtype = "" };
-    var r = Reader{ .data = s };
-
-    res.mtype = trim(r.until('/'));
-    if (res.mtype.len == 0) return MimeError.Invalid;
-    if (!valid(res.mtype)) return MimeError.InvalidChar;
-
-    if (!r.skip()) return MimeError.Invalid;
-    res.msubtype = trim(r.until(';'));
-    if (res.msubtype.len == 0) return MimeError.Invalid;
-    if (!valid(res.msubtype)) return MimeError.InvalidChar;
-
-    if (!r.skip()) return res;
-    res.params = trim(r.tail());
-    if (res.params.len == 0) return MimeError.Invalid;
-
-    // parse well known parameters.
-    // don't check invalid parameter format.
-    var rp = Reader{ .data = res.params };
-    while (true) {
-        const name = trim(rp.until('='));
-        if (!rp.skip()) return res;
-        const value = trim(rp.until(';'));
-
-        if (std.ascii.eqlIgnoreCase(name, "charset")) {
-            res.charset = value;
-        }
-        if (std.ascii.eqlIgnoreCase(name, "boundary")) {
-            res.boundary = value;
-        }
-
-        if (!rp.skip()) return res;
-    }
-
-    return res;
-}
-
-fn trim(s: []const u8) []const u8 {
-    return std.mem.trim(u8, s, &std.ascii.whitespace);
-}
-
-test "parse valid" {
-    for ([_][]const u8{
-        "text/html",
-        " \ttext/html",
-        "text \t/html",
-        "text/ \thtml",
-        "text/html \t",
-    }) |tc| {
-        const m = try Self.parse(tc);
-        try testing.expectEqualStrings("text", m.mtype);
-        try testing.expectEqualStrings("html", m.msubtype);
-    }
-    const m2 = try Self.parse("text/javascript1.5");
-    try testing.expectEqualStrings("text", m2.mtype);
-    try testing.expectEqualStrings("javascript1.5", m2.msubtype);
-
-    const m3 = try Self.parse("text/html; charset=utf-8");
-    try testing.expectEqualStrings("text", m3.mtype);
-    try testing.expectEqualStrings("html", m3.msubtype);
-    try testing.expectEqualStrings("charset=utf-8", m3.params);
-    try testing.expectEqualStrings("utf-8", m3.charset.?);
-
-    const m4 = try Self.parse("text/html; boundary=----");
-    try testing.expectEqualStrings("text", m4.mtype);
-    try testing.expectEqualStrings("html", m4.msubtype);
-    try testing.expectEqualStrings("boundary=----", m4.params);
-    try testing.expectEqualStrings("----", m4.boundary.?);
-}
-
-test "parse invalid" {
-    for ([_][]const u8{
+const testing = std.testing;
+test "Mime: invalid " {
+    const invalids = [_][]const u8{
         "",
-        "te xt/html;",
-        "te@xt/html;",
-        "text/ht@ml;",
-        "text/html;",
-        "/text/html",
-        "/html",
-    }) |tc| {
-        _ = Self.parse(tc) catch continue;
-        try testing.expect(false);
+        "text",
+        "text /html",
+        "text/ html",
+        "text / html",
+        "text/html other",
+        "te\\xt/html", // backslash is not an HTTP token code point
+        "text/html; x",
+        "text/html; x=",
+        "text/html; x= ",
+        "text/html; = ",
+        "text/html;=",
+        "text/html; charset=\"\"",
+        "text/html; charset=\"",
+        "text/html; charset=\"\\",
+        "text/html; charset=\"\\a\"", // invalid to escape non special characters
+    };
+
+    for (invalids) |invalid| {
+        try testing.expectError(error.Invalid, Mime.parse(undefined, invalid));
     }
 }

-// Compare type and subtype.
-pub fn eql(self: Self, b: Self) bool {
-    if (!std.mem.eql(u8, self.mtype, b.mtype)) return false;
-    return std.mem.eql(u8, self.msubtype, b.msubtype);
+test "Mime: parse common" {
+    try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml");
+    try expect(.{ .content_type = .{ .text_html = {} } }, "text/html");
+    try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain");
+
+    try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml;");
+    try expect(.{ .content_type = .{ .text_html = {} } }, "text/html;");
+    try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain;");
+
+    try expect(.{ .content_type = .{ .text_xml = {} } }, " \ttext/xml");
+    try expect(.{ .content_type = .{ .text_html = {} } }, "text/html ");
+    try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain \t\t");
+
+    try expect(.{ .content_type = .{ .text_xml = {} } }, "TEXT/xml");
+    try expect(.{ .content_type = .{ .text_html = {} } }, "text/Html");
+    try expect(.{ .content_type = .{ .text_plain = {} } }, "TEXT/PLAIN");
+
+    try expect(.{ .content_type = .{ .text_xml = {} } }, " TeXT/xml");
+    try expect(.{ .content_type = .{ .text_html = {} } }, "teXt/HtML ;");
+    try expect(.{ .content_type = .{ .text_plain = {} } }, "tExT/PlAiN;");
+}
+
+test "Mime: parse uncommon" {
+    const text_javascript = Expectation{
+        .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } },
+    };
+    try expect(text_javascript, "text/javascript");
+    try expect(text_javascript, "text/javascript;");
+    try expect(text_javascript, " text/javascript\t ");
+    try expect(text_javascript, " text/javascript\t ;");
+
+    try expect(
+        .{ .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } } },
+        "Text/Javascript",
+    );
+}
+
+test "Mime: parse charset" {
+    try expect(.{
+        .content_type = .{ .text_xml = {} },
+        .charset = "utf-8",
+        .params = "charset=utf-8",
+    }, "text/xml; charset=utf-8");
+
+    try expect(.{
+        .content_type = .{ .text_xml = {} },
+        .charset = "utf-8",
+        .params = "charset=\"utf-8\"",
+    }, "text/xml;charset=\"utf-8\"");
+
+    try expect(.{
+        .content_type = .{ .text_xml = {} },
+        .charset = "\\ \" ",
+        .params = "charset=\"\\\\ \\\" \"",
+    }, "text/xml;charset=\"\\\\ \\\" \" ");
+}
+
+test "Mime: isHTML" {
+    const isHTML = struct {
+        fn isHTML(expected: bool, input: []const u8) !void {
+            var mime = try Mime.parse(testing.allocator, input);
+            defer mime.deinit();
+            try testing.expectEqual(expected, mime.isHTML());
+        }
+    }.isHTML;
+    try isHTML(true, "text/html");
+    try isHTML(true, "text/html;");
+    try isHTML(true, "text/html; charset=utf-8");
+    try isHTML(false, "text/htm"); // htm not html
+    try isHTML(false, "text/plain");
+    try isHTML(false, "over/9000");
+}
+
+const Expectation = struct {
+    content_type: Mime.ContentType,
+    params: []const u8 = "",
+    charset: ?[]const u8 = null,
+};
+
+fn expect(expected: Expectation, input: []const u8) !void {
+    var actual = try Mime.parse(testing.allocator, input);
+    defer actual.deinit();
+
+    try testing.expectEqual(
+        std.meta.activeTag(expected.content_type),
+        std.meta.activeTag(actual.content_type),
+    );
+
+    switch (expected.content_type) {
+        .other => |e| {
+            const a = actual.content_type.other;
+            try testing.expectEqualStrings(e.type, a.type);
+            try testing.expectEqualStrings(e.sub_type, a.sub_type);
+        },
+        else => {}, // already asserted above
+    }
+
+    try testing.expectEqualStrings(expected.params, actual.params);
+
+    if (expected.charset) |ec| {
+        try testing.expectEqualStrings(ec, actual.charset.?);
+    } else {
+        try testing.expectEqual(null, actual.charset);
+    }
 }
diff --git a/src/str/parser.zig b/src/str/parser.zig
index f663c4d5..55b6bb32 100644
--- a/src/str/parser.zig
+++ b/src/str/parser.zig @@ -35,7 +35,7 @@ pub const Reader = struct { pub fn tail(self: *Reader) []const u8 { const pos = self.pos; const data = self.data; - if (pos > data.len) { + if (pos > data.len) { return ""; } self.pos = data.len; diff --git a/src/xhr/xhr.zig b/src/xhr/xhr.zig index 77131577..2ad2bc26 100644 --- a/src/xhr/xhr.zig +++ b/src/xhr/xhr.zig @@ -28,7 +28,7 @@ const DOMException = @import("../dom/exceptions.zig").DOMException; const ProgressEvent = @import("progress_event.zig").ProgressEvent; const XMLHttpRequestEventTarget = @import("event_target.zig").XMLHttpRequestEventTarget; -const Mime = @import("../browser/mime.zig"); +const Mime = @import("../browser/mime.zig").Mime; const Loop = jsruntime.Loop; const Client = @import("asyncio").Client; @@ -141,7 +141,7 @@ pub const XMLHttpRequest = struct { // https://lightpanda.slack.com/archives/C05TRU6RBM1/p1707819010681019 // response_override_mime_type: ?[]const u8 = null, - response_mime: Mime = undefined, + response_mime: ?Mime = null, response_obj: ?ResponseObj = null, send_flag: bool = false, @@ -313,8 +313,11 @@ pub const XMLHttpRequest = struct { if (self.response_obj) |v| v.deinit(); self.response_obj = null; - self.response_mime = Mime.Empty; self.response_type = .Empty; + if (self.response_mime) |*mime| { + mime.deinit(); + self.response_mime = null; + } // TODO should we clearRetainingCapacity instead? self.headers.clearAndFree(); @@ -336,6 +339,9 @@ pub const XMLHttpRequest = struct { self.reset(); self.headers.deinit(); self.response_headers.deinit(); + if (self.response_mime) |*mime| { + mime.deinit(); + } self.proto.deinit(alloc); } @@ -544,7 +550,7 @@ pub const XMLHttpRequest = struct { // extract a mime type from headers. 
const ct = self.response_headers.getFirstValue("Content-Type") orelse "text/xml"; - self.response_mime = Mime.parse(ct) catch |e| return self.onErr(e); + self.response_mime = Mime.parse(self.alloc, ct) catch |e| return self.onErr(e); // TODO handle override mime type @@ -820,13 +826,14 @@ pub const XMLHttpRequest = struct { // TODO parse XML. // https://xhr.spec.whatwg.org/#response-object fn setResponseObjDocument(self: *XMLHttpRequest, alloc: std.mem.Allocator) void { - const isHTML = self.response_mime.eql(Mime.HTML); + const response_mime = &self.response_mime.?; + const isHTML = response_mime.isHTML(); // TODO If finalMIME is not an HTML MIME type or an XML MIME type, then // return. if (!isHTML) return; - const ccharset = alloc.dupeZ(u8, self.response_mime.charset orelse "utf-8") catch { + const ccharset = alloc.dupeZ(u8, response_mime.charset orelse "utf-8") catch { self.response_obj = .{ .Failure = true }; return; };