diff --git a/src/browser/browser.zig b/src/browser/browser.zig index 5d2a0e61..a357c7da 100644 --- a/src/browser/browser.zig +++ b/src/browser/browser.zig @@ -435,24 +435,19 @@ pub const Page = struct { log.info("GET {any} {d}", .{ url, header.status }); - const ct = blk: { - break :blk header.get("content-type") orelse { - // no content type in HTTP headers. - // TODO try to sniff mime type from the body. - log.info("no content-type HTTP header", .{}); + const content_type = header.get("content-type"); - // Assume it's HTML for now. - break :blk "text/html; charset=utf-8"; - }; - }; - - log.debug("header content-type: {s}", .{ct}); - var mime = try Mime.parse(arena, ct); + const mime: Mime = blk: { + if (content_type) |ct| { + break :blk try Mime.parse(arena, ct); + } + break :blk Mime.sniff(try response.peek()); + } orelse .unknown; if (mime.isHTML()) { try self.loadHTMLDoc(&response, mime.charset orelse "utf-8"); } else { - log.info("non-HTML document: {s}", .{ct}); + log.info("non-HTML document: {s}", .{content_type orelse "null"}); var arr: std.ArrayListUnmanaged(u8) = .{}; while (try response.next()) |data| { try arr.appendSlice(arena, try arena.dupe(u8, data)); diff --git a/src/browser/mime.zig b/src/browser/mime.zig index 33e14cba..21e4cb8c 100644 --- a/src/browser/mime.zig +++ b/src/browser/mime.zig @@ -24,10 +24,17 @@ pub const Mime = struct { params: []const u8 = "", charset: ?[]const u8 = null, + pub const unknown = Mime{ + .params = "", + .charset = "", + .content_type = .{ .unknown = {} }, + }; + pub const ContentTypeEnum = enum { text_xml, text_html, text_plain, + unknown, other, }; @@ -35,21 +42,26 @@ pub const Mime = struct { text_xml: void, text_html: void, text_plain: void, + unknown: void, other: struct { type: []const u8, sub_type: []const u8 }, }; - pub fn parse(arena: Allocator, input: []const u8) !Mime { + pub fn parse(arena: Allocator, input: []u8) !Mime { if (input.len > 255) { return error.TooBig; } - var trimmed = trim(input); - const content_type, const type_len = try parseContentType(trimmed); - if (type_len >= trimmed.len) { + // Zig's trim API is broken. The return type is always `[]const u8`, + // even if the input type is `[]u8`. @constCast is safe here. + var normalized = @constCast(std.mem.trim(u8, input, &std.ascii.whitespace)); + _ = std.ascii.lowerString(normalized, normalized); + + const content_type, const type_len = try parseContentType(normalized); + if (type_len >= normalized.len) { return .{ .content_type = content_type }; } - const params = trimLeft(trimmed[type_len..]); + const params = trimLeft(normalized[type_len..]); var charset: ?[]const u8 = null; @@ -63,11 +75,12 @@ pub const Mime = struct { return error.Invalid; } - switch (name.len) { - 7 => if (isCaseEqual("charset", name)) { - charset = try parseValue(arena, value); - }, - else => {}, + const attribute_name = std.meta.stringToEnum(enum { + charset, + }, name) orelse continue; + + switch (attribute_name) { + .charset => charset = try parseAttributeValue(arena, value), } } @@ -78,66 +91,113 @@ pub const Mime = struct { }; } + pub fn sniff(body: []const u8) ?Mime { + // 0x0C is form feed + const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C }); + if (content.len == 0) { + return null; + } + + if (content[0] != '<') { + if (std.mem.startsWith(u8, content, &.{ 0xEF, 0xBB, 0xBF })) { + // UTF-8 BOM + return .{ .content_type = .{ .text_plain = {} } }; + } + if (std.mem.startsWith(u8, content, &.{ 0xFE, 0xFF })) { + // UTF-16 big-endian BOM + return .{ .content_type = .{ .text_plain = {} } }; + } + if (std.mem.startsWith(u8, content, &.{ 0xFF, 0xFE })) { + // UTF-16 little-endian BOM + return .{ .content_type = .{ .text_plain = {} } }; + } + return null; + } + + // The longest prefix we have is " known_prefix.len) { + const next = prefix[known_prefix.len]; + // a "tag-terminating-byte" + if (next == ' ' or next == '>') { + return .{ .content_type = kp.@"1" }; + } + } + } + + return null; + } + pub fn isHTML(self: *const Mime) bool { return self.content_type == .text_html; } + // we expect value to be lowercase fn parseContentType(value: []const u8) !struct { ContentType, usize } { - const separator = std.mem.indexOfScalarPos(u8, value, 0, '/') orelse { - return error.Invalid; - }; - const end = std.mem.indexOfScalarPos(u8, value, separator, ';') orelse blk: { - break :blk value.len; - }; + const end = std.mem.indexOfScalarPos(u8, value, 0, ';') orelse value.len; + const type_name = trimRight(value[0..end]); + const attribute_start = end + 1; + + if (std.meta.stringToEnum(enum { + @"text/xml", + @"text/html", + @"text/plain", + }, type_name)) |known_type| { + const ct: ContentType = switch (known_type) { + .@"text/xml" => .{ .text_xml = {} }, + .@"text/html" => .{ .text_html = {} }, + .@"text/plain" => .{ .text_plain = {} }, + }; + return .{ ct, attribute_start }; + } + + const separator = std.mem.indexOfScalarPos(u8, type_name, 0, '/') orelse return error.Invalid; const main_type = value[0..separator]; const sub_type = trimRight(value[separator + 1 .. end]); - if (parseCommonContentType(main_type, sub_type)) |content_type| { - return .{ content_type, end + 1 }; - } - - if (main_type.len == 0) { + if (main_type.len == 0 or validType(main_type) == false) { return error.Invalid; } - if (validType(main_type) == false) { + if (sub_type.len == 0 or validType(sub_type) == false) { return error.Invalid; } - if (sub_type.len == 0) { - return error.Invalid; - } - if (validType(sub_type) == false) { - return error.Invalid; - } - - const content_type = ContentType{ .other = .{ + return .{ .{ .other = .{ .type = main_type, .sub_type = sub_type, - } }; - - return .{ content_type, end + 1 }; - } - - fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType { - switch (main_type.len) { - 4 => if (isCaseEqual("text", main_type)) { - switch (sub_type.len) { - 3 => if (isCaseEqual("xml", sub_type)) { - return .{ .text_xml = {} }; - }, - 4 => if (isCaseEqual("html", sub_type)) { - return .{ .text_html = {} }; - }, - 5 => if (isCaseEqual("plain", sub_type)) { - return .{ .text_plain = {} }; - }, - else => {}, - } - }, - else => {}, - } - return null; + } }, attribute_start }; } const T_SPECIAL = blk: { @@ -148,7 +208,7 @@ pub const Mime = struct { break :blk v; }; - fn parseValue(arena: Allocator, value: []const u8) ![]const u8 { + fn parseAttributeValue(arena: Allocator, value: []const u8) ![]const u8 { if (value[0] != '"') { return value; } @@ -218,10 +278,6 @@ pub const Mime = struct { return true; } - fn trim(s: []const u8) []const u8 { - return std.mem.trim(u8, s, &std.ascii.whitespace); - } - fn trimLeft(s: []const u8) []const u8 { return std.mem.trimLeft(u8, s, &std.ascii.whitespace); } @@ -229,28 +285,12 @@ pub const Mime = struct { fn trimRight(s: []const u8) []const u8 { return std.mem.trimRight(u8, s, &std.ascii.whitespace); } - - fn isCaseEqual(comptime target: anytype, value: []const u8) bool { - // - 8 beause we don't care about the sentinel - const bit_len = @bitSizeOf(@TypeOf(target.*)) - 8; - const byte_len = bit_len / 8; - - const T = @Type(.{ .int = .{ - .bits = bit_len, - .signedness = .unsigned, - } }); - - const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*); - - if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) { - return true; - } - return std.ascii.eqlIgnoreCase(value, target); - } }; -const testing = std.testing; +const testing = @import("../testing.zig"); test "Mime: invalid " { + defer testing.reset(); + const invalids = [_][]const u8{ "", "text", @@ -270,11 +310,14 @@ test "Mime: invalid " { }; for (invalids) |invalid| { - try testing.expectError(error.Invalid, Mime.parse(undefined, invalid)); + const mutable_input = try testing.arena_allocator.dupe(u8, invalid); + try testing.expectError(error.Invalid, Mime.parse(undefined, mutable_input)); } } test "Mime: parse common" { + defer testing.reset(); + try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml"); try expect(.{ .content_type = .{ .text_html = {} } }, "text/html"); try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain"); @@ -297,6 +340,8 @@ test "Mime: parse common" { } test "Mime: parse uncommon" { + defer testing.reset(); + const text_javascript = Expectation{ .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } }, }; @@ -306,12 +351,14 @@ test "Mime: parse uncommon" { try expect(text_javascript, " text/javascript\t ;"); try expect( - .{ .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } } }, + .{ .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } } }, "Text/Javascript", ); } test "Mime: parse charset" { + defer testing.reset(); + try expect(.{ .content_type = .{ .text_xml = {} }, .charset = "utf-8", @@ -332,11 +379,12 @@ test "Mime: parse charset" { } test "Mime: isHTML" { + defer testing.reset(); + const isHTML = struct { fn isHTML(expected: bool, input: []const u8) !void { - var arena = std.heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var mime = try Mime.parse(arena.allocator(), input); + const mutable_input = try testing.arena_allocator.dupe(u8, input); + var mime = try Mime.parse(testing.arena_allocator, mutable_input); try testing.expectEqual(expected, mime.isHTML()); } }.isHTML; @@ -348,6 +396,71 @@ test "Mime: isHTML" { try isHTML(false, "over/9000"); } +test "Mime: sniff" { + try testing.expectEqual(null, Mime.sniff("")); + try testing.expectEqual(null, Mime.sniff("")); + try testing.expectEqual(null, Mime.sniff("\n ")); + try testing.expectEqual(null, Mime.sniff("\n \t ")); + + const expectHTML = struct { + fn expect(input: []const u8) !void { + try testing.expectEqual(.text_html, std.meta.activeTag(Mime.sniff(input).?.content_type)); + } + }.expect; + + try expectHTML(" even more stufff"); + + try expectHTML(""); + + try expectHTML("