From b9f61466ba42898c91f3d32dbbe854fe5ef8667a Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Tue, 22 Apr 2025 10:54:29 +0800 Subject: [PATCH] Try to sniff the mime type based on the body content Synchronous body reader now exposes a peek() function to get the first few bytes from the response body. This will be no less than 100 bytes (assuming the body is that big), but could be more. Streaming API, via res.next() continues to work as-is even if peek() is called. Introduce Mime.sniff() that detects a few common types - the ones that we care about right now - from the body content. --- src/browser/browser.zig | 21 ++- src/browser/mime.zig | 293 +++++++++++++++++++++++++++------------- src/browser/xhr/xhr.zig | 34 ++--- src/http/client.zig | 144 +++++++++++++++----- src/testing.zig | 11 ++ 5 files changed, 343 insertions(+), 160 deletions(-) diff --git a/src/browser/browser.zig b/src/browser/browser.zig index 5d2a0e61..a357c7da 100644 --- a/src/browser/browser.zig +++ b/src/browser/browser.zig @@ -435,24 +435,19 @@ pub const Page = struct { log.info("GET {any} {d}", .{ url, header.status }); - const ct = blk: { - break :blk header.get("content-type") orelse { - // no content type in HTTP headers. - // TODO try to sniff mime type from the body. - log.info("no content-type HTTP header", .{}); + const content_type = header.get("content-type"); - // Assume it's HTML for now. - break :blk "text/html; charset=utf-8"; - }; - }; - - log.debug("header content-type: {s}", .{ct}); - var mime = try Mime.parse(arena, ct); + const mime: Mime = blk: { + if (content_type) |ct| { + break :blk try Mime.parse(arena, ct); + } + break :blk Mime.sniff(try response.peek()); + } orelse .unknown; if (mime.isHTML()) { try self.loadHTMLDoc(&response, mime.charset orelse "utf-8"); } else { - log.info("non-HTML document: {s}", .{ct}); + log.info("non-HTML document: {s}", .{content_type orelse "null"}); var arr: std.ArrayListUnmanaged(u8) = .{}; while (try response.next()) |data| { try arr.appendSlice(arena, try arena.dupe(u8, data)); diff --git a/src/browser/mime.zig b/src/browser/mime.zig index 33e14cba..21e4cb8c 100644 --- a/src/browser/mime.zig +++ b/src/browser/mime.zig @@ -24,10 +24,17 @@ pub const Mime = struct { params: []const u8 = "", charset: ?[]const u8 = null, + pub const unknown = Mime{ + .params = "", + .charset = "", + .content_type = .{ .unknown = {} }, + }; + pub const ContentTypeEnum = enum { text_xml, text_html, text_plain, + unknown, other, }; @@ -35,21 +42,26 @@ pub const Mime = struct { text_xml: void, text_html: void, text_plain: void, + unknown: void, other: struct { type: []const u8, sub_type: []const u8 }, }; - pub fn parse(arena: Allocator, input: []const u8) !Mime { + pub fn parse(arena: Allocator, input: []u8) !Mime { if (input.len > 255) { return error.TooBig; } - var trimmed = trim(input); - const content_type, const type_len = try parseContentType(trimmed); - if (type_len >= trimmed.len) { + // Zig's trim API is broken. The return type is always `[]const u8`, + // even if the input type is `[]u8`. @constCast is safe here. + var normalized = @constCast(std.mem.trim(u8, input, &std.ascii.whitespace)); + _ = std.ascii.lowerString(normalized, normalized); + + const content_type, const type_len = try parseContentType(normalized); + if (type_len >= normalized.len) { return .{ .content_type = content_type }; } - const params = trimLeft(trimmed[type_len..]); + const params = trimLeft(normalized[type_len..]); var charset: ?[]const u8 = null; @@ -63,11 +75,12 @@ pub const Mime = struct { return error.Invalid; } - switch (name.len) { - 7 => if (isCaseEqual("charset", name)) { - charset = try parseValue(arena, value); - }, - else => {}, + const attribute_name = std.meta.stringToEnum(enum { + charset, + }, name) orelse continue; + + switch (attribute_name) { + .charset => charset = try parseAttributeValue(arena, value), } } @@ -78,66 +91,113 @@ pub const Mime = struct { }; } + pub fn sniff(body: []const u8) ?Mime { + // 0x0C is form feed + const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C }); + if (content.len == 0) { + return null; + } + + if (content[0] != '<') { + if (std.mem.startsWith(u8, content, &.{ 0xEF, 0xBB, 0xBF })) { + // UTF-8 BOM + return .{ .content_type = .{ .text_plain = {} } }; + } + if (std.mem.startsWith(u8, content, &.{ 0xFE, 0xFF })) { + // UTF-16 big-endian BOM + return .{ .content_type = .{ .text_plain = {} } }; + } + if (std.mem.startsWith(u8, content, &.{ 0xFF, 0xFE })) { + // UTF-16 little-endian BOM + return .{ .content_type = .{ .text_plain = {} } }; + } + return null; + } + + // The longest prefix we have is " known_prefix.len) { + const next = prefix[known_prefix.len]; + // a "tag-terminating-byte" + if (next == ' ' or next == '>') { + return .{ .content_type = kp.@"1" }; + } + } + } + + return null; + } + pub fn isHTML(self: *const Mime) bool { return self.content_type == .text_html; } + // we expect value to be lowercase fn parseContentType(value: []const u8) !struct { ContentType, usize } { - const separator = std.mem.indexOfScalarPos(u8, value, 0, '/') orelse { - return error.Invalid; - }; - const end = std.mem.indexOfScalarPos(u8, value, separator, ';') orelse blk: { - break :blk value.len; - }; + const end = std.mem.indexOfScalarPos(u8, value, 0, ';') orelse value.len; + const type_name = trimRight(value[0..end]); + const attribute_start = end + 1; + + if (std.meta.stringToEnum(enum { + @"text/xml", + @"text/html", + @"text/plain", + }, type_name)) |known_type| { + const ct: ContentType = switch (known_type) { + .@"text/xml" => .{ .text_xml = {} }, + .@"text/html" => .{ .text_html = {} }, + .@"text/plain" => .{ .text_plain = {} }, + }; + return .{ ct, attribute_start }; + } + + const separator = std.mem.indexOfScalarPos(u8, type_name, 0, '/') orelse return error.Invalid; const main_type = value[0..separator]; const sub_type = trimRight(value[separator + 1 .. end]); - if (parseCommonContentType(main_type, sub_type)) |content_type| { - return .{ content_type, end + 1 }; - } - - if (main_type.len == 0) { + if (main_type.len == 0 or validType(main_type) == false) { return error.Invalid; } - if (validType(main_type) == false) { + if (sub_type.len == 0 or validType(sub_type) == false) { return error.Invalid; } - if (sub_type.len == 0) { - return error.Invalid; - } - if (validType(sub_type) == false) { - return error.Invalid; - } - - const content_type = ContentType{ .other = .{ + return .{ .{ .other = .{ .type = main_type, .sub_type = sub_type, - } }; - - return .{ content_type, end + 1 }; - } - - fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType { - switch (main_type.len) { - 4 => if (isCaseEqual("text", main_type)) { - switch (sub_type.len) { - 3 => if (isCaseEqual("xml", sub_type)) { - return .{ .text_xml = {} }; - }, - 4 => if (isCaseEqual("html", sub_type)) { - return .{ .text_html = {} }; - }, - 5 => if (isCaseEqual("plain", sub_type)) { - return .{ .text_plain = {} }; - }, - else => {}, - } - }, - else => {}, - } - return null; + } }, attribute_start }; } const T_SPECIAL = blk: { @@ -148,7 +208,7 @@ pub const Mime = struct { break :blk v; }; - fn parseValue(arena: Allocator, value: []const u8) ![]const u8 { + fn parseAttributeValue(arena: Allocator, value: []const u8) ![]const u8 { if (value[0] != '"') { return value; } @@ -218,10 +278,6 @@ pub const Mime = struct { return true; } - fn trim(s: []const u8) []const u8 { - return std.mem.trim(u8, s, &std.ascii.whitespace); - } - fn trimLeft(s: []const u8) []const u8 { return std.mem.trimLeft(u8, s, &std.ascii.whitespace); } @@ -229,28 +285,12 @@ pub const Mime = struct { fn trimRight(s: []const u8) []const u8 { return std.mem.trimRight(u8, s, &std.ascii.whitespace); } - - fn isCaseEqual(comptime target: anytype, value: []const u8) bool { - // - 8 beause we don't care about the sentinel - const bit_len = @bitSizeOf(@TypeOf(target.*)) - 8; - const byte_len = bit_len / 8; - - const T = @Type(.{ .int = .{ - .bits = bit_len, - .signedness = .unsigned, - } }); - - const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*); - - if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) { - return true; - } - return std.ascii.eqlIgnoreCase(value, target); - } }; -const testing = std.testing; +const testing = @import("../testing.zig"); test "Mime: invalid " { + defer testing.reset(); + const invalids = [_][]const u8{ "", "text", @@ -270,11 +310,14 @@ test "Mime: invalid " { }; for (invalids) |invalid| { - try testing.expectError(error.Invalid, Mime.parse(undefined, invalid)); + const mutable_input = try testing.arena_allocator.dupe(u8, invalid); + try testing.expectError(error.Invalid, Mime.parse(undefined, mutable_input)); } } test "Mime: parse common" { + defer testing.reset(); + try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml"); try expect(.{ .content_type = .{ .text_html = {} } }, "text/html"); try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain"); @@ -297,6 +340,8 @@ test "Mime: parse common" { } test "Mime: parse uncommon" { + defer testing.reset(); + const text_javascript = Expectation{ .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } }, }; @@ -306,12 +351,14 @@ test "Mime: parse uncommon" { try expect(text_javascript, " text/javascript\t ;"); try expect( - .{ .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } } }, + .{ .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } } }, "Text/Javascript", ); } test "Mime: parse charset" { + defer testing.reset(); + try expect(.{ .content_type = .{ .text_xml = {} }, .charset = "utf-8", @@ -332,11 +379,12 @@ test "Mime: parse charset" { } test "Mime: isHTML" { + defer testing.reset(); + const isHTML = struct { fn isHTML(expected: bool, input: []const u8) !void { - var arena = std.heap.ArenaAllocator.init(testing.allocator); - defer arena.deinit(); - var mime = try Mime.parse(arena.allocator(), input); + const mutable_input = try testing.arena_allocator.dupe(u8, input); + var mime = try Mime.parse(testing.arena_allocator, mutable_input); try testing.expectEqual(expected, mime.isHTML()); } }.isHTML; @@ -348,6 +396,71 @@ test "Mime: isHTML" { try isHTML(false, "over/9000"); } +test "Mime: sniff" { + try testing.expectEqual(null, Mime.sniff("")); + try testing.expectEqual(null, Mime.sniff("")); + try testing.expectEqual(null, Mime.sniff("\n ")); + try testing.expectEqual(null, Mime.sniff("\n \t ")); + + const expectHTML = struct { + fn expect(input: []const u8) !void { + try testing.expectEqual(.text_html, std.meta.activeTag(Mime.sniff(input).?.content_type)); + } + }.expect; + + try expectHTML(" even more stufff"); + + try expectHTML(""); + + try expectHTML("