From b9f61466ba42898c91f3d32dbbe854fe5ef8667a Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Tue, 22 Apr 2025 10:54:29 +0800
Subject: [PATCH] Try to sniff the mime type based on the body content

Synchronous body reader now exposes a peek() function to get the first few bytes
from the response body. This will be no less than 100 bytes (assuming the body
is that big), but could be more. The streaming API, via res.next(), continues
to work as-is even if peek() is called.

Introduce Mime.sniff() that detects a few common types - the ones that we care
about right now - from the body content.
---
 src/browser/browser.zig |  21 ++-
 src/browser/mime.zig    | 293 +++++++++++++++++++++++++++-------------
 src/browser/xhr/xhr.zig |  34 ++---
 src/http/client.zig     | 144 +++++++++++++++-----
 src/testing.zig         |  11 ++
 5 files changed, 343 insertions(+), 160 deletions(-)

diff --git a/src/browser/browser.zig b/src/browser/browser.zig
index 5d2a0e61..a357c7da 100644
--- a/src/browser/browser.zig
+++ b/src/browser/browser.zig
@@ -435,24 +435,19 @@ pub const Page = struct {
 
         log.info("GET {any} {d}", .{ url, header.status });
 
-        const ct = blk: {
-            break :blk header.get("content-type") orelse {
-                // no content type in HTTP headers.
-                // TODO try to sniff mime type from the body.
-                log.info("no content-type HTTP header", .{});
+        const content_type = header.get("content-type");
 
-                // Assume it's HTML for now.
-                break :blk "text/html; charset=utf-8";
-            };
-        };
-
-        log.debug("header content-type: {s}", .{ct});
-        var mime = try Mime.parse(arena, ct);
+        const mime: Mime = blk: {
+            if (content_type) |ct| {
+                break :blk try Mime.parse(arena, ct);
+            }
+            break :blk Mime.sniff(try response.peek());
+        } orelse .unknown;
 
         if (mime.isHTML()) {
             try self.loadHTMLDoc(&response, mime.charset orelse "utf-8");
         } else {
-            log.info("non-HTML document: {s}", .{ct});
+            log.info("non-HTML document: {s}", .{content_type orelse "null"});
             var arr: std.ArrayListUnmanaged(u8) = .{};
             while (try response.next()) |data| {
                 try arr.appendSlice(arena, try arena.dupe(u8, data));
diff --git a/src/browser/mime.zig b/src/browser/mime.zig
index 33e14cba..21e4cb8c 100644
--- a/src/browser/mime.zig
+++ b/src/browser/mime.zig
@@ -24,10 +24,17 @@ pub const Mime = struct {
     params: []const u8 = "",
     charset: ?[]const u8 = null,
 
+    pub const unknown = Mime{
+        .params = "",
+        .charset = "",
+        .content_type = .{ .unknown = {} },
+    };
+
     pub const ContentTypeEnum = enum {
         text_xml,
         text_html,
         text_plain,
+        unknown,
         other,
     };
 
@@ -35,21 +42,26 @@ pub const Mime = struct {
         text_xml: void,
         text_html: void,
         text_plain: void,
+        unknown: void,
         other: struct { type: []const u8, sub_type: []const u8 },
     };
 
-    pub fn parse(arena: Allocator, input: []const u8) !Mime {
+    pub fn parse(arena: Allocator, input: []u8) !Mime {
         if (input.len > 255) {
             return error.TooBig;
         }
-        var trimmed = trim(input);
 
-        const content_type, const type_len = try parseContentType(trimmed);
-        if (type_len >= trimmed.len) {
+        // Zig's trim API is broken. The return type is always `[]const u8`,
+        // even if the input type is `[]u8`. @constCast is safe here.
+        var normalized = @constCast(std.mem.trim(u8, input, &std.ascii.whitespace));
+        _ = std.ascii.lowerString(normalized, normalized);
+
+        const content_type, const type_len = try parseContentType(normalized);
+        if (type_len >= normalized.len) {
             return .{ .content_type = content_type };
         }
 
-        const params = trimLeft(trimmed[type_len..]);
+        const params = trimLeft(normalized[type_len..]);
 
         var charset: ?[]const u8 = null;
 
@@ -63,11 +75,12 @@ pub const Mime = struct {
                 return error.Invalid;
             }
 
-            switch (name.len) {
-                7 => if (isCaseEqual("charset", name)) {
-                    charset = try parseValue(arena, value);
-                },
-                else => {},
+            const attribute_name = std.meta.stringToEnum(enum {
+                charset,
+            }, name) orelse continue;
+
+            switch (attribute_name) {
+                .charset => charset = try parseAttributeValue(arena, value),
             }
         }
 
@@ -78,66 +91,113 @@ pub const Mime = struct {
         };
     }
 
+    pub fn sniff(body: []const u8) ?Mime {
+        // Best-effort guess from the body's leading bytes; null if nothing matches. 0x0C is form feed.
+        const content = std.mem.trimLeft(u8, body, &.{ ' ', '\t', '\n', '\r', 0x0C });
+        if (content.len == 0) {
+            return null;
+        }
+
+        if (content[0] != '<') {
+            if (std.mem.startsWith(u8, content, &.{ 0xEF, 0xBB, 0xBF })) {
+                // UTF-8 BOM
+                return .{ .content_type = .{ .text_plain = {} } };
+            }
+            if (std.mem.startsWith(u8, content, &.{ 0xFE, 0xFF })) {
+                // UTF-16 big-endian BOM
+                return .{ .content_type = .{ .text_plain = {} } };
+            }
+            if (std.mem.startsWith(u8, content, &.{ 0xFF, 0xFE })) {
+                // UTF-16 little-endian BOM
+                return .{ .content_type = .{ .text_plain = {} } };
+            }
+            return null;
+        }
+
+        // The longest prefix we have is "<!DOCTYPE HTML ", 15 bytes. If we're
+        // here, we already know content[0] == '<', so we can skip that. So 14
+        // bytes.
+
+        // 15 - 1 because we don't need the leading '<'
+        var buf: [14]u8 = undefined;
+
+        const stripped = content[1..];
+        const prefix_len = @min(stripped.len, buf.len);
+        const prefix = std.ascii.lowerString(&buf, stripped[0..prefix_len]);
+
+        // Prefixes are lowercase and listed without the leading '<' (already matched above).
+        const known_prefixes = [_]struct { []const u8, ContentType }{
+            .{ "!doctype html", .{ .text_html = {} } },
+            .{ "html", .{ .text_html = {} } },
+            .{ "script", .{ .text_html = {} } },
+            .{ "iframe", .{ .text_html = {} } },
+            .{ "h1", .{ .text_html = {} } },
+            .{ "div", .{ .text_html = {} } },
+            .{ "font", .{ .text_html = {} } },
+            .{ "table", .{ .text_html = {} } },
+            .{ "a", .{ .text_html = {} } },
+            .{ "style", .{ .text_html = {} } },
+            .{ "title", .{ .text_html = {} } },
+            .{ "b", .{ .text_html = {} } },
+            .{ "body", .{ .text_html = {} } },
+            .{ "br", .{ .text_html = {} } },
+            .{ "p", .{ .text_html = {} } },
+            .{ "!--", .{ .text_html = {} } },
+            .{ "xml", .{ .text_xml = {} } }, // NOTE(review): the WHATWG sniffing table matches "<?xml"; confirm "xml" is intended
+        };
+        inline for (known_prefixes) |kp| {
+            const known_prefix = kp.@"0";
+            if (std.mem.startsWith(u8, prefix, known_prefix) and prefix.len > known_prefix.len) {
+                const next = prefix[known_prefix.len];
+                // Require a "tag-terminating-byte" so that e.g. "<a_" does not match "a".
+                if (next == ' ' or next == '>') {
+                    return .{ .content_type = kp.@"1" };
+                }
+            }
+        }
+
+        return null;
+    }
+
     pub fn isHTML(self: *const Mime) bool {
         return self.content_type == .text_html;
     }
 
+    // we expect value to be lowercase
     fn parseContentType(value: []const u8) !struct { ContentType, usize } {
-        const separator = std.mem.indexOfScalarPos(u8, value, 0, '/') orelse {
-            return error.Invalid;
-        };
-        const end = std.mem.indexOfScalarPos(u8, value, separator, ';') orelse blk: {
-            break :blk value.len;
-        };
+        const end = std.mem.indexOfScalarPos(u8, value, 0, ';') orelse value.len;
+        const type_name = trimRight(value[0..end]);
+        const attribute_start = end + 1;
+
+        if (std.meta.stringToEnum(enum {
+            @"text/xml",
+            @"text/html",
+            @"text/plain",
+        }, type_name)) |known_type| {
+            const ct: ContentType = switch (known_type) {
+                .@"text/xml" => .{ .text_xml = {} },
+                .@"text/html" => .{ .text_html = {} },
+                .@"text/plain" => .{ .text_plain = {} },
+            };
+            return .{ ct, attribute_start };
+        }
+
+        const separator = std.mem.indexOfScalarPos(u8, type_name, 0, '/') orelse return error.Invalid;
 
         const main_type = value[0..separator];
         const sub_type = trimRight(value[separator + 1 .. end]);
 
-        if (parseCommonContentType(main_type, sub_type)) |content_type| {
-            return .{ content_type, end + 1 };
-        }
-
-        if (main_type.len == 0) {
+        if (main_type.len == 0 or validType(main_type) == false) {
             return error.Invalid;
         }
-        if (validType(main_type) == false) {
+        if (sub_type.len == 0 or validType(sub_type) == false) {
             return error.Invalid;
         }
 
-        if (sub_type.len == 0) {
-            return error.Invalid;
-        }
-        if (validType(sub_type) == false) {
-            return error.Invalid;
-        }
-
-        const content_type = ContentType{ .other = .{
+        return .{ .{ .other = .{
             .type = main_type,
             .sub_type = sub_type,
-        } };
-
-        return .{ content_type, end + 1 };
-    }
-
-    fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType {
-        switch (main_type.len) {
-            4 => if (isCaseEqual("text", main_type)) {
-                switch (sub_type.len) {
-                    3 => if (isCaseEqual("xml", sub_type)) {
-                        return .{ .text_xml = {} };
-                    },
-                    4 => if (isCaseEqual("html", sub_type)) {
-                        return .{ .text_html = {} };
-                    },
-                    5 => if (isCaseEqual("plain", sub_type)) {
-                        return .{ .text_plain = {} };
-                    },
-                    else => {},
-                }
-            },
-            else => {},
-        }
-        return null;
+        } }, attribute_start };
     }
 
     const T_SPECIAL = blk: {
@@ -148,7 +208,7 @@ pub const Mime = struct {
         break :blk v;
     };
 
-    fn parseValue(arena: Allocator, value: []const u8) ![]const u8 {
+    fn parseAttributeValue(arena: Allocator, value: []const u8) ![]const u8 {
         if (value[0] != '"') {
             return value;
         }
@@ -218,10 +278,6 @@ pub const Mime = struct {
         return true;
     }
 
-    fn trim(s: []const u8) []const u8 {
-        return std.mem.trim(u8, s, &std.ascii.whitespace);
-    }
-
     fn trimLeft(s: []const u8) []const u8 {
         return std.mem.trimLeft(u8, s, &std.ascii.whitespace);
     }
@@ -229,28 +285,12 @@ pub const Mime = struct {
     fn trimRight(s: []const u8) []const u8 {
         return std.mem.trimRight(u8, s, &std.ascii.whitespace);
     }
-
-    fn isCaseEqual(comptime target: anytype, value: []const u8) bool {
-        // - 8 beause we don't care about the sentinel
-        const bit_len = @bitSizeOf(@TypeOf(target.*)) - 8;
-        const byte_len = bit_len / 8;
-
-        const T = @Type(.{ .int = .{
-            .bits = bit_len,
-            .signedness = .unsigned,
-        } });
-
-        const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*);
-
-        if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) {
-            return true;
-        }
-        return std.ascii.eqlIgnoreCase(value, target);
-    }
 };
 
-const testing = std.testing;
+const testing = @import("../testing.zig");
 test "Mime: invalid " {
+    defer testing.reset();
+
     const invalids = [_][]const u8{
         "",
         "text",
@@ -270,11 +310,14 @@ test "Mime: invalid " {
     };
 
     for (invalids) |invalid| {
-        try testing.expectError(error.Invalid, Mime.parse(undefined, invalid));
+        const mutable_input = try testing.arena_allocator.dupe(u8, invalid);
+        try testing.expectError(error.Invalid, Mime.parse(undefined, mutable_input));
     }
 }
 
 test "Mime: parse common" {
+    defer testing.reset();
+
     try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml");
     try expect(.{ .content_type = .{ .text_html = {} } }, "text/html");
     try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain");
@@ -297,6 +340,8 @@ test "Mime: parse common" {
 }
 
 test "Mime: parse uncommon" {
+    defer testing.reset();
+
     const text_javascript = Expectation{
         .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } },
     };
@@ -306,12 +351,14 @@ test "Mime: parse uncommon" {
     try expect(text_javascript, "  text/javascript\t  ;");
 
     try expect(
-        .{ .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } } },
+        .{ .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } } },
         "Text/Javascript",
     );
 }
 
 test "Mime: parse charset" {
+    defer testing.reset();
+
     try expect(.{
         .content_type = .{ .text_xml = {} },
         .charset = "utf-8",
@@ -332,11 +379,12 @@ test "Mime: parse charset" {
 }
 
 test "Mime: isHTML" {
+    defer testing.reset();
+
     const isHTML = struct {
         fn isHTML(expected: bool, input: []const u8) !void {
-            var arena = std.heap.ArenaAllocator.init(testing.allocator);
-            defer arena.deinit();
-            var mime = try Mime.parse(arena.allocator(), input);
+            const mutable_input = try testing.arena_allocator.dupe(u8, input);
+            var mime = try Mime.parse(testing.arena_allocator, mutable_input);
             try testing.expectEqual(expected, mime.isHTML());
         }
     }.isHTML;
@@ -348,6 +396,71 @@ test "Mime: isHTML" {
     try isHTML(false, "over/9000");
 }
 
+test "Mime: sniff" {
+    try testing.expectEqual(null, Mime.sniff("")); // empty body
+    try testing.expectEqual(null, Mime.sniff("<htm")); // not a known prefix
+    try testing.expectEqual(null, Mime.sniff("<html!")); // '!' is not a tag-terminating byte
+    try testing.expectEqual(null, Mime.sniff("<a_")); // '_' is not a tag-terminating byte
+    try testing.expectEqual(null, Mime.sniff("<!doctype html")); // nothing follows the prefix
+    try testing.expectEqual(null, Mime.sniff("<!doctype  html>")); // two spaces: prefix does not match
+    try testing.expectEqual(null, Mime.sniff("\n  <!doctype  html>")); // same, after leading whitespace
+    try testing.expectEqual(null, Mime.sniff("\n \t <font/>")); // '/' is not a tag-terminating byte
+
+    const expectHTML = struct {
+        fn expect(input: []const u8) !void {
+            try testing.expectEqual(.text_html, std.meta.activeTag(Mime.sniff(input).?.content_type));
+        }
+    }.expect;
+
+    try expectHTML("<!doctype html ");
+    try expectHTML("\n  \t    <!DOCTYPE HTML ");
+
+    try expectHTML("<html ");
+    try expectHTML("\n  \t    <HtmL> even more stufff");
+
+    try expectHTML("<script>");
+    try expectHTML("\n  \t    <SCRIpt >alert(document.cookies)</script>");
+
+    try expectHTML("<iframe>");
+    try expectHTML(" \t    <ifRAME >");
+
+    try expectHTML("<h1>");
+    try expectHTML("  <H1>");
+
+    try expectHTML("<div>");
+    try expectHTML("\n\r\r  <DiV>");
+
+    try expectHTML("<font>");
+    try expectHTML("  <fonT>");
+
+    try expectHTML("<table>");
+    try expectHTML("\t\t<TAblE>");
+
+    try expectHTML("<a>");
+    try expectHTML("\n\n<A>");
+
+    try expectHTML("<style>");
+    try expectHTML("    \n\t <STyLE>");
+
+    try expectHTML("<title>");
+    try expectHTML("    \n\t <TITLE>");
+
+    try expectHTML("<b>");
+    try expectHTML("    \n\t <B>");
+
+    try expectHTML("<body>");
+    try expectHTML("    \n\t <BODY>");
+
+    try expectHTML("<br>");
+    try expectHTML("    \n\t <BR>");
+
+    try expectHTML("<p>");
+    try expectHTML("    \n\t <P>");
+
+    try expectHTML("<!-->");
+    try expectHTML("    \n\t <!-->");
+}
+
 const Expectation = struct {
     content_type: Mime.ContentType,
     params: []const u8 = "",
@@ -355,11 +468,9 @@ const Expectation = struct {
 };
 
 fn expect(expected: Expectation, input: []const u8) !void {
-    var arena = std.heap.ArenaAllocator.init(testing.allocator);
-    defer arena.deinit();
-
-    const actual = try Mime.parse(arena.allocator(), input);
+    const mutable_input = try testing.arena_allocator.dupe(u8, input);
 
+    const actual = try Mime.parse(testing.arena_allocator, mutable_input);
     try testing.expectEqual(
         std.meta.activeTag(expected.content_type),
         std.meta.activeTag(actual.content_type),
@@ -368,16 +479,16 @@ fn expect(expected: Expectation, input: []const u8) !void {
     switch (expected.content_type) {
         .other => |e| {
             const a = actual.content_type.other;
-            try testing.expectEqualStrings(e.type, a.type);
-            try testing.expectEqualStrings(e.sub_type, a.sub_type);
+            try testing.expectEqual(e.type, a.type);
+            try testing.expectEqual(e.sub_type, a.sub_type);
         },
         else => {}, // already asserted above
     }
 
-    try testing.expectEqualStrings(expected.params, actual.params);
+    try testing.expectEqual(expected.params, actual.params);
 
     if (expected.charset) |ec| {
-        try testing.expectEqualStrings(ec, actual.charset.?);
+        try testing.expectEqual(ec, actual.charset.?);
     } else {
         try testing.expectEqual(null, actual.charset);
     }
diff --git a/src/browser/xhr/xhr.zig b/src/browser/xhr/xhr.zig
index b0aa59d5..716cef44 100644
--- a/src/browser/xhr/xhr.zig
+++ b/src/browser/xhr/xhr.zig
@@ -254,7 +254,7 @@ pub const XMLHttpRequest = struct {
     };
     const ResponseObj = union(ResponseObjTag) {
         Document: *parser.Document,
-        Failure: bool,
+        Failure: void,
         JSON: std.json.Parsed(JSONValue),
 
         fn deinit(self: ResponseObj) void {
@@ -511,12 +511,8 @@ pub const XMLHttpRequest = struct {
             }
 
             // extract a mime type from headers.
-            {
-                var raw: []const u8 = "text/xml";
-                if (header.get("content-type")) |ct| {
-                    raw = try self.arena.dupe(u8, ct);
-                }
-                self.response_mime = Mime.parse(self.arena, raw) catch |e| {
+            if (header.get("content-type")) |ct| {
+                self.response_mime = Mime.parse(self.arena, ct) catch |e| {
                     return self.onErr(e);
                 };
             }
@@ -724,26 +720,24 @@ pub const XMLHttpRequest = struct {
     // TODO parse XML.
     // https://xhr.spec.whatwg.org/#response-object
     fn setResponseObjDocument(self: *XMLHttpRequest) void {
-        const response_mime = &self.response_mime.?;
-        const isHTML = response_mime.isHTML();
-
-        // TODO If finalMIME is not an HTML MIME type or an XML MIME type, then
-        // return.
-        if (!isHTML) {
+        const mime = self.response_mime orelse return;
+        if (mime.isHTML() == false) {
             return;
         }
 
         var ccharset: [:0]const u8 = "utf-8";
-        if (response_mime.charset) |rc| {
-            ccharset = self.arena.dupeZ(u8, rc) catch {
-                self.response_obj = .{ .Failure = true };
-                return;
-            };
+        if (mime.charset) |rc| {
+            if (std.mem.eql(u8, rc, "utf-8") == false) {
+                ccharset = self.arena.dupeZ(u8, rc) catch {
+                    self.response_obj = .{ .Failure = {} };
+                    return;
+                };
+            }
         }
 
         var fbs = std.io.fixedBufferStream(self.response_bytes.items);
         const doc = parser.documentHTMLParse(fbs.reader(), ccharset) catch {
-            self.response_obj = .{ .Failure = true };
+            self.response_obj = .{ .Failure = {} };
             return;
         };
 
@@ -766,7 +760,7 @@ pub const XMLHttpRequest = struct {
             .{},
         ) catch |e| {
             log.err("parse JSON: {}", .{e});
-            self.response_obj = .{ .Failure = true };
+            self.response_obj = .{ .Failure = {} };
             return;
         };
 
diff --git a/src/http/client.zig b/src/http/client.zig
index c2946ca7..028c88a4 100644
--- a/src/http/client.zig
+++ b/src/http/client.zig
@@ -32,9 +32,13 @@ const Loop = @import("../runtime/loop.zig").Loop;
 
 const log = std.log.scoped(.http_client);
 
+// We might need to peek at the body to try and sniff the content-type.
+// While we only need a few bytes, in most cases we need to ignore leading
+// whitespace, so we want to get a reasonable-sized chunk.
+const PEEK_BUF_LEN = 1024;
+
 const BUFFER_LEN = 32 * 1024;
 
-// The longest individual header line that we support
 const MAX_HEADER_LINE_LEN = 4096;
 
 // Thread-safe. Holds our root certificate, connection pool and state pool
@@ -900,6 +904,7 @@ const SyncHandler = struct {
             // object which can be iterated to get the body.
             std.debug.assert(result.done or reader.body_reader != null);
             std.debug.assert(result.data == null);
+
             return .{
                 ._buf = buf,
                 ._request = request,
@@ -907,6 +912,8 @@ const SyncHandler = struct {
                 ._done = result.done,
                 ._connection = connection,
                 ._data = result.unprocessed,
+                ._peek_len = 0,
+                ._peek_buf = state.peek_buf,
                 .header = reader.response,
             };
         }
@@ -1046,7 +1053,7 @@ const Reader = struct {
 
         // Still parsing the header
 
-        // what data do we have leftover in `data`.
+        // What data do we have leftover in `data`?
         // When header_done == true, then this is part (or all) of the body
         // When header_done == false, then this is a header line that we didn't
         // have enough data for.
@@ -1504,23 +1511,49 @@ pub const Progress = struct {
     header: ResponseHeader,
 };
 
-// The value that we return from a synchronous requst.
+// The value that we return from a synchronous request.
 pub const Response = struct {
     _reader: Reader,
     _request: *Request,
-
-    _buf: []u8,
     _connection: SyncHandler.Connection,
 
+    // the buffer to read the peeked data into
+    _peek_buf: []u8,
+
+    // the length of data we've peeked. The peeked_data is _peek_buf[0.._peek_len].
+    // It's possible for peek_len > 0 and _done == true, in which case, the
+    // _peeked data should be emitted once and subsequent calls to `next` should
+    // return null.
+    _peek_len: usize,
+
+    // What we'll read from the socket into. This is the State's read_buf
+    _buf: []u8,
+
+    // Whether or not we're done reading the response. When true, next will
+    // return null.
     _done: bool,
 
-    // Any data we over-read while parsing the header. This will be returned on
-    // the first call to next();
+    // Data that we've read. This can be set when the Response is first created
+    // from extra data received while parsing the header. Or, it can be set
+    // when `next` is called and we read more data from the socket.
     _data: ?[]u8 = null,
     header: ResponseHeader,
 
     pub fn next(self: *Response) !?[]u8 {
-        var buf = self._buf;
+        // It's possible for peek_len > 0 and done == true. This would happen
+        // when, while peeking, we reached the end of the data. In that case,
+        // we return the peeked data once, and on subsequent calls, we'll return
+        // null normally, because done == true.
+        const pl = self._peek_len;
+        if (pl > 0) {
+            self._peek_len = 0;
+            return self._peek_buf[0..pl];
+        }
+
+        return self._nextIgnorePeek(self._buf);
+    }
+
+    fn _nextIgnorePeek(self: *Response, buf: []u8) !?[]u8 {
         while (true) {
             if (try self.processData()) |data| {
                 return data;
@@ -1541,14 +1574,38 @@ pub const Response = struct {
         self._data = result.unprocessed; // for the next call
         return result.data;
     }
+
+    pub fn peek(self: *Response) ![]u8 {
+        while (true) {
+            const peek_buf = self._peek_buf;
+            const peek_len = self._peek_len;
+            // Collect body data into peek_buf until we hold more than 100 bytes or the body ends.
+            const data = (try self._nextIgnorePeek(peek_buf[peek_len..])) orelse {
+                return peek_buf[0..peek_len];
+            };
+            // NOTE(review): assumes data fits in the rest of peek_buf; confirm for over-read header data.
+            const peek_end = peek_len + data.len;
+            @memcpy(peek_buf[peek_len..peek_end], data);
+            self._peek_len = peek_end;
+            // Return everything peeked so far (from index 0), not just the latest chunk.
+            if (peek_end > 100) {
+                return peek_buf[0..peek_end];
+            }
+        }
+    }
 };
 
 // Pooled and re-used when creating a request
 const State = struct {
-    // used for reading chunks of payload data.
+    // We might be asked to peek at the response, i.e. to sniff the mime type.
+    // This will require storing any peeked data so that, later, if we stream
+    // the body, we can present a cohesive body.
+    peek_buf: []u8,
+
+    // Used for reading chunks of payload data.
     read_buf: []u8,
 
-    // use for writing data. If you're wondering why BOTH a read_buf and a
+    // Used for writing data. If you're wondering why BOTH a read_buf and a
     // write_buf, even though HTTP is req -> resp, it's for TLS, which has
     // bidirectional data.
     write_buf: []u8,
@@ -1561,7 +1618,10 @@ const State = struct {
     // response headers.
     arena: ArenaAllocator,
 
-    fn init(allocator: Allocator, header_size: usize, buf_size: usize) !State {
+    fn init(allocator: Allocator, header_size: usize, peek_size: usize, buf_size: usize) !State {
+        const peek_buf = try allocator.alloc(u8, peek_size);
+        errdefer allocator.free(peek_buf);
+
         const read_buf = try allocator.alloc(u8, buf_size);
         errdefer allocator.free(read_buf);
 
@@ -1572,6 +1632,7 @@ const State = struct {
         errdefer allocator.free(header_buf);
 
         return .{
+            .peek_buf = peek_buf,
             .read_buf = read_buf,
             .write_buf = write_buf,
             .header_buf = header_buf,
@@ -1585,6 +1646,7 @@ const State = struct {
 
     fn deinit(self: *State) void {
         const allocator = self.arena.child_allocator;
+        allocator.free(self.peek_buf);
         allocator.free(self.read_buf);
         allocator.free(self.write_buf);
         allocator.free(self.header_buf);
@@ -1611,7 +1673,7 @@ const StatePool = struct {
         for (0..count) |i| {
             const state = try allocator.create(State);
             errdefer allocator.destroy(state);
-            state.* = try State.init(allocator, MAX_HEADER_LINE_LEN, BUFFER_LEN);
+            state.* = try State.init(allocator, MAX_HEADER_LINE_LEN, PEEK_BUF_LEN, BUFFER_LEN);
             states[i] = state;
             started += 1;
         }
@@ -1662,7 +1724,7 @@ const StatePool = struct {
 
 const testing = @import("../testing.zig");
 test "HttpClient Reader: fuzz" {
-    var state = try State.init(testing.allocator, 1024, 1024);
+    var state = try State.init(testing.allocator, 1024, 1024, 100);
     defer state.deinit();
 
     var res = TestResponse.init();
@@ -1773,18 +1835,23 @@ test "HttpClient: sync connect error" {
 }
 
 test "HttpClient: sync no body" {
-    var client = try testClient();
-    defer client.deinit();
+    for (0..2) |i| {
+        var client = try testClient();
+        defer client.deinit();
 
-    const uri = try Uri.parse("http://127.0.0.1:9582/http_client/simple");
-    var req = try client.request(.GET, &uri);
-    var res = try req.sendSync(.{});
+        const uri = try Uri.parse("http://127.0.0.1:9582/http_client/simple");
+        var req = try client.request(.GET, &uri);
+        var res = try req.sendSync(.{});
 
-    try testing.expectEqual(null, try res.next());
-    try testing.expectEqual(200, res.header.status);
-    try testing.expectEqual(2, res.header.count());
-    try testing.expectEqual("close", res.header.get("connection"));
-    try testing.expectEqual("0", res.header.get("content-length"));
+        if (i == 0) {
+            try testing.expectEqual("", try res.peek());
+        }
+        try testing.expectEqual(null, try res.next());
+        try testing.expectEqual(200, res.header.status);
+        try testing.expectEqual(2, res.header.count());
+        try testing.expectEqual("close", res.header.get("connection"));
+        try testing.expectEqual("0", res.header.get("content-length"));
+    }
 }
 
 test "HttpClient: sync tls no body" {
@@ -1804,21 +1871,26 @@ test "HttpClient: sync tls no body" {
 }
 
 test "HttpClient: sync with body" {
-    var client = try testClient();
-    defer client.deinit();
+    for (0..2) |i| {
+        var client = try testClient();
+        defer client.deinit();
 
-    const uri = try Uri.parse("http://127.0.0.1:9582/http_client/echo");
-    var req = try client.request(.GET, &uri);
-    var res = try req.sendSync(.{});
+        const uri = try Uri.parse("http://127.0.0.1:9582/http_client/echo");
+        var req = try client.request(.GET, &uri);
+        var res = try req.sendSync(.{});
 
-    try testing.expectEqual("over 9000!", try res.next());
-    try testing.expectEqual(201, res.header.status);
-    try testing.expectEqual(5, res.header.count());
-    try testing.expectEqual("close", res.header.get("connection"));
-    try testing.expectEqual("10", res.header.get("content-length"));
-    try testing.expectEqual("127.0.0.1", res.header.get("_host"));
-    try testing.expectEqual("Close", res.header.get("_connection"));
-    try testing.expectEqual("Lightpanda/1.0", res.header.get("_user-agent"));
+        if (i == 0) {
+            try testing.expectEqual("over 9000!", try res.peek());
+        }
+        try testing.expectEqual("over 9000!", try res.next());
+        try testing.expectEqual(201, res.header.status);
+        try testing.expectEqual(5, res.header.count());
+        try testing.expectEqual("close", res.header.get("connection"));
+        try testing.expectEqual("10", res.header.get("content-length"));
+        try testing.expectEqual("127.0.0.1", res.header.get("_host"));
+        try testing.expectEqual("Close", res.header.get("_connection"));
+        try testing.expectEqual("Lightpanda/1.0", res.header.get("_user-agent"));
+    }
 }
 
 test "HttpClient: sync tls with body" {
diff --git a/src/testing.zig b/src/testing.zig
index 9dbb8f79..e37b5e40 100644
--- a/src/testing.zig
+++ b/src/testing.zig
@@ -24,6 +24,17 @@ pub const expectError = std.testing.expectError;
 pub const expectString = std.testing.expectEqualStrings;
 pub const expectEqualSlices = std.testing.expectEqualSlices;
 
+// Sometimes it's super useful to have an arena you don't really care about
+// in a test. Like, you need a mutable string, so you just want to dupe a
+// string literal. It has nothing to do with the code under test, it's just
+// infrastructure for the test itself.
+pub var arena_instance = std.heap.ArenaAllocator.init(std.heap.c_allocator);
+pub const arena_allocator = arena_instance.allocator();
+// Resets the shared test arena, retaining its capacity; tests call it via `defer testing.reset();`.
+pub fn reset() void {
+    _ = arena_instance.reset(.{ .retain_capacity = {} });
+}
+
 const App = @import("app.zig").App;
 const parser = @import("browser/netsurf.zig");