Add Accept-Encoding

This is necessary because CloudFront will send gzip-compressed content even if we don't ask for it. Properly handle scripts that are both async and defer. Add a helper to print the state of page wait, which can help identify what's causing the page to hang on page.wait.
2025-12-16 08:18:59 +00:00 · 2025-08-05 11:24:56 +08:00
parent 32566ccc80
commit 9876d79680
4 changed files with 119 additions and 27 deletions
--- a/src/browser/ScriptManager.zig
+++ b/src/browser/ScriptManager.zig
@@ -34,6 +34,9 @@ const ScriptManager = @This();
 page: *Page,
 // used to prevent recursive evaluation
 is_evaluating: bool,
 // Only once this is true can deferred scripts be run
 static_scripts_done: bool,
@@ -48,6 +51,8 @@ scripts: OrderList,
 // dom_loaded == true,
 deferreds: OrderList,
 shutdown: bool = false,
 client: *HttpClient,
 allocator: Allocator,
 buffer_pool: BufferPool,
@@ -63,6 +68,7 @@ pub fn init(browser: *Browser, page: *Page) ScriptManager {
        .asyncs = .{},
        .scripts = .{},
        .deferreds = .{},
        .is_evaluating = false,
        .allocator = allocator,
        .client = browser.http_client,
        .static_scripts_done = false,
@@ -72,6 +78,7 @@ pub fn init(browser: *Browser, page: *Page) ScriptManager {
 }
 /// Tears down the script manager: calls `reset` first, then deinitializes
 /// the buffer pool followed by the script pool.
 pub fn deinit(self: *ScriptManager) void {
    self.reset();
    self.buffer_pool.deinit();
    self.script_pool.deinit();
 }
@@ -193,7 +200,7 @@ pub fn addFromElement(self: *ScriptManager, element: *parser.Element) !void {
    };
    if (source == .@"inline") {
-        // if we're here, it means that we have pending scripts (i.e. self.ordered
+        // if we're here, it means that we have pending scripts (i.e. self.scripts
        // is not empty). Because the script is inline, it's complete/ready, but
        // we need to process them in order
        pending_script.complete = true;
@@ -201,9 +208,8 @@ pub fn addFromElement(self: *ScriptManager, element: *parser.Element) !void {
        return;
    }
    const list = self.getList(&pending_script.script);
    pending_script.node = .{ .data = pending_script };
-    list.append(&pending_script.node);
+    self.getList(&pending_script.script).append(&pending_script.node);
    errdefer pending_script.deinit();
@@ -255,7 +261,17 @@ pub fn staticScriptsDone(self: *ScriptManager) void {
 // try to evaluate completed scripts (in order). This is called whenever a script
 // is completed.
 fn evaluate(self: *ScriptManager) void {
    if (self.is_evaluating) {
        // It's possible for a script.eval to cause evaluate to be called again.
        // This is particularly true with blockingGet, but even without this,
        // it's theoretically possible (but unlikely). We could make this work
        // but there's little reason to support the complexity.
        return;
    }
    const page = self.page;
    self.is_evaluating = true;
    defer self.is_evaluating = false;
    while (self.scripts.first) |n| {
        var pending_script = n.data;
@@ -269,8 +285,8 @@ fn evaluate(self: *ScriptManager) void {
    if (self.static_scripts_done == false) {
        // We can only execute deferred scripts if
        // 1 - all the normal scripts are done
-        // 2 - and we've loaded all the normal scripts
+        // 2 - we've finished parsing the HTML and at least queued all the scripts
-        // The last one isn't obvious, but it's possible for self.scripts to/
+        // The last one isn't obvious, but it's possible for self.scripts to
        // be empty not because we're done executing all the normal scripts
        // but because we're done executing some (or maybe none), but we're still
        // parsing the HTML.
@@ -315,14 +331,17 @@ fn asyncDone(self: *ScriptManager) void {
 }
 fn getList(self: *ScriptManager, script: *const Script) *OrderList {
-    if (script.is_defer) {
+    // When a script has both the async and defer flag set, it should be
-        return &self.deferreds;
+    // treated as async. Async is newer, so some websites use both so that
-    }
+    // if async isn't known, it'll fallback to defer.
    if (script.is_async) {
        return &self.asyncs;
    }
    if (script.is_defer) {
        return &self.deferreds;
    }
    return &self.scripts;
 }
@@ -375,16 +394,22 @@ const PendingScript = struct {
    manager: *ScriptManager,
    fn deinit(self: *PendingScript) void {
        var manager = self.manager;
        const script = &self.script;
        const manager = self.manager;
        if (script.source == .remote) {
            manager.buffer_pool.release(script.source.remote);
        }
        manager.getList(script).remove(&self.node);
    }
    /// Unlinks this script's node from the manager list that `getList`
    /// selects for this script, then clears `self.node`. Does nothing when
    /// `self.node` is already null, so a repeated call is harmless.
    fn remove(self: *PendingScript) void {
        // Pointer capture: the list is handed the node stored in `self.node`
        // itself, not a copy of it.
        if (self.node) |*node| {
            self.manager.getList(&self.script).remove(node);
            self.node = null;
        }
    }
    fn startCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void {
        _ = self;
        log.debug(.http, "script fetch start", .{ .req = transfer });
@@ -392,19 +417,25 @@ const PendingScript = struct {
    fn headerCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void {
        const header = &transfer.response_header.?;
        if (header.status != 200) {
            return error.InvalidStatusCode;
        }
        // @newhttp TODO: pre size based on content-length
        // @newhttp TODO: max-length enforcement
        self.script.source = .{ .remote = self.manager.buffer_pool.get() };
        log.debug(.http, "script header", .{
            .req = transfer,
            .status = header.status,
            .content_type = header.contentType(),
        });
        if (header.status != 200) {
            return error.InvalidStatusCode;
        }
        // If this isn't true, then we'll likely leak memory. If you don't
        // set `CURLOPT_SUPPRESS_CONNECT_HEADERS` and CONNECT to a proxy, this
        // will fail. This assertion exists to catch incorrect assumptions about
        // how libcurl works, or about how we've configured it.
        std.debug.assert(self.script.source.remote.capacity == 0);
        // @newhttp TODO: pre size based on content-length
        // @newhttp TODO: max-length enforcement
        self.script.source = .{ .remote = self.manager.buffer_pool.get() };
    }
    fn dataCallback(self: *PendingScript, transfer: *HttpClient.Transfer, data: []const u8) !void {
@@ -436,9 +467,15 @@ const PendingScript = struct {
    /// Error handler for this script's fetch. Logs the failure, disposes of
    /// the pending script and, unless the manager has been flagged as shut
    /// down, asks the manager to evaluate whatever is still queued.
    fn errorCallback(self: *PendingScript, err: anyerror) void {
        // Grab the manager up front; `self` is not touched again once
        // `deinit` has been called on it.
        const script_manager = self.manager;

        log.warn(.http, "script fetch error", .{ .req = self.script.url, .err = err });
        self.deinit();

        if (!script_manager.shutdown) {
            script_manager.evaluate();
        }
    }
 };
--- a/src/browser/page.zig
+++ b/src/browser/page.zig
@@ -153,6 +153,8 @@ pub const Page = struct {
    }
    /// Shuts the page down: flags the script manager as shut down, calls
    /// `abort` on the HTTP client, and only then deinitializes the script
    /// manager.
    pub fn deinit(self: *Page) void {
        // Set before aborting so that PendingScript.errorCallback returns early
        // instead of calling evaluate() again.
        // NOTE(review): assumes abort() is what triggers those error
        // callbacks — confirm against HttpClient.
        self.script_manager.shutdown = true;
        self.http_client.abort();
        self.script_manager.deinit();
    }
@@ -268,6 +270,9 @@ pub const Page = struct {
        var scheduler = &self.scheduler;
        var http_client = self.http_client;
        // for debugging
        // defer self.printWaitAnalysis();
        while (true) {
            SW: switch (self.mode) {
                .pre, .raw => {
@@ -346,6 +351,56 @@ pub const Page = struct {
        }
    }
    /// Debugging aid: dumps (via `std.debug.print`, i.e. to stderr) the state
    /// that can keep a page wait from finishing.
    ///
    /// Prints, in order: the active page mode, the load state, the HTTP
    /// client's active request count, every pending script in the script
    /// manager's `scripts`, `deferreds` and `asyncs` lists along with its
    /// `complete` flag, and every task in the primary and secondary schedules
    /// along with `task.ms - now`.
    fn printWaitAnalysis(self: *Page) void {
        std.debug.print("mode: {s}\n", .{@tagName(std.meta.activeTag(self.mode))});
        std.debug.print("load: {s}\n", .{@tagName(self.load_state)});
        std.debug.print("active requests: {d}\n", .{self.http_client.active});

        {
            std.debug.print("\nscripts: {d}\n", .{self.script_manager.scripts.len});
            var n_ = self.script_manager.scripts.first;
            while (n_) |n| {
                std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
                n_ = n.next;
            }
        }

        {
            std.debug.print("\ndeferreds: {d}\n", .{self.script_manager.deferreds.len});
            var n_ = self.script_manager.deferreds.first;
            while (n_) |n| {
                std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
                n_ = n.next;
            }
        }

        {
            std.debug.print("\nasyncs: {d}\n", .{self.script_manager.asyncs.len});
            var n_ = self.script_manager.asyncs.first;
            while (n_) |n| {
                std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
                n_ = n.next;
            }
        }

        // Only the schedule dumps below need the current time.
        // NOTE(review): assumes task.ms is an absolute millisecond timestamp
        // comparable to std.time.milliTimestamp() — confirm against the
        // scheduler. A negative value would then mean the task is overdue.
        const now = std.time.milliTimestamp();

        {
            std.debug.print("\nprimary schedule: {d}\n", .{self.scheduler.primary.count()});
            var it = self.scheduler.primary.iterator();
            while (it.next()) |task| {
                // The value is a time delta, not a completion flag, so it is
                // labelled as such.
                std.debug.print(" - {s} due in: {any}ms\n", .{ task.name, task.ms - now });
            }
        }

        {
            std.debug.print("\nsecondary schedule: {d}\n", .{self.scheduler.secondary.count()});
            var it = self.scheduler.secondary.iterator();
            while (it.next()) |task| {
                std.debug.print(" - {s} due in: {any}ms\n", .{ task.name, task.ms - now });
            }
        }
    }
    pub fn origin(self: *const Page, arena: Allocator) ![]const u8 {
        var arr: std.ArrayListUnmanaged(u8) = .{};
        try self.url.origin(arr.writer(arena));
--- a/src/browser/session.zig
+++ b/src/browser/session.zig
@@ -118,18 +118,13 @@ pub const Session = struct {
        std.debug.assert(self.page != null);
-        // Cleanup is a bit sensitive. We could still have inflight I/O. For
+        self.page.?.deinit();
-        // example, we could have an XHR request which is still in the connect
+        self.page = null;
-        // phase. It's important that we clean these up, as they're holding onto
+
        // limited resources (like our fixed-sized http state pool).
        //
        // RemoveJsContext() will execute the destructor of any type that
        // registered a destructor (e.g. XMLHttpRequest).
        self.executor.removeJsContext();
        self.page.?.deinit();
        self.page = null;
        // clear netsurf memory arena.
        parser.deinit();
--- a/src/http/Http.zig
+++ b/src/http/Http.zig
@@ -141,6 +141,11 @@ pub const Connection = struct {
            }
        }
        // compression, don't remove this. CloudFront will send gzip content
        // even if we don't support it, and then it won't be decompressed.
        // empty string means: use whatever's available
        try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_ACCEPT_ENCODING, ""));
        // debug
        if (comptime Http.ENABLE_DEBUG) {
            try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_VERBOSE, @as(c_long, 1)));