mirror of
https://github.com/lightpanda-io/browser.git
synced 2025-10-28 22:53:28 +00:00
Add Accept-Encoding
This is necessary because of CloudFront, which will send gzip content even if we don't ask for it. Properly handle scripts that are both async and defer. Add a helper that prints the state of a page wait; this can help identify what's causing the page to hang in page.wait.
This commit is contained in:
@@ -34,6 +34,9 @@ const ScriptManager = @This();
|
||||
|
||||
page: *Page,
|
||||
|
||||
// used to prevent recursive evaluation
|
||||
is_evaluating: bool,
|
||||
|
||||
// Only once this is true can deferred scripts be run
|
||||
static_scripts_done: bool,
|
||||
|
||||
@@ -48,6 +51,8 @@ scripts: OrderList,
|
||||
// dom_loaded == true,
|
||||
deferreds: OrderList,
|
||||
|
||||
shutdown: bool = false,
|
||||
|
||||
client: *HttpClient,
|
||||
allocator: Allocator,
|
||||
buffer_pool: BufferPool,
|
||||
@@ -63,6 +68,7 @@ pub fn init(browser: *Browser, page: *Page) ScriptManager {
|
||||
.asyncs = .{},
|
||||
.scripts = .{},
|
||||
.deferreds = .{},
|
||||
.is_evaluating = false,
|
||||
.allocator = allocator,
|
||||
.client = browser.http_client,
|
||||
.static_scripts_done = false,
|
||||
@@ -72,6 +78,7 @@ pub fn init(browser: *Browser, page: *Page) ScriptManager {
|
||||
}
|
||||
|
||||
// Release all resources owned by the ScriptManager.
// NOTE(review): assumes reset() drops any still-pending/async/deferred
// scripts before the backing pools are freed — confirm against reset().
pub fn deinit(self: *ScriptManager) void {
    self.reset();
    self.buffer_pool.deinit();
    self.script_pool.deinit();
}
|
||||
@@ -193,7 +200,7 @@ pub fn addFromElement(self: *ScriptManager, element: *parser.Element) !void {
|
||||
};
|
||||
|
||||
if (source == .@"inline") {
|
||||
// if we're here, it means that we have pending scripts (i.e. self.ordered
|
||||
// if we're here, it means that we have pending scripts (i.e. self.scripts
|
||||
// is not empty). Because the script is inline, it's complete/ready, but
|
||||
// we need to process them in order
|
||||
pending_script.complete = true;
|
||||
@@ -201,9 +208,8 @@ pub fn addFromElement(self: *ScriptManager, element: *parser.Element) !void {
|
||||
return;
|
||||
}
|
||||
|
||||
const list = self.getList(&pending_script.script);
|
||||
pending_script.node = .{ .data = pending_script };
|
||||
list.append(&pending_script.node);
|
||||
self.getList(&pending_script.script).append(&pending_script.node);
|
||||
|
||||
errdefer pending_script.deinit();
|
||||
|
||||
@@ -255,7 +261,17 @@ pub fn staticScriptsDone(self: *ScriptManager) void {
|
||||
// try to evaluate completed scripts (in order). This is called whenever a script
|
||||
// is completed.
|
||||
fn evaluate(self: *ScriptManager) void {
|
||||
if (self.is_evaluating) {
|
||||
// It's possible for a script.eval to cause evaluate to be called again.
|
||||
// This is particularly true with blockingGet, but even without this,
|
||||
// it's theoretically possible (but unlikely). We could make this work
|
||||
// but there's little reason to support the complexity.
|
||||
return;
|
||||
}
|
||||
|
||||
const page = self.page;
|
||||
self.is_evaluating = true;
|
||||
defer self.is_evaluating = false;
|
||||
|
||||
while (self.scripts.first) |n| {
|
||||
var pending_script = n.data;
|
||||
@@ -269,8 +285,8 @@ fn evaluate(self: *ScriptManager) void {
|
||||
if (self.static_scripts_done == false) {
|
||||
// We can only execute deferred scripts if
|
||||
// 1 - all the normal scripts are done
|
||||
// 2 - and we've loaded all the normal scripts
|
||||
// The last one isn't obvious, but it's possible for self.scripts to/
|
||||
// 2 - we've finished parsing the HTML and at least queued all the scripts
|
||||
// The last one isn't obvious, but it's possible for self.scripts to
|
||||
// be empty not because we're done executing all the normal scripts
|
||||
// but because we're done executing some (or maybe none), but we're still
|
||||
// parsing the HTML.
|
||||
@@ -315,14 +331,17 @@ fn asyncDone(self: *ScriptManager) void {
|
||||
}
|
||||
|
||||
// Picks which ordered list a script belongs to based on its flags.
//
// When a script has both the async and defer flag set, it should be
// treated as async. Async is newer, so some websites use both so that
// if async isn't known, it'll fallback to defer. For that reason the
// async check MUST come before the defer check — a leading defer check
// would make the async-wins rule dead code for dual-flag scripts.
fn getList(self: *ScriptManager, script: *const Script) *OrderList {
    if (script.is_async) {
        return &self.asyncs;
    }

    if (script.is_defer) {
        return &self.deferreds;
    }

    return &self.scripts;
}
|
||||
|
||||
@@ -375,16 +394,22 @@ const PendingScript = struct {
|
||||
manager: *ScriptManager,
|
||||
|
||||
fn deinit(self: *PendingScript) void {
|
||||
var manager = self.manager;
|
||||
const script = &self.script;
|
||||
const manager = self.manager;
|
||||
|
||||
if (script.source == .remote) {
|
||||
manager.buffer_pool.release(script.source.remote);
|
||||
}
|
||||
|
||||
manager.getList(script).remove(&self.node);
|
||||
}
|
||||
|
||||
// Unlink this pending script from whichever list currently holds it.
// Safe to call when the script was never linked, or was already
// removed: in that case node is null and this is a no-op.
fn remove(self: *PendingScript) void {
    if (self.node == null) {
        return;
    }
    self.manager.getList(&self.script).remove(&self.node.?);
    self.node = null;
}
|
||||
|
||||
fn startCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void {
|
||||
_ = self;
|
||||
log.debug(.http, "script fetch start", .{ .req = transfer });
|
||||
@@ -392,19 +417,25 @@ const PendingScript = struct {
|
||||
|
||||
fn headerCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void {
|
||||
const header = &transfer.response_header.?;
|
||||
if (header.status != 200) {
|
||||
return error.InvalidStatusCode;
|
||||
}
|
||||
|
||||
// @newhttp TODO: pre size based on content-length
|
||||
// @newhttp TODO: max-length enforcement
|
||||
self.script.source = .{ .remote = self.manager.buffer_pool.get() };
|
||||
|
||||
log.debug(.http, "script header", .{
|
||||
.req = transfer,
|
||||
.status = header.status,
|
||||
.content_type = header.contentType(),
|
||||
});
|
||||
|
||||
if (header.status != 200) {
|
||||
return error.InvalidStatusCode;
|
||||
}
|
||||
|
||||
// If this isn't true, then we'll likely leak memory. If you don't
|
||||
// set `CURLOPT_SUPPRESS_CONNECT_HEADERS` and CONNECT to a proxy, this
|
||||
// will fail. This assertion exists to catch incorrect assumptions about
|
||||
// how libcurl works, or about how we've configured it.
|
||||
std.debug.assert(self.script.source.remote.capacity == 0);
|
||||
|
||||
// @newhttp TODO: pre size based on content-length
|
||||
// @newhttp TODO: max-length enforcement
|
||||
self.script.source = .{ .remote = self.manager.buffer_pool.get() };
|
||||
}
|
||||
|
||||
fn dataCallback(self: *PendingScript, transfer: *HttpClient.Transfer, data: []const u8) !void {
|
||||
@@ -436,9 +467,15 @@ const PendingScript = struct {
|
||||
|
||||
// Invoked when the fetch for this script fails.
fn errorCallback(self: *PendingScript, err: anyerror) void {
    log.warn(.http, "script fetch error", .{ .req = self.script.url, .err = err });

    // Capture the manager before deinit, which unlinks this pending
    // script and may invalidate `self`.
    const manager = self.manager;
    self.deinit();

    // Unless we're shutting down, see whether any completed scripts
    // can now be evaluated.
    if (!manager.shutdown) {
        manager.evaluate();
    }
}
|
||||
};
|
||||
|
||||
@@ -153,6 +153,8 @@ pub const Page = struct {
|
||||
}
|
||||
|
||||
// Tear down the page. The shutdown flag is set *before* aborting
// in-flight requests: abort() fires error callbacks, and the flag stops
// those callbacks from trying to evaluate further scripts mid-teardown.
pub fn deinit(self: *Page) void {
    self.script_manager.shutdown = true;

    self.http_client.abort();
    self.script_manager.deinit();
}
|
||||
@@ -268,6 +270,9 @@ pub const Page = struct {
|
||||
var scheduler = &self.scheduler;
|
||||
var http_client = self.http_client;
|
||||
|
||||
// for debugging
|
||||
// defer self.printWaitAnalysis();
|
||||
|
||||
while (true) {
|
||||
SW: switch (self.mode) {
|
||||
.pre, .raw => {
|
||||
@@ -346,6 +351,56 @@ pub const Page = struct {
|
||||
}
|
||||
}
|
||||
|
||||
// Debug helper: dumps the state page.wait() depends on (mode, load
// state, in-flight requests, queued scripts, scheduled tasks) to help
// identify what's causing a page to hang in page.wait.
fn printWaitAnalysis(self: *Page) void {
    std.debug.print("mode: {s}\n", .{@tagName(std.meta.activeTag(self.mode))});
    std.debug.print("load: {s}\n", .{@tagName(self.load_state)});
    std.debug.print("active requests: {d}\n", .{self.http_client.active});

    {
        std.debug.print("\nscripts: {d}\n", .{self.script_manager.scripts.len});
        var n_ = self.script_manager.scripts.first;
        while (n_) |n| {
            std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
            n_ = n.next;
        }
    }

    {
        std.debug.print("\ndeferreds: {d}\n", .{self.script_manager.deferreds.len});
        var n_ = self.script_manager.deferreds.first;
        while (n_) |n| {
            std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
            n_ = n.next;
        }
    }

    const now = std.time.milliTimestamp();

    {
        std.debug.print("\nasyncs: {d}\n", .{self.script_manager.asyncs.len});
        var n_ = self.script_manager.asyncs.first;
        while (n_) |n| {
            std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
            n_ = n.next;
        }
    }

    {
        std.debug.print("\nprimary schedule: {d}\n", .{self.scheduler.primary.count()});
        var it = self.scheduler.primary.iterator();
        while (it.next()) |task| {
            // Fixed label: this was copy-pasted from the script loops
            // above as "complete: {any}", but the value is a time
            // offset (ms until the task is due), not a flag.
            std.debug.print(" - {s} due in: {d}ms\n", .{ task.name, task.ms - now });
        }
    }

    {
        std.debug.print("\nsecondary schedule: {d}\n", .{self.scheduler.secondary.count()});
        var it = self.scheduler.secondary.iterator();
        while (it.next()) |task| {
            std.debug.print(" - {s} due in: {d}ms\n", .{ task.name, task.ms - now });
        }
    }
}
|
||||
|
||||
pub fn origin(self: *const Page, arena: Allocator) ![]const u8 {
|
||||
var arr: std.ArrayListUnmanaged(u8) = .{};
|
||||
try self.url.origin(arr.writer(arena));
|
||||
|
||||
@@ -118,18 +118,13 @@ pub const Session = struct {
|
||||
|
||||
std.debug.assert(self.page != null);
|
||||
|
||||
// Cleanup is a bit sensitive. We could still have inflight I/O. For
|
||||
// example, we could have an XHR request which is still in the connect
|
||||
// phase. It's important that we clean these up, as they're holding onto
|
||||
// limited resources (like our fixed-sized http state pool).
|
||||
//
|
||||
self.page.?.deinit();
|
||||
self.page = null;
|
||||
|
||||
// RemoveJsContext() will execute the destructor of any type that
|
||||
// registered a destructor (e.g. XMLHttpRequest).
|
||||
self.executor.removeJsContext();
|
||||
|
||||
self.page.?.deinit();
|
||||
self.page = null;
|
||||
|
||||
// clear netsurf memory arena.
|
||||
parser.deinit();
|
||||
|
||||
|
||||
@@ -141,6 +141,11 @@ pub const Connection = struct {
|
||||
}
|
||||
}
|
||||
|
||||
// compression, don't remove this. CloudFront will send gzip content
|
||||
// even if we don't support it, and then it won't be decompressed.
|
||||
// empty string means: use whatever's available
|
||||
try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_ACCEPT_ENCODING, ""));
|
||||
|
||||
// debug
|
||||
if (comptime Http.ENABLE_DEBUG) {
|
||||
try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_VERBOSE, @as(c_long, 1)));
|
||||
|
||||
Reference in New Issue
Block a user