Rework internal navigation to prevent deadlocking

The mix of sync and async HTTP requests requires care to avoid deadlocks. Previously, it was possible for async requests to use up all available HTTP state objects duration a navigation flow (either directly, or via an internal redirect (e.g. click, submit, ...)). This would block the navigation, which, because everything is single thread, would block the I/O loop, resulting in a deadlock. The correct solution seems to be to remove all synchronous I/O. And I tried to do that, but I ran into a wall with module-loading, which is initiated from V8. V8 says "give me the source for this module", and I don't see a great way to tell it: wait a bit. So I went back to trying to make this work with the hybrid model, despite last weeks failures to get it to work. I changed two things: 1 - The http client will only directly initiate an async request if there's at least 2 free state objects available (1 for the request, and leaving 1 free for any synchronous requests) 2 - Delayed navigation retries until there's at least 1 free http state object available. Commits from last week did help with this. First, we're now guaranteed to have a single sync-request at a time (previously, we could have had 2). Secondly, the async connection is now async end-to-end (previously, it could have blocked on an empty state pool). We could probably make this a bit more obviously by reserving 1 state object for synchronous requests. But, since the long term solution is probably having no synchronous requests, I'm happy with anything that lets me move past this issue.
2025-10-29 15:13:28 +00:00 · 2025-06-11 17:44:45 +08:00
parent 0de33b36f8
commit 97c769e805
5 changed files with 124 additions and 46 deletions
--- a/src/browser/page.zig
+++ b/src/browser/page.zig
@@ -192,7 +192,12 @@ pub const Page = struct {
        const session = self.session;
        const notification = session.browser.notification;

-        log.debug(.http, "navigate", .{ .url = request_url, .reason = opts.reason });
+        log.debug(.http, "navigate", .{
+            .url = request_url,
+            .method = opts.method,
+            .reason = opts.reason,
+            .body = opts.body != null,
+        });

        // if the url is about:blank, nothing to do.
        if (std.mem.eql(u8, "about:blank", request_url.raw)) {
@@ -685,6 +690,8 @@ pub const Page = struct {
        }
    }

+    // We cannot navigate immediately as navigating will delete the DOM tree,
+    // which holds this event's node.
    // As such we schedule the function to be called as soon as possible.
    // The page.arena is safe to use here, but the transfer_arena exists
    // specifically for this type of lifetime.
@@ -699,7 +706,7 @@ pub const Page = struct {
        navi.* = .{
            .opts = opts,
            .session = self.session,
-            .url = try arena.dupe(u8, url),
+            .url = try self.url.resolve(arena, url),
        };
        _ = try self.loop.timeout(0, &navi.navigate_node);
    }
@@ -787,15 +794,77 @@ pub const Page = struct {
 };

 const DelayedNavigation = struct {
-    url: []const u8,
+    url: URL,
    session: *Session,
    opts: NavigateOpts,
+    initial: bool = true,
    navigate_node: Loop.CallbackNode = .{ .func = delayNavigate },

+    // Navigation is blocking, which is problem because it can seize up
+    // the loop and deadlock. We can only safely try to navigate to a
+    // new page when we're sure there's at least 1 free slot in the
+    // http client. We handle this in two phases:
+    //
+    // In the first phase, when self.initial == true, we'll shutdown the page
+    // and create a new one. The shutdown is important, because it resets the
+    // loop ctx_id and closes the scope. Closing the scope calls our XHR
+    // destructors which aborts requests. This is necessary to make sure our
+    // [blocking] navigate won't block.
+    //
+    // In the 2nd phase, we wait until there's a free http slot so that our
+    // navigate definetly won't block (which could deadlock the system if there
+    // are still pending async requests, which we've seen happen, even after
+    // an abort).
    fn delayNavigate(node: *Loop.CallbackNode, repeat_delay: *?u63) void {
-        _ = repeat_delay;
        const self: *DelayedNavigation = @fieldParentPtr("navigate_node", node);
-        self.session.pageNavigate(self.url, self.opts) catch |err| {
+
+        const session = self.session;
+        const initial = self.initial;
+
+        if (initial) {
+            session.removePage();
+            _ = session.createPage() catch |err| {
+                log.err(.browser, "delayed navigation page error", .{
+                    .err = err,
+                    .url = self.url,
+                });
+                return;
+            };
+            self.initial = false;
+        }
+
+        if (session.browser.http_client.freeSlotCount() == 0) {
+            log.debug(.browser, "delayed navigate waiting", .{});
+            const delay = 0 * std.time.ns_per_ms;
+
+            // If this isn't the initial check, we can safely re-use the timer
+            // to check again.
+            if (initial == false) {
+                repeat_delay.* = delay;
+                return;
+            }
+
+            // However, if this _is_ the initial check, we called
+            // session.removePage above, and that reset the loop ctx_id.
+            // We can't re-use this timer, because it has the previous ctx_id.
+            // We can create a new timeout though, and that'll get the new ctx_id.
+            //
+            // Page has to be not-null here because we called createPage above.
+            _ = session.page.?.loop.timeout(delay, &self.navigate_node) catch |err| {
+                log.err(.browser, "delayed navigation loop err", .{ .err = err });
+            };
+            return;
+        }
+
+        const page = session.currentPage() orelse return;
+        defer if (!page.delayed_navigation) {
+            // If, while loading the page, we intend to navigate to another
+            // page, then we need to keep the transfer_arena around, as this
+            // sub-navigation is probably using it.
+            _ = session.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
+        };
+
+        return page.navigate(self.url, self.opts) catch |err| {
            log.err(.browser, "delayed navigation error", .{ .err = err, .url = self.url });
        };
    }
--- a/src/browser/session.zig
+++ b/src/browser/session.zig
@@ -22,6 +22,7 @@ const Allocator = std.mem.Allocator;

 const Env = @import("env.zig").Env;
 const Page = @import("page.zig").Page;
+const URL = @import("../url.zig").URL;
 const Browser = @import("browser.zig").Browser;
 const NavigateOpts = @import("page.zig").NavigateOpts;

@@ -72,7 +73,7 @@ pub const Session = struct {

    pub fn deinit(self: *Session) void {
        if (self.page != null) {
-            self.removePage() catch {};
+            self.removePage();
        }
        self.cookie_jar.deinit();
        self.storage_shed.deinit();
@@ -104,7 +105,7 @@ pub const Session = struct {
        return page;
    }

-    pub fn removePage(self: *Session) !void {
+    pub fn removePage(self: *Session) void {
        // Inform CDP the page is going to be removed, allowing other worlds to remove themselves before the main one
        self.browser.notification.dispatch(.page_remove, .{});

@@ -127,12 +128,6 @@ pub const Session = struct {
        // window.setTimeout and running microtasks should be ignored
        self.browser.app.loop.reset();

-        // Finally, we run the loop. Because of the reset just above, this will
-        // ignore any timeouts. And, because of the endScope about this, it
-        // should ensure that the http requests detect the shutdown socket and
-        // release their resources.
-        try self.browser.app.loop.run();
-
        self.page = null;

        // clear netsurf memory arena.
@@ -144,28 +139,4 @@ pub const Session = struct {
    pub fn currentPage(self: *Session) ?*Page {
        return &(self.page orelse return null);
    }
-
-    pub fn pageNavigate(self: *Session, url_string: []const u8, opts: NavigateOpts) !void {
-        // currently, this is only called from the page, so let's hope
-        // it isn't null!
-        std.debug.assert(self.page != null);
-
-        defer if (self.page) |*p| {
-            if (!p.delayed_navigation) {
-                // If, while loading the page, we intend to navigate to another
-                // page, then we need to keep the transfer_arena around, as this
-                // sub-navigation is probably using it.
-                _ = self.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
-            }
-        };
-
-        // it's safe to use the transfer arena here, because the page will
-        // eventually clone the URL using its own page_arena (after it gets
-        // the final URL, possibly following redirects)
-        const url = try self.page.?.url.resolve(self.transfer_arena, url_string);
-
-        try self.removePage();
-        var page = try self.createPage();
-        return page.navigate(url, opts);
-    }
 };
--- a/src/browser/xhr/xhr.zig
+++ b/src/browser/xhr/xhr.zig
@@ -341,10 +341,17 @@ pub const XMLHttpRequest = struct {
        log.debug(.script_event, "dispatch event", .{
            .type = typ,
            .source = "xhr",
+            .method = self.method,
            .url = self.url,
        });
        self._dispatchEvt(typ) catch |err| {
-            log.err(.app, "dispatch event error", .{ .err = err, .type = typ, .source = "xhr" });
+            log.err(.app, "dispatch event error", .{
+                .err = err,
+                .type = typ,
+                .source = "xhr",
+                .method = self.method,
+                .url = self.url,
+            });
        };
    }

@@ -365,10 +372,17 @@ pub const XMLHttpRequest = struct {
        log.debug(.script_event, "dispatch progress event", .{
            .type = typ,
            .source = "xhr",
+            .method = self.method,
            .url = self.url,
        });
        self._dispatchProgressEvent(typ, opts) catch |err| {
-            log.err(.app, "dispatch progress event error", .{ .err = err, .type = typ, .source = "xhr" });
+            log.err(.app, "dispatch progress event error", .{
+                .err = err,
+                .type = typ,
+                .source = "xhr",
+                .method = self.method,
+                .url = self.url,
+            });
        };
    }

--- a/src/cdp/domains/target.zig
+++ b/src/cdp/domains/target.zig
@@ -220,7 +220,7 @@ fn closeTarget(cmd: anytype) !void {
        bc.session_id = null;
    }

-    try bc.session.removePage();
+    bc.session.removePage();
    if (bc.isolated_world) |*world| {
        world.deinit();
        bc.isolated_world = null;
--- a/src/http/client.zig
+++ b/src/http/client.zig
@@ -113,10 +113,18 @@ pub const Client = struct {
        loop: *Loop,
        opts: RequestOpts,
    ) !void {
-        if (self.state_pool.acquireOrNull()) |state| {
-            // if we have state ready, we can skip the loop and immediately
-            // kick this request off.
-            return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
+
+        // See the page's DelayedNavitation for why we're doing this. TL;DR -
+        // we need to keep 1 slot available for the blocking page navigation flow
+        // (Almost worth keeping a dedicate State just for that flow, but keep
+        // thinking we need a more permanent solution (i.e. making everything
+        // non-blocking).
+        if (self.freeSlotCount() > 1) {
+            if (self.state_pool.acquireOrNull()) |state| {
+                // if we have state ready, we can skip the loop and immediately
+                // kick this request off.
+                return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
+            }
        }

        // This cannot be a client-owned MemoryPool. The page can end before
@@ -174,6 +182,10 @@ pub const Client = struct {
            .client = self,
        };
    }
+
+    pub fn freeSlotCount(self: *Client) usize {
+        return self.state_pool.freeSlotCount();
+    }
 };

 const RequestOpts = struct {
@@ -2531,6 +2543,12 @@ const StatePool = struct {
        allocator.free(self.states);
    }

+    pub fn freeSlotCount(self: *StatePool) usize {
+        self.mutex.lock();
+        defer self.mutex.unlock();
+        return self.available;
+    }
+
    pub fn acquireWait(self: *StatePool) *State {
        const states = self.states;

@@ -3022,8 +3040,14 @@ test "HttpClient: async connect error" {
        .{},
    );

-    try loop.io.run_for_ns(std.time.ns_per_ms);
-    try reset.timedWait(std.time.ns_per_s);
+    for (0..10) |_| {
+        try loop.io.run_for_ns(std.time.ns_per_ms * 10);
+        if (reset.isSet()) {
+            break;
+        }
+    } else {
+        return error.Timeout;
+    }
 }

 test "HttpClient: async no body" {