From 97c769e80560c4450d0c4507821da2600b4f39b9 Mon Sep 17 00:00:00 2001
From: Karl Seguin
Date: Wed, 11 Jun 2025 17:44:45 +0800
Subject: [PATCH 1/2] Rework internal navigation to prevent deadlocking

The mix of sync and async HTTP requests requires care to avoid
deadlocks. Previously, it was possible for async requests to use up all
available HTTP state objects during a navigation flow (either directly,
or via an internal redirect (e.g. click, submit, ...)). This would block
the navigation, which, because everything is single-threaded, would
block the I/O loop, resulting in a deadlock.

The correct solution seems to be to remove all synchronous I/O. And I
tried to do that, but I ran into a wall with module-loading, which is
initiated from V8. V8 says "give me the source for this module", and I
don't see a great way to tell it: wait a bit.

So I went back to trying to make this work with the hybrid model,
despite last week's failures to get it to work. I changed two things:

1 - The http client will only directly initiate an async request if
there are at least 2 free state objects available (1 for the request,
leaving 1 free for any synchronous requests); see the sketch below.

2 - Delayed navigation retries until there's at least 1 free http
state object available.
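To make change 1 concrete, here is a minimal, self-contained sketch of
the reservation rule. The Pool type is hypothetical (the real StatePool
lives in src/http/client.zig and differs in detail); it only shows the
invariant: an async request may take a slot only if doing so still
leaves one slot free for a blocking navigation request.

    const std = @import("std");

    const Pool = struct {
        mutex: std.Thread.Mutex = .{},
        available: usize,

        // Mirrors StatePool.freeSlotCount in this patch: a mutex-guarded
        // read of the number of free state objects.
        fn freeSlotCount(self: *Pool) usize {
            self.mutex.lock();
            defer self.mutex.unlock();
            return self.available;
        }

        // Async acquisition: only proceed while more than one slot is
        // free, reserving the last slot for synchronous navigation.
        fn tryAcquireAsync(self: *Pool) bool {
            self.mutex.lock();
            defer self.mutex.unlock();
            if (self.available > 1) {
                self.available -= 1;
                return true;
            }
            return false;
        }
    };

    test "async acquire always leaves one slot for sync navigation" {
        var pool = Pool{ .available = 2 };
        try std.testing.expect(pool.tryAcquireAsync()); // 2 free -> ok
        try std.testing.expect(!pool.tryAcquireAsync()); // last slot is reserved
    }

(The real client does this in two steps, freeSlotCount() > 1 followed by
state_pool.acquireOrNull(), which the single-threaded I/O loop makes
safe in practice.)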
Commits from last week did help with this. First, we're now guaranteed
to have a single sync request at a time (previously, we could have had
2). Second, the async connection is now async end-to-end (previously,
it could have blocked on an empty state pool).

We could probably make this a bit more obvious by reserving 1 state
object for synchronous requests. But, since the long-term solution is
probably having no synchronous requests, I'm happy with anything that
lets me move past this issue.
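Change 2 relies on the loop's callback-node mechanism: a timer callback
can re-arm itself by setting repeat_delay, or stop by leaving it null.
The sketch below is a rough, hypothetical illustration of that pattern
(stub CallbackNode and Retry types, no real event loop);
DelayedNavigation.delayNavigate in this patch uses the same shape, plus
the ctx_id caveat described in its comments.

    const std = @import("std");

    const CallbackNode = struct {
        func: *const fn (node: *CallbackNode, repeat_delay: *?u63) void,
    };

    const Retry = struct {
        attempts_left: u8,
        node: CallbackNode = .{ .func = tick },

        fn tick(node: *CallbackNode, repeat_delay: *?u63) void {
            const self: *Retry = @fieldParentPtr("node", node);
            if (self.attempts_left == 0) {
                // Done: leaving repeat_delay null stops the timer.
                return;
            }
            self.attempts_left -= 1;
            // Not done: re-arm the same timer, as delayNavigate does
            // when it finds no free http slot.
            repeat_delay.* = 0;
        }
    };

    test "callback re-arms itself until it has nothing left to do" {
        var retry = Retry{ .attempts_left = 3 };
        var fired: usize = 0;
        while (true) {
            var repeat: ?u63 = null;
            retry.node.func(&retry.node, &repeat);
            fired += 1;
            if (repeat == null) break;
        }
        try std.testing.expectEqual(@as(usize, 4), fired);
    }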
---
 src/browser/page.zig       | 79 +++++++++++++++++++++++++++++++++++---
 src/browser/session.zig    | 35 ++---------------
 src/browser/xhr/xhr.zig    | 18 ++++++++-
 src/cdp/domains/target.zig |  2 +-
 src/http/client.zig        | 36 ++++++++++++++---
 5 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/src/browser/page.zig b/src/browser/page.zig
index 2a652c88..9b9ba9ae 100644
--- a/src/browser/page.zig
+++ b/src/browser/page.zig
@@ -192,7 +192,12 @@ pub const Page = struct {
         const session = self.session;
         const notification = session.browser.notification;
 
-        log.debug(.http, "navigate", .{ .url = request_url, .reason = opts.reason });
+        log.debug(.http, "navigate", .{
+            .url = request_url,
+            .method = opts.method,
+            .reason = opts.reason,
+            .body = opts.body != null,
+        });
 
         // if the url is about:blank, nothing to do.
         if (std.mem.eql(u8, "about:blank", request_url.raw)) {
@@ -685,6 +690,8 @@ pub const Page = struct {
             }
         }
 
+        // We cannot navigate immediately as navigating will delete the DOM tree,
+        // which holds this event's node.
         // As such we schedule the function to be called as soon as possible.
         // The page.arena is safe to use here, but the transfer_arena exists
         // specifically for this type of lifetime.
@@ -699,7 +706,7 @@ pub const Page = struct {
         navi.* = .{
             .opts = opts,
             .session = self.session,
-            .url = try arena.dupe(u8, url),
+            .url = try self.url.resolve(arena, url),
         };
         _ = try self.loop.timeout(0, &navi.navigate_node);
     }
@@ -787,15 +794,77 @@ pub const Page = struct {
 };
 
 const DelayedNavigation = struct {
-    url: []const u8,
+    url: URL,
     session: *Session,
     opts: NavigateOpts,
+    initial: bool = true,
     navigate_node: Loop.CallbackNode = .{ .func = delayNavigate },
 
+    // Navigation is blocking, which is a problem because it can seize up
+    // the loop and deadlock. We can only safely try to navigate to a
+    // new page when we're sure there's at least 1 free slot in the
+    // http client. We handle this in two phases:
+    //
+    // In the first phase, when self.initial == true, we shut down the page
+    // and create a new one. The shutdown is important, because it resets the
+    // loop ctx_id and closes the scope. Closing the scope calls our XHR
+    // destructors, which abort requests. This is necessary to make sure our
+    // [blocking] navigate won't block.
+    //
+    // In the 2nd phase, we wait until there's a free http slot so that our
+    // navigate definitely won't block (which could deadlock the system if
+    // there are still pending async requests, which we've seen happen, even
+    // after an abort).
     fn delayNavigate(node: *Loop.CallbackNode, repeat_delay: *?u63) void {
-        _ = repeat_delay;
         const self: *DelayedNavigation = @fieldParentPtr("navigate_node", node);
-        self.session.pageNavigate(self.url, self.opts) catch |err| {
+
+        const session = self.session;
+        const initial = self.initial;
+
+        if (initial) {
+            session.removePage();
+            _ = session.createPage() catch |err| {
+                log.err(.browser, "delayed navigation page error", .{
+                    .err = err,
+                    .url = self.url,
+                });
+                return;
+            };
+            self.initial = false;
+        }
+
+        if (session.browser.http_client.freeSlotCount() == 0) {
+            log.debug(.browser, "delayed navigate waiting", .{});
+            const delay = 0 * std.time.ns_per_ms;
+
+            // If this isn't the initial check, we can safely re-use the timer
+            // to check again.
+            if (initial == false) {
+                repeat_delay.* = delay;
+                return;
+            }
+
+            // However, if this _is_ the initial check, we called
+            // session.removePage above, and that reset the loop ctx_id.
+            // We can't re-use this timer, because it has the previous ctx_id.
+            // We can create a new timeout though, and that'll get the new ctx_id.
+            //
+            // Page has to be not-null here because we called createPage above.
+            _ = session.page.?.loop.timeout(delay, &self.navigate_node) catch |err| {
+                log.err(.browser, "delayed navigation loop err", .{ .err = err });
+            };
+            return;
+        }
+
+        const page = session.currentPage() orelse return;
+        defer if (!page.delayed_navigation) {
+            // If, while loading the page, we intend to navigate to another
+            // page, then we need to keep the transfer_arena around, as this
+            // sub-navigation is probably using it.
+            _ = session.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
+        };
+
+        return page.navigate(self.url, self.opts) catch |err| {
             log.err(.browser, "delayed navigation error", .{ .err = err, .url = self.url });
         };
     }
diff --git a/src/browser/session.zig b/src/browser/session.zig
index 5d550a5e..b308ab64 100644
--- a/src/browser/session.zig
+++ b/src/browser/session.zig
@@ -22,6 +22,7 @@ const Allocator = std.mem.Allocator;
 
 const Env = @import("env.zig").Env;
 const Page = @import("page.zig").Page;
+const URL = @import("../url.zig").URL;
 const Browser = @import("browser.zig").Browser;
 const NavigateOpts = @import("page.zig").NavigateOpts;
 
@@ -72,7 +73,7 @@ pub const Session = struct {
 
     pub fn deinit(self: *Session) void {
         if (self.page != null) {
-            self.removePage() catch {};
+            self.removePage();
         }
         self.cookie_jar.deinit();
         self.storage_shed.deinit();
@@ -104,7 +105,7 @@ pub const Session = struct {
         return page;
     }
 
-    pub fn removePage(self: *Session) !void {
+    pub fn removePage(self: *Session) void {
         // Inform CDP the page is going to be removed, allowing other worlds to remove themselves before the main one
         self.browser.notification.dispatch(.page_remove, .{});
 
@@ -127,12 +128,6 @@ pub const Session = struct {
         // window.setTimeout and running microtasks should be ignored
         self.browser.app.loop.reset();
 
-        // Finally, we run the loop. Because of the reset just above, this will
-        // ignore any timeouts. And, because of the endScope about this, it
-        // should ensure that the http requests detect the shutdown socket and
-        // release their resources.
-        try self.browser.app.loop.run();
-
         self.page = null;
 
         // clear netsurf memory arena.
@@ -144,28 +139,4 @@ pub const Session = struct {
     pub fn currentPage(self: *Session) ?*Page {
         return &(self.page orelse return null);
     }
-
-    pub fn pageNavigate(self: *Session, url_string: []const u8, opts: NavigateOpts) !void {
-        // currently, this is only called from the page, so let's hope
-        // it isn't null!
-        std.debug.assert(self.page != null);
-
-        defer if (self.page) |*p| {
-            if (!p.delayed_navigation) {
-                // If, while loading the page, we intend to navigate to another
-                // page, then we need to keep the transfer_arena around, as this
-                // sub-navigation is probably using it.
-                _ = self.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
-            }
-        };
-
-        // it's safe to use the transfer arena here, because the page will
-        // eventually clone the URL using its own page_arena (after it gets
-        // the final URL, possibly following redirects)
-        const url = try self.page.?.url.resolve(self.transfer_arena, url_string);
-
-        try self.removePage();
-        var page = try self.createPage();
-        return page.navigate(url, opts);
-    }
 };
diff --git a/src/browser/xhr/xhr.zig b/src/browser/xhr/xhr.zig
index 907e1057..37f87bee 100644
--- a/src/browser/xhr/xhr.zig
+++ b/src/browser/xhr/xhr.zig
@@ -341,10 +341,17 @@ pub const XMLHttpRequest = struct {
         log.debug(.script_event, "dispatch event", .{
             .type = typ,
             .source = "xhr",
+            .method = self.method,
             .url = self.url,
         });
         self._dispatchEvt(typ) catch |err| {
-            log.err(.app, "dispatch event error", .{ .err = err, .type = typ, .source = "xhr" });
+            log.err(.app, "dispatch event error", .{
+                .err = err,
+                .type = typ,
+                .source = "xhr",
+                .method = self.method,
+                .url = self.url,
+            });
         };
     }
 
@@ -365,10 +372,17 @@ pub const XMLHttpRequest = struct {
         log.debug(.script_event, "dispatch progress event", .{
             .type = typ,
             .source = "xhr",
+            .method = self.method,
             .url = self.url,
         });
         self._dispatchProgressEvent(typ, opts) catch |err| {
-            log.err(.app, "dispatch progress event error", .{ .err = err, .type = typ, .source = "xhr" });
+            log.err(.app, "dispatch progress event error", .{
+                .err = err,
+                .type = typ,
+                .source = "xhr",
+                .method = self.method,
+                .url = self.url,
+            });
         };
     }
 
diff --git a/src/cdp/domains/target.zig b/src/cdp/domains/target.zig
index 139ab279..3461bd44 100644
--- a/src/cdp/domains/target.zig
+++ b/src/cdp/domains/target.zig
@@ -220,7 +220,7 @@ fn closeTarget(cmd: anytype) !void {
         bc.session_id = null;
     }
 
-    try bc.session.removePage();
+    bc.session.removePage();
     if (bc.isolated_world) |*world| {
         world.deinit();
         bc.isolated_world = null;
diff --git a/src/http/client.zig b/src/http/client.zig
index f060176c..c6cf71c0 100644
--- a/src/http/client.zig
+++ b/src/http/client.zig
@@ -113,10 +121,18 @@ pub const Client = struct {
         loop: *Loop,
         opts: RequestOpts,
     ) !void {
-        if (self.state_pool.acquireOrNull()) |state| {
-            // if we have state ready, we can skip the loop and immediately
-            // kick this request off.
-            return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
+
+        // See the page's DelayedNavigation for why we're doing this. TL;DR -
+        // we need to keep 1 slot available for the blocking page navigation
+        // flow. (It's almost worth keeping a dedicated State just for that
+        // flow, but I keep thinking we need a more permanent solution, i.e.
+        // making everything non-blocking.)
+        if (self.freeSlotCount() > 1) {
+            if (self.state_pool.acquireOrNull()) |state| {
+                // if we have state ready, we can skip the loop and immediately
+                // kick this request off.
+                return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
+            }
         }
 
         // This cannot be a client-owned MemoryPool. The page can end before
@@ -174,6 +182,10 @@ pub const Client = struct {
             .client = self,
         };
     }
+
+    pub fn freeSlotCount(self: *Client) usize {
+        return self.state_pool.freeSlotCount();
+    }
 };
 
 const RequestOpts = struct {
@@ -2531,6 +2543,12 @@ const StatePool = struct {
         allocator.free(self.states);
     }
 
+    pub fn freeSlotCount(self: *StatePool) usize {
+        self.mutex.lock();
+        defer self.mutex.unlock();
+        return self.available;
+    }
+
    pub fn acquireWait(self: *StatePool) *State {
        const states = self.states;
 
@@ -3022,8 +3040,14 @@ test "HttpClient: async connect error" {
         .{},
     );
 
-    try loop.io.run_for_ns(std.time.ns_per_ms);
-    try reset.timedWait(std.time.ns_per_s);
+    for (0..10) |_| {
+        try loop.io.run_for_ns(std.time.ns_per_ms * 10);
+        if (reset.isSet()) {
+            break;
+        }
+    } else {
+        return error.Timeout;
+    }
 }
 
 test "HttpClient: async no body" {
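A note on the test change above: the single reset.timedWait could block
without driving the loop, and since the event is presumably only set
from loop callbacks, it could never be satisfied; the bounded polling
loop keeps running I/O between checks. A self-contained sketch of the
pattern (FakeLoop is a stand-in for loop.io and simply "completes" after
a few ticks):

    const std = @import("std");

    const FakeLoop = struct {
        ticks_until_done: usize,
        done: bool = false,

        // Stand-in for io.run_for_ns: pretend one slice of I/O work
        // happens per call.
        fn runForNs(self: *FakeLoop, _: u64) !void {
            if (self.ticks_until_done == 0) {
                self.done = true;
            } else {
                self.ticks_until_done -= 1;
            }
        }
    };

    test "bounded polling instead of a single blocking wait" {
        var loop = FakeLoop{ .ticks_until_done = 3 };
        for (0..10) |_| {
            try loop.runForNs(std.time.ns_per_ms * 10);
            if (loop.done) break;
        } else return error.Timeout;
    }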
From 2788c36ca690d57384682b6d04e0916eda3486d4 Mon Sep 17 00:00:00 2001
From: Karl Seguin
Date: Thu, 12 Jun 2025 22:34:23 +0800
Subject: [PATCH 2/2] Fix loop run (Page.wait)

In https://github.com/lightpanda-io/browser/pull/767 I tried to call
loop.run from within a loop.run (spoiler: it didn't work), in order to
make sure aborted connections were properly cleaned up before starting
a new navigation.

That resulted in loop.run no longer waiting for timeouts, for fear of
having to wait on a long timeout. That ended up breaking page.wait
(used in the fetch command).

This commit brings back the original behavior, where loop.run() waits
for all completions. This is now safe to do since the nested loop.run()
call has been removed.
---
 src/runtime/loop.zig | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/src/runtime/loop.zig b/src/runtime/loop.zig
index cd4f1464..6e022e44 100644
--- a/src/runtime/loop.zig
+++ b/src/runtime/loop.zig
@@ -81,12 +81,13 @@ pub const Loop = struct {
 
         // run tail events. We do run the tail events to ensure all the
         // contexts are correcly free.
-        while (self.hasPendinEvents()) {
-            self.io.run_for_ns(10 * std.time.ns_per_ms) catch |err| {
+        while (self.pending_network_count != 0 or self.pending_timeout_count != 0) {
+            self.io.run_for_ns(std.time.ns_per_ms * 10) catch |err| {
                 log.err(.loop, "deinit", .{ .err = err });
                 break;
             };
         }
+
         if (comptime CANCEL_SUPPORTED) {
             self.io.cancel_all();
         }
@@ -96,21 +97,6 @@ pub const Loop = struct {
         self.cancelled.deinit(self.alloc);
     }
 
-    // We can shutdown once all the pending network IO is complete.
-    // In debug mode we also wait until al the pending timeouts are complete
-    // but we only do this so that the `timeoutCallback` can free all allocated
-    // memory and we won't report a leak.
-    fn hasPendinEvents(self: *const Self) bool {
-        if (self.pending_network_count > 0) {
-            return true;
-        }
-
-        if (builtin.mode != .Debug) {
-            return false;
-        }
-        return self.pending_timeout_count > 0;
-    }
-
     // Retrieve all registred I/O events completed by OS kernel,
     // and execute sequentially their callbacks.
     // Stops when there is no more I/O events registered on the loop.
@@ -121,9 +107,11 @@ pub const Loop = struct {
         self.stopping = true;
         defer self.stopping = false;
 
-        while (self.pending_network_count > 0) {
-            try self.io.run_for_ns(10 * std.time.ns_per_ms);
-            // at each iteration we might have new events registred by previous callbacks
+        while (self.pending_network_count != 0 or self.pending_timeout_count != 0) {
+            self.io.run_for_ns(std.time.ns_per_ms * 10) catch |err| {
+                log.err(.loop, "deinit", .{ .err = err });
+                break;
+            };
         }
     }
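To make the restored semantics concrete, here is a rough sketch of the
drain behavior (MiniLoop and its simulated completions are hypothetical
stand-ins for the real loop and io.run_for_ns): run() only returns once
both pending network events and pending timeouts have completed, which
is what Page.wait depends on.

    const std = @import("std");

    const MiniLoop = struct {
        pending_network_count: usize,
        pending_timeout_count: usize,

        // Simulated backend: each slice retires one pending event.
        fn runForNs(self: *MiniLoop, _: u64) !void {
            if (self.pending_network_count > 0) {
                self.pending_network_count -= 1;
            } else if (self.pending_timeout_count > 0) {
                self.pending_timeout_count -= 1;
            }
        }

        // Same loop condition the patch restores in Loop.run, with the
        // same catch-and-break error handling.
        fn run(self: *MiniLoop) void {
            while (self.pending_network_count != 0 or self.pending_timeout_count != 0) {
                self.runForNs(std.time.ns_per_ms * 10) catch break;
            }
        }
    };

    test "run drains both network and timeout completions" {
        var loop = MiniLoop{ .pending_network_count = 2, .pending_timeout_count = 1 };
        loop.run();
        try std.testing.expectEqual(@as(usize, 0), loop.pending_network_count);
        try std.testing.expectEqual(@as(usize, 0), loop.pending_timeout_count);
    }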