Add Accept-Encoding

This is necessary because of CloudFront which will send gzip content even if
we don't ask for it.

Properly handle scripts that are both async and defer.

Add a helper to print the state of page.wait. This can be helpful in identifying
what's causing the page to hang on page.wait.
This commit is contained in:
Karl Seguin
2025-08-05 11:24:56 +08:00
parent 32566ccc80
commit 9876d79680
4 changed files with 119 additions and 27 deletions

View File

@@ -34,6 +34,9 @@ const ScriptManager = @This();
page: *Page, page: *Page,
// used to prevent recursive evaluation
is_evaluating: bool,
// Only once this is true can deferred scripts be run // Only once this is true can deferred scripts be run
static_scripts_done: bool, static_scripts_done: bool,
@@ -48,6 +51,8 @@ scripts: OrderList,
// dom_loaded == true, // dom_loaded == true,
deferreds: OrderList, deferreds: OrderList,
shutdown: bool = false,
client: *HttpClient, client: *HttpClient,
allocator: Allocator, allocator: Allocator,
buffer_pool: BufferPool, buffer_pool: BufferPool,
@@ -63,6 +68,7 @@ pub fn init(browser: *Browser, page: *Page) ScriptManager {
.asyncs = .{}, .asyncs = .{},
.scripts = .{}, .scripts = .{},
.deferreds = .{}, .deferreds = .{},
.is_evaluating = false,
.allocator = allocator, .allocator = allocator,
.client = browser.http_client, .client = browser.http_client,
.static_scripts_done = false, .static_scripts_done = false,
@@ -72,6 +78,7 @@ pub fn init(browser: *Browser, page: *Page) ScriptManager {
} }
pub fn deinit(self: *ScriptManager) void { pub fn deinit(self: *ScriptManager) void {
self.reset();
self.buffer_pool.deinit(); self.buffer_pool.deinit();
self.script_pool.deinit(); self.script_pool.deinit();
} }
@@ -193,7 +200,7 @@ pub fn addFromElement(self: *ScriptManager, element: *parser.Element) !void {
}; };
if (source == .@"inline") { if (source == .@"inline") {
// if we're here, it means that we have pending scripts (i.e. self.ordered // if we're here, it means that we have pending scripts (i.e. self.scripts
// is not empty). Because the script is inline, it's complete/ready, but // is not empty). Because the script is inline, it's complete/ready, but
// we need to process them in order // we need to process them in order
pending_script.complete = true; pending_script.complete = true;
@@ -201,9 +208,8 @@ pub fn addFromElement(self: *ScriptManager, element: *parser.Element) !void {
return; return;
} }
const list = self.getList(&pending_script.script);
pending_script.node = .{ .data = pending_script }; pending_script.node = .{ .data = pending_script };
list.append(&pending_script.node); self.getList(&pending_script.script).append(&pending_script.node);
errdefer pending_script.deinit(); errdefer pending_script.deinit();
@@ -255,7 +261,17 @@ pub fn staticScriptsDone(self: *ScriptManager) void {
// try to evaluate completed scripts (in order). This is called whenever a script // try to evaluate completed scripts (in order). This is called whenever a script
// is completed. // is completed.
fn evaluate(self: *ScriptManager) void { fn evaluate(self: *ScriptManager) void {
if (self.is_evaluating) {
// It's possible for a script.eval to cause evaluate to be called again.
// This is particularly true with blockingGet, but even without this,
// it's theoretically possible (but unlikely). We could make this work
// but there's little reason to support the complexity.
return;
}
const page = self.page; const page = self.page;
self.is_evaluating = true;
defer self.is_evaluating = false;
while (self.scripts.first) |n| { while (self.scripts.first) |n| {
var pending_script = n.data; var pending_script = n.data;
@@ -269,8 +285,8 @@ fn evaluate(self: *ScriptManager) void {
if (self.static_scripts_done == false) { if (self.static_scripts_done == false) {
// We can only execute deferred scripts if // We can only execute deferred scripts if
// 1 - all the normal scripts are done // 1 - all the normal scripts are done
// 2 - and we've loaded all the normal scripts // 2 - we've finished parsing the HTML and at least queued all the scripts
// The last one isn't obvious, but it's possible for self.scripts to/ // The last one isn't obvious, but it's possible for self.scripts to
// be empty not because we're done executing all the normal scripts // be empty not because we're done executing all the normal scripts
// but because we're done executing some (or maybe none), but we're still // but because we're done executing some (or maybe none), but we're still
// parsing the HTML. // parsing the HTML.
@@ -315,14 +331,17 @@ fn asyncDone(self: *ScriptManager) void {
} }
fn getList(self: *ScriptManager, script: *const Script) *OrderList { fn getList(self: *ScriptManager, script: *const Script) *OrderList {
if (script.is_defer) { // When a script has both the async and defer flag set, it should be
return &self.deferreds; // treated as async. Async is newer, so some websites use both so that
} // if async isn't known, it'll fallback to defer.
if (script.is_async) { if (script.is_async) {
return &self.asyncs; return &self.asyncs;
} }
if (script.is_defer) {
return &self.deferreds;
}
return &self.scripts; return &self.scripts;
} }
@@ -375,16 +394,22 @@ const PendingScript = struct {
manager: *ScriptManager, manager: *ScriptManager,
fn deinit(self: *PendingScript) void { fn deinit(self: *PendingScript) void {
var manager = self.manager;
const script = &self.script; const script = &self.script;
const manager = self.manager;
if (script.source == .remote) { if (script.source == .remote) {
manager.buffer_pool.release(script.source.remote); manager.buffer_pool.release(script.source.remote);
} }
manager.getList(script).remove(&self.node); manager.getList(script).remove(&self.node);
} }
/// Unlinks this pending script from the manager list chosen by `getList`.
/// Does nothing when `node` is already null; `node` is cleared afterwards,
/// so calling this a second time is a no-op.
fn remove(self: *PendingScript) void {
    // Guard: nothing to unlink when the node has already been cleared.
    const linked = if (self.node) |*n| n else return;
    self.manager.getList(&self.script).remove(linked);
    self.node = null;
}
fn startCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void { fn startCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void {
_ = self; _ = self;
log.debug(.http, "script fetch start", .{ .req = transfer }); log.debug(.http, "script fetch start", .{ .req = transfer });
@@ -392,19 +417,25 @@ const PendingScript = struct {
fn headerCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void { fn headerCallback(self: *PendingScript, transfer: *HttpClient.Transfer) !void {
const header = &transfer.response_header.?; const header = &transfer.response_header.?;
if (header.status != 200) {
return error.InvalidStatusCode;
}
// @newhttp TODO: pre size based on content-length
// @newhttp TODO: max-length enforcement
self.script.source = .{ .remote = self.manager.buffer_pool.get() };
log.debug(.http, "script header", .{ log.debug(.http, "script header", .{
.req = transfer, .req = transfer,
.status = header.status, .status = header.status,
.content_type = header.contentType(), .content_type = header.contentType(),
}); });
if (header.status != 200) {
return error.InvalidStatusCode;
}
// If this isn't true, then we'll likely leak memory. If you don't
// set `CURLOPT_SUPPRESS_CONNECT_HEADERS` and CONNECT to a proxy, this
// will fail. This assertion exists to catch incorrect assumptions about
// how libcurl works, or about how we've configured it.
std.debug.assert(self.script.source.remote.capacity == 0);
// @newhttp TODO: pre size based on content-length
// @newhttp TODO: max-length enforcement
self.script.source = .{ .remote = self.manager.buffer_pool.get() };
} }
fn dataCallback(self: *PendingScript, transfer: *HttpClient.Transfer, data: []const u8) !void { fn dataCallback(self: *PendingScript, transfer: *HttpClient.Transfer, data: []const u8) !void {
@@ -436,9 +467,15 @@ const PendingScript = struct {
fn errorCallback(self: *PendingScript, err: anyerror) void { fn errorCallback(self: *PendingScript, err: anyerror) void {
log.warn(.http, "script fetch error", .{ .req = self.script.url, .err = err }); log.warn(.http, "script fetch error", .{ .req = self.script.url, .err = err });
const manager = self.manager; const manager = self.manager;
self.deinit(); self.deinit();
if (manager.shutdown) {
return;
}
manager.evaluate(); manager.evaluate();
} }
}; };

View File

@@ -153,6 +153,8 @@ pub const Page = struct {
} }
pub fn deinit(self: *Page) void { pub fn deinit(self: *Page) void {
self.script_manager.shutdown = true;
self.http_client.abort(); self.http_client.abort();
self.script_manager.deinit(); self.script_manager.deinit();
} }
@@ -268,6 +270,9 @@ pub const Page = struct {
var scheduler = &self.scheduler; var scheduler = &self.scheduler;
var http_client = self.http_client; var http_client = self.http_client;
// for debugging
// defer self.printWaitAnalysis();
while (true) { while (true) {
SW: switch (self.mode) { SW: switch (self.mode) {
.pre, .raw => { .pre, .raw => {
@@ -346,6 +351,56 @@ pub const Page = struct {
} }
} }
/// Debugging aid: prints, via `std.debug.print`, a snapshot of what the page
/// is currently tracking: the page mode and load state, the number of active
/// HTTP requests, every pending script in the script manager's `scripts`,
/// `deferreds` and `asyncs` lists, and every task in the primary and
/// secondary schedules.
///
/// Not called in normal operation; intended to be hooked in temporarily
/// (e.g. through a `defer`) when investigating why a wait does not return.
fn printWaitAnalysis(self: *Page) void {
    std.debug.print("mode: {s}\n", .{@tagName(std.meta.activeTag(self.mode))});
    std.debug.print("load: {s}\n", .{@tagName(self.load_state)});
    std.debug.print("active requests: {d}\n", .{self.http_client.active});

    printPendingScripts("scripts", &self.script_manager.scripts);
    printPendingScripts("deferreds", &self.script_manager.deferreds);
    printPendingScripts("asyncs", &self.script_manager.asyncs);

    // Take a single timestamp so both schedules are reported relative to the
    // same instant.
    const now = std.time.milliTimestamp();
    printSchedule("primary schedule", &self.scheduler.primary, now);
    printSchedule("secondary schedule", &self.scheduler.secondary, now);
}

/// Prints the length of one pending-script list followed by one line per
/// entry with its URL and `complete` flag.
fn printPendingScripts(label: []const u8, list: anytype) void {
    std.debug.print("\n{s}: {d}\n", .{ label, list.len });
    var next = list.first;
    while (next) |n| : (next = n.next) {
        std.debug.print(" - {s} complete: {any}\n", .{ n.data.script.url, n.data.complete });
    }
}

/// Prints the task count of one schedule followed by one line per task with
/// its name and `task.ms - now`, i.e. the difference in milliseconds between
/// the task's `ms` value and `now`.
fn printSchedule(label: []const u8, queue: anytype, now: i64) void {
    std.debug.print("\n{s}: {d}\n", .{ label, queue.count() });
    var it = queue.iterator();
    while (it.next()) |task| {
        // This value is a time delta, not a completion flag, so it is
        // labelled as such instead of reusing the "complete:" label.
        std.debug.print(" - {s} runs in: {d}ms\n", .{ task.name, task.ms - now });
    }
}
pub fn origin(self: *const Page, arena: Allocator) ![]const u8 { pub fn origin(self: *const Page, arena: Allocator) ![]const u8 {
var arr: std.ArrayListUnmanaged(u8) = .{}; var arr: std.ArrayListUnmanaged(u8) = .{};
try self.url.origin(arr.writer(arena)); try self.url.origin(arr.writer(arena));

View File

@@ -118,18 +118,13 @@ pub const Session = struct {
std.debug.assert(self.page != null); std.debug.assert(self.page != null);
// Cleanup is a bit sensitive. We could still have inflight I/O. For self.page.?.deinit();
// example, we could have an XHR request which is still in the connect self.page = null;
// phase. It's important that we clean these up, as they're holding onto
// limited resources (like our fixed-sized http state pool).
//
// RemoveJsContext() will execute the destructor of any type that // RemoveJsContext() will execute the destructor of any type that
// registered a destructor (e.g. XMLHttpRequest). // registered a destructor (e.g. XMLHttpRequest).
self.executor.removeJsContext(); self.executor.removeJsContext();
self.page.?.deinit();
self.page = null;
// clear netsurf memory arena. // clear netsurf memory arena.
parser.deinit(); parser.deinit();

View File

@@ -141,6 +141,11 @@ pub const Connection = struct {
} }
} }
// compression, don't remove this. CloudFront will send gzip content
// even if we don't support it, and then it won't be decompressed.
// empty string means: use whatever's available
try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_ACCEPT_ENCODING, ""));
// debug // debug
if (comptime Http.ENABLE_DEBUG) { if (comptime Http.ENABLE_DEBUG) {
try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_VERBOSE, @as(c_long, 1))); try errorCheck(c.curl_easy_setopt(easy, c.CURLOPT_VERBOSE, @as(c_long, 1)));