Files
browser/src/browser/browser.zig
Karl Seguin 3fe28d5441 Optimize memory usage
The two bigger changes here are:

1- The http_client has been moved from the Session to the Browser, allowing
   its connection pool to be re-used across multiple sessions

2- The browser now has a page_arena which is used for all page-level allocation
   and which can be re-used between pages (currently retains 1MB of memory).
   Previously, pages uses an arena that was tied to the lifetime of the page,
   thus it could not be re-used.

Using the Bench allocator for zig-js-runtime, allocated bytes went from
1347037879 to 834932438 (in a RUNS=1000 of puppeteer demo).

Various other changes to try to simplify the API and remove the possibility
of invalid states. For example, session.newPage() now includes the logic for
page.start() so that there should now never be a page that wasn't started.
2025-03-12 13:38:22 +08:00

701 lines
24 KiB
Zig

// Copyright (C) 2023-2024 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const builtin = @import("builtin");
const Allocator = std.mem.Allocator;
const Types = @import("root").Types;
const parser = @import("netsurf");
const Loader = @import("loader.zig").Loader;
const Dump = @import("dump.zig");
const Mime = @import("mime.zig").Mime;
const jsruntime = @import("jsruntime");
const Loop = jsruntime.Loop;
const Env = jsruntime.Env;
const Module = jsruntime.Module;
const apiweb = @import("../apiweb.zig");
const Window = @import("../html/window.zig").Window;
const Walker = @import("../dom/walker.zig").WalkerDepthFirst;
const URL = @import("../url/url.zig").URL;
const Location = @import("../html/location.zig").Location;
const storage = @import("../storage/storage.zig");
const FetchResult = @import("../http/Client.zig").Client.FetchResult;
const UserContext = @import("../user_context.zig").UserContext;
const HttpClient = @import("asyncio").Client;
const polyfill = @import("../polyfill/polyfill.zig");
const log = std.log.scoped(.browser);
pub const user_agent = "Lightpanda/1.0";
// Browser is an instance of the browser.
// You can create multiple browser instances.
// A browser contains only one session.
// TODO allow multiple sessions per browser.
pub const Browser = struct {
loop: *Loop,
session: ?*Session,
allocator: Allocator,
http_client: HttpClient,
session_pool: SessionPool,
page_arena: std.heap.ArenaAllocator,
const SessionPool = std.heap.MemoryPool(Session);
pub fn init(allocator: Allocator, loop: *Loop) Browser {
return .{
.loop = loop,
.session = null,
.allocator = allocator,
.http_client = .{ .allocator = allocator },
.session_pool = SessionPool.init(allocator),
.page_arena = std.heap.ArenaAllocator.init(allocator),
};
}
pub fn deinit(self: *Browser) void {
self.closeSession();
self.http_client.deinit();
self.session_pool.deinit();
self.page_arena.deinit();
}
pub fn newSession(self: *Browser, ctx: anytype) !*Session {
self.closeSession();
const session = try self.session_pool.create();
try Session.init(session, self, ctx);
self.session = session;
return session;
}
fn closeSession(self: *Browser) void {
if (self.session) |session| {
session.deinit();
self.session_pool.destroy(session);
self.session = null;
}
}
};
// Session is like a browser's tab.
// It owns the js env and the loader for all the pages of the session.
// You can create successively multiple pages for a session, but you must
// deinit a page before running another one.
pub const Session = struct {
browser: *Browser,
// The arena is used only to bound the js env init b/c it leaks memory.
// see https://github.com/lightpanda-io/jsruntime-lib/issues/181
//
// The arena is initialised with self.alloc allocator.
// all others Session deps use directly self.alloc and not the arena.
arena: std.heap.ArenaAllocator,
// TODO handle proxy
loader: Loader,
env: Env,
inspector: jsruntime.Inspector,
window: Window,
// TODO move the shed to the browser?
storage_shed: storage.Shed,
page: ?Page = null,
jstypes: [Types.len]usize = undefined,
fn init(self: *Session, browser: *Browser, ctx: anytype) !void {
const allocator = browser.allocator;
self.* = .{
.env = undefined,
.browser = browser,
.inspector = undefined,
.loader = Loader.init(allocator),
.storage_shed = storage.Shed.init(allocator),
.arena = std.heap.ArenaAllocator.init(allocator),
.window = Window.create(null, .{ .agent = user_agent }),
};
const arena = self.arena.allocator();
Env.init(&self.env, arena, browser.loop, null);
errdefer self.env.deinit();
try self.env.load(&self.jstypes);
const ContextT = @TypeOf(ctx);
const InspectorContainer = switch (@typeInfo(ContextT)) {
.Struct => ContextT,
.Pointer => |ptr| ptr.child,
.Void => NoopInspector,
else => @compileError("invalid context type"),
};
// const ctx_opaque = @as(*anyopaque, @ptrCast(ctx));
self.inspector = try jsruntime.Inspector.init(
arena,
self.env, // TODO: change to 'env' when https://github.com/lightpanda-io/zig-js-runtime/pull/285 lands
if (@TypeOf(ctx) == void) @constCast(@ptrCast(&{})) else ctx,
InspectorContainer.onInspectorResponse,
InspectorContainer.onInspectorEvent,
);
self.env.setInspector(self.inspector);
try self.env.setModuleLoadFn(self, Session.fetchModule);
}
fn deinit(self: *Session) void {
if (self.page != null) {
self.removePage();
}
self.env.deinit();
self.arena.deinit();
self.loader.deinit();
self.storage_shed.deinit();
}
fn fetchModule(ctx: *anyopaque, referrer: ?jsruntime.Module, specifier: []const u8) !jsruntime.Module {
_ = referrer;
const self: *Session = @ptrCast(@alignCast(ctx));
if (self.page == null) {
return error.NoPage;
}
log.debug("fetch module: specifier: {s}", .{specifier});
// fetchModule is called within the context of processing a page.
// Use the page_arena for this, which has a more appropriate lifetime
// and which has more retained memory between sessions and pages.
const arena = self.browser.page_arena.allocator();
const body = try self.page.?.fetchData(arena, specifier);
return self.env.compileModule(body, specifier);
}
pub fn callInspector(self: *Session, msg: []const u8) void {
self.inspector.send(self.env, msg);
}
// NOTE: the caller is not the owner of the returned value,
// the pointer on Page is just returned as a convenience
pub fn createPage(self: *Session, aux_data: ?[]const u8) !*Page {
std.debug.assert(self.page == null);
_ = self.browser.page_arena.reset(.{ .retain_with_limit = 1 * 1024 * 1024 });
self.page = Page.init(self);
const page = &self.page.?;
// start JS env
log.debug("start js env", .{});
try self.env.start();
if (comptime builtin.is_test == false) {
// By not loading this during tests, we aren't required to load
// all of the interfaces into zig-js-runtime.
log.debug("setup global env", .{});
try self.env.bindGlobal(&self.window);
}
// load polyfills
// TODO: change to 'env' when https://github.com/lightpanda-io/zig-js-runtime/pull/285 lands
try polyfill.load(self.arena.allocator(), self.env);
// inspector
self.contextCreated(page, aux_data);
return page;
}
pub fn removePage(self: *Session) void {
std.debug.assert(self.page != null);
// Reset all existing callbacks.
self.browser.loop.reset();
self.env.stop();
// TODO unload document: https://html.spec.whatwg.org/#unloading-documents
self.window.replaceLocation(null) catch |e| {
log.err("reset window location: {any}", .{e});
};
// clear netsurf memory arena.
parser.deinit();
self.page = null;
}
pub fn currentPage(self: *Session) ?*Page {
return &(self.page orelse return null);
}
fn contextCreated(self: *Session, page: *Page, aux_data: ?[]const u8) void {
log.debug("inspector context created", .{});
self.inspector.contextCreated(self.env, "", page.origin orelse "://", aux_data);
}
};
// Page navigates to an url.
// You can navigates multiple urls with the same page, but you have to call
// end() to stop the previous navigation before starting a new one.
// The page handle all its memory in an arena allocator. The arena is reseted
// when end() is called.
pub const Page = struct {
arena: Allocator,
session: *Session,
doc: ?*parser.Document = null,
// handle url
rawuri: ?[]const u8 = null,
uri: std.Uri = undefined,
origin: ?[]const u8 = null,
// html url and location
url: ?URL = null,
location: Location = .{},
raw_data: ?[]const u8 = null,
fn init(session: *Session) Page {
return .{
.session = session,
.arena = session.browser.page_arena.allocator(),
};
}
// dump writes the page content into the given file.
pub fn dump(self: *const Page, out: std.fs.File) !void {
// if no HTML document pointer available, dump the data content only.
if (self.doc == null) {
// no data loaded, nothing to do.
if (self.raw_data == null) return;
return try out.writeAll(self.raw_data.?);
}
// if the page has a pointer to a document, dumps the HTML.
try Dump.writeHTML(self.doc.?, out);
}
pub fn wait(self: *Page) !void {
// try catch
var try_catch: jsruntime.TryCatch = undefined;
try_catch.init(self.session.env);
defer try_catch.deinit();
self.session.env.wait() catch |err| {
// the js env could not be started if the document wasn't an HTML.
if (err == error.EnvNotStarted) return;
const arena = self.arena;
if (try try_catch.err(arena, self.session.env)) |msg| {
defer arena.free(msg);
log.info("wait error: {s}", .{msg});
return;
}
};
log.debug("wait: OK", .{});
}
// spec reference: https://html.spec.whatwg.org/#document-lifecycle
// - aux_data: extra data forwarded to the Inspector
// see Inspector.contextCreated
pub fn navigate(self: *Page, uri: []const u8, aux_data: ?[]const u8) !void {
const arena = self.arena;
log.debug("starting GET {s}", .{uri});
// if the uri is about:blank, nothing to do.
if (std.mem.eql(u8, "about:blank", uri)) {
return;
}
// own the url
self.rawuri = try arena.dupe(u8, uri);
self.uri = std.Uri.parse(self.rawuri.?) catch try std.Uri.parseAfterScheme("", self.rawuri.?);
self.url = try URL.constructor(arena, self.rawuri.?, null);
self.location.url = &self.url.?;
try self.session.window.replaceLocation(&self.location);
// prepare origin value.
var buf: std.ArrayListUnmanaged(u8) = .{};
try self.uri.writeToStream(.{
.scheme = true,
.authority = true,
}, buf.writer(arena));
self.origin = buf.items;
// TODO handle fragment in url.
// load the data
var resp = try self.session.loader.get(arena, self.uri);
defer resp.deinit();
const req = resp.req;
log.info("GET {any} {d}", .{ self.uri, @intFromEnum(req.response.status) });
// TODO handle redirection
log.debug("{?} {d} {s}", .{
req.response.version,
@intFromEnum(req.response.status),
req.response.reason,
// TODO log headers
});
// TODO handle charset
// https://html.spec.whatwg.org/#content-type
var it = req.response.iterateHeaders();
var ct_: ?[]const u8 = null;
while (true) {
const h = it.next() orelse break;
if (std.ascii.eqlIgnoreCase(h.name, "Content-Type")) {
ct_ = try arena.dupe(u8, h.value);
}
}
const ct = ct_ orelse {
// no content type in HTTP headers.
// TODO try to sniff mime type from the body.
log.info("no content-type HTTP header", .{});
return;
};
log.debug("header content-type: {s}", .{ct});
var mime = try Mime.parse(arena, ct);
if (mime.isHTML()) {
try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", aux_data);
} else {
log.info("non-HTML document: {s}", .{ct});
// save the body into the page.
self.raw_data = try req.reader().readAllAlloc(arena, 16 * 1024 * 1024);
}
}
// https://html.spec.whatwg.org/#read-html
fn loadHTMLDoc(self: *Page, reader: anytype, charset: []const u8, aux_data: ?[]const u8) !void {
const arena = self.arena;
// start netsurf memory arena.
try parser.init();
log.debug("parse html with charset {s}", .{charset});
const ccharset = try arena.dupeZ(u8, charset);
const html_doc = try parser.documentHTMLParse(reader, ccharset);
const doc = parser.documentHTMLToDocument(html_doc);
// save a document's pointer in the page.
self.doc = doc;
// TODO set document.readyState to interactive
// https://html.spec.whatwg.org/#reporting-document-loading-status
// inject the URL to the document including the fragment.
try parser.documentSetDocumentURI(doc, self.rawuri orelse "about:blank");
const session = self.session;
// TODO set the referrer to the document.
try session.window.replaceDocument(html_doc);
session.window.setStorageShelf(
try session.storage_shed.getOrPut(self.origin orelse "null"),
);
// https://html.spec.whatwg.org/#read-html
// inspector
session.contextCreated(self, aux_data);
// replace the user context document with the new one.
try session.env.setUserContext(.{
.document = html_doc,
.httpClient = &self.session.browser.http_client,
});
// browse the DOM tree to retrieve scripts
// TODO execute the synchronous scripts during the HTL parsing.
// TODO fetch the script resources concurrently but execute them in the
// declaration order for synchronous ones.
// sasync stores scripts which can be run asynchronously.
// for now they are just run after the non-async one in order to
// dispatch DOMContentLoaded the sooner as possible.
var sasync: std.ArrayListUnmanaged(Script) = .{};
const root = parser.documentToNode(doc);
const walker = Walker{};
var next: ?*parser.Node = null;
while (true) {
next = try walker.get_next(root, next) orelse break;
// ignore non-elements nodes.
if (try parser.nodeType(next.?) != .element) {
continue;
}
const e = parser.nodeToElement(next.?);
// ignore non-js script.
const script = try Script.init(e) orelse continue;
if (script.kind == .unknown) continue;
// Ignore the defer attribute b/c we analyze all script
// after the document has been parsed.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#defer
// TODO use fetchpriority
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#fetchpriority
// > async
// > For classic scripts, if the async attribute is present,
// > then the classic script will be fetched in parallel to
// > parsing and evaluated as soon as it is available.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#async
if (script.is_async) {
try sasync.append(arena, script);
continue;
}
// TODO handle for attribute
// TODO handle event attribute
// TODO defer
// > This Boolean attribute is set to indicate to a browser
// > that the script is meant to be executed after the
// > document has been parsed, but before firing
// > DOMContentLoaded.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#defer
// defer allow us to load a script w/o blocking the rest of
// evaluations.
// > Scripts without async, defer or type="module"
// > attributes, as well as inline scripts without the
// > type="module" attribute, are fetched and executed
// > immediately before the browser continues to parse the
// > page.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#notes
try parser.documentHTMLSetCurrentScript(html_doc, @ptrCast(e));
self.evalScript(script) catch |err| log.warn("evaljs: {any}", .{err});
try parser.documentHTMLSetCurrentScript(html_doc, null);
}
// TODO wait for deferred scripts
// dispatch DOMContentLoaded before the transition to "complete",
// at the point where all subresources apart from async script elements
// have loaded.
// https://html.spec.whatwg.org/#reporting-document-loading-status
const evt = try parser.eventCreate();
defer parser.eventDestroy(evt);
try parser.eventInit(evt, "DOMContentLoaded", .{ .bubbles = true, .cancelable = true });
_ = try parser.eventTargetDispatchEvent(parser.toEventTarget(parser.DocumentHTML, html_doc), evt);
// eval async scripts.
for (sasync.items) |s| {
try parser.documentHTMLSetCurrentScript(html_doc, @ptrCast(s.element));
self.evalScript(s) catch |err| log.warn("evaljs: {any}", .{err});
try parser.documentHTMLSetCurrentScript(html_doc, null);
}
// TODO wait for async scripts
// TODO set document.readyState to complete
// dispatch window.load event
const loadevt = try parser.eventCreate();
defer parser.eventDestroy(loadevt);
try parser.eventInit(loadevt, "load", .{});
_ = try parser.eventTargetDispatchEvent(
parser.toEventTarget(Window, &self.session.window),
loadevt,
);
}
// evalScript evaluates the src in priority.
// if no src is present, we evaluate the text source.
// https://html.spec.whatwg.org/multipage/scripting.html#script-processing-model
fn evalScript(self: *Page, s: Script) !void {
// https://html.spec.whatwg.org/multipage/webappapis.html#fetch-a-classic-script
const opt_src = try parser.elementGetAttribute(s.element, "src");
if (opt_src) |src| {
log.debug("starting GET {s}", .{src});
self.fetchScript(s) catch |err| {
switch (err) {
FetchError.BadStatusCode => return err,
// TODO If el's result is null, then fire an event named error at
// el, and return.
FetchError.NoBody => return,
FetchError.JsErr => {}, // nothing to do here.
else => return err,
}
};
// TODO If el's from an external file is true, then fire an event
// named load at el.
return;
}
// TODO handle charset attribute
const opt_text = try parser.nodeTextContent(parser.elementToNode(s.element));
if (opt_text) |text| {
// TODO: change to &self.session.env when
// https://github.com/lightpanda-io/zig-js-runtime/pull/285 lands
try s.eval(self.arena, self.session.env, text);
return;
}
// nothing has been loaded.
// TODO If el's result is null, then fire an event named error at
// el, and return.
}
const FetchError = error{
BadStatusCode,
NoBody,
JsErr,
};
// the caller owns the returned string
fn fetchData(self: *Page, arena: Allocator, src: []const u8) ![]const u8 {
log.debug("starting fetch {s}", .{src});
var buffer: [1024]u8 = undefined;
var b: []u8 = buffer[0..];
const u = try std.Uri.resolve_inplace(self.uri, src, &b);
var fetchres = try self.session.loader.get(arena, u);
defer fetchres.deinit();
const resp = fetchres.req.response;
log.info("fetch {any}: {d}", .{ u, resp.status });
if (resp.status != .ok) {
return FetchError.BadStatusCode;
}
// TODO check content-type
const body = try fetchres.req.reader().readAllAlloc(arena, 16 * 1024 * 1024);
// check no body
if (body.len == 0) {
return FetchError.NoBody;
}
return body;
}
// fetchScript senf a GET request to the src and execute the script
// received.
fn fetchScript(self: *Page, s: Script) !void {
const arena = self.arena;
const body = try self.fetchData(arena, s.src);
// TODO: change to &self.session.env when
// https://github.com/lightpanda-io/zig-js-runtime/pull/285 lands
try s.eval(arena, self.session.env, body);
}
const Script = struct {
element: *parser.Element,
kind: Kind,
is_async: bool,
src: []const u8,
const Kind = enum {
unknown,
javascript,
module,
};
fn init(e: *parser.Element) !?Script {
// ignore non-script tags
const tag = try parser.elementHTMLGetTagType(@as(*parser.ElementHTML, @ptrCast(e)));
if (tag != .script) return null;
return .{
.element = e,
.kind = kind(try parser.elementGetAttribute(e, "type")),
.is_async = try parser.elementGetAttribute(e, "async") != null,
.src = try parser.elementGetAttribute(e, "src") orelse "inline",
};
}
// > type
// > Attribute is not set (default), an empty string, or a JavaScript MIME
// > type indicates that the script is a "classic script", containing
// > JavaScript code.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/script#attribute_is_not_set_default_an_empty_string_or_a_javascript_mime_type
fn kind(stype: ?[]const u8) Kind {
if (stype == null or stype.?.len == 0) return .javascript;
if (std.mem.eql(u8, stype.?, "application/javascript")) return .javascript;
if (std.mem.eql(u8, stype.?, "module")) return .module;
return .unknown;
}
fn eval(self: Script, arena: Allocator, env: Env, body: []const u8) !void {
var try_catch: jsruntime.TryCatch = undefined;
try_catch.init(env);
defer try_catch.deinit();
const res = switch (self.kind) {
.unknown => return error.UnknownScript,
.javascript => env.exec(body, self.src),
.module => env.module(body, self.src),
} catch {
if (try try_catch.err(arena, env)) |msg| {
log.info("eval script {s}: {s}", .{ self.src, msg });
}
return FetchError.JsErr;
};
if (builtin.mode == .Debug) {
const msg = try res.toString(arena, env);
log.debug("eval script {s}: {s}", .{ self.src, msg });
}
}
};
};
const NoopInspector = struct {
pub fn onInspectorResponse(_: *anyopaque, _: u32, _: []const u8) void {}
pub fn onInspectorEvent(_: *anyopaque, _: []const u8) void {}
};