Compare commits

...

54 Commits

Author SHA1 Message Date
Muki Kiboigo
de4f91af01 fix WebBotAuthLayer crash 2026-04-03 13:33:01 -07:00
Muki Kiboigo
3385ce2586 add WebBotAuth layer 2026-04-03 13:33:01 -07:00
Muki Kiboigo
f7b72b17f2 add RobotsLayer 2026-04-03 13:33:01 -07:00
Muki Kiboigo
e868b553f7 try layering http client 2026-04-03 13:32:59 -07:00
Muki Kiboigo
778b7eb8c2 allocate CacheMetadata on use 2026-04-03 07:34:31 -07:00
Muki Kiboigo
ca5fa2b866 change --cache-dir -> --http-cache-dir 2026-04-03 07:23:32 -07:00
Pierre Tachoire
a71ff521aa cache: add debug log with no store reason 2026-04-03 07:23:32 -07:00
Muki Kiboigo
5a551607c2 better logging on FsCache init failure 2026-04-03 07:23:32 -07:00
Muki Kiboigo
13ea4d1ee3 more expressive cache logging 2026-04-03 07:23:32 -07:00
Muki Kiboigo
dc600c953f move script queue log before request 2026-04-03 07:23:32 -07:00
Muki Kiboigo
e00d569754 fix crashes on cached file from script manager 2026-04-03 07:23:31 -07:00
Muki Kiboigo
3d760e4577 add format to CachedMetadata 2026-04-03 07:23:31 -07:00
Muki Kiboigo
1e8bdd7e28 assign headers and vary headers before possible move 2026-04-03 07:23:31 -07:00
Muki Kiboigo
31bab4cc05 put in cache before releasing conn 2026-04-03 07:23:31 -07:00
Muki Kiboigo
a1a301666f dupe url in tryCache 2026-04-03 07:23:31 -07:00
Muki Kiboigo
619a2653d1 update cacheDir config option 2026-04-03 07:23:31 -07:00
Muki Kiboigo
0b9cae5354 fix self.req.ctx in HttpClient 2026-04-03 07:23:31 -07:00
Muki Kiboigo
f098a991a8 remove cache revalidation stubs 2026-04-03 07:23:30 -07:00
Muki Kiboigo
7b5e4d6f52 add Vary support 2026-04-03 07:23:30 -07:00
Muki Kiboigo
9ffc99d6a2 add more FsCache tests 2026-04-03 07:23:30 -07:00
Muki Kiboigo
855c3290ff always close file on serveFromCache 2026-04-03 07:23:30 -07:00
Muki Kiboigo
d65a4b09f3 better logging for cache 2026-04-03 07:23:30 -07:00
Muki Kiboigo
6a57d69359 switch to single file cache 2026-04-03 07:23:30 -07:00
Muki Kiboigo
9c5e67fbf5 properly deinit cache 2026-04-03 07:23:30 -07:00
Muki Kiboigo
7edb24e54d use wyhash for power of two lock stripes 2026-04-03 07:23:29 -07:00
Muki Kiboigo
a60932bbe0 require timestamp passed in with cache request 2026-04-03 07:23:29 -07:00
Muki Kiboigo
77e9f5caf7 remove unused cache method on fs cache 2026-04-03 07:23:29 -07:00
Muki Kiboigo
cedc894445 add basic fs cache get/put test 2026-04-03 07:23:29 -07:00
Muki Kiboigo
9d62e58c9a check age on fs cache get 2026-04-03 07:23:29 -07:00
Muki Kiboigo
609983da87 only store stuff when we know we will cache 2026-04-03 07:23:29 -07:00
Muki Kiboigo
65f77af84d shortcircuit a lot of caching checks 2026-04-03 07:23:29 -07:00
Muki Kiboigo
cd3e6b2364 ensure fs cache file is closed after use 2026-04-03 07:23:28 -07:00
Muki Kiboigo
557a4458a4 use CacheRequest instead of key 2026-04-03 07:23:28 -07:00
Muki Kiboigo
ce620e208d add striped lock to FsCache 2026-04-03 07:23:28 -07:00
Muki Kiboigo
2de35a9db2 use arena_pool for cache get 2026-04-03 07:23:28 -07:00
Muki Kiboigo
3eb05fdd1a use writer for fs cache body file 2026-04-03 07:23:28 -07:00
Muki Kiboigo
186fdee59b use json for fs cache metadata file 2026-04-03 07:23:28 -07:00
Muki Kiboigo
3c8bb5bc00 use sha256 instead of wyhash 2026-04-03 07:23:28 -07:00
Muki Kiboigo
66d190c047 store type_buf and sub_type_buf in Mime 2026-04-03 07:23:28 -07:00
Muki Kiboigo
5c2207ecc3 add more http caching rules 2026-04-03 07:23:27 -07:00
Muki Kiboigo
18d347e247 use CacheControl and Vary 2026-04-03 07:23:27 -07:00
Muki Kiboigo
29dfbbfdea cache headers along with response 2026-04-03 07:23:27 -07:00
Muki Kiboigo
02f611bbc8 add basic caching support 2026-04-03 07:23:27 -07:00
Muki Kiboigo
349d5a0a0b create cache owned by the network struct 2026-04-03 07:23:27 -07:00
Muki Kiboigo
647d989191 use enum approach instead of vtable 2026-04-03 07:23:27 -07:00
Muki Kiboigo
41a24623fa add basic FsCache impl 2026-04-03 07:23:27 -07:00
Muki Kiboigo
d9a3d912c0 add CachedResponse variant to Response 2026-04-03 07:23:26 -07:00
Muki Kiboigo
070baa8f46 add headerIterator to Http Response 2026-04-03 07:23:26 -07:00
Muki Kiboigo
4f78f299a3 use Response instead of Transfer in callbacks 2026-04-03 07:23:25 -07:00
Muki Kiboigo
9568c86326 allow Mime parse to use []const u8 2026-04-03 07:22:20 -07:00
Muki Kiboigo
6633b6effc add cache dir to configuration opts 2026-04-03 07:22:20 -07:00
Muki Kiboigo
2962864f3d create Cache interface file 2026-04-03 07:22:20 -07:00
Pierre Tachoire
749f21816c Merge pull request #2075 from lightpanda-io/e2e-integration-proxy
use proxy for integration tests
2026-04-03 14:23:10 +02:00
Pierre Tachoire
6bb8bc8391 ci: use proxy for integration tests 2026-04-03 09:24:17 +02:00
22 changed files with 2574 additions and 1181 deletions

View File

@@ -62,7 +62,7 @@ jobs:
- name: run end to end integration tests
continue-on-error: true
run: |
./lightpanda serve --log-level error & echo $! > LPD.pid
./lightpanda serve --http-proxy ${{ secrets.MASSIVE_PROXY_RESIDENTIAL_US }} --log-level error & echo $! > LPD.pid
go run integration/main.go |tee result.log
kill `cat LPD.pid`

View File

@@ -55,7 +55,7 @@ pub fn init(allocator: Allocator, config: *const Config) !*App {
.arena_pool = undefined,
};
app.network = try Network.init(allocator, config);
app.network = try Network.init(allocator, app, config);
errdefer app.network.deinit();
app.platform = try Platform.init();

View File

@@ -156,6 +156,13 @@ pub fn userAgentSuffix(self: *const Config) ?[]const u8 {
};
}
pub fn httpCacheDir(self: *const Config) ?[]const u8 {
return switch (self.mode) {
inline .serve, .fetch, .mcp => |opts| opts.common.http_cache_dir,
else => null,
};
}
pub fn cdpTimeout(self: *const Config) usize {
return switch (self.mode) {
.serve => |opts| if (opts.timeout > 604_800) 604_800_000 else @as(usize, opts.timeout) * 1000,
@@ -273,6 +280,7 @@ pub const Common = struct {
log_format: ?log.Format = null,
log_filter_scopes: ?[]log.Scope = null,
user_agent_suffix: ?[]const u8 = null,
http_cache_dir: ?[]const u8 = null,
web_bot_auth_key_file: ?[]const u8 = null,
web_bot_auth_keyid: ?[]const u8 = null,
@@ -392,6 +400,11 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
\\
\\--web-bot-auth-domain
\\ Your domain e.g. yourdomain.com
\\
\\--http-cache-dir
\\ Path to a directory to use as a Filesystem Cache for network resources.
\\ Omitting this will result is no caching.
\\ Defaults to no caching.
;
// MAX_HELP_LEN|
@@ -1066,5 +1079,14 @@ fn parseCommonArg(
return true;
}
if (std.mem.eql(u8, "--http-cache-dir", opt)) {
const str = args.next() orelse {
log.fatal(.app, "missing argument value", .{ .arg = "--http-cache-dir" });
return error.InvalidArgument;
};
common.http_cache_dir = try allocator.dupe(u8, str);
return true;
}
return false;
}

View File

@@ -260,14 +260,14 @@ pub const Client = struct {
fn start(self: *Client) void {
const http = self.http;
http.cdp_client = .{
http.setCdpClient(.{
.socket = self.ws.socket,
.ctx = self,
.blocking_read_start = Client.blockingReadStart,
.blocking_read = Client.blockingRead,
.blocking_read_end = Client.blockingReadStop,
};
defer http.cdp_client = null;
});
defer http.setCdpClient(null);
self.httpLoop(http) catch |err| {
log.err(.app, "CDP client loop", .{ .err = err });

File diff suppressed because it is too large Load Diff

View File

@@ -27,6 +27,9 @@ charset: [41]u8 = default_charset,
charset_len: usize = default_charset_len,
is_default_charset: bool = true,
type_buf: [127]u8 = @splat(0),
sub_type_buf: [127]u8 = @splat(0),
/// String "UTF-8" continued by null characters.
const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;
const default_charset_len = 5;
@@ -61,7 +64,10 @@ pub const ContentType = union(ContentTypeEnum) {
image_webp: void,
application_json: void,
unknown: void,
other: struct { type: []const u8, sub_type: []const u8 },
other: struct {
type: []const u8,
sub_type: []const u8,
},
};
pub fn contentTypeString(mime: *const Mime) []const u8 {
@@ -112,17 +118,18 @@ fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 {
return value;
}
pub fn parse(input: []u8) !Mime {
pub fn parse(input: []const u8) !Mime {
if (input.len > 255) {
return error.TooBig;
}
// Zig's trim API is broken. The return type is always `[]const u8`,
// even if the input type is `[]u8`. @constCast is safe here.
var normalized = @constCast(std.mem.trim(u8, input, &std.ascii.whitespace));
var buf: [255]u8 = undefined;
const normalized = std.ascii.lowerString(&buf, std.mem.trim(u8, input, &std.ascii.whitespace));
_ = std.ascii.lowerString(normalized, normalized);
const content_type, const type_len = try parseContentType(normalized);
var mime = Mime{ .content_type = undefined };
const content_type, const type_len = try parseContentType(normalized, &mime.type_buf, &mime.sub_type_buf);
if (type_len >= normalized.len) {
return .{ .content_type = content_type };
}
@@ -163,13 +170,12 @@ pub fn parse(input: []u8) !Mime {
}
}
return .{
.params = params,
.charset = charset,
.charset_len = charset_len,
.content_type = content_type,
.is_default_charset = !has_explicit_charset,
};
mime.params = params;
mime.charset = charset;
mime.charset_len = charset_len;
mime.content_type = content_type;
mime.is_default_charset = !has_explicit_charset;
return mime;
}
/// Prescan the first 1024 bytes of an HTML document for a charset declaration.
@@ -395,7 +401,7 @@ pub fn isText(mime: *const Mime) bool {
}
// we expect value to be lowercase
fn parseContentType(value: []const u8) !struct { ContentType, usize } {
fn parseContentType(value: []const u8, type_buf: []u8, sub_type_buf: []u8) !struct { ContentType, usize } {
const end = std.mem.indexOfScalarPos(u8, value, 0, ';') orelse value.len;
const type_name = trimRight(value[0..end]);
const attribute_start = end + 1;
@@ -444,10 +450,18 @@ fn parseContentType(value: []const u8) !struct { ContentType, usize } {
return error.Invalid;
}
return .{ .{ .other = .{
.type = main_type,
.sub_type = sub_type,
} }, attribute_start };
@memcpy(type_buf[0..main_type.len], main_type);
@memcpy(sub_type_buf[0..sub_type.len], sub_type);
return .{
.{
.other = .{
.type = type_buf[0..main_type.len],
.sub_type = sub_type_buf[0..sub_type.len],
},
},
attribute_start,
};
}
const VALID_CODEPOINTS = blk: {
@@ -461,6 +475,13 @@ const VALID_CODEPOINTS = blk: {
break :blk v;
};
pub fn typeString(self: *const Mime) []const u8 {
return switch (self.content_type) {
.other => |o| o.type[0..o.type_len],
else => "",
};
}
fn validType(value: []const u8) bool {
for (value) |b| {
if (VALID_CODEPOINTS[b] == false) {

View File

@@ -886,12 +886,10 @@ fn notifyParentLoadComplete(self: *Page) void {
parent.iframeCompletedLoading(self.iframe.?);
}
fn pageHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
var self: *Page = @ptrCast(@alignCast(transfer.ctx));
fn pageHeaderDoneCallback(response: HttpClient.Response) !bool {
var self: *Page = @ptrCast(@alignCast(response.ctx));
const header = &transfer.response_header.?;
const response_url = std.mem.span(header.url);
const response_url = response.url();
if (std.mem.eql(u8, response_url, self.url) == false) {
// would be different than self.url in the case of a redirect
self.url = try self.arena.dupeZ(u8, response_url);
@@ -905,8 +903,8 @@ fn pageHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
if (comptime IS_DEBUG) {
log.debug(.page, "navigate header", .{
.url = self.url,
.status = header.status,
.content_type = header.contentType(),
.status = response.status(),
.content_type = response.contentType(),
.type = self._type,
});
}
@@ -927,14 +925,14 @@ fn pageHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
return true;
}
fn pageDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
var self: *Page = @ptrCast(@alignCast(transfer.ctx));
fn pageDataCallback(response: HttpClient.Response, data: []const u8) !void {
var self: *Page = @ptrCast(@alignCast(response.ctx));
if (self._parse_state == .pre) {
// we lazily do this, because we might need the first chunk of data
// to sniff the content type
var mime: Mime = blk: {
if (transfer.response_header.?.contentType()) |ct| {
if (response.contentType()) |ct| {
break :blk try Mime.parse(ct);
}
break :blk Mime.sniff(data);

View File

@@ -135,7 +135,7 @@ fn _tick(self: *Runner, comptime is_cdp: bool, opts: TickOpts) !CDPTickResult {
.pre, .raw, .text, .image => {
// The main page hasn't started/finished navigating.
// There's no JS to run, and no reason to run the scheduler.
if (http_client.active == 0 and (comptime is_cdp) == false) {
if (http_client.active() == 0 and (comptime is_cdp) == false) {
// haven't started navigating, I guess.
return .done;
}
@@ -169,8 +169,8 @@ fn _tick(self: *Runner, comptime is_cdp: bool, opts: TickOpts) !CDPTickResult {
// Each call to this runs scheduled load events.
try page.dispatchLoad();
const http_active = http_client.active;
const total_network_activity = http_active + http_client.intercepted;
const http_active = http_client.active();
const total_network_activity = http_active + http_client.intercepted();
if (page._notified_network_almost_idle.check(total_network_activity <= 2)) {
page.notifyNetworkAlmostIdle();
}
@@ -183,7 +183,7 @@ fn _tick(self: *Runner, comptime is_cdp: bool, opts: TickOpts) !CDPTickResult {
// because is_cdp is true, and that can only be
// the case when interception isn't possible.
if (comptime IS_DEBUG) {
std.debug.assert(http_client.intercepted == 0);
std.debug.assert(http_client.intercepted() == 0);
}
if (browser.hasBackgroundTasks()) {

View File

@@ -273,6 +273,24 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e
// Let the outer errdefer handle releasing the arena if client.request fails
}
if (comptime IS_DEBUG) {
var ls: js.Local.Scope = undefined;
page.js.localScope(&ls);
defer ls.deinit();
log.debug(.http, "script queue", .{
.ctx = ctx,
.url = remote_url.?,
.element = element,
.stack = ls.local.stackTrace() catch "???",
});
}
{
const was_evaluating = self.is_evaluating;
self.is_evaluating = true;
defer self.is_evaluating = was_evaluating;
try self.client.request(.{
.url = url,
.ctx = script,
@@ -290,20 +308,9 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e
.done_callback = Script.doneCallback,
.error_callback = Script.errorCallback,
});
handover = true;
if (comptime IS_DEBUG) {
var ls: js.Local.Scope = undefined;
page.js.localScope(&ls);
defer ls.deinit();
log.debug(.http, "script queue", .{
.ctx = ctx,
.url = remote_url.?,
.element = element,
.stack = ls.local.stackTrace() catch "???",
});
}
handover = true;
}
if (is_blocking == false) {
@@ -694,32 +701,33 @@ pub const Script = struct {
self.manager.page.releaseArena(self.arena);
}
fn startCallback(transfer: *HttpClient.Transfer) !void {
log.debug(.http, "script fetch start", .{ .req = transfer });
fn startCallback(response: HttpClient.Response) !void {
log.debug(.http, "script fetch start", .{ .req = response });
}
fn headerCallback(transfer: *HttpClient.Transfer) !bool {
const self: *Script = @ptrCast(@alignCast(transfer.ctx));
const header = &transfer.response_header.?;
self.status = header.status;
if (header.status != 200) {
fn headerCallback(response: HttpClient.Response) !bool {
const self: *Script = @ptrCast(@alignCast(response.ctx));
self.status = response.status().?;
if (response.status() != 200) {
log.info(.http, "script header", .{
.req = transfer,
.status = header.status,
.content_type = header.contentType(),
.req = response,
.status = response.status(),
.content_type = response.contentType(),
});
return false;
}
if (comptime IS_DEBUG) {
log.debug(.http, "script header", .{
.req = transfer,
.status = header.status,
.content_type = header.contentType(),
.req = response,
.status = response.status(),
.content_type = response.contentType(),
});
}
{
switch (response.inner) {
.transfer => |transfer| {
// temp debug, trying to figure out why the next assert sometimes
// fails. Is the buffer just corrupt or is headerCallback really
// being called twice?
@@ -751,25 +759,28 @@ pub const Script = struct {
self.debug_transfer_intercept_state = @intFromEnum(transfer._intercept_state);
self.debug_transfer_auth_challenge = transfer._auth_challenge != null;
self.debug_transfer_easy_id = if (transfer._conn) |c| @intFromPtr(c._easy) else 0;
},
else => {},
}
lp.assert(self.source.remote.capacity == 0, "ScriptManager.Header buffer", .{ .capacity = self.source.remote.capacity });
var buffer: std.ArrayList(u8) = .empty;
if (transfer.getContentLength()) |cl| {
if (response.contentLength()) |cl| {
try buffer.ensureTotalCapacity(self.arena, cl);
}
self.source = .{ .remote = buffer };
return true;
}
fn dataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
const self: *Script = @ptrCast(@alignCast(transfer.ctx));
self._dataCallback(transfer, data) catch |err| {
log.err(.http, "SM.dataCallback", .{ .err = err, .transfer = transfer, .len = data.len });
fn dataCallback(response: HttpClient.Response, data: []const u8) !void {
const self: *Script = @ptrCast(@alignCast(response.ctx));
self._dataCallback(response, data) catch |err| {
log.err(.http, "SM.dataCallback", .{ .err = err, .transfer = response, .len = data.len });
return err;
};
}
fn _dataCallback(self: *Script, _: *HttpClient.Transfer, data: []const u8) !void {
fn _dataCallback(self: *Script, _: HttpClient.Response, data: []const u8) !void {
try self.source.remote.appendSlice(self.arena, data);
}

View File

@@ -127,16 +127,16 @@ fn handleBlobUrl(url: []const u8, resolver: js.PromiseResolver, page: *Page) !js
return resolver.promise();
}
fn httpStartCallback(transfer: *HttpClient.Transfer) !void {
const self: *Fetch = @ptrCast(@alignCast(transfer.ctx));
fn httpStartCallback(response: HttpClient.Response) !void {
const self: *Fetch = @ptrCast(@alignCast(response.ctx));
if (comptime IS_DEBUG) {
log.debug(.http, "request start", .{ .url = self._url, .source = "fetch" });
}
self._response._transfer = transfer;
self._response._http_response = response;
}
fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
const self: *Fetch = @ptrCast(@alignCast(transfer.ctx));
fn httpHeaderDoneCallback(response: HttpClient.Response) !bool {
const self: *Fetch = @ptrCast(@alignCast(response.ctx));
if (self._signal) |signal| {
if (signal._aborted) {
@@ -145,25 +145,24 @@ fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
}
const arena = self._response._arena;
if (transfer.getContentLength()) |cl| {
if (response.contentLength()) |cl| {
try self._buf.ensureTotalCapacity(arena, cl);
}
const res = self._response;
const header = transfer.response_header.?;
if (comptime IS_DEBUG) {
log.debug(.http, "request header", .{
.source = "fetch",
.url = self._url,
.status = header.status,
.status = response.status(),
});
}
res._status = header.status;
res._status_text = std.http.Status.phrase(@enumFromInt(header.status)) orelse "";
res._url = try arena.dupeZ(u8, std.mem.span(header.url));
res._is_redirected = header.redirect_count > 0;
res._status = response.status().?;
res._status_text = std.http.Status.phrase(@enumFromInt(response.status().?)) orelse "";
res._url = try arena.dupeZ(u8, response.url());
res._is_redirected = response.redirectCount().? > 0;
// Determine response type based on origin comparison
const page_origin = URL.getOrigin(arena, self._page.url) catch null;
@@ -183,7 +182,7 @@ fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
res._type = .basic;
}
var it = transfer.responseHeaderIterator();
var it = response.headerIterator();
while (it.next()) |hdr| {
try res._headers.append(hdr.name, hdr.value, self._page);
}
@@ -191,8 +190,8 @@ fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
return true;
}
fn httpDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
const self: *Fetch = @ptrCast(@alignCast(transfer.ctx));
fn httpDataCallback(response: HttpClient.Response, data: []const u8) !void {
const self: *Fetch = @ptrCast(@alignCast(response.ctx));
// Check if aborted
if (self._signal) |signal| {
@@ -207,7 +206,7 @@ fn httpDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
fn httpDoneCallback(ctx: *anyopaque) !void {
const self: *Fetch = @ptrCast(@alignCast(ctx));
var response = self._response;
response._transfer = null;
response._http_response = null;
response._body = self._buf.items;
log.info(.http, "request complete", .{
@@ -230,7 +229,7 @@ fn httpErrorCallback(ctx: *anyopaque, _: anyerror) void {
const self: *Fetch = @ptrCast(@alignCast(ctx));
var response = self._response;
response._transfer = null;
response._http_response = null;
// the response is only passed on v8 on success, if we're here, it's safe to
// clear this. (defer since `self is in the response's arena).
@@ -256,7 +255,7 @@ fn httpShutdownCallback(ctx: *anyopaque) void {
if (self._owns_response) {
var response = self._response;
response._transfer = null;
response._http_response = null;
response.deinit(self._page._session);
// Do not access `self` after this point: the Fetch struct was
// allocated from response._arena which has been released.

View File

@@ -48,7 +48,7 @@ _type: Type,
_status_text: []const u8,
_url: [:0]const u8,
_is_redirected: bool,
_transfer: ?*HttpClient.Transfer = null,
_http_response: ?HttpClient.Response = null,
const InitOpts = struct {
status: u16 = 200,
@@ -81,9 +81,9 @@ pub fn init(body_: ?[]const u8, opts_: ?InitOpts, page: *Page) !*Response {
}
pub fn deinit(self: *Response, session: *Session) void {
if (self._transfer) |transfer| {
transfer.abort(error.Abort);
self._transfer = null;
if (self._http_response) |resp| {
resp.abort(error.Abort);
self._http_response = null;
}
session.releaseArena(self._arena);
}
@@ -191,7 +191,7 @@ pub fn clone(self: *const Response, page: *Page) !*Response {
._type = self._type,
._is_redirected = self._is_redirected,
._headers = try Headers.init(.{ .obj = self._headers }, page),
._transfer = null,
._http_response = null,
};
return cloned;
}

View File

@@ -43,7 +43,7 @@ _rc: lp.RC(u8) = .{},
_page: *Page,
_proto: *XMLHttpRequestEventTarget,
_arena: Allocator,
_transfer: ?*HttpClient.Transfer = null,
_http_response: ?HttpClient.Response = null,
_active_request: bool = false,
_url: [:0]const u8 = "",
@@ -100,9 +100,9 @@ pub fn init(page: *Page) !*XMLHttpRequest {
}
pub fn deinit(self: *XMLHttpRequest, session: *Session) void {
if (self._transfer) |transfer| {
transfer.abort(error.Abort);
self._transfer = null;
if (self._http_response) |resp| {
resp.abort(error.Abort);
self._http_response = null;
}
if (self._on_ready_state_change) |func| {
@@ -184,9 +184,9 @@ pub fn setWithCredentials(self: *XMLHttpRequest, value: bool) !void {
// TODO: url should be a union, as it can be multiple things
pub fn open(self: *XMLHttpRequest, method_: []const u8, url: [:0]const u8) !void {
// Abort any in-progress request
if (self._transfer) |transfer| {
if (self._http_response) |transfer| {
transfer.abort(error.Abort);
self._transfer = null;
self._http_response = null;
}
// Reset internal state
@@ -402,34 +402,32 @@ pub fn getResponseXML(self: *XMLHttpRequest, page: *Page) !?*Node.Document {
};
}
fn httpStartCallback(transfer: *HttpClient.Transfer) !void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(transfer.ctx));
fn httpStartCallback(response: HttpClient.Response) !void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(response.ctx));
if (comptime IS_DEBUG) {
log.debug(.http, "request start", .{ .method = self._method, .url = self._url, .source = "xhr" });
}
self._transfer = transfer;
self._http_response = response;
}
fn httpHeaderCallback(transfer: *HttpClient.Transfer, header: http.Header) !void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(transfer.ctx));
fn httpHeaderCallback(response: HttpClient.Response, header: http.Header) !void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(response.ctx));
const joined = try std.fmt.allocPrint(self._arena, "{s}: {s}", .{ header.name, header.value });
try self._response_headers.append(self._arena, joined);
}
fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
const self: *XMLHttpRequest = @ptrCast(@alignCast(transfer.ctx));
const header = &transfer.response_header.?;
fn httpHeaderDoneCallback(response: HttpClient.Response) !bool {
const self: *XMLHttpRequest = @ptrCast(@alignCast(response.ctx));
if (comptime IS_DEBUG) {
log.debug(.http, "request header", .{
.source = "xhr",
.url = self._url,
.status = header.status,
.status = response.status(),
});
}
if (header.contentType()) |ct| {
if (response.contentType()) |ct| {
self._response_mime = Mime.parse(ct) catch |e| {
log.info(.http, "invalid content type", .{
.content_Type = ct,
@@ -440,18 +438,18 @@ fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
};
}
var it = transfer.responseHeaderIterator();
var it = response.headerIterator();
while (it.next()) |hdr| {
const joined = try std.fmt.allocPrint(self._arena, "{s}: {s}", .{ hdr.name, hdr.value });
try self._response_headers.append(self._arena, joined);
}
self._response_status = header.status;
if (transfer.getContentLength()) |cl| {
self._response_status = response.status().?;
if (response.contentLength()) |cl| {
self._response_len = cl;
try self._response_data.ensureTotalCapacity(self._arena, cl);
}
self._response_url = try self._arena.dupeZ(u8, std.mem.span(header.url));
self._response_url = try self._arena.dupeZ(u8, response.url());
const page = self._page;
@@ -466,8 +464,8 @@ fn httpHeaderDoneCallback(transfer: *HttpClient.Transfer) !bool {
return true;
}
fn httpDataCallback(transfer: *HttpClient.Transfer, data: []const u8) !void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(transfer.ctx));
fn httpDataCallback(response: HttpClient.Response, data: []const u8) !void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(response.ctx));
try self._response_data.appendSlice(self._arena, data);
const page = self._page;
@@ -490,7 +488,7 @@ fn httpDoneCallback(ctx: *anyopaque) !void {
// Not that the request is done, the http/client will free the transfer
// object. It isn't safe to keep it around.
self._transfer = null;
self._http_response = null;
const page = self._page;
@@ -513,23 +511,23 @@ fn httpErrorCallback(ctx: *anyopaque, err: anyerror) void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(ctx));
// http client will close it after an error, it isn't safe to keep around
self.handleError(err);
if (self._transfer != null) {
self._transfer = null;
if (self._http_response != null) {
self._http_response = null;
}
self.releaseSelfRef();
}
fn httpShutdownCallback(ctx: *anyopaque) void {
const self: *XMLHttpRequest = @ptrCast(@alignCast(ctx));
self._transfer = null;
self._http_response = null;
self.releaseSelfRef();
}
pub fn abort(self: *XMLHttpRequest) void {
self.handleError(error.Abort);
if (self._transfer) |transfer| {
self._transfer = null;
transfer.abort(error.Abort);
if (self._http_response) |resp| {
self._http_response = null;
resp.abort(error.Abort);
}
self.releaseSelfRef();
}

View File

@@ -139,8 +139,8 @@ fn setLifecycleEventsEnabled(cmd: *CDP.Command) !void {
try sendPageLifecycle(bc, "load", now, frame_id, loader_id);
const http_client = page._session.browser.http_client;
const http_active = http_client.active;
const total_network_activity = http_active + http_client.intercepted;
const http_active = http_client.active();
const total_network_activity = http_active + http_client.intercepted();
if (page._notified_network_almost_idle.check(total_network_activity <= 2)) {
try sendPageLifecycle(bc, "networkAlmostIdle", now, frame_id, loader_id);
}

View File

@@ -39,6 +39,7 @@ pub const Scope = enum {
telemetry,
unknown_prop,
mcp,
cache,
};
const Opts = struct {

View File

@@ -17,6 +17,7 @@
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../log.zig");
const builtin = @import("builtin");
const net = std.net;
const posix = std.posix;
@@ -30,6 +31,10 @@ const http = @import("http.zig");
const RobotStore = @import("Robots.zig").RobotStore;
const WebBotAuth = @import("WebBotAuth.zig");
const Cache = @import("cache/Cache.zig");
const FsCache = @import("cache/FsCache.zig");
const App = @import("../App.zig");
const Network = @This();
const Listener = struct {
@@ -45,10 +50,12 @@ const MAX_TICK_CALLBACKS = 16;
allocator: Allocator,
app: *App,
config: *const Config,
ca_blob: ?http.Blob,
robot_store: RobotStore,
web_bot_auth: ?WebBotAuth,
cache: ?Cache,
connections: []http.Connection,
available: std.DoublyLinkedList = .{},
@@ -200,7 +207,7 @@ fn globalDeinit() void {
libcurl.curl_global_cleanup();
}
pub fn init(allocator: Allocator, config: *const Config) !Network {
pub fn init(allocator: Allocator, app: *App, config: *const Config) !Network {
globalInit(allocator);
errdefer globalDeinit();
@@ -233,6 +240,22 @@ pub fn init(allocator: Allocator, config: *const Config) !Network {
else
null;
const cache = if (config.httpCacheDir()) |cache_dir_path|
Cache{
.kind = .{
.fs = FsCache.init(cache_dir_path) catch |e| {
log.err(.cache, "failed to init", .{
.kind = "FsCache",
.path = cache_dir_path,
.err = e,
});
return e;
},
},
}
else
null;
return .{
.allocator = allocator,
.config = config,
@@ -244,8 +267,10 @@ pub fn init(allocator: Allocator, config: *const Config) !Network {
.available = available,
.connections = connections,
.app = app,
.robot_store = RobotStore.init(allocator),
.web_bot_auth = web_bot_auth,
.cache = cache,
};
}
@@ -278,6 +303,8 @@ pub fn deinit(self: *Network) void {
wba.deinit(self.allocator);
}
if (self.cache) |*cache| cache.deinit();
globalDeinit();
}

213
src/network/cache/Cache.zig vendored Normal file
View File

@@ -0,0 +1,213 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../../log.zig");
const Http = @import("../http.zig");
const FsCache = @import("FsCache.zig");
/// A browser-wide cache for resources across the network.
/// This mostly conforms to RFC9111 with regards to caching behavior.
pub const Cache = @This();
kind: union(enum) {
fs: FsCache,
},
pub fn deinit(self: *Cache) void {
return switch (self.kind) {
inline else => |*c| c.deinit(),
};
}
pub fn get(self: *Cache, arena: std.mem.Allocator, req: CacheRequest) ?CachedResponse {
return switch (self.kind) {
inline else => |*c| c.get(arena, req),
};
}
pub fn put(self: *Cache, metadata: CachedMetadata, body: []const u8) !void {
return switch (self.kind) {
inline else => |*c| c.put(metadata, body),
};
}
pub const CacheControl = struct {
max_age: u64,
pub fn parse(value: []const u8) ?CacheControl {
var cc: CacheControl = .{ .max_age = undefined };
var max_age_set = false;
var max_s_age_set = false;
var is_public = false;
var iter = std.mem.splitScalar(u8, value, ',');
while (iter.next()) |part| {
const directive = std.mem.trim(u8, part, &std.ascii.whitespace);
if (std.ascii.eqlIgnoreCase(directive, "no-store")) {
return null;
} else if (std.ascii.eqlIgnoreCase(directive, "no-cache")) {
return null;
} else if (std.ascii.eqlIgnoreCase(directive, "public")) {
is_public = true;
} else if (std.ascii.startsWithIgnoreCase(directive, "max-age=")) {
if (!max_s_age_set) {
if (std.fmt.parseInt(u64, directive[8..], 10) catch null) |max_age| {
cc.max_age = max_age;
max_age_set = true;
}
}
} else if (std.ascii.startsWithIgnoreCase(directive, "s-maxage=")) {
if (std.fmt.parseInt(u64, directive[9..], 10) catch null) |max_age| {
cc.max_age = max_age;
max_age_set = true;
max_s_age_set = true;
}
}
}
if (!max_age_set) return null;
if (!is_public) return null;
if (cc.max_age == 0) return null;
return cc;
}
};
pub const CachedMetadata = struct {
url: [:0]const u8,
content_type: []const u8,
status: u16,
stored_at: i64,
age_at_store: u64,
cache_control: CacheControl,
/// Response Headers
headers: []const Http.Header,
/// These are Request Headers used by Vary.
vary_headers: []const Http.Header,
pub fn format(self: CachedMetadata, writer: *std.Io.Writer) !void {
try writer.print("url={s} | status={d} | content_type={s} | max_age={d} | vary=[", .{
self.url,
self.status,
self.content_type,
self.cache_control.max_age,
});
// Logging all headers gets pretty verbose...
// so we just log the Vary ones that matter for caching.
if (self.vary_headers.len > 0) {
for (self.vary_headers, 0..) |hdr, i| {
if (i > 0) try writer.print(", ", .{});
try writer.print("{s}: {s}", .{ hdr.name, hdr.value });
}
}
try writer.print("]", .{});
}
};
pub const CacheRequest = struct {
url: []const u8,
timestamp: i64,
request_headers: []const Http.Header,
};
pub const CachedData = union(enum) {
buffer: []const u8,
file: struct {
file: std.fs.File,
offset: usize,
len: usize,
},
pub fn format(self: CachedData, writer: *std.Io.Writer) !void {
switch (self) {
.buffer => |buf| try writer.print("buffer({d} bytes)", .{buf.len}),
.file => |f| try writer.print("file(offset={d}, len={d} bytes)", .{ f.offset, f.len }),
}
}
};
pub const CachedResponse = struct {
metadata: CachedMetadata,
data: CachedData,
pub fn format(self: *const CachedResponse, writer: *std.Io.Writer) !void {
try writer.print("metadata=(", .{});
try self.metadata.format(writer);
try writer.print("), data=", .{});
try self.data.format(writer);
}
};
pub fn tryCache(
arena: std.mem.Allocator,
timestamp: i64,
url: [:0]const u8,
status: u16,
content_type: ?[]const u8,
cache_control: ?[]const u8,
vary: ?[]const u8,
age: ?[]const u8,
has_set_cookie: bool,
has_authorization: bool,
) !?CachedMetadata {
if (status != 200) {
log.debug(.cache, "no store", .{ .url = url, .code = status, .reason = "status" });
return null;
}
if (has_set_cookie) {
log.debug(.cache, "no store", .{ .url = url, .reason = "has_cookies" });
return null;
}
if (has_authorization) {
log.debug(.cache, "no store", .{ .url = url, .reason = "has_authorization" });
return null;
}
if (vary) |v| if (std.mem.eql(u8, v, "*")) {
log.debug(.cache, "no store", .{ .url = url, .vary = v, .reason = "vary" });
return null;
};
const cc = blk: {
if (cache_control == null) {
log.debug(.cache, "no store", .{ .url = url, .reason = "no cache control" });
return null;
}
if (CacheControl.parse(cache_control.?)) |cc| {
break :blk cc;
}
log.debug(.cache, "no store", .{ .url = url, .cache_control = cache_control.?, .reason = "cache control" });
return null;
};
return .{
.url = try arena.dupeZ(u8, url),
.content_type = if (content_type) |ct| try arena.dupe(u8, ct) else "application/octet-stream",
.status = status,
.stored_at = timestamp,
.age_at_store = if (age) |a| std.fmt.parseInt(u64, a, 10) catch 0 else 0,
.cache_control = cc,
.headers = &.{},
.vary_headers = &.{},
};
}

612
src/network/cache/FsCache.zig vendored Normal file
View File

@@ -0,0 +1,612 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../../log.zig");
const Cache = @import("Cache.zig");
const Http = @import("../http.zig");
const CacheRequest = Cache.CacheRequest;
const CachedMetadata = Cache.CachedMetadata;
const CachedResponse = Cache.CachedResponse;
const CACHE_VERSION: usize = 1;
const LOCK_STRIPES = 16;
comptime {
std.debug.assert(std.math.isPowerOfTwo(LOCK_STRIPES));
}
pub const FsCache = @This();
dir: std.fs.Dir,
locks: [LOCK_STRIPES]std.Thread.Mutex = .{std.Thread.Mutex{}} ** LOCK_STRIPES,
const CacheMetadataJson = struct {
version: usize,
metadata: CachedMetadata,
};
fn getLockPtr(self: *FsCache, key: *const [HASHED_KEY_LEN]u8) *std.Thread.Mutex {
const lock_idx = std.hash.Wyhash.hash(0, key[0..]) & (LOCK_STRIPES - 1);
return &self.locks[lock_idx];
}
const BODY_LEN_HEADER_LEN = 8;
const HASHED_KEY_LEN = 64;
const HASHED_PATH_LEN = HASHED_KEY_LEN + 6;
const HASHED_TMP_PATH_LEN = HASHED_PATH_LEN + 4;
fn hashKey(key: []const u8) [HASHED_KEY_LEN]u8 {
var digest: [std.crypto.hash.sha2.Sha256.digest_length]u8 = undefined;
std.crypto.hash.sha2.Sha256.hash(key, &digest, .{});
var hex: [HASHED_KEY_LEN]u8 = undefined;
_ = std.fmt.bufPrint(&hex, "{s}", .{std.fmt.bytesToHex(&digest, .lower)}) catch unreachable;
return hex;
}
fn cachePath(hashed_key: *const [HASHED_KEY_LEN]u8) [HASHED_PATH_LEN]u8 {
var path: [HASHED_PATH_LEN]u8 = undefined;
_ = std.fmt.bufPrint(&path, "{s}.cache", .{hashed_key}) catch unreachable;
return path;
}
fn cacheTmpPath(hashed_key: *const [HASHED_KEY_LEN]u8) [HASHED_TMP_PATH_LEN]u8 {
var path: [HASHED_TMP_PATH_LEN]u8 = undefined;
_ = std.fmt.bufPrint(&path, "{s}.cache.tmp", .{hashed_key}) catch unreachable;
return path;
}
pub fn init(path: []const u8) !FsCache {
const cwd = std.fs.cwd();
cwd.makeDir(path) catch |err| switch (err) {
error.PathAlreadyExists => {},
else => return err,
};
const dir = try cwd.openDir(path, .{ .iterate = true });
return .{ .dir = dir };
}
pub fn deinit(self: *FsCache) void {
self.dir.close();
}
pub fn get(self: *FsCache, arena: std.mem.Allocator, req: CacheRequest) ?Cache.CachedResponse {
const hashed_key = hashKey(req.url);
const cache_p = cachePath(&hashed_key);
const lock = self.getLockPtr(&hashed_key);
lock.lock();
defer lock.unlock();
const file = self.dir.openFile(&cache_p, .{ .mode = .read_only }) catch |e| {
switch (e) {
std.fs.File.OpenError.FileNotFound => {
log.debug(.cache, "miss", .{ .url = req.url, .hash = &hashed_key, .reason = "missing" });
},
else => |err| {
log.warn(.cache, "open file err", .{ .url = req.url, .err = err });
},
}
return null;
};
var cleanup = false;
defer if (cleanup) {
file.close();
self.dir.deleteFile(&cache_p) catch |e| {
log.err(.cache, "clean fail", .{ .url = req.url, .file = &cache_p, .err = e });
};
};
var file_buf: [1024]u8 = undefined;
var len_buf: [BODY_LEN_HEADER_LEN]u8 = undefined;
var file_reader = file.reader(&file_buf);
const file_reader_iface = &file_reader.interface;
file_reader_iface.readSliceAll(&len_buf) catch |e| {
log.warn(.cache, "read header", .{ .url = req.url, .err = e });
cleanup = true;
return null;
};
const body_len = std.mem.readInt(u64, &len_buf, .little);
// Now we read metadata.
file_reader.seekTo(body_len + BODY_LEN_HEADER_LEN) catch |e| {
log.warn(.cache, "seek metadata", .{ .url = req.url, .err = e });
cleanup = true;
return null;
};
var json_reader = std.json.Reader.init(arena, file_reader_iface);
const cache_file: CacheMetadataJson = std.json.parseFromTokenSourceLeaky(
CacheMetadataJson,
arena,
&json_reader,
.{ .allocate = .alloc_always },
) catch |e| {
// Warn because malformed metadata can be a deeper symptom.
log.warn(.cache, "miss", .{ .url = req.url, .err = e, .reason = "malformed metadata" });
cleanup = true;
return null;
};
if (cache_file.version != CACHE_VERSION) {
log.debug(.cache, "miss", .{
.url = req.url,
.reason = "version mismatch",
.expected = CACHE_VERSION,
.got = cache_file.version,
});
cleanup = true;
return null;
}
const metadata = cache_file.metadata;
// Check entry expiration.
const now = req.timestamp;
const age = (now - metadata.stored_at) + @as(i64, @intCast(metadata.age_at_store));
if (age < 0 or @as(u64, @intCast(age)) >= metadata.cache_control.max_age) {
log.debug(.cache, "miss", .{ .url = req.url, .reason = "expired" });
cleanup = true;
return null;
}
// If we have Vary headers, ensure they are present & matching.
for (metadata.vary_headers) |vary_hdr| {
const name = vary_hdr.name;
const value = vary_hdr.value;
const incoming = for (req.request_headers) |h| {
if (std.ascii.eqlIgnoreCase(h.name, name)) break h.value;
} else "";
if (!std.ascii.eqlIgnoreCase(value, incoming)) {
log.debug(.cache, "miss", .{
.url = req.url,
.reason = "vary mismatch",
.header = name,
.expected = value,
.got = incoming,
});
return null;
}
}
// On the case of a hash collision.
if (!std.ascii.eqlIgnoreCase(metadata.url, req.url)) {
log.warn(.cache, "collision", .{ .url = req.url, .expected = metadata.url, .got = req.url });
cleanup = true;
return null;
}
log.debug(.cache, "hit", .{ .url = req.url, .hash = &hashed_key });
return .{
.metadata = metadata,
.data = .{
.file = .{
.file = file,
.offset = BODY_LEN_HEADER_LEN,
.len = body_len,
},
},
};
}
pub fn put(self: *FsCache, meta: CachedMetadata, body: []const u8) !void {
const hashed_key = hashKey(meta.url);
const cache_p = cachePath(&hashed_key);
const cache_tmp_p = cacheTmpPath(&hashed_key);
const lock = self.getLockPtr(&hashed_key);
lock.lock();
defer lock.unlock();
const file = self.dir.createFile(&cache_tmp_p, .{ .truncate = true }) catch |e| {
log.err(.cache, "create file", .{ .url = meta.url, .file = &cache_tmp_p, .err = e });
return e;
};
defer file.close();
var writer_buf: [1024]u8 = undefined;
var file_writer = file.writer(&writer_buf);
var file_writer_iface = &file_writer.interface;
var len_buf: [8]u8 = undefined;
std.mem.writeInt(u64, &len_buf, body.len, .little);
file_writer_iface.writeAll(&len_buf) catch |e| {
log.err(.cache, "write body len", .{ .url = meta.url, .err = e });
return e;
};
file_writer_iface.writeAll(body) catch |e| {
log.err(.cache, "write body", .{ .url = meta.url, .err = e });
return e;
};
std.json.Stringify.value(
CacheMetadataJson{ .version = CACHE_VERSION, .metadata = meta },
.{ .whitespace = .minified },
file_writer_iface,
) catch |e| {
log.err(.cache, "write metadata", .{ .url = meta.url, .err = e });
return e;
};
file_writer_iface.flush() catch |e| {
log.err(.cache, "flush", .{ .url = meta.url, .err = e });
return e;
};
self.dir.rename(&cache_tmp_p, &cache_p) catch |e| {
log.err(.cache, "rename", .{ .url = meta.url, .from = &cache_tmp_p, .to = &cache_p, .err = e });
return e;
};
log.debug(.cache, "put", .{ .url = meta.url, .hash = &hashed_key, .body_len = body.len });
}
const testing = std.testing;
fn setupCache() !struct { tmp: testing.TmpDir, cache: Cache } {
var tmp = testing.tmpDir(.{});
errdefer tmp.cleanup();
const path = try tmp.dir.realpathAlloc(testing.allocator, ".");
defer testing.allocator.free(path);
return .{
.tmp = tmp,
.cache = Cache{ .kind = .{ .fs = try FsCache.init(path) } },
};
}
test "FsCache: basic put and get" {
var setup = try setupCache();
defer {
setup.cache.deinit();
setup.tmp.cleanup();
}
const cache = &setup.cache;
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const now = std.time.timestamp();
const meta = CachedMetadata{
.url = "https://example.com",
.content_type = "text/html",
.status = 200,
.stored_at = now,
.age_at_store = 0,
.cache_control = .{ .max_age = 600 },
.headers = &.{},
.vary_headers = &.{},
};
const body = "hello world";
try cache.put(meta, body);
const result = cache.get(
arena.allocator(),
.{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{},
},
) orelse return error.CacheMiss;
const f = result.data.file;
const file = f.file;
defer file.close();
var buf: [64]u8 = undefined;
var file_reader = file.reader(&buf);
try file_reader.seekTo(f.offset);
const read_buf = try file_reader.interface.readAlloc(testing.allocator, f.len);
defer testing.allocator.free(read_buf);
try testing.expectEqualStrings(body, read_buf);
}
test "FsCache: get expiration" {
var setup = try setupCache();
defer {
setup.cache.deinit();
setup.tmp.cleanup();
}
const cache = &setup.cache;
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const now = 5000;
const max_age = 1000;
const meta = CachedMetadata{
.url = "https://example.com",
.content_type = "text/html",
.status = 200,
.stored_at = now,
.age_at_store = 900,
.cache_control = .{ .max_age = max_age },
.headers = &.{},
.vary_headers = &.{},
};
const body = "hello world";
try cache.put(meta, body);
const result = cache.get(
arena.allocator(),
.{
.url = "https://example.com",
.timestamp = now + 50,
.request_headers = &.{},
},
) orelse return error.CacheMiss;
result.data.file.file.close();
try testing.expectEqual(null, cache.get(
arena.allocator(),
.{
.url = "https://example.com",
.timestamp = now + 200,
.request_headers = &.{},
},
));
try testing.expectEqual(null, cache.get(
arena.allocator(),
.{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{},
},
));
}
test "FsCache: put override" {
var setup = try setupCache();
defer {
setup.cache.deinit();
setup.tmp.cleanup();
}
const cache = &setup.cache;
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
{
const now = 5000;
const max_age = 1000;
const meta = CachedMetadata{
.url = "https://example.com",
.content_type = "text/html",
.status = 200,
.stored_at = now,
.age_at_store = 900,
.cache_control = .{ .max_age = max_age },
.headers = &.{},
.vary_headers = &.{},
};
const body = "hello world";
try cache.put(meta, body);
const result = cache.get(
arena.allocator(),
.{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{},
},
) orelse return error.CacheMiss;
const f = result.data.file;
const file = f.file;
defer file.close();
var buf: [64]u8 = undefined;
var file_reader = file.reader(&buf);
try file_reader.seekTo(f.offset);
const read_buf = try file_reader.interface.readAlloc(testing.allocator, f.len);
defer testing.allocator.free(read_buf);
try testing.expectEqualStrings(body, read_buf);
}
{
const now = 10000;
const max_age = 2000;
const meta = CachedMetadata{
.url = "https://example.com",
.content_type = "text/html",
.status = 200,
.stored_at = now,
.age_at_store = 0,
.cache_control = .{ .max_age = max_age },
.headers = &.{},
.vary_headers = &.{},
};
const body = "goodbye world";
try cache.put(meta, body);
const result = cache.get(
arena.allocator(),
.{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{},
},
) orelse return error.CacheMiss;
const f = result.data.file;
const file = f.file;
defer file.close();
var buf: [64]u8 = undefined;
var file_reader = file.reader(&buf);
try file_reader.seekTo(f.offset);
const read_buf = try file_reader.interface.readAlloc(testing.allocator, f.len);
defer testing.allocator.free(read_buf);
try testing.expectEqualStrings(body, read_buf);
}
}
test "FsCache: garbage file" {
var setup = try setupCache();
defer {
setup.cache.deinit();
setup.tmp.cleanup();
}
const hashed_key = hashKey("https://example.com");
const cache_p = cachePath(&hashed_key);
const file = try setup.cache.kind.fs.dir.createFile(&cache_p, .{});
try file.writeAll("this is not a valid cache file !@#$%");
file.close();
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
try testing.expectEqual(
null,
setup.cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = 5000,
.request_headers = &.{},
}),
);
}
test "FsCache: vary hit and miss" {
var setup = try setupCache();
defer {
setup.cache.deinit();
setup.tmp.cleanup();
}
const cache = &setup.cache;
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const now = std.time.timestamp();
const meta = CachedMetadata{
.url = "https://example.com",
.content_type = "text/html",
.status = 200,
.stored_at = now,
.age_at_store = 0,
.cache_control = .{ .max_age = 600 },
.headers = &.{},
.vary_headers = &.{
.{ .name = "Accept-Encoding", .value = "gzip" },
},
};
try cache.put(meta, "hello world");
const result = cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{
.{ .name = "Accept-Encoding", .value = "gzip" },
},
}) orelse return error.CacheMiss;
result.data.file.file.close();
try testing.expectEqual(null, cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{
.{ .name = "Accept-Encoding", .value = "br" },
},
}));
try testing.expectEqual(null, cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{},
}));
const result2 = cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{
.{ .name = "Accept-Encoding", .value = "gzip" },
},
}) orelse return error.CacheMiss;
result2.data.file.file.close();
}
test "FsCache: vary multiple headers" {
var setup = try setupCache();
defer {
setup.cache.deinit();
setup.tmp.cleanup();
}
const cache = &setup.cache;
var arena = std.heap.ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const now = std.time.timestamp();
const meta = CachedMetadata{
.url = "https://example.com",
.content_type = "text/html",
.status = 200,
.stored_at = now,
.age_at_store = 0,
.cache_control = .{ .max_age = 600 },
.headers = &.{},
.vary_headers = &.{
.{ .name = "Accept-Encoding", .value = "gzip" },
.{ .name = "Accept-Language", .value = "en" },
},
};
try cache.put(meta, "hello world");
const result = cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{
.{ .name = "Accept-Encoding", .value = "gzip" },
.{ .name = "Accept-Language", .value = "en" },
},
}) orelse return error.CacheMiss;
result.data.file.file.close();
try testing.expectEqual(null, cache.get(arena.allocator(), .{
.url = "https://example.com",
.timestamp = now,
.request_headers = &.{
.{ .name = "Accept-Encoding", .value = "gzip" },
.{ .name = "Accept-Language", .value = "fr" },
},
}));
}

View File

@@ -79,7 +79,7 @@ pub const Headers = struct {
self.headers = updated_headers;
}
fn parseHeader(header_str: []const u8) ?Header {
pub fn parseHeader(header_str: []const u8) ?Header {
const colon_pos = std.mem.indexOfScalar(u8, header_str, ':') orelse return null;
const name = std.mem.trim(u8, header_str[0..colon_pos], " \t");
@@ -88,22 +88,9 @@ pub const Headers = struct {
return .{ .name = name, .value = value };
}
pub fn iterator(self: *Headers) Iterator {
return .{
.header = self.headers,
};
pub fn iterator(self: Headers) HeaderIterator {
return .{ .curl_slist = .{ .header = self.headers } };
}
const Iterator = struct {
header: [*c]libcurl.CurlSList,
pub fn next(self: *Iterator) ?Header {
const h = self.header orelse return null;
self.header = h.*.next;
return parseHeader(std.mem.span(@as([*:0]const u8, @ptrCast(h.*.data))));
}
};
};
// In normal cases, the header iterator comes from the curl linked list.
@@ -112,6 +99,7 @@ pub const Headers = struct {
// This union, is an iterator that exposes the same API for either case.
pub const HeaderIterator = union(enum) {
curl: CurlHeaderIterator,
curl_slist: CurlSListIterator,
list: ListHeaderIterator,
pub fn next(self: *HeaderIterator) ?Header {
@@ -120,6 +108,19 @@ pub const HeaderIterator = union(enum) {
}
}
pub fn collect(self: *HeaderIterator, allocator: std.mem.Allocator) !std.ArrayList(Header) {
var list: std.ArrayList(Header) = .empty;
while (self.next()) |hdr| {
try list.append(allocator, .{
.name = try allocator.dupe(u8, hdr.name),
.value = try allocator.dupe(u8, hdr.value),
});
}
return list;
}
const CurlHeaderIterator = struct {
conn: *const Connection,
prev: ?*libcurl.CurlHeader = null,
@@ -136,6 +137,16 @@ pub const HeaderIterator = union(enum) {
}
};
const CurlSListIterator = struct {
header: [*c]libcurl.CurlSList,
pub fn next(self: *CurlSListIterator) ?Header {
const h = self.header orelse return null;
self.header = h.*.next;
return Headers.parseHeader(std.mem.span(@as([*:0]const u8, @ptrCast(h.*.data))));
}
};
const ListHeaderIterator = struct {
index: usize = 0,
list: []const Header,

View File

@@ -0,0 +1,240 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../../log.zig");
const http = @import("../http.zig");
const Transfer = @import("../../browser/HttpClient.zig").Transfer;
const Context = @import("../../browser/HttpClient.zig").Context;
const Request = @import("../../browser/HttpClient.zig").Request;
const Response = @import("../../browser/HttpClient.zig").Response;
const Layer = @import("../../browser/HttpClient.zig").Layer;
const Cache = @import("../cache/Cache.zig");
const CachedMetadata = @import("../cache/Cache.zig").CachedMetadata;
const CachedResponse = @import("../cache/Cache.zig").CachedResponse;
const Forward = @import("Forward.zig");
const CacheLayer = @This();
next: Layer = undefined,
pub fn layer(self: *CacheLayer) Layer {
return .{
.ptr = self,
.vtable = &.{
.request = request,
},
};
}
fn request(ptr: *anyopaque, ctx: Context, req: Request) anyerror!void {
const self: *CacheLayer = @ptrCast(@alignCast(ptr));
const network = ctx.network;
if (network.cache == null or req.method != .GET) {
return self.next.request(ctx, req);
}
const arena = try network.app.arena_pool.acquire(.{ .debug = "CacheLayer" });
errdefer network.app.arena_pool.release(arena);
var iter = req.headers.iterator();
const req_header_list = try iter.collect(arena);
if (network.cache.?.get(arena, .{
.url = req.url,
.timestamp = std.time.timestamp(),
.request_headers = req_header_list.items,
})) |cached| {
defer req.headers.deinit();
defer network.app.arena_pool.release(arena);
return serveFromCache(req, &cached);
}
const cache_ctx = try arena.create(CacheContext);
cache_ctx.* = .{
.arena = arena,
.context = ctx,
.forward = Forward.fromRequest(req),
.req_url = req.url,
.req_headers = req.headers,
};
const wrapped = cache_ctx.forward.wrapRequest(
req,
cache_ctx,
"forward",
.{
.start = CacheContext.startCallback,
.header = CacheContext.headerCallback,
.done = CacheContext.doneCallback,
.shutdown = CacheContext.shutdownCallback,
.err = CacheContext.errorCallback,
},
);
return self.next.request(ctx, wrapped);
}
fn serveFromCache(req: Request, cached: *const CachedResponse) !void {
const response = Response.fromCached(req.ctx, cached);
defer switch (cached.data) {
.buffer => |_| {},
.file => |f| f.file.close(),
};
if (req.start_callback) |cb| {
try cb(response);
}
const proceed = try req.header_callback(response);
if (!proceed) {
req.error_callback(req.ctx, error.Abort);
return;
}
switch (cached.data) {
.buffer => |data| {
if (data.len > 0) {
try req.data_callback(response, data);
}
},
.file => |f| {
const file = f.file;
var buf: [1024]u8 = undefined;
var file_reader = file.reader(&buf);
try file_reader.seekTo(f.offset);
const reader = &file_reader.interface;
var read_buf: [1024]u8 = undefined;
var remaining = f.len;
while (remaining > 0) {
const read_len = @min(read_buf.len, remaining);
const n = try reader.readSliceShort(read_buf[0..read_len]);
if (n == 0) break;
remaining -= n;
try req.data_callback(response, read_buf[0..n]);
}
},
}
try req.done_callback(req.ctx);
}
const CacheContext = struct {
arena: std.mem.Allocator,
context: Context,
transfer: ?*Transfer = null,
forward: Forward,
req_url: [:0]const u8,
req_headers: http.Headers,
pending_metadata: ?*CachedMetadata = null,
fn startCallback(response: Response) anyerror!void {
const self: *CacheContext = @ptrCast(@alignCast(response.ctx));
self.transfer = response.inner.transfer;
return self.forward.forwardStart(response);
}
fn headerCallback(response: Response) anyerror!bool {
const self: *CacheContext = @ptrCast(@alignCast(response.ctx));
const allocator = self.arena;
const transfer = response.inner.transfer;
var rh = &transfer.response_header.?;
const conn = transfer._conn.?;
const vary = if (conn.getResponseHeader("vary", 0)) |h| h.value else null;
const maybe_cm = try Cache.tryCache(
allocator,
std.time.timestamp(),
transfer.url,
rh.status,
rh.contentType(),
if (conn.getResponseHeader("cache-control", 0)) |h| h.value else null,
vary,
if (conn.getResponseHeader("age", 0)) |h| h.value else null,
conn.getResponseHeader("set-cookie", 0) != null,
conn.getResponseHeader("authorization", 0) != null,
);
if (maybe_cm) |cm| {
var iter = transfer.responseHeaderIterator();
var header_list = try iter.collect(allocator);
const end_of_response = header_list.items.len;
if (vary) |vary_str| {
var req_it = self.req_headers.iterator();
while (req_it.next()) |hdr| {
var vary_iter = std.mem.splitScalar(u8, vary_str, ',');
while (vary_iter.next()) |part| {
const name = std.mem.trim(u8, part, &std.ascii.whitespace);
if (std.ascii.eqlIgnoreCase(hdr.name, name)) {
try header_list.append(allocator, .{
.name = try allocator.dupe(u8, hdr.name),
.value = try allocator.dupe(u8, hdr.value),
});
}
}
}
const metadata = try allocator.create(CachedMetadata);
metadata.* = cm;
metadata.headers = header_list.items[0..end_of_response];
metadata.vary_headers = header_list.items[end_of_response..];
self.pending_metadata = metadata;
}
}
return self.forward.forwardHeader(response);
}
fn doneCallback(ctx: *anyopaque) anyerror!void {
const self: *CacheContext = @ptrCast(@alignCast(ctx));
defer self.context.network.app.arena_pool.release(self.arena);
const transfer = self.transfer orelse @panic("Start Callback didn't set CacheLayer.transfer");
if (self.pending_metadata) |metadata| {
const cache = &self.context.network.cache.?;
log.debug(.browser, "http cache", .{ .key = self.req_url, .metadata = metadata });
cache.put(metadata.*, transfer._stream_buffer.items) catch |err| {
log.warn(.http, "cache put failed", .{ .err = err });
};
log.debug(.browser, "http.cache.put", .{ .url = self.req_url });
}
return self.forward.forwardDone();
}
fn shutdownCallback(ctx: *anyopaque) void {
const self: *CacheContext = @ptrCast(@alignCast(ctx));
defer self.context.network.app.arena_pool.release(self.arena);
self.forward.forwardShutdown();
}
fn errorCallback(ctx: *anyopaque, e: anyerror) void {
const self: *CacheContext = @ptrCast(@alignCast(ctx));
defer self.context.network.app.arena_pool.release(self.arena);
self.forward.forwardErr(e);
}
};

View File

@@ -0,0 +1,135 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const Request = @import("../../browser/HttpClient.zig").Request;
const Response = @import("../../browser/HttpClient.zig").Response;
const Forward = @This();
ctx: *anyopaque,
start: ?Request.StartCallback,
header: Request.HeaderCallback,
data: Request.DataCallback,
done: Request.DoneCallback,
err: Request.ErrorCallback,
shutdown: ?Request.ShutdownCallback,
pub fn fromRequest(req: Request) Forward {
return .{
.ctx = req.ctx,
.start = req.start_callback,
.header = req.header_callback,
.data = req.data_callback,
.done = req.done_callback,
.err = req.error_callback,
.shutdown = req.shutdown_callback,
};
}
pub const Overrides = struct {
start: ?Request.StartCallback = null,
header: ?Request.HeaderCallback = null,
data: ?Request.DataCallback = null,
done: ?Request.DoneCallback = null,
err: ?Request.ErrorCallback = null,
shutdown: ?Request.ShutdownCallback = null,
};
pub fn wrapRequest(
self: *Forward,
req: Request,
new_ctx: anytype,
comptime field: []const u8,
overrides: Overrides,
) Request {
const T = @TypeOf(new_ctx.*);
const PassthroughT = makePassthrough(T, field);
var wrapped = req;
wrapped.ctx = new_ctx;
wrapped.start_callback = overrides.start orelse if (self.start != null) PassthroughT.start else null;
wrapped.header_callback = overrides.header orelse PassthroughT.header;
wrapped.data_callback = overrides.data orelse PassthroughT.data;
wrapped.done_callback = overrides.done orelse PassthroughT.done;
wrapped.error_callback = overrides.err orelse PassthroughT.err;
wrapped.shutdown_callback = overrides.shutdown orelse if (self.shutdown != null) PassthroughT.shutdown else null;
return wrapped;
}
fn makePassthrough(comptime T: type, comptime field: []const u8) type {
return struct {
pub fn start(response: Response) anyerror!void {
const self: *T = @ptrCast(@alignCast(response.ctx));
return @field(self, field).forwardStart(response);
}
pub fn header(response: Response) anyerror!bool {
const self: *T = @ptrCast(@alignCast(response.ctx));
return @field(self, field).forwardHeader(response);
}
pub fn data(response: Response, chunk: []const u8) anyerror!void {
const self: *T = @ptrCast(@alignCast(response.ctx));
return @field(self, field).forwardData(response, chunk);
}
pub fn done(ctx_ptr: *anyopaque) anyerror!void {
const self: *T = @ptrCast(@alignCast(ctx_ptr));
return @field(self, field).forwardDone();
}
pub fn err(ctx_ptr: *anyopaque, e: anyerror) void {
const self: *T = @ptrCast(@alignCast(ctx_ptr));
@field(self, field).forwardErr(e);
}
pub fn shutdown(ctx_ptr: *anyopaque) void {
const self: *T = @ptrCast(@alignCast(ctx_ptr));
@field(self, field).forwardShutdown();
}
};
}
pub fn forwardStart(self: Forward, response: Response) anyerror!void {
var fwd = response;
fwd.ctx = self.ctx;
if (self.start) |cb| try cb(fwd);
}
pub fn forwardHeader(self: Forward, response: Response) anyerror!bool {
var fwd = response;
fwd.ctx = self.ctx;
return self.header(fwd);
}
pub fn forwardData(self: Forward, response: Response, chunk: []const u8) anyerror!void {
var fwd = response;
fwd.ctx = self.ctx;
return self.data(fwd, chunk);
}
pub fn forwardDone(self: Forward) anyerror!void {
return self.done(self.ctx);
}
pub fn forwardErr(self: Forward, e: anyerror) void {
self.err(self.ctx, e);
}
pub fn forwardShutdown(self: Forward) void {
if (self.shutdown) |cb| cb(self.ctx);
}

View File

@@ -0,0 +1,255 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../../log.zig");
const URL = @import("../../browser/URL.zig");
const Robots = @import("../Robots.zig");
const Context = @import("../../browser/HttpClient.zig").Context;
const Request = @import("../../browser/HttpClient.zig").Request;
const Response = @import("../../browser/HttpClient.zig").Response;
const Layer = @import("../../browser/HttpClient.zig").Layer;
const Forward = @import("Forward.zig");
const RobotsLayer = @This();
next: Layer = undefined,
obey_robots: bool,
allocator: std.mem.Allocator,
pending: std.StringHashMapUnmanaged(std.ArrayListUnmanaged(Request)) = .empty,
pub fn layer(self: *RobotsLayer) Layer {
return .{
.ptr = self,
.vtable = &.{
.request = request,
},
};
}
pub fn deinit(self: *RobotsLayer, allocator: std.mem.Allocator) void {
var it = self.pending.iterator();
while (it.next()) |entry| {
entry.value_ptr.deinit(allocator);
}
self.pending.deinit(allocator);
}
fn request(ptr: *anyopaque, ctx: Context, req: Request) anyerror!void {
const self: *RobotsLayer = @ptrCast(@alignCast(ptr));
if (!self.obey_robots) {
return self.next.request(ctx, req);
}
const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
errdefer self.allocator.free(robots_url);
if (ctx.network.robot_store.get(robots_url)) |robot_entry| {
defer self.allocator.free(robots_url);
switch (robot_entry) {
.present => |robots| {
const path = URL.getPathname(req.url);
if (!robots.isAllowed(path)) {
log.warn(.http, "blocked by robots", .{ .url = req.url });
req.error_callback(req.ctx, error.RobotsBlocked);
return;
}
},
.absent => {},
}
return self.next.request(ctx, req);
}
return self.fetchRobotsThenRequest(ctx, robots_url, req);
}
fn fetchRobotsThenRequest(self: *RobotsLayer, ctx: Context, robots_url: [:0]const u8, req: Request) !void {
const entry = try self.pending.getOrPut(self.allocator, robots_url);
if (!entry.found_existing) {
errdefer self.allocator.free(robots_url);
entry.value_ptr.* = .empty;
const robots_ctx = try self.allocator.create(RobotsContext);
errdefer self.allocator.destroy(robots_ctx);
robots_ctx.* = .{
.layer = self,
.ctx = ctx,
.robots_url = robots_url,
.buffer = .empty,
};
const headers = try ctx.newHeaders();
log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
try self.next.request(ctx, .{
.ctx = robots_ctx,
.url = robots_url,
.method = .GET,
.headers = headers,
.blocking = false,
.frame_id = req.frame_id,
.cookie_jar = req.cookie_jar,
.cookie_origin = req.cookie_origin,
.notification = req.notification,
.resource_type = .fetch,
.header_callback = RobotsContext.headerCallback,
.data_callback = RobotsContext.dataCallback,
.done_callback = RobotsContext.doneCallback,
.error_callback = RobotsContext.errorCallback,
.shutdown_callback = RobotsContext.shutdownCallback,
});
} else {
self.allocator.free(robots_url);
}
try entry.value_ptr.append(self.allocator, req);
}
fn flushPending(self: *RobotsLayer, ctx: Context, robots_url: [:0]const u8, allowed: bool) void {
var queued = self.pending.fetchRemove(robots_url) orelse
@panic("RobotsLayer.flushPending: missing queue");
defer queued.value.deinit(self.allocator);
for (queued.value.items) |queued_req| {
if (!allowed) {
log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
defer queued_req.headers.deinit();
queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
} else {
self.next.request(ctx, queued_req) catch |e| {
defer queued_req.headers.deinit();
queued_req.error_callback(queued_req.ctx, e);
};
}
}
}
fn flushPendingShutdown(self: *RobotsLayer, robots_url: [:0]const u8) void {
var queued = self.pending.fetchRemove(robots_url) orelse
@panic("RobotsLayer.flushPendingShutdown: missing queue");
defer queued.value.deinit(self.allocator);
for (queued.value.items) |queued_req| {
defer queued_req.headers.deinit();
if (queued_req.shutdown_callback) |cb| cb(queued_req.ctx);
}
}
const RobotsContext = struct {
layer: *RobotsLayer,
ctx: Context,
robots_url: [:0]const u8,
buffer: std.ArrayListUnmanaged(u8),
status: u16 = 0,
fn deinit(self: *RobotsContext) void {
self.buffer.deinit(self.layer.allocator);
self.layer.allocator.destroy(self);
}
fn headerCallback(response: Response) anyerror!bool {
const self: *RobotsContext = @ptrCast(@alignCast(response.ctx));
switch (response.inner) {
.transfer => |t| {
if (t.response_header) |hdr| {
log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = self.robots_url });
self.status = hdr.status;
}
if (t.getContentLength()) |cl| {
try self.buffer.ensureTotalCapacity(self.layer.allocator, cl);
}
},
.cached => {},
}
return true;
}
fn dataCallback(response: Response, data: []const u8) anyerror!void {
const self: *RobotsContext = @ptrCast(@alignCast(response.ctx));
try self.buffer.appendSlice(self.layer.allocator, data);
}
fn doneCallback(ctx_ptr: *anyopaque) anyerror!void {
const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
const l = self.layer;
const ctx = self.ctx;
const robots_url = self.robots_url;
defer l.allocator.free(robots_url);
defer self.deinit();
var allowed = true;
const network = ctx.network;
switch (self.status) {
200 => {
if (self.buffer.items.len > 0) {
const robots: ?Robots = network.robot_store.robotsFromBytes(
network.config.http_headers.user_agent,
self.buffer.items,
) catch blk: {
log.warn(.browser, "failed to parse robots", .{ .robots_url = robots_url });
try network.robot_store.putAbsent(robots_url);
break :blk null;
};
if (robots) |r| {
try network.robot_store.put(robots_url, r);
const path = URL.getPathname(self.layer.pending.get(robots_url).?.items[0].url);
allowed = r.isAllowed(path);
}
}
},
404 => {
log.debug(.http, "robots not found", .{ .url = robots_url });
try network.robot_store.putAbsent(robots_url);
},
else => {
log.debug(.http, "unexpected status on robots", .{
.url = robots_url,
.status = self.status,
});
try network.robot_store.putAbsent(robots_url);
},
}
l.flushPending(ctx, robots_url, allowed);
}
fn errorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
const l = self.layer;
const ctx = self.ctx;
const robots_url = self.robots_url;
defer l.allocator.free(robots_url);
defer self.deinit();
log.warn(.http, "robots fetch failed", .{ .err = err });
l.flushPending(ctx, robots_url, true);
}
fn shutdownCallback(ctx_ptr: *anyopaque) void {
const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
const l = self.layer;
const robots_url = self.robots_url;
defer l.allocator.free(robots_url);
defer self.deinit();
log.debug(.http, "robots fetch shutdown", .{});
l.flushPendingShutdown(robots_url);
}
};

View File

@@ -0,0 +1,54 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../../log.zig");
const URL = @import("../../browser/URL.zig");
const WebBotAuth = @import("../WebBotAuth.zig");
const Context = @import("../../browser/HttpClient.zig").Context;
const Request = @import("../../browser/HttpClient.zig").Request;
const Layer = @import("../../browser/HttpClient.zig").Layer;
const WebBotAuthLayer = @This();
next: Layer = undefined,
pub fn layer(self: *WebBotAuthLayer) Layer {
return .{
.ptr = self,
.vtable = &.{ .request = request },
};
}
pub fn deinit(_: *WebBotAuthLayer, _: std.mem.Allocator) void {}
fn request(ptr: *anyopaque, ctx: Context, req: Request) anyerror!void {
const self: *WebBotAuthLayer = @ptrCast(@alignCast(ptr));
var our_req = req;
if (ctx.network.web_bot_auth) |*wba| {
const arena = try ctx.network.app.arena_pool.acquire(.{ .debug = "WebBotAuthLayer" });
defer ctx.network.app.arena_pool.release(arena);
const authority = URL.getHost(req.url);
try wba.signRequest(arena, &our_req.headers, authority);
}
return self.next.request(ctx, our_req);
}