// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
//
// Francis Bouvier
// Pierre Tachoire
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

const std = @import("std");
const lp = @import("lightpanda");
const log = @import("../log.zig");
const builtin = @import("builtin");

const Net = @import("../Net.zig");
const Config = @import("../Config.zig");
const URL = @import("../browser/URL.zig");
const Notification = @import("../Notification.zig");
const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar;

const Robots = @import("../browser/Robots.zig");
const RobotStore = Robots.RobotStore;

const c = Net.c;
const posix = std.posix;
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;

const IS_DEBUG = builtin.mode == .Debug;

const Method = Net.Method;
const ResponseHead = Net.ResponseHead;
const HeaderIterator = Net.HeaderIterator;

// This is loosely tied to a browser Page. Loading all the scripts, doing
// XHR requests, and loading imports all happen through here. Since the app
// currently supports 1 browser and 1 page at a time, we only have 1 Client and
// re-use it from page to page. This allows us better re-use of the various
// buffers/caches (including keepalive connections) that libcurl has.
//
// The app has other secondary http needs, like telemetry. While we want to
// share some things (namely the ca blob, and maybe some configuration
// (TODO: ??? should proxy settings be global ???)), we're able to call
// client.abort() to abort the transfers being made by a page, without impacting
// those other http requests.
pub const Client = @This();

// Count of active requests
active: usize,

// Count of intercepted requests. The client doesn't track intercepted
// transfers: if a request is intercepted, the client forgets about it and
// requires the interceptor to continue or abort it. That works well, except
// that if we only rely on `active`, we might think there's no more network
// activity when, with intercepted requests, there might be more in the
// future. (We really only need this to properly emit the 'networkIdle' and
// 'networkAlmostIdle' Page.lifecycleEvents in CDP.)
intercepted: usize,

// Our easy handles, managed by a curl multi.
handles: Handles,

// Used to generate the next request ID
next_request_id: u32 = 0,

// When handles has no more available easys, requests get queued.
queue: TransferQueue,

// The main app allocator
allocator: Allocator,

// Reference to the App-owned RobotStore.
robot_store: *RobotStore,

// Queue of requests that depend on a robots.txt.
// Allows us to fetch the robots.txt just once.
pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty,

// Once we have a handle/easy to process a request with, we create a Transfer
// which contains the Request as well as any state we need to process the
// request. These will come and go with each request.
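// (As an aside: std.heap.MemoryPool keeps a free list of destroyed items, so
// these per-request Transfer allocations are typically recycled rather than
// hitting the underlying allocator each time.)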
transfer_pool: std.heap.MemoryPool(Transfer),

// Only needed for CDP, which can change the proxy and then restore it. When
// restoring, this originally-configured value is what it goes back to.
http_proxy: ?[:0]const u8 = null,

// Tracks whether the client uses a proxy for connections.
// We can't use http_proxy because we also want to track proxies configured
// via CDP.
use_proxy: bool,

config: *const Config,

cdp_client: ?CDPClient = null,

// libcurl can monitor arbitrary sockets; this lets us use libcurl to poll
// both HTTP data as well as messages from a CDP connection.
// Furthermore, we have some tension between blocking scripts and request
// interception. For non-blocking scripts, because nothing blocks, we can
// just queue the scripts until we receive a response to the interception
// notification. But for blocking scripts (which block the parser), it's hard
// to return control back to the CDP loop. So the `read` function pointer is
// used by the Client to have the CDP client read more data from the socket,
// specifically when we're waiting for a request interception response to
// a blocking script.
pub const CDPClient = struct {
    socket: posix.socket_t,
    ctx: *anyopaque,
    blocking_read_start: *const fn (*anyopaque) bool,
    blocking_read: *const fn (*anyopaque) bool,
    blocking_read_end: *const fn (*anyopaque) bool,
};

const TransferQueue = std.DoublyLinkedList;

pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, robot_store: *RobotStore, config: *const Config) !*Client {
    var transfer_pool = std.heap.MemoryPool(Transfer).init(allocator);
    errdefer transfer_pool.deinit();

    const client = try allocator.create(Client);
    errdefer allocator.destroy(client);

    var handles = try Handles.init(allocator, ca_blob, config);
    errdefer handles.deinit(allocator);

    // Set transfer callbacks on each connection.
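    // (These presumably map to libcurl's CURLOPT_HEADERFUNCTION and
    // CURLOPT_WRITEFUNCTION; see Transfer.headerCallback and
    // Transfer.dataCallback below.)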
    for (handles.connections) |*conn| {
        try conn.setCallbacks(Transfer.headerCallback, Transfer.dataCallback);
    }

    const http_proxy = config.httpProxy();

    client.* = .{
        .queue = .{},
        .active = 0,
        .intercepted = 0,
        .handles = handles,
        .allocator = allocator,
        .robot_store = robot_store,
        .http_proxy = http_proxy,
        .use_proxy = http_proxy != null,
        .config = config,
        .transfer_pool = transfer_pool,
    };
    return client;
}

pub fn deinit(self: *Client) void {
    self.abort();
    self.handles.deinit(self.allocator);
    self.transfer_pool.deinit();

    var robots_iter = self.pending_robots_queue.iterator();
    while (robots_iter.next()) |entry| {
        entry.value_ptr.deinit(self.allocator);
    }
    self.pending_robots_queue.deinit(self.allocator);

    self.allocator.destroy(self);
}

pub fn newHeaders(self: *const Client) !Net.Headers {
    return Net.Headers.init(self.config.http_headers.user_agent_header);
}

pub fn abort(self: *Client) void {
    while (self.handles.in_use.first) |node| {
        const conn: *Net.Connection = @fieldParentPtr("node", node);
        var transfer = Transfer.fromConnection(conn) catch |err| {
            log.err(.http, "get private info", .{ .err = err, .source = "abort" });
            continue;
        };
        transfer.kill();
    }
    if (comptime IS_DEBUG) {
        std.debug.assert(self.active == 0);
    }

    var n = self.queue.first;
    while (n) |node| {
        n = node.next;
        const transfer: *Transfer = @fieldParentPtr("_node", node);
        transfer.kill();
    }
    self.queue = .{};

    if (comptime IS_DEBUG) {
        std.debug.assert(self.handles.in_use.first == null);
        std.debug.assert(self.handles.available.len() == self.handles.connections.len);
        const running = self.handles.perform() catch |err| {
            lp.assert(false, "multi perform in abort", .{ .err = err });
        };
        std.debug.assert(running == 0);
    }
}

pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus {
    while (true) {
        if (self.handles.hasAvailable() == false) {
            break;
        }
        const queue_node = self.queue.popFirst() orelse break;
        const transfer: *Transfer = @fieldParentPtr("_node", queue_node);
        // we know this exists, because we checked hasAvailable() above
        const conn = self.handles.get().?;
        try self.makeRequest(conn, transfer);
    }

    return self.perform(@intCast(timeout_ms));
}

pub fn request(self: *Client, req: Request) !void {
    if (self.config.obeyRobots()) {
        const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
        errdefer self.allocator.free(robots_url);

        // If we have this robots.txt cached, we can take a fast path.
        if (self.robot_store.get(robots_url)) |robot_entry| {
            defer self.allocator.free(robots_url);
            switch (robot_entry) {
                // If we have a found robots entry, we check it.
                .present => |robots| {
                    const path = URL.getPathname(req.url);
                    if (!robots.isAllowed(path)) {
                        req.error_callback(req.ctx, error.RobotsBlocked);
                        return;
                    }
                },
                // Otherwise, the robots.txt is known to be absent, so the
                // request is allowed.
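                // (A quick walk-through of the cache states, matching the
                // code here and below; illustrative only:
                //   .present and allowed     -> processRequest
                //   .present and disallowed  -> error.RobotsBlocked
                //   .absent                  -> processRequest
                //   not cached               -> fetchRobotsThenProcessRequest)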
                .absent => {},
            }
            return self.processRequest(req);
        }
        return self.fetchRobotsThenProcessRequest(robots_url, req);
    }
    return self.processRequest(req);
}

fn processRequest(self: *Client, req: Request) !void {
    const transfer = try self.makeTransfer(req);
    transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer });

    var wait_for_interception = false;
    transfer.req.notification.dispatch(.http_request_intercept, &.{
        .transfer = transfer,
        .wait_for_interception = &wait_for_interception,
    });

    if (wait_for_interception == false) {
        // request not intercepted, process it normally
        return self.process(transfer);
    }

    self.intercepted += 1;
    if (comptime IS_DEBUG) {
        log.debug(.http, "wait for interception", .{ .intercepted = self.intercepted });
    }
    transfer._intercept_state = .pending;

    if (req.blocking == false) {
        // The request was intercepted, but it isn't a blocking request, so we
        // don't need to block this call. The request will be unblocked
        // asynchronously via either continueTransfer or abortTransfer.
        return;
    }

    if (try self.waitForInterceptedResponse(transfer)) {
        return self.process(transfer);
    }
}

const RobotsRequestContext = struct {
    client: *Client,
    req: Request,
    robots_url: [:0]const u8,
    buffer: std.ArrayList(u8),
    status: u16 = 0,

    pub fn deinit(self: *RobotsRequestContext) void {
        self.client.allocator.free(self.robots_url);
        self.buffer.deinit(self.client.allocator);
        self.client.allocator.destroy(self);
    }
};

fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
    const entry = try self.pending_robots_queue.getOrPut(self.allocator, robots_url);
    if (!entry.found_existing) {
        errdefer self.allocator.free(robots_url);

        // If we aren't already fetching this robots.txt, we want to create a
        // new queue for it and add this request into it.
        entry.value_ptr.* = .empty;

        const ctx = try self.allocator.create(RobotsRequestContext);
        errdefer self.allocator.destroy(ctx);
        ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };

        const headers = try self.newHeaders();
        log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
        try self.processRequest(.{
            .ctx = ctx,
            .url = robots_url,
            .method = .GET,
            .headers = headers,
            .blocking = false,
            .page_id = req.page_id,
            .cookie_jar = req.cookie_jar,
            .notification = req.notification,
            .resource_type = .fetch,
            .header_callback = robotsHeaderCallback,
            .data_callback = robotsDataCallback,
            .done_callback = robotsDoneCallback,
            .error_callback = robotsErrorCallback,
            .shutdown_callback = robotsShutdownCallback,
        });
    } else {
        // We're not using our own copy of the robots URL, only the one from
        // the first request.
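        // (For example, queued requests to https://example.com/a and
        // https://example.com/b share the single in-flight fetch of
        // https://example.com/robots.txt; URLs illustrative.)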
        self.allocator.free(robots_url);
    }

    try entry.value_ptr.append(self.allocator, req);
}

fn robotsHeaderCallback(transfer: *Transfer) !bool {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
    if (transfer.response_header) |hdr| {
        log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = ctx.robots_url });
        ctx.status = hdr.status;
    }
    if (transfer.getContentLength()) |cl| {
        try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl);
    }
    return true;
}

fn robotsDataCallback(transfer: *Transfer, data: []const u8) !void {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
    try ctx.buffer.appendSlice(ctx.client.allocator, data);
}

fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
    defer ctx.deinit();

    var allowed = true;
    switch (ctx.status) {
        200 => {
            if (ctx.buffer.items.len > 0) {
                const robots: ?Robots = ctx.client.robot_store.robotsFromBytes(
                    ctx.client.config.http_headers.user_agent,
                    ctx.buffer.items,
                ) catch blk: {
                    log.warn(.browser, "failed to parse robots", .{ .robots_url = ctx.robots_url });
                    // If we fail to parse, we just insert it as absent and ignore.
                    try ctx.client.robot_store.putAbsent(ctx.robots_url);
                    break :blk null;
                };
                if (robots) |r| {
                    try ctx.client.robot_store.put(ctx.robots_url, r);
                    const path = URL.getPathname(ctx.req.url);
                    allowed = r.isAllowed(path);
                }
            }
        },
        404 => {
            log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
            // If we get a 404, we just insert it as absent.
            try ctx.client.robot_store.putAbsent(ctx.robots_url);
        },
        else => {
            log.debug(.http, "unexpected status on robots", .{ .url = ctx.robots_url, .status = ctx.status });
            // If we get an unexpected status, we just insert it as absent.
            try ctx.client.robot_store.putAbsent(ctx.robots_url);
        },
    }

    var queued = ctx.client.pending_robots_queue.fetchRemove(
        ctx.robots_url,
    ) orelse @panic("Client.robotsDoneCallback empty queue");
    defer queued.value.deinit(ctx.client.allocator);

    for (queued.value.items) |queued_req| {
        if (!allowed) {
            log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
            queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
        } else {
            ctx.client.processRequest(queued_req) catch |e| {
                queued_req.error_callback(queued_req.ctx, e);
            };
        }
    }
}

fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
    defer ctx.deinit();
    log.warn(.http, "robots fetch failed", .{ .err = err });

    var queued = ctx.client.pending_robots_queue.fetchRemove(
        ctx.robots_url,
    ) orelse @panic("Client.robotsErrorCallback empty queue");
    defer queued.value.deinit(ctx.client.allocator);

    // On error, allow all queued requests to proceed
    for (queued.value.items) |queued_req| {
        ctx.client.processRequest(queued_req) catch |e| {
            queued_req.error_callback(queued_req.ctx, e);
        };
    }
}

fn robotsShutdownCallback(ctx_ptr: *anyopaque) void {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
    defer ctx.deinit();
    log.debug(.http, "robots fetch shutdown", .{});

    var queued = ctx.client.pending_robots_queue.fetchRemove(
        ctx.robots_url,
    ) orelse @panic("Client.robotsShutdownCallback empty queue");
    defer queued.value.deinit(ctx.client.allocator);

    for (queued.value.items) |queued_req| {
        if (queued_req.shutdown_callback) |shutdown_cb| {
            shutdown_cb(queued_req.ctx);
        }
    }
}

fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
    // The request was intercepted and is blocking. This is messy, but our
    // callers, the ScriptManager -> Page, don't have a great way to stop the
    // parser and return control to the CDP server to wait for the interception
    // response. We have some information on the CDPClient, so we'll do the
    // blocking here. (This is a bit of a legacy thing. Initially the Client
    // had an 'extra_socket' that it could monitor. It was named 'extra_socket'
    // to appear generic, but really, that 'extra_socket' was always the CDP
    // socket. Because we already had the 'extra_socket' here, it was easier to
    // make it even more CDP-aware and turn `extra_socket: socket_t` into the
    // current CDPClient and do the blocking here.)
    const cdp_client = self.cdp_client.?;
    const ctx = cdp_client.ctx;
    if (cdp_client.blocking_read_start(ctx) == false) {
        return error.BlockingInterceptFailure;
    }
    defer _ = cdp_client.blocking_read_end(ctx);

    while (true) {
        if (cdp_client.blocking_read(ctx) == false) {
            return error.BlockingInterceptFailure;
        }
        switch (transfer._intercept_state) {
            .pending => continue, // keep waiting
            .@"continue" => return true,
            .abort => |err| {
                transfer.abort(err);
                return false;
            },
            .fulfilled => {
                // callbacks already called, just need to clean up
                transfer.deinit();
                return false;
            },
            .not_intercepted => unreachable,
        }
    }
}

// Above, `request` will not process a transfer if there's an interception
// request. In such cases, the interceptor is expected to call continueTransfer
// to continue the transfer, or abortTransfer to abort it.
fn process(self: *Client, transfer: *Transfer) !void {
    if (self.handles.get()) |conn| {
        return self.makeRequest(conn, transfer);
    }
    self.queue.append(&transfer._node);
}

// For an intercepted request
pub fn continueTransfer(self: *Client, transfer: *Transfer) !void {
    if (comptime IS_DEBUG) {
        std.debug.assert(transfer._intercept_state != .not_intercepted);
        log.debug(.http, "continue transfer", .{ .intercepted = self.intercepted });
    }
    self.intercepted -= 1;
    if (!transfer.req.blocking) {
        return self.process(transfer);
    }
    transfer._intercept_state = .@"continue";
}

// For an intercepted request
pub fn abortTransfer(self: *Client, transfer: *Transfer) void {
    if (comptime IS_DEBUG) {
        std.debug.assert(transfer._intercept_state != .not_intercepted);
        log.debug(.http, "abort transfer", .{ .intercepted = self.intercepted });
    }
    self.intercepted -= 1;
    if (!transfer.req.blocking) {
        // abort deinits the transfer, so we must not touch it afterwards
        transfer.abort(error.Abort);
        return;
    }
    transfer._intercept_state = .{ .abort = error.Abort };
}

// For an intercepted request
pub fn fulfillTransfer(self: *Client, transfer: *Transfer, status: u16, headers: []const Net.Header, body: ?[]const u8) !void {
    if (comptime IS_DEBUG) {
        std.debug.assert(transfer._intercept_state != .not_intercepted);
        log.debug(.http, "fulfill transfer", .{ .intercepted = self.intercepted });
    }
    self.intercepted -= 1;
    try transfer.fulfill(status, headers, body);
    if (!transfer.req.blocking) {
        transfer.deinit();
        return;
    }
    transfer._intercept_state = .fulfilled;
}

pub fn nextReqId(self: *Client) u32 {
    return self.next_request_id +% 1;
}

pub fn incrReqId(self: *Client) u32 {
    const id = self.next_request_id +% 1;
    self.next_request_id = id;
    return id;
}

fn makeTransfer(self: *Client, req: Request) !*Transfer {
    errdefer req.headers.deinit();

    const transfer = try self.transfer_pool.create();
    errdefer self.transfer_pool.destroy(transfer);

    const id = self.incrReqId();
    transfer.* = .{
        .arena = ArenaAllocator.init(self.allocator),
        .id = id,
        .url = req.url,
        .req = req,
        .ctx = req.ctx,
        .client = self,
        .max_response_size = self.config.httpMaxResponseSize(),
    };
    return transfer;
}

fn requestFailed(transfer: *Transfer, err: anyerror, comptime execute_callback: bool) void {
    if (transfer._notified_fail) {
        // We can force a failed request within a callback, which will eventually
        // result in this being called again in the more general loop. We do this
        // because we can raise a more specific error inside a callback in some cases.
        return;
    }
    transfer._notified_fail = true;
    transfer.req.notification.dispatch(.http_request_fail, &.{
        .transfer = transfer,
        .err = err,
    });
    if (execute_callback) {
        transfer.req.error_callback(transfer.ctx, err);
    } else if (transfer.req.shutdown_callback) |cb| {
        cb(transfer.ctx);
    }
}

// Restrictive since it'll only work if there are no inflight requests. In some
// cases, the libcurl documentation is clear that changing settings while a
// connection is inflight is undefined. It doesn't say anything about CURLOPT_PROXY,
// but better to be safe than sorry.
// For now, this restriction is ok, since it's only called by CDP on
// createBrowserContext, at which point, if we do have an active connection,
// that's probably a bug (a previous abort failed?). But if we need to call this
// at any point in time, it could be worth digging into libcurl to see if this
// can be changed at any point in the easy's lifecycle.
pub fn changeProxy(self: *Client, proxy: [:0]const u8) !void {
    try self.ensureNoActiveConnection();
    for (self.handles.connections) |*conn| {
        try conn.setProxy(proxy.ptr);
    }
    self.use_proxy = true;
}

// Same restriction as changeProxy. Should be ok since this is only called on
// BrowserContext deinit.
pub fn restoreOriginalProxy(self: *Client) !void {
    try self.ensureNoActiveConnection();
    const proxy = if (self.http_proxy) |p| p.ptr else null;
    for (self.handles.connections) |*conn| {
        try conn.setProxy(proxy);
    }
    self.use_proxy = proxy != null;
}

// Enable TLS verification on all connections.
pub fn enableTlsVerify(self: *const Client) !void {
    // We removed the inflight-connections check when enabling TLS b/c
    // chromiumoxide calls the command during navigate and curl seems to
    // accept it...
    for (self.handles.connections) |*conn| {
        try conn.setTlsVerify(true, self.use_proxy);
    }
}

// Disable TLS verification on all connections.
pub fn disableTlsVerify(self: *const Client) !void {
    // We removed the inflight-connections check when disabling TLS b/c
    // chromiumoxide calls the command during navigate and curl seems to
    // accept it...
    for (self.handles.connections) |*conn| {
        try conn.setTlsVerify(false, self.use_proxy);
    }
}

fn makeRequest(self: *Client, conn: *Net.Connection, transfer: *Transfer) anyerror!void {
    const req = &transfer.req;

    {
        transfer._conn = conn;
        errdefer transfer.deinit();

        try conn.setURL(req.url);
        try conn.setMethod(req.method);
        if (req.body) |b| {
            try conn.setBody(b);
        } else {
            try conn.setGetMode();
        }

        var header_list = req.headers;
        // Add headers that must be hidden from intercepts.
        try conn.secretHeaders(&header_list, &self.config.http_headers);
        try conn.setHeaders(&header_list);

        // Add cookies.
        if (header_list.cookies) |cookies| {
            try conn.setCookies(cookies);
        }

        try conn.setPrivate(transfer);

        // Add credentials.
        if (req.credentials) |creds| {
            try conn.setProxyCredentials(creds);
        }
    }

    // As soon as this is called, our "perform" loop is responsible for
    // cleaning things up. That's why the above code is in a block. If anything
    // fails BEFORE `curl_multi_add_handle` succeeds, then we still need to do
    // the cleanup. But if things fail after `curl_multi_add_handle`, we expect
    // perform to pick up the failure and clean up.
    try self.handles.add(conn);

    if (req.start_callback) |cb| {
        cb(transfer) catch |err| {
            self.handles.remove(conn);
            transfer._conn = null;
            transfer.deinit();
            return err;
        };
    }
    self.active += 1;
    _ = try self.perform(0);
}

pub const PerformStatus = enum {
    cdp_socket,
    normal,
};

fn perform(self: *Client, timeout_ms: c_int) !PerformStatus {
    const running = try self.handles.perform();

    // We're potentially going to block for a while until we get data. Process
    // whatever messages we have waiting ahead of time.
    if (try self.processMessages()) {
        return .normal;
    }

    var status = PerformStatus.normal;
    if (self.cdp_client) |cdp_client| {
        var wait_fds = [_]c.curl_waitfd{.{
            .fd = cdp_client.socket,
            .events = c.CURL_WAIT_POLLIN,
            .revents = 0,
        }};
        try self.handles.poll(&wait_fds, timeout_ms);
        if (wait_fds[0].revents != 0) {
            status = .cdp_socket;
        }
    } else if (running > 0) {
        try self.handles.poll(&.{}, timeout_ms);
    }

    _ = try self.processMessages();
    return status;
}

fn processMessages(self: *Client) !bool {
    var processed = false;
    while (self.handles.readMessage()) |msg| {
        const transfer = try Transfer.fromConnection(&msg.conn);

        // In case of an auth challenge.
        // TODO: give a way to configure the number of auth retries.
        if (transfer._auth_challenge != null and transfer._tries < 10) {
            var wait_for_interception = false;
            transfer.req.notification.dispatch(.http_request_auth_required, &.{
                .transfer = transfer,
                .wait_for_interception = &wait_for_interception,
            });
            if (wait_for_interception) {
                self.intercepted += 1;
                if (comptime IS_DEBUG) {
                    log.debug(.http, "wait for auth interception", .{ .intercepted = self.intercepted });
                }
                transfer._intercept_state = .pending;

                // Whether or not this is a blocking request, we're not going
                // to process it now. We can end the transfer, which will
                // release the easy handle back into the pool. The transfer
                // is still valid/alive (it just has no handle).
                self.endTransfer(transfer);

                if (!transfer.req.blocking) {
                    // In the case of an async request, we can just "forget"
                    // about this transfer until it gets updated asynchronously
                    // from some CDP command.
                    continue;
                }

                // In the case of a sync request, we need to block until we
                // get the CDP command for handling this case.
                if (try self.waitForInterceptedResponse(transfer)) {
                    // We've been asked to continue with the request. We can't
                    // process it here, since we're already inside a process,
                    // so we need to queue it and wait for the next tick. (This
                    // is why it was safe to endTransfer above: even in the
                    // "blocking" path, we still only process it on the next
                    // tick.)
                    self.queue.append(&transfer._node);
                } else {
                    // aborted, already cleaned up
                }
                continue;
            }
        }

        // Release it ASAP so that it's available; some done_callbacks
        // will load more resources.
        self.endTransfer(transfer);
        defer transfer.deinit();

        if (msg.err) |err| {
            requestFailed(transfer, err, true);
        } else blk: {
            // In the case of a request w/o data, we need to call the header
            // done callback now.
            if (!transfer._header_done_called) {
                const proceed = transfer.headerDoneCallback(&msg.conn) catch |err| {
                    log.err(.http, "header_done_callback", .{ .err = err });
                    requestFailed(transfer, err, true);
                    continue;
                };
                if (!proceed) {
                    requestFailed(transfer, error.Abort, true);
                    break :blk;
                }
            }
            transfer.req.done_callback(transfer.ctx) catch |err| {
                // transfer isn't valid at this point, don't use it.
log.err(.http, "done_callback", .{ .err = err }); requestFailed(transfer, err, true); continue; }; transfer.req.notification.dispatch(.http_request_done, &.{ .transfer = transfer, }); processed = true; } } return processed; } fn endTransfer(self: *Client, transfer: *Transfer) void { const conn = transfer._conn.?; self.handles.remove(conn); transfer._conn = null; self.active -= 1; } fn ensureNoActiveConnection(self: *const Client) !void { if (self.active > 0) { return error.InflightConnection; } } const Handles = Net.Handles; pub const RequestCookie = struct { is_http: bool, jar: *CookieJar, is_navigation: bool, origin: [:0]const u8, pub fn headersForRequest(self: *const RequestCookie, temp: Allocator, url: [:0]const u8, headers: *Net.Headers) !void { var arr: std.ArrayList(u8) = .{}; try self.jar.forRequest(url, arr.writer(temp), .{ .is_http = self.is_http, .is_navigation = self.is_navigation, .origin_url = self.origin, }); if (arr.items.len > 0) { try arr.append(temp, 0); //null terminate headers.cookies = @ptrCast(arr.items.ptr); } } }; pub const Request = struct { page_id: u32, method: Method, url: [:0]const u8, headers: Net.Headers, body: ?[]const u8 = null, cookie_jar: ?*CookieJar, resource_type: ResourceType, credentials: ?[:0]const u8 = null, notification: *Notification, max_response_size: ?usize = null, // This is only relevant for intercepted requests. If a request is flagged // as blocking AND is intercepted, then it'll be up to us to wait until // we receive a response to the interception. This probably isn't ideal, // but it's harder for our caller (ScriptManager) to deal with this. One // reason for that is the Http Client is already a bit CDP-aware. blocking: bool = false, // arbitrary data that can be associated with this request ctx: *anyopaque = undefined, start_callback: ?*const fn (transfer: *Transfer) anyerror!void = null, header_callback: *const fn (transfer: *Transfer) anyerror!bool, data_callback: *const fn (transfer: *Transfer, data: []const u8) anyerror!void, done_callback: *const fn (ctx: *anyopaque) anyerror!void, error_callback: *const fn (ctx: *anyopaque, err: anyerror) void, shutdown_callback: ?*const fn (ctx: *anyopaque) void = null, const ResourceType = enum { document, xhr, script, fetch, // Allowed Values: Document, Stylesheet, Image, Media, Font, Script, // TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, // SignedExchange, Ping, CSPViolationReport, Preflight, FedCM, Other // https://chromedevtools.github.io/devtools-protocol/tot/Network/#type-ResourceType pub fn string(self: ResourceType) []const u8 { return switch (self) { .document => "Document", .xhr => "XHR", .script => "Script", .fetch => "Fetch", }; } }; }; const AuthChallenge = Net.AuthChallenge; pub const Transfer = struct { arena: ArenaAllocator, id: u32 = 0, req: Request, url: [:0]const u8, ctx: *anyopaque, // copied from req.ctx to make it easier for callback handlers client: *Client, // total bytes received in the response, including the response status line, // the headers, and the [encoded] body. bytes_received: usize = 0, aborted: bool = false, max_response_size: ?usize = null, // We'll store the response header here response_header: ?ResponseHead = null, // track if the header callbacks done have been called. _header_done_called: bool = false, _notified_fail: bool = false, _conn: ?*Net.Connection = null, _redirecting: bool = false, _auth_challenge: ?AuthChallenge = null, // number of times the transfer has been tried. // incremented by reset func. 
    _tries: u8 = 0,

    // for when a Transfer is queued in the client.queue
    _node: std.DoublyLinkedList.Node = .{},

    _intercept_state: InterceptState = .not_intercepted,

    const InterceptState = union(enum) {
        not_intercepted,
        pending,
        @"continue",
        abort: anyerror,
        fulfilled,
    };

    pub fn reset(self: *Transfer) void {
        // There's an assertion in ScriptManager that's failing, seemingly because
        // the headerCallback is being called multiple times. This shouldn't be
        // possible (hence the assertion). Previously, this `reset` would set
        // _header_done_called = false. That could have been how headerCallback
        // was called multiple times (because _header_done_called is the guard
        // against that, so resetting it would allow a 2nd call to headerCallback).
        // But it should also be impossible for this to be true. So, I've added
        // this assertion to try to narrow down what's going on.
        lp.assert(self._header_done_called == false, "Transfer.reset header_done_called", .{});

        self._redirecting = false;
        self._auth_challenge = null;
        self._notified_fail = false;
        self.response_header = null;
        self.bytes_received = 0;
        self._tries += 1;
    }

    fn deinit(self: *Transfer) void {
        self.req.headers.deinit();
        if (self._conn) |conn| {
            self.client.handles.remove(conn);
        }
        self.arena.deinit();
        self.client.transfer_pool.destroy(self);
    }

    fn buildResponseHeader(self: *Transfer, conn: *const Net.Connection) !void {
        if (comptime IS_DEBUG) {
            std.debug.assert(self.response_header == null);
        }

        const url = try conn.getEffectiveUrl();
        const status: u16 = if (self._auth_challenge != null) 407 else try conn.getResponseCode();

        self.response_header = .{
            .url = url,
            .status = status,
            .redirect_count = try conn.getRedirectCount(),
        };

        if (conn.getResponseHeader("content-type", 0)) |ct| {
            var hdr = &self.response_header.?;
            const value = ct.value;
            const len = @min(value.len, ResponseHead.MAX_CONTENT_TYPE_LEN);
            hdr._content_type_len = len;
            @memcpy(hdr._content_type[0..len], value[0..len]);
        }
    }

    pub fn format(self: *Transfer, writer: *std.Io.Writer) !void {
        const req = self.req;
        return writer.print("{s} {s}", .{ @tagName(req.method), req.url });
    }

    pub fn updateURL(self: *Transfer, url: [:0]const u8) !void {
        // for cookies
        self.url = url;
        // for the request itself
        self.req.url = url;
    }

    pub fn updateCredentials(self: *Transfer, userpwd: [:0]const u8) void {
        self.req.credentials = userpwd;
    }

    pub fn replaceRequestHeaders(self: *Transfer, allocator: Allocator, headers: []const Net.Header) !void {
        self.req.headers.deinit();

        var buf: std.ArrayList(u8) = .empty;
        var new_headers = try self.client.newHeaders();
        for (headers) |hdr| {
            // It's safe to re-use this buffer in Headers.add, because curl
            // copies the value we pass into curl_slist_append.
            defer buf.clearRetainingCapacity();
            try std.fmt.format(buf.writer(allocator), "{s}: {s}", .{ hdr.name, hdr.value });
            try buf.append(allocator, 0); // null terminated
            try new_headers.add(buf.items[0 .. buf.items.len - 1 :0]);
        }
        self.req.headers = new_headers;
    }

    pub fn abort(self: *Transfer, err: anyerror) void {
        requestFailed(self, err, true);

        if (self._conn == null) {
            self.deinit();
            return;
        }

        const client = self.client;
        if (client.handles.performing) {
            // We're currently in a curl_multi_perform. We cannot call endTransfer
            // as that calls curl_multi_remove_handle, and you can't do that
            // from a curl callback. Instead, we flag this transfer and all of
            // our callbacks will check for this flag and abort the transfer
            // for us.
            self.aborted = true;
            return;
        }

        if (self._conn != null) {
            client.endTransfer(self);
        }
        self.deinit();
    }

    pub fn terminate(self: *Transfer) void {
        requestFailed(self, error.Shutdown, false);
        if (self._conn != null) {
            self.client.endTransfer(self);
        }
        self.deinit();
    }

    // Internal, for when the page is shutting down. Doesn't have the same
    // ceremony as abort (doesn't send a notification, doesn't invoke an
    // error callback).
    fn kill(self: *Transfer) void {
        if (self._conn != null) {
            self.client.endTransfer(self);
        }
        if (self.req.shutdown_callback) |cb| {
            cb(self.ctx);
        }
        self.deinit();
    }

    // abortAuthChallenge is called when an auth challenge interception is
    // aborted. We don't call self.client.endTransfer here b/c it was already
    // done before the interception process.
    pub fn abortAuthChallenge(self: *Transfer) void {
        if (comptime IS_DEBUG) {
            std.debug.assert(self._intercept_state != .not_intercepted);
            log.debug(.http, "abort auth transfer", .{ .intercepted = self.client.intercepted });
        }
        self.client.intercepted -= 1;
        if (!self.req.blocking) {
            self.abort(error.AbortAuthChallenge);
            return;
        }
        self._intercept_state = .{ .abort = error.AbortAuthChallenge };
    }

    // redirectionCookies manages cookies during redirections handled by curl.
    // It sets the cookies from the current response into the cookie jar.
    // It also immediately sets cookies for the following request.
    fn redirectionCookies(transfer: *Transfer, conn: *const Net.Connection) !void {
        const req = &transfer.req;
        const arena = transfer.arena.allocator();

        // retrieve cookies from the redirect's response.
        if (req.cookie_jar) |jar| {
            var i: usize = 0;
            while (true) {
                const ct = conn.getResponseHeader("set-cookie", i);
                if (ct == null) break;
                try jar.populateFromResponse(transfer.url, ct.?.value);
                i += 1;
                if (i >= ct.?.amount) break;
            }
        }

        // set cookies for the following redirection's request.
        const location = conn.getResponseHeader("location", 0) orelse {
            return error.LocationNotFound;
        };
        const base_url = try conn.getEffectiveUrl();
        const url = try URL.resolve(arena, std.mem.span(base_url), location.value, .{});
        transfer.url = url;

        if (req.cookie_jar) |jar| {
            var cookies: std.ArrayList(u8) = .{};
            try jar.forRequest(url, cookies.writer(arena), .{
                .is_http = true,
                .origin_url = url, // used to enforce samesite cookie rules
                .is_navigation = req.resource_type == .document,
            });
            try cookies.append(arena, 0); // null terminate
            try conn.setCookies(@ptrCast(cookies.items.ptr));
        }
    }

    // headerDoneCallback is called once the headers have been read.
    // It can be called either from dataCallback or, for responses without a
    // body, once the request completes.
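    // Rough callback ordering for a typical 200 response (illustrative):
    //   headerCallback (once per header line)
    //     -> headerDoneCallback (once, on the first body chunk, or at
    //        completion when there is no body)
    //     -> dataCallback (once per body chunk)
    //     -> Request.done_callback (from the client's processMessages)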
    fn headerDoneCallback(transfer: *Transfer, conn: *const Net.Connection) !bool {
        lp.assert(transfer._header_done_called == false, "Transfer.headerDoneCallback", .{});
        defer transfer._header_done_called = true;

        try transfer.buildResponseHeader(conn);

        if (conn.getResponseHeader("content-type", 0)) |ct| {
            var hdr = &transfer.response_header.?;
            const value = ct.value;
            const len = @min(value.len, ResponseHead.MAX_CONTENT_TYPE_LEN);
            hdr._content_type_len = len;
            @memcpy(hdr._content_type[0..len], value[0..len]);
        }

        if (transfer.req.cookie_jar) |jar| {
            var i: usize = 0;
            while (true) {
                const ct = conn.getResponseHeader("set-cookie", i);
                if (ct == null) break;
                jar.populateFromResponse(transfer.url, ct.?.value) catch |err| {
                    log.err(.http, "set cookie", .{ .err = err, .req = transfer });
                    return err;
                };
                i += 1;
                if (i >= ct.?.amount) break;
            }
        }

        if (transfer.max_response_size) |max_size| {
            if (transfer.getContentLength()) |cl| {
                if (cl > max_size) {
                    return error.ResponseTooLarge;
                }
            }
        }

        const proceed = transfer.req.header_callback(transfer) catch |err| {
            log.err(.http, "header_callback", .{ .err = err, .req = transfer });
            return err;
        };

        transfer.req.notification.dispatch(.http_response_header_done, &.{
            .transfer = transfer,
        });
        return proceed and transfer.aborted == false;
    }

    // headerCallback is called by curl on each of the request's header lines.
    fn headerCallback(buffer: [*]const u8, header_count: usize, buf_len: usize, data: *anyopaque) callconv(.c) usize {
        // libcurl should only ever emit 1 header at a time
        if (comptime IS_DEBUG) {
            std.debug.assert(header_count == 1);
        }

        const conn: Net.Connection = .{ .easy = @ptrCast(@alignCast(data)) };
        var transfer = fromConnection(&conn) catch |err| {
            log.err(.http, "get private info", .{ .err = err, .source = "header callback" });
            return 0;
        };

        if (comptime IS_DEBUG) {
            // curl will allow header lines that end with either \r\n or just \n
            std.debug.assert(buffer[buf_len - 1] == '\n');
        }

        if (buf_len < 3) {
            // could be \r\n or \n.
            return buf_len;
        }

        var header_len = buf_len - 2;
        if (buffer[buf_len - 2] != '\r') {
            // curl supports headers that end with either \r\n or just \n
            header_len = buf_len - 1;
        }
        const header = buffer[0..header_len];

        // We need to parse the first header line of each request b/c curl's
        // CURLINFO_RESPONSE_CODE returns the status code of the final request.
        // If a redirection or a proxy's forbidden CONNECT happens, we won't
        // otherwise get this intermediary status code.
        if (std.mem.startsWith(u8, header, "HTTP/")) {
            // This is the first header line, i.e. the status line.
            if (buf_len < 13) {
                if (comptime IS_DEBUG) {
                    log.debug(.http, "invalid response line", .{ .line = header });
                }
                return 0;
            }

            const version_start: usize = if (header[5] == '2') 7 else 9;
            const version_end = version_start + 3;

            // A bit silly, but it makes sure that we don't change the length
            // check above in a way that could break this.
            if (comptime IS_DEBUG) {
                std.debug.assert(version_end < 13);
            }

            const status = std.fmt.parseInt(u16, header[version_start..version_end], 10) catch {
                if (comptime IS_DEBUG) {
                    log.debug(.http, "invalid status code", .{ .line = header });
                }
                return 0;
            };

            if (status >= 300 and status <= 399) {
                transfer._redirecting = true;
                return buf_len;
            }
            transfer._redirecting = false;

            if (status == 401 or status == 407) {
                // The auth challenge must be parsed from a following
                // WWW-Authenticate or Proxy-Authenticate header.
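                // (e.g. a 401 response might carry a line like
                // `WWW-Authenticate: Basic realm="staging"`; value illustrative.)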
                transfer._auth_challenge = .{
                    .status = status,
                    .source = undefined,
                    .scheme = undefined,
                    .realm = undefined,
                };
                return buf_len;
            }
            transfer._auth_challenge = null;
            transfer.bytes_received = buf_len;
            return buf_len;
        }

        if (transfer._redirecting == false and transfer._auth_challenge == null) {
            transfer.bytes_received += buf_len;
        }

        if (buf_len > 2) {
            if (transfer._auth_challenge != null) {
                // try to parse the auth challenge.
                if (std.ascii.startsWithIgnoreCase(header, "WWW-Authenticate") or
                    std.ascii.startsWithIgnoreCase(header, "Proxy-Authenticate"))
                {
                    const ac = AuthChallenge.parse(
                        transfer._auth_challenge.?.status,
                        header,
                    ) catch |err| {
                        // We can't parse the auth challenge.
                        log.err(.http, "parse auth challenge", .{ .err = err, .header = header });
                        // Should we cancel the request? I don't think so.
                        return buf_len;
                    };
                    transfer._auth_challenge = ac;
                }
            }
            return buf_len;
        }

        // If we get here, this is the last header line.
        if (transfer._redirecting) {
            // parse and set cookies for the redirection.
            redirectionCookies(transfer, &conn) catch |err| {
                if (comptime IS_DEBUG) {
                    log.debug(.http, "redirection cookies", .{ .err = err });
                }
                return 0;
            };
            return buf_len;
        }

        return buf_len;
    }

    fn dataCallback(buffer: [*]const u8, chunk_count: usize, chunk_len: usize, data: *anyopaque) callconv(.c) isize {
        // libcurl should only ever emit 1 chunk at a time
        if (comptime IS_DEBUG) {
            std.debug.assert(chunk_count == 1);
        }

        const conn: Net.Connection = .{ .easy = @ptrCast(@alignCast(data)) };
        var transfer = fromConnection(&conn) catch |err| {
            log.err(.http, "get private info", .{ .err = err, .source = "body callback" });
            return c.CURL_WRITEFUNC_ERROR;
        };

        if (transfer._redirecting or transfer._auth_challenge != null) {
            return @intCast(chunk_len);
        }

        if (!transfer._header_done_called) {
            const proceed = transfer.headerDoneCallback(&conn) catch |err| {
                log.err(.http, "header_done_callback", .{ .err = err, .req = transfer });
                return c.CURL_WRITEFUNC_ERROR;
            };
            if (!proceed) {
                // signal abort to libcurl
                return -1;
            }
        }

        transfer.bytes_received += chunk_len;
        if (transfer.max_response_size) |max_size| {
            if (transfer.bytes_received > max_size) {
                requestFailed(transfer, error.ResponseTooLarge, true);
                return -1;
            }
        }

        const chunk = buffer[0..chunk_len];
        transfer.req.data_callback(transfer, chunk) catch |err| {
            log.err(.http, "data_callback", .{ .err = err, .req = transfer });
            return c.CURL_WRITEFUNC_ERROR;
        };

        transfer.req.notification.dispatch(.http_response_data, &.{
            .data = chunk,
            .transfer = transfer,
        });

        if (transfer.aborted) {
            return -1;
        }
        return @intCast(chunk_len);
    }

    pub fn responseHeaderIterator(self: *Transfer) HeaderIterator {
        if (self._conn) |conn| {
            // If we have a connection, then this is a real curl request and we
            // iterate through the headers that curl maintains.
            return .{ .curl = .{ .conn = conn } };
        }
        // If there's no handle, it either means this is being called before
        // the request is even made (which would be a bug in the code), or
        // that a response was injected via transfer.fulfill. The injected
        // headers should be iterated, since there is no handle/easy.
        return .{ .list = .{ .list = self.response_header.?._injected_headers } };
    }

    pub fn fromConnection(conn: *const Net.Connection) !*Transfer {
        const private = try conn.getPrivate();
        return @ptrCast(@alignCast(private));
    }

    pub fn fulfill(transfer: *Transfer, status: u16, headers: []const Net.Header, body: ?[]const u8) !void {
        if (transfer._conn != null) {
            // This should never happen: the request should have been
            // intercepted/paused, and then either continued, aborted or
            // fulfilled once.
            @branchHint(.unlikely);
            return error.RequestInProgress;
        }
        transfer._fulfill(status, headers, body) catch |err| {
            transfer.req.error_callback(transfer.req.ctx, err);
            return err;
        };
    }

    fn _fulfill(transfer: *Transfer, status: u16, headers: []const Net.Header, body: ?[]const u8) !void {
        const req = &transfer.req;
        if (req.start_callback) |cb| {
            try cb(transfer);
        }

        transfer.response_header = .{
            .status = status,
            .url = req.url,
            .redirect_count = 0,
            ._injected_headers = headers,
        };

        for (headers) |hdr| {
            if (std.ascii.eqlIgnoreCase(hdr.name, "content-type")) {
                const len = @min(hdr.value.len, ResponseHead.MAX_CONTENT_TYPE_LEN);
                @memcpy(transfer.response_header.?._content_type[0..len], hdr.value[0..len]);
                transfer.response_header.?._content_type_len = len;
                break;
            }
        }

        lp.assert(transfer._header_done_called == false, "Transfer.fulfill header_done_called", .{});
        if (try req.header_callback(transfer) == false) {
            transfer.abort(error.Abort);
            return;
        }

        if (body) |b| {
            try req.data_callback(transfer, b);
        }
        try req.done_callback(req.ctx);
    }

    // This function should be called during the dataCallback. Calling it later,
    // such as in the doneCallback, is guaranteed to return null.
    pub fn getContentLength(self: *const Transfer) ?u32 {
        const cl = self.getContentLengthRawValue() orelse return null;
        return std.fmt.parseInt(u32, cl, 10) catch null;
    }

    fn getContentLengthRawValue(self: *const Transfer) ?[]const u8 {
        if (self._conn) |conn| {
            // If we have a connection, then this is a normal request. We can
            // get the header value from the connection.
            const cl = conn.getResponseHeader("content-length", 0) orelse return null;
            return cl.value;
        }

        // If we have no handle, then maybe this is being called after the
        // doneCallback. OR, maybe this is a "fulfilled" request. Let's check
        // the injected headers (if we have any).
        const rh = self.response_header orelse return null;
        for (rh._injected_headers) |hdr| {
            if (std.ascii.eqlIgnoreCase(hdr.name, "content-length")) {
                return hdr.value;
            }
        }
        return null;
    }
};
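
// A minimal caller sketch, for reference (hypothetical wiring; the real
// callers live in the browser/ScriptManager code). It assumes `client`,
// `notif`, and `jar` already exist, and uses no-op callbacks:
//
//     fn onHeader(t: *Transfer) !bool {
//         _ = t;
//         return true; // keep going
//     }
//     fn onData(t: *Transfer, data: []const u8) !void {
//         _ = t;
//         _ = data;
//     }
//     fn onDone(ctx: *anyopaque) !void {
//         _ = ctx;
//     }
//     fn onError(ctx: *anyopaque, err: anyerror) void {
//         _ = ctx;
//         _ = err;
//     }
//
//     try client.request(.{
//         .page_id = 1,
//         .method = .GET,
//         .url = "https://example.com/app.js",
//         .headers = try client.newHeaders(),
//         .cookie_jar = jar,
//         .resource_type = .script,
//         .notification = notif,
//         .header_callback = onHeader,
//         .data_callback = onData,
//         .done_callback = onDone,
//         .error_callback = onError,
//     });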