queue requests to run after robots is fetched

Muki Kiboigo
2026-02-04 11:30:27 -08:00
parent f9104c71f6
commit 29ee7d41f5


@@ -87,6 +87,10 @@ queue: TransferQueue,
 // The main app allocator
 allocator: Allocator,

+// Queue of requests that depend on a robots.txt.
+// Allows us to fetch the robots.txt just once.
+pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty,
+
 // Once we have a handle/easy to process a request with, we create a Transfer
 // which contains the Request as well as any state we need to process the
 // request. These wil come and go with each request.
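The new map is the heart of the change: it keys pending requests by robots.txt URL so each robots.txt is fetched only once. Below is a rough, self-contained sketch of that pattern, not the commit's actual code: the `Request` type and URLs are made-up stand-ins, and it assumes the same unmanaged `std.ArrayList` API the diff itself uses.

const std = @import("std");

// Hypothetical stand-in for the client's Request type.
const Request = struct { url: []const u8 };

pub fn main() !void {
    const allocator = std.heap.page_allocator;

    var pending: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty;
    defer {
        // Same cleanup shape as the Client.deinit change in this commit.
        var it = pending.iterator();
        while (it.next()) |entry| entry.value_ptr.deinit(allocator);
        pending.deinit(allocator);
    }

    // Two page requests that both depend on the same robots.txt.
    const urls = [_][]const u8{ "https://example.com/a", "https://example.com/b" };
    for (urls) |url| {
        const entry = try pending.getOrPut(allocator, "https://example.com/robots.txt");
        if (!entry.found_existing) {
            entry.value_ptr.* = .empty;
            // First request for this key: this is where the client
            // would kick off the single robots.txt fetch.
        }
        // Every request, including the first, joins the queue.
        try entry.value_ptr.append(allocator, .{ .url = url });
    }

    // When the fetch completes, the done/error callbacks drain the
    // queue and remove the map entry.
    if (pending.getPtr("https://example.com/robots.txt")) |queued| {
        for (queued.items) |req| std.debug.print("processing {s}\n", .{req.url});
        queued.deinit(allocator);
        _ = pending.remove("https://example.com/robots.txt");
    }
}

The first `getOrPut` for a key reports `found_existing == false`, which is the signal to issue the one fetch; every caller, first or not, appends itself to the queue.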
@@ -165,6 +169,13 @@ pub fn deinit(self: *Client) void {
     _ = c.curl_multi_cleanup(self.multi);
     self.transfer_pool.deinit();
+
+    var robots_iter = self.pending_robots_queue.iterator();
+    while (robots_iter.next()) |entry| {
+        entry.value_ptr.deinit(self.allocator);
+    }
+    self.pending_robots_queue.deinit(self.allocator);
+
     self.allocator.destroy(self);
 }
@@ -254,7 +265,10 @@ fn processRequest(self: *Client, req: Request) !void {
transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer }); transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer });
var wait_for_interception = false; var wait_for_interception = false;
transfer.req.notification.dispatch(.http_request_intercept, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception }); transfer.req.notification.dispatch(.http_request_intercept, &.{
.transfer = transfer,
.wait_for_interception = &wait_for_interception,
});
if (wait_for_interception == false) { if (wait_for_interception == false) {
// request not intercepted, process it normally // request not intercepted, process it normally
return self.process(transfer); return self.process(transfer);
@@ -293,27 +307,36 @@ const RobotsRequestContext = struct {
 };

 fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
-    const ctx = try self.allocator.create(RobotsRequestContext);
-    ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
-    const headers = try self.newHeaders();
+    const entry = try self.pending_robots_queue.getOrPut(self.allocator, robots_url);

-    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
-    try self.processRequest(.{
-        .ctx = ctx,
-        .url = robots_url,
-        .method = .GET,
-        .headers = headers,
-        .blocking = false,
-        .cookie_jar = req.cookie_jar,
-        .notification = req.notification,
-        .robots = req.robots,
-        .resource_type = .fetch,
-        .header_callback = robotsHeaderCallback,
-        .data_callback = robotsDataCallback,
-        .done_callback = robotsDoneCallback,
-        .error_callback = robotsErrorCallback,
-    });
+    if (!entry.found_existing) {
+        // If we aren't already fetching this robots,
+        // we want to create a new queue for it and add this request into it.
+        entry.value_ptr.* = .empty;
+
+        const ctx = try self.allocator.create(RobotsRequestContext);
+        ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
+        const headers = try self.newHeaders();
+
+        log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
+        try self.processRequest(.{
+            .ctx = ctx,
+            .url = robots_url,
+            .method = .GET,
+            .headers = headers,
+            .blocking = false,
+            .cookie_jar = req.cookie_jar,
+            .notification = req.notification,
+            .robots = req.robots,
+            .resource_type = .fetch,
+            .header_callback = robotsHeaderCallback,
+            .data_callback = robotsDataCallback,
+            .done_callback = robotsDoneCallback,
+            .error_callback = robotsErrorCallback,
+        });
+    }
+
+    try entry.value_ptr.append(self.allocator, req);
 }

 fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
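Worth noting in the rewritten fetchRobotsThenProcessRequest: only the first request for a given robots_url allocates a context and issues the fetch, but every request, including that first one, is appended to the per-URL queue. The done and error callbacks below therefore operate purely on the queue rather than on a single stashed request.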
@@ -357,14 +380,22 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
         try ctx.req.robots.putAbsent(ctx.robots_url);
     }

-    if (!allowed) {
-        log.warn(.http, "blocked by robots", .{ .url = ctx.req.url });
-        ctx.req.error_callback(ctx.req.ctx, error.RobotsBlocked);
-        return;
-    }
+    const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable;
+    defer {
+        queued.deinit(ctx.client.allocator);
+        _ = ctx.client.pending_robots_queue.remove(ctx.robots_url);
+    }

-    // Now process the original request
-    try ctx.client.processRequest(ctx.req);
+    for (queued.items) |queued_req| {
+        if (!allowed) {
+            log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
+            queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
+        } else {
+            ctx.client.processRequest(queued_req) catch |e| {
+                queued_req.error_callback(queued_req.ctx, e);
+            };
+        }
+    }
 }

 fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
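Since `defer` runs when robotsDoneCallback returns, the queue is deinitialized and its map entry removed only after the for loop has dispatched every queued request. A failure while processing one request is routed to that request's own error_callback rather than aborting the rest of the queue.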
@@ -373,10 +404,18 @@ fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
     log.warn(.http, "robots fetch failed", .{ .err = err });

-    // On error, allow the request to proceed
-    ctx.client.processRequest(ctx.req) catch |e| {
-        ctx.req.error_callback(ctx.req.ctx, e);
-    };
+    const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable;
+    defer {
+        queued.deinit(ctx.client.allocator);
+        _ = ctx.client.pending_robots_queue.remove(ctx.robots_url);
+    }
+
+    // On error, allow all queued requests to proceed
+    for (queued.items) |queued_req| {
+        ctx.client.processRequest(queued_req) catch |e| {
+            queued_req.error_callback(queued_req.ctx, e);
+        };
+    }
 }

 fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
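The error path keeps the previous fail-open behavior, now applied to the whole queue: if the robots.txt fetch itself fails, every queued request proceeds as if allowed, and only a request whose own processing fails is handed an error.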