From 04f9793b7ef56da8fdb3bca463174ea5446d1f67 Mon Sep 17 00:00:00 2001
From: Muki Kiboigo
Date: Sat, 31 Jan 2026 18:41:55 -0800
Subject: [PATCH] obey robots.txt in the http client

---
 src/App.zig                               |   7 +
 src/browser/Page.zig                      |   1 +
 src/browser/Robots.zig                    | 239 +++++++++++++++++-----
 src/browser/ScriptManager.zig             |   3 +
 src/browser/URL.zig                       |  14 +-
 src/browser/webapi/net/Fetch.zig          |   1 +
 src/browser/webapi/net/XMLHttpRequest.zig |   1 +
 src/http/Client.zig                       | 138 +++++++++++++
 src/http/Http.zig                         |   1 +
 9 files changed, 344 insertions(+), 61 deletions(-)

diff --git a/src/App.zig b/src/App.zig
index fca528cd..393ceec9 100644
--- a/src/App.zig
+++ b/src/App.zig
@@ -24,6 +24,7 @@ const log = @import("log.zig");
 const Snapshot = @import("browser/js/Snapshot.zig");
 const Platform = @import("browser/js/Platform.zig");
 const Telemetry = @import("telemetry/telemetry.zig").Telemetry;
+const RobotStore = @import("browser/Robots.zig").RobotStore;
 
 pub const Http = @import("http/Http.zig");
 pub const ArenaPool = @import("ArenaPool.zig");
@@ -40,6 +41,7 @@ snapshot: Snapshot,
 telemetry: Telemetry,
 allocator: Allocator,
 arena_pool: ArenaPool,
+robots: RobotStore,
 app_dir_path: ?[]const u8,
 notification: *Notification,
 shutdown: bool = false,
@@ -53,6 +55,7 @@ pub const RunMode = enum {
 
 pub const Config = struct {
     run_mode: RunMode,
+    obey_robots: bool = false,
     tls_verify_host: bool = true,
     http_proxy: ?[:0]const u8 = null,
     proxy_bearer_token: ?[:0]const u8 = null,
@@ -74,6 +77,7 @@ pub fn init(allocator: Allocator, config: Config) !*App {
     errdefer app.notification.deinit();
 
     app.http = try Http.init(allocator, .{
+        .obey_robots = config.obey_robots,
         .max_host_open = config.http_max_host_open orelse 4,
         .max_concurrent = config.http_max_concurrent orelse 10,
         .timeout_ms = config.http_timeout_ms orelse 5000,
@@ -91,6 +95,8 @@ pub fn init(allocator: Allocator, config: Config) !*App {
     app.snapshot = try Snapshot.load();
     errdefer app.snapshot.deinit();
 
+    app.robots = RobotStore.init(allocator);
+
     app.app_dir_path = getAndMakeAppDir(allocator);
 
     app.telemetry = try Telemetry.init(app, config.run_mode);
@@ -115,6 +121,7 @@ pub fn deinit(self: *App) void {
         self.app_dir_path = null;
     }
     self.telemetry.deinit();
+    self.robots.deinit();
     self.notification.deinit();
     self.http.deinit();
     self.snapshot.deinit();
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index b417b4ad..bd16590a 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -544,6 +544,7 @@ pub fn navigate(self: *Page, request_url: [:0]const u8, opts: NavigateOpts) !voi
         .headers = headers,
         .body = opts.body,
         .cookie_jar = &self._session.cookie_jar,
+        .robots = &self._session.browser.app.robots,
         .resource_type = .document,
         .header_callback = pageHeaderDoneCallback,
         .data_callback = pageDataCallback,
diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig
index 5a4f9033..2aff774a 100644
--- a/src/browser/Robots.zig
+++ b/src/browser/Robots.zig
@@ -33,13 +33,80 @@ pub const Key = enum {
 pub const Robots = @This();
 
 pub const empty: Robots = .{ .rules = &.{} };
 
+pub const RobotStore = struct {
+    const RobotsEntry = union(enum) {
+        present: Robots,
+        absent,
+    };
+
+    pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct {
+        const Context = @This();
+
+        pub fn hash(_: Context, value: []const u8) u64 {
+            var hasher = std.hash.Wyhash.init(value.len);
+            for (value) |c| {
+                std.hash.autoHash(&hasher, std.ascii.toLower(c));
+            }
+            return hasher.final();
+        }
+
+        pub fn eql(_: Context, a: []const u8, b: []const u8) bool {
+            if (a.len != b.len) return false;
+            return std.ascii.eqlIgnoreCase(a, b);
+        }
+    }, 80);
+
+    allocator: std.mem.Allocator,
+    map: RobotsMap,
+
+    pub fn init(allocator: std.mem.Allocator) RobotStore {
+        return .{ .allocator = allocator, .map = .empty };
+    }
+
+    pub fn deinit(self: *RobotStore) void {
+        var iter = self.map.iterator();
+
+        while (iter.next()) |entry| {
+            self.allocator.free(entry.key_ptr.*);
+
+            switch (entry.value_ptr.*) {
+                .present => |*robots| robots.deinit(self.allocator),
+                .absent => {},
+            }
+        }
+
+        self.map.deinit(self.allocator);
+    }
+
+    pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry {
+        return self.map.get(url);
+    }
+
+    pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots {
+        return try Robots.fromBytes(self.allocator, user_agent, bytes);
+    }
+
+    pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .{ .present = robots });
+    }
+
+    pub fn putAbsent(self: *RobotStore, url: []const u8) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .absent);
+    }
+};
+
 rules: []const Rule,
-const State = enum {
-    not_in_entry,
-    in_other_entry,
-    in_our_entry,
-    in_wildcard_entry,
+const State = struct {
+    entry: enum {
+        not_in_entry,
+        in_other_entry,
+        in_our_entry,
+        in_wildcard_entry,
+    },
+    has_rules: bool = false,
 };
 
 fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
@@ -62,7 +129,7 @@ fn parseRulesWithUserAgent(
     var wildcard_rules: std.ArrayList(Rule) = .empty;
     defer wildcard_rules.deinit(allocator);
 
-    var state: State = .not_in_entry;
+    var state: State = .{ .entry = .not_in_entry, .has_rules = false };
 
     var iter = std.mem.splitScalar(u8, bytes, '\n');
     while (iter.next()) |line| {
@@ -78,7 +145,6 @@ fn parseRulesWithUserAgent(
             trimmed;
 
         if (true_line.len == 0) {
-            state = .not_in_entry;
             continue;
         }
 
@@ -94,55 +160,69 @@ fn parseRulesWithUserAgent(
         const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace);
 
         switch (key) {
-            .@"user-agent" => switch (state) {
-                .in_other_entry => {
-                    if (std.ascii.eqlIgnoreCase(user_agent, value)) {
-                        state = .in_our_entry;
-                    }
-                },
-                .in_our_entry => {},
-                .in_wildcard_entry => {
-                    if (std.ascii.eqlIgnoreCase(user_agent, value)) {
-                        state = .in_our_entry;
-                    }
-                },
-                .not_in_entry => {
-                    if (std.ascii.eqlIgnoreCase(user_agent, value)) {
-                        state = .in_our_entry;
-                    } else if (std.mem.eql(u8, "*", value)) {
-                        state = .in_wildcard_entry;
-                    } else {
-                        state = .in_other_entry;
-                    }
-                },
+            .@"user-agent" => {
+                if (state.has_rules) {
+                    state = .{ .entry = .not_in_entry, .has_rules = false };
+                }
+
+                switch (state.entry) {
+                    .in_other_entry => {
+                        if (std.ascii.eqlIgnoreCase(user_agent, value)) {
+                            state.entry = .in_our_entry;
+                        }
+                    },
+                    .in_our_entry => {},
+                    .in_wildcard_entry => {
+                        if (std.ascii.eqlIgnoreCase(user_agent, value)) {
+                            state.entry = .in_our_entry;
+                        }
+                    },
+                    .not_in_entry => {
+                        if (std.ascii.eqlIgnoreCase(user_agent, value)) {
+                            state.entry = .in_our_entry;
+                        } else if (std.mem.eql(u8, "*", value)) {
+                            state.entry = .in_wildcard_entry;
+                        } else {
+                            state.entry = .in_other_entry;
+                        }
+                    },
+                }
             },
-            .allow => switch (state) {
-                .in_our_entry => {
-                    const duped_value = try allocator.dupe(u8, value);
-                    errdefer allocator.free(duped_value);
-                    try rules.append(allocator, .{ .allow = duped_value });
-                },
-                .in_other_entry => {},
-                .in_wildcard_entry => {
-                    const duped_value = try allocator.dupe(u8, value);
-                    errdefer allocator.free(duped_value);
-                    try wildcard_rules.append(allocator, .{ .allow = duped_value });
-                },
-                .not_in_entry => return error.UnexpectedRule,
+            .allow => {
+                defer state.has_rules = true;
+
+                switch (state.entry) {
+                    .in_our_entry => {
+                        const duped_value = try allocator.dupe(u8, value);
+                        errdefer allocator.free(duped_value);
+                        try rules.append(allocator, .{ .allow = duped_value });
+                    },
+                    .in_other_entry => {},
+                    .in_wildcard_entry => {
+                        const duped_value = try allocator.dupe(u8, value);
+                        errdefer allocator.free(duped_value);
+                        try wildcard_rules.append(allocator, .{ .allow = duped_value });
+                    },
+                    .not_in_entry => return error.UnexpectedRule,
+                }
             },
-            .disallow => switch (state) {
-                .in_our_entry => {
-                    const duped_value = try allocator.dupe(u8, value);
-                    errdefer allocator.free(duped_value);
-                    try rules.append(allocator, .{ .disallow = duped_value });
-                },
-                .in_other_entry => {},
-                .in_wildcard_entry => {
-                    const duped_value = try allocator.dupe(u8, value);
-                    errdefer allocator.free(duped_value);
-                    try wildcard_rules.append(allocator, .{ .disallow = duped_value });
-                },
-                .not_in_entry => return error.UnexpectedRule,
+            .disallow => {
+                defer state.has_rules = true;
+
+                switch (state.entry) {
+                    .in_our_entry => {
+                        const duped_value = try allocator.dupe(u8, value);
+                        errdefer allocator.free(duped_value);
+                        try rules.append(allocator, .{ .disallow = duped_value });
+                    },
+                    .in_other_entry => {},
+                    .in_wildcard_entry => {
+                        const duped_value = try allocator.dupe(u8, value);
+                        errdefer allocator.free(duped_value);
+                        try wildcard_rules.append(allocator, .{ .disallow = duped_value });
+                    },
+                    .not_in_entry => return error.UnexpectedRule,
+                }
             },
         }
     }
@@ -737,3 +817,54 @@ test "Robots: isAllowed - Google's real robots.txt" {
     try std.testing.expect(twitterbot.isAllowed("/groups") == false);
     try std.testing.expect(twitterbot.isAllowed("/m/") == false);
 }
+
+test "Robots: user-agent after rules starts new entry" {
+    const allocator = std.testing.allocator;
+
+    const file =
+        \\User-agent: Bot1
+        \\User-agent: Bot2
+        \\Disallow: /admin/
+        \\Allow: /public/
+        \\User-agent: Bot3
+        \\Disallow: /private/
+        \\
+    ;
+
+    var robots1 = try Robots.fromBytes(allocator, "Bot1", file);
+    defer robots1.deinit(allocator);
+    try std.testing.expect(robots1.isAllowed("/admin/") == false);
+    try std.testing.expect(robots1.isAllowed("/public/") == true);
+    try std.testing.expect(robots1.isAllowed("/private/") == true);
+
+    var robots2 = try Robots.fromBytes(allocator, "Bot2", file);
+    defer robots2.deinit(allocator);
+    try std.testing.expect(robots2.isAllowed("/admin/") == false);
+    try std.testing.expect(robots2.isAllowed("/public/") == true);
+    try std.testing.expect(robots2.isAllowed("/private/") == true);
+
+    var robots3 = try Robots.fromBytes(allocator, "Bot3", file);
+    defer robots3.deinit(allocator);
+    try std.testing.expect(robots3.isAllowed("/admin/") == true);
+    try std.testing.expect(robots3.isAllowed("/public/") == true);
+    try std.testing.expect(robots3.isAllowed("/private/") == false);
+}
+
+test "Robots: blank lines don't end entries" {
+    const allocator = std.testing.allocator;
+
+    const file =
+        \\User-agent: MyBot
+        \\Disallow: /admin/
+        \\
+        \\
+        \\Allow: /public/
+        \\
+    ;
+
+    var robots = try Robots.fromBytes(allocator, "MyBot", file);
+    defer robots.deinit(allocator);
+
+    try std.testing.expect(robots.isAllowed("/admin/") == false);
+    try std.testing.expect(robots.isAllowed("/public/") == true);
+}
diff --git a/src/browser/ScriptManager.zig b/src/browser/ScriptManager.zig
index be968870..99de4127 100644
--- a/src/browser/ScriptManager.zig
+++ b/src/browser/ScriptManager.zig
@@ -265,6 +265,7 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e
         .headers = try self.getHeaders(url),
         .blocking = is_blocking,
         .cookie_jar = &page._session.cookie_jar,
+        .robots = &page._session.browser.app.robots,
         .resource_type = .script,
         .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null,
         .header_callback = Script.headerCallback,
@@ -379,6 +380,7 @@ pub fn preloadImport(self: *ScriptManager, url: [:0]const u8, referrer: []const
         .method = .GET,
         .headers = try self.getHeaders(url),
         .cookie_jar = &self.page._session.cookie_jar,
+        .robots = &self.page._session.browser.app.robots,
         .resource_type = .script,
         .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null,
         .header_callback = Script.headerCallback,
@@ -481,6 +483,7 @@ pub fn getAsyncImport(self: *ScriptManager, url: [:0]const u8, cb: ImportAsync.C
         .ctx = script,
         .resource_type = .script,
         .cookie_jar = &self.page._session.cookie_jar,
+        .robots = &self.page._session.browser.app.robots,
         .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null,
         .header_callback = Script.headerCallback,
         .data_callback = Script.dataCallback,
diff --git a/src/browser/URL.zig b/src/browser/URL.zig
index 1e5d272a..716480b1 100644
--- a/src/browser/URL.zig
+++ b/src/browser/URL.zig
@@ -502,8 +502,8 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons
     return buf.items[0 .. buf.items.len - 1 :0];
 }
 
-pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) !?[:0]const u8 {
-    const origin = try getOrigin(arena, url) orelse return null;
+pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 {
+    const origin = try getOrigin(arena, url) orelse return error.NoOrigin;
     return try std.fmt.allocPrintSentinel(
         arena,
         "{s}/robots.txt",
@@ -795,24 +795,24 @@ test "URL: getRobotsUrl" {
 
     {
         const url = try getRobotsUrl(arena, "https://www.lightpanda.io");
-        try testing.expectEqual("https://www.lightpanda.io/robots.txt", url.?);
+        try testing.expectEqual("https://www.lightpanda.io/robots.txt", url);
     }
     {
        const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path");
-        try testing.expectString("https://www.lightpanda.io/robots.txt", url.?);
+        try testing.expectString("https://www.lightpanda.io/robots.txt", url);
     }
     {
        const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page");
-        try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url.?);
+        try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url);
     }
     {
        const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment");
-        try testing.expectString("http://example.com/robots.txt", url.?);
+        try testing.expectString("http://example.com/robots.txt", url);
    }
    {
        const url = try getRobotsUrl(arena, "https://user:pass@example.com/page");
-        try testing.expectString("https://example.com/robots.txt", url.?);
+        try testing.expectString("https://example.com/robots.txt", url);
    }
 }
diff --git a/src/browser/webapi/net/Fetch.zig b/src/browser/webapi/net/Fetch.zig
index 78d77016..fa4d309a 100644
--- a/src/browser/webapi/net/Fetch.zig
+++ b/src/browser/webapi/net/Fetch.zig
@@ -78,6 +78,7 @@ pub fn init(input: Input, options: ?InitOpts, page: *Page) !js.Promise {
         .headers = headers,
         .resource_type = .fetch,
         .cookie_jar = &page._session.cookie_jar,
+        .robots = &page._session.browser.app.robots,
         .start_callback = httpStartCallback,
         .header_callback = httpHeaderDoneCallback,
         .data_callback = httpDataCallback,
diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig
index 606d9919..ea6f7396 100644
--- a/src/browser/webapi/net/XMLHttpRequest.zig
+++ b/src/browser/webapi/net/XMLHttpRequest.zig
@@ -206,6 +206,7 @@ pub fn send(self: *XMLHttpRequest, body_: ?[]const u8) !void {
         .headers = headers,
         .body = self._request_body,
         .cookie_jar = &page._session.cookie_jar,
+        .robots = &page._session.browser.app.robots,
         .resource_type = .xhr,
         .start_callback = httpStartCallback,
         .header_callback = httpHeaderDoneCallback,
diff --git a/src/http/Client.zig b/src/http/Client.zig
index 9d497597..4cd9f227 100644
--- a/src/http/Client.zig
+++ b/src/http/Client.zig
@@ -24,6 +24,8 @@ const Http = @import("Http.zig");
 const URL = @import("../browser/URL.zig");
 const Notification = @import("../Notification.zig");
 const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar;
+const Robots = @import("../browser/Robots.zig");
+const RobotStore = Robots.RobotStore;
 
 const c = Http.c;
 const posix = std.posix;
@@ -82,6 +84,9 @@ queue: TransferQueue,
 // The main app allocator
 allocator: Allocator,
 
+// Whether we obey robots.txt or not.
+obey_robots: bool,
+
 // Once we have a handle/easy to process a request with, we create a Transfer
 // which contains the Request as well as any state we need to process the
 // request. These wil come and go with each request.
@@ -146,6 +151,7 @@ pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, opts: Http.Opts) !*Clie
         .multi = multi,
         .handles = handles,
         .allocator = allocator,
+        .obey_robots = opts.obey_robots,
         .http_proxy = opts.http_proxy,
         .use_proxy = opts.http_proxy != null,
         .user_agent = opts.user_agent,
@@ -216,6 +222,36 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus {
 }
 
 pub fn request(self: *Client, req: Request) !void {
+    if (self.obey_robots) {
+        const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
+
+        // If we have this robots.txt cached, we can take a fast path.
+        if (req.robots.get(robots_url)) |robot_entry| {
+            defer self.allocator.free(robots_url);
+
+            switch (robot_entry) {
+                // If we have a cached robots.txt, we check it.
+                .present => |robots| {
+                    const path = URL.getPathname(req.url);
+                    if (!robots.isAllowed(path)) {
+                        req.error_callback(req.ctx, error.RobotsBlocked);
+                        return;
+                    }
+                },
+                // A cached absence means there is no robots.txt; allow the request.
+                .absent => {},
+            }
+
+            return self.processRequest(req);
+        }
+
+        return self.fetchRobotsThenProcessRequest(robots_url, req);
+    }
+
+    return self.processRequest(req);
+}
+
+fn processRequest(self: *Client, req: Request) !void {
     const transfer = try self.makeTransfer(req);
 
     const notification = self.notification orelse return self.process(transfer);
@@ -247,6 +283,107 @@ pub fn request(self: *Client, req: Request) !void {
     }
 }
 
+const RobotsRequestContext = struct {
+    client: *Client,
+    req: Request,
+    robots_url: [:0]const u8,
+    buffer: std.ArrayList(u8),
+    status: u16 = 0,
+};
+
+fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
+    const ctx = try self.allocator.create(RobotsRequestContext);
+    ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
+
+    const headers = try self.newHeaders();
+
+    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
+    try self.processRequest(.{
+        .ctx = ctx,
+        .url = robots_url,
+        .method = .GET,
+        .headers = headers,
+        .blocking = false,
+        .cookie_jar = req.cookie_jar,
+        .robots = req.robots,
+        .resource_type = .fetch,
+        .header_callback = robotsHeaderCallback,
+        .data_callback = robotsDataCallback,
+        .done_callback = robotsDoneCallback,
+        .error_callback = robotsErrorCallback,
+    });
+}
+
+fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+
+    if (transfer.response_header) |hdr| {
+        log.debug(.browser, "robots status", .{ .status = hdr.status });
+        ctx.status = hdr.status;
+    }
+
+    if (transfer.getContentLength()) |cl| {
+        try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl);
+    }
+
+    return true;
+}
+
+fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+    try ctx.buffer.appendSlice(ctx.client.allocator, data);
+}
+
+fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.client.allocator.destroy(ctx);
+    defer ctx.buffer.deinit(ctx.client.allocator);
+    defer ctx.client.allocator.free(ctx.robots_url);
+
+    var allowed = true;
+
+    if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
+        const robots = try ctx.req.robots.robotsFromBytes(
+            ctx.client.user_agent,
+            ctx.buffer.items,
+        );
+
+        try ctx.req.robots.put(ctx.robots_url, robots);
+
+        const path = URL.getPathname(ctx.req.url);
+        allowed = robots.isAllowed(path);
+    }
+
+    // A 404 means there is no robots.txt; cache the absence.
+    if (ctx.status == 404) {
+        log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
+        try ctx.req.robots.putAbsent(ctx.robots_url);
+    }
+
+    if (!allowed) {
+        log.warn(.http, "blocked by robots", .{ .url = ctx.req.url });
+        ctx.req.error_callback(ctx.req.ctx, error.RobotsBlocked);
+        return;
+    }
+
+    // Now process the original request
+    try ctx.client.processRequest(ctx.req);
+}
+
+fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.client.allocator.destroy(ctx);
+    defer ctx.buffer.deinit(ctx.client.allocator);
+    defer ctx.client.allocator.free(ctx.robots_url);
+
+    log.warn(.http, "robots fetch failed", .{ .err = err });
+
+    // On error, allow the request to proceed
+    ctx.client.processRequest(ctx.req) catch |e| {
+        ctx.req.error_callback(ctx.req.ctx, e);
+    };
+}
+
 fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
     // The request was intercepted and is blocking. This is messy, but our
     // callers, the ScriptManager -> Page, don't have a great way to stop the
@@ -765,6 +902,7 @@ pub const Request = struct {
     headers: Http.Headers,
     body: ?[]const u8 = null,
     cookie_jar: *CookieJar,
+    robots: *RobotStore,
     resource_type: ResourceType,
     credentials: ?[:0]const u8 = null,
 
diff --git a/src/http/Http.zig b/src/http/Http.zig
index f54fd6e4..5a1f6058 100644
--- a/src/http/Http.zig
+++ b/src/http/Http.zig
@@ -348,6 +348,7 @@ pub fn errorMCheck(code: c.CURLMcode) errors.Multi!void {
 }
 
 pub const Opts = struct {
+    obey_robots: bool,
     timeout_ms: u31,
     max_host_open: u8,
     max_concurrent: u8,
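
Usage note (not part of the diff): robots.txt enforcement is opt-in through the new obey_robots flag on App.Config, which App.init forwards to Http.init and on to the HTTP client. A minimal sketch of enabling it, assuming an allocator; the .serve run mode below is a placeholder, since RunMode's variants are not shown in this patch:

    const std = @import("std");
    const App = @import("App.zig");

    pub fn main() !void {
        // obey_robots defaults to false, so robots.txt is only fetched and
        // consulted by the HTTP client when it is explicitly enabled.
        const app = try App.init(std.heap.page_allocator, .{
            .run_mode = .serve, // placeholder RunMode variant
            .obey_robots = true,
        });
        defer app.deinit();
    }

With the flag set, Client.request first consults the RobotStore cache (keyed by the origin's robots.txt URL) and, on a miss, fetches /robots.txt asynchronously before either issuing the original request or failing it with error.RobotsBlocked.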