From 1a246f2e380f965a4602b4b2d9c5db8c72864dee Mon Sep 17 00:00:00 2001
From: Muki Kiboigo
Date: Sat, 31 Jan 2026 18:41:55 -0800
Subject: [PATCH] obey robots.txt in the http client

---
 src/App.zig                               |   5 +
 src/Config.zig                            |  18 ++
 src/browser/Page.zig                      |   1 +
 src/browser/Robots.zig                    | 239 +++++++++++++++++-----
 src/browser/ScriptManager.zig             |   3 +
 src/browser/URL.zig                       |  14 +-
 src/browser/webapi/net/Fetch.zig          |   1 +
 src/browser/webapi/net/XMLHttpRequest.zig |   1 +
 src/http/Client.zig                       | 137 ++++++++++++-
 9 files changed, 357 insertions(+), 62 deletions(-)

diff --git a/src/App.zig b/src/App.zig
index 21b0ecc6..76ffd396 100644
--- a/src/App.zig
+++ b/src/App.zig
@@ -25,6 +25,7 @@ const Config = @import("Config.zig");
 const Snapshot = @import("browser/js/Snapshot.zig");
 const Platform = @import("browser/js/Platform.zig");
 const Telemetry = @import("telemetry/telemetry.zig").Telemetry;
+const RobotStore = @import("browser/Robots.zig").RobotStore;
 
 pub const Http = @import("http/Http.zig");
 pub const ArenaPool = @import("ArenaPool.zig");
@@ -38,6 +39,7 @@ snapshot: Snapshot,
 telemetry: Telemetry,
 allocator: Allocator,
 arena_pool: ArenaPool,
+robots: RobotStore,
 app_dir_path: ?[]const u8,
 
 shutdown: bool = false,
@@ -57,6 +59,8 @@ pub fn init(allocator: Allocator, config: *const Config) !*App {
     app.snapshot = try Snapshot.load();
     errdefer app.snapshot.deinit();
 
+    app.robots = RobotStore.init(allocator);
+
     app.app_dir_path = getAndMakeAppDir(allocator);
 
     app.telemetry = try Telemetry.init(app, config.mode);
@@ -79,6 +83,7 @@ pub fn deinit(self: *App) void {
         self.app_dir_path = null;
     }
     self.telemetry.deinit();
+    self.robots.deinit();
     self.http.deinit();
     self.snapshot.deinit();
     self.platform.deinit();
diff --git a/src/Config.zig b/src/Config.zig
index fc4ebcdd..0f285f98 100644
--- a/src/Config.zig
+++ b/src/Config.zig
@@ -57,6 +57,13 @@ pub fn tlsVerifyHost(self: *const Config) bool {
     };
 }
 
+pub fn obeyRobots(self: *const Config) bool {
+    return switch (self.mode) {
+        inline .serve, .fetch => |opts| opts.common.obey_robots,
+        else => unreachable,
+    };
+}
+
 pub fn httpProxy(self: *const Config) ?[:0]const u8 {
     return switch (self.mode) {
         inline .serve, .fetch => |opts| opts.common.http_proxy,
@@ -158,6 +165,7 @@ pub const Fetch = struct {
 };
 
 pub const Common = struct {
+    obey_robots: bool = false,
     proxy_bearer_token: ?[:0]const u8 = null,
     http_proxy: ?[:0]const u8 = null,
     http_max_concurrent: ?u8 = null,
@@ -223,6 +231,11 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
         \\  advanced option which should only be set if you understand
         \\  and accept the risk of disabling host verification.
         \\
+        \\--obey_robots
+        \\  Fetches and obeys the robots.txt file (if available) of each
+        \\  host we make requests to.
+        \\  Defaults to false.
+        \\
         \\--http_proxy The HTTP proxy to use for all HTTP requests.
         \\  A username:password can be included for basic authentication.
         \\  Defaults to none.
@@ -613,6 +626,11 @@ fn parseCommonArg(
         return true;
     }
 
+    if (std.mem.eql(u8, "--obey_robots", opt)) {
+        common.obey_robots = true;
+        return true;
+    }
+
     if (std.mem.eql(u8, "--http_proxy", opt)) {
         const str = args.next() orelse {
             log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" });
diff --git a/src/browser/Page.zig b/src/browser/Page.zig
index 8e86c47a..d879a813 100644
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -559,6 +559,7 @@ pub fn navigate(self: *Page, request_url: [:0]const u8, opts: NavigateOpts) !voi
         .headers = headers,
         .body = opts.body,
         .cookie_jar = &self._session.cookie_jar,
+        .robots = &self._session.browser.app.robots,
         .resource_type = .document,
         .notification = self._session.notification,
         .header_callback = pageHeaderDoneCallback,
diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig
index 5a4f9033..2aff774a 100644
--- a/src/browser/Robots.zig
+++ b/src/browser/Robots.zig
@@ -33,13 +33,80 @@ pub const Key = enum {
 pub const Robots = @This();
 pub const empty: Robots = .{ .rules = &.{} };
 
+/// App-wide cache of parsed robots.txt files, one entry per robots.txt URL.
+pub const RobotStore = struct {
+    const RobotsEntry = union(enum) {
+        present: Robots,
+        absent,
+    };
+
+    // Hashing and equality are case-insensitive, so differently-cased URLs
+    // for the same origin share a single entry.
+    pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct {
+        const Context = @This();
+
+        pub fn hash(_: Context, value: []const u8) u64 {
+            var hasher = std.hash.Wyhash.init(value.len);
+            for (value) |c| {
+                std.hash.autoHash(&hasher, std.ascii.toLower(c));
+            }
+            return hasher.final();
+        }
+
+        pub fn eql(_: Context, a: []const u8, b: []const u8) bool {
+            if (a.len != b.len) return false;
+            return std.ascii.eqlIgnoreCase(a, b);
+        }
+    }, 80);
+
+    allocator: std.mem.Allocator,
+    map: RobotsMap,
+
+    pub fn init(allocator: std.mem.Allocator) RobotStore {
+        return .{ .allocator = allocator, .map = .empty };
+    }
+
+    pub fn deinit(self: *RobotStore) void {
+        var iter = self.map.iterator();
+
+        while (iter.next()) |entry| {
+            self.allocator.free(entry.key_ptr.*);
+
+            switch (entry.value_ptr.*) {
+                .present => |*robots| robots.deinit(self.allocator),
+                .absent => {},
+            }
+        }
+
+        self.map.deinit(self.allocator);
+    }
+
+    pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry {
+        return self.map.get(url);
+    }
+
+    pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots {
+        return try Robots.fromBytes(self.allocator, user_agent, bytes);
+    }
+
+    pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .{ .present = robots });
+    }
+
+    pub fn putAbsent(self: *RobotStore, url: []const u8) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .absent);
+    }
+};
+
 rules: []const Rule,
 
-const State = enum {
-    not_in_entry,
-    in_other_entry,
-    in_our_entry,
-    in_wildcard_entry,
+const State = struct {
+    entry: enum {
+        not_in_entry,
+        in_other_entry,
+        in_our_entry,
+        in_wildcard_entry,
+    },
+    // Set once the current group has emitted a rule; a later User-agent
+    // line then starts a new group instead of extending this one.
+    has_rules: bool = false,
 };
 
 fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
@@ -62,7 +129,7 @@ fn parseRulesWithUserAgent(
     var wildcard_rules: std.ArrayList(Rule) = .empty;
     defer wildcard_rules.deinit(allocator);
 
-    var state: State = .not_in_entry;
+    var state: State = .{ .entry = .not_in_entry, .has_rules = false };
 
     var iter = std.mem.splitScalar(u8, bytes, '\n');
     while (iter.next()) |line| {
@@ -78,7 +145,6 @@ fn parseRulesWithUserAgent(
             trimmed;
 
         if (true_line.len == 0) {
-            state = .not_in_entry;
             continue;
         }
 
@@ -94,55 +160,69 @@ fn 
parseRulesWithUserAgent( const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace); switch (key) { - .@"user-agent" => switch (state) { - .in_other_entry => { - if (std.ascii.eqlIgnoreCase(user_agent, value)) { - state = .in_our_entry; - } - }, - .in_our_entry => {}, - .in_wildcard_entry => { - if (std.ascii.eqlIgnoreCase(user_agent, value)) { - state = .in_our_entry; - } - }, - .not_in_entry => { - if (std.ascii.eqlIgnoreCase(user_agent, value)) { - state = .in_our_entry; - } else if (std.mem.eql(u8, "*", value)) { - state = .in_wildcard_entry; - } else { - state = .in_other_entry; - } - }, + .@"user-agent" => { + if (state.has_rules) { + state = .{ .entry = .not_in_entry, .has_rules = false }; + } + + switch (state.entry) { + .in_other_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } + }, + .in_our_entry => {}, + .in_wildcard_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } + }, + .not_in_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } else if (std.mem.eql(u8, "*", value)) { + state.entry = .in_wildcard_entry; + } else { + state.entry = .in_other_entry; + } + }, + } }, - .allow => switch (state) { - .in_our_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try rules.append(allocator, .{ .allow = duped_value }); - }, - .in_other_entry => {}, - .in_wildcard_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try wildcard_rules.append(allocator, .{ .allow = duped_value }); - }, - .not_in_entry => return error.UnexpectedRule, + .allow => { + defer state.has_rules = true; + + switch (state.entry) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .allow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .allow = duped_value }); + }, + .not_in_entry => return error.UnexpectedRule, + } }, - .disallow => switch (state) { - .in_our_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try rules.append(allocator, .{ .disallow = duped_value }); - }, - .in_other_entry => {}, - .in_wildcard_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try wildcard_rules.append(allocator, .{ .disallow = duped_value }); - }, - .not_in_entry => return error.UnexpectedRule, + .disallow => { + defer state.has_rules = true; + + switch (state.entry) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .disallow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .disallow = duped_value }); + }, + .not_in_entry => return error.UnexpectedRule, + } }, } } @@ -737,3 +817,54 @@ test "Robots: isAllowed - Google's real robots.txt" { try std.testing.expect(twitterbot.isAllowed("/groups") == false); try std.testing.expect(twitterbot.isAllowed("/m/") == false); } + +test "Robots: user-agent after rules starts new entry" { + const allocator = 
std.testing.allocator; + + const file = + \\User-agent: Bot1 + \\User-agent: Bot2 + \\Disallow: /admin/ + \\Allow: /public/ + \\User-agent: Bot3 + \\Disallow: /private/ + \\ + ; + + var robots1 = try Robots.fromBytes(allocator, "Bot1", file); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/admin/") == false); + try std.testing.expect(robots1.isAllowed("/public/") == true); + try std.testing.expect(robots1.isAllowed("/private/") == true); + + var robots2 = try Robots.fromBytes(allocator, "Bot2", file); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/admin/") == false); + try std.testing.expect(robots2.isAllowed("/public/") == true); + try std.testing.expect(robots2.isAllowed("/private/") == true); + + var robots3 = try Robots.fromBytes(allocator, "Bot3", file); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/admin/") == true); + try std.testing.expect(robots3.isAllowed("/public/") == true); + try std.testing.expect(robots3.isAllowed("/private/") == false); +} + +test "Robots: blank lines don't end entries" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: MyBot + \\Disallow: /admin/ + \\ + \\ + \\Allow: /public/ + \\ + ; + + var robots = try Robots.fromBytes(allocator, "MyBot", file); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/") == false); + try std.testing.expect(robots.isAllowed("/public/") == true); +} diff --git a/src/browser/ScriptManager.zig b/src/browser/ScriptManager.zig index 344d6232..01c56a81 100644 --- a/src/browser/ScriptManager.zig +++ b/src/browser/ScriptManager.zig @@ -265,6 +265,7 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e .headers = try self.getHeaders(url), .blocking = is_blocking, .cookie_jar = &page._session.cookie_jar, + .robots = &page._session.browser.app.robots, .resource_type = .script, .notification = page._session.notification, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, @@ -380,6 +381,7 @@ pub fn preloadImport(self: *ScriptManager, url: [:0]const u8, referrer: []const .method = .GET, .headers = try self.getHeaders(url), .cookie_jar = &self.page._session.cookie_jar, + .robots = &self.page._session.browser.app.robots, .resource_type = .script, .notification = self.page._session.notification, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, @@ -484,6 +486,7 @@ pub fn getAsyncImport(self: *ScriptManager, url: [:0]const u8, cb: ImportAsync.C .resource_type = .script, .cookie_jar = &self.page._session.cookie_jar, .notification = self.page._session.notification, + .robots = &self.page._session.browser.app.robots, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, .header_callback = Script.headerCallback, .data_callback = Script.dataCallback, diff --git a/src/browser/URL.zig b/src/browser/URL.zig index 1e5d272a..716480b1 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -502,8 +502,8 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons return buf.items[0 .. 
buf.items.len - 1 :0]; } -pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) !?[:0]const u8 { - const origin = try getOrigin(arena, url) orelse return null; +pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 { + const origin = try getOrigin(arena, url) orelse return error.NoOrigin; return try std.fmt.allocPrintSentinel( arena, "{s}/robots.txt", @@ -795,24 +795,24 @@ test "URL: getRobotsUrl" { { const url = try getRobotsUrl(arena, "https://www.lightpanda.io"); - try testing.expectEqual("https://www.lightpanda.io/robots.txt", url.?); + try testing.expectEqual("https://www.lightpanda.io/robots.txt", url); } { const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path"); - try testing.expectString("https://www.lightpanda.io/robots.txt", url.?); + try testing.expectString("https://www.lightpanda.io/robots.txt", url); } { const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page"); - try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url.?); + try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url); } { const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment"); - try testing.expectString("http://example.com/robots.txt", url.?); + try testing.expectString("http://example.com/robots.txt", url); } { const url = try getRobotsUrl(arena, "https://user:pass@example.com/page"); - try testing.expectString("https://example.com/robots.txt", url.?); + try testing.expectString("https://example.com/robots.txt", url); } } diff --git a/src/browser/webapi/net/Fetch.zig b/src/browser/webapi/net/Fetch.zig index a66fb311..988e9a53 100644 --- a/src/browser/webapi/net/Fetch.zig +++ b/src/browser/webapi/net/Fetch.zig @@ -79,6 +79,7 @@ pub fn init(input: Input, options: ?InitOpts, page: *Page) !js.Promise { .resource_type = .fetch, .cookie_jar = &page._session.cookie_jar, .notification = page._session.notification, + .robots = &page._session.browser.app.robots, .start_callback = httpStartCallback, .header_callback = httpHeaderDoneCallback, .data_callback = httpDataCallback, diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig index 7c266e1a..296048b3 100644 --- a/src/browser/webapi/net/XMLHttpRequest.zig +++ b/src/browser/webapi/net/XMLHttpRequest.zig @@ -208,6 +208,7 @@ pub fn send(self: *XMLHttpRequest, body_: ?[]const u8) !void { .headers = headers, .body = self._request_body, .cookie_jar = &page._session.cookie_jar, + .robots = &page._session.browser.app.robots, .resource_type = .xhr, .notification = page._session.notification, .start_callback = httpStartCallback, diff --git a/src/http/Client.zig b/src/http/Client.zig index cc61b681..a9c21e0c 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -27,6 +27,8 @@ const Config = @import("../Config.zig"); const URL = @import("../browser/URL.zig"); const Notification = @import("../Notification.zig"); const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar; +const Robots = @import("../browser/Robots.zig"); +const RobotStore = Robots.RobotStore; const c = Http.c; const posix = std.posix; @@ -217,6 +219,36 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus { } pub fn request(self: *Client, req: Request) !void { + if (self.config.obeyRobots()) { + const robots_url = try URL.getRobotsUrl(self.allocator, req.url); + + // If we have this robots cached, we can take a fast path. 
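+        // A cache hit is either a parsed robots.txt or a recorded absence,
+        // so repeat requests to the same origin never refetch /robots.txt.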
+        if (req.robots.get(robots_url)) |robot_entry| {
+            defer self.allocator.free(robots_url);
+
+            switch (robot_entry) {
+                // A cached robots.txt exists; check the request path against it.
+                .present => |robots| {
+                    const path = URL.getPathname(req.url);
+                    if (!robots.isAllowed(path)) {
+                        req.error_callback(req.ctx, error.RobotsBlocked);
+                        return;
+                    }
+                },
+                // A previous fetch found no robots.txt here; nothing to check.
+                .absent => {},
+            }
+
+            return self.processRequest(req);
+        }
+
+        return self.fetchRobotsThenProcessRequest(robots_url, req);
+    }
+
+    return self.processRequest(req);
+}
+
+fn processRequest(self: *Client, req: Request) !void {
     const transfer = try self.makeTransfer(req);
 
     transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer });
@@ -246,6 +278,108 @@ pub fn request(self: *Client, req: Request) !void {
     }
 }
 
+const RobotsRequestContext = struct {
+    client: *Client,
+    req: Request,
+    robots_url: [:0]const u8,
+    buffer: std.ArrayList(u8),
+    status: u16 = 0,
+};
+
+fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
+    const ctx = try self.allocator.create(RobotsRequestContext);
+    ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
+
+    const headers = try self.newHeaders();
+
+    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
+    try self.processRequest(.{
+        .ctx = ctx,
+        .url = robots_url,
+        .method = .GET,
+        .headers = headers,
+        .blocking = false,
+        .cookie_jar = req.cookie_jar,
+        .notification = req.notification,
+        .robots = req.robots,
+        .resource_type = .fetch,
+        .header_callback = robotsHeaderCallback,
+        .data_callback = robotsDataCallback,
+        .done_callback = robotsDoneCallback,
+        .error_callback = robotsErrorCallback,
+    });
+}
+
+fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+
+    if (transfer.response_header) |hdr| {
+        log.debug(.browser, "robots status", .{ .status = hdr.status });
+        ctx.status = hdr.status;
+    }
+
+    if (transfer.getContentLength()) |cl| {
+        try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl);
+    }
+
+    return true;
+}
+
+fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+    try ctx.buffer.appendSlice(ctx.client.allocator, data);
+}
+
+fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.client.allocator.destroy(ctx);
+    defer ctx.buffer.deinit(ctx.client.allocator);
+    defer ctx.client.allocator.free(ctx.robots_url);
+
+    var allowed = true;
+
+    if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
+        const robots = try ctx.req.robots.robotsFromBytes(
+            ctx.client.config.http_headers.user_agent,
+            ctx.buffer.items,
+        );
+
+        try ctx.req.robots.put(ctx.robots_url, robots);
+
+        const path = URL.getPathname(ctx.req.url);
+        allowed = robots.isAllowed(path);
+    }
+
+    // No robots.txt at this origin; remember its absence.
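+    // RFC 9309 treats an unavailable robots.txt (e.g. a 404) as allow-all,
+    // so only the absence needs to be remembered, not any rules.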
+ if (ctx.status == 404) { + log.debug(.http, "robots not found", .{ .url = ctx.robots_url }); + try ctx.req.robots.putAbsent(ctx.robots_url); + } + + if (!allowed) { + log.warn(.http, "blocked by robots", .{ .url = ctx.req.url }); + ctx.req.error_callback(ctx.req.ctx, error.RobotsBlocked); + return; + } + + // Now process the original request + try ctx.client.processRequest(ctx.req); +} + +fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void { + const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr)); + defer ctx.client.allocator.destroy(ctx); + defer ctx.buffer.deinit(ctx.client.allocator); + defer ctx.client.allocator.free(ctx.robots_url); + + log.warn(.http, "robots fetch failed", .{ .err = err }); + + // On error, allow the request to proceed + ctx.client.processRequest(ctx.req) catch |e| { + ctx.req.error_callback(ctx.req.ctx, e); + }; +} + fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool { // The request was intercepted and is blocking. This is messy, but our // callers, the ScriptManager -> Page, don't have a great way to stop the @@ -565,7 +699,7 @@ fn processMessages(self: *Client) !bool { // In case of auth challenge // TODO give a way to configure the number of auth retries. - if (transfer._auth_challenge != null and transfer._tries < 10) { + if (transfer._auth_challenge != null and transfer._tries < 10) { var wait_for_interception = false; transfer.req.notification.dispatch(.http_request_auth_required, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception }); if (wait_for_interception) { @@ -784,6 +918,7 @@ pub const Request = struct { headers: Http.Headers, body: ?[]const u8 = null, cookie_jar: *CookieJar, + robots: *RobotStore, resource_type: ResourceType, credentials: ?[:0]const u8 = null, notification: *Notification,
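
-- 
A minimal sketch of the cache flow this patch wires up, written as a Zig
test against the RobotStore API above. The bot name, origins, and rules are
hypothetical examples, and the import path assumes the sketch sits under src/:

    const std = @import("std");
    const Robots = @import("browser/Robots.zig");
    const RobotStore = Robots.RobotStore;

    test "RobotStore: cache flow sketch" {
        const allocator = std.testing.allocator;

        var store = RobotStore.init(allocator);
        defer store.deinit();

        const robots_url = "https://example.com/robots.txt";

        // First request to this origin: nothing cached yet, so the client
        // would fetch /robots.txt before the real request.
        try std.testing.expect(store.get(robots_url) == null);

        // Parse the fetched body and cache it under the robots.txt URL.
        const robots = try store.robotsFromBytes(
            "MyBot",
            "User-agent: MyBot\nDisallow: /admin/\n",
        );
        try store.put(robots_url, robots);

        // Every later request to this origin takes the fast path.
        switch (store.get(robots_url).?) {
            .present => |r| {
                try std.testing.expect(!r.isAllowed("/admin/"));
                try std.testing.expect(r.isAllowed("/public/"));
            },
            .absent => unreachable,
        }

        // A 404 is cached as an absence, so it too is fetched only once.
        try store.putAbsent("https://other.example/robots.txt");
        try std.testing.expect(store.get("https://other.example/robots.txt").? == .absent);
    }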