From e27803038c89712556c9a8425d7d8bfacc2cc402 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Fri, 23 Jan 2026 07:59:46 -0800 Subject: [PATCH 01/14] initial implementation of Robots --- src/browser/Robots.zig | 739 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 739 insertions(+) create mode 100644 src/browser/Robots.zig diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig new file mode 100644 index 00000000..5a4f9033 --- /dev/null +++ b/src/browser/Robots.zig @@ -0,0 +1,739 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +const std = @import("std"); + +pub const Rule = union(enum) { + allow: []const u8, + disallow: []const u8, +}; + +pub const Key = enum { + @"user-agent", + allow, + disallow, +}; + +/// https://www.rfc-editor.org/rfc/rfc9309.html +pub const Robots = @This(); +pub const empty: Robots = .{ .rules = &.{} }; + +rules: []const Rule, + +const State = enum { + not_in_entry, + in_other_entry, + in_our_entry, + in_wildcard_entry, +}; + +fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void { + for (rules) |rule| { + switch (rule) { + .allow => |value| allocator.free(value), + .disallow => |value| allocator.free(value), + } + } +} + +fn parseRulesWithUserAgent( + allocator: std.mem.Allocator, + user_agent: []const u8, + bytes: []const u8, +) ![]const Rule { + var rules: std.ArrayList(Rule) = .empty; + defer rules.deinit(allocator); + + var wildcard_rules: std.ArrayList(Rule) = .empty; + defer wildcard_rules.deinit(allocator); + + var state: State = .not_in_entry; + + var iter = std.mem.splitScalar(u8, bytes, '\n'); + while (iter.next()) |line| { + const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace); + + // Skip all comment lines. + if (std.mem.startsWith(u8, trimmed, "#")) continue; + + // Remove end of line comment. 
+ const true_line = if (std.mem.indexOfScalar(u8, trimmed, '#')) |pos| + std.mem.trimRight(u8, trimmed[0..pos], &std.ascii.whitespace) + else + trimmed; + + if (true_line.len == 0) { + state = .not_in_entry; + continue; + } + + const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse return error.MissingColon; + const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]); + defer allocator.free(key_str); + + const key = std.meta.stringToEnum(Key, key_str) orelse { + // log.warn(.browser, "robots key", .{ .key = key_str }); + continue; + }; + + const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace); + + switch (key) { + .@"user-agent" => switch (state) { + .in_other_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state = .in_our_entry; + } + }, + .in_our_entry => {}, + .in_wildcard_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state = .in_our_entry; + } + }, + .not_in_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state = .in_our_entry; + } else if (std.mem.eql(u8, "*", value)) { + state = .in_wildcard_entry; + } else { + state = .in_other_entry; + } + }, + }, + .allow => switch (state) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .allow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .allow = duped_value }); + }, + .not_in_entry => return error.UnexpectedRule, + }, + .disallow => switch (state) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .disallow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer 
allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .disallow = duped_value }); + }, + .not_in_entry => return error.UnexpectedRule, + }, + } + } + + if (rules.items.len > 0) { + freeRulesInList(allocator, wildcard_rules.items); + return try rules.toOwnedSlice(allocator); + } else { + freeRulesInList(allocator, rules.items); + return try wildcard_rules.toOwnedSlice(allocator); + } +} + +pub fn fromBytes(allocator: std.mem.Allocator, user_agent: []const u8, bytes: []const u8) !Robots { + const rules = try parseRulesWithUserAgent(allocator, user_agent, bytes); + return .{ .rules = rules }; +} + +pub fn deinit(self: *Robots, allocator: std.mem.Allocator) void { + freeRulesInList(allocator, self.rules); + allocator.free(self.rules); +} + +fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool { + if (pattern.len == 0) return true; + + const star_pos = std.mem.indexOfScalar(u8, pattern, '*') orelse { + if (exact_match) { + // If we end in '$', we must be exactly equal. + return std.mem.eql(u8, path, pattern); + } else { + // Otherwise, we are just a prefix. + return std.mem.startsWith(u8, path, pattern); + } + }; + + // Ensure the prefix before the '*' matches. + if (!std.mem.startsWith(u8, path, pattern[0..star_pos])) { + return false; + } + + const suffix_pattern = pattern[star_pos + 1 ..]; + if (suffix_pattern.len == 0) return true; + + var i: usize = star_pos; + while (i <= path.len) : (i += 1) { + if (matchPatternRecursive(suffix_pattern, path[i..], exact_match)) { + return true; + } + } + + return false; +} + +/// There are rules for how the pattern in robots.txt should be matched. +/// +/// * should match 0 or more of any character. +/// $ should signify the end of a path, making it exact. +/// otherwise, it is a prefix path. 
+fn matchPattern(pattern: []const u8, path: []const u8) ?usize { + if (pattern.len == 0) return 0; + const exact_match = pattern[pattern.len - 1] == '$'; + const inner_pattern = if (exact_match) pattern[0 .. pattern.len - 1] else pattern; + + if (matchPatternRecursive( + inner_pattern, + path, + exact_match, + )) return pattern.len else return null; +} + +pub fn isAllowed(self: *const Robots, path: []const u8) bool { + const rules = self.rules; + + var longest_match_len: usize = 0; + var is_allowed_result = true; + + for (rules) |rule| { + switch (rule) { + .allow => |pattern| { + if (matchPattern(pattern, path)) |len| { + // Longest or Last Wins. + if (len >= longest_match_len) { + longest_match_len = len; + is_allowed_result = true; + } + } + }, + .disallow => |pattern| { + if (pattern.len == 0) continue; + + if (matchPattern(pattern, path)) |len| { + // Longest or Last Wins. + if (len >= longest_match_len) { + longest_match_len = len; + is_allowed_result = false; + } + } + }, + } + } + + return is_allowed_result; +} + +test "Robots: simple robots.txt" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: * + \\Disallow: /private/ + \\Allow: /public/ + \\ + \\User-agent: Googlebot + \\Disallow: /admin/ + \\ + ; + + const rules = try parseRulesWithUserAgent(allocator, "GoogleBot", file); + defer { + freeRulesInList(allocator, rules); + allocator.free(rules); + } + + try std.testing.expectEqual(1, rules.len); + try std.testing.expectEqualStrings("/admin/", rules[0].disallow); +} + +test "Robots: matchPattern - simple prefix" { + try std.testing.expect(matchPattern("/admin", "/admin/page") != null); + try std.testing.expect(matchPattern("/admin", "/admin") != null); + try std.testing.expect(matchPattern("/admin", "/other") == null); + try std.testing.expect(matchPattern("/admin/page", "/admin") == null); +} + +test "Robots: matchPattern - single wildcard" { + try std.testing.expect(matchPattern("/admin/*", "/admin/") != null); + try 
std.testing.expect(matchPattern("/admin/*", "/admin/page") != null); + try std.testing.expect(matchPattern("/admin/*", "/admin/page/subpage") != null); + try std.testing.expect(matchPattern("/admin/*", "/other/page") == null); +} + +test "Robots: matchPattern - wildcard in middle" { + try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/xyz") != null); + try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/ghi/xyz") != null); + try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def") == null); + try std.testing.expect(matchPattern("/abc/*/xyz", "/other/def/xyz") == null); +} + +test "Robots: matchPattern - complex wildcard case" { + try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/def/def/xyz") != null); + try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz") != null); +} + +test "Robots: matchPattern - multiple wildcards" { + try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/b/y/c") != null); + try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/y/b/z/w/c") != null); + try std.testing.expect(matchPattern("/*.php", "/index.php") != null); + try std.testing.expect(matchPattern("/*.php", "/admin/index.php") != null); +} + +test "Robots: matchPattern - end anchor" { + try std.testing.expect(matchPattern("/*.php$", "/index.php") != null); + try std.testing.expect(matchPattern("/*.php$", "/index.php?param=value") == null); + try std.testing.expect(matchPattern("/admin$", "/admin") != null); + try std.testing.expect(matchPattern("/admin$", "/admin/") == null); + try std.testing.expect(matchPattern("/fish$", "/fish") != null); + try std.testing.expect(matchPattern("/fish$", "/fishheads") == null); +} + +test "Robots: matchPattern - wildcard with extension" { + try std.testing.expect(matchPattern("/fish*.php", "/fish.php") != null); + try std.testing.expect(matchPattern("/fish*.php", "/fishheads.php") != null); + try std.testing.expect(matchPattern("/fish*.php", "/fish/salmon.php") != null); + try 
std.testing.expect(matchPattern("/fish*.php", "/fish.asp") == null); +} + +test "Robots: matchPattern - empty and edge cases" { + try std.testing.expect(matchPattern("", "/anything") != null); + try std.testing.expect(matchPattern("/", "/") != null); + try std.testing.expect(matchPattern("*", "/anything") != null); + try std.testing.expect(matchPattern("/*", "/anything") != null); + try std.testing.expect(matchPattern("$", "") != null); +} + +test "Robots: matchPattern - real world examples" { + try std.testing.expect(matchPattern("/", "/anything") != null); + + try std.testing.expect(matchPattern("/admin/", "/admin/page") != null); + try std.testing.expect(matchPattern("/admin/", "/public/page") == null); + + try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf") != null); + try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf.bak") == null); + + try std.testing.expect(matchPattern("/*?", "/page?param=value") != null); + try std.testing.expect(matchPattern("/*?", "/page") == null); +} + +test "Robots: isAllowed - basic allow/disallow" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: MyBot + \\Disallow: /admin/ + \\Allow: /public/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/") == true); + try std.testing.expect(robots.isAllowed("/public/page") == true); + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/other/page") == true); +} + +test "Robots: isAllowed - longest match wins" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "TestBot", + \\User-agent: TestBot + \\Disallow: /admin/ + \\Allow: /admin/public/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/admin/public/page") == true); + try 
std.testing.expect(robots.isAllowed("/admin/public/") == true); +} + +test "Robots: isAllowed - specific user-agent vs wildcard" { + const allocator = std.testing.allocator; + + var robots1 = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + \\User-agent: * + \\Disallow: /admin/ + \\ + ); + defer robots1.deinit(allocator); + + try std.testing.expect(robots1.isAllowed("/private/page") == false); + try std.testing.expect(robots1.isAllowed("/admin/page") == true); + + // Test with other bot (should use wildcard) + var robots2 = try Robots.fromBytes(allocator, "OtherBot", + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + \\User-agent: * + \\Disallow: /admin/ + \\ + ); + defer robots2.deinit(allocator); + + try std.testing.expect(robots2.isAllowed("/private/page") == true); + try std.testing.expect(robots2.isAllowed("/admin/page") == false); +} + +test "Robots: isAllowed - case insensitive user-agent" { + const allocator = std.testing.allocator; + + var robots1 = try Robots.fromBytes(allocator, "googlebot", + \\User-agent: GoogleBot + \\Disallow: /private/ + \\ + ); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/private/") == false); + + var robots2 = try Robots.fromBytes(allocator, "GOOGLEBOT", + \\User-agent: GoogleBot + \\Disallow: /private/ + \\ + ); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/private/") == false); + + var robots3 = try Robots.fromBytes(allocator, "GoOgLeBoT", + \\User-agent: GoogleBot + \\Disallow: /private/ + \\ + ); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/private/") == false); +} + +test "Robots: isAllowed - merged rules for same agent" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\Disallow: /admin/ + \\ + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + ); + defer 
robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/page") == false); + try std.testing.expect(robots.isAllowed("/private/page") == false); + try std.testing.expect(robots.isAllowed("/public/page") == true); +} + +test "Robots: isAllowed - wildcards in patterns" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\Disallow: /*.php$ + \\Allow: /index.php$ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/page.php") == false); + try std.testing.expect(robots.isAllowed("/index.php") == true); + try std.testing.expect(robots.isAllowed("/page.php?param=1") == true); + try std.testing.expect(robots.isAllowed("/page.html") == true); +} + +test "Robots: isAllowed - empty disallow allows everything" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\Disallow: + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/anything") == true); + try std.testing.expect(robots.isAllowed("/") == true); +} + +test "Robots: isAllowed - no rules" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", ""); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/anything") == true); +} + +test "Robots: isAllowed - disallow all" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\Disallow: / + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/") == false); + try std.testing.expect(robots.isAllowed("/anything") == false); + try std.testing.expect(robots.isAllowed("/admin/page") == false); +} + +test "Robots: isAllowed - multiple user-agents in same entry" { + const allocator = std.testing.allocator; + + var robots1 = try Robots.fromBytes(allocator, "Googlebot", + 
\\User-agent: Googlebot + \\User-agent: Bingbot + \\Disallow: /private/ + \\ + ); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/private/") == false); + + var robots2 = try Robots.fromBytes(allocator, "Bingbot", + \\User-agent: Googlebot + \\User-agent: Bingbot + \\Disallow: /private/ + \\ + ); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/private/") == false); + + var robots3 = try Robots.fromBytes(allocator, "OtherBot", + \\User-agent: Googlebot + \\User-agent: Bingbot + \\Disallow: /private/ + \\ + ); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/private/") == true); +} + +test "Robots: isAllowed - wildcard fallback" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "UnknownBot", + \\User-agent: * + \\Disallow: /admin/ + \\Allow: /admin/public/ + \\ + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/admin/public/page") == true); + try std.testing.expect(robots.isAllowed("/private/") == true); +} + +test "Robots: isAllowed - complex real-world example" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: * + \\Disallow: /cgi-bin/ + \\Disallow: /tmp/ + \\Disallow: /private/ + \\ + \\User-agent: MyBot + \\Disallow: /admin/ + \\Disallow: /*.pdf$ + \\Allow: /public/*.pdf$ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/") == true); + try std.testing.expect(robots.isAllowed("/admin/dashboard") == false); + try std.testing.expect(robots.isAllowed("/docs/guide.pdf") == false); + try std.testing.expect(robots.isAllowed("/public/manual.pdf") == true); + try std.testing.expect(robots.isAllowed("/page.html") == true); + try 
std.testing.expect(robots.isAllowed("/cgi-bin/script.sh") == true); +} + +test "Robots: isAllowed - order doesn't matter for same length" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\ # WOW!! + \\Allow: /page + \\Disallow: /page + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/page") == false); +} + +test "Robots: isAllowed - empty file uses wildcard defaults" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: * # ABCDEF!!! + \\Disallow: /admin/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/") == false); + try std.testing.expect(robots.isAllowed("/public/") == true); +} +test "Robots: isAllowed - wildcard entry with multiple user-agents including specific" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: * + \\User-agent: Googlebot + \\Disallow: /shared/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/shared/") == false); + try std.testing.expect(robots.isAllowed("/other/") == true); + + var robots2 = try Robots.fromBytes(allocator, "Bingbot", + \\User-agent: * + \\User-agent: Googlebot + \\Disallow: /shared/ + \\ + ); + defer robots2.deinit(allocator); + + try std.testing.expect(robots2.isAllowed("/shared/") == false); +} + +test "Robots: isAllowed - specific agent appears after wildcard in entry" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: * + \\User-agent: MyBot + \\User-agent: Bingbot + \\Disallow: /admin/ + \\Allow: /admin/public/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/admin/public/page") == true); +} + +test "Robots: 
isAllowed - wildcard should not override specific entry" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + \\User-agent: * + \\User-agent: Googlebot + \\Disallow: /admin/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/private/") == false); + try std.testing.expect(robots.isAllowed("/admin/") == false); +} + +test "Robots: isAllowed - Google's real robots.txt" { + const allocator = std.testing.allocator; + + // Simplified version of google.com/robots.txt + const google_robots = + \\User-agent: * + \\User-agent: Yandex + \\Disallow: /search + \\Allow: /search/about + \\Allow: /search/howsearchworks + \\Disallow: /imgres + \\Disallow: /m? + \\Disallow: /m/ + \\Allow: /m/finance + \\Disallow: /maps/ + \\Allow: /maps/$ + \\Allow: /maps/@ + \\Allow: /maps/dir/ + \\Disallow: /shopping? + \\Allow: /shopping?udm=28$ + \\ + \\User-agent: AdsBot-Google + \\Disallow: /maps/api/js/ + \\Allow: /maps/api/js + \\Disallow: /maps/api/staticmap + \\ + \\User-agent: Yandex + \\Disallow: /about/careers/applications/jobs/results + \\ + \\User-agent: facebookexternalhit + \\User-agent: Twitterbot + \\Allow: /imgres + \\Allow: /search + \\Disallow: /groups + \\Disallow: /m/ + \\ + ; + + var regular_bot = try Robots.fromBytes(allocator, "Googlebot", google_robots); + defer regular_bot.deinit(allocator); + + try std.testing.expect(regular_bot.isAllowed("/") == true); + try std.testing.expect(regular_bot.isAllowed("/search") == false); + try std.testing.expect(regular_bot.isAllowed("/search/about") == true); + try std.testing.expect(regular_bot.isAllowed("/search/howsearchworks") == true); + try std.testing.expect(regular_bot.isAllowed("/imgres") == false); + try std.testing.expect(regular_bot.isAllowed("/m/finance") == true); + try std.testing.expect(regular_bot.isAllowed("/m/other") == false); + try 
std.testing.expect(regular_bot.isAllowed("/maps/") == true); + try std.testing.expect(regular_bot.isAllowed("/maps/@") == true); + try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28") == true); + try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28&extra") == false); + + var adsbot = try Robots.fromBytes(allocator, "AdsBot-Google", google_robots); + defer adsbot.deinit(allocator); + + try std.testing.expect(adsbot.isAllowed("/maps/api/js") == true); + try std.testing.expect(adsbot.isAllowed("/maps/api/js/") == false); + try std.testing.expect(adsbot.isAllowed("/maps/api/staticmap") == false); + + var twitterbot = try Robots.fromBytes(allocator, "Twitterbot", google_robots); + defer twitterbot.deinit(allocator); + + try std.testing.expect(twitterbot.isAllowed("/imgres") == true); + try std.testing.expect(twitterbot.isAllowed("/search") == true); + try std.testing.expect(twitterbot.isAllowed("/groups") == false); + try std.testing.expect(twitterbot.isAllowed("/m/") == false); +} From 48ebc46c5fa9bd8261c07e5cb8b5c7343b573ea5 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Fri, 23 Jan 2026 16:29:41 -0800 Subject: [PATCH 02/14] add getRobotsUrl to URL --- src/browser/URL.zig | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/browser/URL.zig b/src/browser/URL.zig index d36673cc..1e5d272a 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -502,6 +502,16 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons return buf.items[0 .. 
buf.items.len - 1 :0]; } +pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) !?[:0]const u8 { + const origin = try getOrigin(arena, url) orelse return null; + return try std.fmt.allocPrintSentinel( + arena, + "{s}/robots.txt", + .{origin}, + 0, + ); +} + const testing = @import("../testing.zig"); test "URL: isCompleteHTTPUrl" { try testing.expectEqual(true, isCompleteHTTPUrl("http://example.com/about")); @@ -778,3 +788,31 @@ test "URL: concatQueryString" { try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url); } } + +test "URL: getRobotsUrl" { + defer testing.reset(); + const arena = testing.arena_allocator; + + { + const url = try getRobotsUrl(arena, "https://www.lightpanda.io"); + try testing.expectEqual("https://www.lightpanda.io/robots.txt", url.?); + } + + { + const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path"); + try testing.expectString("https://www.lightpanda.io/robots.txt", url.?); + } + + { + const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page"); + try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url.?); + } + { + const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment"); + try testing.expectString("http://example.com/robots.txt", url.?); + } + { + const url = try getRobotsUrl(arena, "https://user:pass@example.com/page"); + try testing.expectString("https://example.com/robots.txt", url.?); + } +} From 1a246f2e380f965a4602b4b2d9c5db8c72864dee Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Sat, 31 Jan 2026 18:41:55 -0800 Subject: [PATCH 03/14] robots in the actual http client --- src/App.zig | 5 + src/Config.zig | 18 ++ src/browser/Page.zig | 1 + src/browser/Robots.zig | 239 +++++++++++++++++----- src/browser/ScriptManager.zig | 3 + src/browser/URL.zig | 14 +- src/browser/webapi/net/Fetch.zig | 1 + src/browser/webapi/net/XMLHttpRequest.zig | 1 + src/http/Client.zig | 137 ++++++++++++- 9 files changed, 357 insertions(+), 62 
deletions(-) diff --git a/src/App.zig b/src/App.zig index 21b0ecc6..76ffd396 100644 --- a/src/App.zig +++ b/src/App.zig @@ -25,6 +25,7 @@ const Config = @import("Config.zig"); const Snapshot = @import("browser/js/Snapshot.zig"); const Platform = @import("browser/js/Platform.zig"); const Telemetry = @import("telemetry/telemetry.zig").Telemetry; +const RobotStore = @import("browser/Robots.zig").RobotStore; pub const Http = @import("http/Http.zig"); pub const ArenaPool = @import("ArenaPool.zig"); @@ -38,6 +39,7 @@ snapshot: Snapshot, telemetry: Telemetry, allocator: Allocator, arena_pool: ArenaPool, +robots: RobotStore, app_dir_path: ?[]const u8, shutdown: bool = false, @@ -57,6 +59,8 @@ pub fn init(allocator: Allocator, config: *const Config) !*App { app.snapshot = try Snapshot.load(); errdefer app.snapshot.deinit(); + app.robots = RobotStore.init(allocator); + app.app_dir_path = getAndMakeAppDir(allocator); app.telemetry = try Telemetry.init(app, config.mode); @@ -79,6 +83,7 @@ pub fn deinit(self: *App) void { self.app_dir_path = null; } self.telemetry.deinit(); + self.robots.deinit(); self.http.deinit(); self.snapshot.deinit(); self.platform.deinit(); diff --git a/src/Config.zig b/src/Config.zig index fc4ebcdd..0f285f98 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -57,6 +57,13 @@ pub fn tlsVerifyHost(self: *const Config) bool { }; } +pub fn obeyRobots(self: *const Config) bool { + return switch (self.mode) { + inline .serve, .fetch => |opts| opts.common.obey_robots, + else => unreachable, + }; +} + pub fn httpProxy(self: *const Config) ?[:0]const u8 { return switch (self.mode) { inline .serve, .fetch => |opts| opts.common.http_proxy, @@ -158,6 +165,7 @@ pub const Fetch = struct { }; pub const Common = struct { + obey_robots: bool = false, proxy_bearer_token: ?[:0]const u8 = null, http_proxy: ?[:0]const u8 = null, http_max_concurrent: ?u8 = null, @@ -223,6 +231,11 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void { \\ advanced option which 
should only be set if you understand \\ and accept the risk of disabling host verification. \\ + \\--obey_robots + \\ Fetches and obeys the robots.txt (if available) of the web pages + \\ we make requests towards. + \\ Defaults to false. + \\ \\--http_proxy The HTTP proxy to use for all HTTP requests. \\ A username:password can be included for basic authentication. \\ Defaults to none. @@ -613,6 +626,11 @@ fn parseCommonArg( return true; } + if (std.mem.eql(u8, "--obey_robots", opt)) { + common.obey_robots = true; + return true; + } + if (std.mem.eql(u8, "--http_proxy", opt)) { const str = args.next() orelse { log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" }); diff --git a/src/browser/Page.zig b/src/browser/Page.zig index 8e86c47a..d879a813 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -559,6 +559,7 @@ pub fn navigate(self: *Page, request_url: [:0]const u8, opts: NavigateOpts) !voi .headers = headers, .body = opts.body, .cookie_jar = &self._session.cookie_jar, + .robots = &self._session.browser.app.robots, .resource_type = .document, .notification = self._session.notification, .header_callback = pageHeaderDoneCallback, diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig index 5a4f9033..2aff774a 100644 --- a/src/browser/Robots.zig +++ b/src/browser/Robots.zig @@ -33,13 +33,80 @@ pub const Key = enum { pub const Robots = @This(); pub const empty: Robots = .{ .rules = &.{} }; +pub const RobotStore = struct { + const RobotsEntry = union(enum) { + present: Robots, + absent, + }; + + pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct { + const Context = @This(); + + pub fn hash(_: Context, value: []const u8) u32 { + var hasher = std.hash.Wyhash.init(value.len); + for (value) |c| { + std.hash.autoHash(&hasher, std.ascii.toLower(c)); + } + return @truncate(hasher.final()); + } + + pub fn eql(_: Context, a: []const u8, b: []const u8) bool { + if (a.len != b.len) return false; + return 
std.ascii.eqlIgnoreCase(a, b); + } + }, 80); + + allocator: std.mem.Allocator, + map: RobotsMap, + + pub fn init(allocator: std.mem.Allocator) RobotStore { + return .{ .allocator = allocator, .map = .empty }; + } + + pub fn deinit(self: *RobotStore) void { + var iter = self.map.iterator(); + + while (iter.next()) |entry| { + self.allocator.free(entry.key_ptr.*); + + switch (entry.value_ptr.*) { + .present => |*robots| robots.deinit(self.allocator), + .absent => {}, + } + } + + self.map.deinit(self.allocator); + } + + pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry { + return self.map.get(url); + } + + pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots { + return try Robots.fromBytes(self.allocator, user_agent, bytes); + } + + pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void { + const duped = try self.allocator.dupe(u8, url); + try self.map.put(self.allocator, duped, .{ .present = robots }); + } + + pub fn putAbsent(self: *RobotStore, url: []const u8) !void { + const duped = try self.allocator.dupe(u8, url); + try self.map.put(self.allocator, duped, .absent); + } +}; + rules: []const Rule, -const State = enum { - not_in_entry, - in_other_entry, - in_our_entry, - in_wildcard_entry, +const State = struct { + entry: enum { + not_in_entry, + in_other_entry, + in_our_entry, + in_wildcard_entry, + }, + has_rules: bool = false, }; fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void { @@ -62,7 +129,7 @@ fn parseRulesWithUserAgent( var wildcard_rules: std.ArrayList(Rule) = .empty; defer wildcard_rules.deinit(allocator); - var state: State = .not_in_entry; + var state: State = .{ .entry = .not_in_entry, .has_rules = false }; var iter = std.mem.splitScalar(u8, bytes, '\n'); while (iter.next()) |line| { @@ -78,7 +145,6 @@ fn parseRulesWithUserAgent( trimmed; if (true_line.len == 0) { - state = .not_in_entry; continue; } @@ -94,55 +160,69 @@ fn parseRulesWithUserAgent( const value = 
std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace); switch (key) { - .@"user-agent" => switch (state) { - .in_other_entry => { - if (std.ascii.eqlIgnoreCase(user_agent, value)) { - state = .in_our_entry; - } - }, - .in_our_entry => {}, - .in_wildcard_entry => { - if (std.ascii.eqlIgnoreCase(user_agent, value)) { - state = .in_our_entry; - } - }, - .not_in_entry => { - if (std.ascii.eqlIgnoreCase(user_agent, value)) { - state = .in_our_entry; - } else if (std.mem.eql(u8, "*", value)) { - state = .in_wildcard_entry; - } else { - state = .in_other_entry; - } - }, + .@"user-agent" => { + if (state.has_rules) { + state = .{ .entry = .not_in_entry, .has_rules = false }; + } + + switch (state.entry) { + .in_other_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } + }, + .in_our_entry => {}, + .in_wildcard_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } + }, + .not_in_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } else if (std.mem.eql(u8, "*", value)) { + state.entry = .in_wildcard_entry; + } else { + state.entry = .in_other_entry; + } + }, + } }, - .allow => switch (state) { - .in_our_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try rules.append(allocator, .{ .allow = duped_value }); - }, - .in_other_entry => {}, - .in_wildcard_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try wildcard_rules.append(allocator, .{ .allow = duped_value }); - }, - .not_in_entry => return error.UnexpectedRule, + .allow => { + defer state.has_rules = true; + + switch (state.entry) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .allow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + 
const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .allow = duped_value }); + }, + .not_in_entry => return error.UnexpectedRule, + } }, - .disallow => switch (state) { - .in_our_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try rules.append(allocator, .{ .disallow = duped_value }); - }, - .in_other_entry => {}, - .in_wildcard_entry => { - const duped_value = try allocator.dupe(u8, value); - errdefer allocator.free(duped_value); - try wildcard_rules.append(allocator, .{ .disallow = duped_value }); - }, - .not_in_entry => return error.UnexpectedRule, + .disallow => { + defer state.has_rules = true; + + switch (state.entry) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .disallow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .disallow = duped_value }); + }, + .not_in_entry => return error.UnexpectedRule, + } }, } } @@ -737,3 +817,54 @@ test "Robots: isAllowed - Google's real robots.txt" { try std.testing.expect(twitterbot.isAllowed("/groups") == false); try std.testing.expect(twitterbot.isAllowed("/m/") == false); } + +test "Robots: user-agent after rules starts new entry" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: Bot1 + \\User-agent: Bot2 + \\Disallow: /admin/ + \\Allow: /public/ + \\User-agent: Bot3 + \\Disallow: /private/ + \\ + ; + + var robots1 = try Robots.fromBytes(allocator, "Bot1", file); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/admin/") == false); + try std.testing.expect(robots1.isAllowed("/public/") == true); + try std.testing.expect(robots1.isAllowed("/private/") == true); + + var 
robots2 = try Robots.fromBytes(allocator, "Bot2", file); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/admin/") == false); + try std.testing.expect(robots2.isAllowed("/public/") == true); + try std.testing.expect(robots2.isAllowed("/private/") == true); + + var robots3 = try Robots.fromBytes(allocator, "Bot3", file); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/admin/") == true); + try std.testing.expect(robots3.isAllowed("/public/") == true); + try std.testing.expect(robots3.isAllowed("/private/") == false); +} + +test "Robots: blank lines don't end entries" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: MyBot + \\Disallow: /admin/ + \\ + \\ + \\Allow: /public/ + \\ + ; + + var robots = try Robots.fromBytes(allocator, "MyBot", file); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/") == false); + try std.testing.expect(robots.isAllowed("/public/") == true); +} diff --git a/src/browser/ScriptManager.zig b/src/browser/ScriptManager.zig index 344d6232..01c56a81 100644 --- a/src/browser/ScriptManager.zig +++ b/src/browser/ScriptManager.zig @@ -265,6 +265,7 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e .headers = try self.getHeaders(url), .blocking = is_blocking, .cookie_jar = &page._session.cookie_jar, + .robots = &page._session.browser.app.robots, .resource_type = .script, .notification = page._session.notification, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, @@ -380,6 +381,7 @@ pub fn preloadImport(self: *ScriptManager, url: [:0]const u8, referrer: []const .method = .GET, .headers = try self.getHeaders(url), .cookie_jar = &self.page._session.cookie_jar, + .robots = &self.page._session.browser.app.robots, .resource_type = .script, .notification = self.page._session.notification, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else 
null, @@ -484,6 +486,7 @@ pub fn getAsyncImport(self: *ScriptManager, url: [:0]const u8, cb: ImportAsync.C .resource_type = .script, .cookie_jar = &self.page._session.cookie_jar, .notification = self.page._session.notification, + .robots = &self.page._session.browser.app.robots, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, .header_callback = Script.headerCallback, .data_callback = Script.dataCallback, diff --git a/src/browser/URL.zig b/src/browser/URL.zig index 1e5d272a..716480b1 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -502,8 +502,8 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons return buf.items[0 .. buf.items.len - 1 :0]; } -pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) !?[:0]const u8 { - const origin = try getOrigin(arena, url) orelse return null; +pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 { + const origin = try getOrigin(arena, url) orelse return error.NoOrigin; return try std.fmt.allocPrintSentinel( arena, "{s}/robots.txt", @@ -795,24 +795,24 @@ test "URL: getRobotsUrl" { { const url = try getRobotsUrl(arena, "https://www.lightpanda.io"); - try testing.expectEqual("https://www.lightpanda.io/robots.txt", url.?); + try testing.expectEqual("https://www.lightpanda.io/robots.txt", url); } { const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path"); - try testing.expectString("https://www.lightpanda.io/robots.txt", url.?); + try testing.expectString("https://www.lightpanda.io/robots.txt", url); } { const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page"); - try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url.?); + try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url); } { const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment"); - try testing.expectString("http://example.com/robots.txt", url.?); + try 
testing.expectString("http://example.com/robots.txt", url); } { const url = try getRobotsUrl(arena, "https://user:pass@example.com/page"); - try testing.expectString("https://example.com/robots.txt", url.?); + try testing.expectString("https://example.com/robots.txt", url); } } diff --git a/src/browser/webapi/net/Fetch.zig b/src/browser/webapi/net/Fetch.zig index a66fb311..988e9a53 100644 --- a/src/browser/webapi/net/Fetch.zig +++ b/src/browser/webapi/net/Fetch.zig @@ -79,6 +79,7 @@ pub fn init(input: Input, options: ?InitOpts, page: *Page) !js.Promise { .resource_type = .fetch, .cookie_jar = &page._session.cookie_jar, .notification = page._session.notification, + .robots = &page._session.browser.app.robots, .start_callback = httpStartCallback, .header_callback = httpHeaderDoneCallback, .data_callback = httpDataCallback, diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig index 7c266e1a..296048b3 100644 --- a/src/browser/webapi/net/XMLHttpRequest.zig +++ b/src/browser/webapi/net/XMLHttpRequest.zig @@ -208,6 +208,7 @@ pub fn send(self: *XMLHttpRequest, body_: ?[]const u8) !void { .headers = headers, .body = self._request_body, .cookie_jar = &page._session.cookie_jar, + .robots = &page._session.browser.app.robots, .resource_type = .xhr, .notification = page._session.notification, .start_callback = httpStartCallback, diff --git a/src/http/Client.zig b/src/http/Client.zig index cc61b681..a9c21e0c 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -27,6 +27,8 @@ const Config = @import("../Config.zig"); const URL = @import("../browser/URL.zig"); const Notification = @import("../Notification.zig"); const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar; +const Robots = @import("../browser/Robots.zig"); +const RobotStore = Robots.RobotStore; const c = Http.c; const posix = std.posix; @@ -217,6 +219,36 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus { } pub fn request(self: *Client, req: 
Request) !void { + if (self.config.obeyRobots()) { + const robots_url = try URL.getRobotsUrl(self.allocator, req.url); + + // If we have this robots cached, we can take a fast path. + if (req.robots.get(robots_url)) |robot_entry| { + defer self.allocator.free(robots_url); + + switch (robot_entry) { + // If we have a found robots entry, we check it. + .present => |robots| { + const path = URL.getPathname(req.url); + if (!robots.isAllowed(path)) { + req.error_callback(req.ctx, error.RobotsBlocked); + return; + } + }, + // Otherwise, we assume we won't find it again. + .absent => {}, + } + + return self.processRequest(req); + } + + return self.fetchRobotsThenProcessRequest(robots_url, req); + } + + return self.processRequest(req); +} + +fn processRequest(self: *Client, req: Request) !void { const transfer = try self.makeTransfer(req); transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer }); @@ -246,6 +278,108 @@ pub fn request(self: *Client, req: Request) !void { } } +const RobotsRequestContext = struct { + client: *Client, + req: Request, + robots_url: [:0]const u8, + buffer: std.ArrayList(u8), + status: u16 = 0, +}; + +fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void { + const ctx = try self.allocator.create(RobotsRequestContext); + ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty }; + + const headers = try self.newHeaders(); + + log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url }); + try self.processRequest(.{ + .ctx = ctx, + .url = robots_url, + .method = .GET, + .headers = headers, + .blocking = false, + .cookie_jar = req.cookie_jar, + .notification = req.notification, + .robots = req.robots, + .resource_type = .fetch, + .header_callback = robotsHeaderCallback, + .data_callback = robotsDataCallback, + .done_callback = robotsDoneCallback, + .error_callback = robotsErrorCallback, + }); +} + +fn robotsHeaderCallback(transfer: *Http.Transfer) 
!bool { + const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx)); + + if (transfer.response_header) |hdr| { + log.debug(.browser, "robots status", .{ .status = hdr.status }); + ctx.status = hdr.status; + } + + if (transfer.getContentLength()) |cl| { + try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl); + } + + return true; +} + +fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void { + const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx)); + try ctx.buffer.appendSlice(ctx.client.allocator, data); +} + +fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { + const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr)); + defer ctx.client.allocator.destroy(ctx); + defer ctx.buffer.deinit(ctx.client.allocator); + defer ctx.client.allocator.free(ctx.robots_url); + + var allowed = true; + + if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) { + const robots = try ctx.req.robots.robotsFromBytes( + ctx.client.config.http_headers.user_agent, + ctx.buffer.items, + ); + + try ctx.req.robots.put(ctx.robots_url, robots); + + const path = URL.getPathname(ctx.req.url); + allowed = robots.isAllowed(path); + } + + // If not found, store as Not Found. 
+ if (ctx.status == 404) { + log.debug(.http, "robots not found", .{ .url = ctx.robots_url }); + try ctx.req.robots.putAbsent(ctx.robots_url); + } + + if (!allowed) { + log.warn(.http, "blocked by robots", .{ .url = ctx.req.url }); + ctx.req.error_callback(ctx.req.ctx, error.RobotsBlocked); + return; + } + + // Now process the original request + try ctx.client.processRequest(ctx.req); +} + +fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void { + const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr)); + defer ctx.client.allocator.destroy(ctx); + defer ctx.buffer.deinit(ctx.client.allocator); + defer ctx.client.allocator.free(ctx.robots_url); + + log.warn(.http, "robots fetch failed", .{ .err = err }); + + // On error, allow the request to proceed + ctx.client.processRequest(ctx.req) catch |e| { + ctx.req.error_callback(ctx.req.ctx, e); + }; +} + fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool { // The request was intercepted and is blocking. This is messy, but our // callers, the ScriptManager -> Page, don't have a great way to stop the @@ -565,7 +699,7 @@ fn processMessages(self: *Client) !bool { // In case of auth challenge // TODO give a way to configure the number of auth retries. 
- if (transfer._auth_challenge != null and transfer._tries < 10) { + if (transfer._auth_challenge != null and transfer._tries < 10) { var wait_for_interception = false; transfer.req.notification.dispatch(.http_request_auth_required, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception }); if (wait_for_interception) { @@ -784,6 +918,7 @@ pub const Request = struct { headers: Http.Headers, body: ?[]const u8 = null, cookie_jar: *CookieJar, + robots: *RobotStore, resource_type: ResourceType, credentials: ?[:0]const u8 = null, notification: *Notification, From e4f250435d95d9f9475bb489d912b524c18a33f6 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:03:34 -0800 Subject: [PATCH 04/14] include robots url in debug log --- src/http/Client.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/http/Client.zig b/src/http/Client.zig index a9c21e0c..d6d77271 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -314,7 +314,7 @@ fn robotsHeaderCallback(transfer: *Http.Transfer) !bool { const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx)); if (transfer.response_header) |hdr| { - log.debug(.browser, "robots status", .{ .status = hdr.status }); + log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = ctx.robots_url }); ctx.status = hdr.status; } From b6af5884b11c387b4905678d6d1e6feda7adcca2 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:05:24 -0800 Subject: [PATCH 05/14] use RobotsRequestContext deinit --- src/http/Client.zig | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/http/Client.zig b/src/http/Client.zig index d6d77271..f3fa923f 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -284,6 +284,12 @@ const RobotsRequestContext = struct { robots_url: [:0]const u8, buffer: std.ArrayList(u8), status: u16 = 0, + + pub fn deinit(self: *RobotsRequestContext) void { + 
self.client.allocator.free(self.robots_url); + self.buffer.deinit(self.client.allocator); + self.client.allocator.destroy(self); + } }; fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void { @@ -332,9 +338,7 @@ fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void { fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr)); - defer ctx.client.allocator.destroy(ctx); - defer ctx.buffer.deinit(ctx.client.allocator); - defer ctx.client.allocator.free(ctx.robots_url); + defer ctx.deinit(); var allowed = true; @@ -348,10 +352,7 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { const path = URL.getPathname(ctx.req.url); allowed = robots.isAllowed(path); - } - - // If not found, store as Not Found. - if (ctx.status == 404) { + } else if (ctx.status == 404) { log.debug(.http, "robots not found", .{ .url = ctx.robots_url }); try ctx.req.robots.putAbsent(ctx.robots_url); } @@ -368,9 +369,7 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void { const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr)); - defer ctx.client.allocator.destroy(ctx); - defer ctx.buffer.deinit(ctx.client.allocator); - defer ctx.client.allocator.free(ctx.robots_url); + defer ctx.deinit(); log.warn(.http, "robots fetch failed", .{ .err = err }); From f9104c71f6b55b6e7e9c156c0966c754cb667282 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:10:07 -0800 Subject: [PATCH 06/14] log instead of returning error on unexpected rule --- src/browser/Robots.zig | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig index 2aff774a..a11026cd 100644 --- a/src/browser/Robots.zig +++ b/src/browser/Robots.zig @@ -17,6 +17,7 @@ // along with this program. If not, see . 
const std = @import("std"); +const log = @import("../log.zig"); pub const Rule = union(enum) { allow: []const u8, @@ -203,7 +204,10 @@ fn parseRulesWithUserAgent( errdefer allocator.free(duped_value); try wildcard_rules.append(allocator, .{ .allow = duped_value }); }, - .not_in_entry => return error.UnexpectedRule, + .not_in_entry => { + log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" }); + continue; + }, } }, .disallow => { @@ -221,7 +225,10 @@ fn parseRulesWithUserAgent( errdefer allocator.free(duped_value); try wildcard_rules.append(allocator, .{ .disallow = duped_value }); }, - .not_in_entry => return error.UnexpectedRule, + .not_in_entry => { + log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" }); + continue; + }, } }, } From 29ee7d41f5cf62de729e3a96586df9e2eeef1119 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:30:27 -0800 Subject: [PATCH 07/14] queue requests to run after robots is fetched --- src/http/Client.zig | 99 +++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/src/http/Client.zig b/src/http/Client.zig index f3fa923f..60b01047 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -87,6 +87,10 @@ queue: TransferQueue, // The main app allocator allocator: Allocator, +// Queue of requests that depend on a robots.txt. +// Allows us to fetch the robots.txt just once. +pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty, + // Once we have a handle/easy to process a request with, we create a Transfer // which contains the Request as well as any state we need to process the // request. These wil come and go with each request. 
@@ -165,6 +169,13 @@ pub fn deinit(self: *Client) void { _ = c.curl_multi_cleanup(self.multi); self.transfer_pool.deinit(); + + var robots_iter = self.pending_robots_queue.iterator(); + while (robots_iter.next()) |entry| { + entry.value_ptr.deinit(self.allocator); + } + self.pending_robots_queue.deinit(self.allocator); + self.allocator.destroy(self); } @@ -254,7 +265,10 @@ fn processRequest(self: *Client, req: Request) !void { transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer }); var wait_for_interception = false; - transfer.req.notification.dispatch(.http_request_intercept, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception }); + transfer.req.notification.dispatch(.http_request_intercept, &.{ + .transfer = transfer, + .wait_for_interception = &wait_for_interception, + }); if (wait_for_interception == false) { // request not intercepted, process it normally return self.process(transfer); @@ -293,27 +307,36 @@ const RobotsRequestContext = struct { }; fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void { - const ctx = try self.allocator.create(RobotsRequestContext); - ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty }; + const entry = try self.pending_robots_queue.getOrPut(self.allocator, robots_url); - const headers = try self.newHeaders(); + if (!entry.found_existing) { + // If we aren't already fetching this robots, + // we want to create a new queue for it and add this request into it. 
+ entry.value_ptr.* = .empty; - log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url }); - try self.processRequest(.{ - .ctx = ctx, - .url = robots_url, - .method = .GET, - .headers = headers, - .blocking = false, - .cookie_jar = req.cookie_jar, - .notification = req.notification, - .robots = req.robots, - .resource_type = .fetch, - .header_callback = robotsHeaderCallback, - .data_callback = robotsDataCallback, - .done_callback = robotsDoneCallback, - .error_callback = robotsErrorCallback, - }); + const ctx = try self.allocator.create(RobotsRequestContext); + ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty }; + const headers = try self.newHeaders(); + + log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url }); + try self.processRequest(.{ + .ctx = ctx, + .url = robots_url, + .method = .GET, + .headers = headers, + .blocking = false, + .cookie_jar = req.cookie_jar, + .notification = req.notification, + .robots = req.robots, + .resource_type = .fetch, + .header_callback = robotsHeaderCallback, + .data_callback = robotsDataCallback, + .done_callback = robotsDoneCallback, + .error_callback = robotsErrorCallback, + }); + } + + try entry.value_ptr.append(self.allocator, req); } fn robotsHeaderCallback(transfer: *Http.Transfer) !bool { @@ -357,14 +380,22 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { try ctx.req.robots.putAbsent(ctx.robots_url); } - if (!allowed) { - log.warn(.http, "blocked by robots", .{ .url = ctx.req.url }); - ctx.req.error_callback(ctx.req.ctx, error.RobotsBlocked); - return; + const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable; + defer { + queued.deinit(ctx.client.allocator); + _ = ctx.client.pending_robots_queue.remove(ctx.robots_url); } - // Now process the original request - try ctx.client.processRequest(ctx.req); + for (queued.items) |queued_req| { + if (!allowed) { + log.warn(.http, "blocked by robots", .{ .url = queued_req.url }); 
+ queued_req.error_callback(queued_req.ctx, error.RobotsBlocked); + } else { + ctx.client.processRequest(queued_req) catch |e| { + queued_req.error_callback(queued_req.ctx, e); + }; + } + } } fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void { @@ -373,10 +404,18 @@ fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void { log.warn(.http, "robots fetch failed", .{ .err = err }); - // On error, allow the request to proceed - ctx.client.processRequest(ctx.req) catch |e| { - ctx.req.error_callback(ctx.req.ctx, e); - }; + const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable; + defer { + queued.deinit(ctx.client.allocator); + _ = ctx.client.pending_robots_queue.remove(ctx.robots_url); + } + + // On error, allow all queued requests to proceed + for (queued.items) |queued_req| { + ctx.client.processRequest(queued_req) catch |e| { + queued_req.error_callback(queued_req.ctx, e); + }; + } } fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool { From e620c28a1c2536bb86208c8a8a0767f4c42791c5 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:35:48 -0800 Subject: [PATCH 08/14] stop leaking robots_url when in robot queue --- src/http/Client.zig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/http/Client.zig b/src/http/Client.zig index 60b01047..91ab02d1 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -232,6 +232,7 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus { pub fn request(self: *Client, req: Request) !void { if (self.config.obeyRobots()) { const robots_url = try URL.getRobotsUrl(self.allocator, req.url); + errdefer self.allocator.free(robots_url); // If we have this robots cached, we can take a fast path. 
if (req.robots.get(robots_url)) |robot_entry| { @@ -334,6 +335,9 @@ fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: R .done_callback = robotsDoneCallback, .error_callback = robotsErrorCallback, }); + } else { + // Not using our own robots URL, only using the one from the first request. + self.allocator.free(robots_url); } try entry.value_ptr.append(self.allocator, req); From 50aeb9ff21f571f33ca3f5116741f058b8bb81b5 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:39:39 -0800 Subject: [PATCH 09/14] add comment explaining rule choice in robots --- src/browser/Robots.zig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig index a11026cd..6b6062b1 100644 --- a/src/browser/Robots.zig +++ b/src/browser/Robots.zig @@ -234,6 +234,8 @@ fn parseRulesWithUserAgent( } } + // If we have rules for our specific User-Agent, we will use those rules. + // If we don't have any rules, we fallback to using the wildcard ("*") rules. 
if (rules.items.len > 0) { freeRulesInList(allocator, wildcard_rules.items); return try rules.toOwnedSlice(allocator); From a7095d7decfbe62af09cc945b8b51f93122d0edb Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Wed, 4 Feb 2026 11:49:52 -0800 Subject: [PATCH 10/14] pass robot store into Http init --- src/App.zig | 6 +++--- src/browser/Page.zig | 1 - src/browser/ScriptManager.zig | 3 --- src/browser/webapi/net/Fetch.zig | 1 - src/browser/webapi/net/XMLHttpRequest.zig | 1 - src/http/Client.zig | 15 ++++++++------- src/http/Http.zig | 5 +++-- 7 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/App.zig b/src/App.zig index 76ffd396..a4ed0e8f 100644 --- a/src/App.zig +++ b/src/App.zig @@ -50,7 +50,9 @@ pub fn init(allocator: Allocator, config: *const Config) !*App { app.config = config; app.allocator = allocator; - app.http = try Http.init(allocator, config); + app.robots = RobotStore.init(allocator); + + app.http = try Http.init(allocator, &app.robots, config); errdefer app.http.deinit(); app.platform = try Platform.init(); @@ -59,8 +61,6 @@ pub fn init(allocator: Allocator, config: *const Config) !*App { app.snapshot = try Snapshot.load(); errdefer app.snapshot.deinit(); - app.robots = RobotStore.init(allocator); - app.app_dir_path = getAndMakeAppDir(allocator); app.telemetry = try Telemetry.init(app, config.mode); diff --git a/src/browser/Page.zig b/src/browser/Page.zig index d879a813..8e86c47a 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -559,7 +559,6 @@ pub fn navigate(self: *Page, request_url: [:0]const u8, opts: NavigateOpts) !voi .headers = headers, .body = opts.body, .cookie_jar = &self._session.cookie_jar, - .robots = &self._session.browser.app.robots, .resource_type = .document, .notification = self._session.notification, .header_callback = pageHeaderDoneCallback, diff --git a/src/browser/ScriptManager.zig b/src/browser/ScriptManager.zig index 01c56a81..344d6232 100644 --- a/src/browser/ScriptManager.zig +++ 
b/src/browser/ScriptManager.zig @@ -265,7 +265,6 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e .headers = try self.getHeaders(url), .blocking = is_blocking, .cookie_jar = &page._session.cookie_jar, - .robots = &page._session.browser.app.robots, .resource_type = .script, .notification = page._session.notification, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, @@ -381,7 +380,6 @@ pub fn preloadImport(self: *ScriptManager, url: [:0]const u8, referrer: []const .method = .GET, .headers = try self.getHeaders(url), .cookie_jar = &self.page._session.cookie_jar, - .robots = &self.page._session.browser.app.robots, .resource_type = .script, .notification = self.page._session.notification, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, @@ -486,7 +484,6 @@ pub fn getAsyncImport(self: *ScriptManager, url: [:0]const u8, cb: ImportAsync.C .resource_type = .script, .cookie_jar = &self.page._session.cookie_jar, .notification = self.page._session.notification, - .robots = &self.page._session.browser.app.robots, .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null, .header_callback = Script.headerCallback, .data_callback = Script.dataCallback, diff --git a/src/browser/webapi/net/Fetch.zig b/src/browser/webapi/net/Fetch.zig index 988e9a53..a66fb311 100644 --- a/src/browser/webapi/net/Fetch.zig +++ b/src/browser/webapi/net/Fetch.zig @@ -79,7 +79,6 @@ pub fn init(input: Input, options: ?InitOpts, page: *Page) !js.Promise { .resource_type = .fetch, .cookie_jar = &page._session.cookie_jar, .notification = page._session.notification, - .robots = &page._session.browser.app.robots, .start_callback = httpStartCallback, .header_callback = httpHeaderDoneCallback, .data_callback = httpDataCallback, diff --git a/src/browser/webapi/net/XMLHttpRequest.zig b/src/browser/webapi/net/XMLHttpRequest.zig index 296048b3..7c266e1a 100644 --- 
a/src/browser/webapi/net/XMLHttpRequest.zig +++ b/src/browser/webapi/net/XMLHttpRequest.zig @@ -208,7 +208,6 @@ pub fn send(self: *XMLHttpRequest, body_: ?[]const u8) !void { .headers = headers, .body = self._request_body, .cookie_jar = &page._session.cookie_jar, - .robots = &page._session.browser.app.robots, .resource_type = .xhr, .notification = page._session.notification, .start_callback = httpStartCallback, diff --git a/src/http/Client.zig b/src/http/Client.zig index 91ab02d1..d65e860f 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -87,6 +87,8 @@ queue: TransferQueue, // The main app allocator allocator: Allocator, +// Reference to the App-owned Robot Store. +robot_store: *RobotStore, // Queue of requests that depend on a robots.txt. // Allows us to fetch the robots.txt just once. pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty, @@ -129,7 +131,7 @@ pub const CDPClient = struct { const TransferQueue = std.DoublyLinkedList; -pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config) !*Client { +pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, robot_store: *RobotStore, config: *const Config) !*Client { var transfer_pool = std.heap.MemoryPool(Transfer).init(allocator); errdefer transfer_pool.deinit(); @@ -153,6 +155,7 @@ pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config) .multi = multi, .handles = handles, .allocator = allocator, + .robot_store = robot_store, .http_proxy = http_proxy, .use_proxy = http_proxy != null, .config = config, @@ -235,7 +238,7 @@ pub fn request(self: *Client, req: Request) !void { errdefer self.allocator.free(robots_url); // If we have this robots cached, we can take a fast path. 
- if (req.robots.get(robots_url)) |robot_entry| { + if (self.robot_store.get(robots_url)) |robot_entry| { defer self.allocator.free(robots_url); switch (robot_entry) { @@ -328,7 +331,6 @@ fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: R .blocking = false, .cookie_jar = req.cookie_jar, .notification = req.notification, - .robots = req.robots, .resource_type = .fetch, .header_callback = robotsHeaderCallback, .data_callback = robotsDataCallback, @@ -370,18 +372,18 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { var allowed = true; if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) { - const robots = try ctx.req.robots.robotsFromBytes( + const robots = try ctx.client.robot_store.robotsFromBytes( ctx.client.config.http_headers.user_agent, ctx.buffer.items, ); - try ctx.req.robots.put(ctx.robots_url, robots); + try ctx.client.robot_store.put(ctx.robots_url, robots); const path = URL.getPathname(ctx.req.url); allowed = robots.isAllowed(path); } else if (ctx.status == 404) { log.debug(.http, "robots not found", .{ .url = ctx.robots_url }); - try ctx.req.robots.putAbsent(ctx.robots_url); + try ctx.client.robot_store.putAbsent(ctx.robots_url); } const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable; @@ -960,7 +962,6 @@ pub const Request = struct { headers: Http.Headers, body: ?[]const u8 = null, cookie_jar: *CookieJar, - robots: *RobotStore, resource_type: ResourceType, credentials: ?[:0]const u8 = null, notification: *Notification, diff --git a/src/http/Http.zig b/src/http/Http.zig index 3d488f95..9d550148 100644 --- a/src/http/Http.zig +++ b/src/http/Http.zig @@ -30,6 +30,7 @@ pub const Transfer = Client.Transfer; const log = @import("../log.zig"); const errors = @import("errors.zig"); +const RobotStore = @import("../browser/Robots.zig").RobotStore; const Allocator = std.mem.Allocator; const ArenaAllocator = std.heap.ArenaAllocator; @@ -46,7 +47,7 @@ client: *Client, ca_blob: 
?c.curl_blob, arena: ArenaAllocator, -pub fn init(allocator: Allocator, config: *const Config) !Http { +pub fn init(allocator: Allocator, robot_store: *RobotStore, config: *const Config) !Http { try errorCheck(c.curl_global_init(c.CURL_GLOBAL_SSL)); errdefer c.curl_global_cleanup(); @@ -62,7 +63,7 @@ pub fn init(allocator: Allocator, config: *const Config) !Http { ca_blob = try loadCerts(allocator, arena.allocator()); } - var client = try Client.init(allocator, ca_blob, config); + var client = try Client.init(allocator, ca_blob, robot_store, config); errdefer client.deinit(); return .{ From 34067a1d70414cc80b0e9c489cca65dc94180925 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Thu, 5 Feb 2026 08:02:35 -0800 Subject: [PATCH 11/14] only use eqlIgnoreCase for RobotStore --- src/browser/Robots.zig | 1 - 1 file changed, 1 deletion(-) diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig index 6b6062b1..709481ee 100644 --- a/src/browser/Robots.zig +++ b/src/browser/Robots.zig @@ -52,7 +52,6 @@ pub const RobotStore = struct { } pub fn eql(_: Context, a: []const u8, b: []const u8) bool { - if (a.len != b.len) return false; return std.ascii.eqlIgnoreCase(a, b); } }, 80); From 46c73a05a9d24921d6b6a0686dbba28e8b82c2e3 Mon Sep 17 00:00:00 2001 From: Muki Kiboigo Date: Mon, 9 Feb 2026 05:35:32 -0800 Subject: [PATCH 12/14] panic instead of unreachable on robots callbacks --- src/http/Client.zig | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/http/Client.zig b/src/http/Client.zig index d65e860f..c69b0cf9 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -386,13 +386,12 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { try ctx.client.robot_store.putAbsent(ctx.robots_url); } - const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable; - defer { - queued.deinit(ctx.client.allocator); - _ = ctx.client.pending_robots_queue.remove(ctx.robots_url); - } + var queued = 
ctx.client.pending_robots_queue.fetchRemove(
+        ctx.robots_url,
+    ) orelse @panic("Client.robotsDoneCallback empty queue");
+    defer queued.value.deinit(ctx.client.allocator);
 
-    for (queued.items) |queued_req| {
+    for (queued.value.items) |queued_req| {
         if (!allowed) {
             log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
             queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
@@ -410,14 +409,13 @@ fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
 
     log.warn(.http, "robots fetch failed", .{ .err = err });
 
-    const queued = ctx.client.pending_robots_queue.getPtr(ctx.robots_url) orelse unreachable;
-    defer {
-        queued.deinit(ctx.client.allocator);
-        _ = ctx.client.pending_robots_queue.remove(ctx.robots_url);
-    }
+    var queued = ctx.client.pending_robots_queue.fetchRemove(
+        ctx.robots_url,
+    ) orelse @panic("Client.robotsErrorCallback empty queue");
+    defer queued.value.deinit(ctx.client.allocator);
 
     // On error, allow all queued requests to proceed
-    for (queued.items) |queued_req| {
+    for (queued.value.items) |queued_req| {
         ctx.client.processRequest(queued_req) catch |e| {
             queued_req.error_callback(queued_req.ctx, e);
         };

From 65c9b2a5f70d0cab99fbe5af5b3ea8dcf3525985 Mon Sep 17 00:00:00 2001
From: Muki Kiboigo
Date: Mon, 9 Feb 2026 05:51:42 -0800
Subject: [PATCH 13/14] add robotsShutdownCallback

---
 src/http/Client.zig | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/http/Client.zig b/src/http/Client.zig
index c69b0cf9..90bba041 100644
--- a/src/http/Client.zig
+++ b/src/http/Client.zig
@@ -314,11 +314,14 @@ fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: R
     const entry = try self.pending_robots_queue.getOrPut(self.allocator, robots_url);
 
     if (!entry.found_existing) {
+        errdefer self.allocator.free(robots_url);
+
         // If we aren't already fetching this robots,
         // we want to create a new queue for it and add this request into it.
entry.value_ptr.* = .empty;
 
         const ctx = try self.allocator.create(RobotsRequestContext);
+        errdefer self.allocator.destroy(ctx);
         ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
 
         const headers = try self.newHeaders();
@@ -336,6 +339,7 @@ fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: R
             .data_callback = robotsDataCallback,
             .done_callback = robotsDoneCallback,
             .error_callback = robotsErrorCallback,
+            .shutdown_callback = robotsShutdownCallback,
         });
     } else {
         // Not using our own robots URL, only using the one from the first request.
@@ -422,6 +426,18 @@ fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
     }
 }
 
+fn robotsShutdownCallback(ctx_ptr: *anyopaque) void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.deinit();
+
+    log.debug(.http, "robots fetch shutdown", .{});
+
+    var queued = ctx.client.pending_robots_queue.fetchRemove(
+        ctx.robots_url,
+    ) orelse @panic("Client.robotsShutdownCallback empty queue");
+    defer queued.value.deinit(ctx.client.allocator);
+}
+
 fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
     // The request was intercepted and is blocking. This is messy, but our
     // callers, the ScriptManager -> Page, don't have a great way to stop the

From e1850440b0ba9b2e7983246d5098c6a18765c0b0 Mon Sep 17 00:00:00 2001
From: Muki Kiboigo
Date: Mon, 9 Feb 2026 15:24:35 -0800
Subject: [PATCH 14/14] shutdown queued req on robots shutdown

---
 src/http/Client.zig | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/http/Client.zig b/src/http/Client.zig
index 90bba041..26419b6a 100644
--- a/src/http/Client.zig
+++ b/src/http/Client.zig
@@ -436,6 +436,12 @@ fn robotsShutdownCallback(ctx_ptr: *anyopaque) void {
         ctx.robots_url,
     ) orelse @panic("Client.robotsShutdownCallback empty queue");
     defer queued.value.deinit(ctx.client.allocator);
+
+    for (queued.value.items) |queued_req| {
+        if (queued_req.shutdown_callback) |shutdown_cb| {
+            shutdown_cb(queued_req.ctx);
+        }
+    }
 }
 
 fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {