diff --git a/src/App.zig b/src/App.zig index 21b0ecc6..a4ed0e8f 100644 --- a/src/App.zig +++ b/src/App.zig @@ -25,6 +25,7 @@ const Config = @import("Config.zig"); const Snapshot = @import("browser/js/Snapshot.zig"); const Platform = @import("browser/js/Platform.zig"); const Telemetry = @import("telemetry/telemetry.zig").Telemetry; +const RobotStore = @import("browser/Robots.zig").RobotStore; pub const Http = @import("http/Http.zig"); pub const ArenaPool = @import("ArenaPool.zig"); @@ -38,6 +39,7 @@ snapshot: Snapshot, telemetry: Telemetry, allocator: Allocator, arena_pool: ArenaPool, +robots: RobotStore, app_dir_path: ?[]const u8, shutdown: bool = false, @@ -48,7 +50,9 @@ pub fn init(allocator: Allocator, config: *const Config) !*App { app.config = config; app.allocator = allocator; - app.http = try Http.init(allocator, config); + app.robots = RobotStore.init(allocator); + + app.http = try Http.init(allocator, &app.robots, config); errdefer app.http.deinit(); app.platform = try Platform.init(); @@ -79,6 +83,7 @@ pub fn deinit(self: *App) void { self.app_dir_path = null; } self.telemetry.deinit(); + self.robots.deinit(); self.http.deinit(); self.snapshot.deinit(); self.platform.deinit(); diff --git a/src/Config.zig b/src/Config.zig index 709c70c3..c9725168 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -57,6 +57,13 @@ pub fn tlsVerifyHost(self: *const Config) bool { }; } +pub fn obeyRobots(self: *const Config) bool { + return switch (self.mode) { + inline .serve, .fetch => |opts| opts.common.obey_robots, + else => unreachable, + }; +} + pub fn httpProxy(self: *const Config) ?[:0]const u8 { return switch (self.mode) { inline .serve, .fetch => |opts| opts.common.http_proxy, @@ -165,6 +172,7 @@ pub const Fetch = struct { }; pub const Common = struct { + obey_robots: bool = false, proxy_bearer_token: ?[:0]const u8 = null, http_proxy: ?[:0]const u8 = null, http_max_concurrent: ?u8 = null, @@ -231,6 +239,11 @@ pub fn printUsageAndExit(self: *const Config, success: 
bool) void { \\ advanced option which should only be set if you understand \\ and accept the risk of disabling host verification. \\ + \\--obey_robots + \\ Fetches and obeys the robots.txt (if available) of the web pages + \\ we make requests towards. + \\ Defaults to false. + \\ \\--http_proxy The HTTP proxy to use for all HTTP requests. \\ A username:password can be included for basic authentication. \\ Defaults to none. @@ -626,6 +639,11 @@ fn parseCommonArg( return true; } + if (std.mem.eql(u8, "--obey_robots", opt)) { + common.obey_robots = true; + return true; + } + if (std.mem.eql(u8, "--http_proxy", opt)) { const str = args.next() orelse { log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" }); diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig new file mode 100644 index 00000000..709481ee --- /dev/null +++ b/src/browser/Robots.zig @@ -0,0 +1,878 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+
+const std = @import("std");
+const log = @import("../log.zig");
+
+pub const Rule = union(enum) {
+    allow: []const u8,
+    disallow: []const u8,
+};
+
+pub const Key = enum {
+    @"user-agent",
+    allow,
+    disallow,
+};
+
+/// https://www.rfc-editor.org/rfc/rfc9309.html
+pub const Robots = @This();
+pub const empty: Robots = .{ .rules = &.{} };
+
+pub const RobotStore = struct {
+    const RobotsEntry = union(enum) {
+        present: Robots,
+        absent,
+    };
+
+    pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct {
+        const Context = @This();
+
+        pub fn hash(_: Context, value: []const u8) u64 {
+            var hasher = std.hash.Wyhash.init(value.len);
+            for (value) |c| {
+                std.hash.autoHash(&hasher, std.ascii.toLower(c));
+            }
+            return hasher.final();
+        }
+
+        pub fn eql(_: Context, a: []const u8, b: []const u8) bool {
+            return std.ascii.eqlIgnoreCase(a, b);
+        }
+    }, 80);
+
+    allocator: std.mem.Allocator,
+    map: RobotsMap,
+
+    pub fn init(allocator: std.mem.Allocator) RobotStore {
+        return .{ .allocator = allocator, .map = .empty };
+    }
+
+    pub fn deinit(self: *RobotStore) void {
+        var iter = self.map.iterator();
+
+        while (iter.next()) |entry| {
+            self.allocator.free(entry.key_ptr.*);
+
+            switch (entry.value_ptr.*) {
+                .present => |*robots| robots.deinit(self.allocator),
+                .absent => {},
+            }
+        }
+
+        self.map.deinit(self.allocator);
+    }
+
+    pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry {
+        return self.map.get(url);
+    }
+
+    pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots {
+        return try Robots.fromBytes(self.allocator, user_agent, bytes);
+    }
+
+    pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .{ .present = robots });
+    }
+
+    pub fn putAbsent(self: *RobotStore, url: []const u8) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .absent);
+    }
+};
+
+rules: []const Rule,
+
+const State = struct {
+    entry: enum {
+        not_in_entry,
+        in_other_entry,
+        in_our_entry,
+        in_wildcard_entry,
+    },
+    has_rules: bool = false,
+};
+
+fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
+    for (rules) |rule| {
+        switch (rule) {
+            .allow => |value| allocator.free(value),
+            .disallow => |value| allocator.free(value),
+        }
+    }
+}
+
+fn parseRulesWithUserAgent(
+    allocator: std.mem.Allocator,
+    user_agent: []const u8,
+    bytes: []const u8,
+) ![]const Rule {
+    var rules: std.ArrayList(Rule) = .empty;
+    defer rules.deinit(allocator);
+
+    var wildcard_rules: std.ArrayList(Rule) = .empty;
+    defer wildcard_rules.deinit(allocator);
+
+    var state: State = .{ .entry = .not_in_entry, .has_rules = false };
+
+    var iter = std.mem.splitScalar(u8, bytes, '\n');
+    while (iter.next()) |line| {
+        const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
+
+        // Skip all comment lines.
+        if (std.mem.startsWith(u8, trimmed, "#")) continue;
+
+        // Remove end of line comment.
+ const true_line = if (std.mem.indexOfScalar(u8, trimmed, '#')) |pos| + std.mem.trimRight(u8, trimmed[0..pos], &std.ascii.whitespace) + else + trimmed; + + if (true_line.len == 0) { + continue; + } + + const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse return error.MissingColon; + const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]); + defer allocator.free(key_str); + + const key = std.meta.stringToEnum(Key, key_str) orelse { + // log.warn(.browser, "robots key", .{ .key = key_str }); + continue; + }; + + const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace); + + switch (key) { + .@"user-agent" => { + if (state.has_rules) { + state = .{ .entry = .not_in_entry, .has_rules = false }; + } + + switch (state.entry) { + .in_other_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } + }, + .in_our_entry => {}, + .in_wildcard_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } + }, + .not_in_entry => { + if (std.ascii.eqlIgnoreCase(user_agent, value)) { + state.entry = .in_our_entry; + } else if (std.mem.eql(u8, "*", value)) { + state.entry = .in_wildcard_entry; + } else { + state.entry = .in_other_entry; + } + }, + } + }, + .allow => { + defer state.has_rules = true; + + switch (state.entry) { + .in_our_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .allow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .allow = duped_value }); + }, + .not_in_entry => { + log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" }); + continue; + }, + } + }, + .disallow => { + defer state.has_rules = true; + + switch (state.entry) { + .in_our_entry => { + const duped_value 
= try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try rules.append(allocator, .{ .disallow = duped_value }); + }, + .in_other_entry => {}, + .in_wildcard_entry => { + const duped_value = try allocator.dupe(u8, value); + errdefer allocator.free(duped_value); + try wildcard_rules.append(allocator, .{ .disallow = duped_value }); + }, + .not_in_entry => { + log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" }); + continue; + }, + } + }, + } + } + + // If we have rules for our specific User-Agent, we will use those rules. + // If we don't have any rules, we fallback to using the wildcard ("*") rules. + if (rules.items.len > 0) { + freeRulesInList(allocator, wildcard_rules.items); + return try rules.toOwnedSlice(allocator); + } else { + freeRulesInList(allocator, rules.items); + return try wildcard_rules.toOwnedSlice(allocator); + } +} + +pub fn fromBytes(allocator: std.mem.Allocator, user_agent: []const u8, bytes: []const u8) !Robots { + const rules = try parseRulesWithUserAgent(allocator, user_agent, bytes); + return .{ .rules = rules }; +} + +pub fn deinit(self: *Robots, allocator: std.mem.Allocator) void { + freeRulesInList(allocator, self.rules); + allocator.free(self.rules); +} + +fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool { + if (pattern.len == 0) return true; + + const star_pos = std.mem.indexOfScalar(u8, pattern, '*') orelse { + if (exact_match) { + // If we end in '$', we must be exactly equal. + return std.mem.eql(u8, path, pattern); + } else { + // Otherwise, we are just a prefix. + return std.mem.startsWith(u8, path, pattern); + } + }; + + // Ensure the prefix before the '*' matches. 
+ if (!std.mem.startsWith(u8, path, pattern[0..star_pos])) { + return false; + } + + const suffix_pattern = pattern[star_pos + 1 ..]; + if (suffix_pattern.len == 0) return true; + + var i: usize = star_pos; + while (i <= path.len) : (i += 1) { + if (matchPatternRecursive(suffix_pattern, path[i..], exact_match)) { + return true; + } + } + + return false; +} + +/// There are rules for how the pattern in robots.txt should be matched. +/// +/// * should match 0 or more of any character. +/// $ should signify the end of a path, making it exact. +/// otherwise, it is a prefix path. +fn matchPattern(pattern: []const u8, path: []const u8) ?usize { + if (pattern.len == 0) return 0; + const exact_match = pattern[pattern.len - 1] == '$'; + const inner_pattern = if (exact_match) pattern[0 .. pattern.len - 1] else pattern; + + if (matchPatternRecursive( + inner_pattern, + path, + exact_match, + )) return pattern.len else return null; +} + +pub fn isAllowed(self: *const Robots, path: []const u8) bool { + const rules = self.rules; + + var longest_match_len: usize = 0; + var is_allowed_result = true; + + for (rules) |rule| { + switch (rule) { + .allow => |pattern| { + if (matchPattern(pattern, path)) |len| { + // Longest or Last Wins. + if (len >= longest_match_len) { + longest_match_len = len; + is_allowed_result = true; + } + } + }, + .disallow => |pattern| { + if (pattern.len == 0) continue; + + if (matchPattern(pattern, path)) |len| { + // Longest or Last Wins. 
+ if (len >= longest_match_len) { + longest_match_len = len; + is_allowed_result = false; + } + } + }, + } + } + + return is_allowed_result; +} + +test "Robots: simple robots.txt" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: * + \\Disallow: /private/ + \\Allow: /public/ + \\ + \\User-agent: Googlebot + \\Disallow: /admin/ + \\ + ; + + const rules = try parseRulesWithUserAgent(allocator, "GoogleBot", file); + defer { + freeRulesInList(allocator, rules); + allocator.free(rules); + } + + try std.testing.expectEqual(1, rules.len); + try std.testing.expectEqualStrings("/admin/", rules[0].disallow); +} + +test "Robots: matchPattern - simple prefix" { + try std.testing.expect(matchPattern("/admin", "/admin/page") != null); + try std.testing.expect(matchPattern("/admin", "/admin") != null); + try std.testing.expect(matchPattern("/admin", "/other") == null); + try std.testing.expect(matchPattern("/admin/page", "/admin") == null); +} + +test "Robots: matchPattern - single wildcard" { + try std.testing.expect(matchPattern("/admin/*", "/admin/") != null); + try std.testing.expect(matchPattern("/admin/*", "/admin/page") != null); + try std.testing.expect(matchPattern("/admin/*", "/admin/page/subpage") != null); + try std.testing.expect(matchPattern("/admin/*", "/other/page") == null); +} + +test "Robots: matchPattern - wildcard in middle" { + try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/xyz") != null); + try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/ghi/xyz") != null); + try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def") == null); + try std.testing.expect(matchPattern("/abc/*/xyz", "/other/def/xyz") == null); +} + +test "Robots: matchPattern - complex wildcard case" { + try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/def/def/xyz") != null); + try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz") != null); +} + +test "Robots: matchPattern - multiple wildcards" { + try 
std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/b/y/c") != null); + try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/y/b/z/w/c") != null); + try std.testing.expect(matchPattern("/*.php", "/index.php") != null); + try std.testing.expect(matchPattern("/*.php", "/admin/index.php") != null); +} + +test "Robots: matchPattern - end anchor" { + try std.testing.expect(matchPattern("/*.php$", "/index.php") != null); + try std.testing.expect(matchPattern("/*.php$", "/index.php?param=value") == null); + try std.testing.expect(matchPattern("/admin$", "/admin") != null); + try std.testing.expect(matchPattern("/admin$", "/admin/") == null); + try std.testing.expect(matchPattern("/fish$", "/fish") != null); + try std.testing.expect(matchPattern("/fish$", "/fishheads") == null); +} + +test "Robots: matchPattern - wildcard with extension" { + try std.testing.expect(matchPattern("/fish*.php", "/fish.php") != null); + try std.testing.expect(matchPattern("/fish*.php", "/fishheads.php") != null); + try std.testing.expect(matchPattern("/fish*.php", "/fish/salmon.php") != null); + try std.testing.expect(matchPattern("/fish*.php", "/fish.asp") == null); +} + +test "Robots: matchPattern - empty and edge cases" { + try std.testing.expect(matchPattern("", "/anything") != null); + try std.testing.expect(matchPattern("/", "/") != null); + try std.testing.expect(matchPattern("*", "/anything") != null); + try std.testing.expect(matchPattern("/*", "/anything") != null); + try std.testing.expect(matchPattern("$", "") != null); +} + +test "Robots: matchPattern - real world examples" { + try std.testing.expect(matchPattern("/", "/anything") != null); + + try std.testing.expect(matchPattern("/admin/", "/admin/page") != null); + try std.testing.expect(matchPattern("/admin/", "/public/page") == null); + + try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf") != null); + try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf.bak") == null); + + try 
std.testing.expect(matchPattern("/*?", "/page?param=value") != null); + try std.testing.expect(matchPattern("/*?", "/page") == null); +} + +test "Robots: isAllowed - basic allow/disallow" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: MyBot + \\Disallow: /admin/ + \\Allow: /public/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/") == true); + try std.testing.expect(robots.isAllowed("/public/page") == true); + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/other/page") == true); +} + +test "Robots: isAllowed - longest match wins" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "TestBot", + \\User-agent: TestBot + \\Disallow: /admin/ + \\Allow: /admin/public/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/admin/public/page") == true); + try std.testing.expect(robots.isAllowed("/admin/public/") == true); +} + +test "Robots: isAllowed - specific user-agent vs wildcard" { + const allocator = std.testing.allocator; + + var robots1 = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + \\User-agent: * + \\Disallow: /admin/ + \\ + ); + defer robots1.deinit(allocator); + + try std.testing.expect(robots1.isAllowed("/private/page") == false); + try std.testing.expect(robots1.isAllowed("/admin/page") == true); + + // Test with other bot (should use wildcard) + var robots2 = try Robots.fromBytes(allocator, "OtherBot", + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + \\User-agent: * + \\Disallow: /admin/ + \\ + ); + defer robots2.deinit(allocator); + + try std.testing.expect(robots2.isAllowed("/private/page") == true); + try std.testing.expect(robots2.isAllowed("/admin/page") == false); +} 
+ +test "Robots: isAllowed - case insensitive user-agent" { + const allocator = std.testing.allocator; + + var robots1 = try Robots.fromBytes(allocator, "googlebot", + \\User-agent: GoogleBot + \\Disallow: /private/ + \\ + ); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/private/") == false); + + var robots2 = try Robots.fromBytes(allocator, "GOOGLEBOT", + \\User-agent: GoogleBot + \\Disallow: /private/ + \\ + ); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/private/") == false); + + var robots3 = try Robots.fromBytes(allocator, "GoOgLeBoT", + \\User-agent: GoogleBot + \\Disallow: /private/ + \\ + ); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/private/") == false); +} + +test "Robots: isAllowed - merged rules for same agent" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\Disallow: /admin/ + \\ + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/page") == false); + try std.testing.expect(robots.isAllowed("/private/page") == false); + try std.testing.expect(robots.isAllowed("/public/page") == true); +} + +test "Robots: isAllowed - wildcards in patterns" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\Disallow: /*.php$ + \\Allow: /index.php$ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/page.php") == false); + try std.testing.expect(robots.isAllowed("/index.php") == true); + try std.testing.expect(robots.isAllowed("/page.php?param=1") == true); + try std.testing.expect(robots.isAllowed("/page.html") == true); +} + +test "Robots: isAllowed - empty disallow allows everything" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, 
"Bot", + \\User-agent: Bot + \\Disallow: + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/anything") == true); + try std.testing.expect(robots.isAllowed("/") == true); +} + +test "Robots: isAllowed - no rules" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", ""); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/anything") == true); +} + +test "Robots: isAllowed - disallow all" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\Disallow: / + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/") == false); + try std.testing.expect(robots.isAllowed("/anything") == false); + try std.testing.expect(robots.isAllowed("/admin/page") == false); +} + +test "Robots: isAllowed - multiple user-agents in same entry" { + const allocator = std.testing.allocator; + + var robots1 = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\User-agent: Bingbot + \\Disallow: /private/ + \\ + ); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/private/") == false); + + var robots2 = try Robots.fromBytes(allocator, "Bingbot", + \\User-agent: Googlebot + \\User-agent: Bingbot + \\Disallow: /private/ + \\ + ); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/private/") == false); + + var robots3 = try Robots.fromBytes(allocator, "OtherBot", + \\User-agent: Googlebot + \\User-agent: Bingbot + \\Disallow: /private/ + \\ + ); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/private/") == true); +} + +test "Robots: isAllowed - wildcard fallback" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "UnknownBot", + \\User-agent: * + \\Disallow: /admin/ + \\Allow: /admin/public/ + \\ + \\User-agent: Googlebot + 
\\Disallow: /private/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/admin/public/page") == true); + try std.testing.expect(robots.isAllowed("/private/") == true); +} + +test "Robots: isAllowed - complex real-world example" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: * + \\Disallow: /cgi-bin/ + \\Disallow: /tmp/ + \\Disallow: /private/ + \\ + \\User-agent: MyBot + \\Disallow: /admin/ + \\Disallow: /*.pdf$ + \\Allow: /public/*.pdf$ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/") == true); + try std.testing.expect(robots.isAllowed("/admin/dashboard") == false); + try std.testing.expect(robots.isAllowed("/docs/guide.pdf") == false); + try std.testing.expect(robots.isAllowed("/public/manual.pdf") == true); + try std.testing.expect(robots.isAllowed("/page.html") == true); + try std.testing.expect(robots.isAllowed("/cgi-bin/script.sh") == true); +} + +test "Robots: isAllowed - order doesn't matter for same length" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Bot", + \\User-agent: Bot + \\ # WOW!! + \\Allow: /page + \\Disallow: /page + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/page") == false); +} + +test "Robots: isAllowed - empty file uses wildcard defaults" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: * # ABCDEF!!! 
+ \\Disallow: /admin/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/") == false); + try std.testing.expect(robots.isAllowed("/public/") == true); +} +test "Robots: isAllowed - wildcard entry with multiple user-agents including specific" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: * + \\User-agent: Googlebot + \\Disallow: /shared/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/shared/") == false); + try std.testing.expect(robots.isAllowed("/other/") == true); + + var robots2 = try Robots.fromBytes(allocator, "Bingbot", + \\User-agent: * + \\User-agent: Googlebot + \\Disallow: /shared/ + \\ + ); + defer robots2.deinit(allocator); + + try std.testing.expect(robots2.isAllowed("/shared/") == false); +} + +test "Robots: isAllowed - specific agent appears after wildcard in entry" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "MyBot", + \\User-agent: * + \\User-agent: MyBot + \\User-agent: Bingbot + \\Disallow: /admin/ + \\Allow: /admin/public/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/secret") == false); + try std.testing.expect(robots.isAllowed("/admin/public/page") == true); +} + +test "Robots: isAllowed - wildcard should not override specific entry" { + const allocator = std.testing.allocator; + + var robots = try Robots.fromBytes(allocator, "Googlebot", + \\User-agent: Googlebot + \\Disallow: /private/ + \\ + \\User-agent: * + \\User-agent: Googlebot + \\Disallow: /admin/ + \\ + ); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/private/") == false); + try std.testing.expect(robots.isAllowed("/admin/") == false); +} + +test "Robots: isAllowed - Google's real robots.txt" { + const allocator = std.testing.allocator; + + // Simplified version of google.com/robots.txt + 
const google_robots = + \\User-agent: * + \\User-agent: Yandex + \\Disallow: /search + \\Allow: /search/about + \\Allow: /search/howsearchworks + \\Disallow: /imgres + \\Disallow: /m? + \\Disallow: /m/ + \\Allow: /m/finance + \\Disallow: /maps/ + \\Allow: /maps/$ + \\Allow: /maps/@ + \\Allow: /maps/dir/ + \\Disallow: /shopping? + \\Allow: /shopping?udm=28$ + \\ + \\User-agent: AdsBot-Google + \\Disallow: /maps/api/js/ + \\Allow: /maps/api/js + \\Disallow: /maps/api/staticmap + \\ + \\User-agent: Yandex + \\Disallow: /about/careers/applications/jobs/results + \\ + \\User-agent: facebookexternalhit + \\User-agent: Twitterbot + \\Allow: /imgres + \\Allow: /search + \\Disallow: /groups + \\Disallow: /m/ + \\ + ; + + var regular_bot = try Robots.fromBytes(allocator, "Googlebot", google_robots); + defer regular_bot.deinit(allocator); + + try std.testing.expect(regular_bot.isAllowed("/") == true); + try std.testing.expect(regular_bot.isAllowed("/search") == false); + try std.testing.expect(regular_bot.isAllowed("/search/about") == true); + try std.testing.expect(regular_bot.isAllowed("/search/howsearchworks") == true); + try std.testing.expect(regular_bot.isAllowed("/imgres") == false); + try std.testing.expect(regular_bot.isAllowed("/m/finance") == true); + try std.testing.expect(regular_bot.isAllowed("/m/other") == false); + try std.testing.expect(regular_bot.isAllowed("/maps/") == true); + try std.testing.expect(regular_bot.isAllowed("/maps/@") == true); + try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28") == true); + try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28&extra") == false); + + var adsbot = try Robots.fromBytes(allocator, "AdsBot-Google", google_robots); + defer adsbot.deinit(allocator); + + try std.testing.expect(adsbot.isAllowed("/maps/api/js") == true); + try std.testing.expect(adsbot.isAllowed("/maps/api/js/") == false); + try std.testing.expect(adsbot.isAllowed("/maps/api/staticmap") == false); + + var twitterbot = try 
Robots.fromBytes(allocator, "Twitterbot", google_robots); + defer twitterbot.deinit(allocator); + + try std.testing.expect(twitterbot.isAllowed("/imgres") == true); + try std.testing.expect(twitterbot.isAllowed("/search") == true); + try std.testing.expect(twitterbot.isAllowed("/groups") == false); + try std.testing.expect(twitterbot.isAllowed("/m/") == false); +} + +test "Robots: user-agent after rules starts new entry" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: Bot1 + \\User-agent: Bot2 + \\Disallow: /admin/ + \\Allow: /public/ + \\User-agent: Bot3 + \\Disallow: /private/ + \\ + ; + + var robots1 = try Robots.fromBytes(allocator, "Bot1", file); + defer robots1.deinit(allocator); + try std.testing.expect(robots1.isAllowed("/admin/") == false); + try std.testing.expect(robots1.isAllowed("/public/") == true); + try std.testing.expect(robots1.isAllowed("/private/") == true); + + var robots2 = try Robots.fromBytes(allocator, "Bot2", file); + defer robots2.deinit(allocator); + try std.testing.expect(robots2.isAllowed("/admin/") == false); + try std.testing.expect(robots2.isAllowed("/public/") == true); + try std.testing.expect(robots2.isAllowed("/private/") == true); + + var robots3 = try Robots.fromBytes(allocator, "Bot3", file); + defer robots3.deinit(allocator); + try std.testing.expect(robots3.isAllowed("/admin/") == true); + try std.testing.expect(robots3.isAllowed("/public/") == true); + try std.testing.expect(robots3.isAllowed("/private/") == false); +} + +test "Robots: blank lines don't end entries" { + const allocator = std.testing.allocator; + + const file = + \\User-agent: MyBot + \\Disallow: /admin/ + \\ + \\ + \\Allow: /public/ + \\ + ; + + var robots = try Robots.fromBytes(allocator, "MyBot", file); + defer robots.deinit(allocator); + + try std.testing.expect(robots.isAllowed("/admin/") == false); + try std.testing.expect(robots.isAllowed("/public/") == true); +} diff --git a/src/browser/URL.zig b/src/browser/URL.zig 
index d36673cc..716480b1 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -502,6 +502,16 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons return buf.items[0 .. buf.items.len - 1 :0]; } +pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 { + const origin = try getOrigin(arena, url) orelse return error.NoOrigin; + return try std.fmt.allocPrintSentinel( + arena, + "{s}/robots.txt", + .{origin}, + 0, + ); +} + const testing = @import("../testing.zig"); test "URL: isCompleteHTTPUrl" { try testing.expectEqual(true, isCompleteHTTPUrl("http://example.com/about")); @@ -778,3 +788,31 @@ test "URL: concatQueryString" { try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url); } } + +test "URL: getRobotsUrl" { + defer testing.reset(); + const arena = testing.arena_allocator; + + { + const url = try getRobotsUrl(arena, "https://www.lightpanda.io"); + try testing.expectEqual("https://www.lightpanda.io/robots.txt", url); + } + + { + const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path"); + try testing.expectString("https://www.lightpanda.io/robots.txt", url); + } + + { + const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page"); + try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url); + } + { + const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment"); + try testing.expectString("http://example.com/robots.txt", url); + } + { + const url = try getRobotsUrl(arena, "https://user:pass@example.com/page"); + try testing.expectString("https://example.com/robots.txt", url); + } +} diff --git a/src/http/Client.zig b/src/http/Client.zig index 4124e26c..3a7a4086 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -27,6 +27,8 @@ const Config = @import("../Config.zig"); const URL = @import("../browser/URL.zig"); const Notification = @import("../Notification.zig"); const CookieJar = 
@import("../browser/webapi/storage/Cookie.zig").Jar;
+const Robots = @import("../browser/Robots.zig");
+const RobotStore = Robots.RobotStore;
 
 const c = Http.c;
 const posix = std.posix;
@@ -85,6 +87,12 @@ queue: TransferQueue,
 // The main app allocator
 allocator: Allocator,
 
+// Reference to the App-owned Robot Store.
+robot_store: *RobotStore,
+// Queue of requests that depend on a robots.txt.
+// Allows us to fetch the robots.txt just once.
+pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty,
+
 // Once we have a handle/easy to process a request with, we create a Transfer
 // which contains the Request as well as any state we need to process the
 // request. These wil come and go with each request.
@@ -123,7 +131,7 @@ pub const CDPClient = struct {
 
 const TransferQueue = std.DoublyLinkedList;
 
-pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config) !*Client {
+pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, robot_store: *RobotStore, config: *const Config) !*Client {
     var transfer_pool = std.heap.MemoryPool(Transfer).init(allocator);
     errdefer transfer_pool.deinit();
@@ -147,6 +155,7 @@ pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config)
         .multi = multi,
         .handles = handles,
         .allocator = allocator,
+        .robot_store = robot_store,
         .http_proxy = http_proxy,
         .use_proxy = http_proxy != null,
         .config = config,
@@ -163,6 +172,13 @@ pub fn deinit(self: *Client) void {
 
     _ = c.curl_multi_cleanup(self.multi);
     self.transfer_pool.deinit();
+
+    var robots_iter = self.pending_robots_queue.iterator();
+    while (robots_iter.next()) |entry| {
+        entry.value_ptr.deinit(self.allocator);
+    }
+    self.pending_robots_queue.deinit(self.allocator);
+
     self.allocator.destroy(self);
 }
@@ -217,12 +233,46 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus {
 }
 
 pub fn request(self: *Client, req: Request) !void {
+    if (self.config.obeyRobots()) {
+        // Ownership: freed on the cached path, else passed to fetchRobotsThenProcessRequest.
+        const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
+
+        // If we have this robots cached, we can take a fast path.
+        if (self.robot_store.get(robots_url)) |robot_entry| {
+            defer self.allocator.free(robots_url);
+
+            switch (robot_entry) {
+                // If we have a found robots entry, we check it.
+                .present => |robots| {
+                    const path = URL.getPathname(req.url);
+                    if (!robots.isAllowed(path)) {
+                        req.error_callback(req.ctx, error.RobotsBlocked);
+                        return;
+                    }
+                },
+                // Otherwise, we assume we won't find it again.
+                .absent => {},
+            }
+
+            return self.processRequest(req);
+        }
+
+        return self.fetchRobotsThenProcessRequest(robots_url, req);
+    }
+
+    return self.processRequest(req);
+}
+
+fn processRequest(self: *Client, req: Request) !void {
     const transfer = try self.makeTransfer(req);
 
     transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer });
 
     var wait_for_interception = false;
-    transfer.req.notification.dispatch(.http_request_intercept, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception });
+    transfer.req.notification.dispatch(.http_request_intercept, &.{
+        .transfer = transfer,
+        .wait_for_interception = &wait_for_interception,
+    });
 
     if (wait_for_interception == false) {
         // request not intercepted, process it normally
         return self.process(transfer);
@@ -246,6 +296,177 @@ pub fn request(self: *Client, req: Request) !void {
     }
 }
 
+const RobotsRequestContext = struct {
+    client: *Client,
+    req: Request,
+    robots_url: [:0]const u8,
+    buffer: std.ArrayList(u8),
+    status: u16 = 0,
+
+    pub fn deinit(self: *RobotsRequestContext) void {
+        self.client.allocator.free(self.robots_url);
+        self.buffer.deinit(self.client.allocator);
+        self.client.allocator.destroy(self);
+    }
+};
+
+// Takes ownership of robots_url: it is either freed here or becomes the
+// pending_robots_queue key, later freed by the RobotsRequestContext.
+fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
+    const entry = self.pending_robots_queue.getOrPut(self.allocator, robots_url) catch |err| {
+        self.allocator.free(robots_url);
+        return err;
+    };
+
+    if (entry.found_existing) {
+        // Already fetching this robots.txt. Free our copy of the URL (the
+        // map keeps the one from the first request) and join the queue.
+        self.allocator.free(robots_url);
+        return entry.value_ptr.append(self.allocator, req);
+    }
+
+    // We aren't already fetching this robots, so we create a new queue
+    // for it and add this request into it.
+    entry.value_ptr.* = .empty;
+
+    // On any failure below, undo the insertion so the map doesn't keep a
+    // key that is about to be freed.
+    errdefer {
+        var pending = self.pending_robots_queue.fetchRemove(robots_url).?;
+        pending.value.deinit(self.allocator);
+        self.allocator.free(robots_url);
+    }
+
+    // Queue the request before starting the transfer so the transfer
+    // callbacks always find a non-empty queue.
+    try entry.value_ptr.append(self.allocator, req);
+
+    const ctx = try self.allocator.create(RobotsRequestContext);
+    errdefer self.allocator.destroy(ctx);
+    ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
+    const headers = try self.newHeaders();
+
+    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
+    return self.processRequest(.{
+        .ctx = ctx,
+        .url = robots_url,
+        .method = .GET,
+        .headers = headers,
+        .blocking = false,
+        .cookie_jar = req.cookie_jar,
+        .notification = req.notification,
+        .resource_type = .fetch,
+        .header_callback = robotsHeaderCallback,
+        .data_callback = robotsDataCallback,
+        .done_callback = robotsDoneCallback,
+        .error_callback = robotsErrorCallback,
+        .shutdown_callback = robotsShutdownCallback,
+    });
+}
+
+fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+
+    if (transfer.response_header) |hdr| {
+        log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = ctx.robots_url });
+        ctx.status = hdr.status;
+    }
+
+    if (transfer.getContentLength()) |cl| {
+        try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl);
+    }
+
+    return true;
+}
+
+fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+    try ctx.buffer.appendSlice(ctx.client.allocator, data);
+}
+
+fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.deinit();
+
+    // Always drain the pending queue, even if updating the robot store
+    // fails below: a stranded entry would keep a map key that ctx.deinit
+    // is about to free.
+    var queued = ctx.client.pending_robots_queue.fetchRemove(
+        ctx.robots_url,
+    ) orelse @panic("Client.robotsDoneCallback empty queue");
+    defer queued.value.deinit(ctx.client.allocator);
+
+    var allowed = true;
+    if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
+        if (ctx.client.robot_store.robotsFromBytes(
+            ctx.client.config.http_headers.user_agent,
+            ctx.buffer.items,
+        )) |robots| {
+            // Caching failures are non-fatal: worst case we fetch again.
+            ctx.client.robot_store.put(ctx.robots_url, robots) catch |err| {
+                log.warn(.http, "robots cache failed", .{ .err = err });
+            };
+            allowed = robots.isAllowed(URL.getPathname(ctx.req.url));
+        } else |err| {
+            // Unparseable robots.txt: err on the side of allowing.
+            log.warn(.http, "robots parse failed", .{ .err = err });
+        }
+    } else if (ctx.status == 404) {
+        log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
+        ctx.client.robot_store.putAbsent(ctx.robots_url) catch |err| {
+            log.warn(.http, "robots cache failed", .{ .err = err });
+        };
+    }
+
+    for (queued.value.items) |queued_req| {
+        if (!allowed) {
+            log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
+            queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
+        } else {
+            ctx.client.processRequest(queued_req) catch |e| {
+                queued_req.error_callback(queued_req.ctx, e);
+            };
+        }
+    }
+}
+
+fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.deinit();
+
+    log.warn(.http, "robots fetch failed", .{ .err = err });
+
+    var queued = ctx.client.pending_robots_queue.fetchRemove(
+        ctx.robots_url,
+    ) orelse @panic("Client.robotsErrorCallback empty queue");
+    defer queued.value.deinit(ctx.client.allocator);
+
+    // On error, allow all queued requests to proceed
+    for (queued.value.items) |queued_req| {
+        ctx.client.processRequest(queued_req) catch |e| {
+            queued_req.error_callback(queued_req.ctx, e);
+        };
+    }
+}
+
+fn robotsShutdownCallback(ctx_ptr: *anyopaque) void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.deinit();
+
+    log.debug(.http, "robots fetch shutdown", .{});
+
+    var queued = ctx.client.pending_robots_queue.fetchRemove(
+        ctx.robots_url,
+    ) orelse @panic("Client.robotsShutdownCallback empty queue");
+    defer queued.value.deinit(ctx.client.allocator);
+
+    for (queued.value.items) |queued_req| {
+        if (queued_req.shutdown_callback) |shutdown_cb| {
+            shutdown_cb(queued_req.ctx);
+        }
+    }
+}
+
 fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
     // The request was intercepted and is blocking. This is messy, but our
     // callers, the ScriptManager -> Page, don't have a great way to stop the
diff --git a/src/http/Http.zig b/src/http/Http.zig
index 65f298f2..d9943a74 100644
--- a/src/http/Http.zig
+++ b/src/http/Http.zig
@@ -30,6 +30,7 @@ pub const Transfer = Client.Transfer;
 
 const log = @import("../log.zig");
 const errors = @import("errors.zig");
+const RobotStore = @import("../browser/Robots.zig").RobotStore;
 
 const Allocator = std.mem.Allocator;
 const ArenaAllocator = std.heap.ArenaAllocator;
@@ -46,7 +47,7 @@ client: *Client,
 ca_blob: ?c.curl_blob,
 arena: ArenaAllocator,
 
-pub fn init(allocator: Allocator, config: *const Config) !Http {
+pub fn init(allocator: Allocator, robot_store: *RobotStore, config: *const Config) !Http {
     try errorCheck(c.curl_global_init(c.CURL_GLOBAL_SSL));
     errdefer c.curl_global_cleanup();
@@ -62,7 +63,7 @@ pub fn init(allocator: Allocator, config: *const Config) !Http {
 
-    var client = try Client.init(allocator, ca_blob, config);
+    var client = try Client.init(allocator, ca_blob, robot_store, config);
     errdefer client.deinit();
 
     return .{