Merge pull request #1407 from lightpanda-io/robots

Support for `robots.txt`
This commit is contained in:
Karl Seguin
2026-02-10 09:51:32 +08:00
committed by GitHub
6 changed files with 1143 additions and 5 deletions

View File

@@ -25,6 +25,7 @@ const Config = @import("Config.zig");
const Snapshot = @import("browser/js/Snapshot.zig"); const Snapshot = @import("browser/js/Snapshot.zig");
const Platform = @import("browser/js/Platform.zig"); const Platform = @import("browser/js/Platform.zig");
const Telemetry = @import("telemetry/telemetry.zig").Telemetry; const Telemetry = @import("telemetry/telemetry.zig").Telemetry;
const RobotStore = @import("browser/Robots.zig").RobotStore;
pub const Http = @import("http/Http.zig"); pub const Http = @import("http/Http.zig");
pub const ArenaPool = @import("ArenaPool.zig"); pub const ArenaPool = @import("ArenaPool.zig");
@@ -38,6 +39,7 @@ snapshot: Snapshot,
telemetry: Telemetry, telemetry: Telemetry,
allocator: Allocator, allocator: Allocator,
arena_pool: ArenaPool, arena_pool: ArenaPool,
robots: RobotStore,
app_dir_path: ?[]const u8, app_dir_path: ?[]const u8,
shutdown: bool = false, shutdown: bool = false,
@@ -48,7 +50,9 @@ pub fn init(allocator: Allocator, config: *const Config) !*App {
app.config = config; app.config = config;
app.allocator = allocator; app.allocator = allocator;
app.http = try Http.init(allocator, config); app.robots = RobotStore.init(allocator);
app.http = try Http.init(allocator, &app.robots, config);
errdefer app.http.deinit(); errdefer app.http.deinit();
app.platform = try Platform.init(); app.platform = try Platform.init();
@@ -79,6 +83,7 @@ pub fn deinit(self: *App) void {
self.app_dir_path = null; self.app_dir_path = null;
} }
self.telemetry.deinit(); self.telemetry.deinit();
self.robots.deinit();
self.http.deinit(); self.http.deinit();
self.snapshot.deinit(); self.snapshot.deinit();
self.platform.deinit(); self.platform.deinit();

View File

@@ -57,6 +57,13 @@ pub fn tlsVerifyHost(self: *const Config) bool {
}; };
} }
/// Reports whether robots.txt handling was requested via `--obey_robots`
/// (the `obey_robots` field of the mode's common options).
/// Only the `serve` and `fetch` modes carry that option; calling this in any
/// other mode reaches `unreachable`.
pub fn obeyRobots(self: *const Config) bool {
return switch (self.mode) {
inline .serve, .fetch => |opts| opts.common.obey_robots,
else => unreachable,
};
}
pub fn httpProxy(self: *const Config) ?[:0]const u8 { pub fn httpProxy(self: *const Config) ?[:0]const u8 {
return switch (self.mode) { return switch (self.mode) {
inline .serve, .fetch => |opts| opts.common.http_proxy, inline .serve, .fetch => |opts| opts.common.http_proxy,
@@ -165,6 +172,7 @@ pub const Fetch = struct {
}; };
pub const Common = struct { pub const Common = struct {
obey_robots: bool = false,
proxy_bearer_token: ?[:0]const u8 = null, proxy_bearer_token: ?[:0]const u8 = null,
http_proxy: ?[:0]const u8 = null, http_proxy: ?[:0]const u8 = null,
http_max_concurrent: ?u8 = null, http_max_concurrent: ?u8 = null,
@@ -231,6 +239,11 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
\\ advanced option which should only be set if you understand \\ advanced option which should only be set if you understand
\\ and accept the risk of disabling host verification. \\ and accept the risk of disabling host verification.
\\ \\
\\--obey_robots
\\ Fetches and obeys the robots.txt (if available) of the web pages
\\ we make requests towards.
\\ Defaults to false.
\\
\\--http_proxy The HTTP proxy to use for all HTTP requests. \\--http_proxy The HTTP proxy to use for all HTTP requests.
\\ A username:password can be included for basic authentication. \\ A username:password can be included for basic authentication.
\\ Defaults to none. \\ Defaults to none.
@@ -626,6 +639,11 @@ fn parseCommonArg(
return true; return true;
} }
if (std.mem.eql(u8, "--obey_robots", opt)) {
common.obey_robots = true;
return true;
}
if (std.mem.eql(u8, "--http_proxy", opt)) { if (std.mem.eql(u8, "--http_proxy", opt)) {
const str = args.next() orelse { const str = args.next() orelse {
log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" }); log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" });

878
src/browser/Robots.zig Normal file
View File

@@ -0,0 +1,878 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const log = @import("../log.zig");
/// One access rule kept from a robots.txt group. The payload is the path
/// pattern exactly as written after the key (whitespace-trimmed).
pub const Rule = union(enum) {
/// Pattern taken from an `allow` line.
allow: []const u8,
/// Pattern taken from a `disallow` line.
disallow: []const u8,
};
/// Line keys the parser understands. The text before the ':' is lowercased
/// and looked up by tag name; lines with any other key are skipped.
pub const Key = enum {
@"user-agent",
allow,
disallow,
};
/// The rules of one robots.txt as they apply to a single user-agent.
/// This file is itself the struct; its only field is `rules`.
/// https://www.rfc-editor.org/rfc/rfc9309.html
pub const Robots = @This();
/// A `Robots` without any rules; `isAllowed` returns true for every path.
pub const empty: Robots = .{ .rules = &.{} };
/// Cache of robots.txt lookups keyed by URL.
///
/// Keys are compared ASCII case-insensitively. The store owns its keys (every
/// key is a private copy of the URL passed in) and every `Robots` handed to
/// `put`; all of it is released by `deinit`.
pub const RobotStore = struct {
    /// What is remembered for a URL.
    const RobotsEntry = union(enum) {
        /// Rules stored through `put`.
        present: Robots,
        /// Marker stored through `putAbsent`; carries no rules.
        absent,
    };

    pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct {
        const Context = @This();

        /// Hashes the ASCII-lowercased bytes so that keys considered equal by
        /// `eql` hash identically. The full 64-bit digest is returned:
        /// `HashMapUnmanaged` derives its per-slot fingerprint from the top
        /// bits, which a truncated value would leave at zero.
        pub fn hash(_: Context, value: []const u8) u64 {
            var hasher = std.hash.Wyhash.init(value.len);
            for (value) |c| {
                std.hash.autoHash(&hasher, std.ascii.toLower(c));
            }
            return hasher.final();
        }

        pub fn eql(_: Context, a: []const u8, b: []const u8) bool {
            return std.ascii.eqlIgnoreCase(a, b);
        }
    }, 80);

    allocator: std.mem.Allocator,
    map: RobotsMap,

    /// Creates an empty store; `allocator` is used for keys, map storage and
    /// for releasing stored `Robots` values.
    pub fn init(allocator: std.mem.Allocator) RobotStore {
        return .{ .allocator = allocator, .map = .empty };
    }

    /// Frees every key, every stored `Robots` and the map itself.
    pub fn deinit(self: *RobotStore) void {
        var iter = self.map.iterator();
        while (iter.next()) |entry| {
            self.allocator.free(entry.key_ptr.*);
            releaseEntry(self.allocator, entry.value_ptr);
        }
        self.map.deinit(self.allocator);
    }

    /// Returns a copy of the entry stored for `url`, or null if nothing has
    /// been recorded. The copy shares its rule storage with the store.
    pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry {
        return self.map.get(url);
    }

    /// Parses `bytes` for `user_agent` using the store's allocator, so the
    /// result can later be handed to `put`.
    pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots {
        return try Robots.fromBytes(self.allocator, user_agent, bytes);
    }

    /// Records `robots` for `url`. On success the store owns `robots`; on
    /// error nothing is stored and the caller still owns it. An entry already
    /// stored for `url` is released and replaced.
    pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void {
        try self.insert(url, .{ .present = robots });
    }

    /// Records that `url` has no usable robots.txt. An entry already stored
    /// for `url` is released and replaced.
    pub fn putAbsent(self: *RobotStore, url: []const u8) !void {
        try self.insert(url, .absent);
    }

    /// Shared insertion path for `put` and `putAbsent`.
    fn insert(self: *RobotStore, url: []const u8, entry: RobotsEntry) !void {
        const owned_key = try self.allocator.dupe(u8, url);
        // Covers a failing `getOrPut`; no error can occur after it succeeds.
        errdefer self.allocator.free(owned_key);

        const gop = try self.map.getOrPut(self.allocator, owned_key);
        if (gop.found_existing) {
            // The map keeps its original key, so the fresh copy is surplus,
            // and the value about to be overwritten must be released.
            self.allocator.free(owned_key);
            releaseEntry(self.allocator, gop.value_ptr);
        }
        gop.value_ptr.* = entry;
    }

    /// Frees whatever a stored entry owns.
    fn releaseEntry(allocator: std.mem.Allocator, entry: *RobotsEntry) void {
        switch (entry.*) {
            .present => |*robots| robots.deinit(allocator),
            .absent => {},
        }
    }
};
rules: []const Rule,
/// Parser position while walking a robots.txt body line by line.
const State = struct {
/// Kind of group the current line belongs to.
entry: enum {
/// No group is open: start of file, or the previous group was closed by a
/// `user-agent` line that followed rules.
not_in_entry,
/// The group was opened by a user-agent that is neither ours nor `*`.
in_other_entry,
/// The group names our user-agent; its rules are collected for us.
in_our_entry,
/// The group names `*` and, so far, not our user-agent.
in_wildcard_entry,
},
/// Set once an allow/disallow line has been seen in the current group; the
/// next `user-agent` line then starts a new group.
has_rules: bool = false,
};
/// Releases the pattern string owned by each rule in `rules`.
/// The slice holding the rules is not freed.
fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
    for (rules) |rule| {
        const pattern = switch (rule) {
            .allow, .disallow => |owned| owned,
        };
        allocator.free(pattern);
    }
}
/// Maps the text before a ':' onto a known key, ignoring ASCII case.
/// Returns null for keys the parser does not handle.
fn parseKey(raw: []const u8) ?Key {
    for (comptime std.enums.values(Key)) |candidate| {
        if (std.ascii.eqlIgnoreCase(raw, @tagName(candidate))) return candidate;
    }
    return null;
}

/// Appends a rule of the given kind holding its own copy of `pattern`.
/// On error the copy is released and `list` is left as it was.
fn appendOwnedRule(
    allocator: std.mem.Allocator,
    list: *std.ArrayList(Rule),
    kind: std.meta.Tag(Rule),
    pattern: []const u8,
) !void {
    const owned = try allocator.dupe(u8, pattern);
    errdefer allocator.free(owned);
    try list.append(allocator, switch (kind) {
        .allow => .{ .allow = owned },
        .disallow => .{ .disallow = owned },
    });
}

/// Parses a robots.txt body and returns the rules that apply to `user_agent`.
///
/// Rules of all groups naming `user_agent` (ASCII case-insensitive) are
/// merged in file order. If those groups contribute no rule at all, the
/// merged rules of the wildcard (`*`) groups are returned instead. A group
/// may list several user-agents; it counts for us if any of them matches and
/// as a wildcard group if any of them is `*`. Lines with an unknown key are
/// skipped, as are blank lines and comments (`#` to end of line).
///
/// The caller owns the returned slice and every pattern in it.
///
/// Errors: `error.MissingColon` for a non-blank, non-comment line without a
/// ':', plus allocation failures. Nothing allocated here outlives an error.
fn parseRulesWithUserAgent(
    allocator: std.mem.Allocator,
    user_agent: []const u8,
    bytes: []const u8,
) ![]const Rule {
    // Until ownership moves to the caller, the lists own the pattern copies:
    // on error the strings are freed first (errdefer), then the list storage.
    var rules: std.ArrayList(Rule) = .empty;
    defer rules.deinit(allocator);
    errdefer freeRulesInList(allocator, rules.items);

    var wildcard_rules: std.ArrayList(Rule) = .empty;
    defer wildcard_rules.deinit(allocator);
    errdefer freeRulesInList(allocator, wildcard_rules.items);

    var state: State = .{ .entry = .not_in_entry, .has_rules = false };

    var lines = std.mem.splitScalar(u8, bytes, '\n');
    while (lines.next()) |raw_line| {
        // Everything from the first '#' onwards is a comment.
        const comment_start = std.mem.indexOfScalar(u8, raw_line, '#') orelse raw_line.len;
        const line = std.mem.trim(u8, raw_line[0..comment_start], &std.ascii.whitespace);
        if (line.len == 0) continue;

        const colon_idx = std.mem.indexOfScalar(u8, line, ':') orelse return error.MissingColon;

        // Whitespace between the key and the ':' is tolerated.
        const key_text = std.mem.trim(u8, line[0..colon_idx], &std.ascii.whitespace);
        const key = parseKey(key_text) orelse continue;
        const value = std.mem.trim(u8, line[colon_idx + 1 ..], &std.ascii.whitespace);

        switch (key) {
            .@"user-agent" => {
                // A user-agent line that follows rules opens a new group.
                if (state.has_rules) {
                    state = .{ .entry = .not_in_entry, .has_rules = false };
                }

                if (std.ascii.eqlIgnoreCase(user_agent, value)) {
                    state.entry = .in_our_entry;
                } else switch (state.entry) {
                    // A group that already names us (or `*`) is not demoted
                    // by further user-agent lines.
                    .in_our_entry, .in_wildcard_entry => {},
                    // A group opened by another agent still becomes a
                    // wildcard group when `*` is listed later on.
                    .not_in_entry, .in_other_entry => {
                        state.entry = if (std.mem.eql(u8, "*", value))
                            .in_wildcard_entry
                        else
                            .in_other_entry;
                    },
                }
            },
            .allow => {
                state.has_rules = true;
                switch (state.entry) {
                    .in_our_entry => try appendOwnedRule(allocator, &rules, .allow, value),
                    .in_wildcard_entry => try appendOwnedRule(allocator, &wildcard_rules, .allow, value),
                    .in_other_entry => {},
                    .not_in_entry => log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" }),
                }
            },
            .disallow => {
                state.has_rules = true;
                switch (state.entry) {
                    .in_our_entry => try appendOwnedRule(allocator, &rules, .disallow, value),
                    .in_wildcard_entry => try appendOwnedRule(allocator, &wildcard_rules, .disallow, value),
                    .in_other_entry => {},
                    .not_in_entry => log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" }),
                }
            },
        }
    }

    // Rules for our specific user-agent take precedence; the wildcard rules
    // are only a fallback when we have none of our own.
    if (rules.items.len > 0) {
        freeRulesInList(allocator, wildcard_rules.items);
        // Forget the freed strings so the errdefer above cannot free them a
        // second time should `toOwnedSlice` fail.
        wildcard_rules.clearRetainingCapacity();
        return try rules.toOwnedSlice(allocator);
    }
    return try wildcard_rules.toOwnedSlice(allocator);
}
/// Builds a `Robots` from the raw robots.txt `bytes`, keeping only the rules
/// that apply to `user_agent`. Parsing errors are passed through unchanged.
/// Release the result with `deinit`, using the same allocator.
pub fn fromBytes(allocator: std.mem.Allocator, user_agent: []const u8, bytes: []const u8) !Robots {
    return .{
        .rules = try parseRulesWithUserAgent(allocator, user_agent, bytes),
    };
}
/// Frees every rule pattern and then the rule slice itself.
/// `allocator` must be the one the rules were allocated with.
pub fn deinit(self: *Robots, allocator: std.mem.Allocator) void {
    for (self.rules) |rule| switch (rule) {
        .allow, .disallow => |pattern| allocator.free(pattern),
    };
    allocator.free(self.rules);
}
/// Reports whether `pattern` matches `path`.
///
/// A `*` in `pattern` matches any run of bytes, including none; every other
/// byte matches only itself. With `exact_match` the pattern must consume the
/// whole of `path`; otherwise it only has to match a prefix of `path`. An
/// empty pattern therefore matches everything as a prefix, but only the empty
/// path exactly.
///
/// The name is historical: matching is iterative. Only the most recent `*`
/// is ever revisited, which bounds the work by pattern.len * path.len even
/// for patterns with many wildcards (robots.txt content is untrusted).
fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool {
    var pattern_idx: usize = 0;
    var path_idx: usize = 0;

    // Index of the most recent '*' in `pattern`, and the path offset up to
    // which that '*' is currently assumed to have consumed input.
    var last_star: ?usize = null;
    var resume_idx: usize = 0;

    while (path_idx < path.len) {
        if (pattern_idx == pattern.len) {
            // Pattern used up with path left over: enough for a prefix match.
            if (!exact_match) return true;
        } else if (pattern[pattern_idx] == '*') {
            last_star = pattern_idx;
            resume_idx = path_idx;
            pattern_idx += 1;
            continue;
        } else if (pattern[pattern_idx] == path[path_idx]) {
            pattern_idx += 1;
            path_idx += 1;
            continue;
        }

        // Mismatch: let the most recent '*' swallow one more byte and retry
        // the remainder of the pattern from there.
        const star_idx = last_star orelse return false;
        resume_idx += 1;
        path_idx = resume_idx;
        pattern_idx = star_idx + 1;
    }

    // Path used up: whatever is left of the pattern may only be '*'.
    while (pattern_idx < pattern.len and pattern[pattern_idx] == '*') {
        pattern_idx += 1;
    }
    return pattern_idx == pattern.len;
}
/// Matches a robots.txt path pattern against `path`.
///
/// Pattern syntax:
///   `*`  matches zero or more of any character.
///   `$`  as the last character anchors the pattern to the end of the path.
///   Anything else is matched literally, as a prefix of the path.
///
/// Returns the length of `pattern` (a trailing `$` included) when it matches,
/// 0 for the empty pattern, and null when it does not match.
fn matchPattern(pattern: []const u8, path: []const u8) ?usize {
    if (pattern.len == 0) return 0;

    const anchored = pattern[pattern.len - 1] == '$';
    const body_len = if (anchored) pattern.len - 1 else pattern.len;

    if (!matchPatternRecursive(pattern[0..body_len], path, anchored)) return null;
    return pattern.len;
}
/// Decides whether `path` may be fetched under these rules.
///
/// The matching rule with the longest pattern decides; among matches of equal
/// length the one appearing later in the rule list wins. Disallow rules with
/// an empty pattern are ignored. Without any match the path is allowed.
pub fn isAllowed(self: *const Robots, path: []const u8) bool {
    var best_len: usize = 0;
    var verdict = true;

    for (self.rules) |rule| {
        const permits = rule == .allow;
        const pattern = switch (rule) {
            .allow, .disallow => |p| p,
        };

        // An empty Disallow restricts nothing.
        if (!permits and pattern.len == 0) continue;

        const match_len = matchPattern(pattern, path) orelse continue;
        if (match_len < best_len) continue;

        best_len = match_len;
        verdict = permits;
    }

    return verdict;
}
test "Robots: simple robots.txt" {
const allocator = std.testing.allocator;
const file =
\\User-agent: *
\\Disallow: /private/
\\Allow: /public/
\\
\\User-agent: Googlebot
\\Disallow: /admin/
\\
;
const rules = try parseRulesWithUserAgent(allocator, "GoogleBot", file);
defer {
freeRulesInList(allocator, rules);
allocator.free(rules);
}
try std.testing.expectEqual(1, rules.len);
try std.testing.expectEqualStrings("/admin/", rules[0].disallow);
}
test "Robots: matchPattern - simple prefix" {
try std.testing.expect(matchPattern("/admin", "/admin/page") != null);
try std.testing.expect(matchPattern("/admin", "/admin") != null);
try std.testing.expect(matchPattern("/admin", "/other") == null);
try std.testing.expect(matchPattern("/admin/page", "/admin") == null);
}
test "Robots: matchPattern - single wildcard" {
try std.testing.expect(matchPattern("/admin/*", "/admin/") != null);
try std.testing.expect(matchPattern("/admin/*", "/admin/page") != null);
try std.testing.expect(matchPattern("/admin/*", "/admin/page/subpage") != null);
try std.testing.expect(matchPattern("/admin/*", "/other/page") == null);
}
test "Robots: matchPattern - wildcard in middle" {
try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/xyz") != null);
try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/ghi/xyz") != null);
try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def") == null);
try std.testing.expect(matchPattern("/abc/*/xyz", "/other/def/xyz") == null);
}
test "Robots: matchPattern - complex wildcard case" {
try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/def/def/xyz") != null);
try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz") != null);
}
test "Robots: matchPattern - multiple wildcards" {
try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/b/y/c") != null);
try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/y/b/z/w/c") != null);
try std.testing.expect(matchPattern("/*.php", "/index.php") != null);
try std.testing.expect(matchPattern("/*.php", "/admin/index.php") != null);
}
test "Robots: matchPattern - end anchor" {
try std.testing.expect(matchPattern("/*.php$", "/index.php") != null);
try std.testing.expect(matchPattern("/*.php$", "/index.php?param=value") == null);
try std.testing.expect(matchPattern("/admin$", "/admin") != null);
try std.testing.expect(matchPattern("/admin$", "/admin/") == null);
try std.testing.expect(matchPattern("/fish$", "/fish") != null);
try std.testing.expect(matchPattern("/fish$", "/fishheads") == null);
}
test "Robots: matchPattern - wildcard with extension" {
try std.testing.expect(matchPattern("/fish*.php", "/fish.php") != null);
try std.testing.expect(matchPattern("/fish*.php", "/fishheads.php") != null);
try std.testing.expect(matchPattern("/fish*.php", "/fish/salmon.php") != null);
try std.testing.expect(matchPattern("/fish*.php", "/fish.asp") == null);
}
test "Robots: matchPattern - empty and edge cases" {
try std.testing.expect(matchPattern("", "/anything") != null);
try std.testing.expect(matchPattern("/", "/") != null);
try std.testing.expect(matchPattern("*", "/anything") != null);
try std.testing.expect(matchPattern("/*", "/anything") != null);
try std.testing.expect(matchPattern("$", "") != null);
}
test "Robots: matchPattern - real world examples" {
try std.testing.expect(matchPattern("/", "/anything") != null);
try std.testing.expect(matchPattern("/admin/", "/admin/page") != null);
try std.testing.expect(matchPattern("/admin/", "/public/page") == null);
try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf") != null);
try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf.bak") == null);
try std.testing.expect(matchPattern("/*?", "/page?param=value") != null);
try std.testing.expect(matchPattern("/*?", "/page") == null);
}
test "Robots: isAllowed - basic allow/disallow" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "MyBot",
\\User-agent: MyBot
\\Disallow: /admin/
\\Allow: /public/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/") == true);
try std.testing.expect(robots.isAllowed("/public/page") == true);
try std.testing.expect(robots.isAllowed("/admin/secret") == false);
try std.testing.expect(robots.isAllowed("/other/page") == true);
}
test "Robots: isAllowed - longest match wins" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "TestBot",
\\User-agent: TestBot
\\Disallow: /admin/
\\Allow: /admin/public/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/admin/secret") == false);
try std.testing.expect(robots.isAllowed("/admin/public/page") == true);
try std.testing.expect(robots.isAllowed("/admin/public/") == true);
}
test "Robots: isAllowed - specific user-agent vs wildcard" {
const allocator = std.testing.allocator;
var robots1 = try Robots.fromBytes(allocator, "Googlebot",
\\User-agent: Googlebot
\\Disallow: /private/
\\
\\User-agent: *
\\Disallow: /admin/
\\
);
defer robots1.deinit(allocator);
try std.testing.expect(robots1.isAllowed("/private/page") == false);
try std.testing.expect(robots1.isAllowed("/admin/page") == true);
// Test with other bot (should use wildcard)
var robots2 = try Robots.fromBytes(allocator, "OtherBot",
\\User-agent: Googlebot
\\Disallow: /private/
\\
\\User-agent: *
\\Disallow: /admin/
\\
);
defer robots2.deinit(allocator);
try std.testing.expect(robots2.isAllowed("/private/page") == true);
try std.testing.expect(robots2.isAllowed("/admin/page") == false);
}
test "Robots: isAllowed - case insensitive user-agent" {
const allocator = std.testing.allocator;
var robots1 = try Robots.fromBytes(allocator, "googlebot",
\\User-agent: GoogleBot
\\Disallow: /private/
\\
);
defer robots1.deinit(allocator);
try std.testing.expect(robots1.isAllowed("/private/") == false);
var robots2 = try Robots.fromBytes(allocator, "GOOGLEBOT",
\\User-agent: GoogleBot
\\Disallow: /private/
\\
);
defer robots2.deinit(allocator);
try std.testing.expect(robots2.isAllowed("/private/") == false);
var robots3 = try Robots.fromBytes(allocator, "GoOgLeBoT",
\\User-agent: GoogleBot
\\Disallow: /private/
\\
);
defer robots3.deinit(allocator);
try std.testing.expect(robots3.isAllowed("/private/") == false);
}
test "Robots: isAllowed - merged rules for same agent" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Googlebot",
\\User-agent: Googlebot
\\Disallow: /admin/
\\
\\User-agent: Googlebot
\\Disallow: /private/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/admin/page") == false);
try std.testing.expect(robots.isAllowed("/private/page") == false);
try std.testing.expect(robots.isAllowed("/public/page") == true);
}
test "Robots: isAllowed - wildcards in patterns" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Bot",
\\User-agent: Bot
\\Disallow: /*.php$
\\Allow: /index.php$
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/page.php") == false);
try std.testing.expect(robots.isAllowed("/index.php") == true);
try std.testing.expect(robots.isAllowed("/page.php?param=1") == true);
try std.testing.expect(robots.isAllowed("/page.html") == true);
}
test "Robots: isAllowed - empty disallow allows everything" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Bot",
\\User-agent: Bot
\\Disallow:
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/anything") == true);
try std.testing.expect(robots.isAllowed("/") == true);
}
test "Robots: isAllowed - no rules" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Bot", "");
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/anything") == true);
}
test "Robots: isAllowed - disallow all" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Bot",
\\User-agent: Bot
\\Disallow: /
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/") == false);
try std.testing.expect(robots.isAllowed("/anything") == false);
try std.testing.expect(robots.isAllowed("/admin/page") == false);
}
test "Robots: isAllowed - multiple user-agents in same entry" {
const allocator = std.testing.allocator;
var robots1 = try Robots.fromBytes(allocator, "Googlebot",
\\User-agent: Googlebot
\\User-agent: Bingbot
\\Disallow: /private/
\\
);
defer robots1.deinit(allocator);
try std.testing.expect(robots1.isAllowed("/private/") == false);
var robots2 = try Robots.fromBytes(allocator, "Bingbot",
\\User-agent: Googlebot
\\User-agent: Bingbot
\\Disallow: /private/
\\
);
defer robots2.deinit(allocator);
try std.testing.expect(robots2.isAllowed("/private/") == false);
var robots3 = try Robots.fromBytes(allocator, "OtherBot",
\\User-agent: Googlebot
\\User-agent: Bingbot
\\Disallow: /private/
\\
);
defer robots3.deinit(allocator);
try std.testing.expect(robots3.isAllowed("/private/") == true);
}
test "Robots: isAllowed - wildcard fallback" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "UnknownBot",
\\User-agent: *
\\Disallow: /admin/
\\Allow: /admin/public/
\\
\\User-agent: Googlebot
\\Disallow: /private/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/admin/secret") == false);
try std.testing.expect(robots.isAllowed("/admin/public/page") == true);
try std.testing.expect(robots.isAllowed("/private/") == true);
}
test "Robots: isAllowed - complex real-world example" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "MyBot",
\\User-agent: *
\\Disallow: /cgi-bin/
\\Disallow: /tmp/
\\Disallow: /private/
\\
\\User-agent: MyBot
\\Disallow: /admin/
\\Disallow: /*.pdf$
\\Allow: /public/*.pdf$
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/") == true);
try std.testing.expect(robots.isAllowed("/admin/dashboard") == false);
try std.testing.expect(robots.isAllowed("/docs/guide.pdf") == false);
try std.testing.expect(robots.isAllowed("/public/manual.pdf") == true);
try std.testing.expect(robots.isAllowed("/page.html") == true);
try std.testing.expect(robots.isAllowed("/cgi-bin/script.sh") == true);
}
// NOTE: despite the name, order is what decides here. Both patterns match
// "/page" with the same length, and isAllowed lets the later of two equally
// long matches win, so the trailing Disallow takes effect.
test "Robots: isAllowed - order doesn't matter for same length" {
const allocator = std.testing.allocator;
// The indented comment-only line inside the group must be skipped.
var robots = try Robots.fromBytes(allocator, "Bot",
\\User-agent: Bot
\\ # WOW!!
\\Allow: /page
\\Disallow: /page
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/page") == false);
}
// NOTE: the input is not empty. It holds a single wildcard group whose
// user-agent line carries an end-of-line comment; "MyBot" is named by no
// group and therefore falls back to the wildcard rules.
test "Robots: isAllowed - empty file uses wildcard defaults" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "MyBot",
\\User-agent: * # ABCDEF!!!
\\Disallow: /admin/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/admin/") == false);
try std.testing.expect(robots.isAllowed("/public/") == true);
}
test "Robots: isAllowed - wildcard entry with multiple user-agents including specific" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Googlebot",
\\User-agent: *
\\User-agent: Googlebot
\\Disallow: /shared/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/shared/") == false);
try std.testing.expect(robots.isAllowed("/other/") == true);
var robots2 = try Robots.fromBytes(allocator, "Bingbot",
\\User-agent: *
\\User-agent: Googlebot
\\Disallow: /shared/
\\
);
defer robots2.deinit(allocator);
try std.testing.expect(robots2.isAllowed("/shared/") == false);
}
test "Robots: isAllowed - specific agent appears after wildcard in entry" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "MyBot",
\\User-agent: *
\\User-agent: MyBot
\\User-agent: Bingbot
\\Disallow: /admin/
\\Allow: /admin/public/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/admin/secret") == false);
try std.testing.expect(robots.isAllowed("/admin/public/page") == true);
}
test "Robots: isAllowed - wildcard should not override specific entry" {
const allocator = std.testing.allocator;
var robots = try Robots.fromBytes(allocator, "Googlebot",
\\User-agent: Googlebot
\\Disallow: /private/
\\
\\User-agent: *
\\User-agent: Googlebot
\\Disallow: /admin/
\\
);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/private/") == false);
try std.testing.expect(robots.isAllowed("/admin/") == false);
}
test "Robots: isAllowed - Google's real robots.txt" {
const allocator = std.testing.allocator;
// Simplified version of google.com/robots.txt
const google_robots =
\\User-agent: *
\\User-agent: Yandex
\\Disallow: /search
\\Allow: /search/about
\\Allow: /search/howsearchworks
\\Disallow: /imgres
\\Disallow: /m?
\\Disallow: /m/
\\Allow: /m/finance
\\Disallow: /maps/
\\Allow: /maps/$
\\Allow: /maps/@
\\Allow: /maps/dir/
\\Disallow: /shopping?
\\Allow: /shopping?udm=28$
\\
\\User-agent: AdsBot-Google
\\Disallow: /maps/api/js/
\\Allow: /maps/api/js
\\Disallow: /maps/api/staticmap
\\
\\User-agent: Yandex
\\Disallow: /about/careers/applications/jobs/results
\\
\\User-agent: facebookexternalhit
\\User-agent: Twitterbot
\\Allow: /imgres
\\Allow: /search
\\Disallow: /groups
\\Disallow: /m/
\\
;
var regular_bot = try Robots.fromBytes(allocator, "Googlebot", google_robots);
defer regular_bot.deinit(allocator);
try std.testing.expect(regular_bot.isAllowed("/") == true);
try std.testing.expect(regular_bot.isAllowed("/search") == false);
try std.testing.expect(regular_bot.isAllowed("/search/about") == true);
try std.testing.expect(regular_bot.isAllowed("/search/howsearchworks") == true);
try std.testing.expect(regular_bot.isAllowed("/imgres") == false);
try std.testing.expect(regular_bot.isAllowed("/m/finance") == true);
try std.testing.expect(regular_bot.isAllowed("/m/other") == false);
try std.testing.expect(regular_bot.isAllowed("/maps/") == true);
try std.testing.expect(regular_bot.isAllowed("/maps/@") == true);
try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28") == true);
try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28&extra") == false);
var adsbot = try Robots.fromBytes(allocator, "AdsBot-Google", google_robots);
defer adsbot.deinit(allocator);
try std.testing.expect(adsbot.isAllowed("/maps/api/js") == true);
try std.testing.expect(adsbot.isAllowed("/maps/api/js/") == false);
try std.testing.expect(adsbot.isAllowed("/maps/api/staticmap") == false);
var twitterbot = try Robots.fromBytes(allocator, "Twitterbot", google_robots);
defer twitterbot.deinit(allocator);
try std.testing.expect(twitterbot.isAllowed("/imgres") == true);
try std.testing.expect(twitterbot.isAllowed("/search") == true);
try std.testing.expect(twitterbot.isAllowed("/groups") == false);
try std.testing.expect(twitterbot.isAllowed("/m/") == false);
}
test "Robots: user-agent after rules starts new entry" {
const allocator = std.testing.allocator;
const file =
\\User-agent: Bot1
\\User-agent: Bot2
\\Disallow: /admin/
\\Allow: /public/
\\User-agent: Bot3
\\Disallow: /private/
\\
;
var robots1 = try Robots.fromBytes(allocator, "Bot1", file);
defer robots1.deinit(allocator);
try std.testing.expect(robots1.isAllowed("/admin/") == false);
try std.testing.expect(robots1.isAllowed("/public/") == true);
try std.testing.expect(robots1.isAllowed("/private/") == true);
var robots2 = try Robots.fromBytes(allocator, "Bot2", file);
defer robots2.deinit(allocator);
try std.testing.expect(robots2.isAllowed("/admin/") == false);
try std.testing.expect(robots2.isAllowed("/public/") == true);
try std.testing.expect(robots2.isAllowed("/private/") == true);
var robots3 = try Robots.fromBytes(allocator, "Bot3", file);
defer robots3.deinit(allocator);
try std.testing.expect(robots3.isAllowed("/admin/") == true);
try std.testing.expect(robots3.isAllowed("/public/") == true);
try std.testing.expect(robots3.isAllowed("/private/") == false);
}
test "Robots: blank lines don't end entries" {
const allocator = std.testing.allocator;
const file =
\\User-agent: MyBot
\\Disallow: /admin/
\\
\\
\\Allow: /public/
\\
;
var robots = try Robots.fromBytes(allocator, "MyBot", file);
defer robots.deinit(allocator);
try std.testing.expect(robots.isAllowed("/admin/") == false);
try std.testing.expect(robots.isAllowed("/public/") == true);
}

View File

@@ -502,6 +502,16 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons
return buf.items[0 .. buf.items.len - 1 :0]; return buf.items[0 .. buf.items.len - 1 :0];
} }
/// Returns the robots.txt URL for `url`: the origin reported by `getOrigin`
/// followed by "/robots.txt", as a 0-terminated string allocated from `arena`.
/// Returns `error.NoOrigin` when `getOrigin` yields null.
/// NOTE(review): `origin` is never freed here. If `getOrigin` allocates it, a
/// caller passing a general-purpose allocator instead of an arena leaks it on
/// every call — confirm against `getOrigin` and the call sites.
pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 {
const origin = try getOrigin(arena, url) orelse return error.NoOrigin;
return try std.fmt.allocPrintSentinel(
arena,
"{s}/robots.txt",
.{origin},
0,
);
}
const testing = @import("../testing.zig"); const testing = @import("../testing.zig");
test "URL: isCompleteHTTPUrl" { test "URL: isCompleteHTTPUrl" {
try testing.expectEqual(true, isCompleteHTTPUrl("http://example.com/about")); try testing.expectEqual(true, isCompleteHTTPUrl("http://example.com/about"));
@@ -778,3 +788,31 @@ test "URL: concatQueryString" {
try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url); try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url);
} }
} }
test "URL: getRobotsUrl" {
    defer testing.reset();
    const arena = testing.arena_allocator;

    // Bare origin, no path.
    {
        const url = try getRobotsUrl(arena, "https://www.lightpanda.io");
        try testing.expectString("https://www.lightpanda.io/robots.txt", url);
    }

    // The path is replaced.
    {
        const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path");
        try testing.expectString("https://www.lightpanda.io/robots.txt", url);
    }

    // An explicit port is kept.
    {
        const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page");
        try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url);
    }

    // Query string and fragment are dropped.
    {
        const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment");
        try testing.expectString("http://example.com/robots.txt", url);
    }

    // Credentials are dropped.
    {
        const url = try getRobotsUrl(arena, "https://user:pass@example.com/page");
        try testing.expectString("https://example.com/robots.txt", url);
    }
}

View File

@@ -27,6 +27,8 @@ const Config = @import("../Config.zig");
const URL = @import("../browser/URL.zig"); const URL = @import("../browser/URL.zig");
const Notification = @import("../Notification.zig"); const Notification = @import("../Notification.zig");
const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar; const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar;
const Robots = @import("../browser/Robots.zig");
const RobotStore = Robots.RobotStore;
const c = Http.c; const c = Http.c;
const posix = std.posix; const posix = std.posix;
@@ -85,6 +87,12 @@ queue: TransferQueue,
// The main app allocator // The main app allocator
allocator: Allocator, allocator: Allocator,
// Reference to the App-owned Robot Store.
robot_store: *RobotStore,
// Queue of requests that depend on a robots.txt.
// Allows us to fetch the robots.txt just once.
pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty,
// Once we have a handle/easy to process a request with, we create a Transfer // Once we have a handle/easy to process a request with, we create a Transfer
// which contains the Request as well as any state we need to process the // which contains the Request as well as any state we need to process the
// request. These will come and go with each request. // request. These will come and go with each request.
@@ -123,7 +131,7 @@ pub const CDPClient = struct {
const TransferQueue = std.DoublyLinkedList; const TransferQueue = std.DoublyLinkedList;
pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config) !*Client { pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, robot_store: *RobotStore, config: *const Config) !*Client {
var transfer_pool = std.heap.MemoryPool(Transfer).init(allocator); var transfer_pool = std.heap.MemoryPool(Transfer).init(allocator);
errdefer transfer_pool.deinit(); errdefer transfer_pool.deinit();
@@ -147,6 +155,7 @@ pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config)
.multi = multi, .multi = multi,
.handles = handles, .handles = handles,
.allocator = allocator, .allocator = allocator,
.robot_store = robot_store,
.http_proxy = http_proxy, .http_proxy = http_proxy,
.use_proxy = http_proxy != null, .use_proxy = http_proxy != null,
.config = config, .config = config,
@@ -163,6 +172,13 @@ pub fn deinit(self: *Client) void {
_ = c.curl_multi_cleanup(self.multi); _ = c.curl_multi_cleanup(self.multi);
self.transfer_pool.deinit(); self.transfer_pool.deinit();
var robots_iter = self.pending_robots_queue.iterator();
while (robots_iter.next()) |entry| {
entry.value_ptr.deinit(self.allocator);
}
self.pending_robots_queue.deinit(self.allocator);
self.allocator.destroy(self); self.allocator.destroy(self);
} }
@@ -217,12 +233,46 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus {
} }
/// Entry point for issuing a request.
///
/// When `config.obeyRobots()` is false the request is processed right away.
/// Otherwise the robots.txt URL for `req.url` is derived and looked up in the
/// robot store:
///   - `.present`: the request path is checked; a disallowed path is reported
///     through `req.error_callback` with `error.RobotsBlocked` and the
///     function returns without error.
///   - `.absent`: the request is processed without any check.
///   - not cached: the request is handed to `fetchRobotsThenProcessRequest`.
pub fn request(self: *Client, req: Request) !void {
    if (!self.config.obeyRobots()) {
        return self.processRequest(req);
    }

    const robots_url = try URL.getRobotsUrl(self.allocator, req.url);

    // If we have this robots cached, we can take a fast path.
    if (self.robot_store.get(robots_url)) |robot_entry| {
        // The URL was only needed as a lookup key. This `defer` must be the
        // only release in this branch: it runs on success *and* on error, so
        // pairing it with an `errdefer` would free the URL twice whenever
        // processRequest fails.
        defer self.allocator.free(robots_url);

        switch (robot_entry) {
            // If we have a found robots entry, we check it.
            .present => |robots| {
                const path = URL.getPathname(req.url);
                if (!robots.isAllowed(path)) {
                    req.error_callback(req.ctx, error.RobotsBlocked);
                    return;
                }
            },
            // Otherwise, we assume we won't find it again.
            .absent => {},
        }

        return self.processRequest(req);
    }

    // Registered only after the cached branch so that branch's `defer` stays
    // the single release of `robots_url` there.
    errdefer self.allocator.free(robots_url);
    return self.fetchRobotsThenProcessRequest(robots_url, req);
}
fn processRequest(self: *Client, req: Request) !void {
const transfer = try self.makeTransfer(req); const transfer = try self.makeTransfer(req);
transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer }); transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer });
var wait_for_interception = false; var wait_for_interception = false;
transfer.req.notification.dispatch(.http_request_intercept, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception }); transfer.req.notification.dispatch(.http_request_intercept, &.{
.transfer = transfer,
.wait_for_interception = &wait_for_interception,
});
if (wait_for_interception == false) { if (wait_for_interception == false) {
// request not intercepted, process it normally // request not intercepted, process it normally
return self.process(transfer); return self.process(transfer);
@@ -246,6 +296,154 @@ pub fn request(self: *Client, req: Request) !void {
} }
} }
/// State carried through a single robots.txt fetch via the transfer's `ctx`.
const RobotsRequestContext = struct {
    /// Client that issued the fetch; its allocator backs everything below.
    client: *Client,
    /// The request that triggered this robots.txt fetch.
    req: Request,
    /// URL of the robots.txt being fetched. Released in `deinit`.
    robots_url: [:0]const u8,
    /// Accumulates the response body as data arrives.
    buffer: std.ArrayList(u8),
    /// HTTP status of the robots.txt response; stays 0 until a header is seen.
    status: u16 = 0,

    /// Releases the body buffer, the robots URL and finally the context
    /// itself. `self` must not be used afterwards.
    pub fn deinit(self: *RobotsRequestContext) void {
        const allocator = self.client.allocator;
        self.buffer.deinit(allocator);
        allocator.free(self.robots_url);
        allocator.destroy(self);
    }
};
/// Parks `req` until the robots.txt at `robots_url` has been fetched,
/// starting that fetch if none is in flight for this URL yet.
///
/// Ownership of `robots_url` (allocated with `self.allocator`):
///   - on success this function takes it over: it is either handed to the
///     new `RobotsRequestContext` (and doubles as the pending-queue key), or
///     freed right here when a fetch for the same URL is already pending;
///   - on error it is left untouched and still belongs to the caller, so the
///     caller's own cleanup is the single place that releases it.
fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
    const entry = try self.pending_robots_queue.getOrPut(self.allocator, robots_url);

    if (entry.found_existing) {
        try entry.value_ptr.append(self.allocator, req);
        // Not using our own robots URL, only using the one from the first
        // request. Freed after the last fallible step so an error never
        // leaves the caller holding an already-freed slice.
        self.allocator.free(robots_url);
        return;
    }

    // If we aren't already fetching this robots,
    // we want to create a new queue for it and add this request into it.
    entry.value_ptr.* = .empty;
    errdefer {
        // Drop the half-built entry: its key is `robots_url`, which the
        // caller is free to release once we return an error.
        if (self.pending_robots_queue.fetchRemove(robots_url)) |removed| {
            var waiting = removed.value;
            waiting.deinit(self.allocator);
        }
    }

    // Queue the request before the fetch is started so nothing fallible
    // remains once the in-flight context owns `robots_url`.
    try entry.value_ptr.append(self.allocator, req);

    const ctx = try self.allocator.create(RobotsRequestContext);
    // NOTE(review): assumes a failing processRequest has not already invoked
    // one of the robots callbacks (which would deinit ctx) — confirm.
    errdefer self.allocator.destroy(ctx);
    ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };

    const headers = try self.newHeaders();
    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
    try self.processRequest(.{
        .ctx = ctx,
        .url = robots_url,
        .method = .GET,
        .headers = headers,
        .blocking = false,
        .cookie_jar = req.cookie_jar,
        .notification = req.notification,
        .resource_type = .fetch,
        .header_callback = robotsHeaderCallback,
        .data_callback = robotsDataCallback,
        .done_callback = robotsDoneCallback,
        .error_callback = robotsErrorCallback,
        .shutdown_callback = robotsShutdownCallback,
    });
}
/// Header callback for a robots.txt fetch.
///
/// Records the response status on the context and, when the transfer reports
/// a content length, reserves that much room in the body buffer up front.
/// Always returns true unless the reservation fails.
fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
    const robots_ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));

    if (transfer.response_header) |header| {
        log.debug(.browser, "robots status", .{ .status = header.status, .robots_url = robots_ctx.robots_url });
        robots_ctx.status = header.status;
    }

    // No advertised length: nothing to reserve.
    const content_length = transfer.getContentLength() orelse return true;
    try robots_ctx.buffer.ensureTotalCapacity(robots_ctx.client.allocator, content_length);
    return true;
}
/// Data callback for a robots.txt fetch: appends the received chunk to the
/// context's body buffer.
fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void {
    const robots_ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
    const allocator = robots_ctx.client.allocator;
    return robots_ctx.buffer.appendSlice(allocator, data);
}
/// Completion callback for a robots.txt fetch.
///
/// Parses and caches the rules when the response has a 2xx/3xx status and a
/// non-empty body, records the file as absent on a 404, then releases every
/// request that was waiting on this robots.txt. Each waiting request is
/// checked against the rules using *its own* path; a disallowed one receives
/// `error.RobotsBlocked` through its error callback.
///
/// A failure to parse or cache the rules is logged and the waiting requests
/// are let through, matching what `robotsErrorCallback` does when the fetch
/// itself fails. Returning early instead would leave the pending-queue entry
/// behind with a key that `ctx.deinit()` has already freed.
fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
    defer ctx.deinit();

    const client = ctx.client;

    // The parsed rules, or null when there is nothing to enforce.
    const maybe_robots = blk: {
        if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
            const robots = client.robot_store.robotsFromBytes(
                client.config.http_headers.user_agent,
                ctx.buffer.items,
            ) catch |err| {
                log.warn(.http, "robots parse failed", .{ .err = err, .url = ctx.robots_url });
                break :blk null;
            };
            // Failing to cache is not fatal: the rules we just parsed are
            // still applied to the requests waiting right now.
            client.robot_store.put(ctx.robots_url, robots) catch |err| {
                log.warn(.http, "robots cache failed", .{ .err = err, .url = ctx.robots_url });
            };
            break :blk robots;
        }

        if (ctx.status == 404) {
            log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
            client.robot_store.putAbsent(ctx.robots_url) catch |err| {
                log.warn(.http, "robots cache failed", .{ .err = err, .url = ctx.robots_url });
            };
        }
        break :blk null;
    };

    var queued = client.pending_robots_queue.fetchRemove(
        ctx.robots_url,
    ) orelse @panic("Client.robotsDoneCallback empty queue");
    defer queued.value.deinit(client.allocator);

    for (queued.value.items) |queued_req| {
        // Requests queued on the same robots.txt can target different paths,
        // so the verdict has to be computed per request.
        const allowed = if (maybe_robots) |robots|
            robots.isAllowed(URL.getPathname(queued_req.url))
        else
            true;

        if (!allowed) {
            log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
            queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
            continue;
        }

        client.processRequest(queued_req) catch |e| {
            queued_req.error_callback(queued_req.ctx, e);
        };
    }
}
/// Error callback for a robots.txt fetch.
///
/// The failure is logged and treated as "no rules": every request that was
/// waiting on this robots.txt is processed. A request whose processing fails
/// is notified through its own error callback.
fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
    const robots_ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
    defer robots_ctx.deinit();

    log.warn(.http, "robots fetch failed", .{ .err = err });

    const client = robots_ctx.client;
    var pending = client.pending_robots_queue.fetchRemove(robots_ctx.robots_url) orelse
        @panic("Client.robotsErrorCallback empty queue");
    defer pending.value.deinit(client.allocator);

    // On error, allow all queued requests to proceed
    for (pending.value.items) |waiting_req| {
        client.processRequest(waiting_req) catch |failure| {
            waiting_req.error_callback(waiting_req.ctx, failure);
        };
    }
}
/// Shutdown callback for a robots.txt fetch.
///
/// Removes the list of requests waiting on this robots.txt and forwards the
/// shutdown to each one that registered a shutdown callback. None of the
/// waiting requests is processed.
fn robotsShutdownCallback(ctx_ptr: *anyopaque) void {
    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
    defer ctx.deinit();

    log.debug(.http, "robots fetch shutdown", .{});

    // The panic message names this function (it previously pointed at
    // robotsErrorCallback, which would misdirect anyone reading a crash log).
    var queued = ctx.client.pending_robots_queue.fetchRemove(
        ctx.robots_url,
    ) orelse @panic("Client.robotsShutdownCallback empty queue");
    defer queued.value.deinit(ctx.client.allocator);

    for (queued.value.items) |queued_req| {
        if (queued_req.shutdown_callback) |shutdown_cb| {
            shutdown_cb(queued_req.ctx);
        }
    }
}
fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool { fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
// The request was intercepted and is blocking. This is messy, but our // The request was intercepted and is blocking. This is messy, but our
// callers, the ScriptManager -> Page, don't have a great way to stop the // callers, the ScriptManager -> Page, don't have a great way to stop the

View File

@@ -30,6 +30,7 @@ pub const Transfer = Client.Transfer;
const log = @import("../log.zig"); const log = @import("../log.zig");
const errors = @import("errors.zig"); const errors = @import("errors.zig");
const RobotStore = @import("../browser/Robots.zig").RobotStore;
const Allocator = std.mem.Allocator; const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator; const ArenaAllocator = std.heap.ArenaAllocator;
@@ -46,7 +47,7 @@ client: *Client,
ca_blob: ?c.curl_blob, ca_blob: ?c.curl_blob,
arena: ArenaAllocator, arena: ArenaAllocator,
pub fn init(allocator: Allocator, config: *const Config) !Http { pub fn init(allocator: Allocator, robot_store: *RobotStore, config: *const Config) !Http {
try errorCheck(c.curl_global_init(c.CURL_GLOBAL_SSL)); try errorCheck(c.curl_global_init(c.CURL_GLOBAL_SSL));
errdefer c.curl_global_cleanup(); errdefer c.curl_global_cleanup();
@@ -62,7 +63,7 @@ pub fn init(allocator: Allocator, config: *const Config) !Http {
ca_blob = try loadCerts(allocator, arena.allocator()); ca_blob = try loadCerts(allocator, arena.allocator());
} }
var client = try Client.init(allocator, ca_blob, config); var client = try Client.init(allocator, ca_blob, robot_store, config);
errdefer client.deinit(); errdefer client.deinit();
return .{ return .{