Merge pull request #1407 from lightpanda-io/robots

Support for `robots.txt`
This commit is contained in:
Karl Seguin
2026-02-10 09:51:32 +08:00
committed by GitHub
6 changed files with 1143 additions and 5 deletions

View File

@@ -57,6 +57,13 @@ pub fn tlsVerifyHost(self: *const Config) bool {
};
}
/// Reports whether the `obey_robots` flag is set in the common options of the
/// active mode.
///
/// Only the `.serve` and `.fetch` modes are handled; every other mode is
/// declared `unreachable`, so callers must not invoke this accessor unless
/// the configuration is in one of those two modes.
pub fn obeyRobots(self: *const Config) bool {
    switch (self.mode) {
        // `inline` so the capture takes each mode's own payload type; both
        // are accessed through their `common` field.
        inline .serve, .fetch => |mode_opts| return mode_opts.common.obey_robots,
        else => unreachable,
    }
}
pub fn httpProxy(self: *const Config) ?[:0]const u8 {
return switch (self.mode) {
inline .serve, .fetch => |opts| opts.common.http_proxy,
@@ -165,6 +172,7 @@ pub const Fetch = struct {
};
pub const Common = struct {
obey_robots: bool = false,
proxy_bearer_token: ?[:0]const u8 = null,
http_proxy: ?[:0]const u8 = null,
http_max_concurrent: ?u8 = null,
@@ -231,6 +239,11 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
\\ advanced option which should only be set if you understand
\\ and accept the risk of disabling host verification.
\\
\\--obey_robots
\\ Fetches and obeys the robots.txt (if available) of the web pages
\\ we make requests towards.
\\ Defaults to false.
\\
\\--http_proxy The HTTP proxy to use for all HTTP requests.
\\ A username:password can be included for basic authentication.
\\ Defaults to none.
@@ -626,6 +639,11 @@ fn parseCommonArg(
return true;
}
if (std.mem.eql(u8, "--obey_robots", opt)) {
common.obey_robots = true;
return true;
}
if (std.mem.eql(u8, "--http_proxy", opt)) {
const str = args.next() orelse {
log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" });