mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-21 20:24:42 +00:00
Merge pull request #1407 from lightpanda-io/robots
Support for `robots.txt`
This commit is contained in:
@@ -25,6 +25,7 @@ const Config = @import("Config.zig");
|
||||
const Snapshot = @import("browser/js/Snapshot.zig");
|
||||
const Platform = @import("browser/js/Platform.zig");
|
||||
const Telemetry = @import("telemetry/telemetry.zig").Telemetry;
|
||||
const RobotStore = @import("browser/Robots.zig").RobotStore;
|
||||
|
||||
pub const Http = @import("http/Http.zig");
|
||||
pub const ArenaPool = @import("ArenaPool.zig");
|
||||
@@ -38,6 +39,7 @@ snapshot: Snapshot,
|
||||
telemetry: Telemetry,
|
||||
allocator: Allocator,
|
||||
arena_pool: ArenaPool,
|
||||
robots: RobotStore,
|
||||
app_dir_path: ?[]const u8,
|
||||
shutdown: bool = false,
|
||||
|
||||
@@ -48,7 +50,9 @@ pub fn init(allocator: Allocator, config: *const Config) !*App {
|
||||
app.config = config;
|
||||
app.allocator = allocator;
|
||||
|
||||
app.http = try Http.init(allocator, config);
|
||||
app.robots = RobotStore.init(allocator);
|
||||
|
||||
app.http = try Http.init(allocator, &app.robots, config);
|
||||
errdefer app.http.deinit();
|
||||
|
||||
app.platform = try Platform.init();
|
||||
@@ -79,6 +83,7 @@ pub fn deinit(self: *App) void {
|
||||
self.app_dir_path = null;
|
||||
}
|
||||
self.telemetry.deinit();
|
||||
self.robots.deinit();
|
||||
self.http.deinit();
|
||||
self.snapshot.deinit();
|
||||
self.platform.deinit();
|
||||
|
||||
@@ -57,6 +57,13 @@ pub fn tlsVerifyHost(self: *const Config) bool {
|
||||
};
|
||||
}
|
||||
|
||||
pub fn obeyRobots(self: *const Config) bool {
|
||||
return switch (self.mode) {
|
||||
inline .serve, .fetch => |opts| opts.common.obey_robots,
|
||||
else => unreachable,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn httpProxy(self: *const Config) ?[:0]const u8 {
|
||||
return switch (self.mode) {
|
||||
inline .serve, .fetch => |opts| opts.common.http_proxy,
|
||||
@@ -165,6 +172,7 @@ pub const Fetch = struct {
|
||||
};
|
||||
|
||||
pub const Common = struct {
|
||||
obey_robots: bool = false,
|
||||
proxy_bearer_token: ?[:0]const u8 = null,
|
||||
http_proxy: ?[:0]const u8 = null,
|
||||
http_max_concurrent: ?u8 = null,
|
||||
@@ -231,6 +239,11 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
|
||||
\\ advanced option which should only be set if you understand
|
||||
\\ and accept the risk of disabling host verification.
|
||||
\\
|
||||
\\--obey_robots
|
||||
\\ Fetches and obeys the robots.txt (if available) of the web pages
|
||||
\\ we make requests towards.
|
||||
\\ Defaults to false.
|
||||
\\
|
||||
\\--http_proxy The HTTP proxy to use for all HTTP requests.
|
||||
\\ A username:password can be included for basic authentication.
|
||||
\\ Defaults to none.
|
||||
@@ -626,6 +639,11 @@ fn parseCommonArg(
|
||||
return true;
|
||||
}
|
||||
|
||||
if (std.mem.eql(u8, "--obey_robots", opt)) {
|
||||
common.obey_robots = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (std.mem.eql(u8, "--http_proxy", opt)) {
|
||||
const str = args.next() orelse {
|
||||
log.fatal(.app, "missing argument value", .{ .arg = "--http_proxy" });
|
||||
|
||||
878
src/browser/Robots.zig
Normal file
878
src/browser/Robots.zig
Normal file
@@ -0,0 +1,878 @@
|
||||
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
|
||||
//
|
||||
// Francis Bouvier <francis@lightpanda.io>
|
||||
// Pierre Tachoire <pierre@lightpanda.io>
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as
|
||||
// published by the Free Software Foundation, either version 3 of the
|
||||
// License, or (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
const std = @import("std");
|
||||
const log = @import("../log.zig");
|
||||
|
||||
/// A single robots.txt rule: a path pattern that is either explicitly
/// allowed or explicitly disallowed for the selected user-agent.
/// The pattern string is owned by the rule (duped at parse time).
pub const Rule = union(enum) {
    allow: []const u8,
    disallow: []const u8,
};
|
||||
|
||||
/// The robots.txt directives we understand. Tag names match the
/// lower-cased directive names so `std.meta.stringToEnum` can map a
/// lowered key string directly to a tag. Unknown directives
/// (Sitemap, Crawl-delay, ...) are skipped by the parser.
pub const Key = enum {
    @"user-agent",
    allow,
    disallow,
};
|
||||
|
||||
/// Parsed robots.txt rules for one user-agent.
/// https://www.rfc-editor.org/rfc/rfc9309.html
pub const Robots = @This();

/// A Robots value with no rules: every path is allowed.
pub const empty: Robots = .{ .rules = &.{} };
|
||||
|
||||
/// Cache of parsed robots.txt files, keyed case-insensitively by URL/host.
/// All keys and entry values are owned by the store and freed in `deinit`.
pub const RobotStore = struct {
    const RobotsEntry = union(enum) {
        /// A robots.txt was fetched and parsed.
        present: Robots,
        /// The site has no robots.txt (negative cache, avoids re-fetching).
        absent,
    };

    /// Case-insensitive map from URL to its robots.txt entry.
    pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct {
        const Context = @This();

        // std.HashMapUnmanaged requires the context hash to return u64
        // (only the ArrayHashMap family uses u32 hashes). Hash the
        // lower-cased bytes so hashing agrees with the eqlIgnoreCase below.
        pub fn hash(_: Context, value: []const u8) u64 {
            var hasher = std.hash.Wyhash.init(value.len);
            for (value) |c| {
                std.hash.autoHash(&hasher, std.ascii.toLower(c));
            }
            return hasher.final();
        }

        pub fn eql(_: Context, a: []const u8, b: []const u8) bool {
            return std.ascii.eqlIgnoreCase(a, b);
        }
    }, 80);

    allocator: std.mem.Allocator,
    map: RobotsMap,

    pub fn init(allocator: std.mem.Allocator) RobotStore {
        return .{ .allocator = allocator, .map = .empty };
    }

    /// Frees all owned keys, all parsed Robots values, and the map itself.
    pub fn deinit(self: *RobotStore) void {
        var iter = self.map.iterator();

        while (iter.next()) |entry| {
            self.allocator.free(entry.key_ptr.*);

            switch (entry.value_ptr.*) {
                .present => |*robots| robots.deinit(self.allocator),
                .absent => {},
            }
        }

        self.map.deinit(self.allocator);
    }

    /// Case-insensitive lookup. Returns null when the URL was never cached.
    pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry {
        return self.map.get(url);
    }

    /// Parse robots.txt bytes with the store's allocator. The result is NOT
    /// inserted into the map; pair with `put` to cache it.
    pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots {
        return try Robots.fromBytes(self.allocator, user_agent, bytes);
    }

    /// Cache parsed rules for `url`. Takes ownership of `robots`.
    pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void {
        try self.putEntry(url, .{ .present = robots });
    }

    /// Negative-cache: remember that `url` has no robots.txt.
    pub fn putAbsent(self: *RobotStore, url: []const u8) !void {
        try self.putEntry(url, .absent);
    }

    // Shared insert path. Uses getOrPut so that re-inserting the same URL
    // neither leaks a duplicate duped key nor leaks the previously stored
    // Robots value (a plain `put` would clobber both).
    fn putEntry(self: *RobotStore, url: []const u8, entry: RobotsEntry) !void {
        const duped = try self.allocator.dupe(u8, url);
        const gop = self.map.getOrPut(self.allocator, duped) catch |err| {
            self.allocator.free(duped);
            return err;
        };
        if (gop.found_existing) {
            // The map already owns an equivalent key; drop our copy and
            // release the value being replaced.
            self.allocator.free(duped);
            switch (gop.value_ptr.*) {
                .present => |*old| old.deinit(self.allocator),
                .absent => {},
            }
        }
        gop.value_ptr.* = entry;
    }
};
|
||||
|
||||
rules: []const Rule,
|
||||
|
||||
/// Parser state while scanning robots.txt line by line.
const State = struct {
    // Which kind of user-agent entry the current line belongs to.
    entry: enum {
        not_in_entry,
        in_other_entry,
        in_our_entry,
        in_wildcard_entry,
    },
    // True once a rule line was seen in the current entry; a subsequent
    // user-agent line then starts a fresh entry instead of extending this one.
    has_rules: bool = false,
};
|
||||
|
||||
/// Release the duped path pattern held by every rule in `rules`.
/// Does not free the `rules` slice itself.
fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
    for (rules) |rule| switch (rule) {
        // Both variants carry the same payload type, so one prong
        // with a shared capture frees either kind of pattern.
        .allow, .disallow => |pattern| allocator.free(pattern),
    };
}
|
||||
|
||||
/// Parse robots.txt `bytes` and return the rule list that applies to
/// `user_agent`. Rules from entries naming `user_agent` (case-insensitive)
/// are preferred; if none exist we fall back to the wildcard ("*") entry.
/// Caller owns the returned slice and each rule's pattern
/// (free with `freeRulesInList` + `allocator.free`).
fn parseRulesWithUserAgent(
    allocator: std.mem.Allocator,
    user_agent: []const u8,
    bytes: []const u8,
) ![]const Rule {
    var rules: std.ArrayList(Rule) = .empty;
    defer rules.deinit(allocator);
    // The deferred deinit only frees the list's backing array; on an error
    // return the duped patterns must be freed too, or they leak.
    errdefer freeRulesInList(allocator, rules.items);

    var wildcard_rules: std.ArrayList(Rule) = .empty;
    defer wildcard_rules.deinit(allocator);
    errdefer freeRulesInList(allocator, wildcard_rules.items);

    var state: State = .{ .entry = .not_in_entry, .has_rules = false };

    var iter = std.mem.splitScalar(u8, bytes, '\n');
    while (iter.next()) |line| {
        const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);

        // Skip all comment lines.
        if (std.mem.startsWith(u8, trimmed, "#")) continue;

        // Remove end of line comment.
        const true_line = if (std.mem.indexOfScalar(u8, trimmed, '#')) |pos|
            std.mem.trimRight(u8, trimmed[0..pos], &std.ascii.whitespace)
        else
            trimmed;

        if (true_line.len == 0) {
            continue;
        }

        const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse return error.MissingColon;
        // Directive names are matched case-insensitively; lower the key so
        // stringToEnum can map it onto the Key tags.
        const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]);
        defer allocator.free(key_str);

        // Unknown directives (Sitemap, Crawl-delay, ...) are ignored.
        const key = std.meta.stringToEnum(Key, key_str) orelse {
            // log.warn(.browser, "robots key", .{ .key = key_str });
            continue;
        };

        const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace);

        switch (key) {
            .@"user-agent" => {
                // A user-agent line after rule lines starts a brand-new
                // entry; consecutive user-agent lines extend the current one.
                if (state.has_rules) {
                    state = .{ .entry = .not_in_entry, .has_rules = false };
                }

                switch (state.entry) {
                    .in_other_entry => {
                        if (std.ascii.eqlIgnoreCase(user_agent, value)) {
                            state.entry = .in_our_entry;
                        }
                    },
                    .in_our_entry => {},
                    .in_wildcard_entry => {
                        // Our specific agent listed alongside "*" upgrades
                        // the entry to ours.
                        if (std.ascii.eqlIgnoreCase(user_agent, value)) {
                            state.entry = .in_our_entry;
                        }
                    },
                    .not_in_entry => {
                        if (std.ascii.eqlIgnoreCase(user_agent, value)) {
                            state.entry = .in_our_entry;
                        } else if (std.mem.eql(u8, "*", value)) {
                            state.entry = .in_wildcard_entry;
                        } else {
                            state.entry = .in_other_entry;
                        }
                    },
                }
            },
            .allow => {
                defer state.has_rules = true;

                switch (state.entry) {
                    .in_our_entry => {
                        const duped_value = try allocator.dupe(u8, value);
                        errdefer allocator.free(duped_value);
                        try rules.append(allocator, .{ .allow = duped_value });
                    },
                    .in_other_entry => {},
                    .in_wildcard_entry => {
                        const duped_value = try allocator.dupe(u8, value);
                        errdefer allocator.free(duped_value);
                        try wildcard_rules.append(allocator, .{ .allow = duped_value });
                    },
                    .not_in_entry => {
                        // A rule before any user-agent line is invalid input.
                        log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" });
                        continue;
                    },
                }
            },
            .disallow => {
                defer state.has_rules = true;

                switch (state.entry) {
                    .in_our_entry => {
                        const duped_value = try allocator.dupe(u8, value);
                        errdefer allocator.free(duped_value);
                        try rules.append(allocator, .{ .disallow = duped_value });
                    },
                    .in_other_entry => {},
                    .in_wildcard_entry => {
                        const duped_value = try allocator.dupe(u8, value);
                        errdefer allocator.free(duped_value);
                        try wildcard_rules.append(allocator, .{ .disallow = duped_value });
                    },
                    .not_in_entry => {
                        log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" });
                        continue;
                    },
                }
            },
        }
    }

    // If we have rules for our specific User-Agent, we will use those rules.
    // If we don't have any rules, we fallback to using the wildcard ("*") rules.
    if (rules.items.len > 0) {
        freeRulesInList(allocator, wildcard_rules.items);
        // Clear so the errdefer above cannot double-free these patterns if
        // toOwnedSlice fails below.
        wildcard_rules.clearRetainingCapacity();
        return try rules.toOwnedSlice(allocator);
    }

    // `rules` is empty here, so this free is a no-op; kept for symmetry.
    freeRulesInList(allocator, rules.items);
    rules.clearRetainingCapacity();
    return try wildcard_rules.toOwnedSlice(allocator);
}
|
||||
|
||||
/// Parse `bytes` as robots.txt, keeping only the rules that apply to
/// `user_agent` (specific entry preferred, wildcard "*" as fallback).
/// Caller owns the result; release it with `deinit` using the same allocator.
pub fn fromBytes(allocator: std.mem.Allocator, user_agent: []const u8, bytes: []const u8) !Robots {
    const rules = try parseRulesWithUserAgent(allocator, user_agent, bytes);
    return .{ .rules = rules };
}
|
||||
|
||||
/// Free every rule's duped pattern, then the rules slice itself.
/// `allocator` must be the allocator that was passed to `fromBytes`.
pub fn deinit(self: *Robots, allocator: std.mem.Allocator) void {
    freeRulesInList(allocator, self.rules);
    allocator.free(self.rules);
}
|
||||
|
||||
/// Match `pattern` (with '*' wildcards; any trailing '$' already stripped)
/// against `path`. When `exact_match` is set the pattern must consume the
/// whole path; otherwise a matching prefix suffices.
fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool {
    if (pattern.len == 0) return true;

    const star = std.mem.indexOfScalar(u8, pattern, '*') orelse {
        // No wildcard left: plain comparison — exact or prefix.
        return if (exact_match)
            std.mem.eql(u8, path, pattern)
        else
            std.mem.startsWith(u8, path, pattern);
    };

    // The literal text before the '*' must match the front of the path.
    if (!std.mem.startsWith(u8, path, pattern[0..star])) return false;

    const rest = pattern[star + 1 ..];
    // A trailing '*' swallows everything remaining.
    if (rest.len == 0) return true;

    // Let the '*' absorb 0..n characters: try every split point.
    var split = star;
    while (split <= path.len) : (split += 1) {
        if (matchPatternRecursive(rest, path[split..], exact_match)) {
            return true;
        }
    }

    return false;
}

/// There are rules for how the pattern in robots.txt should be matched.
///
/// * should match 0 or more of any character.
/// $ should signify the end of a path, making it exact.
/// otherwise, it is a prefix path.
///
/// Returns the pattern's length (its specificity, '$' included) on a match,
/// null otherwise. The empty pattern matches everything with length 0.
fn matchPattern(pattern: []const u8, path: []const u8) ?usize {
    if (pattern.len == 0) return 0;

    const anchored = pattern[pattern.len - 1] == '$';
    const body = if (anchored) pattern[0 .. pattern.len - 1] else pattern;

    if (!matchPatternRecursive(body, path, anchored)) return null;
    return pattern.len;
}
|
||||
|
||||
/// Return whether `path` may be fetched under these rules.
/// The most specific (longest) matching pattern decides the outcome; with
/// no matching rule the path is allowed by default.
/// NOTE(review): on equal pattern length the LATER rule wins (`>=` below),
/// whereas RFC 9309 resolves length ties in favor of "allow". The
/// "order doesn't matter for same length" test pins the current behavior —
/// confirm the deviation is intended before changing it.
pub fn isAllowed(self: *const Robots, path: []const u8) bool {
    const rules = self.rules;

    var longest_match_len: usize = 0;
    var is_allowed_result = true;

    for (rules) |rule| {
        switch (rule) {
            .allow => |pattern| {
                if (matchPattern(pattern, path)) |len| {
                    // Longest or Last Wins.
                    if (len >= longest_match_len) {
                        longest_match_len = len;
                        is_allowed_result = true;
                    }
                }
            },
            .disallow => |pattern| {
                // An empty Disallow line means "allow everything"; skip it
                // so it cannot override anything.
                if (pattern.len == 0) continue;

                if (matchPattern(pattern, path)) |len| {
                    // Longest or Last Wins.
                    if (len >= longest_match_len) {
                        longest_match_len = len;
                        is_allowed_result = false;
                    }
                }
            },
        }
    }

    return is_allowed_result;
}
|
||||
|
||||
// Parser-level test: the specific "Googlebot" entry must be chosen over the
// wildcard entry, and user-agent matching is case-insensitive.
test "Robots: simple robots.txt" {
    const allocator = std.testing.allocator;

    const file =
        \\User-agent: *
        \\Disallow: /private/
        \\Allow: /public/
        \\
        \\User-agent: Googlebot
        \\Disallow: /admin/
        \\
    ;

    const rules = try parseRulesWithUserAgent(allocator, "GoogleBot", file);
    defer {
        freeRulesInList(allocator, rules);
        allocator.free(rules);
    }

    try std.testing.expectEqual(1, rules.len);
    try std.testing.expectEqualStrings("/admin/", rules[0].disallow);
}
|
||||
|
||||
// matchPattern unit tests: plain prefixes, '*' wildcards in every position,
// the '$' end anchor, and empty/edge-case patterns.
test "Robots: matchPattern - simple prefix" {
    try std.testing.expect(matchPattern("/admin", "/admin/page") != null);
    try std.testing.expect(matchPattern("/admin", "/admin") != null);
    try std.testing.expect(matchPattern("/admin", "/other") == null);
    try std.testing.expect(matchPattern("/admin/page", "/admin") == null);
}

test "Robots: matchPattern - single wildcard" {
    try std.testing.expect(matchPattern("/admin/*", "/admin/") != null);
    try std.testing.expect(matchPattern("/admin/*", "/admin/page") != null);
    try std.testing.expect(matchPattern("/admin/*", "/admin/page/subpage") != null);
    try std.testing.expect(matchPattern("/admin/*", "/other/page") == null);
}

test "Robots: matchPattern - wildcard in middle" {
    try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/xyz") != null);
    try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/ghi/xyz") != null);
    try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def") == null);
    try std.testing.expect(matchPattern("/abc/*/xyz", "/other/def/xyz") == null);
}

test "Robots: matchPattern - complex wildcard case" {
    try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/def/def/xyz") != null);
    try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz") != null);
}

test "Robots: matchPattern - multiple wildcards" {
    try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/b/y/c") != null);
    try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/y/b/z/w/c") != null);
    try std.testing.expect(matchPattern("/*.php", "/index.php") != null);
    try std.testing.expect(matchPattern("/*.php", "/admin/index.php") != null);
}

test "Robots: matchPattern - end anchor" {
    try std.testing.expect(matchPattern("/*.php$", "/index.php") != null);
    try std.testing.expect(matchPattern("/*.php$", "/index.php?param=value") == null);
    try std.testing.expect(matchPattern("/admin$", "/admin") != null);
    try std.testing.expect(matchPattern("/admin$", "/admin/") == null);
    try std.testing.expect(matchPattern("/fish$", "/fish") != null);
    try std.testing.expect(matchPattern("/fish$", "/fishheads") == null);
}

test "Robots: matchPattern - wildcard with extension" {
    try std.testing.expect(matchPattern("/fish*.php", "/fish.php") != null);
    try std.testing.expect(matchPattern("/fish*.php", "/fishheads.php") != null);
    try std.testing.expect(matchPattern("/fish*.php", "/fish/salmon.php") != null);
    try std.testing.expect(matchPattern("/fish*.php", "/fish.asp") == null);
}

test "Robots: matchPattern - empty and edge cases" {
    try std.testing.expect(matchPattern("", "/anything") != null);
    try std.testing.expect(matchPattern("/", "/") != null);
    try std.testing.expect(matchPattern("*", "/anything") != null);
    try std.testing.expect(matchPattern("/*", "/anything") != null);
    try std.testing.expect(matchPattern("$", "") != null);
}

test "Robots: matchPattern - real world examples" {
    try std.testing.expect(matchPattern("/", "/anything") != null);

    try std.testing.expect(matchPattern("/admin/", "/admin/page") != null);
    try std.testing.expect(matchPattern("/admin/", "/public/page") == null);

    try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf") != null);
    try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf.bak") == null);

    try std.testing.expect(matchPattern("/*?", "/page?param=value") != null);
    try std.testing.expect(matchPattern("/*?", "/page") == null);
}
|
||||
|
||||
// isAllowed behavior tests: precedence (longest match), user-agent entry
// selection (specific vs wildcard, case-insensitivity, grouped agents),
// and degenerate inputs (empty rules, empty Disallow, disallow-all).
test "Robots: isAllowed - basic allow/disallow" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "MyBot",
        \\User-agent: MyBot
        \\Disallow: /admin/
        \\Allow: /public/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/") == true);
    try std.testing.expect(robots.isAllowed("/public/page") == true);
    try std.testing.expect(robots.isAllowed("/admin/secret") == false);
    try std.testing.expect(robots.isAllowed("/other/page") == true);
}

test "Robots: isAllowed - longest match wins" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "TestBot",
        \\User-agent: TestBot
        \\Disallow: /admin/
        \\Allow: /admin/public/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/admin/secret") == false);
    try std.testing.expect(robots.isAllowed("/admin/public/page") == true);
    try std.testing.expect(robots.isAllowed("/admin/public/") == true);
}

test "Robots: isAllowed - specific user-agent vs wildcard" {
    const allocator = std.testing.allocator;

    var robots1 = try Robots.fromBytes(allocator, "Googlebot",
        \\User-agent: Googlebot
        \\Disallow: /private/
        \\
        \\User-agent: *
        \\Disallow: /admin/
        \\
    );
    defer robots1.deinit(allocator);

    try std.testing.expect(robots1.isAllowed("/private/page") == false);
    try std.testing.expect(robots1.isAllowed("/admin/page") == true);

    // Test with other bot (should use wildcard)
    var robots2 = try Robots.fromBytes(allocator, "OtherBot",
        \\User-agent: Googlebot
        \\Disallow: /private/
        \\
        \\User-agent: *
        \\Disallow: /admin/
        \\
    );
    defer robots2.deinit(allocator);

    try std.testing.expect(robots2.isAllowed("/private/page") == true);
    try std.testing.expect(robots2.isAllowed("/admin/page") == false);
}

test "Robots: isAllowed - case insensitive user-agent" {
    const allocator = std.testing.allocator;

    var robots1 = try Robots.fromBytes(allocator, "googlebot",
        \\User-agent: GoogleBot
        \\Disallow: /private/
        \\
    );
    defer robots1.deinit(allocator);
    try std.testing.expect(robots1.isAllowed("/private/") == false);

    var robots2 = try Robots.fromBytes(allocator, "GOOGLEBOT",
        \\User-agent: GoogleBot
        \\Disallow: /private/
        \\
    );
    defer robots2.deinit(allocator);
    try std.testing.expect(robots2.isAllowed("/private/") == false);

    var robots3 = try Robots.fromBytes(allocator, "GoOgLeBoT",
        \\User-agent: GoogleBot
        \\Disallow: /private/
        \\
    );
    defer robots3.deinit(allocator);
    try std.testing.expect(robots3.isAllowed("/private/") == false);
}

test "Robots: isAllowed - merged rules for same agent" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Googlebot",
        \\User-agent: Googlebot
        \\Disallow: /admin/
        \\
        \\User-agent: Googlebot
        \\Disallow: /private/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/admin/page") == false);
    try std.testing.expect(robots.isAllowed("/private/page") == false);
    try std.testing.expect(robots.isAllowed("/public/page") == true);
}

test "Robots: isAllowed - wildcards in patterns" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Bot",
        \\User-agent: Bot
        \\Disallow: /*.php$
        \\Allow: /index.php$
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/page.php") == false);
    try std.testing.expect(robots.isAllowed("/index.php") == true);
    try std.testing.expect(robots.isAllowed("/page.php?param=1") == true);
    try std.testing.expect(robots.isAllowed("/page.html") == true);
}

test "Robots: isAllowed - empty disallow allows everything" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Bot",
        \\User-agent: Bot
        \\Disallow:
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/anything") == true);
    try std.testing.expect(robots.isAllowed("/") == true);
}

test "Robots: isAllowed - no rules" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Bot", "");
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/anything") == true);
}

test "Robots: isAllowed - disallow all" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Bot",
        \\User-agent: Bot
        \\Disallow: /
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/") == false);
    try std.testing.expect(robots.isAllowed("/anything") == false);
    try std.testing.expect(robots.isAllowed("/admin/page") == false);
}

test "Robots: isAllowed - multiple user-agents in same entry" {
    const allocator = std.testing.allocator;

    var robots1 = try Robots.fromBytes(allocator, "Googlebot",
        \\User-agent: Googlebot
        \\User-agent: Bingbot
        \\Disallow: /private/
        \\
    );
    defer robots1.deinit(allocator);
    try std.testing.expect(robots1.isAllowed("/private/") == false);

    var robots2 = try Robots.fromBytes(allocator, "Bingbot",
        \\User-agent: Googlebot
        \\User-agent: Bingbot
        \\Disallow: /private/
        \\
    );
    defer robots2.deinit(allocator);
    try std.testing.expect(robots2.isAllowed("/private/") == false);

    var robots3 = try Robots.fromBytes(allocator, "OtherBot",
        \\User-agent: Googlebot
        \\User-agent: Bingbot
        \\Disallow: /private/
        \\
    );
    defer robots3.deinit(allocator);
    try std.testing.expect(robots3.isAllowed("/private/") == true);
}

test "Robots: isAllowed - wildcard fallback" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "UnknownBot",
        \\User-agent: *
        \\Disallow: /admin/
        \\Allow: /admin/public/
        \\
        \\User-agent: Googlebot
        \\Disallow: /private/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/admin/secret") == false);
    try std.testing.expect(robots.isAllowed("/admin/public/page") == true);
    try std.testing.expect(robots.isAllowed("/private/") == true);
}

test "Robots: isAllowed - complex real-world example" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "MyBot",
        \\User-agent: *
        \\Disallow: /cgi-bin/
        \\Disallow: /tmp/
        \\Disallow: /private/
        \\
        \\User-agent: MyBot
        \\Disallow: /admin/
        \\Disallow: /*.pdf$
        \\Allow: /public/*.pdf$
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/") == true);
    try std.testing.expect(robots.isAllowed("/admin/dashboard") == false);
    try std.testing.expect(robots.isAllowed("/docs/guide.pdf") == false);
    try std.testing.expect(robots.isAllowed("/public/manual.pdf") == true);
    try std.testing.expect(robots.isAllowed("/page.html") == true);
    // The specific MyBot entry fully replaces the wildcard entry, so the
    // wildcard's /cgi-bin/ disallow does not apply here.
    try std.testing.expect(robots.isAllowed("/cgi-bin/script.sh") == true);
}

test "Robots: isAllowed - order doesn't matter for same length" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Bot",
        \\User-agent: Bot
        \\ # WOW!!
        \\Allow: /page
        \\Disallow: /page
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/page") == false);
}

test "Robots: isAllowed - empty file uses wildcard defaults" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "MyBot",
        \\User-agent: * # ABCDEF!!!
        \\Disallow: /admin/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/admin/") == false);
    try std.testing.expect(robots.isAllowed("/public/") == true);
}
test "Robots: isAllowed - wildcard entry with multiple user-agents including specific" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Googlebot",
        \\User-agent: *
        \\User-agent: Googlebot
        \\Disallow: /shared/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/shared/") == false);
    try std.testing.expect(robots.isAllowed("/other/") == true);

    var robots2 = try Robots.fromBytes(allocator, "Bingbot",
        \\User-agent: *
        \\User-agent: Googlebot
        \\Disallow: /shared/
        \\
    );
    defer robots2.deinit(allocator);

    try std.testing.expect(robots2.isAllowed("/shared/") == false);
}

test "Robots: isAllowed - specific agent appears after wildcard in entry" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "MyBot",
        \\User-agent: *
        \\User-agent: MyBot
        \\User-agent: Bingbot
        \\Disallow: /admin/
        \\Allow: /admin/public/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/admin/secret") == false);
    try std.testing.expect(robots.isAllowed("/admin/public/page") == true);
}

test "Robots: isAllowed - wildcard should not override specific entry" {
    const allocator = std.testing.allocator;

    var robots = try Robots.fromBytes(allocator, "Googlebot",
        \\User-agent: Googlebot
        \\Disallow: /private/
        \\
        \\User-agent: *
        \\User-agent: Googlebot
        \\Disallow: /admin/
        \\
    );
    defer robots.deinit(allocator);

    try std.testing.expect(robots.isAllowed("/private/") == false);
    try std.testing.expect(robots.isAllowed("/admin/") == false);
}
|
||||
|
||||
// Integration-style test against a trimmed copy of google.com/robots.txt,
// exercising several agents against the same file.
test "Robots: isAllowed - Google's real robots.txt" {
    const allocator = std.testing.allocator;

    // Simplified version of google.com/robots.txt
    const google_robots =
        \\User-agent: *
        \\User-agent: Yandex
        \\Disallow: /search
        \\Allow: /search/about
        \\Allow: /search/howsearchworks
        \\Disallow: /imgres
        \\Disallow: /m?
        \\Disallow: /m/
        \\Allow: /m/finance
        \\Disallow: /maps/
        \\Allow: /maps/$
        \\Allow: /maps/@
        \\Allow: /maps/dir/
        \\Disallow: /shopping?
        \\Allow: /shopping?udm=28$
        \\
        \\User-agent: AdsBot-Google
        \\Disallow: /maps/api/js/
        \\Allow: /maps/api/js
        \\Disallow: /maps/api/staticmap
        \\
        \\User-agent: Yandex
        \\Disallow: /about/careers/applications/jobs/results
        \\
        \\User-agent: facebookexternalhit
        \\User-agent: Twitterbot
        \\Allow: /imgres
        \\Allow: /search
        \\Disallow: /groups
        \\Disallow: /m/
        \\
    ;

    // "Googlebot" matches no specific entry, so it falls under "*".
    var regular_bot = try Robots.fromBytes(allocator, "Googlebot", google_robots);
    defer regular_bot.deinit(allocator);

    try std.testing.expect(regular_bot.isAllowed("/") == true);
    try std.testing.expect(regular_bot.isAllowed("/search") == false);
    try std.testing.expect(regular_bot.isAllowed("/search/about") == true);
    try std.testing.expect(regular_bot.isAllowed("/search/howsearchworks") == true);
    try std.testing.expect(regular_bot.isAllowed("/imgres") == false);
    try std.testing.expect(regular_bot.isAllowed("/m/finance") == true);
    try std.testing.expect(regular_bot.isAllowed("/m/other") == false);
    try std.testing.expect(regular_bot.isAllowed("/maps/") == true);
    try std.testing.expect(regular_bot.isAllowed("/maps/@") == true);
    try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28") == true);
    try std.testing.expect(regular_bot.isAllowed("/shopping?udm=28&extra") == false);

    var adsbot = try Robots.fromBytes(allocator, "AdsBot-Google", google_robots);
    defer adsbot.deinit(allocator);

    try std.testing.expect(adsbot.isAllowed("/maps/api/js") == true);
    try std.testing.expect(adsbot.isAllowed("/maps/api/js/") == false);
    try std.testing.expect(adsbot.isAllowed("/maps/api/staticmap") == false);

    var twitterbot = try Robots.fromBytes(allocator, "Twitterbot", google_robots);
    defer twitterbot.deinit(allocator);

    try std.testing.expect(twitterbot.isAllowed("/imgres") == true);
    try std.testing.expect(twitterbot.isAllowed("/search") == true);
    try std.testing.expect(twitterbot.isAllowed("/groups") == false);
    try std.testing.expect(twitterbot.isAllowed("/m/") == false);
}

test "Robots: user-agent after rules starts new entry" {
    const allocator = std.testing.allocator;

    const file =
        \\User-agent: Bot1
        \\User-agent: Bot2
        \\Disallow: /admin/
        \\Allow: /public/
        \\User-agent: Bot3
        \\Disallow: /private/
        \\
    ;

    var robots1 = try Robots.fromBytes(allocator, "Bot1", file);
    defer robots1.deinit(allocator);
    try std.testing.expect(robots1.isAllowed("/admin/") == false);
    try std.testing.expect(robots1.isAllowed("/public/") == true);
    try std.testing.expect(robots1.isAllowed("/private/") == true);

    var robots2 = try Robots.fromBytes(allocator, "Bot2", file);
    defer robots2.deinit(allocator);
    try std.testing.expect(robots2.isAllowed("/admin/") == false);
    try std.testing.expect(robots2.isAllowed("/public/") == true);
    try std.testing.expect(robots2.isAllowed("/private/") == true);

    var robots3 = try Robots.fromBytes(allocator, "Bot3", file);
    defer robots3.deinit(allocator);
    try std.testing.expect(robots3.isAllowed("/admin/") == true);
    try std.testing.expect(robots3.isAllowed("/public/") == true);
    try std.testing.expect(robots3.isAllowed("/private/") == false);
}
|
||||
|
||||
test "Robots: blank lines don't end entries" {
|
||||
const allocator = std.testing.allocator;
|
||||
|
||||
const file =
|
||||
\\User-agent: MyBot
|
||||
\\Disallow: /admin/
|
||||
\\
|
||||
\\
|
||||
\\Allow: /public/
|
||||
\\
|
||||
;
|
||||
|
||||
var robots = try Robots.fromBytes(allocator, "MyBot", file);
|
||||
defer robots.deinit(allocator);
|
||||
|
||||
try std.testing.expect(robots.isAllowed("/admin/") == false);
|
||||
try std.testing.expect(robots.isAllowed("/public/") == true);
|
||||
}
|
||||
@@ -502,6 +502,16 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons
|
||||
return buf.items[0 .. buf.items.len - 1 :0];
|
||||
}
|
||||
|
||||
pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 {
|
||||
const origin = try getOrigin(arena, url) orelse return error.NoOrigin;
|
||||
return try std.fmt.allocPrintSentinel(
|
||||
arena,
|
||||
"{s}/robots.txt",
|
||||
.{origin},
|
||||
0,
|
||||
);
|
||||
}
|
||||
|
||||
const testing = @import("../testing.zig");
|
||||
test "URL: isCompleteHTTPUrl" {
|
||||
try testing.expectEqual(true, isCompleteHTTPUrl("http://example.com/about"));
|
||||
@@ -778,3 +788,31 @@ test "URL: concatQueryString" {
|
||||
try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url);
|
||||
}
|
||||
}
|
||||
|
||||
test "URL: getRobotsUrl" {
|
||||
defer testing.reset();
|
||||
const arena = testing.arena_allocator;
|
||||
|
||||
{
|
||||
const url = try getRobotsUrl(arena, "https://www.lightpanda.io");
|
||||
try testing.expectEqual("https://www.lightpanda.io/robots.txt", url);
|
||||
}
|
||||
|
||||
{
|
||||
const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path");
|
||||
try testing.expectString("https://www.lightpanda.io/robots.txt", url);
|
||||
}
|
||||
|
||||
{
|
||||
const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page");
|
||||
try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url);
|
||||
}
|
||||
{
|
||||
const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment");
|
||||
try testing.expectString("http://example.com/robots.txt", url);
|
||||
}
|
||||
{
|
||||
const url = try getRobotsUrl(arena, "https://user:pass@example.com/page");
|
||||
try testing.expectString("https://example.com/robots.txt", url);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,8 @@ const Config = @import("../Config.zig");
|
||||
const URL = @import("../browser/URL.zig");
|
||||
const Notification = @import("../Notification.zig");
|
||||
const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar;
|
||||
const Robots = @import("../browser/Robots.zig");
|
||||
const RobotStore = Robots.RobotStore;
|
||||
|
||||
const c = Http.c;
|
||||
const posix = std.posix;
|
||||
@@ -85,6 +87,12 @@ queue: TransferQueue,
|
||||
// The main app allocator
|
||||
allocator: Allocator,
|
||||
|
||||
// Reference to the App-owned Robot Store.
|
||||
robot_store: *RobotStore,
|
||||
// Queue of requests that depend on a robots.txt.
|
||||
// Allows us to fetch the robots.txt just once.
|
||||
pending_robots_queue: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty,
|
||||
|
||||
// Once we have a handle/easy to process a request with, we create a Transfer
|
||||
// which contains the Request as well as any state we need to process the
|
||||
// request. These wil come and go with each request.
|
||||
@@ -123,7 +131,7 @@ pub const CDPClient = struct {
|
||||
|
||||
const TransferQueue = std.DoublyLinkedList;
|
||||
|
||||
pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config) !*Client {
|
||||
pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, robot_store: *RobotStore, config: *const Config) !*Client {
|
||||
var transfer_pool = std.heap.MemoryPool(Transfer).init(allocator);
|
||||
errdefer transfer_pool.deinit();
|
||||
|
||||
@@ -147,6 +155,7 @@ pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, config: *const Config)
|
||||
.multi = multi,
|
||||
.handles = handles,
|
||||
.allocator = allocator,
|
||||
.robot_store = robot_store,
|
||||
.http_proxy = http_proxy,
|
||||
.use_proxy = http_proxy != null,
|
||||
.config = config,
|
||||
@@ -163,6 +172,13 @@ pub fn deinit(self: *Client) void {
|
||||
_ = c.curl_multi_cleanup(self.multi);
|
||||
|
||||
self.transfer_pool.deinit();
|
||||
|
||||
var robots_iter = self.pending_robots_queue.iterator();
|
||||
while (robots_iter.next()) |entry| {
|
||||
entry.value_ptr.deinit(self.allocator);
|
||||
}
|
||||
self.pending_robots_queue.deinit(self.allocator);
|
||||
|
||||
self.allocator.destroy(self);
|
||||
}
|
||||
|
||||
@@ -217,12 +233,46 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus {
|
||||
}
|
||||
|
||||
pub fn request(self: *Client, req: Request) !void {
|
||||
if (self.config.obeyRobots()) {
|
||||
const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
|
||||
errdefer self.allocator.free(robots_url);
|
||||
|
||||
// If we have this robots cached, we can take a fast path.
|
||||
if (self.robot_store.get(robots_url)) |robot_entry| {
|
||||
defer self.allocator.free(robots_url);
|
||||
|
||||
switch (robot_entry) {
|
||||
// If we have a found robots entry, we check it.
|
||||
.present => |robots| {
|
||||
const path = URL.getPathname(req.url);
|
||||
if (!robots.isAllowed(path)) {
|
||||
req.error_callback(req.ctx, error.RobotsBlocked);
|
||||
return;
|
||||
}
|
||||
},
|
||||
// Otherwise, we assume we won't find it again.
|
||||
.absent => {},
|
||||
}
|
||||
|
||||
return self.processRequest(req);
|
||||
}
|
||||
|
||||
return self.fetchRobotsThenProcessRequest(robots_url, req);
|
||||
}
|
||||
|
||||
return self.processRequest(req);
|
||||
}
|
||||
|
||||
fn processRequest(self: *Client, req: Request) !void {
|
||||
const transfer = try self.makeTransfer(req);
|
||||
|
||||
transfer.req.notification.dispatch(.http_request_start, &.{ .transfer = transfer });
|
||||
|
||||
var wait_for_interception = false;
|
||||
transfer.req.notification.dispatch(.http_request_intercept, &.{ .transfer = transfer, .wait_for_interception = &wait_for_interception });
|
||||
transfer.req.notification.dispatch(.http_request_intercept, &.{
|
||||
.transfer = transfer,
|
||||
.wait_for_interception = &wait_for_interception,
|
||||
});
|
||||
if (wait_for_interception == false) {
|
||||
// request not intercepted, process it normally
|
||||
return self.process(transfer);
|
||||
@@ -246,6 +296,154 @@ pub fn request(self: *Client, req: Request) !void {
|
||||
}
|
||||
}
|
||||
|
||||
const RobotsRequestContext = struct {
|
||||
client: *Client,
|
||||
req: Request,
|
||||
robots_url: [:0]const u8,
|
||||
buffer: std.ArrayList(u8),
|
||||
status: u16 = 0,
|
||||
|
||||
pub fn deinit(self: *RobotsRequestContext) void {
|
||||
self.client.allocator.free(self.robots_url);
|
||||
self.buffer.deinit(self.client.allocator);
|
||||
self.client.allocator.destroy(self);
|
||||
}
|
||||
};
|
||||
|
||||
fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
|
||||
const entry = try self.pending_robots_queue.getOrPut(self.allocator, robots_url);
|
||||
|
||||
if (!entry.found_existing) {
|
||||
errdefer self.allocator.free(robots_url);
|
||||
|
||||
// If we aren't already fetching this robots,
|
||||
// we want to create a new queue for it and add this request into it.
|
||||
entry.value_ptr.* = .empty;
|
||||
|
||||
const ctx = try self.allocator.create(RobotsRequestContext);
|
||||
errdefer self.allocator.destroy(ctx);
|
||||
ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
|
||||
const headers = try self.newHeaders();
|
||||
|
||||
log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
|
||||
try self.processRequest(.{
|
||||
.ctx = ctx,
|
||||
.url = robots_url,
|
||||
.method = .GET,
|
||||
.headers = headers,
|
||||
.blocking = false,
|
||||
.cookie_jar = req.cookie_jar,
|
||||
.notification = req.notification,
|
||||
.resource_type = .fetch,
|
||||
.header_callback = robotsHeaderCallback,
|
||||
.data_callback = robotsDataCallback,
|
||||
.done_callback = robotsDoneCallback,
|
||||
.error_callback = robotsErrorCallback,
|
||||
.shutdown_callback = robotsShutdownCallback,
|
||||
});
|
||||
} else {
|
||||
// Not using our own robots URL, only using the one from the first request.
|
||||
self.allocator.free(robots_url);
|
||||
}
|
||||
|
||||
try entry.value_ptr.append(self.allocator, req);
|
||||
}
|
||||
|
||||
fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
|
||||
const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
|
||||
|
||||
if (transfer.response_header) |hdr| {
|
||||
log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = ctx.robots_url });
|
||||
ctx.status = hdr.status;
|
||||
}
|
||||
|
||||
if (transfer.getContentLength()) |cl| {
|
||||
try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void {
|
||||
const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
|
||||
try ctx.buffer.appendSlice(ctx.client.allocator, data);
|
||||
}
|
||||
|
||||
fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
|
||||
const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
|
||||
defer ctx.deinit();
|
||||
|
||||
var allowed = true;
|
||||
|
||||
if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
|
||||
const robots = try ctx.client.robot_store.robotsFromBytes(
|
||||
ctx.client.config.http_headers.user_agent,
|
||||
ctx.buffer.items,
|
||||
);
|
||||
|
||||
try ctx.client.robot_store.put(ctx.robots_url, robots);
|
||||
|
||||
const path = URL.getPathname(ctx.req.url);
|
||||
allowed = robots.isAllowed(path);
|
||||
} else if (ctx.status == 404) {
|
||||
log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
|
||||
try ctx.client.robot_store.putAbsent(ctx.robots_url);
|
||||
}
|
||||
|
||||
var queued = ctx.client.pending_robots_queue.fetchRemove(
|
||||
ctx.robots_url,
|
||||
) orelse @panic("Client.robotsDoneCallbacke empty queue");
|
||||
defer queued.value.deinit(ctx.client.allocator);
|
||||
|
||||
for (queued.value.items) |queued_req| {
|
||||
if (!allowed) {
|
||||
log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
|
||||
queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
|
||||
} else {
|
||||
ctx.client.processRequest(queued_req) catch |e| {
|
||||
queued_req.error_callback(queued_req.ctx, e);
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
|
||||
const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
|
||||
defer ctx.deinit();
|
||||
|
||||
log.warn(.http, "robots fetch failed", .{ .err = err });
|
||||
|
||||
var queued = ctx.client.pending_robots_queue.fetchRemove(
|
||||
ctx.robots_url,
|
||||
) orelse @panic("Client.robotsErrorCallback empty queue");
|
||||
defer queued.value.deinit(ctx.client.allocator);
|
||||
|
||||
// On error, allow all queued requests to proceed
|
||||
for (queued.value.items) |queued_req| {
|
||||
ctx.client.processRequest(queued_req) catch |e| {
|
||||
queued_req.error_callback(queued_req.ctx, e);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
fn robotsShutdownCallback(ctx_ptr: *anyopaque) void {
|
||||
const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
|
||||
defer ctx.deinit();
|
||||
|
||||
log.debug(.http, "robots fetch shutdown", .{});
|
||||
|
||||
var queued = ctx.client.pending_robots_queue.fetchRemove(
|
||||
ctx.robots_url,
|
||||
) orelse @panic("Client.robotsErrorCallback empty queue");
|
||||
defer queued.value.deinit(ctx.client.allocator);
|
||||
|
||||
for (queued.value.items) |queued_req| {
|
||||
if (queued_req.shutdown_callback) |shutdown_cb| {
|
||||
shutdown_cb(queued_req.ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
|
||||
// The request was intercepted and is blocking. This is messy, but our
|
||||
// callers, the ScriptManager -> Page, don't have a great way to stop the
|
||||
|
||||
@@ -30,6 +30,7 @@ pub const Transfer = Client.Transfer;
|
||||
|
||||
const log = @import("../log.zig");
|
||||
const errors = @import("errors.zig");
|
||||
const RobotStore = @import("../browser/Robots.zig").RobotStore;
|
||||
|
||||
const Allocator = std.mem.Allocator;
|
||||
const ArenaAllocator = std.heap.ArenaAllocator;
|
||||
@@ -46,7 +47,7 @@ client: *Client,
|
||||
ca_blob: ?c.curl_blob,
|
||||
arena: ArenaAllocator,
|
||||
|
||||
pub fn init(allocator: Allocator, config: *const Config) !Http {
|
||||
pub fn init(allocator: Allocator, robot_store: *RobotStore, config: *const Config) !Http {
|
||||
try errorCheck(c.curl_global_init(c.CURL_GLOBAL_SSL));
|
||||
errdefer c.curl_global_cleanup();
|
||||
|
||||
@@ -62,7 +63,7 @@ pub fn init(allocator: Allocator, config: *const Config) !Http {
|
||||
ca_blob = try loadCerts(allocator, arena.allocator());
|
||||
}
|
||||
|
||||
var client = try Client.init(allocator, ca_blob, config);
|
||||
var client = try Client.init(allocator, ca_blob, robot_store, config);
|
||||
errdefer client.deinit();
|
||||
|
||||
return .{
|
||||
|
||||
Reference in New Issue
Block a user