Mirror of https://github.com/lightpanda-io/browser.git (synced 2026-02-04 06:23:45 +00:00)

Commit: robots in the actual http client

@@ -24,6 +24,7 @@ const log = @import("log.zig");
 const Snapshot = @import("browser/js/Snapshot.zig");
 const Platform = @import("browser/js/Platform.zig");
 const Telemetry = @import("telemetry/telemetry.zig").Telemetry;
+const RobotStore = @import("browser/Robots.zig").RobotStore;
 
 pub const Http = @import("http/Http.zig");
 pub const ArenaPool = @import("ArenaPool.zig");
@@ -40,6 +41,7 @@ snapshot: Snapshot,
 telemetry: Telemetry,
 allocator: Allocator,
 arena_pool: ArenaPool,
+robots: RobotStore,
 app_dir_path: ?[]const u8,
 notification: *Notification,
 shutdown: bool = false,
@@ -53,6 +55,7 @@ pub const RunMode = enum {
 
 pub const Config = struct {
     run_mode: RunMode,
+    obey_robots: bool = false,
     tls_verify_host: bool = true,
     http_proxy: ?[:0]const u8 = null,
     proxy_bearer_token: ?[:0]const u8 = null,
@@ -74,6 +77,7 @@ pub fn init(allocator: Allocator, config: Config) !*App {
     errdefer app.notification.deinit();
 
     app.http = try Http.init(allocator, .{
+        .obey_robots = config.obey_robots,
         .max_host_open = config.http_max_host_open orelse 4,
         .max_concurrent = config.http_max_concurrent orelse 10,
         .timeout_ms = config.http_timeout_ms orelse 5000,
@@ -91,6 +95,8 @@ pub fn init(allocator: Allocator, config: Config) !*App {
     app.snapshot = try Snapshot.load();
     errdefer app.snapshot.deinit();
 
+    app.robots = RobotStore.init(allocator);
+
     app.app_dir_path = getAndMakeAppDir(allocator);
 
     app.telemetry = try Telemetry.init(app, config.run_mode);
@@ -115,6 +121,7 @@ pub fn deinit(self: *App) void {
         self.app_dir_path = null;
     }
     self.telemetry.deinit();
+    self.robots.deinit();
    self.notification.deinit();
    self.http.deinit();
    self.snapshot.deinit();
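
Wiring summary: App now owns the process-wide RobotStore (created in init, torn down in deinit), and the new Config.obey_robots flag, off by default, flows into Http.Opts (see the last hunk of this commit) so that enforcement lives inside the HTTP client. A minimal opt-in sketch, assuming an otherwise-default Config; the .serve run-mode name is illustrative, not taken from this diff:

```zig
// Sketch only: every Config field except the two shown keeps its default.
const app = try App.init(allocator, .{
    .run_mode = .serve, // hypothetical RunMode variant
    .obey_robots = true, // new in this commit; defaults to false
});
defer app.deinit(); // RobotStore.deinit frees every cached robots.txt entry
```
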
@@ -544,6 +544,7 @@ pub fn navigate(self: *Page, request_url: [:0]const u8, opts: NavigateOpts) !void {
         .headers = headers,
         .body = opts.body,
         .cookie_jar = &self._session.cookie_jar,
+        .robots = &self._session.browser.app.robots,
         .resource_type = .document,
         .header_callback = pageHeaderDoneCallback,
         .data_callback = pageDataCallback,
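
Call-site note: this one-line change is the whole integration surface for page navigation, and the same `.robots` wiring repeats below for parser-inserted scripts, module preloads, async imports, fetch(), and XMLHttpRequest. Every outgoing Request points at the single RobotStore owned by App, so robots.txt policy is enforced once, inside the HTTP client, instead of per caller.
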
@@ -33,13 +33,80 @@ pub const Key = enum {
 pub const Robots = @This();
 pub const empty: Robots = .{ .rules = &.{} };
 
+pub const RobotStore = struct {
+    const RobotsEntry = union(enum) {
+        present: Robots,
+        absent,
+    };
+
+    pub const RobotsMap = std.HashMapUnmanaged([]const u8, RobotsEntry, struct {
+        const Context = @This();
+
+        pub fn hash(_: Context, value: []const u8) u32 {
+            var hasher = std.hash.Wyhash.init(value.len);
+            for (value) |c| {
+                std.hash.autoHash(&hasher, std.ascii.toLower(c));
+            }
+            return @truncate(hasher.final());
+        }
+
+        pub fn eql(_: Context, a: []const u8, b: []const u8) bool {
+            if (a.len != b.len) return false;
+            return std.ascii.eqlIgnoreCase(a, b);
+        }
+    }, 80);
+
+    allocator: std.mem.Allocator,
+    map: RobotsMap,
+
+    pub fn init(allocator: std.mem.Allocator) RobotStore {
+        return .{ .allocator = allocator, .map = .empty };
+    }
+
+    pub fn deinit(self: *RobotStore) void {
+        var iter = self.map.iterator();
+
+        while (iter.next()) |entry| {
+            self.allocator.free(entry.key_ptr.*);
+
+            switch (entry.value_ptr.*) {
+                .present => |*robots| robots.deinit(self.allocator),
+                .absent => {},
+            }
+        }
+
+        self.map.deinit(self.allocator);
+    }
+
+    pub fn get(self: *RobotStore, url: []const u8) ?RobotsEntry {
+        return self.map.get(url);
+    }
+
+    pub fn robotsFromBytes(self: *RobotStore, user_agent: []const u8, bytes: []const u8) !Robots {
+        return try Robots.fromBytes(self.allocator, user_agent, bytes);
+    }
+
+    pub fn put(self: *RobotStore, url: []const u8, robots: Robots) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .{ .present = robots });
+    }
+
+    pub fn putAbsent(self: *RobotStore, url: []const u8) !void {
+        const duped = try self.allocator.dupe(u8, url);
+        try self.map.put(self.allocator, duped, .absent);
+    }
+};
+
 rules: []const Rule,
 
-const State = enum {
+const State = struct {
+    entry: enum {
         not_in_entry,
         in_other_entry,
         in_our_entry,
         in_wildcard_entry,
+    },
+    has_rules: bool = false,
 };
 
 fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
@@ -62,7 +129,7 @@ fn parseRulesWithUserAgent(
     var wildcard_rules: std.ArrayList(Rule) = .empty;
     defer wildcard_rules.deinit(allocator);
 
-    var state: State = .not_in_entry;
+    var state: State = .{ .entry = .not_in_entry, .has_rules = false };
 
     var iter = std.mem.splitScalar(u8, bytes, '\n');
     while (iter.next()) |line| {
@@ -78,7 +145,6 @@ fn parseRulesWithUserAgent(
             trimmed;
 
         if (true_line.len == 0) {
-            state = .not_in_entry;
             continue;
         }
 
@@ -94,29 +160,38 @@ fn parseRulesWithUserAgent(
         const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace);
 
         switch (key) {
-            .@"user-agent" => switch (state) {
+            .@"user-agent" => {
+                if (state.has_rules) {
+                    state = .{ .entry = .not_in_entry, .has_rules = false };
+                }
+
+                switch (state.entry) {
                     .in_other_entry => {
                         if (std.ascii.eqlIgnoreCase(user_agent, value)) {
-                            state = .in_our_entry;
+                            state.entry = .in_our_entry;
                         }
                     },
                     .in_our_entry => {},
                     .in_wildcard_entry => {
                         if (std.ascii.eqlIgnoreCase(user_agent, value)) {
-                            state = .in_our_entry;
+                            state.entry = .in_our_entry;
                         }
                     },
                     .not_in_entry => {
                         if (std.ascii.eqlIgnoreCase(user_agent, value)) {
-                            state = .in_our_entry;
+                            state.entry = .in_our_entry;
                         } else if (std.mem.eql(u8, "*", value)) {
-                            state = .in_wildcard_entry;
+                            state.entry = .in_wildcard_entry;
                         } else {
-                            state = .in_other_entry;
+                            state.entry = .in_other_entry;
                         }
                     },
+                }
             },
-            .allow => switch (state) {
+            .allow => {
+                defer state.has_rules = true;
+
+                switch (state.entry) {
                     .in_our_entry => {
                         const duped_value = try allocator.dupe(u8, value);
                         errdefer allocator.free(duped_value);
@@ -129,8 +204,12 @@ fn parseRulesWithUserAgent(
                         try wildcard_rules.append(allocator, .{ .allow = duped_value });
                     },
                     .not_in_entry => return error.UnexpectedRule,
+                }
             },
-            .disallow => switch (state) {
+            .disallow => {
+                defer state.has_rules = true;
+
+                switch (state.entry) {
                     .in_our_entry => {
                         const duped_value = try allocator.dupe(u8, value);
                         errdefer allocator.free(duped_value);
@@ -143,6 +222,7 @@ fn parseRulesWithUserAgent(
                         try wildcard_rules.append(allocator, .{ .disallow = duped_value });
                     },
                     .not_in_entry => return error.UnexpectedRule,
+                }
             },
         }
     }
@@ -737,3 +817,54 @@ test "Robots: isAllowed - Google's real robots.txt" {
     try std.testing.expect(twitterbot.isAllowed("/groups") == false);
     try std.testing.expect(twitterbot.isAllowed("/m/") == false);
 }
+
+test "Robots: user-agent after rules starts new entry" {
+    const allocator = std.testing.allocator;
+
+    const file =
+        \\User-agent: Bot1
+        \\User-agent: Bot2
+        \\Disallow: /admin/
+        \\Allow: /public/
+        \\User-agent: Bot3
+        \\Disallow: /private/
+        \\
+    ;
+
+    var robots1 = try Robots.fromBytes(allocator, "Bot1", file);
+    defer robots1.deinit(allocator);
+    try std.testing.expect(robots1.isAllowed("/admin/") == false);
+    try std.testing.expect(robots1.isAllowed("/public/") == true);
+    try std.testing.expect(robots1.isAllowed("/private/") == true);
+
+    var robots2 = try Robots.fromBytes(allocator, "Bot2", file);
+    defer robots2.deinit(allocator);
+    try std.testing.expect(robots2.isAllowed("/admin/") == false);
+    try std.testing.expect(robots2.isAllowed("/public/") == true);
+    try std.testing.expect(robots2.isAllowed("/private/") == true);
+
+    var robots3 = try Robots.fromBytes(allocator, "Bot3", file);
+    defer robots3.deinit(allocator);
+    try std.testing.expect(robots3.isAllowed("/admin/") == true);
+    try std.testing.expect(robots3.isAllowed("/public/") == true);
+    try std.testing.expect(robots3.isAllowed("/private/") == false);
+}
+
+test "Robots: blank lines don't end entries" {
+    const allocator = std.testing.allocator;
+
+    const file =
+        \\User-agent: MyBot
+        \\Disallow: /admin/
+        \\
+        \\
+        \\Allow: /public/
+        \\
+    ;
+
+    var robots = try Robots.fromBytes(allocator, "MyBot", file);
+    defer robots.deinit(allocator);
+
+    try std.testing.expect(robots.isAllowed("/admin/") == false);
+    try std.testing.expect(robots.isAllowed("/public/") == true);
+}
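
Two behavioural fixes ride along with the new store. State is now a struct of { entry, has_rules } instead of a bare enum: blank lines no longer reset the parser (they are insignificant under RFC 9309), and a User-agent line only starts a new group once rules have been seen, so stacked User-agent lines share one rule group, which is exactly what the two new tests pin down. For reviewers, a sketch of how the cache is meant to be driven (it would compile next to RobotStore, e.g. as a test in this file; the user agent and URL are illustrative):

```zig
test "RobotStore: usage sketch" {
    const allocator = std.testing.allocator;

    var store = RobotStore.init(allocator);
    defer store.deinit(); // frees duped keys and parsed rules

    // Pretend this body just arrived from a robots.txt fetch.
    const body = "User-agent: *\nDisallow: /private/\n";
    const robots = try store.robotsFromBytes("LightpandaBot", body);
    try store.put("https://example.com/robots.txt", robots); // store now owns `robots`

    // Later lookups hit the cache; keys hash and compare case-insensitively.
    const entry = store.get("HTTPS://EXAMPLE.COM/robots.txt") orelse unreachable;
    switch (entry) {
        .present => |r| {
            try std.testing.expect(r.isAllowed("/public/"));
            try std.testing.expect(!r.isAllowed("/private/"));
        },
        .absent => {}, // a cached 404: everything is allowed
    }
}
```
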
@@ -265,6 +265,7 @@ pub fn addFromElement(self: *ScriptManager, comptime from_parser: bool, script_e
         .headers = try self.getHeaders(url),
         .blocking = is_blocking,
         .cookie_jar = &page._session.cookie_jar,
+        .robots = &page._session.browser.app.robots,
         .resource_type = .script,
         .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null,
         .header_callback = Script.headerCallback,
@@ -379,6 +380,7 @@ pub fn preloadImport(self: *ScriptManager, url: [:0]const u8, referrer: []const
         .method = .GET,
         .headers = try self.getHeaders(url),
         .cookie_jar = &self.page._session.cookie_jar,
+        .robots = &self.page._session.browser.app.robots,
         .resource_type = .script,
         .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null,
         .header_callback = Script.headerCallback,
@@ -481,6 +483,7 @@ pub fn getAsyncImport(self: *ScriptManager, url: [:0]const u8, cb: ImportAsync.C
         .ctx = script,
         .resource_type = .script,
         .cookie_jar = &self.page._session.cookie_jar,
+        .robots = &self.page._session.browser.app.robots,
         .start_callback = if (log.enabled(.http, .debug)) Script.startCallback else null,
         .header_callback = Script.headerCallback,
         .data_callback = Script.dataCallback,
@@ -502,8 +502,8 @@ pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []cons
     return buf.items[0 .. buf.items.len - 1 :0];
 }
 
-pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) !?[:0]const u8 {
-    const origin = try getOrigin(arena, url) orelse return null;
+pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 {
+    const origin = try getOrigin(arena, url) orelse return error.NoOrigin;
     return try std.fmt.allocPrintSentinel(
         arena,
         "{s}/robots.txt",
@@ -795,24 +795,24 @@ test "URL: getRobotsUrl" {
 
     {
         const url = try getRobotsUrl(arena, "https://www.lightpanda.io");
-        try testing.expectEqual("https://www.lightpanda.io/robots.txt", url.?);
+        try testing.expectEqual("https://www.lightpanda.io/robots.txt", url);
     }
 
     {
         const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path");
-        try testing.expectString("https://www.lightpanda.io/robots.txt", url.?);
+        try testing.expectString("https://www.lightpanda.io/robots.txt", url);
     }
 
     {
         const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page");
-        try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url.?);
+        try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url);
     }
     {
         const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment");
-        try testing.expectString("http://example.com/robots.txt", url.?);
+        try testing.expectString("http://example.com/robots.txt", url);
     }
     {
         const url = try getRobotsUrl(arena, "https://user:pass@example.com/page");
-        try testing.expectString("https://example.com/robots.txt", url.?);
+        try testing.expectString("https://example.com/robots.txt", url);
     }
 }
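
Signature note: getRobotsUrl no longer returns an optional; an input without a recoverable origin now yields error.NoOrigin instead of null, which is why the assertions above drop their `.?`. A hedged sketch of the resulting contract, merging the userinfo and port cases the tests cover separately (assumed to behave the same when combined):

```zig
// Userinfo is stripped, the port is kept, path/query/fragment are dropped.
const url = try getRobotsUrl(arena, "https://user:pass@example.com:8080/a/b?q=1#frag");
try testing.expectString("https://example.com:8080/robots.txt", url);
// An input whose origin getOrigin cannot determine now surfaces error.NoOrigin.
```
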
@@ -78,6 +78,7 @@ pub fn init(input: Input, options: ?InitOpts, page: *Page) !js.Promise {
         .headers = headers,
         .resource_type = .fetch,
         .cookie_jar = &page._session.cookie_jar,
+        .robots = &page._session.browser.app.robots,
         .start_callback = httpStartCallback,
         .header_callback = httpHeaderDoneCallback,
         .data_callback = httpDataCallback,
@@ -206,6 +206,7 @@ pub fn send(self: *XMLHttpRequest, body_: ?[]const u8) !void {
         .headers = headers,
         .body = self._request_body,
         .cookie_jar = &page._session.cookie_jar,
+        .robots = &page._session.browser.app.robots,
         .resource_type = .xhr,
         .start_callback = httpStartCallback,
         .header_callback = httpHeaderDoneCallback,
@@ -24,6 +24,8 @@ const Http = @import("Http.zig");
 const URL = @import("../browser/URL.zig");
 const Notification = @import("../Notification.zig");
 const CookieJar = @import("../browser/webapi/storage/Cookie.zig").Jar;
+const Robots = @import("../browser/Robots.zig");
+const RobotStore = Robots.RobotStore;
 
 const c = Http.c;
 const posix = std.posix;
@@ -82,6 +84,9 @@ queue: TransferQueue,
 // The main app allocator
 allocator: Allocator,
 
+// If we obey robots.txt or not.
+obey_robots: bool,
+
 // Once we have a handle/easy to process a request with, we create a Transfer
 // which contains the Request as well as any state we need to process the
 // request. These wil come and go with each request.
@@ -146,6 +151,7 @@ pub fn init(allocator: Allocator, ca_blob: ?c.curl_blob, opts: Http.Opts) !*Client {
         .multi = multi,
         .handles = handles,
         .allocator = allocator,
+        .obey_robots = opts.obey_robots,
         .http_proxy = opts.http_proxy,
         .use_proxy = opts.http_proxy != null,
         .user_agent = opts.user_agent,
@@ -216,6 +222,36 @@ pub fn tick(self: *Client, timeout_ms: u32) !PerformStatus {
 }
 
 pub fn request(self: *Client, req: Request) !void {
+    if (self.obey_robots) {
+        const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
+
+        // If we have this robots cached, we can take a fast path.
+        if (req.robots.get(robots_url)) |robot_entry| {
+            defer self.allocator.free(robots_url);
+
+            switch (robot_entry) {
+                // If we have a found robots entry, we check it.
+                .present => |robots| {
+                    const path = URL.getPathname(req.url);
+                    if (!robots.isAllowed(path)) {
+                        req.error_callback(req.ctx, error.RobotsBlocked);
+                        return;
+                    }
+                },
+                // Otherwise, we assume we won't find it again.
+                .absent => {},
+            }
+
+            return self.processRequest(req);
+        }
+
+        return self.fetchRobotsThenProcessRequest(robots_url, req);
+    }
+
+    return self.processRequest(req);
+}
+
+fn processRequest(self: *Client, req: Request) !void {
     const transfer = try self.makeTransfer(req);
 
     const notification = self.notification orelse return self.process(transfer);
@@ -247,6 +283,107 @@ pub fn request(self: *Client, req: Request) !void {
     }
 }
 
+const RobotsRequestContext = struct {
+    client: *Client,
+    req: Request,
+    robots_url: [:0]const u8,
+    buffer: std.ArrayList(u8),
+    status: u16 = 0,
+};
+
+fn fetchRobotsThenProcessRequest(self: *Client, robots_url: [:0]const u8, req: Request) !void {
+    const ctx = try self.allocator.create(RobotsRequestContext);
+    ctx.* = .{ .client = self, .req = req, .robots_url = robots_url, .buffer = .empty };
+
+    const headers = try self.newHeaders();
+
+    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
+    try self.processRequest(.{
+        .ctx = ctx,
+        .url = robots_url,
+        .method = .GET,
+        .headers = headers,
+        .blocking = false,
+        .cookie_jar = req.cookie_jar,
+        .robots = req.robots,
+        .resource_type = .fetch,
+        .header_callback = robotsHeaderCallback,
+        .data_callback = robotsDataCallback,
+        .done_callback = robotsDoneCallback,
+        .error_callback = robotsErrorCallback,
+    });
+}
+
+fn robotsHeaderCallback(transfer: *Http.Transfer) !bool {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+
+    if (transfer.response_header) |hdr| {
+        log.debug(.browser, "robots status", .{ .status = hdr.status });
+        ctx.status = hdr.status;
+    }
+
+    if (transfer.getContentLength()) |cl| {
+        try ctx.buffer.ensureTotalCapacity(ctx.client.allocator, cl);
+    }
+
+    return true;
+}
+
+fn robotsDataCallback(transfer: *Http.Transfer, data: []const u8) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(transfer.ctx));
+    try ctx.buffer.appendSlice(ctx.client.allocator, data);
+}
+
+fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.client.allocator.destroy(ctx);
+    defer ctx.buffer.deinit(ctx.client.allocator);
+    defer ctx.client.allocator.free(ctx.robots_url);
+
+    var allowed = true;
+
+    if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
+        const robots = try ctx.req.robots.robotsFromBytes(
+            ctx.client.user_agent,
+            ctx.buffer.items,
+        );
+
+        try ctx.req.robots.put(ctx.robots_url, robots);
+
+        const path = URL.getPathname(ctx.req.url);
+        allowed = robots.isAllowed(path);
+    }
+
+    // If not found, store as Not Found.
+    if (ctx.status == 404) {
+        log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
+        try ctx.req.robots.putAbsent(ctx.robots_url);
+    }
+
+    if (!allowed) {
+        log.warn(.http, "blocked by robots", .{ .url = ctx.req.url });
+        ctx.req.error_callback(ctx.req.ctx, error.RobotsBlocked);
+        return;
+    }
+
+    // Now process the original request
+    try ctx.client.processRequest(ctx.req);
+}
+
+fn robotsErrorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
+    const ctx: *RobotsRequestContext = @ptrCast(@alignCast(ctx_ptr));
+    defer ctx.client.allocator.destroy(ctx);
+    defer ctx.buffer.deinit(ctx.client.allocator);
+    defer ctx.client.allocator.free(ctx.robots_url);
+
+    log.warn(.http, "robots fetch failed", .{ .err = err });
+
+    // On error, allow the request to proceed
+    ctx.client.processRequest(ctx.req) catch |e| {
+        ctx.req.error_callback(ctx.req.ctx, e);
+    };
+}
+
 fn waitForInterceptedResponse(self: *Client, transfer: *Transfer) !bool {
     // The request was intercepted and is blocking. This is messy, but our
     // callers, the ScriptManager -> Page, don't have a great way to stop the
@@ -765,6 +902,7 @@ pub const Request = struct {
     headers: Http.Headers,
     body: ?[]const u8 = null,
     cookie_jar: *CookieJar,
+    robots: *RobotStore,
     resource_type: ResourceType,
     credentials: ?[:0]const u8 = null,
 
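
Review note: the enforcement logic in request() plus robotsDoneCallback collapses to a small decision table:

```
cache state   robots.txt fetch        outcome for the original request
-----------   ----------------        --------------------------------
.present      skipped                 isAllowed(path) ? proceed : error.RobotsBlocked
.absent       skipped                 proceed (a cached 404)
miss          2xx/3xx with a body     parse, cache as .present, then apply isAllowed
miss          404                     cache as .absent and proceed
miss          other status, or error  proceed; nothing cached, so it is re-fetched next time
```

Two properties fall out of the structure: the robots.txt request is issued through processRequest directly, so it is never itself subject to a robots check (no recursion); and nothing coalesces concurrent cache misses, so several in-flight requests to a fresh host can each trigger their own robots.txt fetch before the first response lands in the store.
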
@@ -348,6 +348,7 @@ pub fn errorMCheck(code: c.CURLMcode) errors.Multi!void {
 }
 
 pub const Opts = struct {
+    obey_robots: bool,
     timeout_ms: u31,
     max_host_open: u8,
     max_concurrent: u8,