mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-04-04 00:20:32 +00:00
add RobotsLayer
This commit is contained in:
@@ -43,6 +43,7 @@ pub const ResponseHead = http.ResponseHead;
|
|||||||
pub const HeaderIterator = http.HeaderIterator;
|
pub const HeaderIterator = http.HeaderIterator;
|
||||||
|
|
||||||
pub const CacheLayer = @import("../network/layer/CacheLayer.zig");
|
pub const CacheLayer = @import("../network/layer/CacheLayer.zig");
|
||||||
|
pub const RobotsLayer = @import("../network/layer/RobotsLayer.zig");
|
||||||
|
|
||||||
pub const PerformStatus = enum { cdp_socket, normal };
|
pub const PerformStatus = enum { cdp_socket, normal };
|
||||||
|
|
||||||
@@ -206,8 +207,7 @@ pub const Transport = struct {
|
|||||||
return self.perform(@intCast(timeout_ms));
|
return self.perform(@intCast(timeout_ms));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Core entry point: interception gating then queues/starts the transfer.
|
/// Core entry point.
|
||||||
/// Robots and cache checks are done by layers above.
|
|
||||||
pub fn _request(ptr: *anyopaque, _: Context, req: Request) !void {
|
pub fn _request(ptr: *anyopaque, _: Context, req: Request) !void {
|
||||||
const self: *Transport = @ptrCast(@alignCast(ptr));
|
const self: *Transport = @ptrCast(@alignCast(ptr));
|
||||||
const transfer = try self.makeTransfer(req);
|
const transfer = try self.makeTransfer(req);
|
||||||
@@ -593,216 +593,6 @@ pub const Layer = struct {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// pub const RobotsLayer = struct {
|
|
||||||
// next: Layer = undefined,
|
|
||||||
// obey_robots: bool,
|
|
||||||
// allocator: Allocator,
|
|
||||||
// pending: std.StringHashMapUnmanaged(std.ArrayList(Request)) = .empty,
|
|
||||||
|
|
||||||
// pub fn layer(self: *RobotsLayer) Layer {
|
|
||||||
// return .{
|
|
||||||
// .ptr = self,
|
|
||||||
// .vtable = &.{ .request = _request },
|
|
||||||
// };
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub fn deinit(self: *RobotsLayer) void {
|
|
||||||
// var it = self.pending.iterator();
|
|
||||||
// while (it.next()) |entry| {
|
|
||||||
// entry.value_ptr.deinit(self.allocator);
|
|
||||||
// }
|
|
||||||
// self.pending.deinit(self.allocator);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn _request(ptr: *anyopaque, ctx: Context, req: Request) anyerror!void {
|
|
||||||
// const self: *RobotsLayer = @ptrCast(@alignCast(ptr));
|
|
||||||
|
|
||||||
// if (!self.obey_robots) {
|
|
||||||
// return self.next.request(ctx, req);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// const robots_url = try URL.getRobotsUrl(self.allocator, req.url);
|
|
||||||
// errdefer self.allocator.free(robots_url);
|
|
||||||
|
|
||||||
// if (ctx.network.robot_store.get(robots_url)) |robot_entry| {
|
|
||||||
// defer self.allocator.free(robots_url);
|
|
||||||
// switch (robot_entry) {
|
|
||||||
// .present => |robots| {
|
|
||||||
// const path = URL.getPathname(req.url);
|
|
||||||
// if (!robots.isAllowed(path)) {
|
|
||||||
// req.error_callback(req.ctx, error.RobotsBlocked);
|
|
||||||
// return;
|
|
||||||
// }
|
|
||||||
// },
|
|
||||||
// .absent => {},
|
|
||||||
// }
|
|
||||||
// return self.next.request(ctx, req);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// return self.fetchRobotsThenRequest(ctx, robots_url, req);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn fetchRobotsThenRequest(self: *RobotsLayer, ctx: Context, robots_url: [:0]const u8, req: Request) !void {
|
|
||||||
// const entry = try self.pending.getOrPut(self.allocator, robots_url);
|
|
||||||
|
|
||||||
// if (!entry.found_existing) {
|
|
||||||
// errdefer self.allocator.free(robots_url);
|
|
||||||
// entry.value_ptr.* = .empty;
|
|
||||||
|
|
||||||
// const robots_ctx = try self.allocator.create(RobotsContext);
|
|
||||||
// errdefer self.allocator.destroy(robots_ctx);
|
|
||||||
// robots_ctx.* = .{
|
|
||||||
// .layer = self,
|
|
||||||
// .ctx = ctx,
|
|
||||||
// .req = req,
|
|
||||||
// .robots_url = robots_url,
|
|
||||||
// .buffer = .empty,
|
|
||||||
// };
|
|
||||||
|
|
||||||
// const headers = try ctx.newHeaders();
|
|
||||||
// log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });
|
|
||||||
// try self.next.request(ctx, .{
|
|
||||||
// .ctx = robots_ctx,
|
|
||||||
// .url = robots_url,
|
|
||||||
// .method = .GET,
|
|
||||||
// .headers = headers,
|
|
||||||
// .blocking = false,
|
|
||||||
// .frame_id = req.frame_id,
|
|
||||||
// .cookie_jar = req.cookie_jar,
|
|
||||||
// .cookie_origin = req.cookie_origin,
|
|
||||||
// .notification = req.notification,
|
|
||||||
// .resource_type = .fetch,
|
|
||||||
// .header_callback = RobotsContext.headerCallback,
|
|
||||||
// .data_callback = RobotsContext.dataCallback,
|
|
||||||
// .done_callback = RobotsContext.doneCallback,
|
|
||||||
// .error_callback = RobotsContext.errorCallback,
|
|
||||||
// .shutdown_callback = RobotsContext.shutdownCallback,
|
|
||||||
// });
|
|
||||||
// } else {
|
|
||||||
// self.allocator.free(robots_url);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// try entry.value_ptr.append(self.allocator, req);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn flushPending(self: *RobotsLayer, ctx: Context, robots_url: [:0]const u8, allowed: bool) void {
|
|
||||||
// var queued = self.pending.fetchRemove(robots_url) orelse
|
|
||||||
// @panic("RobotsLayer.flushPending: missing queue");
|
|
||||||
// defer queued.value.deinit(self.allocator);
|
|
||||||
|
|
||||||
// for (queued.value.items) |queued_req| {
|
|
||||||
// if (!allowed) {
|
|
||||||
// log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
|
|
||||||
// queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
|
|
||||||
// } else {
|
|
||||||
// self.next.request(ctx, queued_req) catch |e| {
|
|
||||||
// queued_req.error_callback(queued_req.ctx, e);
|
|
||||||
// };
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn flushPendingShutdown(self: *RobotsLayer, robots_url: [:0]const u8) void {
|
|
||||||
// var queued = self.pending.fetchRemove(robots_url) orelse
|
|
||||||
// @panic("RobotsLayer.flushPendingShutdown: missing queue");
|
|
||||||
// defer queued.value.deinit(self.allocator);
|
|
||||||
|
|
||||||
// for (queued.value.items) |queued_req| {
|
|
||||||
// if (queued_req.shutdown_callback) |cb| cb(queued_req.ctx);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// const RobotsContext = struct {
|
|
||||||
// layer: *RobotsLayer,
|
|
||||||
// ctx: Context,
|
|
||||||
// req: Request,
|
|
||||||
// robots_url: [:0]const u8,
|
|
||||||
// buffer: std.ArrayList(u8),
|
|
||||||
// status: u16 = 0,
|
|
||||||
|
|
||||||
// fn deinit(self: *RobotsContext) void {
|
|
||||||
// self.layer.allocator.free(self.robots_url);
|
|
||||||
// self.buffer.deinit(self.layer.allocator);
|
|
||||||
// self.layer.allocator.destroy(self);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn headerCallback(response: Response) !bool {
|
|
||||||
// const self: *RobotsContext = @ptrCast(@alignCast(response.ctx));
|
|
||||||
// // Robots callbacks only happen on real live requests.
|
|
||||||
// const transfer = response.inner.transfer;
|
|
||||||
// if (transfer.response_header) |hdr| {
|
|
||||||
// log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = self.robots_url });
|
|
||||||
// self.status = hdr.status;
|
|
||||||
// }
|
|
||||||
// if (transfer.getContentLength()) |cl| {
|
|
||||||
// try self.buffer.ensureTotalCapacity(self.layer.allocator, cl);
|
|
||||||
// }
|
|
||||||
// return true;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn dataCallback(response: Response, data: []const u8) !void {
|
|
||||||
// const self: *RobotsContext = @ptrCast(@alignCast(response.ctx));
|
|
||||||
// try self.buffer.appendSlice(self.layer.allocator, data);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn doneCallback(ctx_ptr: *anyopaque) !void {
|
|
||||||
// const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
|
|
||||||
// defer self.deinit();
|
|
||||||
|
|
||||||
// var allowed = true;
|
|
||||||
// const network = self.ctx.network;
|
|
||||||
|
|
||||||
// switch (self.status) {
|
|
||||||
// 200 => {
|
|
||||||
// if (self.buffer.items.len > 0) {
|
|
||||||
// const robots: ?Robots = network.robot_store.robotsFromBytes(
|
|
||||||
// network.config.http_headers.user_agent,
|
|
||||||
// self.buffer.items,
|
|
||||||
// ) catch blk: {
|
|
||||||
// log.warn(.browser, "failed to parse robots", .{ .robots_url = self.robots_url });
|
|
||||||
// try network.robot_store.putAbsent(self.robots_url);
|
|
||||||
// break :blk null;
|
|
||||||
// };
|
|
||||||
// if (robots) |r| {
|
|
||||||
// try network.robot_store.put(self.robots_url, r);
|
|
||||||
// const path = URL.getPathname(self.req.url);
|
|
||||||
// allowed = r.isAllowed(path);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// },
|
|
||||||
// 404 => {
|
|
||||||
// log.debug(.http, "robots not found", .{ .url = self.robots_url });
|
|
||||||
// try network.robot_store.putAbsent(self.robots_url);
|
|
||||||
// },
|
|
||||||
// else => {
|
|
||||||
// log.debug(.http, "unexpected status on robots", .{
|
|
||||||
// .url = self.robots_url,
|
|
||||||
// .status = self.status,
|
|
||||||
// });
|
|
||||||
// try network.robot_store.putAbsent(self.robots_url);
|
|
||||||
// },
|
|
||||||
// }
|
|
||||||
|
|
||||||
// self.layer.flushPending(self.ctx, self.robots_url, allowed);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn errorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
|
|
||||||
// const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
|
|
||||||
// defer self.deinit();
|
|
||||||
// log.warn(.http, "robots fetch failed", .{ .err = err });
|
|
||||||
// // On error, allow all queued requests to proceed.
|
|
||||||
// self.layer.flushPending(self.ctx, self.robots_url, true);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn shutdownCallback(ctx_ptr: *anyopaque) void {
|
|
||||||
// const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
|
|
||||||
// defer self.deinit();
|
|
||||||
// log.debug(.http, "robots fetch shutdown", .{});
|
|
||||||
// self.layer.flushPendingShutdown(self.robots_url);
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// pub const WebBotAuthLayer = struct {
|
// pub const WebBotAuthLayer = struct {
|
||||||
// next: Layer = undefined,
|
// next: Layer = undefined,
|
||||||
// allocator: std.mem.Allocator,
|
// allocator: std.mem.Allocator,
|
||||||
@@ -868,7 +658,7 @@ pub fn LayerStack(comptime layer_types: anytype) type {
|
|||||||
pub fn deinit(self: *Self, allocator: Allocator) void {
|
pub fn deinit(self: *Self, allocator: Allocator) void {
|
||||||
inline for (layer_types, 0..) |T, i| {
|
inline for (layer_types, 0..) |T, i| {
|
||||||
const ptr: *T = @ptrCast(@alignCast(self.ptrs[i]));
|
const ptr: *T = @ptrCast(@alignCast(self.ptrs[i]));
|
||||||
if (@hasDecl(T, "deinit")) ptr.deinit();
|
if (@hasDecl(T, "deinit")) ptr.deinit(allocator);
|
||||||
allocator.destroy(ptr);
|
allocator.destroy(ptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -880,7 +670,7 @@ pub fn LayerStack(comptime layer_types: anytype) type {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// pub const Layers = LayerStack(.{ RobotsLayer, WebBotAuthLayer, CacheLayer });
|
// pub const Layers = LayerStack(.{ RobotsLayer, WebBotAuthLayer, CacheLayer });
|
||||||
pub const Layers = LayerStack(.{CacheLayer});
|
pub const Layers = LayerStack(.{ RobotsLayer, CacheLayer });
|
||||||
|
|
||||||
const Client = @This();
|
const Client = @This();
|
||||||
|
|
||||||
@@ -892,15 +682,14 @@ pub fn init(allocator: Allocator, network: *Network) !*Client {
|
|||||||
errdefer transport.deinit();
|
errdefer transport.deinit();
|
||||||
|
|
||||||
var layers = try Layers.init(allocator, transport, .{
|
var layers = try Layers.init(allocator, transport, .{
|
||||||
// RobotsLayer{
|
RobotsLayer{
|
||||||
// .obey_robots = network.config.obeyRobots(),
|
.obey_robots = network.config.obeyRobots(),
|
||||||
// .allocator = allocator,
|
.allocator = allocator,
|
||||||
// .pending = .empty,
|
.pending = .empty,
|
||||||
// },
|
},
|
||||||
// WebBotAuthLayer{
|
WebBotAuthLayer{
|
||||||
// .auth = if (network.web_bot_auth) |*wba| wba else null,
|
.auth = if (network.web_bot_auth) |*wba| wba else null,
|
||||||
// .allocator = allocator,
|
},
|
||||||
// },
|
|
||||||
CacheLayer{},
|
CacheLayer{},
|
||||||
});
|
});
|
||||||
errdefer layers.deinit(allocator);
|
errdefer layers.deinit(allocator);
|
||||||
|
|||||||
@@ -0,0 +1,255 @@
|
|||||||
|
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
|
||||||
|
//
|
||||||
|
// Francis Bouvier <francis@lightpanda.io>
|
||||||
|
// Pierre Tachoire <pierre@lightpanda.io>
|
||||||
|
//
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Affero General Public License as
|
||||||
|
// published by the Free Software Foundation, either version 3 of the
|
||||||
|
// License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Affero General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Affero General Public License
|
||||||
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
const std = @import("std");
|
||||||
|
const log = @import("../../log.zig");
|
||||||
|
|
||||||
|
const URL = @import("../../browser/URL.zig");
|
||||||
|
const Robots = @import("../Robots.zig");
|
||||||
|
const Context = @import("../../browser/HttpClient.zig").Context;
|
||||||
|
const Request = @import("../../browser/HttpClient.zig").Request;
|
||||||
|
const Response = @import("../../browser/HttpClient.zig").Response;
|
||||||
|
const Layer = @import("../../browser/HttpClient.zig").Layer;
|
||||||
|
const Forward = @import("Forward.zig");
|
||||||
|
|
||||||
|
const RobotsLayer = @This();
|
||||||
|
|
||||||
|
/// Layer that requests are forwarded to once robots handling is done.
/// Left `undefined` here; presumably wired up by the layer stack before use — TODO confirm.
next: Layer = undefined,
|
||||||
|
/// When false, `request` forwards everything to `next` without any robots check.
obey_robots: bool,
|
||||||
|
/// Allocator used for robots.txt URL strings, the pending queues and the per-fetch `RobotsContext`.
allocator: std.mem.Allocator,
|
||||||
|
/// Requests waiting for a robots.txt fetch, keyed by the robots.txt URL being fetched.
pending: std.StringHashMapUnmanaged(std.ArrayListUnmanaged(Request)) = .empty,
|
||||||
|
|
||||||
|
/// Wraps this layer in a type-erased `Layer` handle whose `request` entry
/// dispatches to this file's `request` function.
pub fn layer(self: *RobotsLayer) Layer {
    return .{ .ptr = self, .vtable = &.{ .request = request } };
}
|
||||||
|
|
||||||
|
/// Releases the storage of every pending queue, then the map itself, using `allocator`.
/// Queued requests are not notified here, and the map's keys are not freed by this function.
pub fn deinit(self: *RobotsLayer, allocator: std.mem.Allocator) void {
    var queues = self.pending.valueIterator();
    while (queues.next()) |queue| queue.deinit(allocator);
    self.pending.deinit(allocator);
}
|
||||||
|
|
||||||
|
/// Layer entry point: applies robots.txt rules to `req` before handing it to `next`.
///
/// - With `obey_robots` false, the request is forwarded untouched.
/// - If the robot store already has an entry for the robots.txt URL derived from
///   `req.url`, the request is forwarded, or rejected through `req.error_callback`
///   with `error.RobotsBlocked` (in which case this function itself returns success).
/// - Otherwise the request is queued behind a robots.txt fetch, see
///   `fetchRobotsThenRequest`.
fn request(ptr: *anyopaque, ctx: Context, req: Request) anyerror!void {
    const self: *RobotsLayer = @ptrCast(@alignCast(ptr));

    if (!self.obey_robots) {
        return self.next.request(ctx, req);
    }

    const robots_url = try URL.getRobotsUrl(self.allocator, req.url);

    if (ctx.network.robot_store.get(robots_url)) |robot_entry| {
        // The URL was only needed as a lookup key. This `defer` is the single
        // release for every exit from this branch, error or not; a function-level
        // `errdefer` on top of it would free the slice a second time when
        // `next.request` fails.
        defer self.allocator.free(robots_url);

        switch (robot_entry) {
            .present => |robots| {
                const path = URL.getPathname(req.url);
                if (!robots.isAllowed(path)) {
                    log.warn(.http, "blocked by robots", .{ .url = req.url });
                    // The request ends here without reaching `next`, so its headers
                    // are released the same way flushPending releases them for a
                    // blocked request.
                    // NOTE(review): assumes error_callback does not free them itself.
                    defer req.headers.deinit();
                    req.error_callback(req.ctx, error.RobotsBlocked);
                    return;
                }
            },
            .absent => {},
        }
        return self.next.request(ctx, req);
    }

    // Ownership of `robots_url` moves to fetchRobotsThenRequest, which releases it
    // on each of its own failure paths.
    return self.fetchRobotsThenRequest(ctx, robots_url, req);
}

/// Queues `req` behind the robots.txt fetch for `robots_url`, starting that fetch
/// when none is in flight yet.
///
/// Takes ownership of `robots_url`:
/// - a fetch is already pending: the slice is freed immediately (the map keeps the
///   key of the first request);
/// - a fetch is started: the slice becomes both the map key and
///   `RobotsContext.robots_url`, and is freed by the robots callbacks;
/// - any error: the slice is freed and no entry for it is left in `pending`.
///
/// On error `req` has not been handed to anyone; the caller still owns it.
fn fetchRobotsThenRequest(self: *RobotsLayer, ctx: Context, robots_url: [:0]const u8, req: Request) !void {
    const entry = self.pending.getOrPut(self.allocator, robots_url) catch |err| {
        self.allocator.free(robots_url);
        return err;
    };

    if (entry.found_existing) {
        // A fetch is already in flight and the map holds its own key.
        self.allocator.free(robots_url);
        try entry.value_ptr.append(self.allocator, req);
        return;
    }

    // First request for this robots.txt. errdefers run in reverse order of
    // declaration: the map entry (keyed by `robots_url`) is dropped before the key
    // itself is freed, so the map never holds a dangling key.
    errdefer self.allocator.free(robots_url);
    entry.value_ptr.* = .empty;
    errdefer {
        if (self.pending.fetchRemove(robots_url)) |kv| {
            var abandoned = kv.value;
            abandoned.deinit(self.allocator);
        }
    }

    // Queue before issuing the fetch: `entry.value_ptr` is not touched after
    // `next.request`, and the robots callbacks find the request in the queue even
    // if a lower layer completes the fetch before returning.
    // NOTE(review): whether lower layers can complete synchronously is not visible here.
    try entry.value_ptr.append(self.allocator, req);

    const robots_ctx = try self.allocator.create(RobotsContext);
    errdefer self.allocator.destroy(robots_ctx);
    robots_ctx.* = .{
        .layer = self,
        .ctx = ctx,
        .robots_url = robots_url,
        .buffer = .empty,
    };

    const headers = try ctx.newHeaders();
    log.debug(.browser, "fetching robots.txt", .{ .robots_url = robots_url });

    try self.next.request(ctx, .{
        .ctx = robots_ctx,
        .url = robots_url,
        .method = .GET,
        .headers = headers,
        .blocking = false,
        .frame_id = req.frame_id,
        .cookie_jar = req.cookie_jar,
        .cookie_origin = req.cookie_origin,
        .notification = req.notification,
        .resource_type = .fetch,
        .header_callback = RobotsContext.headerCallback,
        .data_callback = RobotsContext.dataCallback,
        .done_callback = RobotsContext.doneCallback,
        .error_callback = RobotsContext.errorCallback,
        .shutdown_callback = RobotsContext.shutdownCallback,
    });
}
|
||||||
|
|
||||||
|
/// Releases every request queued behind `robots_url` once its robots.txt fetch has settled.
///
/// Each request is judged on its own path: when the robot store holds parsed rules
/// for `robots_url`, those rules are applied to the request's pathname. `allowed`
/// is the fallback verdict, used when the store has no entry or an `.absent` one
/// (for example after a failed fetch).
///
/// Allowed requests are forwarded to `next`; a forwarding error is reported through
/// the request's `error_callback`. Blocked requests get `error.RobotsBlocked`. In
/// both failure cases the request's headers are released here.
///
/// Panics if no queue exists for `robots_url`.
fn flushPending(self: *RobotsLayer, ctx: Context, robots_url: [:0]const u8, allowed: bool) void {
    var queued = self.pending.fetchRemove(robots_url) orelse
        @panic("RobotsLayer.flushPending: missing queue");
    defer queued.value.deinit(self.allocator);

    for (queued.value.items) |queued_req| {
        // Requests queued for the same host can target different paths, so a single
        // verdict computed from one of them must not be applied to all. The store is
        // consulted per request rather than holding an entry across `next.request`.
        const permitted = verdict: {
            const stored = ctx.network.robot_store.get(robots_url) orelse break :verdict allowed;
            break :verdict switch (stored) {
                .present => |robots| robots.isAllowed(URL.getPathname(queued_req.url)),
                .absent => allowed,
            };
        };

        if (!permitted) {
            log.warn(.http, "blocked by robots", .{ .url = queued_req.url });
            defer queued_req.headers.deinit();
            queued_req.error_callback(queued_req.ctx, error.RobotsBlocked);
        } else {
            self.next.request(ctx, queued_req) catch |e| {
                defer queued_req.headers.deinit();
                queued_req.error_callback(queued_req.ctx, e);
            };
        }
    }
}
|
||||||
|
|
||||||
|
/// Drops the queue waiting on `robots_url` because its robots.txt fetch was shut down.
/// Every queued request has its headers released and, when it has one, its
/// `shutdown_callback` invoked. Panics if no queue exists for `robots_url`.
fn flushPendingShutdown(self: *RobotsLayer, robots_url: [:0]const u8) void {
    const removed = self.pending.fetchRemove(robots_url) orelse
        @panic("RobotsLayer.flushPendingShutdown: missing queue");
    var waiting = removed.value;
    defer waiting.deinit(self.allocator);

    for (waiting.items) |pending_req| {
        // Runs at the end of each iteration, including when `continue` is taken.
        defer pending_req.headers.deinit();
        const on_shutdown = pending_req.shutdown_callback orelse continue;
        on_shutdown(pending_req.ctx);
    }
}
|
||||||
|
|
||||||
|
/// State of one in-flight robots.txt fetch. It is passed as the request `ctx` of
/// the fetch and destroyed by whichever terminal callback runs (done, error or
/// shutdown).
const RobotsContext = struct {
    /// Layer that issued the fetch; also supplies the allocator.
    layer: *RobotsLayer,
    ctx: Context,
    /// robots.txt URL being fetched; the same slice is the key of the layer's
    /// pending queue. Freed by the terminal callbacks, not by `deinit`.
    robots_url: [:0]const u8,
    /// Response body accumulated by `dataCallback`.
    buffer: std.ArrayListUnmanaged(u8),
    /// HTTP status recorded by `headerCallback`; stays 0 if none was recorded.
    status: u16 = 0,

    /// Frees the body buffer and the context itself. `robots_url` is left alone.
    fn deinit(self: *RobotsContext) void {
        self.buffer.deinit(self.layer.allocator);
        self.layer.allocator.destroy(self);
    }

    /// Records the response status and pre-sizes the body buffer from the content
    /// length when one is available. Always asks for the transfer to continue.
    fn headerCallback(response: Response) anyerror!bool {
        const self: *RobotsContext = @ptrCast(@alignCast(response.ctx));
        switch (response.inner) {
            .transfer => |t| {
                if (t.response_header) |hdr| {
                    log.debug(.browser, "robots status", .{ .status = hdr.status, .robots_url = self.robots_url });
                    self.status = hdr.status;
                }
                // NOTE(review): the content length comes from the remote server and
                // is used as an allocation size without an upper bound.
                if (t.getContentLength()) |cl| {
                    try self.buffer.ensureTotalCapacity(self.layer.allocator, cl);
                }
            },
            // NOTE(review): no status is recorded for a cached response, so
            // `doneCallback` sees status 0 and takes its "unexpected status"
            // branch — confirm this is intended.
            .cached => {},
        }
        return true;
    }

    /// Appends a chunk of the response body to the buffer.
    fn dataCallback(response: Response, data: []const u8) anyerror!void {
        const self: *RobotsContext = @ptrCast(@alignCast(response.ctx));
        try self.buffer.appendSlice(self.layer.allocator, data);
    }

    /// Marks `robots_url` as having no usable robots.txt. A store failure is logged
    /// rather than propagated so the caller can still release the pending queue.
    fn storeAbsent(self: *RobotsContext) void {
        self.ctx.network.robot_store.putAbsent(self.robots_url) catch |err| {
            log.warn(.http, "robots store update failed", .{ .url = self.robots_url, .err = err });
        };
    }

    /// Terminal callback for a completed fetch: records the outcome in the robot
    /// store, then releases the pending queue and destroys this context.
    ///
    /// Never returns an error. By the time anything could fail, the defers below
    /// already own the teardown of this context and of `robots_url` (the pending
    /// map's key), so returning early would leave the queued requests stranded
    /// under a freed key. Store failures are logged and the queue is flushed anyway.
    fn doneCallback(ctx_ptr: *anyopaque) anyerror!void {
        const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
        const l = self.layer;
        const ctx = self.ctx;
        const robots_url = self.robots_url;
        // Defers run in reverse order: the context is destroyed first, the URL last.
        defer l.allocator.free(robots_url);
        defer self.deinit();

        var allowed = true;
        const network = ctx.network;

        switch (self.status) {
            200 => {
                if (self.buffer.items.len > 0) {
                    const robots: ?Robots = network.robot_store.robotsFromBytes(
                        network.config.http_headers.user_agent,
                        self.buffer.items,
                    ) catch blk: {
                        log.warn(.browser, "failed to parse robots", .{ .robots_url = robots_url });
                        self.storeAbsent();
                        break :blk null;
                    };
                    if (robots) |r| {
                        network.robot_store.put(robots_url, r) catch |err| {
                            log.warn(.http, "robots store update failed", .{ .url = robots_url, .err = err });
                        };
                        // The verdict handed to flushPending is derived from the
                        // first queued request. A missing or empty queue keeps the
                        // default instead of panicking.
                        if (l.pending.get(robots_url)) |queue| {
                            if (queue.items.len > 0) {
                                allowed = r.isAllowed(URL.getPathname(queue.items[0].url));
                            }
                        }
                    }
                }
            },
            404 => {
                log.debug(.http, "robots not found", .{ .url = robots_url });
                self.storeAbsent();
            },
            else => {
                log.debug(.http, "unexpected status on robots", .{
                    .url = robots_url,
                    .status = self.status,
                });
                self.storeAbsent();
            },
        }

        l.flushPending(ctx, robots_url, allowed);
    }

    /// Terminal callback for a failed fetch: nothing is stored and every queued
    /// request is let through.
    fn errorCallback(ctx_ptr: *anyopaque, err: anyerror) void {
        const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
        const l = self.layer;
        const ctx = self.ctx;
        const robots_url = self.robots_url;
        defer l.allocator.free(robots_url);
        defer self.deinit();
        log.warn(.http, "robots fetch failed", .{ .err = err });
        l.flushPending(ctx, robots_url, true);
    }

    /// Terminal callback for a fetch that was shut down: queued requests receive
    /// their own shutdown callbacks instead of being forwarded.
    fn shutdownCallback(ctx_ptr: *anyopaque) void {
        const self: *RobotsContext = @ptrCast(@alignCast(ctx_ptr));
        const l = self.layer;
        const robots_url = self.robots_url;
        defer l.allocator.free(robots_url);
        defer self.deinit();
        log.debug(.http, "robots fetch shutdown", .{});
        l.flushPendingShutdown(robots_url);
    }
};
|
||||||
|
|||||||
Reference in New Issue
Block a user