mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-22 04:34:44 +00:00
Merge pull request #1520 from lightpanda-io/robots-fix-wikipedia
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
Fix how `robots.txt` handles UTF-8
This commit is contained in:
@@ -121,7 +121,7 @@ fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
|
|||||||
fn parseRulesWithUserAgent(
|
fn parseRulesWithUserAgent(
|
||||||
allocator: std.mem.Allocator,
|
allocator: std.mem.Allocator,
|
||||||
user_agent: []const u8,
|
user_agent: []const u8,
|
||||||
bytes: []const u8,
|
raw_bytes: []const u8,
|
||||||
) ![]const Rule {
|
) ![]const Rule {
|
||||||
var rules: std.ArrayList(Rule) = .empty;
|
var rules: std.ArrayList(Rule) = .empty;
|
||||||
defer rules.deinit(allocator);
|
defer rules.deinit(allocator);
|
||||||
@@ -131,6 +131,15 @@ fn parseRulesWithUserAgent(
|
|||||||
|
|
||||||
var state: State = .{ .entry = .not_in_entry, .has_rules = false };
|
var state: State = .{ .entry = .not_in_entry, .has_rules = false };
|
||||||
|
|
||||||
|
// https://en.wikipedia.org/wiki/Byte_order_mark
|
||||||
|
const UTF8_BOM: []const u8 = &.{ 0xEF, 0xBB, 0xBF };
|
||||||
|
|
||||||
|
// Strip UTF8 BOM
|
||||||
|
const bytes = if (std.mem.startsWith(u8, raw_bytes, UTF8_BOM))
|
||||||
|
raw_bytes[3..]
|
||||||
|
else
|
||||||
|
raw_bytes;
|
||||||
|
|
||||||
var iter = std.mem.splitScalar(u8, bytes, '\n');
|
var iter = std.mem.splitScalar(u8, bytes, '\n');
|
||||||
while (iter.next()) |line| {
|
while (iter.next()) |line| {
|
||||||
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
|
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
|
||||||
@@ -144,19 +153,16 @@ fn parseRulesWithUserAgent(
|
|||||||
else
|
else
|
||||||
trimmed;
|
trimmed;
|
||||||
|
|
||||||
if (true_line.len == 0) {
|
if (true_line.len == 0) continue;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse return error.MissingColon;
|
const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse {
|
||||||
|
log.warn(.browser, "robots line missing colon", .{ .line = line });
|
||||||
|
continue;
|
||||||
|
};
|
||||||
const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]);
|
const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]);
|
||||||
defer allocator.free(key_str);
|
defer allocator.free(key_str);
|
||||||
|
|
||||||
const key = std.meta.stringToEnum(Key, key_str) orelse {
|
const key = std.meta.stringToEnum(Key, key_str) orelse continue;
|
||||||
// log.warn(.browser, "robots key", .{ .key = key_str });
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
|
|
||||||
const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace);
|
const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace);
|
||||||
|
|
||||||
switch (key) {
|
switch (key) {
|
||||||
|
|||||||
@@ -375,19 +375,36 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void {
|
|||||||
|
|
||||||
var allowed = true;
|
var allowed = true;
|
||||||
|
|
||||||
if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) {
|
switch (ctx.status) {
|
||||||
const robots = try ctx.client.robot_store.robotsFromBytes(
|
200 => {
|
||||||
ctx.client.config.http_headers.user_agent,
|
if (ctx.buffer.items.len > 0) {
|
||||||
ctx.buffer.items,
|
const robots: ?Robots = ctx.client.robot_store.robotsFromBytes(
|
||||||
);
|
ctx.client.config.http_headers.user_agent,
|
||||||
|
ctx.buffer.items,
|
||||||
|
) catch blk: {
|
||||||
|
log.warn(.browser, "failed to parse robots", .{ .robots_url = ctx.robots_url });
|
||||||
|
// If we fail to parse, we just insert it as absent and ignore.
|
||||||
|
try ctx.client.robot_store.putAbsent(ctx.robots_url);
|
||||||
|
break :blk null;
|
||||||
|
};
|
||||||
|
|
||||||
try ctx.client.robot_store.put(ctx.robots_url, robots);
|
if (robots) |r| {
|
||||||
|
try ctx.client.robot_store.put(ctx.robots_url, r);
|
||||||
const path = URL.getPathname(ctx.req.url);
|
const path = URL.getPathname(ctx.req.url);
|
||||||
allowed = robots.isAllowed(path);
|
allowed = r.isAllowed(path);
|
||||||
} else if (ctx.status == 404) {
|
}
|
||||||
log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
|
}
|
||||||
try ctx.client.robot_store.putAbsent(ctx.robots_url);
|
},
|
||||||
|
404 => {
|
||||||
|
log.debug(.http, "robots not found", .{ .url = ctx.robots_url });
|
||||||
|
// If we get a 404, we just insert it as absent.
|
||||||
|
try ctx.client.robot_store.putAbsent(ctx.robots_url);
|
||||||
|
},
|
||||||
|
else => {
|
||||||
|
log.debug(.http, "unexpected status on robots", .{ .url = ctx.robots_url, .status = ctx.status });
|
||||||
|
// If we get an unexpected status, we just insert as absent.
|
||||||
|
try ctx.client.robot_store.putAbsent(ctx.robots_url);
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
var queued = ctx.client.pending_robots_queue.fetchRemove(
|
var queued = ctx.client.pending_robots_queue.fetchRemove(
|
||||||
|
|||||||
Reference in New Issue
Block a user