diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig index 709481ee..fe851118 100644 --- a/src/browser/Robots.zig +++ b/src/browser/Robots.zig @@ -121,7 +121,7 @@ fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void { fn parseRulesWithUserAgent( allocator: std.mem.Allocator, user_agent: []const u8, - bytes: []const u8, + raw_bytes: []const u8, ) ![]const Rule { var rules: std.ArrayList(Rule) = .empty; defer rules.deinit(allocator); @@ -131,6 +131,15 @@ fn parseRulesWithUserAgent( var state: State = .{ .entry = .not_in_entry, .has_rules = false }; + // https://en.wikipedia.org/wiki/Byte_order_mark + const UTF8_BOM: []const u8 = &.{ 0xEF, 0xBB, 0xBF }; + + // Strip UTF8 BOM + const bytes = if (std.mem.startsWith(u8, raw_bytes, UTF8_BOM)) + raw_bytes[3..] + else + raw_bytes; + var iter = std.mem.splitScalar(u8, bytes, '\n'); while (iter.next()) |line| { const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace); @@ -144,19 +153,16 @@ fn parseRulesWithUserAgent( else trimmed; - if (true_line.len == 0) { - continue; - } + if (true_line.len == 0) continue; - const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse return error.MissingColon; + const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse { + log.warn(.browser, "robots line missing colon", .{ .line = line }); + continue; + }; const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]); defer allocator.free(key_str); - const key = std.meta.stringToEnum(Key, key_str) orelse { - // log.warn(.browser, "robots key", .{ .key = key_str }); - continue; - }; - + const key = std.meta.stringToEnum(Key, key_str) orelse continue; const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace); switch (key) { diff --git a/src/http/Client.zig b/src/http/Client.zig index 3a7a4086..3cce2ddb 100644 --- a/src/http/Client.zig +++ b/src/http/Client.zig @@ -375,19 +375,36 @@ fn robotsDoneCallback(ctx_ptr: *anyopaque) !void { var allowed = true; - if (ctx.status >= 200 and ctx.status < 400 and ctx.buffer.items.len > 0) { - const robots = try ctx.client.robot_store.robotsFromBytes( - ctx.client.config.http_headers.user_agent, - ctx.buffer.items, - ); + switch (ctx.status) { + 200 => { + if (ctx.buffer.items.len > 0) { + const robots: ?Robots = ctx.client.robot_store.robotsFromBytes( + ctx.client.config.http_headers.user_agent, + ctx.buffer.items, + ) catch blk: { + log.warn(.browser, "failed to parse robots", .{ .robots_url = ctx.robots_url }); + // If we fail to parse, we just insert it as absent and ignore. + try ctx.client.robot_store.putAbsent(ctx.robots_url); + break :blk null; + }; - try ctx.client.robot_store.put(ctx.robots_url, robots); - - const path = URL.getPathname(ctx.req.url); - allowed = robots.isAllowed(path); - } else if (ctx.status == 404) { - log.debug(.http, "robots not found", .{ .url = ctx.robots_url }); - try ctx.client.robot_store.putAbsent(ctx.robots_url); + if (robots) |r| { + try ctx.client.robot_store.put(ctx.robots_url, r); + const path = URL.getPathname(ctx.req.url); + allowed = r.isAllowed(path); + } + } + }, + 404 => { + log.debug(.http, "robots not found", .{ .url = ctx.robots_url }); + // If we get a 404, we just insert it as absent. + try ctx.client.robot_store.putAbsent(ctx.robots_url); + }, + else => { + log.debug(.http, "unexpected status on robots", .{ .url = ctx.robots_url, .status = ctx.status }); + // If we get an unexpected status, we just insert as absent. + try ctx.client.robot_store.putAbsent(ctx.robots_url); + }, } var queued = ctx.client.pending_robots_queue.fetchRemove(