Strip UTF-8 BOM from start of robots.txt

This commit is contained in:
Muki Kiboigo
2026-02-10 20:29:39 -08:00
parent f02a37d3f0
commit 7de9422b75

View File

@@ -121,7 +121,7 @@ fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
fn parseRulesWithUserAgent( fn parseRulesWithUserAgent(
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
user_agent: []const u8, user_agent: []const u8,
bytes: []const u8, raw_bytes: []const u8,
) ![]const Rule { ) ![]const Rule {
var rules: std.ArrayList(Rule) = .empty; var rules: std.ArrayList(Rule) = .empty;
defer rules.deinit(allocator); defer rules.deinit(allocator);
@@ -131,6 +131,15 @@ fn parseRulesWithUserAgent(
var state: State = .{ .entry = .not_in_entry, .has_rules = false }; var state: State = .{ .entry = .not_in_entry, .has_rules = false };
// https://en.wikipedia.org/wiki/Byte_order_mark
const UTF8_BOM: []const u8 = &.{ 0xEF, 0xBB, 0xBF };
// Strip UTF8 BOM
const bytes = if (std.mem.startsWith(u8, raw_bytes, UTF8_BOM))
raw_bytes[3..]
else
raw_bytes;
var iter = std.mem.splitScalar(u8, bytes, '\n'); var iter = std.mem.splitScalar(u8, bytes, '\n');
while (iter.next()) |line| { while (iter.next()) |line| {
const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace); const trimmed = std.mem.trim(u8, line, &std.ascii.whitespace);
@@ -144,19 +153,16 @@ fn parseRulesWithUserAgent(
else else
trimmed; trimmed;
if (true_line.len == 0) { if (true_line.len == 0) continue;
continue;
}
const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse return error.MissingColon; const colon_idx = std.mem.indexOfScalar(u8, true_line, ':') orelse {
log.warn(.browser, "robots line missing colon", .{ .line = line });
continue;
};
const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]); const key_str = try std.ascii.allocLowerString(allocator, true_line[0..colon_idx]);
defer allocator.free(key_str); defer allocator.free(key_str);
const key = std.meta.stringToEnum(Key, key_str) orelse { const key = std.meta.stringToEnum(Key, key_str) orelse continue;
// log.warn(.browser, "robots key", .{ .key = key_str });
continue;
};
const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace); const value = std.mem.trim(u8, true_line[colon_idx + 1 ..], &std.ascii.whitespace);
switch (key) { switch (key) {