diff --git a/src/browser/Robots.zig b/src/browser/Robots.zig index cb565ac8..7c4eb5c8 100644 --- a/src/browser/Robots.zig +++ b/src/browser/Robots.zig @@ -16,12 +16,54 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +const builtin = @import("builtin"); const std = @import("std"); const log = @import("../log.zig"); +pub const CompiledPattern = struct { + pattern: []const u8, + ty: enum { + prefix, // "/admin/" - prefix match + exact, // "/admin$" - exact match + wildcard, // any pattern that contains * + }, + + fn compile(pattern: []const u8) CompiledPattern { + if (pattern.len == 0) { + return .{ + .pattern = pattern, + .ty = .prefix, + }; + } + + const is_wildcard = std.mem.indexOfScalar(u8, pattern, '*') != null; + + if (is_wildcard) { + return .{ + .pattern = pattern, + .ty = .wildcard, + }; + } + + const has_end_anchor = pattern[pattern.len - 1] == '$'; + return .{ + .pattern = pattern, + .ty = if (has_end_anchor) .exact else .prefix, + }; + } +}; + pub const Rule = union(enum) { - allow: []const u8, - disallow: []const u8, + allow: CompiledPattern, + disallow: CompiledPattern, + + fn allowRule(pattern: []const u8) Rule { + return .{ .allow = CompiledPattern.compile(pattern) }; + } + + fn disallowRule(pattern: []const u8) Rule { + return .{ .disallow = CompiledPattern.compile(pattern) }; + } }; pub const Key = enum { @@ -112,8 +154,8 @@ const State = struct { fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void { for (rules) |rule| { switch (rule) { - .allow => |value| allocator.free(value), - .disallow => |value| allocator.free(value), + .allow => |compiled| allocator.free(compiled.pattern), + .disallow => |compiled| allocator.free(compiled.pattern), } } } @@ -201,13 +243,13 @@ fn parseRulesWithUserAgent( .in_our_entry => { const duped_value = try allocator.dupe(u8, value); errdefer allocator.free(duped_value); - try rules.append(allocator, .{ .allow = duped_value }); + try rules.append(allocator, Rule.allowRule(duped_value)); }, .in_other_entry => {}, .in_wildcard_entry => { const duped_value = try allocator.dupe(u8, value); errdefer allocator.free(duped_value); - try wildcard_rules.append(allocator, .{ .allow = duped_value }); + try wildcard_rules.append(allocator, Rule.allowRule(duped_value)); }, .not_in_entry => { log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" }); @@ -224,7 +266,7 @@ fn parseRulesWithUserAgent( const duped_value = try allocator.dupe(u8, value); errdefer allocator.free(duped_value); - try rules.append(allocator, .{ .disallow = duped_value }); + try rules.append(allocator, Rule.disallowRule(duped_value)); }, .in_other_entry => {}, .in_wildcard_entry => { @@ -232,7 +274,7 @@ fn parseRulesWithUserAgent( const duped_value = try allocator.dupe(u8, value); errdefer allocator.free(duped_value); - try wildcard_rules.append(allocator, .{ .disallow = duped_value }); + try wildcard_rules.append(allocator, Rule.disallowRule(duped_value)); }, .not_in_entry => { log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" }); @@ -261,13 +303,13 @@ pub fn fromBytes(allocator: std.mem.Allocator, user_agent: []const u8, bytes: [] std.mem.sort(Rule, rules, {}, struct { fn lessThan(_: void, a: Rule, b: Rule) bool { const a_len = switch (a) { - .allow => |p| p.len, - .disallow => |p| p.len, + .allow => |p| p.pattern.len, + .disallow => |p| p.pattern.len, }; const b_len = switch (b) { - .allow => |p| p.len, - .disallow => |p| p.len, + .allow => |p| p.pattern.len, + .disallow => |p| p.pattern.len, }; // Sort by length first. @@ -297,6 +339,27 @@ pub fn deinit(self: *Robots, allocator: std.mem.Allocator) void { allocator.free(self.rules); } +/// There are rules for how the pattern in robots.txt should be matched. +/// +/// * should match 0 or more of any character. +/// $ should signify the end of a path, making it exact. +/// otherwise, it is a prefix path. +fn matchPattern(compiled: CompiledPattern, path: []const u8) bool { + switch (compiled.ty) { + .prefix => return std.mem.startsWith(u8, path, compiled.pattern), + .exact => { + const pattern = compiled.pattern; + return std.mem.eql(u8, path, pattern[0 .. pattern.len - 1]); + }, + .wildcard => { + const pattern = compiled.pattern; + const exact_match = pattern[pattern.len - 1] == '$'; + const inner_pattern = if (exact_match) pattern[0 .. pattern.len - 1] else pattern; + return matchPatternRecursive(inner_pattern, path, exact_match); + }, + } +} + fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool { if (pattern.len == 0) return true; @@ -328,31 +391,14 @@ fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: boo return false; } -/// There are rules for how the pattern in robots.txt should be matched. -/// -/// * should match 0 or more of any character. -/// $ should signify the end of a path, making it exact. -/// otherwise, it is a prefix path. -fn matchPattern(pattern: []const u8, path: []const u8) ?usize { - if (pattern.len == 0) return 0; - const exact_match = pattern[pattern.len - 1] == '$'; - const inner_pattern = if (exact_match) pattern[0 .. pattern.len - 1] else pattern; - - if (matchPatternRecursive( - inner_pattern, - path, - exact_match, - )) return pattern.len else return null; -} - pub fn isAllowed(self: *const Robots, path: []const u8) bool { for (self.rules) |rule| { switch (rule) { - .allow => |pattern| { - if (matchPattern(pattern, path) != null) return true; + .allow => |compiled| { + if (matchPattern(compiled, path)) return true; }, - .disallow => |pattern| { - if (matchPattern(pattern, path) != null) return false; + .disallow => |compiled| { + if (matchPattern(compiled, path)) return false; }, } } @@ -360,6 +406,12 @@ pub fn isAllowed(self: *const Robots, path: []const u8) bool { return true; } +fn testMatch(pattern: []const u8, path: []const u8) bool { + comptime if (!builtin.is_test) unreachable; + + return matchPattern(CompiledPattern.compile(pattern), path); +} + test "Robots: simple robots.txt" { const allocator = std.testing.allocator; @@ -380,77 +432,77 @@ test "Robots: simple robots.txt" { } try std.testing.expectEqual(1, rules.len); - try std.testing.expectEqualStrings("/admin/", rules[0].disallow); + try std.testing.expectEqualStrings("/admin/", rules[0].disallow.pattern); } test "Robots: matchPattern - simple prefix" { - try std.testing.expect(matchPattern("/admin", "/admin/page") != null); - try std.testing.expect(matchPattern("/admin", "/admin") != null); - try std.testing.expect(matchPattern("/admin", "/other") == null); - try std.testing.expect(matchPattern("/admin/page", "/admin") == null); + try std.testing.expect(testMatch("/admin", "/admin/page")); + try std.testing.expect(testMatch("/admin", "/admin")); + try std.testing.expect(!testMatch("/admin", "/other")); + try std.testing.expect(!testMatch("/admin/page", "/admin")); } test "Robots: matchPattern - single wildcard" { - try std.testing.expect(matchPattern("/admin/*", "/admin/") != null); - try std.testing.expect(matchPattern("/admin/*", "/admin/page") != null); - try std.testing.expect(matchPattern("/admin/*", "/admin/page/subpage") != null); - try std.testing.expect(matchPattern("/admin/*", "/other/page") == null); + try std.testing.expect(testMatch("/admin/*", "/admin/")); + try std.testing.expect(testMatch("/admin/*", "/admin/page")); + try std.testing.expect(testMatch("/admin/*", "/admin/page/subpage")); + try std.testing.expect(!testMatch("/admin/*", "/other/page")); } test "Robots: matchPattern - wildcard in middle" { - try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/xyz") != null); - try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/ghi/xyz") != null); - try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def") == null); - try std.testing.expect(matchPattern("/abc/*/xyz", "/other/def/xyz") == null); + try std.testing.expect(testMatch("/abc/*/xyz", "/abc/def/xyz")); + try std.testing.expect(testMatch("/abc/*/xyz", "/abc/def/ghi/xyz")); + try std.testing.expect(!testMatch("/abc/*/xyz", "/abc/def")); + try std.testing.expect(!testMatch("/abc/*/xyz", "/other/def/xyz")); } test "Robots: matchPattern - complex wildcard case" { - try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/def/def/xyz") != null); - try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz") != null); + try std.testing.expect(testMatch("/abc/*/def/xyz", "/abc/def/def/xyz")); + try std.testing.expect(testMatch("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz")); } test "Robots: matchPattern - multiple wildcards" { - try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/b/y/c") != null); - try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/y/b/z/w/c") != null); - try std.testing.expect(matchPattern("/*.php", "/index.php") != null); - try std.testing.expect(matchPattern("/*.php", "/admin/index.php") != null); + try std.testing.expect(testMatch("/a/*/b/*/c", "/a/x/b/y/c")); + try std.testing.expect(testMatch("/a/*/b/*/c", "/a/x/y/b/z/w/c")); + try std.testing.expect(testMatch("/*.php", "/index.php")); + try std.testing.expect(testMatch("/*.php", "/admin/index.php")); } test "Robots: matchPattern - end anchor" { - try std.testing.expect(matchPattern("/*.php$", "/index.php") != null); - try std.testing.expect(matchPattern("/*.php$", "/index.php?param=value") == null); - try std.testing.expect(matchPattern("/admin$", "/admin") != null); - try std.testing.expect(matchPattern("/admin$", "/admin/") == null); - try std.testing.expect(matchPattern("/fish$", "/fish") != null); - try std.testing.expect(matchPattern("/fish$", "/fishheads") == null); + try std.testing.expect(testMatch("/*.php$", "/index.php")); + try std.testing.expect(!testMatch("/*.php$", "/index.php?param=value")); + try std.testing.expect(testMatch("/admin$", "/admin")); + try std.testing.expect(!testMatch("/admin$", "/admin/")); + try std.testing.expect(testMatch("/fish$", "/fish")); + try std.testing.expect(!testMatch("/fish$", "/fishheads")); } test "Robots: matchPattern - wildcard with extension" { - try std.testing.expect(matchPattern("/fish*.php", "/fish.php") != null); - try std.testing.expect(matchPattern("/fish*.php", "/fishheads.php") != null); - try std.testing.expect(matchPattern("/fish*.php", "/fish/salmon.php") != null); - try std.testing.expect(matchPattern("/fish*.php", "/fish.asp") == null); + try std.testing.expect(testMatch("/fish*.php", "/fish.php")); + try std.testing.expect(testMatch("/fish*.php", "/fishheads.php")); + try std.testing.expect(testMatch("/fish*.php", "/fish/salmon.php")); + try std.testing.expect(!testMatch("/fish*.php", "/fish.asp")); } test "Robots: matchPattern - empty and edge cases" { - try std.testing.expect(matchPattern("", "/anything") != null); - try std.testing.expect(matchPattern("/", "/") != null); - try std.testing.expect(matchPattern("*", "/anything") != null); - try std.testing.expect(matchPattern("/*", "/anything") != null); - try std.testing.expect(matchPattern("$", "") != null); + try std.testing.expect(testMatch("", "/anything")); + try std.testing.expect(testMatch("/", "/")); + try std.testing.expect(testMatch("*", "/anything")); + try std.testing.expect(testMatch("/*", "/anything")); + try std.testing.expect(testMatch("$", "")); } test "Robots: matchPattern - real world examples" { - try std.testing.expect(matchPattern("/", "/anything") != null); + try std.testing.expect(testMatch("/", "/anything")); - try std.testing.expect(matchPattern("/admin/", "/admin/page") != null); - try std.testing.expect(matchPattern("/admin/", "/public/page") == null); + try std.testing.expect(testMatch("/admin/", "/admin/page")); + try std.testing.expect(!testMatch("/admin/", "/public/page")); - try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf") != null); - try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf.bak") == null); + try std.testing.expect(testMatch("/*.pdf$", "/document.pdf")); + try std.testing.expect(!testMatch("/*.pdf$", "/document.pdf.bak")); - try std.testing.expect(matchPattern("/*?", "/page?param=value") != null); - try std.testing.expect(matchPattern("/*?", "/page") == null); + try std.testing.expect(testMatch("/*?", "/page?param=value")); + try std.testing.expect(!testMatch("/*?", "/page")); } test "Robots: isAllowed - basic allow/disallow" {