mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-22 04:34:44 +00:00
partial compilation of robot rules
This commit is contained in:
@@ -16,12 +16,54 @@
|
|||||||
// You should have received a copy of the GNU Affero General Public License
|
// You should have received a copy of the GNU Affero General Public License
|
||||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
const builtin = @import("builtin");
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const log = @import("../log.zig");
|
const log = @import("../log.zig");
|
||||||
|
|
||||||
|
/// A robots.txt path pattern tagged with the matching strategy it needs.
///
/// `compile` stores the slice it is given as-is; it does not copy it.
pub const CompiledPattern = struct {
    /// Pattern text exactly as passed to `compile`, including any `*` or trailing `$`.
    pattern: []const u8,
    /// Matching strategy chosen by `compile`.
    ty: enum {
        prefix, // "/admin/" - prefix match
        exact, // "/admin$" - exact match
        wildcard, // any pattern that contains *
    },

    /// Classifies `pattern`: any `*` makes it `.wildcard`; otherwise a trailing `$`
    /// makes it `.exact`; everything else, including the empty pattern, is `.prefix`.
    fn compile(pattern: []const u8) CompiledPattern {
        return .{
            .pattern = pattern,
            // `endsWith` is false for an empty pattern, so "" falls through to `.prefix`.
            .ty = if (std.mem.indexOfScalar(u8, pattern, '*') != null)
                .wildcard
            else if (std.mem.endsWith(u8, pattern, "$"))
                .exact
            else
                .prefix,
        };
    }
};
|
||||||
|
|
||||||
/// A single robots.txt directive: either an `allow` or a `disallow`, each
/// carrying the path pattern it applies to.
pub const Rule = union(enum) {
|
pub const Rule = union(enum) {
|
||||||
allow: []const u8,
|
allow: CompiledPattern,
|
||||||
disallow: []const u8,
|
disallow: CompiledPattern,
|
||||||
|
|
||||||
|
/// Builds an `allow` rule whose payload is `CompiledPattern.compile(pattern)`.
fn allowRule(pattern: []const u8) Rule {
|
||||||
|
return .{ .allow = CompiledPattern.compile(pattern) };
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds a `disallow` rule whose payload is `CompiledPattern.compile(pattern)`.
fn disallowRule(pattern: []const u8) Rule {
|
||||||
|
return .{ .disallow = CompiledPattern.compile(pattern) };
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const Key = enum {
|
pub const Key = enum {
|
||||||
@@ -112,8 +154,8 @@ const State = struct {
|
|||||||
/// Frees the pattern bytes held by every rule in `rules` through `allocator`.
/// The `rules` slice itself is not freed here.
fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
|
fn freeRulesInList(allocator: std.mem.Allocator, rules: []const Rule) void {
|
||||||
for (rules) |rule| {
|
for (rules) |rule| {
|
||||||
switch (rule) {
|
switch (rule) {
|
||||||
.allow => |value| allocator.free(value),
|
.allow => |compiled| allocator.free(compiled.pattern),
|
||||||
.disallow => |value| allocator.free(value),
|
.disallow => |compiled| allocator.free(compiled.pattern),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -201,13 +243,13 @@ fn parseRulesWithUserAgent(
|
|||||||
.in_our_entry => {
|
.in_our_entry => {
|
||||||
const duped_value = try allocator.dupe(u8, value);
|
const duped_value = try allocator.dupe(u8, value);
|
||||||
errdefer allocator.free(duped_value);
|
errdefer allocator.free(duped_value);
|
||||||
try rules.append(allocator, .{ .allow = duped_value });
|
try rules.append(allocator, Rule.allowRule(duped_value));
|
||||||
},
|
},
|
||||||
.in_other_entry => {},
|
.in_other_entry => {},
|
||||||
.in_wildcard_entry => {
|
.in_wildcard_entry => {
|
||||||
const duped_value = try allocator.dupe(u8, value);
|
const duped_value = try allocator.dupe(u8, value);
|
||||||
errdefer allocator.free(duped_value);
|
errdefer allocator.free(duped_value);
|
||||||
try wildcard_rules.append(allocator, .{ .allow = duped_value });
|
try wildcard_rules.append(allocator, Rule.allowRule(duped_value));
|
||||||
},
|
},
|
||||||
.not_in_entry => {
|
.not_in_entry => {
|
||||||
log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" });
|
log.warn(.browser, "robots unexpected rule", .{ .rule = "allow" });
|
||||||
@@ -224,7 +266,7 @@ fn parseRulesWithUserAgent(
|
|||||||
|
|
||||||
const duped_value = try allocator.dupe(u8, value);
|
const duped_value = try allocator.dupe(u8, value);
|
||||||
errdefer allocator.free(duped_value);
|
errdefer allocator.free(duped_value);
|
||||||
try rules.append(allocator, .{ .disallow = duped_value });
|
try rules.append(allocator, Rule.disallowRule(duped_value));
|
||||||
},
|
},
|
||||||
.in_other_entry => {},
|
.in_other_entry => {},
|
||||||
.in_wildcard_entry => {
|
.in_wildcard_entry => {
|
||||||
@@ -232,7 +274,7 @@ fn parseRulesWithUserAgent(
|
|||||||
|
|
||||||
const duped_value = try allocator.dupe(u8, value);
|
const duped_value = try allocator.dupe(u8, value);
|
||||||
errdefer allocator.free(duped_value);
|
errdefer allocator.free(duped_value);
|
||||||
try wildcard_rules.append(allocator, .{ .disallow = duped_value });
|
try wildcard_rules.append(allocator, Rule.disallowRule(duped_value));
|
||||||
},
|
},
|
||||||
.not_in_entry => {
|
.not_in_entry => {
|
||||||
log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" });
|
log.warn(.browser, "robots unexpected rule", .{ .rule = "disallow" });
|
||||||
@@ -261,13 +303,13 @@ pub fn fromBytes(allocator: std.mem.Allocator, user_agent: []const u8, bytes: []
|
|||||||
std.mem.sort(Rule, rules, {}, struct {
|
std.mem.sort(Rule, rules, {}, struct {
|
||||||
fn lessThan(_: void, a: Rule, b: Rule) bool {
|
fn lessThan(_: void, a: Rule, b: Rule) bool {
|
||||||
const a_len = switch (a) {
|
const a_len = switch (a) {
|
||||||
.allow => |p| p.len,
|
.allow => |p| p.pattern.len,
|
||||||
.disallow => |p| p.len,
|
.disallow => |p| p.pattern.len,
|
||||||
};
|
};
|
||||||
|
|
||||||
const b_len = switch (b) {
|
const b_len = switch (b) {
|
||||||
.allow => |p| p.len,
|
.allow => |p| p.pattern.len,
|
||||||
.disallow => |p| p.len,
|
.disallow => |p| p.pattern.len,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Sort by length first.
|
// Sort by length first.
|
||||||
@@ -297,6 +339,27 @@ pub fn deinit(self: *Robots, allocator: std.mem.Allocator) void {
|
|||||||
allocator.free(self.rules);
|
allocator.free(self.rules);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reports whether `path` is matched by the compiled robots.txt pattern.
///
/// Matching depends on `compiled.ty`:
/// - `.prefix`: `path` must start with the pattern.
/// - `.exact`: `path` must equal the pattern with its last byte (the `$`) removed.
/// - `.wildcard`: `*` stands for zero or more of any character; a trailing `$`
///   is stripped and forwarded to `matchPatternRecursive` as the exact-match flag.
fn matchPattern(compiled: CompiledPattern, path: []const u8) bool {
    const raw = compiled.pattern;
    return switch (compiled.ty) {
        .prefix => std.mem.startsWith(u8, path, raw),
        // Compare against everything before the final byte of the pattern.
        .exact => std.mem.eql(u8, path, raw[0 .. raw.len - 1]),
        .wildcard => blk: {
            const anchored = raw[raw.len - 1] == '$';
            const body = if (anchored) raw[0 .. raw.len - 1] else raw;
            break :blk matchPatternRecursive(body, path, anchored);
        },
    };
}
|
||||||
|
|
||||||
fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool {
|
fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: bool) bool {
|
||||||
if (pattern.len == 0) return true;
|
if (pattern.len == 0) return true;
|
||||||
|
|
||||||
@@ -328,31 +391,14 @@ fn matchPatternRecursive(pattern: []const u8, path: []const u8, exact_match: boo
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Matches `path` against a raw robots.txt `pattern`.
///
/// `*` stands for zero or more of any character. A trailing `$` marks the end
/// of the path, making the match exact; without it the pattern is a prefix.
///
/// Returns `pattern.len` when the path matches (0 for the empty pattern, which
/// always matches) and `null` when it does not.
fn matchPattern(pattern: []const u8, path: []const u8) ?usize {
    if (pattern.len == 0) return 0;

    const anchored = std.mem.endsWith(u8, pattern, "$");
    const body = if (anchored) pattern[0 .. pattern.len - 1] else pattern;

    return if (matchPatternRecursive(body, path, anchored)) pattern.len else null;
}
|
|
||||||
|
|
||||||
pub fn isAllowed(self: *const Robots, path: []const u8) bool {
|
pub fn isAllowed(self: *const Robots, path: []const u8) bool {
|
||||||
for (self.rules) |rule| {
|
for (self.rules) |rule| {
|
||||||
switch (rule) {
|
switch (rule) {
|
||||||
.allow => |pattern| {
|
.allow => |compiled| {
|
||||||
if (matchPattern(pattern, path) != null) return true;
|
if (matchPattern(compiled, path)) return true;
|
||||||
},
|
},
|
||||||
.disallow => |pattern| {
|
.disallow => |compiled| {
|
||||||
if (matchPattern(pattern, path) != null) return false;
|
if (matchPattern(compiled, path)) return false;
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -360,6 +406,12 @@ pub fn isAllowed(self: *const Robots, path: []const u8) bool {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test-only helper: compiles `pattern` with `CompiledPattern.compile` and
/// returns whether the result matches `path`.
fn testMatch(pattern: []const u8, path: []const u8) bool {
    // `builtin.is_test` is comptime-known, so in test builds this branch is
    // never analyzed; anywhere else the build fails with an explicit message
    // instead of a bare "reached unreachable code".
    if (!builtin.is_test) @compileError("testMatch is only available in test builds");

    return matchPattern(CompiledPattern.compile(pattern), path);
}
|
||||||
|
|
||||||
test "Robots: simple robots.txt" {
|
test "Robots: simple robots.txt" {
|
||||||
const allocator = std.testing.allocator;
|
const allocator = std.testing.allocator;
|
||||||
|
|
||||||
@@ -380,77 +432,77 @@ test "Robots: simple robots.txt" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try std.testing.expectEqual(1, rules.len);
|
try std.testing.expectEqual(1, rules.len);
|
||||||
try std.testing.expectEqualStrings("/admin/", rules[0].disallow);
|
try std.testing.expectEqualStrings("/admin/", rules[0].disallow.pattern);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Patterns without `*` or `$` match any path that starts with them, and
// nothing shorter than the pattern itself.
test "Robots: matchPattern - simple prefix" {
|
test "Robots: matchPattern - simple prefix" {
|
||||||
try std.testing.expect(matchPattern("/admin", "/admin/page") != null);
|
try std.testing.expect(testMatch("/admin", "/admin/page"));
|
||||||
try std.testing.expect(matchPattern("/admin", "/admin") != null);
|
try std.testing.expect(testMatch("/admin", "/admin"));
|
||||||
try std.testing.expect(matchPattern("/admin", "/other") == null);
|
try std.testing.expect(!testMatch("/admin", "/other"));
|
||||||
try std.testing.expect(matchPattern("/admin/page", "/admin") == null);
|
try std.testing.expect(!testMatch("/admin/page", "/admin"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// A trailing `*` accepts zero or more characters after the literal part.
test "Robots: matchPattern - single wildcard" {
|
test "Robots: matchPattern - single wildcard" {
|
||||||
try std.testing.expect(matchPattern("/admin/*", "/admin/") != null);
|
try std.testing.expect(testMatch("/admin/*", "/admin/"));
|
||||||
try std.testing.expect(matchPattern("/admin/*", "/admin/page") != null);
|
try std.testing.expect(testMatch("/admin/*", "/admin/page"));
|
||||||
try std.testing.expect(matchPattern("/admin/*", "/admin/page/subpage") != null);
|
try std.testing.expect(testMatch("/admin/*", "/admin/page/subpage"));
|
||||||
try std.testing.expect(matchPattern("/admin/*", "/other/page") == null);
|
try std.testing.expect(!testMatch("/admin/*", "/other/page"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// A `*` between literal parts may span several path segments, but the literal
// text on both sides of it must still be present.
test "Robots: matchPattern - wildcard in middle" {
|
test "Robots: matchPattern - wildcard in middle" {
|
||||||
try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/xyz") != null);
|
try std.testing.expect(testMatch("/abc/*/xyz", "/abc/def/xyz"));
|
||||||
try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def/ghi/xyz") != null);
|
try std.testing.expect(testMatch("/abc/*/xyz", "/abc/def/ghi/xyz"));
|
||||||
try std.testing.expect(matchPattern("/abc/*/xyz", "/abc/def") == null);
|
try std.testing.expect(!testMatch("/abc/*/xyz", "/abc/def"));
|
||||||
try std.testing.expect(matchPattern("/abc/*/xyz", "/other/def/xyz") == null);
|
try std.testing.expect(!testMatch("/abc/*/xyz", "/other/def/xyz"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// The text consumed by `*` may itself look like the literal that follows it
// ("/abc/def/def/xyz" against "/abc/*/def/xyz").
test "Robots: matchPattern - complex wildcard case" {
|
test "Robots: matchPattern - complex wildcard case" {
|
||||||
try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/def/def/xyz") != null);
|
try std.testing.expect(testMatch("/abc/*/def/xyz", "/abc/def/def/xyz"));
|
||||||
try std.testing.expect(matchPattern("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz") != null);
|
try std.testing.expect(testMatch("/abc/*/def/xyz", "/abc/ANYTHING/def/xyz"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Several `*` in one pattern each match independently; without a trailing `$`,
// "/*.php" matches at any directory depth.
test "Robots: matchPattern - multiple wildcards" {
|
test "Robots: matchPattern - multiple wildcards" {
|
||||||
try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/b/y/c") != null);
|
try std.testing.expect(testMatch("/a/*/b/*/c", "/a/x/b/y/c"));
|
||||||
try std.testing.expect(matchPattern("/a/*/b/*/c", "/a/x/y/b/z/w/c") != null);
|
try std.testing.expect(testMatch("/a/*/b/*/c", "/a/x/y/b/z/w/c"));
|
||||||
try std.testing.expect(matchPattern("/*.php", "/index.php") != null);
|
try std.testing.expect(testMatch("/*.php", "/index.php"));
|
||||||
try std.testing.expect(matchPattern("/*.php", "/admin/index.php") != null);
|
try std.testing.expect(testMatch("/*.php", "/admin/index.php"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// A trailing `$` requires the path to end exactly where the pattern ends,
// both with and without a `*` in the pattern.
test "Robots: matchPattern - end anchor" {
|
test "Robots: matchPattern - end anchor" {
|
||||||
try std.testing.expect(matchPattern("/*.php$", "/index.php") != null);
|
try std.testing.expect(testMatch("/*.php$", "/index.php"));
|
||||||
try std.testing.expect(matchPattern("/*.php$", "/index.php?param=value") == null);
|
try std.testing.expect(!testMatch("/*.php$", "/index.php?param=value"));
|
||||||
try std.testing.expect(matchPattern("/admin$", "/admin") != null);
|
try std.testing.expect(testMatch("/admin$", "/admin"));
|
||||||
try std.testing.expect(matchPattern("/admin$", "/admin/") == null);
|
try std.testing.expect(!testMatch("/admin$", "/admin/"));
|
||||||
try std.testing.expect(matchPattern("/fish$", "/fish") != null);
|
try std.testing.expect(testMatch("/fish$", "/fish"));
|
||||||
try std.testing.expect(matchPattern("/fish$", "/fishheads") == null);
|
try std.testing.expect(!testMatch("/fish$", "/fishheads"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// A `*` sitting between a literal prefix and a literal suffix: the suffix
// must still appear in the path for the pattern to match.
test "Robots: matchPattern - wildcard with extension" {
|
test "Robots: matchPattern - wildcard with extension" {
|
||||||
try std.testing.expect(matchPattern("/fish*.php", "/fish.php") != null);
|
try std.testing.expect(testMatch("/fish*.php", "/fish.php"));
|
||||||
try std.testing.expect(matchPattern("/fish*.php", "/fishheads.php") != null);
|
try std.testing.expect(testMatch("/fish*.php", "/fishheads.php"));
|
||||||
try std.testing.expect(matchPattern("/fish*.php", "/fish/salmon.php") != null);
|
try std.testing.expect(testMatch("/fish*.php", "/fish/salmon.php"));
|
||||||
try std.testing.expect(matchPattern("/fish*.php", "/fish.asp") == null);
|
try std.testing.expect(!testMatch("/fish*.php", "/fish.asp"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Degenerate patterns: the empty pattern, a lone "/", a bare `*`, "/*", and a
// bare `$` (which matches only the empty path).
test "Robots: matchPattern - empty and edge cases" {
|
test "Robots: matchPattern - empty and edge cases" {
|
||||||
try std.testing.expect(matchPattern("", "/anything") != null);
|
try std.testing.expect(testMatch("", "/anything"));
|
||||||
try std.testing.expect(matchPattern("/", "/") != null);
|
try std.testing.expect(testMatch("/", "/"));
|
||||||
try std.testing.expect(matchPattern("*", "/anything") != null);
|
try std.testing.expect(testMatch("*", "/anything"));
|
||||||
try std.testing.expect(matchPattern("/*", "/anything") != null);
|
try std.testing.expect(testMatch("/*", "/anything"));
|
||||||
try std.testing.expect(matchPattern("$", "") != null);
|
try std.testing.expect(testMatch("$", ""));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pattern shapes grouped under "real world examples": the root path, a
// directory prefix, an anchored file extension, and a wildcard followed by `?`.
test "Robots: matchPattern - real world examples" {
|
test "Robots: matchPattern - real world examples" {
|
||||||
try std.testing.expect(matchPattern("/", "/anything") != null);
|
try std.testing.expect(testMatch("/", "/anything"));
|
||||||
|
|
||||||
try std.testing.expect(matchPattern("/admin/", "/admin/page") != null);
|
try std.testing.expect(testMatch("/admin/", "/admin/page"));
|
||||||
try std.testing.expect(matchPattern("/admin/", "/public/page") == null);
|
try std.testing.expect(!testMatch("/admin/", "/public/page"));
|
||||||
|
|
||||||
try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf") != null);
|
try std.testing.expect(testMatch("/*.pdf$", "/document.pdf"));
|
||||||
try std.testing.expect(matchPattern("/*.pdf$", "/document.pdf.bak") == null);
|
try std.testing.expect(!testMatch("/*.pdf$", "/document.pdf.bak"));
|
||||||
|
|
||||||
try std.testing.expect(matchPattern("/*?", "/page?param=value") != null);
|
try std.testing.expect(testMatch("/*?", "/page?param=value"));
|
||||||
try std.testing.expect(matchPattern("/*?", "/page") == null);
|
try std.testing.expect(!testMatch("/*?", "/page"));
|
||||||
}
|
}
|
||||||
|
|
||||||
test "Robots: isAllowed - basic allow/disallow" {
|
test "Robots: isAllowed - basic allow/disallow" {
|
||||||
|
|||||||
Reference in New Issue
Block a user