Files
browser/src/browser/URL.zig
Pierre Tachoire ba9777e754
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / wba-demo-scripts (push) Has been cancelled
e2e-test / wba-test (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
Merge pull request #1609 from lightpanda-io/web-bot-auth
Web Bot Auth
2026-03-13 08:31:25 +01:00

1417 lines
48 KiB
Zig

// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const Allocator = std.mem.Allocator;
const ResolveOpts = struct {
encode: bool = false,
always_dupe: bool = false,
};
// path is anytype, so that it can be used with both []const u8 and [:0]const u8
pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime opts: ResolveOpts) ![:0]const u8 {
const PT = @TypeOf(path);
if (base.len == 0 or isCompleteHTTPUrl(path)) {
if (comptime opts.always_dupe or !isNullTerminated(PT)) {
const duped = try allocator.dupeZ(u8, path);
return processResolved(allocator, duped, opts);
}
if (comptime opts.encode) {
return processResolved(allocator, path, opts);
}
return path;
}
if (path.len == 0) {
if (comptime opts.always_dupe) {
const duped = try allocator.dupeZ(u8, base);
return processResolved(allocator, duped, opts);
}
if (comptime opts.encode) {
return processResolved(allocator, base, opts);
}
return base;
}
if (path[0] == '?') {
const base_path_end = std.mem.indexOfAny(u8, base, "?#") orelse base.len;
const result = try std.mem.joinZ(allocator, "", &.{ base[0..base_path_end], path });
return processResolved(allocator, result, opts);
}
if (path[0] == '#') {
const base_fragment_start = std.mem.indexOfScalar(u8, base, '#') orelse base.len;
const result = try std.mem.joinZ(allocator, "", &.{ base[0..base_fragment_start], path });
return processResolved(allocator, result, opts);
}
if (std.mem.startsWith(u8, path, "//")) {
// network-path reference
const index = std.mem.indexOfScalar(u8, base, ':') orelse {
if (comptime isNullTerminated(PT)) {
if (comptime opts.encode) {
return processResolved(allocator, path, opts);
}
return path;
}
const duped = try allocator.dupeZ(u8, path);
return processResolved(allocator, duped, opts);
};
const protocol = base[0 .. index + 1];
const result = try std.mem.joinZ(allocator, "", &.{ protocol, path });
return processResolved(allocator, result, opts);
}
const scheme_end = std.mem.indexOf(u8, base, "://");
const authority_start = if (scheme_end) |end| end + 3 else 0;
const path_start = std.mem.indexOfScalarPos(u8, base, authority_start, '/') orelse base.len;
if (path[0] == '/') {
const result = try std.mem.joinZ(allocator, "", &.{ base[0..path_start], path });
return processResolved(allocator, result, opts);
}
var normalized_base: []const u8 = base[0..path_start];
if (path_start < base.len) {
if (std.mem.lastIndexOfScalar(u8, base[path_start + 1 ..], '/')) |pos| {
normalized_base = base[0 .. path_start + 1 + pos];
}
}
// trailing space so that we always have space to append the null terminator
// and so that we can compare the next two characters without needing to length check
var out = try std.mem.join(allocator, "", &.{ normalized_base, "/", path, " " });
const end = out.len - 2;
const path_marker = path_start + 1;
// Strip out ./ and ../. This is done in-place, because doing so can
// only ever make `out` smaller. After this, `out` cannot be freed by
// an allocator, which is ok, because we expect allocator to be an arena.
var in_i: usize = 0;
var out_i: usize = 0;
while (in_i < end) {
if (out[in_i] == '.' and (out_i == 0 or out[out_i - 1] == '/')) {
if (out[in_i + 1] == '/') { // always safe, because we added a whitespace
// /./
in_i += 2;
continue;
}
if (out[in_i + 1] == '.' and out[in_i + 2] == '/') { // always safe, because we added two whitespaces
// /../
if (out_i > path_marker) {
// go back before the /
out_i -= 2;
while (out_i > 1 and out[out_i - 1] != '/') {
out_i -= 1;
}
} else {
// if out_i == path_marker, than we've reached the start of
// the path. We can't ../ any more. E.g.:
// http://www.example.com/../hello.
// You might think that's an error, but, at least with
// new URL('../hello', 'http://www.example.com/')
// it just ignores the extra ../
}
in_i += 3;
continue;
}
if (in_i == end - 1) {
// ignore trailing dot
break;
}
}
const c = out[in_i];
out[out_i] = c;
in_i += 1;
out_i += 1;
}
// we always have an extra space
out[out_i] = 0;
return processResolved(allocator, out[0..out_i :0], opts);
}
fn processResolved(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 {
if (!comptime opts.encode) {
return url;
}
return ensureEncoded(allocator, url);
}
pub fn ensureEncoded(allocator: Allocator, url: [:0]const u8) ![:0]const u8 {
const scheme_end = std.mem.indexOf(u8, url, "://");
const authority_start = if (scheme_end) |end| end + 3 else 0;
const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;
const query_start = std.mem.indexOfScalarPos(u8, url, path_start, '?');
const fragment_start = std.mem.indexOfScalarPos(u8, url, query_start orelse path_start, '#');
const path_end = query_start orelse fragment_start orelse url.len;
const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end;
const path_to_encode = url[path_start..path_end];
const encoded_path = try percentEncodeSegment(allocator, path_to_encode, .path);
const encoded_query = if (query_start) |qs| blk: {
const query_to_encode = url[qs + 1 .. query_end];
const encoded = try percentEncodeSegment(allocator, query_to_encode, .query);
break :blk encoded;
} else null;
const encoded_fragment = if (fragment_start) |fs| blk: {
const fragment_to_encode = url[fs + 1 ..];
const encoded = try percentEncodeSegment(allocator, fragment_to_encode, .query);
break :blk encoded;
} else null;
if (encoded_path.ptr == path_to_encode.ptr and
(encoded_query == null or encoded_query.?.ptr == url[query_start.? + 1 .. query_end].ptr) and
(encoded_fragment == null or encoded_fragment.?.ptr == url[fragment_start.? + 1 ..].ptr))
{
// nothing has changed
return url;
}
var buf = try std.ArrayList(u8).initCapacity(allocator, url.len + 20);
try buf.appendSlice(allocator, url[0..path_start]);
try buf.appendSlice(allocator, encoded_path);
if (encoded_query) |eq| {
try buf.append(allocator, '?');
try buf.appendSlice(allocator, eq);
}
if (encoded_fragment) |ef| {
try buf.append(allocator, '#');
try buf.appendSlice(allocator, ef);
}
try buf.append(allocator, 0);
return buf.items[0 .. buf.items.len - 1 :0];
}
const EncodeSet = enum { path, query, userinfo };
fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime encode_set: EncodeSet) ![]const u8 {
// Check if encoding is needed
var needs_encoding = false;
for (segment) |c| {
if (shouldPercentEncode(c, encode_set)) {
needs_encoding = true;
break;
}
}
if (!needs_encoding) {
return segment;
}
var buf = try std.ArrayList(u8).initCapacity(allocator, segment.len + 10);
var i: usize = 0;
while (i < segment.len) : (i += 1) {
const c = segment[i];
// Check if this is an already-encoded sequence (%XX)
if (c == '%' and i + 2 < segment.len) {
const end = i + 2;
const h1 = segment[i + 1];
const h2 = segment[end];
if (std.ascii.isHex(h1) and std.ascii.isHex(h2)) {
try buf.appendSlice(allocator, segment[i .. end + 1]);
i = end;
continue;
}
}
if (shouldPercentEncode(c, encode_set)) {
try buf.writer(allocator).print("%{X:0>2}", .{c});
} else {
try buf.append(allocator, c);
}
}
return buf.items;
}
fn shouldPercentEncode(c: u8, comptime encode_set: EncodeSet) bool {
return switch (c) {
// Unreserved characters (RFC 3986)
'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false,
// sub-delims allowed in path/query but some must be encoded in userinfo
'!', '$', '&', '\'', '(', ')', '*', '+', ',' => false,
';', '=' => encode_set == .userinfo,
// Separators: userinfo must encode these
'/', ':', '@' => encode_set == .userinfo,
// '?' is allowed in queries but not in paths or userinfo
'?' => encode_set != .query,
// Everything else needs encoding (including space)
else => true,
};
}
fn isNullTerminated(comptime value: type) bool {
return @typeInfo(value).pointer.sentinel_ptr != null;
}
pub fn isCompleteHTTPUrl(url: []const u8) bool {
if (url.len < 3) { // Minimum is "x://"
return false;
}
// very common case
if (url[0] == '/') {
return false;
}
// blob: and data: URLs are complete but don't follow scheme:// pattern
if (std.mem.startsWith(u8, url, "blob:") or std.mem.startsWith(u8, url, "data:")) {
return true;
}
// Check if there's a scheme (protocol) ending with ://
const colon_pos = std.mem.indexOfScalar(u8, url, ':') orelse return false;
// Check if it's followed by //
if (colon_pos + 2 >= url.len or url[colon_pos + 1] != '/' or url[colon_pos + 2] != '/') {
return false;
}
// Validate that everything before the colon is a valid scheme
// A scheme must start with a letter and contain only letters, digits, +, -, .
if (colon_pos == 0) {
return false;
}
const scheme = url[0..colon_pos];
if (!std.ascii.isAlphabetic(scheme[0])) {
return false;
}
for (scheme[1..]) |c| {
if (!std.ascii.isAlphanumeric(c) and c != '+' and c != '-' and c != '.') {
return false;
}
}
return true;
}
pub fn getUsername(raw: [:0]const u8) []const u8 {
const user_info = getUserInfo(raw) orelse return "";
const pos = std.mem.indexOfScalarPos(u8, user_info, 0, ':') orelse return user_info;
return user_info[0..pos];
}
pub fn getPassword(raw: [:0]const u8) []const u8 {
const user_info = getUserInfo(raw) orelse return "";
const pos = std.mem.indexOfScalarPos(u8, user_info, 0, ':') orelse return "";
return user_info[pos + 1 ..];
}
pub fn getPathname(raw: [:0]const u8) []const u8 {
const protocol_end = std.mem.indexOf(u8, raw, "://") orelse 0;
const path_start = std.mem.indexOfScalarPos(u8, raw, if (protocol_end > 0) protocol_end + 3 else 0, '/') orelse raw.len;
const query_or_hash_start = std.mem.indexOfAnyPos(u8, raw, path_start, "?#") orelse raw.len;
if (path_start >= query_or_hash_start) {
if (std.mem.indexOf(u8, raw, "://") != null) return "/";
return "";
}
return raw[path_start..query_or_hash_start];
}
pub fn getProtocol(raw: [:0]const u8) []const u8 {
const pos = std.mem.indexOfScalarPos(u8, raw, 0, ':') orelse return "";
return raw[0 .. pos + 1];
}
pub fn isHTTPS(raw: [:0]const u8) bool {
return std.mem.startsWith(u8, raw, "https:");
}
pub fn getHostname(raw: [:0]const u8) []const u8 {
const host = getHost(raw);
const pos = std.mem.lastIndexOfScalar(u8, host, ':') orelse return host;
return host[0..pos];
}
pub fn getPort(raw: [:0]const u8) []const u8 {
const host = getHost(raw);
const pos = std.mem.lastIndexOfScalar(u8, host, ':') orelse return "";
if (pos + 1 >= host.len) {
return "";
}
for (host[pos + 1 ..]) |c| {
if (c < '0' or c > '9') {
return "";
}
}
return host[pos + 1 ..];
}
pub fn getSearch(raw: [:0]const u8) []const u8 {
const pos = std.mem.indexOfScalarPos(u8, raw, 0, '?') orelse return "";
const query_part = raw[pos..];
if (std.mem.indexOfScalarPos(u8, query_part, 0, '#')) |fragment_start| {
return query_part[0..fragment_start];
}
return query_part;
}
pub fn getHash(raw: [:0]const u8) []const u8 {
const start = std.mem.indexOfScalarPos(u8, raw, 0, '#') orelse return "";
return raw[start..];
}
pub fn getOrigin(allocator: Allocator, raw: [:0]const u8) !?[]const u8 {
const scheme_end = std.mem.indexOf(u8, raw, "://") orelse return null;
// Only HTTP and HTTPS schemes have origins
const protocol = raw[0 .. scheme_end + 1];
if (!std.mem.eql(u8, protocol, "http:") and !std.mem.eql(u8, protocol, "https:")) {
return null;
}
var authority_start = scheme_end + 3;
const has_user_info = if (std.mem.indexOf(u8, raw[authority_start..], "@")) |pos| blk: {
authority_start += pos + 1;
break :blk true;
} else false;
// Find end of authority (start of path/query/fragment or end of string)
const authority_end_relative = std.mem.indexOfAny(u8, raw[authority_start..], "/?#");
const authority_end = if (authority_end_relative) |end|
authority_start + end
else
raw.len;
// Check for port in the host:port section
const host_part = raw[authority_start..authority_end];
if (std.mem.lastIndexOfScalar(u8, host_part, ':')) |colon_pos_in_host| {
const port = host_part[colon_pos_in_host + 1 ..];
// Validate it's actually a port (all digits)
for (port) |c| {
if (c < '0' or c > '9') {
// Not a port (probably IPv6)
if (has_user_info) {
// Need to allocate to exclude user info
return try std.fmt.allocPrint(allocator, "{s}//{s}", .{ raw[0 .. scheme_end + 1], host_part });
}
// Can return a slice
return raw[0..authority_end];
}
}
// Check if it's a default port that should be excluded from origin
const is_default =
(std.mem.eql(u8, protocol, "http:") and std.mem.eql(u8, port, "80")) or
(std.mem.eql(u8, protocol, "https:") and std.mem.eql(u8, port, "443"));
if (is_default or has_user_info) {
// Need to allocate to build origin without default port and/or user info
const hostname = host_part[0..colon_pos_in_host];
if (is_default) {
return try std.fmt.allocPrint(allocator, "{s}//{s}", .{ protocol, hostname });
} else {
return try std.fmt.allocPrint(allocator, "{s}//{s}", .{ protocol, host_part });
}
}
} else if (has_user_info) {
// No port, but has user info - need to allocate
return try std.fmt.allocPrint(allocator, "{s}//{s}", .{ raw[0 .. scheme_end + 1], host_part });
}
// Common case: no user info, no default port - return slice (zero allocation!)
return raw[0..authority_end];
}
fn getUserInfo(raw: [:0]const u8) ?[]const u8 {
const scheme_end = std.mem.indexOf(u8, raw, "://") orelse return null;
const authority_start = scheme_end + 3;
const pos = std.mem.indexOfScalar(u8, raw[authority_start..], '@') orelse return null;
const path_start = std.mem.indexOfScalarPos(u8, raw, authority_start, '/') orelse raw.len;
const full_pos = authority_start + pos;
if (full_pos < path_start) {
return raw[authority_start..full_pos];
}
return null;
}
pub fn getHost(raw: [:0]const u8) []const u8 {
const scheme_end = std.mem.indexOf(u8, raw, "://") orelse return "";
var authority_start = scheme_end + 3;
if (std.mem.indexOf(u8, raw[authority_start..], "@")) |pos| {
authority_start += pos + 1;
}
const authority = raw[authority_start..];
const path_start = std.mem.indexOfAny(u8, authority, "/?#") orelse return authority;
return authority[0..path_start];
}
// Returns true if these two URLs point to the same document.
pub fn eqlDocument(first: [:0]const u8, second: [:0]const u8) bool {
// First '#' signifies the start of the fragment.
const first_hash_index = std.mem.indexOfScalar(u8, first, '#') orelse first.len;
const second_hash_index = std.mem.indexOfScalar(u8, second, '#') orelse second.len;
return std.mem.eql(u8, first[0..first_hash_index], second[0..second_hash_index]);
}
// Helper function to build a URL from components
pub fn buildUrl(
allocator: Allocator,
protocol: []const u8,
host: []const u8,
pathname: []const u8,
search: []const u8,
hash: []const u8,
) ![:0]const u8 {
return std.fmt.allocPrintSentinel(allocator, "{s}//{s}{s}{s}{s}", .{
protocol,
host,
pathname,
search,
hash,
}, 0);
}
pub fn setProtocol(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const host = getHost(current);
const pathname = getPathname(current);
const search = getSearch(current);
const hash = getHash(current);
// Add : suffix if not present
const protocol = if (value.len > 0 and value[value.len - 1] != ':')
try std.fmt.allocPrint(allocator, "{s}:", .{value})
else
value;
return buildUrl(allocator, protocol, host, pathname, search, hash);
}
pub fn setHost(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const protocol = getProtocol(current);
const pathname = getPathname(current);
const search = getSearch(current);
const hash = getHash(current);
// Check if the new value includes a port
const colon_pos = std.mem.lastIndexOfScalar(u8, value, ':');
const clean_host = if (colon_pos) |pos| blk: {
const port_str = value[pos + 1 ..];
// Remove default ports
if (std.mem.eql(u8, protocol, "https:") and std.mem.eql(u8, port_str, "443")) {
break :blk value[0..pos];
}
if (std.mem.eql(u8, protocol, "http:") and std.mem.eql(u8, port_str, "80")) {
break :blk value[0..pos];
}
break :blk value;
} else blk: {
// No port in new value - preserve existing port
const current_port = getPort(current);
if (current_port.len > 0) {
break :blk try std.fmt.allocPrint(allocator, "{s}:{s}", .{ value, current_port });
}
break :blk value;
};
return buildUrl(allocator, protocol, clean_host, pathname, search, hash);
}
pub fn setHostname(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const current_port = getPort(current);
const new_host = if (current_port.len > 0)
try std.fmt.allocPrint(allocator, "{s}:{s}", .{ value, current_port })
else
value;
return setHost(current, new_host, allocator);
}
pub fn setPort(current: [:0]const u8, value: ?[]const u8, allocator: Allocator) ![:0]const u8 {
const hostname = getHostname(current);
const protocol = getProtocol(current);
const pathname = getPathname(current);
const search = getSearch(current);
const hash = getHash(current);
// Handle null or default ports
const new_host = if (value) |port_str| blk: {
if (port_str.len == 0) {
break :blk hostname;
}
// Check if this is a default port for the protocol
if (std.mem.eql(u8, protocol, "https:") and std.mem.eql(u8, port_str, "443")) {
break :blk hostname;
}
if (std.mem.eql(u8, protocol, "http:") and std.mem.eql(u8, port_str, "80")) {
break :blk hostname;
}
break :blk try std.fmt.allocPrint(allocator, "{s}:{s}", .{ hostname, port_str });
} else hostname;
return buildUrl(allocator, protocol, new_host, pathname, search, hash);
}
pub fn setPathname(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const protocol = getProtocol(current);
const host = getHost(current);
const search = getSearch(current);
const hash = getHash(current);
// Add / prefix if not present and value is not empty
const pathname = if (value.len > 0 and value[0] != '/')
try std.fmt.allocPrint(allocator, "/{s}", .{value})
else
value;
return buildUrl(allocator, protocol, host, pathname, search, hash);
}
pub fn setSearch(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const protocol = getProtocol(current);
const host = getHost(current);
const pathname = getPathname(current);
const hash = getHash(current);
// Add ? prefix if not present and value is not empty
const search = if (value.len > 0 and value[0] != '?')
try std.fmt.allocPrint(allocator, "?{s}", .{value})
else
value;
return buildUrl(allocator, protocol, host, pathname, search, hash);
}
pub fn setHash(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const protocol = getProtocol(current);
const host = getHost(current);
const pathname = getPathname(current);
const search = getSearch(current);
// Add # prefix if not present and value is not empty
const hash = if (value.len > 0 and value[0] != '#')
try std.fmt.allocPrint(allocator, "#{s}", .{value})
else
value;
return buildUrl(allocator, protocol, host, pathname, search, hash);
}
pub fn setUsername(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const protocol = getProtocol(current);
const host = getHost(current);
const pathname = getPathname(current);
const search = getSearch(current);
const hash = getHash(current);
const password = getPassword(current);
const encoded_username = try percentEncodeSegment(allocator, value, .userinfo);
return buildUrlWithUserInfo(allocator, protocol, encoded_username, password, host, pathname, search, hash);
}
pub fn setPassword(current: [:0]const u8, value: []const u8, allocator: Allocator) ![:0]const u8 {
const protocol = getProtocol(current);
const host = getHost(current);
const pathname = getPathname(current);
const search = getSearch(current);
const hash = getHash(current);
const username = getUsername(current);
const encoded_password = try percentEncodeSegment(allocator, value, .userinfo);
return buildUrlWithUserInfo(allocator, protocol, username, encoded_password, host, pathname, search, hash);
}
fn buildUrlWithUserInfo(
allocator: Allocator,
protocol: []const u8,
username: []const u8,
password: []const u8,
host: []const u8,
pathname: []const u8,
search: []const u8,
hash: []const u8,
) ![:0]const u8 {
if (username.len == 0 and password.len == 0) {
return buildUrl(allocator, protocol, host, pathname, search, hash);
} else if (password.len == 0) {
return std.fmt.allocPrintSentinel(allocator, "{s}//{s}@{s}{s}{s}{s}", .{
protocol,
username,
host,
pathname,
search,
hash,
}, 0);
} else {
return std.fmt.allocPrintSentinel(allocator, "{s}//{s}:{s}@{s}{s}{s}{s}", .{
protocol,
username,
password,
host,
pathname,
search,
hash,
}, 0);
}
}
pub fn concatQueryString(arena: Allocator, url: []const u8, query_string: []const u8) ![:0]const u8 {
if (query_string.len == 0) {
return arena.dupeZ(u8, url);
}
var buf: std.ArrayList(u8) = .empty;
// the most space well need is the url + ('?' or '&') + the query_string + null terminator
try buf.ensureTotalCapacity(arena, url.len + 2 + query_string.len);
buf.appendSliceAssumeCapacity(url);
if (std.mem.indexOfScalar(u8, url, '?')) |index| {
const last_index = url.len - 1;
if (index != last_index and url[last_index] != '&') {
buf.appendAssumeCapacity('&');
}
} else {
buf.appendAssumeCapacity('?');
}
buf.appendSliceAssumeCapacity(query_string);
buf.appendAssumeCapacity(0);
return buf.items[0 .. buf.items.len - 1 :0];
}
pub fn getRobotsUrl(arena: Allocator, url: [:0]const u8) ![:0]const u8 {
const origin = try getOrigin(arena, url) orelse return error.NoOrigin;
return try std.fmt.allocPrintSentinel(
arena,
"{s}/robots.txt",
.{origin},
0,
);
}
pub fn unescape(arena: Allocator, input: []const u8) ![]const u8 {
if (std.mem.indexOfScalar(u8, input, '%') == null) {
return input;
}
var result = try std.ArrayList(u8).initCapacity(arena, input.len);
var i: usize = 0;
while (i < input.len) {
if (input[i] == '%' and i + 2 < input.len) {
const hex = input[i + 1 .. i + 3];
const byte = std.fmt.parseInt(u8, hex, 16) catch {
result.appendAssumeCapacity(input[i]);
i += 1;
continue;
};
result.appendAssumeCapacity(byte);
i += 3;
} else {
result.appendAssumeCapacity(input[i]);
i += 1;
}
}
return result.items;
}
const testing = @import("../testing.zig");
test "URL: isCompleteHTTPUrl" {
try testing.expectEqual(true, isCompleteHTTPUrl("http://example.com/about"));
try testing.expectEqual(true, isCompleteHTTPUrl("HttP://example.com/about"));
try testing.expectEqual(true, isCompleteHTTPUrl("httpS://example.com/about"));
try testing.expectEqual(true, isCompleteHTTPUrl("HTTPs://example.com/about"));
try testing.expectEqual(true, isCompleteHTTPUrl("ftp://example.com/about"));
try testing.expectEqual(false, isCompleteHTTPUrl("/example.com"));
try testing.expectEqual(false, isCompleteHTTPUrl("../../about"));
try testing.expectEqual(false, isCompleteHTTPUrl("about"));
}
test "URL: resolve regression (#1093)" {
defer testing.reset();
const Case = struct {
base: [:0]const u8,
path: [:0]const u8,
expected: [:0]const u8,
};
const cases = [_]Case{
.{
.base = "https://alas.aws.amazon.com/alas2.html",
.path = "../static/bootstrap.min.css",
.expected = "https://alas.aws.amazon.com/static/bootstrap.min.css",
},
};
for (cases) |case| {
const result = try resolve(testing.arena_allocator, case.base, case.path, .{});
try testing.expectString(case.expected, result);
}
}
test "URL: resolve" {
defer testing.reset();
const Case = struct {
base: [:0]const u8,
path: [:0]const u8,
expected: [:0]const u8,
};
const cases = [_]Case{
.{
.base = "https://example/dir",
.path = "abc../test",
.expected = "https://example/abc../test",
},
.{
.base = "https://example/dir",
.path = "abc.",
.expected = "https://example/abc.",
},
.{
.base = "https://example/dir",
.path = "abc/.",
.expected = "https://example/abc/",
},
.{
.base = "https://example/xyz/abc/123",
.path = "something.js",
.expected = "https://example/xyz/abc/something.js",
},
.{
.base = "https://example/xyz/abc/123",
.path = "/something.js",
.expected = "https://example/something.js",
},
.{
.base = "https://example/",
.path = "something.js",
.expected = "https://example/something.js",
},
.{
.base = "https://example/",
.path = "/something.js",
.expected = "https://example/something.js",
},
.{
.base = "https://example",
.path = "something.js",
.expected = "https://example/something.js",
},
.{
.base = "https://example",
.path = "abc/something.js",
.expected = "https://example/abc/something.js",
},
.{
.base = "https://example/nested",
.path = "abc/something.js",
.expected = "https://example/abc/something.js",
},
.{
.base = "https://example/nested/",
.path = "abc/something.js",
.expected = "https://example/nested/abc/something.js",
},
.{
.base = "https://example/nested/",
.path = "/abc/something.js",
.expected = "https://example/abc/something.js",
},
.{
.base = "https://example/nested/",
.path = "http://www.github.com/example/",
.expected = "http://www.github.com/example/",
},
.{
.base = "https://example/nested/",
.path = "",
.expected = "https://example/nested/",
},
.{
.base = "https://example/abc/aaa",
.path = "./hello/./world",
.expected = "https://example/abc/hello/world",
},
.{
.base = "https://example/abc/aaa/",
.path = "../hello",
.expected = "https://example/abc/hello",
},
.{
.base = "https://example/abc/aaa",
.path = "../hello",
.expected = "https://example/hello",
},
.{
.base = "https://example/abc/aaa/",
.path = "./.././.././hello",
.expected = "https://example/hello",
},
.{
.base = "some/page",
.path = "hello",
.expected = "some/hello",
},
.{
.base = "some/page/",
.path = "hello",
.expected = "some/page/hello",
},
.{
.base = "some/page/other",
.path = ".././hello",
.expected = "some/hello",
},
.{
.base = "https://www.example.com/hello/world",
.path = "//example/about",
.expected = "https://example/about",
},
.{
.base = "http:",
.path = "//example.com/over/9000",
.expected = "http://example.com/over/9000",
},
.{
.base = "https://example.com/",
.path = "../hello",
.expected = "https://example.com/hello",
},
.{
.base = "https://www.example.com/hello/world/",
.path = "../../../../example/about",
.expected = "https://www.example.com/example/about",
},
};
for (cases) |case| {
const result = try resolve(testing.arena_allocator, case.base, case.path, .{});
try testing.expectString(case.expected, result);
}
}
test "URL: ensureEncoded" {
defer testing.reset();
const Case = struct {
url: [:0]const u8,
expected: [:0]const u8,
};
const cases = [_]Case{
.{
.url = "https://example.com/over 9000!",
.expected = "https://example.com/over%209000!",
},
.{
.url = "http://example.com/hello world.html",
.expected = "http://example.com/hello%20world.html",
},
.{
.url = "https://example.com/file[1].html",
.expected = "https://example.com/file%5B1%5D.html",
},
.{
.url = "https://example.com/file{name}.html",
.expected = "https://example.com/file%7Bname%7D.html",
},
.{
.url = "https://example.com/page?query=hello world",
.expected = "https://example.com/page?query=hello%20world",
},
.{
.url = "https://example.com/page?a=1&b=value with spaces",
.expected = "https://example.com/page?a=1&b=value%20with%20spaces",
},
.{
.url = "https://example.com/page#section one",
.expected = "https://example.com/page#section%20one",
},
.{
.url = "https://example.com/my path?query=my value#my anchor",
.expected = "https://example.com/my%20path?query=my%20value#my%20anchor",
},
.{
.url = "https://example.com/already%20encoded",
.expected = "https://example.com/already%20encoded",
},
.{
.url = "https://example.com/file%5B1%5D.html",
.expected = "https://example.com/file%5B1%5D.html",
},
.{
.url = "https://example.com/caf%C3%A9",
.expected = "https://example.com/caf%C3%A9",
},
.{
.url = "https://example.com/page?query=already%20encoded",
.expected = "https://example.com/page?query=already%20encoded",
},
.{
.url = "https://example.com/page?a=1&b=value%20here",
.expected = "https://example.com/page?a=1&b=value%20here",
},
.{
.url = "https://example.com/page#section%20one",
.expected = "https://example.com/page#section%20one",
},
.{
.url = "https://example.com/part%20encoded and not",
.expected = "https://example.com/part%20encoded%20and%20not",
},
.{
.url = "https://example.com/page?a=encoded%20value&b=not encoded",
.expected = "https://example.com/page?a=encoded%20value&b=not%20encoded",
},
.{
.url = "https://example.com/my%20path?query=not encoded#encoded%20anchor",
.expected = "https://example.com/my%20path?query=not%20encoded#encoded%20anchor",
},
.{
.url = "https://example.com/fully%20encoded?query=also%20encoded#and%20this",
.expected = "https://example.com/fully%20encoded?query=also%20encoded#and%20this",
},
.{
.url = "https://example.com/path-with_under~tilde",
.expected = "https://example.com/path-with_under~tilde",
},
.{
.url = "https://example.com/sub-delims!$&'()*+,;=",
.expected = "https://example.com/sub-delims!$&'()*+,;=",
},
.{
.url = "https://example.com",
.expected = "https://example.com",
},
.{
.url = "https://example.com?query=value",
.expected = "https://example.com?query=value",
},
.{
.url = "https://example.com/clean/path",
.expected = "https://example.com/clean/path",
},
.{
.url = "https://example.com/path?clean=query#clean-fragment",
.expected = "https://example.com/path?clean=query#clean-fragment",
},
.{
.url = "https://example.com/100% complete",
.expected = "https://example.com/100%25%20complete",
},
.{
.url = "https://example.com/path?value=100% done",
.expected = "https://example.com/path?value=100%25%20done",
},
.{
.url = "about:blank",
.expected = "about:blank",
},
};
for (cases) |case| {
const result = try ensureEncoded(testing.arena_allocator, case.url);
try testing.expectString(case.expected, result);
}
}
test "URL: resolve with encoding" {
defer testing.reset();
const Case = struct {
base: [:0]const u8,
path: [:0]const u8,
expected: [:0]const u8,
};
const cases = [_]Case{
// Spaces should be encoded as %20, but ! is allowed
.{
.base = "https://example.com/dir/",
.path = "over 9000!",
.expected = "https://example.com/dir/over%209000!",
},
.{
.base = "https://example.com/",
.path = "hello world.html",
.expected = "https://example.com/hello%20world.html",
},
// Multiple spaces
.{
.base = "https://example.com/",
.path = "path with multiple spaces",
.expected = "https://example.com/path%20with%20%20multiple%20%20%20spaces",
},
// Special characters that need encoding
.{
.base = "https://example.com/",
.path = "file[1].html",
.expected = "https://example.com/file%5B1%5D.html",
},
.{
.base = "https://example.com/",
.path = "file{name}.html",
.expected = "https://example.com/file%7Bname%7D.html",
},
.{
.base = "https://example.com/",
.path = "file<test>.html",
.expected = "https://example.com/file%3Ctest%3E.html",
},
.{
.base = "https://example.com/",
.path = "file\"quote\".html",
.expected = "https://example.com/file%22quote%22.html",
},
.{
.base = "https://example.com/",
.path = "file|pipe.html",
.expected = "https://example.com/file%7Cpipe.html",
},
.{
.base = "https://example.com/",
.path = "file\\backslash.html",
.expected = "https://example.com/file%5Cbackslash.html",
},
.{
.base = "https://example.com/",
.path = "file^caret.html",
.expected = "https://example.com/file%5Ecaret.html",
},
.{
.base = "https://example.com/",
.path = "file`backtick`.html",
.expected = "https://example.com/file%60backtick%60.html",
},
// Characters that should NOT be encoded
.{
.base = "https://example.com/",
.path = "path-with_under~tilde.html",
.expected = "https://example.com/path-with_under~tilde.html",
},
.{
.base = "https://example.com/",
.path = "path/with/slashes",
.expected = "https://example.com/path/with/slashes",
},
.{
.base = "https://example.com/",
.path = "sub-delims!$&'()*+,;=.html",
.expected = "https://example.com/sub-delims!$&'()*+,;=.html",
},
// Already encoded characters should not be double-encoded
.{
.base = "https://example.com/",
.path = "already%20encoded",
.expected = "https://example.com/already%20encoded",
},
.{
.base = "https://example.com/",
.path = "file%5B1%5D.html",
.expected = "https://example.com/file%5B1%5D.html",
},
// Mix of encoded and unencoded
.{
.base = "https://example.com/",
.path = "part%20encoded and not",
.expected = "https://example.com/part%20encoded%20and%20not",
},
// Query strings and fragments ARE encoded
.{
.base = "https://example.com/",
.path = "file name.html?query=value with spaces",
.expected = "https://example.com/file%20name.html?query=value%20with%20spaces",
},
.{
.base = "https://example.com/",
.path = "file name.html#anchor with spaces",
.expected = "https://example.com/file%20name.html#anchor%20with%20spaces",
},
.{
.base = "https://example.com/",
.path = "file.html?hello=world !",
.expected = "https://example.com/file.html?hello=world%20!",
},
// Query structural characters should NOT be encoded
.{
.base = "https://example.com/",
.path = "file.html?a=1&b=2",
.expected = "https://example.com/file.html?a=1&b=2",
},
// Relative paths with encoding
.{
.base = "https://example.com/dir/page.html",
.path = "../other dir/file.html",
.expected = "https://example.com/other%20dir/file.html",
},
.{
.base = "https://example.com/dir/",
.path = "./sub dir/file.html",
.expected = "https://example.com/dir/sub%20dir/file.html",
},
// Absolute paths with encoding
.{
.base = "https://example.com/some/path",
.path = "/absolute path/file.html",
.expected = "https://example.com/absolute%20path/file.html",
},
// Unicode/high bytes (though ideally these should be UTF-8 encoded first)
.{
.base = "https://example.com/",
.path = "café",
.expected = "https://example.com/caf%C3%A9",
},
// Empty path
.{
.base = "https://example.com/",
.path = "",
.expected = "https://example.com/",
},
// Complete URL as path (should not be encoded)
.{
.base = "https://example.com/",
.path = "https://other.com/path with spaces",
.expected = "https://other.com/path%20with%20spaces",
},
};
for (cases) |case| {
const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true });
try testing.expectString(case.expected, result);
}
}
test "URL: eqlDocument" {
defer testing.reset();
{
const url = "https://lightpanda.io/about";
try testing.expectEqual(true, eqlDocument(url, url));
}
{
const url1 = "https://lightpanda.io/about";
const url2 = "http://lightpanda.io/about";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about";
const url2 = "https://example.com/about";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io:8080/about";
const url2 = "https://lightpanda.io:9090/about";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about";
const url2 = "https://lightpanda.io/contact";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about?foo=bar";
const url2 = "https://lightpanda.io/about?baz=qux";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about#section1";
const url2 = "https://lightpanda.io/about#section2";
try testing.expectEqual(true, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about";
const url2 = "https://lightpanda.io/about/";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about?foo=bar";
const url2 = "https://lightpanda.io/about";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about";
const url2 = "https://lightpanda.io/about?foo=bar";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about?foo=bar";
const url2 = "https://lightpanda.io/about?foo=bar";
try testing.expectEqual(true, eqlDocument(url1, url2));
}
{
const url1 = "https://lightpanda.io/about?";
const url2 = "https://lightpanda.io/about";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
{
const url1 = "https://duckduckgo.com/";
const url2 = "https://duckduckgo.com/?q=lightpanda";
try testing.expectEqual(false, eqlDocument(url1, url2));
}
}
test "URL: concatQueryString" {
defer testing.reset();
const arena = testing.arena_allocator;
{
const url = try concatQueryString(arena, "https://www.lightpanda.io/", "");
try testing.expectEqual("https://www.lightpanda.io/", url);
}
{
const url = try concatQueryString(arena, "https://www.lightpanda.io/index?", "");
try testing.expectEqual("https://www.lightpanda.io/index?", url);
}
{
const url = try concatQueryString(arena, "https://www.lightpanda.io/index?", "a=b");
try testing.expectEqual("https://www.lightpanda.io/index?a=b", url);
}
{
const url = try concatQueryString(arena, "https://www.lightpanda.io/index?1=2", "a=b");
try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url);
}
{
const url = try concatQueryString(arena, "https://www.lightpanda.io/index?1=2&", "a=b");
try testing.expectEqual("https://www.lightpanda.io/index?1=2&a=b", url);
}
}
test "URL: getRobotsUrl" {
defer testing.reset();
const arena = testing.arena_allocator;
{
const url = try getRobotsUrl(arena, "https://www.lightpanda.io");
try testing.expectEqual("https://www.lightpanda.io/robots.txt", url);
}
{
const url = try getRobotsUrl(arena, "https://www.lightpanda.io/some/path");
try testing.expectString("https://www.lightpanda.io/robots.txt", url);
}
{
const url = try getRobotsUrl(arena, "https://www.lightpanda.io:8080/page");
try testing.expectString("https://www.lightpanda.io:8080/robots.txt", url);
}
{
const url = try getRobotsUrl(arena, "http://example.com/deep/nested/path?query=value#fragment");
try testing.expectString("http://example.com/robots.txt", url);
}
{
const url = try getRobotsUrl(arena, "https://user:pass@example.com/page");
try testing.expectString("https://example.com/robots.txt", url);
}
}
test "URL: unescape" {
defer testing.reset();
const arena = testing.arena_allocator;
{
const result = try unescape(arena, "hello world");
try testing.expectEqual("hello world", result);
}
{
const result = try unescape(arena, "hello%20world");
try testing.expectEqual("hello world", result);
}
{
const result = try unescape(arena, "%48%65%6c%6c%6f");
try testing.expectEqual("Hello", result);
}
{
const result = try unescape(arena, "%48%65%6C%6C%6F");
try testing.expectEqual("Hello", result);
}
{
const result = try unescape(arena, "a%3Db");
try testing.expectEqual("a=b", result);
}
{
const result = try unescape(arena, "a%3DB");
try testing.expectEqual("a=B", result);
}
{
const result = try unescape(arena, "ZDIgPSAndHdvJzs%3D");
try testing.expectEqual("ZDIgPSAndHdvJzs=", result);
}
{
const result = try unescape(arena, "%5a%44%4d%67%50%53%41%6e%64%47%68%79%5a%57%55%6e%4f%77%3D%3D");
try testing.expectEqual("ZDMgPSAndGhyZWUnOw==", result);
}
{
const result = try unescape(arena, "hello%2world");
try testing.expectEqual("hello%2world", result);
}
{
const result = try unescape(arena, "hello%ZZworld");
try testing.expectEqual("hello%ZZworld", result);
}
{
const result = try unescape(arena, "hello%");
try testing.expectEqual("hello%", result);
}
{
const result = try unescape(arena, "hello%2");
try testing.expectEqual("hello%2", result);
}
}
test "URL: getHost" {
try testing.expectEqualSlices(u8, "example.com:8080", getHost("https://example.com:8080/path"));
try testing.expectEqualSlices(u8, "example.com", getHost("https://example.com/path"));
try testing.expectEqualSlices(u8, "example.com:443", getHost("https://example.com:443/"));
try testing.expectEqualSlices(u8, "example.com", getHost("https://user:pass@example.com/page"));
try testing.expectEqualSlices(u8, "example.com:8080", getHost("https://user:pass@example.com:8080/page"));
try testing.expectEqualSlices(u8, "", getHost("not-a-url"));
}