From fcb3f08bcbd45abfb284380546d1e177550308d3 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Wed, 25 Feb 2026 08:17:05 +0800 Subject: [PATCH] Add url encoding option to URL.resolve Given: a.href = "over 9000!" Then: a.href === BASE_URL + '/over%209000!'; This commit adds an encode: bool option to URL.resolve which will percent-encode the path, query and fragment when true. Also changes the Anchor, Image, Link and IFrame getSrc to encode. Encoding is also used when navigating a frame. --- src/browser/Page.zig | 16 +- src/browser/URL.zig | 308 +++++++++++++++++- src/browser/tests/element/html/anchor.html | 8 + src/browser/tests/element/html/image.html | 9 + src/browser/tests/frames/frames.html | 3 +- .../frames/support/{sub1.html => sub 1.html} | 0 src/browser/webapi/element/html/Anchor.zig | 2 +- src/browser/webapi/element/html/IFrame.zig | 2 +- src/browser/webapi/element/html/Image.zig | 2 +- src/browser/webapi/element/html/Link.zig | 2 +- src/browser/webapi/element/html/Media.zig | 2 +- src/browser/webapi/element/html/Script.zig | 2 +- src/browser/webapi/element/html/Video.zig | 2 +- 13 files changed, 338 insertions(+), 20 deletions(-) rename src/browser/tests/frames/support/{sub1.html => sub 1.html} (100%) diff --git a/src/browser/Page.zig b/src/browser/Page.zig index 11ff9ebc..66a3e083 100644 --- a/src/browser/Page.zig +++ b/src/browser/Page.zig @@ -566,7 +566,7 @@ fn scheduleNavigationWithArena(self: *Page, arena: Allocator, request_url: []con arena, self.base(), request_url, - .{ .always_dupe = true }, + .{ .always_dupe = true, .encode = true }, ); const session = self._session; @@ -1203,7 +1203,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *Element.Html.IFrame) !void { return; } - const src = try iframe.getSrc(self); + const src = iframe.asElement().getAttributeSafe(comptime .wrap("src")) orelse return; if (src.len == 0) { return; } @@ -1225,8 +1225,16 @@ pub fn iframeAddedCallback(self: *Page, iframe: *Element.Html.IFrame) !void { .timestamp = 
timestamp(.monotonic), }); - page_frame.navigate(src, .{ .reason = .initialFrameNavigation }) catch |err| { - log.warn(.page, "iframe navigate failure", .{ .url = src, .err = err }); + // navigate will dupe the url + const url = try URL.resolve( + self.call_arena, + self.base(), + src, + .{ .encode = true }, + ); + + page_frame.navigate(url, .{ .reason = .initialFrameNavigation }) catch |err| { + log.warn(.page, "iframe navigate failure", .{ .url = url, .err = err }); self._pending_loads -= 1; iframe._content_window = null; page_frame.deinit(); diff --git a/src/browser/URL.zig b/src/browser/URL.zig index 716480b1..3a2a0514 100644 --- a/src/browser/URL.zig +++ b/src/browser/URL.zig @@ -20,44 +20,61 @@ const std = @import("std"); const Allocator = std.mem.Allocator; const ResolveOpts = struct { + encode: bool = false, always_dupe: bool = false, }; + // path is anytype, so that it can be used with both []const u8 and [:0]const u8 pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime opts: ResolveOpts) ![:0]const u8 { const PT = @TypeOf(path); if (base.len == 0 or isCompleteHTTPUrl(path)) { if (comptime opts.always_dupe or !isNullTerminated(PT)) { - return allocator.dupeZ(u8, path); + const duped = try allocator.dupeZ(u8, path); + return encodeURL(allocator, duped, opts); + } + if (comptime opts.encode) { + return encodeURL(allocator, path, opts); } return path; } if (path.len == 0) { if (comptime opts.always_dupe) { - return allocator.dupeZ(u8, base); + const duped = try allocator.dupeZ(u8, base); + return encodeURL(allocator, duped, opts); + } + if (comptime opts.encode) { + return encodeURL(allocator, base, opts); } return base; } if (path[0] == '?') { const base_path_end = std.mem.indexOfAny(u8, base, "?#") orelse base.len; - return std.mem.joinZ(allocator, "", &.{ base[0..base_path_end], path }); + const result = try std.mem.joinZ(allocator, "", &.{ base[0..base_path_end], path }); + return encodeURL(allocator, result, opts); } if 
(path[0] == '#') { const base_fragment_start = std.mem.indexOfScalar(u8, base, '#') orelse base.len; - return std.mem.joinZ(allocator, "", &.{ base[0..base_fragment_start], path }); + const result = try std.mem.joinZ(allocator, "", &.{ base[0..base_fragment_start], path }); + return encodeURL(allocator, result, opts); } if (std.mem.startsWith(u8, path, "//")) { // network-path reference const index = std.mem.indexOfScalar(u8, base, ':') orelse { if (comptime isNullTerminated(PT)) { + if (comptime opts.encode) { + return encodeURL(allocator, path, opts); + } return path; } - return allocator.dupeZ(u8, path); + const duped = try allocator.dupeZ(u8, path); + return encodeURL(allocator, duped, opts); }; const protocol = base[0 .. index + 1]; - return std.mem.joinZ(allocator, "", &.{ protocol, path }); + const result = try std.mem.joinZ(allocator, "", &.{ protocol, path }); + return encodeURL(allocator, result, opts); } const scheme_end = std.mem.indexOf(u8, base, "://"); @@ -65,7 +82,8 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime const path_start = std.mem.indexOfScalarPos(u8, base, authority_start, '/') orelse base.len; if (path[0] == '/') { - return std.mem.joinZ(allocator, "", &.{ base[0..path_start], path }); + const result = try std.mem.joinZ(allocator, "", &.{ base[0..path_start], path }); + return encodeURL(allocator, result, opts); } var normalized_base: []const u8 = base[0..path_start]; @@ -127,7 +145,115 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime // we always have an extra space out[out_i] = 0; - return out[0..out_i :0]; + return encodeURL(allocator, out[0..out_i :0], opts); +} + +fn encodeURL(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 { + if (!comptime opts.encode) { + return url; + } + + const scheme_end = std.mem.indexOf(u8, url, "://"); + const authority_start = if (scheme_end) |end| end + 3 else 0; + const path_start = 
std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url; + + const query_start = std.mem.indexOfScalarPos(u8, url, path_start, '?'); + const fragment_start = std.mem.indexOfScalarPos(u8, url, query_start orelse path_start, '#'); + + const path_end = query_start orelse fragment_start orelse url.len; + const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end; + + const path_to_encode = url[path_start..path_end]; + const encoded_path = try percentEncodeSegment(allocator, path_to_encode, true); + + const encoded_query = if (query_start) |qs| blk: { + const query_to_encode = url[qs + 1 .. query_end]; + const encoded = try percentEncodeSegment(allocator, query_to_encode, false); + break :blk encoded; + } else null; + + const encoded_fragment = if (fragment_start) |fs| blk: { + const fragment_to_encode = url[fs + 1 ..]; + const encoded = try percentEncodeSegment(allocator, fragment_to_encode, false); + break :blk encoded; + } else null; + + if (encoded_path.ptr == path_to_encode.ptr and + (encoded_query == null or encoded_query.?.ptr == url[query_start.? + 1 .. query_end].ptr) and + (encoded_fragment == null or encoded_fragment.?.ptr == url[fragment_start.? + 1 ..].ptr)) { + // nothing has changed + return url; + } + + var buf = try std.ArrayList(u8).initCapacity(allocator, url.len + 20); + try buf.appendSlice(allocator, url[0..path_start]); + try buf.appendSlice(allocator, encoded_path); + if (encoded_query) |eq| { + try buf.append(allocator, '?'); + try buf.appendSlice(allocator, eq); + } + if (encoded_fragment) |ef| { + try buf.append(allocator, '#'); + try buf.appendSlice(allocator, ef); + } + try buf.append(allocator, 0); + return buf.items[0 .. 
buf.items.len - 1 :0]; +} + +fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime is_path: bool) ![]const u8 { + // Check if encoding is needed + var needs_encoding = false; + for (segment) |c| { + if (shouldPercentEncode(c, is_path)) { + needs_encoding = true; + break; + } + } + if (!needs_encoding) { + return segment; + } + + var buf = try std.ArrayList(u8).initCapacity(allocator, segment.len + 10); + + var i: usize = 0; + while (i < segment.len) : (i += 1) { + const c = segment[i]; + + // Check if this is an already-encoded sequence (%XX) + if (c == '%' and i + 2 < segment.len) { + const end = i + 2; + const h1 = segment[i + 1]; + const h2 = segment[end]; + if (std.ascii.isHex(h1) and std.ascii.isHex(h2)) { + try buf.appendSlice(allocator, segment[i .. end + 1]); + i = end; + continue; + } + } + + if (shouldPercentEncode(c, is_path)) { + try buf.writer(allocator).print("%{X:0>2}", .{c}); + } else { + try buf.append(allocator, c); + } + } + + return buf.items; +} + +fn shouldPercentEncode(c: u8, comptime is_path: bool) bool { + return switch (c) { + // Unreserved characters (RFC 3986) + 'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false, + // sub-delims allowed in both path and query + '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=' => false, + // Separators allowed in both path and query + '/', ':', '@' => false, + // Query-specific: '?' is allowed in queries but not in paths + '?' => comptime is_path, + // Everything else needs encoding (including space) + else => true, + }; } fn isNullTerminated(comptime value: type) bool { @@ -691,6 +817,172 @@ test "URL: resolve" { } } +test "URL: resolve with encoding" { + defer testing.reset(); + + const Case = struct { + base: [:0]const u8, + path: [:0]const u8, + expected: [:0]const u8, + }; + + const cases = [_]Case{ + // Spaces should be encoded as %20, but ! 
is allowed + .{ + .base = "https://example.com/dir/", + .path = "over 9000!", + .expected = "https://example.com/dir/over%209000!", + }, + .{ + .base = "https://example.com/", + .path = "hello world.html", + .expected = "https://example.com/hello%20world.html", + }, + // Multiple spaces + .{ + .base = "https://example.com/", + .path = "path with multiple spaces", + .expected = "https://example.com/path%20with%20%20multiple%20%20%20spaces", + }, + // Special characters that need encoding + .{ + .base = "https://example.com/", + .path = "file[1].html", + .expected = "https://example.com/file%5B1%5D.html", + }, + .{ + .base = "https://example.com/", + .path = "file{name}.html", + .expected = "https://example.com/file%7Bname%7D.html", + }, + .{ + .base = "https://example.com/", + .path = "file.html", + .expected = "https://example.com/file%3Ctest%3E.html", + }, + .{ + .base = "https://example.com/", + .path = "file\"quote\".html", + .expected = "https://example.com/file%22quote%22.html", + }, + .{ + .base = "https://example.com/", + .path = "file|pipe.html", + .expected = "https://example.com/file%7Cpipe.html", + }, + .{ + .base = "https://example.com/", + .path = "file\\backslash.html", + .expected = "https://example.com/file%5Cbackslash.html", + }, + .{ + .base = "https://example.com/", + .path = "file^caret.html", + .expected = "https://example.com/file%5Ecaret.html", + }, + .{ + .base = "https://example.com/", + .path = "file`backtick`.html", + .expected = "https://example.com/file%60backtick%60.html", + }, + // Characters that should NOT be encoded + .{ + .base = "https://example.com/", + .path = "path-with_under~tilde.html", + .expected = "https://example.com/path-with_under~tilde.html", + }, + .{ + .base = "https://example.com/", + .path = "path/with/slashes", + .expected = "https://example.com/path/with/slashes", + }, + .{ + .base = "https://example.com/", + .path = "sub-delims!$&'()*+,;=.html", + .expected = "https://example.com/sub-delims!$&'()*+,;=.html", + 
}, + // Already encoded characters should not be double-encoded + .{ + .base = "https://example.com/", + .path = "already%20encoded", + .expected = "https://example.com/already%20encoded", + }, + .{ + .base = "https://example.com/", + .path = "file%5B1%5D.html", + .expected = "https://example.com/file%5B1%5D.html", + }, + // Mix of encoded and unencoded + .{ + .base = "https://example.com/", + .path = "part%20encoded and not", + .expected = "https://example.com/part%20encoded%20and%20not", + }, + // Query strings and fragments ARE encoded + .{ + .base = "https://example.com/", + .path = "file name.html?query=value with spaces", + .expected = "https://example.com/file%20name.html?query=value%20with%20spaces", + }, + .{ + .base = "https://example.com/", + .path = "file name.html#anchor with spaces", + .expected = "https://example.com/file%20name.html#anchor%20with%20spaces", + }, + .{ + .base = "https://example.com/", + .path = "file.html?hello=world !", + .expected = "https://example.com/file.html?hello=world%20!", + }, + // Query structural characters should NOT be encoded + .{ + .base = "https://example.com/", + .path = "file.html?a=1&b=2", + .expected = "https://example.com/file.html?a=1&b=2", + }, + // Relative paths with encoding + .{ + .base = "https://example.com/dir/page.html", + .path = "../other dir/file.html", + .expected = "https://example.com/other%20dir/file.html", + }, + .{ + .base = "https://example.com/dir/", + .path = "./sub dir/file.html", + .expected = "https://example.com/dir/sub%20dir/file.html", + }, + // Absolute paths with encoding + .{ + .base = "https://example.com/some/path", + .path = "/absolute path/file.html", + .expected = "https://example.com/absolute%20path/file.html", + }, + // Unicode/high bytes (though ideally these should be UTF-8 encoded first) + .{ + .base = "https://example.com/", + .path = "café", + .expected = "https://example.com/caf%C3%A9", + }, + // Empty path + .{ + .base = "https://example.com/", + .path = "", + 
.expected = "https://example.com/", + }, + // Complete URL as path (base is ignored; the URL itself is still encoded) + .{ + .base = "https://example.com/", + .path = "https://other.com/path with spaces", + .expected = "https://other.com/path%20with%20spaces", + }, + }; + + for (cases) |case| { + const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true }); + try testing.expectString(case.expected, result); + } +} + test "URL: eqlDocument" { defer testing.reset(); { diff --git a/src/browser/tests/element/html/anchor.html b/src/browser/tests/element/html/anchor.html index 3c248a7b..0522163f 100644 --- a/src/browser/tests/element/html/anchor.html +++ b/src/browser/tests/element/html/anchor.html @@ -245,3 +245,11 @@ testing.expectEqual('', b.toString()); } + + diff --git a/src/browser/tests/element/html/image.html b/src/browser/tests/element/html/image.html index 1fda424a..e7868229 100644 --- a/src/browser/tests/element/html/image.html +++ b/src/browser/tests/element/html/image.html @@ -172,3 +172,12 @@ }); } + + diff --git a/src/browser/tests/frames/frames.html b/src/browser/tests/frames/frames.html index 00403f3a..1aa81b21 100644 --- a/src/browser/tests/frames/frames.html +++ b/src/browser/tests/frames/frames.html @@ -7,7 +7,7 @@ } - +