Merge pull request #1646 from lightpanda-io/url_encoding

Add url encoding option to URL.resolve
2026-03-22 04:34:44 +00:00 · 2026-02-26 07:02:49 +08:00
parent ab85b4b129 fcb3f08bcb
commit db8fb8b05d
13 changed files with 338 additions and 20 deletions
--- a/src/browser/Page.zig
+++ b/src/browser/Page.zig
@@ -569,7 +569,7 @@ fn scheduleNavigationWithArena(self: *Page, arena: Allocator, request_url: []con
        arena,
        self.base(),
        request_url,
-        .{ .always_dupe = true },
+        .{ .always_dupe = true, .encode = true },
    );

    const session = self._session;
@@ -1206,7 +1206,7 @@ pub fn iframeAddedCallback(self: *Page, iframe: *Element.Html.IFrame) !void {
        return;
    }

-    const src = try iframe.getSrc(self);
+    const src = iframe.asElement().getAttributeSafe(comptime .wrap("src")) orelse return;
    if (src.len == 0) {
        return;
    }
@@ -1228,8 +1228,16 @@ pub fn iframeAddedCallback(self: *Page, iframe: *Element.Html.IFrame) !void {
        .timestamp = timestamp(.monotonic),
    });

-    page_frame.navigate(src, .{ .reason = .initialFrameNavigation }) catch |err| {
-        log.warn(.page, "iframe navigate failure", .{ .url = src, .err = err });
+    // navigate will dupe the url
+    const url = try URL.resolve(
+        self.call_arena,
+        self.base(),
+        src,
+        .{ .encode = true },
+    );
+
+    page_frame.navigate(url, .{ .reason = .initialFrameNavigation }) catch |err| {
+        log.warn(.page, "iframe navigate failure", .{ .url = url, .err = err });
        self._pending_loads -= 1;
        iframe._content_window = null;
        page_frame.deinit();
--- a/src/browser/URL.zig
+++ b/src/browser/URL.zig
@@ -20,44 +20,61 @@ const std = @import("std");
 const Allocator = std.mem.Allocator;

 const ResolveOpts = struct {
+    encode: bool = false,
    always_dupe: bool = false,
 };
+
 // path is anytype, so that it can be used with both []const u8 and [:0]const u8
 pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime opts: ResolveOpts) ![:0]const u8 {
    const PT = @TypeOf(path);
    if (base.len == 0 or isCompleteHTTPUrl(path)) {
        if (comptime opts.always_dupe or !isNullTerminated(PT)) {
-            return allocator.dupeZ(u8, path);
+            const duped = try allocator.dupeZ(u8, path);
+            return encodeURL(allocator, duped, opts);
+        }
+        if (comptime opts.encode) {
+            return encodeURL(allocator, path, opts);
        }
        return path;
    }

    if (path.len == 0) {
        if (comptime opts.always_dupe) {
-            return allocator.dupeZ(u8, base);
+            const duped = try allocator.dupeZ(u8, base);
+            return encodeURL(allocator, duped, opts);
+        }
+        if (comptime opts.encode) {
+            return encodeURL(allocator, base, opts);
        }
        return base;
    }

    if (path[0] == '?') {
        const base_path_end = std.mem.indexOfAny(u8, base, "?#") orelse base.len;
-        return std.mem.joinZ(allocator, "", &.{ base[0..base_path_end], path });
+        const result = try std.mem.joinZ(allocator, "", &.{ base[0..base_path_end], path });
+        return encodeURL(allocator, result, opts);
    }
    if (path[0] == '#') {
        const base_fragment_start = std.mem.indexOfScalar(u8, base, '#') orelse base.len;
-        return std.mem.joinZ(allocator, "", &.{ base[0..base_fragment_start], path });
+        const result = try std.mem.joinZ(allocator, "", &.{ base[0..base_fragment_start], path });
+        return encodeURL(allocator, result, opts);
    }

    if (std.mem.startsWith(u8, path, "//")) {
        // network-path reference
        const index = std.mem.indexOfScalar(u8, base, ':') orelse {
            if (comptime isNullTerminated(PT)) {
+                if (comptime opts.encode) {
+                    return encodeURL(allocator, path, opts);
+                }
                return path;
            }
-            return allocator.dupeZ(u8, path);
+            const duped = try allocator.dupeZ(u8, path);
+            return encodeURL(allocator, duped, opts);
        };
        const protocol = base[0 .. index + 1];
-        return std.mem.joinZ(allocator, "", &.{ protocol, path });
+        const result = try std.mem.joinZ(allocator, "", &.{ protocol, path });
+        return encodeURL(allocator, result, opts);
    }

    const scheme_end = std.mem.indexOf(u8, base, "://");
@@ -65,7 +82,8 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime
    const path_start = std.mem.indexOfScalarPos(u8, base, authority_start, '/') orelse base.len;

    if (path[0] == '/') {
-        return std.mem.joinZ(allocator, "", &.{ base[0..path_start], path });
+        const result = try std.mem.joinZ(allocator, "", &.{ base[0..path_start], path });
+        return encodeURL(allocator, result, opts);
    }

    var normalized_base: []const u8 = base[0..path_start];
@@ -127,7 +145,115 @@ pub fn resolve(allocator: Allocator, base: [:0]const u8, path: anytype, comptime

    // we always have an extra space
    out[out_i] = 0;
-    return out[0..out_i :0];
+    return encodeURL(allocator, out[0..out_i :0], opts);
+}
+
+fn encodeURL(allocator: Allocator, url: [:0]const u8, comptime opts: ResolveOpts) ![:0]const u8 {
+    if (!comptime opts.encode) {
+        return url;
+    }
+
+    const scheme_end = std.mem.indexOf(u8, url, "://");
+    const authority_start = if (scheme_end) |end| end + 3 else 0;
+    const path_start = std.mem.indexOfScalarPos(u8, url, authority_start, '/') orelse return url;
+
+    const query_start = std.mem.indexOfScalarPos(u8, url, path_start, '?');
+    const fragment_start = std.mem.indexOfScalarPos(u8, url, query_start orelse path_start, '#');
+
+    const path_end = query_start orelse fragment_start orelse url.len;
+    const query_end = if (query_start) |_| (fragment_start orelse url.len) else path_end;
+
+    const path_to_encode = url[path_start..path_end];
+    const encoded_path = try percentEncodeSegment(allocator, path_to_encode, true);
+
+    const encoded_query = if (query_start) |qs| blk: {
+        const query_to_encode = url[qs + 1 .. query_end];
+        const encoded = try percentEncodeSegment(allocator, query_to_encode, false);
+        break :blk encoded;
+    } else null;
+
+    const encoded_fragment = if (fragment_start) |fs| blk: {
+        const fragment_to_encode = url[fs + 1 ..];
+        const encoded = try percentEncodeSegment(allocator, fragment_to_encode, false);
+        break :blk encoded;
+    } else null;
+
+    if (encoded_path.ptr == path_to_encode.ptr and
+        (encoded_query == null or encoded_query.?.ptr == url[query_start.? + 1 .. query_end].ptr) and
+        (encoded_fragment == null or encoded_fragment.?.ptr == url[fragment_start.? + 1 ..].ptr)) {
+        // nothing has changed
+        return url;
+    }
+
+    var buf = try std.ArrayList(u8).initCapacity(allocator, url.len + 20);
+    try buf.appendSlice(allocator, url[0..path_start]);
+    try buf.appendSlice(allocator, encoded_path);
+    if (encoded_query) |eq| {
+        try buf.append(allocator, '?');
+        try buf.appendSlice(allocator, eq);
+    }
+    if (encoded_fragment) |ef| {
+        try buf.append(allocator, '#');
+        try buf.appendSlice(allocator, ef);
+    }
+    try buf.append(allocator, 0);
+    return buf.items[0 .. buf.items.len - 1 :0];
+}
+
+fn percentEncodeSegment(allocator: Allocator, segment: []const u8, comptime is_path: bool) ![]const u8 {
+    // Check if encoding is needed
+    var needs_encoding = false;
+    for (segment) |c| {
+        if (shouldPercentEncode(c, is_path)) {
+            needs_encoding = true;
+            break;
+        }
+    }
+    if (!needs_encoding) {
+        return segment;
+    }
+
+    var buf = try std.ArrayList(u8).initCapacity(allocator, segment.len + 10);
+
+    var i: usize = 0;
+    while (i < segment.len) : (i += 1) {
+        const c = segment[i];
+
+        // Check if this is an already-encoded sequence (%XX)
+        if (c == '%' and i + 2 < segment.len) {
+            const end = i + 2;
+            const h1 = segment[i + 1];
+            const h2 = segment[end];
+            if (std.ascii.isHex(h1) and std.ascii.isHex(h2)) {
+                try buf.appendSlice(allocator, segment[i .. end + 1]);
+                i = end;
+                continue;
+            }
+        }
+
+        if (shouldPercentEncode(c, is_path)) {
+            try buf.writer(allocator).print("%{X:0>2}", .{c});
+        } else {
+            try buf.append(allocator, c);
+        }
+    }
+
+    return buf.items;
+}
+
+fn shouldPercentEncode(c: u8, comptime is_path: bool) bool {
+    return switch (c) {
+        // Unreserved characters (RFC 3986)
+        'A'...'Z', 'a'...'z', '0'...'9', '-', '.', '_', '~' => false,
+        // sub-delims allowed in both path and query
+        '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=' => false,
+        // Separators allowed in both path and query
+        '/', ':', '@' => false,
+        // Query-specific: '?' is allowed in queries but not in paths
+        '?' => comptime is_path,
+        // Everything else needs encoding (including space)
+        else => true,
+    };
 }

 fn isNullTerminated(comptime value: type) bool {
@@ -691,6 +817,172 @@ test "URL: resolve" {
    }
 }

+test "URL: resolve with encoding" {
+    defer testing.reset();
+
+    const Case = struct {
+        base: [:0]const u8,
+        path: [:0]const u8,
+        expected: [:0]const u8,
+    };
+
+    const cases = [_]Case{
+        // Spaces should be encoded as %20, but ! is allowed
+        .{
+            .base = "https://example.com/dir/",
+            .path = "over 9000!",
+            .expected = "https://example.com/dir/over%209000!",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "hello world.html",
+            .expected = "https://example.com/hello%20world.html",
+        },
+        // Multiple spaces
+        .{
+            .base = "https://example.com/",
+            .path = "path with  multiple   spaces",
+            .expected = "https://example.com/path%20with%20%20multiple%20%20%20spaces",
+        },
+        // Special characters that need encoding
+        .{
+            .base = "https://example.com/",
+            .path = "file[1].html",
+            .expected = "https://example.com/file%5B1%5D.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file{name}.html",
+            .expected = "https://example.com/file%7Bname%7D.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file<test>.html",
+            .expected = "https://example.com/file%3Ctest%3E.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file\"quote\".html",
+            .expected = "https://example.com/file%22quote%22.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file|pipe.html",
+            .expected = "https://example.com/file%7Cpipe.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file\\backslash.html",
+            .expected = "https://example.com/file%5Cbackslash.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file^caret.html",
+            .expected = "https://example.com/file%5Ecaret.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file`backtick`.html",
+            .expected = "https://example.com/file%60backtick%60.html",
+        },
+        // Characters that should NOT be encoded
+        .{
+            .base = "https://example.com/",
+            .path = "path-with_under~tilde.html",
+            .expected = "https://example.com/path-with_under~tilde.html",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "path/with/slashes",
+            .expected = "https://example.com/path/with/slashes",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "sub-delims!$&'()*+,;=.html",
+            .expected = "https://example.com/sub-delims!$&'()*+,;=.html",
+        },
+        // Already encoded characters should not be double-encoded
+        .{
+            .base = "https://example.com/",
+            .path = "already%20encoded",
+            .expected = "https://example.com/already%20encoded",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file%5B1%5D.html",
+            .expected = "https://example.com/file%5B1%5D.html",
+        },
+        // Mix of encoded and unencoded
+        .{
+            .base = "https://example.com/",
+            .path = "part%20encoded and not",
+            .expected = "https://example.com/part%20encoded%20and%20not",
+        },
+        // Query strings and fragments ARE encoded
+        .{
+            .base = "https://example.com/",
+            .path = "file name.html?query=value with spaces",
+            .expected = "https://example.com/file%20name.html?query=value%20with%20spaces",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file name.html#anchor with spaces",
+            .expected = "https://example.com/file%20name.html#anchor%20with%20spaces",
+        },
+        .{
+            .base = "https://example.com/",
+            .path = "file.html?hello=world !",
+            .expected = "https://example.com/file.html?hello=world%20!",
+        },
+        // Query structural characters should NOT be encoded
+        .{
+            .base = "https://example.com/",
+            .path = "file.html?a=1&b=2",
+            .expected = "https://example.com/file.html?a=1&b=2",
+        },
+        // Relative paths with encoding
+        .{
+            .base = "https://example.com/dir/page.html",
+            .path = "../other dir/file.html",
+            .expected = "https://example.com/other%20dir/file.html",
+        },
+        .{
+            .base = "https://example.com/dir/",
+            .path = "./sub dir/file.html",
+            .expected = "https://example.com/dir/sub%20dir/file.html",
+        },
+        // Absolute paths with encoding
+        .{
+            .base = "https://example.com/some/path",
+            .path = "/absolute path/file.html",
+            .expected = "https://example.com/absolute%20path/file.html",
+        },
+        // Unicode/high bytes (though ideally these should be UTF-8 encoded first)
+        .{
+            .base = "https://example.com/",
+            .path = "café",
+            .expected = "https://example.com/caf%C3%A9",
+        },
+        // Empty path
+        .{
+            .base = "https://example.com/",
+            .path = "",
+            .expected = "https://example.com/",
+        },
+        // Complete URL as path (should not be encoded)
+        .{
+            .base = "https://example.com/",
+            .path = "https://other.com/path with spaces",
+            .expected = "https://other.com/path%20with%20spaces",
+        },
+    };
+
+    for (cases) |case| {
+        const result = try resolve(testing.arena_allocator, case.base, case.path, .{ .encode = true });
+        try testing.expectString(case.expected, result);
+    }
+}
+
 test "URL: eqlDocument" {
    defer testing.reset();
    {
--- a/src/browser/tests/element/html/anchor.html
+++ b/src/browser/tests/element/html/anchor.html
@@ -245,3 +245,11 @@
  testing.expectEqual('', b.toString());
 }
 </script>
+
+<script id=url_encode>
+  {
+    let a = document.createElement('a');
+    a.href = 'over 9000!';
+    testing.expectEqual(testing.BASE_URL + 'element/html/over%209000!', a.href);
+  }
+</script>
--- a/src/browser/tests/element/html/image.html
+++ b/src/browser/tests/element/html/image.html
@@ -172,3 +172,12 @@
  });
 }
 </script>
+
+<script id=url_encode>
+  {
+    let img = document.createElement('img');
+    img.src = 'over 9000!?hello=world !';
+    testing.expectEqual('over 9000!?hello=world !', img.getAttribute('src'));
+    testing.expectEqual(testing.BASE_URL + 'element/html/over%209000!?hello=world%20!', img.src);
+  }
+</script>
--- a/src/browser/tests/frames/frames.html
+++ b/src/browser/tests/frames/frames.html
@@ -7,7 +7,7 @@
  }
 </script>

-<iframe id=f1 onload="frame1Onload" src="support/sub1.html"></iframe>
+<iframe id=f1 onload="frame1Onload" src="support/sub 1.html"></iframe>
 <iframe id=f2 src="support/sub2.html"></iframe>

 <script id="basic">
@@ -25,6 +25,7 @@

    testing.expectEqual(0, $('#f1').childNodes.length);

+    testing.expectEqual(testing.BASE_URL + 'frames/support/sub%201.html', $('#f1').src);
    testing.expectEqual(window[0], $('#f1').contentWindow);
    testing.expectEqual(window[1], $('#f2').contentWindow);

--- a/src/browser/tests/frames/support/sub1.html
+++ b/src/browser/tests/frames/support/sub1.html
--- a/src/browser/webapi/element/html/Anchor.zig
+++ b/src/browser/webapi/element/html/Anchor.zig
@@ -44,7 +44,7 @@ pub fn getHref(self: *Anchor, page: *Page) ![]const u8 {
    if (href.len == 0) {
        return "";
    }
-    return URL.resolve(page.call_arena, page.base(), href, .{});
+    return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
 }

 pub fn setHref(self: *Anchor, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/IFrame.zig
+++ b/src/browser/webapi/element/html/IFrame.zig
@@ -50,7 +50,7 @@ pub fn getContentDocument(self: *const IFrame) ?*Document {

 pub fn getSrc(self: *const IFrame, page: *Page) ![:0]const u8 {
    if (self._src.len == 0) return "";
-    return try URL.resolve(page.call_arena, page.base(), self._src, .{});
+    return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
 }

 pub fn setSrc(self: *IFrame, src: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Image.zig
+++ b/src/browser/webapi/element/html/Image.zig
@@ -46,7 +46,7 @@ pub fn getSrc(self: *const Image, page: *Page) ![]const u8 {
    }

    // Always resolve the src against the page URL
-    return URL.resolve(page.call_arena, page.base(), src, .{});
+    return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
 }

 pub fn setSrc(self: *Image, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Link.zig
+++ b/src/browser/webapi/element/html/Link.zig
@@ -46,7 +46,7 @@ pub fn getHref(self: *Link, page: *Page) ![]const u8 {
    }

    // Always resolve the href against the page URL
-    return URL.resolve(page.call_arena, page.base(), href, .{});
+    return URL.resolve(page.call_arena, page.base(), href, .{ .encode = true });
 }

 pub fn setHref(self: *Link, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Media.zig
+++ b/src/browser/webapi/element/html/Media.zig
@@ -236,7 +236,7 @@ pub fn getSrc(self: *const Media, page: *Page) ![]const u8 {
        return "";
    }
    const URL = @import("../../URL.zig");
-    return URL.resolve(page.call_arena, page.base(), src, .{});
+    return URL.resolve(page.call_arena, page.base(), src, .{ .encode = true });
 }

 pub fn setSrc(self: *Media, value: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Script.zig
+++ b/src/browser/webapi/element/html/Script.zig
@@ -46,7 +46,7 @@ pub fn asNode(self: *Script) *Node {

 pub fn getSrc(self: *const Script, page: *Page) ![]const u8 {
    if (self._src.len == 0) return "";
-    return try URL.resolve(page.call_arena, page.base(), self._src, .{});
+    return try URL.resolve(page.call_arena, page.base(), self._src, .{ .encode = true });
 }

 pub fn setSrc(self: *Script, src: []const u8, page: *Page) !void {
--- a/src/browser/webapi/element/html/Video.zig
+++ b/src/browser/webapi/element/html/Video.zig
@@ -59,7 +59,7 @@ pub fn getPoster(self: *const Video, page: *Page) ![]const u8 {
    }

    const URL = @import("../../URL.zig");
-    return URL.resolve(page.call_arena, page.base(), poster, .{});
+    return URL.resolve(page.call_arena, page.base(), poster, .{ .encode = true });
 }

 pub fn setPoster(self: *Video, value: []const u8, page: *Page) !void {