From 94531cb3d039be57742dbe770a9c5f7de03a3bd4 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Sat, 8 Feb 2025 11:39:19 +0800 Subject: [PATCH] Improve memory and performance of url.Query 1 - Use getOrPut to avoid making 2 map lookups where possible. 2 - Use an arena allocator for Values, which makes memory management simpler. 3 - Because of #2, we no longer need to allocate key or values which don't need to be unescaped. The downside is that the input string has to outlive the query.Values (but I think this is currently always the case) 4 - Optimize unescape logic & allocations 5 - Improve test coverage --- src/str/parser.zig | 38 +++++ src/url/query.zig | 407 +++++++++++++++++++++++++++++++-------------- 2 files changed, 316 insertions(+), 129 deletions(-) diff --git a/src/str/parser.zig b/src/str/parser.zig index 55b6bb32..ee64a922 100644 --- a/src/str/parser.zig +++ b/src/str/parser.zig @@ -52,6 +52,27 @@ pub const Reader = struct { } }; +// converts a comptime-known string (i.e. null terminated) to an uint +pub fn asUint(comptime string: anytype) AsUintReturn(string) { + const byteLength = @bitSizeOf(@TypeOf(string.*)) / 8 - 1; + const expectedType = *const [byteLength:0]u8; + if (@TypeOf(string) != expectedType) { + @compileError("expected : " ++ @typeName(expectedType) ++ + ", got: " ++ @typeName(@TypeOf(string))); + } + + return @bitCast(@as(*const [byteLength]u8, string).*); +} + +fn AsUintReturn(comptime string: anytype) type { + return @Type(.{ + .Int = .{ + .bits = @bitSizeOf(@TypeOf(string.*)) - 8, // (- 8) to exclude sentinel 0 + .signedness = .unsigned, + }, + }); +} + const testing = std.testing; test "parser.Reader: skip" { var r = Reader{ .data = "foo" }; @@ -85,3 +106,20 @@ test "parser.Reader: until" { try testing.expectEqualStrings("", r.until('.')); try testing.expectEqualStrings("", r.tail()); } + +test "parser: asUint" { + const ASCII_x = @as(u8, @bitCast([1]u8{'x'})); + const ASCII_ab = @as(u16, @bitCast([2]u8{ 'a', 'b' })); + const ASCII_xyz = @as(u24, @bitCast([3]u8{ 'x', 'y', 'z' })); + const ASCII_abcd = @as(u32, @bitCast([4]u8{ 'a', 'b', 'c', 'd' })); + + try testing.expectEqual(ASCII_x, asUint("x")); + try testing.expectEqual(ASCII_ab, asUint("ab")); + try testing.expectEqual(ASCII_xyz, asUint("xyz")); + try testing.expectEqual(ASCII_abcd, asUint("abcd")); + + try testing.expectEqual(u8, @TypeOf(asUint("x"))); + try testing.expectEqual(u16, @TypeOf(asUint("ab"))); + try testing.expectEqual(u24, @TypeOf(asUint("xyz"))); + try testing.expectEqual(u32, @TypeOf(asUint("abcd"))); +} diff --git a/src/url/query.zig b/src/url/query.zig index 5defb288..1b4d55f7 100644 --- a/src/url/query.zig +++ b/src/url/query.zig @@ -19,59 +19,58 @@ const std = @import("std"); const Reader = @import("../str/parser.zig").Reader; +const asUint = @import("../str/parser.zig").asUint; // Values is a map with string key of string values. pub const Values = struct { - alloc: std.mem.Allocator, + arena: std.heap.ArenaAllocator, map: std.StringArrayHashMapUnmanaged(List), const List = std.ArrayListUnmanaged([]const u8); - pub fn init(alloc: std.mem.Allocator) Values { + pub fn init(allocator: std.mem.Allocator) Values { return .{ - .alloc = alloc, .map = .{}, + .arena = std.heap.ArenaAllocator.init(allocator), }; } pub fn deinit(self: *Values) void { - var it = self.map.iterator(); - while (it.next()) |entry| { - for (entry.value_ptr.items) |v| self.alloc.free(v); - entry.value_ptr.deinit(self.alloc); - self.alloc.free(entry.key_ptr.*); - } - self.map.deinit(self.alloc); + self.arena.deinit(); } // add the key value couple to the values. // the key and the value are duplicated. pub fn append(self: *Values, k: []const u8, v: []const u8) !void { - const vv = try self.alloc.dupe(u8, v); + const allocator = self.arena.allocator(); + const owned_value = try allocator.dupe(u8, v); - if (self.map.getPtr(k)) |list| { - return try list.append(self.alloc, vv); + var gop = try self.map.getOrPut(allocator, k); + if (gop.found_existing) { + return gop.value_ptr.append(allocator, owned_value); } - const kk = try self.alloc.dupe(u8, k); + gop.key_ptr.* = try allocator.dupe(u8, k); + var list = List{}; - try list.append(self.alloc, vv); - try self.map.put(self.alloc, kk, list); + try list.append(allocator, owned_value); + gop.value_ptr.* = list; } // append by taking the ownership of the key and the value fn appendOwned(self: *Values, k: []const u8, v: []const u8) !void { - if (self.map.getPtr(k)) |list| { - self.alloc.free(k); - return try list.append(self.alloc, v); + const allocator = self.arena.allocator(); + var gop = try self.map.getOrPut(allocator, k); + if (gop.found_existing) { + return gop.value_ptr.append(allocator, v); } var list = List{}; - try list.append(self.alloc, v); - try self.map.put(self.alloc, k, list); + try list.append(allocator, v); + gop.value_ptr.* = list; } - pub fn get(self: *Values, k: []const u8) [][]const u8 { + pub fn get(self: *const Values, k: []const u8) []const []const u8 { if (self.map.get(k)) |list| { return list.items; } @@ -79,7 +78,7 @@ pub const Values = struct { return &[_][]const u8{}; } - pub fn first(self: *Values, k: []const u8) []const u8 { + pub fn first(self: *const Values, k: []const u8) []const u8 { if (self.map.getPtr(k)) |list| { if (list.items.len == 0) return ""; return list.items[0]; @@ -89,10 +88,7 @@ pub const Values = struct { } pub fn delete(self: *Values, k: []const u8) void { - if (self.map.getPtr(k)) |list| { - list.deinit(self.alloc); - _ = self.map.fetchSwapRemove(k); - } + _ = self.map.fetchSwapRemove(k); } pub fn deleteValue(self: *Values, k: []const u8, v: []const u8) void { @@ -106,80 +102,48 @@ pub const Values = struct { } } - pub fn count(self: *Values) usize { + pub fn count(self: *const Values) usize { return self.map.count(); } - // the caller owned the returned string. - pub fn encode(self: *Values, writer: anytype) !void { - var i: usize = 0; + pub fn encode(self: *const Values, writer: anytype) !void { var it = self.map.iterator(); + + const first_entry = it.next() orelse return; + try encodeKeyValues(first_entry, writer); + while (it.next()) |entry| { - defer i += 1; - if (i > 0) try writer.writeByte('&'); - - if (entry.value_ptr.items.len == 0) { - try escape(writer, entry.key_ptr.*); - continue; - } - - const start = i; - for (entry.value_ptr.items) |v| { - defer i += 1; - if (start < i) try writer.writeByte('&'); - - try escape(writer, entry.key_ptr.*); - if (v.len > 0) try writer.writeByte('='); - try escape(writer, v); - } + try writer.writeByte('&'); + try encodeKeyValues(entry, writer); } } }; -fn unhex(c: u8) u8 { - if ('0' <= c and c <= '9') return c - '0'; - if ('a' <= c and c <= 'f') return c - 'a' + 10; - if ('A' <= c and c <= 'F') return c - 'A' + 10; - return 0; -} +fn encodeKeyValues(entry: anytype, writer: anytype) !void { + const key = entry.key_ptr.*; -// unescape decodes a percent encoded string. -// The caller owned the returned string. -pub fn unescape(alloc: std.mem.Allocator, s: []const u8) ![]const u8 { - var buf: std.ArrayListUnmanaged(u8) = .{}; - defer buf.deinit(alloc); - - var i: usize = 0; - while (i < s.len) { - defer i += 1; - - switch (s[i]) { - '%' => { - if (i + 2 > s.len) return error.EscapeError; - if (!std.ascii.isHex(s[i + 1])) return error.EscapeError; - if (!std.ascii.isHex(s[i + 2])) return error.EscapeError; - - try buf.append(alloc, unhex(s[i + 1]) << 4 | unhex(s[i + 2])); - i += 2; - }, - '+' => try buf.append(alloc, ' '), // TODO should we decode or keep as it? - else => try buf.append(alloc, s[i]), - } + try escape(key, writer); + const values = entry.value_ptr.items; + if (values.len == 0) { + return; } - return try buf.toOwnedSlice(alloc); + if (values[0].len > 0) { + try writer.writeByte('='); + try escape(values[0], writer); + } + + for (values[1..]) |value| { + try writer.writeByte('&'); + try escape(key, writer); + if (value.len > 0) { + try writer.writeByte('='); + try escape(value, writer); + } + } } -test "unescape" { - var v: []const u8 = undefined; - const alloc = std.testing.allocator; - - v = try unescape(alloc, "%7E"); - try std.testing.expect(std.mem.eql(u8, "~", v)); - alloc.free(v); -} - -pub fn escape(writer: anytype, raw: []const u8) !void { +fn escape(raw: []const u8, writer: anytype) !void { var start: usize = 0; for (raw, 0..) |char, index| { if ('a' <= char and char <= 'z' or 'A' <= char and char <= 'Z' or '0' <= char and char <= '9') { @@ -197,6 +161,8 @@ pub fn parseQuery(alloc: std.mem.Allocator, s: []const u8) !Values { var values = Values.init(alloc); errdefer values.deinit(); + const arena = values.arena.allocator(); + const ln = s.len; if (ln == 0) return values; @@ -213,8 +179,8 @@ pub fn parseQuery(alloc: std.mem.Allocator, s: []const u8) !Values { const v = rr.tail(); // decode k and v - const kk = try unescape(alloc, k); - const vv = try unescape(alloc, v); + const kk = try unescape(arena, k); + const vv = try unescape(arena, v); try values.appendOwned(kk, vv); @@ -224,61 +190,244 @@ pub fn parseQuery(alloc: std.mem.Allocator, s: []const u8) !Values { return values; } -test "parse empty query" { - var values = try parseQuery(std.testing.allocator, ""); - defer values.deinit(); +// The return'd string may or may not be allocated. Callers should use arenas +fn unescape(allocator: std.mem.Allocator, input: []const u8) ![]const u8 { + const HEX_CHAR = comptime blk: { + var all = std.mem.zeroes([256]bool); + for ('a'..('f' + 1)) |b| all[b] = true; + for ('A'..('F' + 1)) |b| all[b] = true; + for ('0'..('9' + 1)) |b| all[b] = true; + break :blk all; + }; - try std.testing.expect(values.count() == 0); + const HEX_DECODE = comptime blk: { + var all = std.mem.zeroes([256]u8); + for ('a'..('z' + 1)) |b| all[b] = b - 'a' + 10; + for ('A'..('Z' + 1)) |b| all[b] = b - 'A' + 10; + for ('0'..('9' + 1)) |b| all[b] = b - '0'; + break :blk all; + }; + + var has_plus = false; + var unescaped_len = input.len; + + { + // Figure out if we have any spaces and what the final unescaped length + // will be (which will let us know if we have anything to unescape in + // the first place) + var i: usize = 0; + while (i < input.len) { + const c = input[i]; + if (c == '%') { + if (i + 2 >= input.len or !HEX_CHAR[input[i + 1]] or !HEX_CHAR[input[i + 2]]) { + return error.EscapeError; + } + i += 3; + unescaped_len -= 2; + } else if (c == '+') { + has_plus = true; + i += 1; + } else { + i += 1; + } + } + } + + // no encoding, and no plus. nothing to unescape + if (unescaped_len == input.len and has_plus == false) { + return input; + } + + var unescaped = try allocator.alloc(u8, unescaped_len); + errdefer allocator.free(unescaped); + + var input_pos: usize = 0; + for (0..unescaped_len) |unescaped_pos| { + switch (input[input_pos]) { + '+' => { + unescaped[unescaped_pos] = ' '; + input_pos += 1; + }, + '%' => { + const encoded = input[input_pos + 1 .. input_pos + 3]; + const encoded_as_uint = @as(u16, @bitCast(encoded[0..2].*)); + unescaped[unescaped_pos] = switch (encoded_as_uint) { + asUint("20") => ' ', + asUint("21") => '!', + asUint("22") => '"', + asUint("23") => '#', + asUint("24") => '$', + asUint("25") => '%', + asUint("26") => '&', + asUint("27") => '\'', + asUint("28") => '(', + asUint("29") => ')', + asUint("2A") => '*', + asUint("2B") => '+', + asUint("2C") => ',', + asUint("2F") => '/', + asUint("3A") => ':', + asUint("3B") => ';', + asUint("3D") => '=', + asUint("3F") => '?', + asUint("40") => '@', + asUint("5B") => '[', + asUint("5D") => ']', + else => HEX_DECODE[encoded[0]] << 4 | HEX_DECODE[encoded[1]], + }; + input_pos += 3; + }, + else => |c| { + unescaped[unescaped_pos] = c; + input_pos += 1; + }, + } + } + return unescaped; } -test "parse empty query &" { - var values = try parseQuery(std.testing.allocator, "&"); - defer values.deinit(); +const testing = std.testing; +test "url.Query: unescape" { + const allocator = testing.allocator; + const cases = [_]struct { expected: []const u8, input: []const u8, free: bool }{ + .{ .expected = "", .input = "", .free = false }, + .{ .expected = "over", .input = "over", .free = false }, + .{ .expected = "Hello World", .input = "Hello World", .free = false }, + .{ .expected = "~", .input = "%7E", .free = true }, + .{ .expected = "~", .input = "%7e", .free = true }, + .{ .expected = "Hello~World", .input = "Hello%7eWorld", .free = true }, + .{ .expected = "Hello World", .input = "Hello++World", .free = true }, + }; - try std.testing.expect(values.count() == 0); + for (cases) |case| { + const value = try unescape(allocator, case.input); + defer if (case.free) { + allocator.free(value); + }; + try testing.expectEqualStrings(case.expected, value); + } + + try testing.expectError(error.EscapeError, unescape(undefined, "%")); + try testing.expectError(error.EscapeError, unescape(undefined, "%a")); + try testing.expectError(error.EscapeError, unescape(undefined, "%1")); + try testing.expectError(error.EscapeError, unescape(undefined, "123%45%6")); + try testing.expectError(error.EscapeError, unescape(undefined, "%zzzzz")); + try testing.expectError(error.EscapeError, unescape(undefined, "%0\xff")); } -test "parse query" { - var values = try parseQuery(std.testing.allocator, "a=b&b=c"); - defer values.deinit(); +test "url.Query: parseQuery" { + try testParseQuery(.{}, ""); - try std.testing.expect(values.count() == 2); - try std.testing.expect(values.get("a").len == 1); - try std.testing.expect(std.mem.eql(u8, values.get("a")[0], "b")); - try std.testing.expect(std.mem.eql(u8, values.first("a"), "b")); + try testParseQuery(.{}, "&"); - try std.testing.expect(values.get("b").len == 1); - try std.testing.expect(std.mem.eql(u8, values.get("b")[0], "c")); - try std.testing.expect(std.mem.eql(u8, values.first("b"), "c")); + try testParseQuery(.{ .a = [_][]const u8{"b"} }, "a=b"); + + try testParseQuery(.{ .hello = [_][]const u8{"world"} }, "hello=world"); + + try testParseQuery(.{ .hello = [_][]const u8{ "world", "all" } }, "hello=world&hello=all"); + + try testParseQuery(.{ + .a = [_][]const u8{"b"}, + .b = [_][]const u8{"c"}, + }, "a=b&b=c"); + + try testParseQuery(.{ .a = [_][]const u8{""} }, "a"); + try testParseQuery(.{ .a = [_][]const u8{ "", "", "" } }, "a&a&a"); + + try testParseQuery(.{ .abc = [_][]const u8{""} }, "abc"); + try testParseQuery(.{ + .abc = [_][]const u8{""}, + .dde = [_][]const u8{ "", "" }, + }, "abc&dde&dde"); + + try testParseQuery(.{ + .@"power is >" = [_][]const u8{"9,000?"}, + }, "power%20is%20%3E=9%2C000%3F"); } -test "parse query no value" { - var values = try parseQuery(std.testing.allocator, "a"); +test "url.Query.Values: get/first/count" { + var values = Values.init(testing.allocator); defer values.deinit(); - try std.testing.expect(values.count() == 1); - try std.testing.expect(std.mem.eql(u8, values.first("a"), "")); + { + // empty + try testing.expectEqual(0, values.count()); + try testing.expectEqual(0, values.get("").len); + try testing.expectEqualStrings("", values.first("")); + try testing.expectEqual(0, values.get("key").len); + try testing.expectEqualStrings("", values.first("key")); + } + + { + // add 1 value => key + try values.appendOwned("key", "value"); + try testing.expectEqual(1, values.count()); + try testing.expectEqual(1, values.get("key").len); + try testing.expectEqualSlices( + []const u8, + &.{"value"}, + values.get("key"), + ); + try testing.expectEqualStrings("value", values.first("key")); + } + + { + // add another value for the same key + try values.appendOwned("key", "another"); + try testing.expectEqual(1, values.count()); + try testing.expectEqual(2, values.get("key").len); + try testing.expectEqualSlices( + []const u8, + &.{ "value", "another" }, + values.get("key"), + ); + try testing.expectEqualStrings("value", values.first("key")); + } + + { + // add a new key (and value) + try values.appendOwned("over", "9000!"); + try testing.expectEqual(2, values.count()); + try testing.expectEqual(2, values.get("key").len); + try testing.expectEqual(1, values.get("over").len); + try testing.expectEqualSlices( + []const u8, + &.{"9000!"}, + values.get("over"), + ); + try testing.expectEqualStrings("9000!", values.first("over")); + } } -test "parse query dup" { - var values = try parseQuery(std.testing.allocator, "a=b&a=c"); +test "url.Query.Values: encode" { + var values = try parseQuery( + testing.allocator, + "hello=world&i%20will%20not%20fear=%3E%3E&a=b&a=c", + ); defer values.deinit(); - try std.testing.expect(values.count() == 1); - try std.testing.expect(std.mem.eql(u8, values.first("a"), "b")); - try std.testing.expect(values.get("a").len == 2); -} - -test "encode query" { - var values = try parseQuery(std.testing.allocator, "a=b&b=c"); - defer values.deinit(); - - try values.append("a", "~"); - var buf: std.ArrayListUnmanaged(u8) = .{}; - defer buf.deinit(std.testing.allocator); - - try values.encode(buf.writer(std.testing.allocator)); - - try std.testing.expect(std.mem.eql(u8, buf.items, "a=b&a=%7E&b=c")); + defer buf.deinit(testing.allocator); + try values.encode(buf.writer(testing.allocator)); + try testing.expectEqualStrings( + "hello=world&i%20will%20not%20fear=%3E%3E&a=b&a=c", + buf.items, + ); +} + +fn testParseQuery(expected: anytype, query: []const u8) !void { + var values = try parseQuery(testing.allocator, query); + defer values.deinit(); + + var count: usize = 0; + inline for (@typeInfo(@TypeOf(expected)).Struct.fields) |f| { + const actual = values.get(f.name); + const expect = @field(expected, f.name); + try testing.expectEqual(expect.len, actual.len); + for (expect, actual) |e, a| { + try testing.expectEqualStrings(e, a); + } + count += 1; + } + try testing.expectEqual(count, values.count()); }