mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-21 20:24:42 +00:00
Merge pull request #1605 from egrs/wpt-chardata-utf16
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled
fix CharacterData methods to use UTF-16 code unit offsets
This commit is contained in:
@@ -33,6 +33,66 @@ _type: Type,
|
||||
_proto: *Node,
|
||||
_data: []const u8 = "",
|
||||
|
||||
/// Returns how many UTF-16 code units are needed to represent `data`.
///
/// `data` is walked as UTF-8. A complete 4-byte sequence contributes 2 code
/// units (a surrogate pair); every other complete sequence contributes 1.
/// A byte that cannot start a sequence, or a sequence cut short by the end of
/// `data`, contributes 1 code unit and consumes a single byte.
fn utf16Len(data: []const u8) usize {
    var units: usize = 0;
    var pos: usize = 0;
    while (pos < data.len) {
        // An invalid lead byte is handled exactly like a 1-byte sequence.
        const declared: usize = std.unicode.utf8ByteSequenceLength(data[pos]) catch 1;
        // A sequence that would run past the end of `data` is not trusted:
        // only its first byte is consumed.
        const fits = declared <= data.len - pos;
        const step: usize = if (fits) declared else 1;
        const width: usize = if (fits and declared == 4) 2 else 1;
        units += width;
        pos += step;
    }
    return units;
}
|
||||
|
||||
/// Convert a UTF-16 code unit offset into the corresponding UTF-8 byte offset
/// within `data`.
///
/// Counting rules: a complete 4-byte UTF-8 sequence is 2 code units, any other
/// complete sequence is 1. An invalid lead byte or a sequence truncated by the
/// end of `data` is 1 code unit and 1 byte.
///
/// An offset that falls between the two halves of a surrogate pair cannot be
/// expressed as a UTF-8 byte offset; it is rounded forward to the byte offset
/// just past that 4-byte sequence, so the result is always on a sequence
/// boundary.
///
/// Returns `error.IndexSizeError` only when `utf16_offset` is greater than the
/// UTF-16 length of `data`.
pub fn utf16OffsetToUtf8(data: []const u8, utf16_offset: usize) error{IndexSizeError}!usize {
    var utf16_pos: usize = 0;
    var i: usize = 0;
    while (i < data.len) {
        // `>=` rather than `==`: a 4-byte sequence advances utf16_pos by 2, so
        // an offset inside the pair is never hit exactly. With `==` such an
        // offset would run to the end and be reported as out of range.
        if (utf16_pos >= utf16_offset) return i;

        // An invalid lead byte is handled exactly like a 1-byte sequence.
        const declared: usize = std.unicode.utf8ByteSequenceLength(data[i]) catch 1;
        // A sequence that would run past the end of `data` consumes one byte.
        const seq_len: usize = if (declared <= data.len - i) declared else 1;
        const width: usize = if (seq_len == 4) 2 else 1;

        utf16_pos += width;
        i += seq_len;
    }
    // At end of data: valid when the offset is the total length, or lies
    // inside a trailing surrogate pair.
    if (utf16_pos >= utf16_offset) return i;
    return error.IndexSizeError;
}
|
||||
|
||||
pub const Type = union(enum) {
|
||||
text: Text,
|
||||
comment: Comment,
|
||||
@@ -128,6 +188,17 @@ pub fn setData(self: *CData, value: ?[]const u8, page: *Page) !void {
|
||||
page.characterDataChange(self.asNode(), old_value);
|
||||
}
|
||||
|
||||
/// Setter exposed to the JS bridge for the `data` attribute.
///
/// A JS `null` is forwarded to `setData` as a Zig `null`
/// ([LegacyNullToEmptyString]). Every other value, `undefined` included, is
/// first converted with `toZig([]const u8)`, so `undefined` is passed on as
/// whatever string that conversion yields rather than being treated like
/// `null`.
pub fn _setData(self: *CData, value: js.Value, page: *Page) !void {
    const text: ?[]const u8 = if (value.isNull()) null else try value.toZig([]const u8);
    return self.setData(text, page);
}
|
||||
|
||||
pub fn format(self: *const CData, writer: *std.io.Writer) !void {
|
||||
return switch (self._type) {
|
||||
.text => writer.print("<text>{s}</text>", .{self._data}),
|
||||
@@ -138,7 +209,7 @@ pub fn format(self: *const CData, writer: *std.io.Writer) !void {
|
||||
}
|
||||
|
||||
/// Returns the length of this node's data in UTF-16 code units, as computed
/// by `utf16Len` over the UTF-8 bytes in `_data`.
///
/// The earlier code-point count (`utf8CountCodepoints`) is dropped: it sat in
/// front of this return and made it unreachable.
pub fn getLength(self: *const CData) usize {
    return utf16Len(self._data);
}
|
||||
|
||||
pub fn isEqualNode(self: *const CData, other: *const CData) bool {
|
||||
@@ -163,49 +234,52 @@ pub fn appendData(self: *CData, data: []const u8, page: *Page) !void {
|
||||
}
|
||||
|
||||
/// Removes `count` UTF-16 code units starting at UTF-16 offset `offset`.
///
/// Both ends of the range are translated to UTF-8 byte offsets with
/// `utf16OffsetToUtf8` before slicing. Returns `error.IndexSizeError` when
/// `offset` cannot be resolved; an end offset that cannot be resolved is
/// treated as the end of the data. `page.characterDataChange` is called with
/// the previous data after the update.
pub fn deleteData(self: *CData, offset: usize, count: usize, page: *Page) !void {
    const byte_offset = try utf16OffsetToUtf8(self._data, offset);
    // Saturating add: a huge `count` must clamp, not overflow.
    const end_utf16 = offset +| count;
    const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len;

    // Just slice - original data stays in arena
    const old_value = self._data;
    if (byte_offset == 0) {
        self._data = self._data[byte_end..];
    } else if (byte_end >= self._data.len) {
        self._data = self._data[0..byte_offset];
    } else {
        // Range is strictly inside the data: stitch the two remaining pieces.
        self._data = try std.mem.concat(page.arena, u8, &.{
            self._data[0..byte_offset],
            self._data[byte_end..],
        });
    }
    page.characterDataChange(self.asNode(), old_value);
}
|
||||
|
||||
/// Inserts `data` at UTF-16 code unit offset `offset`.
///
/// The offset is translated to a UTF-8 byte offset with `utf16OffsetToUtf8`;
/// `error.IndexSizeError` is returned when it cannot be resolved. The new
/// contents are built in `page.arena` and applied through `setData`.
pub fn insertData(self: *CData, offset: usize, data: []const u8, page: *Page) !void {
    const byte_offset = try utf16OffsetToUtf8(self._data, offset);
    const new_data = try std.mem.concat(page.arena, u8, &.{
        self._data[0..byte_offset],
        data,
        self._data[byte_offset..],
    });
    try self.setData(new_data, page);
}
|
||||
|
||||
/// Replaces `count` UTF-16 code units starting at UTF-16 offset `offset`
/// with `data`.
///
/// Both ends of the range are translated to UTF-8 byte offsets with
/// `utf16OffsetToUtf8`. Returns `error.IndexSizeError` when `offset` cannot be
/// resolved; an end offset that cannot be resolved is treated as the end of
/// the data. The new contents are built in `page.arena` and applied through
/// `setData`.
pub fn replaceData(self: *CData, offset: usize, count: usize, data: []const u8, page: *Page) !void {
    const byte_offset = try utf16OffsetToUtf8(self._data, offset);
    // Saturating add: a huge `count` must clamp, not overflow.
    const end_utf16 = offset +| count;
    const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len;
    const new_data = try std.mem.concat(page.arena, u8, &.{
        self._data[0..byte_offset],
        data,
        self._data[byte_end..],
    });
    try self.setData(new_data, page);
}
|
||||
|
||||
/// Returns the slice of the data covering `count` UTF-16 code units starting
/// at UTF-16 offset `offset`.
///
/// Both ends of the range are translated to UTF-8 byte offsets with
/// `utf16OffsetToUtf8`. Returns `error.IndexSizeError` when `offset` cannot be
/// resolved; an end offset that cannot be resolved is treated as the end of
/// the data. The result is a view into `_data`, not a copy.
pub fn substringData(self: *const CData, offset: usize, count: usize) ![]const u8 {
    const byte_offset = try utf16OffsetToUtf8(self._data, offset);
    // Saturating add: a huge `count` must clamp, not overflow.
    const end_utf16 = offset +| count;
    const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len;
    return self._data[byte_offset..byte_end];
}
|
||||
|
||||
pub fn remove(self: *CData, page: *Page) !void {
|
||||
@@ -276,7 +350,7 @@ pub const JsApi = struct {
|
||||
pub const enumerable = false;
|
||||
};
|
||||
|
||||
pub const data = bridge.accessor(CData.getData, CData.setData, .{});
|
||||
pub const data = bridge.accessor(CData.getData, CData._setData, .{});
|
||||
pub const length = bridge.accessor(CData.getLength, null, .{});
|
||||
|
||||
pub const appendData = bridge.function(CData.appendData, .{});
|
||||
@@ -341,3 +415,42 @@ test "WebApi: CData.render" {
|
||||
try std.testing.expect(result == test_case.result);
|
||||
}
|
||||
}
|
||||
|
||||
test "utf16Len" {
    // Each case pairs a UTF-8 input with the number of UTF-16 code units it
    // is expected to occupy.
    const cases = [_]struct { input: []const u8, units: usize }{
        // ASCII: 1 byte = 1 code unit each
        .{ .input = "", .units = 0 },
        .{ .input = "hello", .units = 5 },
        // CJK: 3 bytes UTF-8 = 1 UTF-16 code unit each (6 bytes, 2 code units)
        .{ .input = "資料", .units = 2 },
        // Emoji U+1F320: 4 bytes UTF-8 = 2 UTF-16 code units (surrogate pair)
        .{ .input = "🌠", .units = 2 },
        // Mixed: 🌠(2) + " test "(6) + 🌠(2) + " TEST"(5) = 15
        .{ .input = "🌠 test 🌠 TEST", .units = 15 },
        // 2-byte UTF-8 (é U+00E9) = 1 UTF-16 code unit: c + a + f + é
        .{ .input = "café", .units = 4 },
    };
    for (cases) |case| {
        try std.testing.expectEqual(case.units, utf16Len(case.input));
    }
}
|
||||
|
||||
test "utf16OffsetToUtf8" {
    // `bytes == null` means the call is expected to fail with IndexSizeError.
    const cases = [_]struct { input: []const u8, offset: usize, bytes: ?usize }{
        // ASCII: offsets map 1:1
        .{ .input = "hello", .offset = 0, .bytes = 0 },
        .{ .input = "hello", .offset = 3, .bytes = 3 },
        .{ .input = "hello", .offset = 5, .bytes = 5 }, // end
        .{ .input = "hello", .offset = 6, .bytes = null }, // past end

        // CJK "資料" (6 bytes, 2 UTF-16 code units)
        .{ .input = "資料", .offset = 0, .bytes = 0 }, // before 資
        .{ .input = "資料", .offset = 1, .bytes = 3 }, // before 料
        .{ .input = "資料", .offset = 2, .bytes = 6 }, // end
        .{ .input = "資料", .offset = 3, .bytes = null },

        // Emoji "🌠AB" (4+1+1 = 6 bytes; 2+1+1 = 4 UTF-16 code units).
        // Offset 1 falls inside the surrogate pair and is not asserted here.
        .{ .input = "🌠AB", .offset = 0, .bytes = 0 }, // before 🌠
        .{ .input = "🌠AB", .offset = 2, .bytes = 4 }, // before A
        .{ .input = "🌠AB", .offset = 3, .bytes = 5 }, // before B
        .{ .input = "🌠AB", .offset = 4, .bytes = 6 }, // end

        // Empty string: only offset 0 is valid
        .{ .input = "", .offset = 0, .bytes = 0 },
        .{ .input = "", .offset = 1, .bytes = null },
    };
    for (cases) |case| {
        const result = utf16OffsetToUtf8(case.input, case.offset);
        if (case.bytes) |bytes| {
            try std.testing.expectEqual(bytes, try result);
        } else {
            try std.testing.expectError(error.IndexSizeError, result);
        }
    }
}
|
||||
|
||||
@@ -35,15 +35,13 @@ pub fn getWholeText(self: *Text) []const u8 {
|
||||
pub fn splitText(self: *Text, offset: usize, page: *Page) !*Text {
|
||||
const data = self._proto._data;
|
||||
|
||||
if (offset > data.len) {
|
||||
return error.IndexSizeError;
|
||||
}
|
||||
const byte_offset = CData.utf16OffsetToUtf8(data, offset) catch return error.IndexSizeError;
|
||||
|
||||
const new_data = data[offset..];
|
||||
const new_data = data[byte_offset..];
|
||||
const new_node = try page.createTextNode(new_data);
|
||||
const new_text = new_node.as(Text);
|
||||
|
||||
const old_data = data[0..offset];
|
||||
const old_data = data[0..byte_offset];
|
||||
try self._proto.setData(old_data, page);
|
||||
|
||||
// If this node has a parent, insert the new node right after this one
|
||||
|
||||
Reference in New Issue
Block a user