mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-22 12:44:43 +00:00
Merge pull request #1611 from lightpanda-io/utf_range_offsets
Get both start and end bytes in a single pass
This commit is contained in:
@@ -93,6 +93,48 @@ pub fn utf16OffsetToUtf8(data: []const u8, utf16_offset: usize) error{IndexSizeE
|
|||||||
return error.IndexSizeError;
|
return error.IndexSizeError;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert a UTF-16 code unit range to UTF-8 byte offsets in a single pass.
///
/// `data` is scanned once. Each UTF-8 sequence counts as one UTF-16 code unit,
/// except 4-byte sequences, which count as two (a surrogate pair). An invalid
/// lead byte, or a sequence truncated by the end of `data`, counts as one code
/// unit occupying one byte.
///
/// Returns IndexSizeError if utf16_start > utf16 length of data.
/// Clamps utf16_end to the actual string length if it exceeds it.
///
/// An offset that falls between the two halves of a surrogate pair has no
/// UTF-8 byte offset of its own, so it is rounded up to the byte offset just
/// past that 4-byte sequence (instead of being rejected or, for the end
/// offset, silently clamped to the end of the string). If utf16_end is less
/// than utf16_start the returned range is empty (end == start).
fn utf16RangeToUtf8(data: []const u8, utf16_start: usize, utf16_end: usize) error{IndexSizeError}!struct { start: usize, end: usize } {
    var i: usize = 0;
    var utf16_pos: usize = 0;
    var byte_start: ?usize = null;

    while (i < data.len) {
        // Record the start offset the first time we reach or pass it.
        // `>=` (not `==`) so a start inside a surrogate pair is not skipped.
        if (byte_start == null and utf16_pos >= utf16_start) {
            byte_start = i;
        }
        // Once the start is known, stop at the first boundary at or past the end.
        if (byte_start) |start| {
            if (utf16_pos >= utf16_end) {
                return .{ .start = start, .end = i };
            }
        }

        // Invalid lead bytes and truncated sequences are treated as a single
        // byte worth a single UTF-16 code unit.
        const decoded_len: usize = std.unicode.utf8ByteSequenceLength(data[i]) catch 1;
        const seq_len: usize = if (i + decoded_len > data.len) 1 else decoded_len;

        // Only 4-byte sequences encode code points that need a surrogate pair.
        utf16_pos += if (seq_len == 4) 2 else 1;
        i += seq_len;
    }

    // End of string: utf16_pos is now the total UTF-16 length and i == data.len.
    const start = byte_start orelse blk: {
        if (utf16_pos < utf16_start) return error.IndexSizeError;
        break :blk i;
    };
    // End is clamped to the end of the string.
    return .{ .start = start, .end = i };
}
|
||||||
|
|
||||||
pub const Type = union(enum) {
|
pub const Type = union(enum) {
|
||||||
text: Text,
|
text: Text,
|
||||||
comment: Comment,
|
comment: Comment,
|
||||||
@@ -234,20 +276,19 @@ pub fn appendData(self: *CData, data: []const u8, page: *Page) !void {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn deleteData(self: *CData, offset: usize, count: usize, page: *Page) !void {
|
pub fn deleteData(self: *CData, offset: usize, count: usize, page: *Page) !void {
|
||||||
const byte_offset = try utf16OffsetToUtf8(self._data, offset);
|
|
||||||
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
|
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
|
||||||
const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len;
|
const range = try utf16RangeToUtf8(self._data, offset, end_utf16);
|
||||||
|
|
||||||
// Just slice - original data stays in arena
|
// Just slice - original data stays in arena
|
||||||
const old_value = self._data;
|
const old_value = self._data;
|
||||||
if (byte_offset == 0) {
|
if (range.start == 0) {
|
||||||
self._data = self._data[byte_end..];
|
self._data = self._data[range.end..];
|
||||||
} else if (byte_end >= self._data.len) {
|
} else if (range.end >= self._data.len) {
|
||||||
self._data = self._data[0..byte_offset];
|
self._data = self._data[0..range.start];
|
||||||
} else {
|
} else {
|
||||||
self._data = try std.mem.concat(page.arena, u8, &.{
|
self._data = try std.mem.concat(page.arena, u8, &.{
|
||||||
self._data[0..byte_offset],
|
self._data[0..range.start],
|
||||||
self._data[byte_end..],
|
self._data[range.end..],
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
page.characterDataChange(self.asNode(), old_value);
|
page.characterDataChange(self.asNode(), old_value);
|
||||||
@@ -264,22 +305,20 @@ pub fn insertData(self: *CData, offset: usize, data: []const u8, page: *Page) !v
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn replaceData(self: *CData, offset: usize, count: usize, data: []const u8, page: *Page) !void {
|
pub fn replaceData(self: *CData, offset: usize, count: usize, data: []const u8, page: *Page) !void {
|
||||||
const byte_offset = try utf16OffsetToUtf8(self._data, offset);
|
|
||||||
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
|
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
|
||||||
const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len;
|
const range = try utf16RangeToUtf8(self._data, offset, end_utf16);
|
||||||
const new_data = try std.mem.concat(page.arena, u8, &.{
|
const new_data = try std.mem.concat(page.arena, u8, &.{
|
||||||
self._data[0..byte_offset],
|
self._data[0..range.start],
|
||||||
data,
|
data,
|
||||||
self._data[byte_end..],
|
self._data[range.end..],
|
||||||
});
|
});
|
||||||
try self.setData(new_data, page);
|
try self.setData(new_data, page);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn substringData(self: *const CData, offset: usize, count: usize) ![]const u8 {
|
pub fn substringData(self: *const CData, offset: usize, count: usize) ![]const u8 {
|
||||||
const byte_offset = try utf16OffsetToUtf8(self._data, offset);
|
|
||||||
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
|
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
|
||||||
const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len;
|
const range = try utf16RangeToUtf8(self._data, offset, end_utf16);
|
||||||
return self._data[byte_offset..byte_end];
|
return self._data[range.start..range.end];
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remove(self: *CData, page: *Page) !void {
|
pub fn remove(self: *CData, page: *Page) !void {
|
||||||
@@ -460,3 +499,94 @@ test "utf16OffsetToUtf8" {
|
|||||||
try std.testing.expectEqual(@as(usize, 0), try utf16OffsetToUtf8("", 0));
|
try std.testing.expectEqual(@as(usize, 0), try utf16OffsetToUtf8("", 0));
|
||||||
try std.testing.expectError(error.IndexSizeError, utf16OffsetToUtf8("", 1));
|
try std.testing.expectError(error.IndexSizeError, utf16OffsetToUtf8("", 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test "utf16RangeToUtf8" {
    // Each case maps a UTF-16 code unit range onto the expected UTF-8 byte range.
    const Case = struct {
        input: []const u8,
        utf16_start: usize,
        utf16_end: usize,
        byte_start: usize,
        byte_end: usize,
    };

    const cases = [_]Case{
        // ASCII: basic range, range to end, range past end (clamped), full range
        .{ .input = "hello", .utf16_start = 1, .utf16_end = 4, .byte_start = 1, .byte_end = 4 },
        .{ .input = "hello", .utf16_start = 2, .utf16_end = 5, .byte_start = 2, .byte_end = 5 },
        .{ .input = "hello", .utf16_start = 2, .utf16_end = 100, .byte_start = 2, .byte_end = 5 },
        .{ .input = "hello", .utf16_start = 0, .utf16_end = 5, .byte_start = 0, .byte_end = 5 },

        // CJK "資料" (6 bytes, 2 UTF-16 code units)
        .{ .input = "資料", .utf16_start = 0, .utf16_end = 1, .byte_start = 0, .byte_end = 3 },
        .{ .input = "資料", .utf16_start = 1, .utf16_end = 2, .byte_start = 3, .byte_end = 6 },
        .{ .input = "資料", .utf16_start = 0, .utf16_end = 2, .byte_start = 0, .byte_end = 6 },

        // Emoji "🌠AB" (4+1+1 = 6 bytes; 2+1+1 = 4 UTF-16 code units)
        .{ .input = "🌠AB", .utf16_start = 0, .utf16_end = 2, .byte_start = 0, .byte_end = 4 },
        .{ .input = "🌠AB", .utf16_start = 2, .utf16_end = 3, .byte_start = 4, .byte_end = 5 },
        .{ .input = "🌠AB", .utf16_start = 0, .utf16_end = 4, .byte_start = 0, .byte_end = 6 },

        // Empty string, exact and clamped end
        .{ .input = "", .utf16_start = 0, .utf16_end = 0, .byte_start = 0, .byte_end = 0 },
        .{ .input = "", .utf16_start = 0, .utf16_end = 100, .byte_start = 0, .byte_end = 0 },

        // Mixed "🌠 test 🌠" (4+1+4+1+4 = 14 bytes; 2+1+4+1+2 = 10 UTF-16 code units)
        // The range covers exactly 'test'.
        .{ .input = "🌠 test 🌠", .utf16_start = 3, .utf16_end = 7, .byte_start = 5, .byte_end = 9 },
    };

    for (cases) |case| {
        const range = try utf16RangeToUtf8(case.input, case.utf16_start, case.utf16_end);
        try std.testing.expectEqual(case.byte_start, range.start);
        try std.testing.expectEqual(case.byte_end, range.end);
    }

    // ASCII: start past end
    try std.testing.expectError(error.IndexSizeError, utf16RangeToUtf8("hello", 6, 10));
}
|
||||||
|
|||||||
Reference in New Issue
Block a user