Merge pull request #1611 from lightpanda-io/utf_range_offsets

Get both start and end bytes in a single pass
This commit is contained in:
Karl Seguin
2026-02-21 07:01:30 +08:00
committed by GitHub

View File

@@ -93,6 +93,48 @@ pub fn utf16OffsetToUtf8(data: []const u8, utf16_offset: usize) error{IndexSizeE
return error.IndexSizeError; return error.IndexSizeError;
} }
/// Convert a UTF-16 code unit range to UTF-8 byte offsets in a single pass.
///
/// `data` is walked one UTF-8 sequence at a time. A 4-byte sequence counts as
/// two UTF-16 code units (a surrogate pair); every other sequence counts as
/// one. A byte that is not a valid lead byte, or a sequence cut short by the
/// end of `data`, is counted as one code unit per byte so the scan always
/// advances.
///
/// Returns `error.IndexSizeError` if `utf16_start` is greater than the UTF-16
/// length of `data`, or if it falls between the two halves of a surrogate
/// pair (there is no UTF-8 byte boundary for such an offset).
///
/// `utf16_end` never fails:
///   - past the end of the string, it is clamped to `data.len`;
///   - inside a surrogate pair, it is rounded up to the end of that code
///     point (instead of silently extending the range to the end of `data`);
///   - at or before `utf16_start`, the returned range is empty.
fn utf16RangeToUtf8(data: []const u8, utf16_start: usize, utf16_end: usize) error{IndexSizeError}!struct { start: usize, end: usize } {
    var i: usize = 0;
    var utf16_pos: usize = 0;
    var byte_start: ?usize = null;

    while (i < data.len) {
        if (byte_start == null) {
            if (utf16_pos == utf16_start) {
                // Record start offset when we reach it.
                byte_start = i;
            } else if (utf16_pos > utf16_start) {
                // utf16_pos only jumps over a value when a surrogate pair
                // straddles it; such a start can never be matched later.
                return error.IndexSizeError;
            }
        }

        if (byte_start) |start| {
            // `>=` rather than `==`: an end offset inside a surrogate pair is
            // stepped over and must stop at the first boundary after it.
            if (utf16_pos >= utf16_end) {
                return .{ .start = start, .end = i };
            }
        }

        const seq_len = std.unicode.utf8ByteSequenceLength(data[i]) catch {
            // Invalid lead byte: treat it as a single code unit.
            i += 1;
            utf16_pos += 1;
            continue;
        };

        if (i + seq_len > data.len) {
            // Truncated sequence at the end of data: consume one byte.
            utf16_pos += 1;
            i += 1;
            continue;
        }

        utf16_pos += if (seq_len == 4) 2 else 1;
        i += seq_len;
    }

    // At end of string: a start equal to the total UTF-16 length is valid.
    if (byte_start == null and utf16_pos == utf16_start) {
        byte_start = i;
    }

    const start = byte_start orelse return error.IndexSizeError;

    // End was not reached inside the loop, so it is clamped to string end.
    return .{ .start = start, .end = i };
}
pub const Type = union(enum) { pub const Type = union(enum) {
text: Text, text: Text,
comment: Comment, comment: Comment,
@@ -234,20 +276,19 @@ pub fn appendData(self: *CData, data: []const u8, page: *Page) !void {
} }
pub fn deleteData(self: *CData, offset: usize, count: usize, page: *Page) !void { pub fn deleteData(self: *CData, offset: usize, count: usize, page: *Page) !void {
const byte_offset = try utf16OffsetToUtf8(self._data, offset);
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize); const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len; const range = try utf16RangeToUtf8(self._data, offset, end_utf16);
// Just slice - original data stays in arena // Just slice - original data stays in arena
const old_value = self._data; const old_value = self._data;
if (byte_offset == 0) { if (range.start == 0) {
self._data = self._data[byte_end..]; self._data = self._data[range.end..];
} else if (byte_end >= self._data.len) { } else if (range.end >= self._data.len) {
self._data = self._data[0..byte_offset]; self._data = self._data[0..range.start];
} else { } else {
self._data = try std.mem.concat(page.arena, u8, &.{ self._data = try std.mem.concat(page.arena, u8, &.{
self._data[0..byte_offset], self._data[0..range.start],
self._data[byte_end..], self._data[range.end..],
}); });
} }
page.characterDataChange(self.asNode(), old_value); page.characterDataChange(self.asNode(), old_value);
@@ -264,22 +305,20 @@ pub fn insertData(self: *CData, offset: usize, data: []const u8, page: *Page) !v
} }
pub fn replaceData(self: *CData, offset: usize, count: usize, data: []const u8, page: *Page) !void { pub fn replaceData(self: *CData, offset: usize, count: usize, data: []const u8, page: *Page) !void {
const byte_offset = try utf16OffsetToUtf8(self._data, offset);
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize); const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len; const range = try utf16RangeToUtf8(self._data, offset, end_utf16);
const new_data = try std.mem.concat(page.arena, u8, &.{ const new_data = try std.mem.concat(page.arena, u8, &.{
self._data[0..byte_offset], self._data[0..range.start],
data, data,
self._data[byte_end..], self._data[range.end..],
}); });
try self.setData(new_data, page); try self.setData(new_data, page);
} }
pub fn substringData(self: *const CData, offset: usize, count: usize) ![]const u8 { pub fn substringData(self: *const CData, offset: usize, count: usize) ![]const u8 {
const byte_offset = try utf16OffsetToUtf8(self._data, offset);
const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize); const end_utf16 = std.math.add(usize, offset, count) catch std.math.maxInt(usize);
const byte_end = utf16OffsetToUtf8(self._data, end_utf16) catch self._data.len; const range = try utf16RangeToUtf8(self._data, offset, end_utf16);
return self._data[byte_offset..byte_end]; return self._data[range.start..range.end];
} }
pub fn remove(self: *CData, page: *Page) !void { pub fn remove(self: *CData, page: *Page) !void {
@@ -460,3 +499,94 @@ test "utf16OffsetToUtf8" {
try std.testing.expectEqual(@as(usize, 0), try utf16OffsetToUtf8("", 0)); try std.testing.expectEqual(@as(usize, 0), try utf16OffsetToUtf8("", 0));
try std.testing.expectError(error.IndexSizeError, utf16OffsetToUtf8("", 1)); try std.testing.expectError(error.IndexSizeError, utf16OffsetToUtf8("", 1));
} }
test "utf16RangeToUtf8" {
    // One row per successful conversion: the input string, the requested
    // UTF-16 code unit range, and the UTF-8 byte range expected back.
    const Case = struct {
        input: []const u8,
        utf16_start: usize,
        utf16_end: usize,
        byte_start: usize,
        byte_end: usize,
    };

    const cases = [_]Case{
        // ASCII: basic range
        .{ .input = "hello", .utf16_start = 1, .utf16_end = 4, .byte_start = 1, .byte_end = 4 },
        // ASCII: range to end
        .{ .input = "hello", .utf16_start = 2, .utf16_end = 5, .byte_start = 2, .byte_end = 5 },
        // ASCII: range past end (end is clamped)
        .{ .input = "hello", .utf16_start = 2, .utf16_end = 100, .byte_start = 2, .byte_end = 5 },
        // ASCII: full range
        .{ .input = "hello", .utf16_start = 0, .utf16_end = 5, .byte_start = 0, .byte_end = 5 },

        // CJK "資料" (6 bytes, 2 UTF-16 code units)
        // first character only: ends after 資
        .{ .input = "資料", .utf16_start = 0, .utf16_end = 1, .byte_start = 0, .byte_end = 3 },
        // second character only: starts before 料, runs to the end
        .{ .input = "資料", .utf16_start = 1, .utf16_end = 2, .byte_start = 3, .byte_end = 6 },
        // both characters
        .{ .input = "資料", .utf16_start = 0, .utf16_end = 2, .byte_start = 0, .byte_end = 6 },

        // Emoji "🌠AB" (4+1+1 = 6 bytes; 2+1+1 = 4 UTF-16 code units)
        // the emoji alone: ends after 🌠
        .{ .input = "🌠AB", .utf16_start = 0, .utf16_end = 2, .byte_start = 0, .byte_end = 4 },
        // just "A": starts before A, ends before B
        .{ .input = "🌠AB", .utf16_start = 2, .utf16_end = 3, .byte_start = 4, .byte_end = 5 },
        // whole string
        .{ .input = "🌠AB", .utf16_start = 0, .utf16_end = 4, .byte_start = 0, .byte_end = 6 },

        // Empty string
        .{ .input = "", .utf16_start = 0, .utf16_end = 0, .byte_start = 0, .byte_end = 0 },
        // Empty string, end past the end (clamped)
        .{ .input = "", .utf16_start = 0, .utf16_end = 100, .byte_start = 0, .byte_end = 0 },

        // Mixed "🌠 test 🌠" (4+1+4+1+4 = 14 bytes; 2+1+4+1+2 = 10 UTF-16 code units)
        // selects "test": starts before 'test', ends before the second space
        .{ .input = "🌠 test 🌠", .utf16_start = 3, .utf16_end = 7, .byte_start = 5, .byte_end = 9 },
    };

    for (cases) |case| {
        const range = try utf16RangeToUtf8(case.input, case.utf16_start, case.utf16_end);
        try std.testing.expectEqual(case.byte_start, range.start);
        try std.testing.expectEqual(case.byte_end, range.end);
    }

    // ASCII: start past end is the one failing case
    try std.testing.expectError(error.IndexSizeError, utf16RangeToUtf8("hello", 6, 10));
}