Experiment with collapsing whitespace

This might break things, and in the scale of things, probably doesn't save
enough memory, BUT...if you look at the most common text nodes on a page,
you'll likely see hundreds or thousands of nodes containing just space and
newline. On an Amazon product page, for example, there are 80 text nodes
containing just a newline + 80 spaces. In fact, there are over 4000 text nodes
containing just whitespace.

Ideally, we could simply ignore them and not even generate the Node. But that's
likely to break some JavaScript (e.g. it would mess up the child count, the
firstChild, etc...). So this simply attempts to collapse the leading and
trailing whitespace (it doesn't trim them, it merely replaces \s+ with " ").

This could _still_ break some JavaScript, but seems safer. The real win here
is that " " gets interned, so it doesn't require a dupe/allocation.
This commit is contained in:
Karl Seguin
2026-01-22 15:00:32 +08:00
parent 065ca39d60
commit 7a0be11d40
11 changed files with 70 additions and 21 deletions

View File

@@ -1292,7 +1292,7 @@ pub fn appendNew(self: *Page, parent: *Node, child: Node.NodeOrText) !void {
return; return;
} }
} }
break :blk try self.createTextNode(txt); break :blk try self.createTextNode(txt, parent);
}, },
}; };
@@ -2112,10 +2112,45 @@ fn populateElementAttributes(self: *Page, element: *Element, list: anytype) !voi
} }
} }
pub fn createTextNode(self: *Page, text: []const u8) !*Node { pub fn createTextNode(self: *Page, text: []const u8, parent: ?*Node) !*Node {
var normalized = text;
if (parent) |p| {
if (text.len > 0 and text.len <= self.buf.len and canCollapseWhiteSpace(p)) {
const has_leading_ws = switch (text[0]) {
' ', '\t', '\r', '\n' => true,
else => false,
};
const has_trailing_ws = switch (text[text.len - 1]) {
' ', '\t', '\r', '\n' => true,
else => false,
};
if (has_leading_ws or has_trailing_ws) {
const trimmed = std.mem.trim(u8, text, " \t\r\n");
var idx: usize = 0;
var buf = &self.buf;
if (has_leading_ws) {
buf[idx] = ' ';
idx += 1;
}
@memcpy(buf[idx..][0..trimmed.len], trimmed);
idx += trimmed.len;
if (has_trailing_ws) {
buf[idx] = ' ';
idx += 1;
}
normalized = buf[0..idx];
}
}
}
// might seem unlikely that we get an intern hit, but we'll get some nodes // might seem unlikely that we get an intern hit, but we'll get some nodes
// with just '\n' // with just '\n'
const owned_text = try self.dupeString(text); const owned_text = try self.dupeString(normalized);
const cd = try self._factory.node(CData{ const cd = try self._factory.node(CData{
._proto = undefined, ._proto = undefined,
._type = .{ .text = .{ ._type = .{ .text = .{
@@ -2127,6 +2162,16 @@ pub fn createTextNode(self: *Page, text: []const u8) !*Node {
return cd.asNode(); return cd.asNode();
} }
/// Reports whether text placed directly under `node` may have its leading and
/// trailing whitespace collapsed.
///
/// Returns false when `node` is not an HTML element, and for script, style,
/// textarea and pre elements; returns true for every other HTML element.
fn canCollapseWhiteSpace(node: *Node) bool {
    if (node.is(Element.Html)) |html| {
        // Some of the excluded element types might be safe to collapse, but it
        // isn't worth the risk/complexity.
        switch (html._type) {
            .script, .style, .textarea, .pre => return false,
            else => return true,
        }
    }
    // Not an HTML element: leave the text untouched.
    return false;
}
pub fn createComment(self: *Page, text: []const u8) !*Node { pub fn createComment(self: *Page, text: []const u8) !*Node {
const owned_text = try self.dupeString(text); const owned_text = try self.dupeString(text);
const cd = try self._factory.node(CData{ const cd = try self._factory.node(CData{

View File

@@ -410,7 +410,7 @@ fn _appendBeforeSiblingCallback(self: *Parser, sibling: *Node, node_or_text: h5e
const parent = sibling.parentNode() orelse return error.NoParent; const parent = sibling.parentNode() orelse return error.NoParent;
const node: *Node = switch (node_or_text.toUnion()) { const node: *Node = switch (node_or_text.toUnion()) {
.node => |cpn| getNode(cpn), .node => |cpn| getNode(cpn),
.text => |txt| try self.page.createTextNode(txt), .text => |txt| try self.page.createTextNode(txt, parent),
}; };
try self.page.insertNodeRelative(parent, node, .{ .before = sibling }, .{}); try self.page.insertNodeRelative(parent, node, .{ .before = sibling }, .{});
} }

View File

@@ -54,7 +54,7 @@ pub fn createHTMLDocument(_: *const DOMImplementation, title: ?[]const u8, page:
if (title) |t| { if (title) |t| {
const title_node = try page.createElementNS(.html, "title", null); const title_node = try page.createElementNS(.html, "title", null);
_ = try head_node.appendChild(title_node, page); _ = try head_node.appendChild(title_node, page);
const text_node = try page.createTextNode(t); const text_node = try page.createTextNode(t, title_node);
_ = try title_node.appendChild(text_node, page); _ = try title_node.appendChild(text_node, page);
} }

View File

@@ -313,7 +313,7 @@ pub fn createComment(self: *Document, data: []const u8, page: *Page) !*Node {
} }
pub fn createTextNode(self: *Document, data: []const u8, page: *Page) !*Node { pub fn createTextNode(self: *Document, data: []const u8, page: *Page) !*Node {
const node = try page.createTextNode(data); const node = try page.createTextNode(data, null);
// Track owner document if it's not the main document // Track owner document if it's not the main document
if (self != page.document) { if (self != page.document) {
try page.setNodeOwnerDocument(node, self); try page.setNodeOwnerDocument(node, self);

View File

@@ -589,8 +589,9 @@ pub fn insertAdjacentText(
data: []const u8, data: []const u8,
page: *Page, page: *Page,
) !void { ) !void {
const text_node = try page.createTextNode(data); const node = self.asNode();
const target_node, const prev_node = try self.asNode().findAdjacentNodes(where); const text_node = try page.createTextNode(data, node);
const target_node, const prev_node = try node.findAdjacentNodes(where);
_ = try target_node.insertBefore(text_node, prev_node, page); _ = try target_node.insertBefore(text_node, prev_node, page);
} }

View File

@@ -676,7 +676,7 @@ pub fn cloneNode(self: *Node, deep_: ?bool, page: *Page) error{ OutOfMemory, Str
.cdata => |cd| { .cdata => |cd| {
const data = cd.getData(); const data = cd.getData();
return switch (cd._type) { return switch (cd._type) {
.text => page.createTextNode(data), .text => page.createTextNode(data, null),
.cdata_section => page.createCDATASection(data), .cdata_section => page.createCDATASection(data),
.comment => page.createComment(data), .comment => page.createComment(data),
.processing_instruction => |pi| page.createProcessingInstruction(pi._target, data), .processing_instruction => |pi| page.createProcessingInstruction(pi._target, data),
@@ -994,7 +994,7 @@ pub const NodeOrText = union(enum) {
pub fn toNode(self: *const NodeOrText, page: *Page) !*Node { pub fn toNode(self: *const NodeOrText, page: *Page) !*Node {
return switch (self.*) { return switch (self.*) {
.node => |n| n, .node => |n| n,
.text => |txt| page.createTextNode(txt), .text => |txt| page.createTextNode(txt, null),
}; };
} }
}; };

View File

@@ -322,8 +322,8 @@ pub fn insertNode(self: *Range, node: *Node, page: *Page) !void {
const before_text = text_data[0..offset]; const before_text = text_data[0..offset];
const after_text = text_data[offset..]; const after_text = text_data[offset..];
const before = try page.createTextNode(before_text); const before = try page.createTextNode(before_text, parent);
const after = try page.createTextNode(after_text); const after = try page.createTextNode(after_text, parent);
_ = try parent.replaceChild(before, container, page); _ = try parent.replaceChild(before, container, page);
_ = try parent.insertBefore(node, before.nextSibling(), page); _ = try parent.insertBefore(node, before.nextSibling(), page);
@@ -423,8 +423,9 @@ pub fn cloneContents(self: *const Range, page: *Page) !*DocumentFragment {
const text_data = self._proto._start_container.getData(); const text_data = self._proto._start_container.getData();
if (self._proto._start_offset < text_data.len and self._proto._end_offset <= text_data.len) { if (self._proto._start_offset < text_data.len and self._proto._end_offset <= text_data.len) {
const cloned_text = text_data[self._proto._start_offset..self._proto._end_offset]; const cloned_text = text_data[self._proto._start_offset..self._proto._end_offset];
const text_node = try page.createTextNode(cloned_text); const parent = fragment.asNode();
_ = try fragment.asNode().appendChild(text_node, page); const text_node = try page.createTextNode(cloned_text, parent);
_ = try parent.appendChild(text_node, page);
} }
} else { } else {
// Clone child nodes in range // Clone child nodes in range
@@ -444,7 +445,7 @@ pub fn cloneContents(self: *const Range, page: *Page) !*DocumentFragment {
if (self._proto._start_offset < text_data.len) { if (self._proto._start_offset < text_data.len) {
// Clone from start_offset to end of text // Clone from start_offset to end of text
const cloned_text = text_data[self._proto._start_offset..]; const cloned_text = text_data[self._proto._start_offset..];
const text_node = try page.createTextNode(cloned_text); const text_node = try page.createTextNode(cloned_text, null);
_ = try fragment.asNode().appendChild(text_node, page); _ = try fragment.asNode().appendChild(text_node, page);
} }
} }
@@ -465,8 +466,9 @@ pub fn cloneContents(self: *const Range, page: *Page) !*DocumentFragment {
if (self._proto._end_offset > 0 and self._proto._end_offset <= text_data.len) { if (self._proto._end_offset > 0 and self._proto._end_offset <= text_data.len) {
// Clone from start to end_offset // Clone from start to end_offset
const cloned_text = text_data[0..self._proto._end_offset]; const cloned_text = text_data[0..self._proto._end_offset];
const text_node = try page.createTextNode(cloned_text); const parent = fragment.asNode();
_ = try fragment.asNode().appendChild(text_node, page); const text_node = try page.createTextNode(cloned_text, parent);
_ = try parent.appendChild(text_node, page);
} }
} }
} }

View File

@@ -24,7 +24,7 @@ const Text = @This();
_proto: *CData, _proto: *CData,
pub fn init(str: ?[]const u8, page: *Page) !*Text { pub fn init(str: ?[]const u8, page: *Page) !*Text {
const node = try page.createTextNode(str orelse ""); const node = try page.createTextNode(str orelse "", null);
return node.as(Text); return node.as(Text);
} }
@@ -40,7 +40,7 @@ pub fn splitText(self: *Text, offset: usize, page: *Page) !*Text {
} }
const new_data = data[offset..]; const new_data = data[offset..];
const new_node = try page.createTextNode(new_data); const new_node = try page.createTextNode(new_data, null);
const new_text = new_node.as(Text); const new_text = new_node.as(Text);
const old_data = data[0..offset]; const old_data = data[0..offset];

View File

@@ -264,7 +264,7 @@ pub fn setInnerText(self: *HtmlElement, text: []const u8, page: *Page) !void {
} }
// Create and append text node // Create and append text node
const text_node = try page.createTextNode(text); const text_node = try page.createTextNode(text, parent);
try page.appendNode(parent, text_node, .{ .child_already_connected = false }); try page.appendNode(parent, text_node, .{ .child_already_connected = false });
} }

View File

@@ -79,7 +79,7 @@ pub fn setDefaultValue(self: *TextArea, value: []const u8, page: *Page) !void {
} }
// No text child exists, create one // No text child exists, create one
const text_node = try page.createTextNode(owned); const text_node = try page.createTextNode(owned, node);
_ = try node.appendChild(text_node, page); _ = try node.appendChild(text_node, page);
} }

View File

@@ -134,6 +134,7 @@ pub const String = packed struct {
// This can be used outside of the small string optimization // This can be used outside of the small string optimization
pub fn intern(input: []const u8) ?[]const u8 { pub fn intern(input: []const u8) ?[]const u8 {
switch (input.len) { switch (input.len) {
0 => return "",
1 => switch (input[0]) { 1 => switch (input[0]) {
'\n' => return "\n", '\n' => return "\n",
'\r' => return "\r", '\r' => return "\r",