SemanticTree: add pruning support and move the pruning logic from TextVisitor into walk

This commit is contained in:
Adrià Arrufat
2026-03-09 13:02:03 +09:00
parent 330dfccb89
commit b8a3135835
3 changed files with 40 additions and 40 deletions

View File

@@ -35,6 +35,7 @@ dom_node: *Node,
registry: *CDPNode.Registry, registry: *CDPNode.Registry,
page: *Page, page: *Page,
arena: std.mem.Allocator, arena: std.mem.Allocator,
prune: bool = false,
pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void { pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void {
var visitor = JsonVisitor{ .jw = jw, .tree = self }; var visitor = JsonVisitor{ .jw = jw, .tree = self };
@@ -155,12 +156,39 @@ fn walk(self: @This(), node: *Node, parent_xpath: []const u8, visitor: anytype)
.node_name = node_name, .node_name = node_name,
}; };
if (try visitor.visit(node, &data)) { var should_visit = true;
if (self.prune) {
const structural = isStructuralRole(role);
const has_explicit_label = if (node.is(Element)) |el|
el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null
else
false;
if (structural and !is_interactive and !has_explicit_label) {
should_visit = false;
}
if (std.mem.eql(u8, role, "StaticText") and node._parent != null) {
const parent_axn = AXNode.fromNode(node._parent.?);
const parent_name = try parent_axn.getName(self.page, self.arena);
if (parent_name != null and name != null and std.mem.indexOf(u8, parent_name.?, name.?) != null) {
should_visit = false;
}
}
}
var did_visit = false;
if (should_visit) {
did_visit = try visitor.visit(node, &data);
}
var it = node.childrenIterator(); var it = node.childrenIterator();
while (it.next()) |child| { while (it.next()) |child| {
try self.walk(child, xpath, visitor); try self.walk(child, xpath, visitor);
} }
try visitor.leave(node, &data);
if (did_visit) {
try visitor.leave();
} }
} }
@@ -264,7 +292,7 @@ const JsonVisitor = struct {
return true; return true;
} }
pub fn leave(self: *JsonVisitor, _: *Node, _: *NodeData) !void { pub fn leave(self: *JsonVisitor) !void {
try self.jw.endArray(); try self.jw.endArray();
try self.jw.endObject(); try self.jw.endObject();
} }
@@ -282,29 +310,6 @@ const TextVisitor = struct {
depth: usize, depth: usize,
pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool { pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool {
// Pruning Heuristic:
// If it's a structural node (none/generic) and has no unique label, unwrap it.
// We only keep 'none'/'generic' if they are interactive.
const structural = isStructuralRole(data.role);
const has_explicit_label = if (node.is(Element)) |el|
el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null
else
false;
if (structural and !data.is_interactive and !has_explicit_label) {
// Just unwrap (don't print this node, but visit children at same depth)
return true;
}
// Skip redundant StaticText nodes if the parent already captures the text
if (std.mem.eql(u8, data.role, "StaticText") and node._parent != null) {
const parent_axn = AXNode.fromNode(node._parent.?);
const parent_name = try parent_axn.getName(self.tree.page, self.tree.arena);
if (parent_name != null and data.name != null and std.mem.indexOf(u8, parent_name.?, data.name.?) != null) {
return false;
}
}
// Format: " [12] link: Hacker News (value)" // Format: " [12] link: Hacker News (value)"
for (0..(self.depth * 2)) |_| { for (0..(self.depth * 2)) |_| {
try self.writer.writeByte(' '); try self.writer.writeByte(' ');
@@ -334,16 +339,7 @@ const TextVisitor = struct {
return true; return true;
} }
pub fn leave(self: *TextVisitor, node: *Node, data: *NodeData) !void { pub fn leave(self: *TextVisitor) !void {
const structural = isStructuralRole(data.role);
const has_explicit_label = if (node.is(Element)) |el|
el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null
else
false;
if (structural and !data.is_interactive and !has_explicit_label) {
return;
}
self.depth -= 1; self.depth -= 1;
} }
}; };

View File

@@ -39,6 +39,7 @@ pub fn processMessage(cmd: anytype) !void {
fn getSemanticTree(cmd: anytype) !void { fn getSemanticTree(cmd: anytype) !void {
const Params = struct { const Params = struct {
format: ?[]const u8 = null, format: ?[]const u8 = null,
prune: ?bool = null,
}; };
const params = (try cmd.params(Params)) orelse Params{}; const params = (try cmd.params(Params)) orelse Params{};
@@ -46,15 +47,17 @@ fn getSemanticTree(cmd: anytype) !void {
const page = bc.session.currentPage() orelse return error.PageNotLoaded; const page = bc.session.currentPage() orelse return error.PageNotLoaded;
const dom_node = page.document.asNode(); const dom_node = page.document.asNode();
const st = SemanticTree{ var st = SemanticTree{
.dom_node = dom_node, .dom_node = dom_node,
.registry = &bc.node_registry, .registry = &bc.node_registry,
.page = page, .page = page,
.arena = cmd.arena, .arena = cmd.arena,
.prune = params.prune orelse false,
}; };
if (params.format) |format| { if (params.format) |format| {
if (std.mem.eql(u8, format, "text")) { if (std.mem.eql(u8, format, "text")) {
st.prune = params.prune orelse true; // text format defaults to pruned
var aw: std.Io.Writer.Allocating = .init(cmd.arena); var aw: std.Io.Writer.Allocating = .init(cmd.arena);
defer aw.deinit(); defer aw.deinit();
try st.textStringify(&aw.writer); try st.textStringify(&aw.writer);

View File

@@ -134,6 +134,7 @@ const ToolStreamingText = struct {
.registry = self.registry.?, .registry = self.registry.?,
.page = self.page, .page = self.page,
.arena = self.arena.?, .arena = self.arena.?,
.prune = true,
}; };
st.textStringify(w) catch |err| { st.textStringify(w) catch |err| {