From b8a3135835cbbb05e8b1bf22904dbe0938352d7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= Date: Mon, 9 Mar 2026 13:02:03 +0900 Subject: [PATCH] SemanticTree: add pruning support and move logic to walk --- src/SemanticTree.zig | 74 ++++++++++++++++++++---------------------- src/cdp/domains/lp.zig | 5 ++- src/mcp/tools.zig | 1 + 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig index 03b48ec4..d9b5b1e6 100644 --- a/src/SemanticTree.zig +++ b/src/SemanticTree.zig @@ -35,6 +35,7 @@ dom_node: *Node, registry: *CDPNode.Registry, page: *Page, arena: std.mem.Allocator, +prune: bool = false, pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void { var visitor = JsonVisitor{ .jw = jw, .tree = self }; @@ -155,12 +156,39 @@ fn walk(self: @This(), node: *Node, parent_xpath: []const u8, visitor: anytype) .node_name = node_name, }; - if (try visitor.visit(node, &data)) { - var it = node.childrenIterator(); - while (it.next()) |child| { - try self.walk(child, xpath, visitor); + var should_visit = true; + if (self.prune) { + const structural = isStructuralRole(role); + const has_explicit_label = if (node.is(Element)) |el| + el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null + else + false; + + if (structural and !is_interactive and !has_explicit_label) { + should_visit = false; } - try visitor.leave(node, &data); + + if (std.mem.eql(u8, role, "StaticText") and node._parent != null) { + const parent_axn = AXNode.fromNode(node._parent.?); + const parent_name = try parent_axn.getName(self.page, self.arena); + if (parent_name != null and name != null and std.mem.indexOf(u8, parent_name.?, name.?) != null) { + should_visit = false; + } + } + } + + var did_visit = false; + if (should_visit) { + did_visit = try visitor.visit(node, &data); + } + + var it = node.childrenIterator(); + while (it.next()) |child| { + try self.walk(child, xpath, visitor); + } + + if (did_visit) { + try visitor.leave(); } } @@ -264,7 +292,7 @@ const JsonVisitor = struct { return true; } - pub fn leave(self: *JsonVisitor, _: *Node, _: *NodeData) !void { + pub fn leave(self: *JsonVisitor) !void { try self.jw.endArray(); try self.jw.endObject(); } @@ -282,29 +310,6 @@ const TextVisitor = struct { depth: usize, pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool { - // Pruning Heuristic: - // If it's a structural node (none/generic) and has no unique label, unwrap it. - // We only keep 'none'/'generic' if they are interactive. - const structural = isStructuralRole(data.role); - const has_explicit_label = if (node.is(Element)) |el| - el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null - else - false; - - if (structural and !data.is_interactive and !has_explicit_label) { - // Just unwrap (don't print this node, but visit children at same depth) - return true; - } - - // Skip redundant StaticText nodes if the parent already captures the text - if (std.mem.eql(u8, data.role, "StaticText") and node._parent != null) { - const parent_axn = AXNode.fromNode(node._parent.?); - const parent_name = try parent_axn.getName(self.tree.page, self.tree.arena); - if (parent_name != null and data.name != null and std.mem.indexOf(u8, parent_name.?, data.name.?) != null) { - return false; - } - } - // Format: " [12] link: Hacker News (value)" for (0..(self.depth * 2)) |_| { try self.writer.writeByte(' '); @@ -334,16 +339,7 @@ const TextVisitor = struct { return true; } - pub fn leave(self: *TextVisitor, node: *Node, data: *NodeData) !void { - const structural = isStructuralRole(data.role); - const has_explicit_label = if (node.is(Element)) |el| - el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null - else - false; - - if (structural and !data.is_interactive and !has_explicit_label) { - return; - } + pub fn leave(self: *TextVisitor) !void { self.depth -= 1; } }; diff --git a/src/cdp/domains/lp.zig b/src/cdp/domains/lp.zig index 2470a312..6d853e76 100644 --- a/src/cdp/domains/lp.zig +++ b/src/cdp/domains/lp.zig @@ -39,6 +39,7 @@ pub fn processMessage(cmd: anytype) !void { fn getSemanticTree(cmd: anytype) !void { const Params = struct { format: ?[]const u8 = null, + prune: ?bool = null, }; const params = (try cmd.params(Params)) orelse Params{}; @@ -46,15 +47,17 @@ fn getSemanticTree(cmd: anytype) !void { const page = bc.session.currentPage() orelse return error.PageNotLoaded; const dom_node = page.document.asNode(); - const st = SemanticTree{ + var st = SemanticTree{ .dom_node = dom_node, .registry = &bc.node_registry, .page = page, .arena = cmd.arena, + .prune = params.prune orelse false, }; if (params.format) |format| { if (std.mem.eql(u8, format, "text")) { + st.prune = params.prune orelse true; // text format defaults to pruned var aw: std.Io.Writer.Allocating = .init(cmd.arena); defer aw.deinit(); try st.textStringify(&aw.writer); diff --git a/src/mcp/tools.zig b/src/mcp/tools.zig index ebdd0408..a475b987 100644 --- a/src/mcp/tools.zig +++ b/src/mcp/tools.zig @@ -134,6 +134,7 @@ const ToolStreamingText = struct { .registry = self.registry.?, .page = self.page, .arena = self.arena.?, + .prune = true, }; st.textStringify(w) catch |err| {