From 015edc38481b989da5fb1fe7acd5f6c3c44f28fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= Date: Wed, 18 Mar 2026 10:56:50 +0900 Subject: [PATCH] SemanticTree: implement interactiveOnly filter and optimize token usage --- src/SemanticTree.zig | 47 +++++++++++++++++++++++++++++-------- src/browser/interactive.zig | 18 ++++++++++++-- src/cdp/domains/lp.zig | 2 ++ 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig index 3433ca6b..e9661bad 100644 --- a/src/SemanticTree.zig +++ b/src/SemanticTree.zig @@ -37,6 +37,7 @@ registry: *CDPNode.Registry, page: *Page, arena: std.mem.Allocator, prune: bool = false, +interactive_only: bool = false, pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void { var visitor = JsonVisitor{ .jw = jw, .tree = self }; @@ -174,7 +175,23 @@ fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_nam }; var should_visit = true; - if (self.prune) { + if (self.interactive_only) { + var keep = false; + if (interactive.isInteractiveRole(role)) { + keep = true; + } else if (interactive.isContentRole(role)) { + if (name != null and name.?.len > 0) { + keep = true; + } + } else if (std.mem.eql(u8, role, "RootWebArea")) { + keep = true; + } else if (is_interactive) { + keep = true; + } + if (!keep) { + should_visit = false; + } + } else if (self.prune) { if (structural and !is_interactive and !has_explicit_label) { should_visit = false; } @@ -389,36 +406,46 @@ const TextVisitor = struct { depth: usize, pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool { - // Format: " [12] link: Hacker News (value)" - for (0..(self.depth * 2)) |_| { + for (0..self.depth) |_| { try self.writer.writeByte(' '); } - try self.writer.print("[{d}] {s}: ", .{ data.id, data.role }); + var name_to_print: ?[]const u8 = null; if (data.name) |n| { if (n.len > 0) { - try self.writer.writeAll(n); + name_to_print = n; } } else if (node.is(CData.Text)) |text_node| { const trimmed = std.mem.trim(u8, text_node.getWholeText(), " \t\r\n"); if (trimmed.len > 0) { - try self.writer.writeAll(trimmed); + name_to_print = trimmed; + } + } + + const is_text_only = std.mem.eql(u8, data.role, "StaticText") or std.mem.eql(u8, data.role, "none") or std.mem.eql(u8, data.role, "generic"); + + if (is_text_only and name_to_print != null) { + try self.writer.print("{d} '{s}'", .{ data.id, name_to_print.? }); + } else { + try self.writer.print("{d} {s}", .{ data.id, data.role }); + if (name_to_print) |n| { + try self.writer.print(" '{s}'", .{n}); } } if (data.value) |v| { if (v.len > 0) { - try self.writer.print(" (value: {s})", .{v}); + try self.writer.print(" value='{s}'", .{v}); } } if (data.options) |options| { - try self.writer.writeAll(" options: ["); + try self.writer.writeAll(" options=["); for (options, 0..) |opt, i| { - if (i > 0) try self.writer.writeAll(", "); + if (i > 0) try self.writer.writeAll(","); try self.writer.print("'{s}'", .{opt.value}); if (opt.selected) { - try self.writer.writeAll(" (selected)"); + try self.writer.writeAll("*"); } } try self.writer.writeAll("]\n"); diff --git a/src/browser/interactive.zig b/src/browser/interactive.zig index 2d03db51..04a0d492 100644 --- a/src/browser/interactive.zig +++ b/src/browser/interactive.zig @@ -253,12 +253,13 @@ pub fn classifyInteractivity( return null; } -fn isInteractiveRole(role: []const u8) bool { +pub fn isInteractiveRole(role: []const u8) bool { const interactive_roles = [_][]const u8{ "button", "link", "tab", "menuitem", "menuitemcheckbox", "menuitemradio", "switch", "checkbox", "radio", "slider", "spinbutton", "searchbox", - "combobox", "option", "treeitem", + "combobox", "option", "treeitem", "textbox", + "listbox", "iframe", }; for (interactive_roles) |r| { if (std.ascii.eqlIgnoreCase(role, r)) return true; @@ -266,6 +267,19 @@ fn isInteractiveRole(role: []const u8) bool { return false; } +pub fn isContentRole(role: []const u8) bool { + const content_roles = [_][]const u8{ + "heading", "cell", "gridcell", + "columnheader", "rowheader", "listitem", + "article", "region", "main", + "navigation", + }; + for (content_roles) |r| { + if (std.ascii.eqlIgnoreCase(role, r)) return true; + } + return false; +} + fn getRole(el: *Element) ?[]const u8 { // Explicit role attribute takes precedence if (el.getAttributeSafe(comptime .wrap("role"))) |role| return role; diff --git a/src/cdp/domains/lp.zig b/src/cdp/domains/lp.zig index 19fc8cac..b0d5c574 100644 --- a/src/cdp/domains/lp.zig +++ b/src/cdp/domains/lp.zig @@ -52,6 +52,7 @@ fn getSemanticTree(cmd: anytype) !void { const Params = struct { format: ?enum { text } = null, prune: ?bool = null, + interactiveOnly: ?bool = null, }; const params = (try cmd.params(Params)) orelse Params{}; @@ -65,6 +66,7 @@ fn getSemanticTree(cmd: anytype) !void { .page = page, .arena = cmd.arena, .prune = params.prune orelse false, + .interactive_only = params.interactiveOnly orelse false, }; if (params.format) |format| {