diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig
index 3433ca6b..166acf4a 100644
--- a/src/SemanticTree.zig
+++ b/src/SemanticTree.zig
@@ -36,7 +36,8 @@ dom_node: *Node,
 registry: *CDPNode.Registry,
 page: *Page,
 arena: std.mem.Allocator,
-prune: bool = false,
+prune: bool = true,
+interactive_only: bool = false,
 
 pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void {
     var visitor = JsonVisitor{ .jw = jw, .tree = self };
@@ -174,7 +175,19 @@ fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_nam
     };
 
     var should_visit = true;
-    if (self.prune) {
+    if (self.interactive_only) {
+        // The keep conditions are OR-ed rather than chained with else-if so
+        // that a node with `is_interactive` set is kept even when its role
+        // is a content role and it has no name; in an else-if chain the
+        // content-role branch would be entered first and drop that node.
+        const keep = is_interactive or
+            interactive.isInteractiveRole(role) or
+            std.mem.eql(u8, role, "RootWebArea") or
+            (interactive.isContentRole(role) and name != null and name.?.len > 0);
+        if (!keep) {
+            should_visit = false;
+        }
+    } else if (self.prune) {
         if (structural and !is_interactive and !has_explicit_label) {
             should_visit = false;
         }
@@ -389,36 +402,47 @@ const TextVisitor = struct {
     depth: usize,
 
     pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool {
-        // Format: " [12] link: Hacker News (value)"
-        for (0..(self.depth * 2)) |_| {
+        // Format: "12 link 'Hacker News' value='...'", indented one space per
+        // depth level; the role is left out for StaticText, none and generic.
+        for (0..self.depth) |_| {
             try self.writer.writeByte(' ');
         }
-        try self.writer.print("[{d}] {s}: ", .{ data.id, data.role });
 
+        var name_to_print: ?[]const u8 = null;
         if (data.name) |n| {
             if (n.len > 0) {
-                try self.writer.writeAll(n);
+                name_to_print = n;
             }
         } else if (node.is(CData.Text)) |text_node| {
             const trimmed = std.mem.trim(u8, text_node.getWholeText(), " \t\r\n");
             if (trimmed.len > 0) {
-                try self.writer.writeAll(trimmed);
+                name_to_print = trimmed;
             }
         }
 
+        const is_text_only = std.mem.eql(u8, data.role, "StaticText") or std.mem.eql(u8, data.role, "none") or std.mem.eql(u8, data.role, "generic");
+
+        try self.writer.print("{d}", .{data.id});
+        if (!is_text_only) {
+            try self.writer.print(" {s}", .{data.role});
+        }
+        if (name_to_print) |n| {
+            try self.writer.print(" '{s}'", .{n});
+        }
+
         if (data.value) |v| {
             if (v.len > 0) {
-                try self.writer.print(" (value: {s})", .{v});
+                try self.writer.print(" value='{s}'", .{v});
             }
         }
 
         if (data.options) |options| {
-            try self.writer.writeAll(" options: [");
+            try self.writer.writeAll(" options=[");
             for (options, 0..) |opt, i| {
-                if (i > 0) try self.writer.writeAll(", ");
+                if (i > 0) try self.writer.writeAll(",");
                 try self.writer.print("'{s}'", .{opt.value});
                 if (opt.selected) {
-                    try self.writer.writeAll(" (selected)");
+                    try self.writer.writeAll("*");
                 }
             }
             try self.writer.writeAll("]\n");
diff --git a/src/browser/interactive.zig b/src/browser/interactive.zig
index 2d03db51..075e8554 100644
--- a/src/browser/interactive.zig
+++ b/src/browser/interactive.zig
@@ -253,17 +253,58 @@ pub fn classifyInteractivity(
     return null;
 }
 
-fn isInteractiveRole(role: []const u8) bool {
-    const interactive_roles = [_][]const u8{
-        "button",           "link",          "tab",        "menuitem",
-        "menuitemcheckbox", "menuitemradio", "switch",     "checkbox",
-        "radio",            "slider",        "spinbutton", "searchbox",
-        "combobox",         "option",        "treeitem",
-    };
-    for (interactive_roles) |r| {
-        if (std.ascii.eqlIgnoreCase(role, r)) return true;
-    }
-    return false;
+/// Reports whether `role` equals, ignoring ASCII case, one of the keys in the
+/// map below. `MAX_LEN` is the length of the longest key; a longer `role`
+/// cannot match and is rejected before it is lowercased into `buf`.
+pub fn isInteractiveRole(role: []const u8) bool {
+    const MAX_LEN = "menuitemcheckbox".len;
+    if (role.len > MAX_LEN) return false;
+    var buf: [MAX_LEN]u8 = undefined;
+    const lowered = std.ascii.lowerString(&buf, role);
+    const interactive_roles = std.StaticStringMap(void).initComptime(.{
+        .{ "button", {} },
+        .{ "checkbox", {} },
+        .{ "combobox", {} },
+        .{ "iframe", {} },
+        .{ "link", {} },
+        .{ "listbox", {} },
+        .{ "menuitem", {} },
+        .{ "menuitemcheckbox", {} },
+        .{ "menuitemradio", {} },
+        .{ "option", {} },
+        .{ "radio", {} },
+        .{ "searchbox", {} },
+        .{ "slider", {} },
+        .{ "spinbutton", {} },
+        .{ "switch", {} },
+        .{ "tab", {} },
+        .{ "textbox", {} },
+        .{ "treeitem", {} },
+    });
+    return interactive_roles.has(lowered);
+}
+
+/// Reports whether `role` equals, ignoring ASCII case, one of the keys in the
+/// map below. `MAX_LEN` is the length of the longest key; a longer `role`
+/// cannot match and is rejected before it is lowercased into `buf`.
+pub fn isContentRole(role: []const u8) bool {
+    const MAX_LEN = "columnheader".len;
+    if (role.len > MAX_LEN) return false;
+    var buf: [MAX_LEN]u8 = undefined;
+    const lowered = std.ascii.lowerString(&buf, role);
+    const content_roles = std.StaticStringMap(void).initComptime(.{
+        .{ "article", {} },
+        .{ "cell", {} },
+        .{ "columnheader", {} },
+        .{ "gridcell", {} },
+        .{ "heading", {} },
+        .{ "listitem", {} },
+        .{ "main", {} },
+        .{ "navigation", {} },
+        .{ "region", {} },
+        .{ "rowheader", {} },
+    });
+    return content_roles.has(lowered);
 }
 
 fn getRole(el: *Element) ?[]const u8 {
diff --git a/src/cdp/domains/lp.zig b/src/cdp/domains/lp.zig
index 19fc8cac..efbf9ec7 100644
--- a/src/cdp/domains/lp.zig
+++ b/src/cdp/domains/lp.zig
@@ -52,6 +52,7 @@ fn getSemanticTree(cmd: anytype) !void {
     const Params = struct {
         format: ?enum { text } = null,
         prune: ?bool = null,
+        interactiveOnly: ?bool = null,
     };
     const params = (try cmd.params(Params)) orelse Params{};
 
@@ -64,12 +65,12 @@ fn getSemanticTree(cmd: anytype) !void {
         .registry = &bc.node_registry,
         .page = page,
         .arena = cmd.arena,
-        .prune = params.prune orelse false,
+        .prune = params.prune orelse true,
+        .interactive_only = params.interactiveOnly orelse false,
     };
 
     if (params.format) |format| {
         if (format == .text) {
-            st.prune = params.prune orelse true;
             var aw: std.Io.Writer.Allocating = .init(cmd.arena);
             defer aw.deinit();
             try st.textStringify(&aw.writer);