// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier
// Pierre Tachoire
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. See <https://www.gnu.org/licenses/>.

// Semantic tree dump: walks a DOM subtree and reports, for every node that
// survives filtering, its registry id, accessibility role/name, form value,
// XPath and interactivity. Two output formats are provided: JSON
// (`jsonStringify`) and an indented text outline (`textStringify`).

const std = @import("std");

const lp = @import("lightpanda");
const log = @import("log.zig");
const isAllWhitespace = @import("string.zig").isAllWhitespace;
const Page = lp.Page;
const interactive = @import("browser/interactive.zig");

const CData = @import("browser/webapi/CData.zig");
const Element = @import("browser/webapi/Element.zig");
const Node = @import("browser/webapi/Node.zig");
const AXNode = @import("cdp/AXNode.zig");
const CDPNode = @import("cdp/Node.zig");

const Self = @This();

/// Root of the subtree to dump.
dom_node: *Node,
/// Registry every walked node is registered in; the resulting id is emitted.
registry: *CDPNode.Registry,
/// Page used for visibility checks, accessible-name computation, listener
/// lookup and datalist resolution.
page: *Page,
/// Allocator for everything allocated during a dump (xpath buffer, option
/// lists, sibling counters, ...). Nothing allocated here is freed by this file.
arena: std.mem.Allocator,
/// When true (and `interactive_only` is false), structural nodes that are
/// neither interactive nor explicitly labelled are not emitted, and neither are
/// StaticText nodes whose name is already contained in the parent's name.
/// Children of a skipped node are still walked.
prune: bool = true,
/// When true, only nodes with an interactive role, content roles with a
/// non-empty name, the `RootWebArea` and nodes classified as interactive are
/// emitted. Takes precedence over `prune`.
interactive_only: bool = false,
/// Deepest level that is walked; the root is at depth 0.
max_depth: u32 = std.math.maxInt(u32) - 1,

/// Writes the tree as nested JSON objects to `jw`.
///
/// Any error from building the listener map or from the walk is logged and
/// reported as `error.WriteFailed`.
pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void {
    var visitor = JsonVisitor{ .jw = jw, .tree = self };
    var xpath_buffer: std.ArrayList(u8) = .empty;
    const listener_targets = interactive.buildListenerTargetMap(self.page, self.arena) catch |err| {
        log.err(.app, "listener map failed", .{ .err = err });
        return error.WriteFailed;
    };
    self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1, listener_targets, 0) catch |err| {
        log.err(.app, "semantic tree json dump failed", .{ .err = err });
        return error.WriteFailed;
    };
}

/// Writes the tree as an indented text outline (one node per line) to `writer`.
///
/// Any error from building the listener map or from the walk is logged and
/// reported as `error.WriteFailed`.
pub fn textStringify(self: @This(), writer: *std.Io.Writer) error{WriteFailed}!void {
    var visitor = TextVisitor{ .writer = writer, .tree = self, .depth = 0 };
    var xpath_buffer: std.ArrayList(u8) = .empty;
    const listener_targets = interactive.buildListenerTargetMap(self.page, self.arena) catch |err| {
        log.err(.app, "listener map failed", .{ .err = err });
        return error.WriteFailed;
    };
    self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1, listener_targets, 0) catch |err| {
        log.err(.app, "semantic tree text dump failed", .{ .err = err });
        return error.WriteFailed;
    };
}

/// One `<option>` collected from a `<select>` or a referenced `<datalist>`.
const OptionData = struct {
    value: []const u8,
    text: []const u8,
    selected: bool,
};

/// Everything a visitor needs to render one node.
const NodeData = struct {
    id: CDPNode.Id,
    axn: AXNode,
    role: []const u8,
    name: ?[]const u8,
    value: ?[]const u8,
    options: ?[]OptionData = null,
    /// Slice into the shared xpath buffer; only valid during `visit`.
    xpath: []const u8,
    is_interactive: bool,
    node_name: []const u8,
};

/// Depth-first walk of `node` and its descendants.
///
/// `visitor` must provide `visit(node, *NodeData) !bool` (returning whether the
/// children should be walked) and `leave() !void`. `index` is the 1-based
/// position of `node` among the siblings sharing its XPath node test.
/// `parent_name` is the (possibly filtered) accessible name of the parent and
/// is used to prune redundant StaticText.
fn walk(
    self: @This(),
    node: *Node,
    xpath_buffer: *std.ArrayList(u8),
    parent_name: ?[]const u8,
    visitor: anytype,
    index: usize,
    listener_targets: interactive.ListenerTargetMap,
    current_depth: u32,
) !void {
    if (current_depth > self.max_depth) return;

    // 1. Skip non-content nodes
    if (node.is(Element)) |el| {
        const tag = el.getTag();
        if (tag.isMetadata() or tag == .svg) return;

        // We handle options/optgroups natively inside their parents, skip them in the general walk
        if (tag == .datalist or tag == .option or tag == .optgroup) return;

        // Check visibility using the engine's checkVisibility which handles CSS display: none
        if (!el.checkVisibility(self.page)) return;

        if (el.is(Element.Html)) |html_el| {
            if (html_el.getHidden()) return;
        }
    } else if (node.is(CData.Text)) |text_node| {
        if (isAllWhitespace(text_node.getWholeText())) return;
    } else if (node._type != .document and node._type != .document_fragment) {
        return;
    }

    // 2. Collect the node's data
    const cdp_node = try self.registry.register(node);
    const axn = AXNode.fromNode(node);
    const role = try axn.getRole();

    var is_interactive = false;
    var value: ?[]const u8 = null;
    var options: ?[]OptionData = null;
    var node_name: []const u8 = "text";

    if (node.is(Element)) |el| {
        node_name = el.getTagNameLower();

        if (el.is(Element.Html.Input)) |input| {
            value = input.getValue();
            if (el.getAttributeSafe(comptime lp.String.wrap("list"))) |list_id| {
                options = try extractDataListOptions(list_id, self.page, self.arena);
            }
        } else if (el.is(Element.Html.TextArea)) |textarea| {
            value = textarea.getValue();
        } else if (el.is(Element.Html.Select)) |select| {
            value = select.getValue(self.page);
            options = try extractSelectOptions(el.asNode(), self.page, self.arena);
        }

        if (el.is(Element.Html)) |html_el| {
            if (interactive.classifyInteractivity(el, html_el, listener_targets) != null) {
                is_interactive = true;
            }
        }
    } else if (node._type == .document or node._type == .document_fragment) {
        node_name = "root";
    }

    // The xpath buffer is shared by the whole walk: append this node's segment
    // now and drop it again when leaving, whatever the exit path.
    const initial_xpath_len = xpath_buffer.items.len;
    defer xpath_buffer.shrinkRetainingCapacity(initial_xpath_len);
    try appendXPathSegment(node, xpath_buffer.writer(self.arena), index);
    const xpath = xpath_buffer.items;

    var name = try axn.getName(self.page, self.arena);

    const has_explicit_label = if (node.is(Element)) |el|
        el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null
    else
        false;

    const structural = isStructuralRole(role);

    // Filter out computed concatenated names for generic containers without explicit labels.
    // This prevents token bloat and ensures their StaticText children aren't incorrectly pruned.
    // We ignore interactivity because a generic wrapper with an event listener still shouldn't hoist all text.
    if (name != null and structural and !has_explicit_label) {
        name = null;
    }

    var data = NodeData{
        .id = cdp_node.id,
        .axn = axn,
        .role = role,
        .name = name,
        .value = value,
        .options = options,
        .xpath = xpath,
        .is_interactive = is_interactive,
        .node_name = node_name,
    };

    // 3. Decide whether this node is emitted
    var should_visit = true;
    if (self.interactive_only) {
        var keep = false;
        if (interactive.isInteractiveRole(role)) {
            keep = true;
        } else if (interactive.isContentRole(role)) {
            if (name != null and name.?.len > 0) {
                keep = true;
            }
        } else if (std.mem.eql(u8, role, "RootWebArea")) {
            keep = true;
        } else if (is_interactive) {
            keep = true;
        }
        if (!keep) {
            should_visit = false;
        }
    } else if (self.prune) {
        if (structural and !is_interactive and !has_explicit_label) {
            should_visit = false;
        }

        if (std.mem.eql(u8, role, "StaticText") and node._parent != null) {
            if (parent_name != null and name != null and std.mem.indexOf(u8, parent_name.?, name.?) != null) {
                should_visit = false;
            }
        }
    }

    // A skipped node is never opened by the visitor, so it must not be closed
    // either; its children are still walked ("unrolled" into the parent).
    var should_walk_children = true;
    if (should_visit) {
        should_walk_children = try visitor.visit(node, &data);
    }

    if (should_walk_children) {
        var it = node.childrenIterator();
        var tag_counts = std.StringArrayHashMap(usize).init(self.arena);
        while (it.next()) |child| {
            // XPath positions are counted per node test. Only text nodes may
            // advance the text() counter; comments and other non-element
            // children get their own bucket so they cannot shift text() indexes.
            const tag: []const u8 = if (child.is(Element)) |el|
                el.getTagNameLower()
            else if (child.is(CData.Text)) |_|
                "text()"
            else
                "node()";

            const gop = try tag_counts.getOrPut(tag);
            if (!gop.found_existing) {
                gop.value_ptr.* = 0;
            }
            gop.value_ptr.* += 1;

            try self.walk(child, xpath_buffer, name, visitor, gop.value_ptr.*, listener_targets, current_depth + 1);
        }
    }

    if (should_visit) {
        try visitor.leave();
    }
}

/// Collects the `<option>` children of `node`, including those nested one
/// level down inside `<optgroup>` children. The returned slice is allocated
/// from `arena`.
fn extractSelectOptions(node: *Node, page: *Page, arena: std.mem.Allocator) ![]OptionData {
    var options: std.ArrayListUnmanaged(OptionData) = .empty;
    var it = node.childrenIterator();
    while (it.next()) |child| {
        const el = child.is(Element) orelse continue;
        if (el.getTag() == .option) {
            if (el.is(Element.Html.Option)) |opt| {
                const text = opt.getText(page);
                const value = opt.getValue(page);
                const selected = opt.getSelected();
                try options.append(arena, .{ .text = text, .value = value, .selected = selected });
            }
        } else if (el.getTag() == .optgroup) {
            var group_it = child.childrenIterator();
            while (group_it.next()) |group_child| {
                if (group_child.is(Element.Html.Option)) |opt| {
                    const text = opt.getText(page);
                    const value = opt.getValue(page);
                    const selected = opt.getSelected();
                    try options.append(arena, .{ .text = text, .value = value, .selected = selected });
                }
            }
        }
    }
    return options.toOwnedSlice(arena);
}

/// Resolves `list_id` in the page's document and, when it names a
/// `<datalist>`, returns its options. Returns null when no element has that id
/// or the element is not a `<datalist>`.
fn extractDataListOptions(list_id: []const u8, page: *Page, arena: std.mem.Allocator) !?[]OptionData {
    if (page.document.getElementById(list_id, page)) |referenced_el| {
        if (referenced_el.getTag() == .datalist) {
            return try extractSelectOptions(referenced_el.asNode(), page, arena);
        }
    }
    return null;
}

/// Appends the XPath step for `node` to `writer`: `/tag[index]` for elements,
/// `/text()[index]` for text nodes. Other nodes (e.g. the document root)
/// contribute nothing.
fn appendXPathSegment(node: *Node, writer: anytype, index: usize) !void {
    if (node.is(Element)) |el| {
        const tag = el.getTagNameLower();
        try std.fmt.format(writer, "/{s}[{d}]", .{ tag, index });
    } else if (node.is(CData.Text)) |_| {
        try std.fmt.format(writer, "/text()[{d}]", .{index});
    }
}

/// Visitor that renders every node as a JSON object with a `children` array.
const JsonVisitor = struct {
    jw: *std.json.Stringify,
    tree: Self,

    /// Opens the object for `node` and its `children` array; `leave` closes
    /// both. Returns false when the children were already rendered as
    /// `options` and must not be walked.
    pub fn visit(self: *JsonVisitor, node: *Node, data: *NodeData) !bool {
        try self.jw.beginObject();

        try self.jw.objectField("nodeId");
        try self.jw.write(try std.fmt.allocPrint(self.tree.arena, "{d}", .{data.id}));

        try self.jw.objectField("backendDOMNodeId");
        try self.jw.write(data.id);

        try self.jw.objectField("nodeName");
        try self.jw.write(data.node_name);

        try self.jw.objectField("xpath");
        try self.jw.write(data.xpath);

        if (node.is(Element)) |el| {
            try self.jw.objectField("nodeType");
            try self.jw.write(1);

            try self.jw.objectField("isInteractive");
            try self.jw.write(data.is_interactive);

            try self.jw.objectField("role");
            try self.jw.write(data.role);

            if (data.name) |name| {
                if (name.len > 0) {
                    try self.jw.objectField("name");
                    try self.jw.write(name);
                }
            }

            if (data.value) |value| {
                try self.jw.objectField("value");
                try self.jw.write(value);
            }

            if (el._attributes) |attrs| {
                try self.jw.objectField("attributes");
                try self.jw.beginObject();
                var iter = attrs.iterator();
                while (iter.next()) |attr| {
                    try self.jw.objectField(attr._name.str());
                    try self.jw.write(attr._value.str());
                }
                try self.jw.endObject();
            }

            if (data.options) |options| {
                try self.jw.objectField("options");
                try self.jw.beginArray();
                for (options) |opt| {
                    try self.jw.beginObject();
                    try self.jw.objectField("value");
                    try self.jw.write(opt.value);
                    try self.jw.objectField("text");
                    try self.jw.write(opt.text);
                    try self.jw.objectField("selected");
                    try self.jw.write(opt.selected);
                    try self.jw.endObject();
                }
                try self.jw.endArray();
            }
        } else if (node.is(CData.Text)) |text_node| {
            try self.jw.objectField("nodeType");
            try self.jw.write(3);
            try self.jw.objectField("nodeValue");
            try self.jw.write(text_node.getWholeText());
        } else {
            // Only documents and document fragments reach this branch; use the
            // matching DOM nodeType constant (DOCUMENT_NODE = 9,
            // DOCUMENT_FRAGMENT_NODE = 11).
            const node_type: u8 = if (node._type == .document_fragment) 11 else 9;
            try self.jw.objectField("nodeType");
            try self.jw.write(node_type);
        }

        try self.jw.objectField("children");
        try self.jw.beginArray();

        if (data.options != null) {
            // Signal to not walk children, as we handled them natively
            return false;
        }

        return true;
    }

    /// Closes the `children` array and the object opened by `visit`.
    pub fn leave(self: *JsonVisitor) !void {
        try self.jw.endArray();
        try self.jw.endObject();
    }
};

/// Reports whether `role` is one of the container/layout roles that `walk`
/// treats as structural (pruned unless interactive or explicitly labelled).
fn isStructuralRole(role: []const u8) bool {
    const structural_roles = std.StaticStringMap(void).initComptime(.{
        .{ "none", {} },
        .{ "generic", {} },
        .{ "InlineTextBox", {} },
        .{ "banner", {} },
        .{ "navigation", {} },
        .{ "main", {} },
        .{ "list", {} },
        .{ "listitem", {} },
        .{ "table", {} },
        .{ "rowgroup", {} },
        .{ "row", {} },
        .{ "cell", {} },
        .{ "region", {} },
    });
    return structural_roles.has(role);
}

/// Visitor that renders every node as one line of text, indented by one space
/// per level of emitted ancestors.
const TextVisitor = struct {
    writer: *std.Io.Writer,
    tree: Self,
    depth: usize,

    /// Writes the line for `node` and increases the depth; `leave` decreases
    /// it again. Returns false when the children must not be walked (options
    /// already printed, or a named leaf-like role).
    pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool {
        for (0..self.depth) |_| {
            try self.writer.writeByte(' ');
        }

        // Prefer the accessible name; fall back to the trimmed text content
        // for text nodes that have no name at all.
        var name_to_print: ?[]const u8 = null;
        if (data.name) |n| {
            if (n.len > 0) {
                name_to_print = n;
            }
        } else if (node.is(CData.Text)) |text_node| {
            const trimmed = std.mem.trim(u8, text_node.getWholeText(), " \t\r\n");
            if (trimmed.len > 0) {
                name_to_print = trimmed;
            }
        }

        // The role is omitted for these roles; only id and text are printed.
        const is_text_only = std.mem.eql(u8, data.role, "StaticText") or
            std.mem.eql(u8, data.role, "none") or
            std.mem.eql(u8, data.role, "generic");

        try self.writer.print("{d}", .{data.id});
        if (!is_text_only) {
            try self.writer.print(" {s}", .{data.role});
        }
        if (name_to_print) |n| {
            try self.writer.print(" '{s}'", .{n});
        }

        if (data.value) |v| {
            if (v.len > 0) {
                try self.writer.print(" value='{s}'", .{v});
            }
        }

        if (data.options) |options| {
            // Selected options are marked with a trailing '*'.
            try self.writer.writeAll(" options=[");
            for (options, 0..) |opt, i| {
                if (i > 0) try self.writer.writeAll(",");
                try self.writer.print("'{s}'", .{opt.value});
                if (opt.selected) {
                    try self.writer.writeAll("*");
                }
            }
            try self.writer.writeAll("]\n");
            self.depth += 1;
            return false; // Native handling complete, do not walk children
        }

        try self.writer.writeByte('\n');
        self.depth += 1;

        // If this is a leaf-like semantic node and we already have a name,
        // skip children to avoid redundant StaticText or noise.
        const is_leaf_semantic = std.mem.eql(u8, data.role, "link") or
            std.mem.eql(u8, data.role, "button") or
            std.mem.eql(u8, data.role, "heading") or
            std.mem.eql(u8, data.role, "code");
        if (is_leaf_semantic and data.name != null and data.name.?.len > 0) {
            return false;
        }

        return true;
    }

    /// Undoes the depth increase made by the matching `visit`.
    pub fn leave(self: *TextVisitor) !void {
        if (self.depth > 0) {
            self.depth -= 1;
        }
    }
};

const testing = @import("testing.zig");

test "SemanticTree backendDOMNodeId" {
    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

    var page = try testing.pageTest("cdp/registry1.html");
    defer testing.reset();
    defer page._session.removePage();

    const st: Self = .{
        .dom_node = page.window._document.asNode(),
        .registry = &registry,
        .page = page,
        .arena = testing.arena_allocator,
        .prune = false,
        .interactive_only = false,
        .max_depth = std.math.maxInt(u32) - 1,
    };

    const json_str = try std.json.Stringify.valueAlloc(testing.allocator, st, .{});
    defer testing.allocator.free(json_str);

    try testing.expect(std.mem.indexOf(u8, json_str, "\"backendDOMNodeId\":") != null);
}

test "SemanticTree max_depth" {
    var registry: CDPNode.Registry = .init(testing.allocator);
    defer registry.deinit();

    var page = try testing.pageTest("cdp/registry1.html");
    defer testing.reset();
    defer page._session.removePage();

    const st: Self = .{
        .dom_node = page.window._document.asNode(),
        .registry = &registry,
        .page = page,
        .arena = testing.arena_allocator,
        .prune = false,
        .interactive_only = false,
        .max_depth = 1,
    };

    var aw: std.Io.Writer.Allocating = .init(testing.allocator);
    defer aw.deinit();

    try st.textStringify(&aw.writer);
    const text_str = aw.written();

    try testing.expect(std.mem.indexOf(u8, text_str, "other") == null);
}