diff --git a/src/Config.zig b/src/Config.zig index a06fcc51..73c7f3a7 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -200,6 +200,8 @@ pub const DumpFormat = enum { html, markdown, wpt, + semantic_tree, + semantic_tree_text, }; pub const Fetch = struct { @@ -346,7 +348,7 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void { \\ \\Options: \\--dump Dumps document to stdout. - \\ Argument must be 'html' or 'markdown'. + \\ Argument must be 'html', 'markdown', 'semantic_tree', or 'semantic_tree_text'. \\ Defaults to no dump. \\ \\--strip_mode Comma separated list of tag groups to remove from dump diff --git a/src/SemanticTree.zig b/src/SemanticTree.zig new file mode 100644 index 00000000..8f5eb755 --- /dev/null +++ b/src/SemanticTree.zig @@ -0,0 +1,450 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. See <https://www.gnu.org/licenses/>.
+ +const std = @import("std"); + +const lp = @import("lightpanda"); +const log = @import("log.zig"); +const isAllWhitespace = @import("string.zig").isAllWhitespace; +const Page = lp.Page; +const interactive = @import("browser/interactive.zig"); + +const CData = @import("browser/webapi/CData.zig"); +const Element = @import("browser/webapi/Element.zig"); +const Node = @import("browser/webapi/Node.zig"); +const AXNode = @import("cdp/AXNode.zig"); +const CDPNode = @import("cdp/Node.zig"); + +const Self = @This(); + +dom_node: *Node, +registry: *CDPNode.Registry, +page: *Page, +arena: std.mem.Allocator, +prune: bool = false, + +pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void { + var visitor = JsonVisitor{ .jw = jw, .tree = self }; + var xpath_buffer: std.ArrayList(u8) = .empty; + const listener_targets = interactive.buildListenerTargetMap(self.page, self.arena) catch |err| { + log.err(.app, "listener map failed", .{ .err = err }); + return error.WriteFailed; + }; + self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1, listener_targets) catch |err| { + log.err(.app, "semantic tree json dump failed", .{ .err = err }); + return error.WriteFailed; + }; +} + +pub fn textStringify(self: @This(), writer: *std.Io.Writer) error{WriteFailed}!void { + var visitor = TextVisitor{ .writer = writer, .tree = self, .depth = 0 }; + var xpath_buffer: std.ArrayList(u8) = .empty; + const listener_targets = interactive.buildListenerTargetMap(self.page, self.arena) catch |err| { + log.err(.app, "listener map failed", .{ .err = err }); + return error.WriteFailed; + }; + self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1, listener_targets) catch |err| { + log.err(.app, "semantic tree text dump failed", .{ .err = err }); + return error.WriteFailed; + }; +} + +const OptionData = struct { + value: []const u8, + text: []const u8, + selected: bool, +}; + +const NodeData = struct { + id: u32, + axn: AXNode, + role: []const u8, + name: ?[]const u8, +
value: ?[]const u8, + options: ?[]OptionData = null, + xpath: []const u8, + is_interactive: bool, + node_name: []const u8, +}; + +fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_name: ?[]const u8, visitor: anytype, index: usize, listener_targets: interactive.ListenerTargetMap) !void { + // 1. Skip non-content nodes + if (node.is(Element)) |el| { + const tag = el.getTag(); + if (tag.isMetadata() or tag == .svg) return; + + // We handle options/optgroups natively inside their parents, skip them in the general walk + if (tag == .datalist or tag == .option or tag == .optgroup) return; + + // Check visibility using the engine's checkVisibility which handles CSS display: none + if (!el.checkVisibility(self.page)) { + return; + } + + if (el.is(Element.Html)) |html_el| { + if (html_el.getHidden()) return; + } + } else if (node.is(CData.Text)) |text_node| { + const text = text_node.getWholeText(); + if (isAllWhitespace(text)) { + return; + } + } else if (node._type != .document and node._type != .document_fragment) { + return; + } + + const cdp_node = try self.registry.register(node); + const axn = AXNode.fromNode(node); + const role = try axn.getRole(); + + var is_interactive = false; + var value: ?[]const u8 = null; + var options: ?[]OptionData = null; + var node_name: []const u8 = "text"; + + if (node.is(Element)) |el| { + node_name = el.getTagNameLower(); + + if (el.is(Element.Html.Input)) |input| { + value = input.getValue(); + if (el.getAttributeSafe(comptime lp.String.wrap("list"))) |list_id| { + options = try extractDataListOptions(list_id, self.page, self.arena); + } + } else if (el.is(Element.Html.TextArea)) |textarea| { + value = textarea.getValue(); + } else if (el.is(Element.Html.Select)) |select| { + value = select.getValue(self.page); + options = try extractSelectOptions(el.asNode(), self.page, self.arena); + } + + if (el.is(Element.Html)) |html_el| { + if (interactive.classifyInteractivity(el, html_el, listener_targets) != null) 
{ + is_interactive = true; + } + } + } else if (node._type == .document or node._type == .document_fragment) { + node_name = "root"; + } + + const initial_xpath_len = xpath_buffer.items.len; + try appendXPathSegment(node, xpath_buffer.writer(self.arena), index); + const xpath = xpath_buffer.items; + + var name = try axn.getName(self.page, self.arena); + + const has_explicit_label = if (node.is(Element)) |el| + el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null + else + false; + + const structural = isStructuralRole(role); + + // Filter out computed concatenated names for generic containers without explicit labels. + // This prevents token bloat and ensures their StaticText children aren't incorrectly pruned. + // We ignore interactivity because a generic wrapper with an event listener still shouldn't hoist all text. + if (name != null and structural and !has_explicit_label) { + name = null; + } + + var data = NodeData{ + .id = cdp_node.id, + .axn = axn, + .role = role, + .name = name, + .value = value, + .options = options, + .xpath = xpath, + .is_interactive = is_interactive, + .node_name = node_name, + }; + + var should_visit = true; + if (self.prune) { + if (structural and !is_interactive and !has_explicit_label) { + should_visit = false; + } + + if (std.mem.eql(u8, role, "StaticText") and node._parent != null) { + if (parent_name != null and name != null and std.mem.indexOf(u8, parent_name.?, name.?) 
!= null) { + should_visit = false; + } + } + } + + var did_visit = false; + var should_walk_children = true; + if (should_visit) { + should_walk_children = try visitor.visit(node, &data); + did_visit = true; // Always true if should_visit was true, because visit() executed and opened structures + } else { + // If we skip the node, we must NOT tell the visitor to close it later + did_visit = false; + } + + if (should_walk_children) { + // If we are printing this node normally OR skipping it and unrolling its children, + // we walk the children iterator. + var it = node.childrenIterator(); + var tag_counts = std.StringArrayHashMap(usize).init(self.arena); + while (it.next()) |child| { + var tag: []const u8 = "text()"; + if (child.is(Element)) |el| { + tag = el.getTagNameLower(); + } + + const gop = try tag_counts.getOrPut(tag); + if (!gop.found_existing) { + gop.value_ptr.* = 0; + } + gop.value_ptr.* += 1; + + try self.walk(child, xpath_buffer, name, visitor, gop.value_ptr.*, listener_targets); + } + } + + if (did_visit) { + try visitor.leave(); + } + + xpath_buffer.shrinkRetainingCapacity(initial_xpath_len); +} + +fn extractSelectOptions(node: *Node, page: *Page, arena: std.mem.Allocator) ![]OptionData { + var options = std.ArrayListUnmanaged(OptionData){}; + var it = node.childrenIterator(); + while (it.next()) |child| { + if (child.is(Element)) |el| { + if (el.getTag() == .option) { + if (el.is(Element.Html.Option)) |opt| { + const text = opt.getText(); + const value = opt.getValue(page); + const selected = opt.getSelected(); + try options.append(arena, .{ .text = text, .value = value, .selected = selected }); + } + } else if (el.getTag() == .optgroup) { + var group_it = child.childrenIterator(); + while (group_it.next()) |group_child| { + if (group_child.is(Element.Html.Option)) |opt| { + const text = opt.getText(); + const value = opt.getValue(page); + const selected = opt.getSelected(); + try options.append(arena, .{ .text = text, .value = value, .selected = 
selected }); + } + } + } + } + } + return options.toOwnedSlice(arena); +} + +fn extractDataListOptions(list_id: []const u8, page: *Page, arena: std.mem.Allocator) !?[]OptionData { + if (page.document.getElementById(list_id, page)) |referenced_el| { + if (referenced_el.getTag() == .datalist) { + return try extractSelectOptions(referenced_el.asNode(), page, arena); + } + } + return null; +} + +fn appendXPathSegment(node: *Node, writer: anytype, index: usize) !void { + if (node.is(Element)) |el| { + const tag = el.getTagNameLower(); + try std.fmt.format(writer, "/{s}[{d}]", .{ tag, index }); + } else if (node.is(CData.Text)) |_| { + try std.fmt.format(writer, "/text()[{d}]", .{index}); + } +} + +const JsonVisitor = struct { + jw: *std.json.Stringify, + tree: Self, + + pub fn visit(self: *JsonVisitor, node: *Node, data: *NodeData) !bool { + try self.jw.beginObject(); + + try self.jw.objectField("nodeId"); + try self.jw.write(try std.fmt.allocPrint(self.tree.arena, "{d}", .{data.id})); + + try self.jw.objectField("backendDOMNodeId"); + try self.jw.write(data.id); + + try self.jw.objectField("nodeName"); + try self.jw.write(data.node_name); + + try self.jw.objectField("xpath"); + try self.jw.write(data.xpath); + + if (node.is(Element)) |el| { + try self.jw.objectField("nodeType"); + try self.jw.write(1); + + try self.jw.objectField("isInteractive"); + try self.jw.write(data.is_interactive); + + try self.jw.objectField("role"); + try self.jw.write(data.role); + + if (data.name) |name| { + if (name.len > 0) { + try self.jw.objectField("name"); + try self.jw.write(name); + } + } + + if (data.value) |value| { + try self.jw.objectField("value"); + try self.jw.write(value); + } + + if (el._attributes) |attrs| { + try self.jw.objectField("attributes"); + try self.jw.beginObject(); + var iter = attrs.iterator(); + while (iter.next()) |attr| { + try self.jw.objectField(attr._name.str()); + try self.jw.write(attr._value.str()); + } + try self.jw.endObject(); + } + + if 
(data.options) |options| { + try self.jw.objectField("options"); + try self.jw.beginArray(); + for (options) |opt| { + try self.jw.beginObject(); + try self.jw.objectField("value"); + try self.jw.write(opt.value); + try self.jw.objectField("text"); + try self.jw.write(opt.text); + try self.jw.objectField("selected"); + try self.jw.write(opt.selected); + try self.jw.endObject(); + } + try self.jw.endArray(); + } + } else if (node.is(CData.Text)) |text_node| { + try self.jw.objectField("nodeType"); + try self.jw.write(3); + try self.jw.objectField("nodeValue"); + try self.jw.write(text_node.getWholeText()); + } else { + try self.jw.objectField("nodeType"); + try self.jw.write(@as(u8, if (node._type == .document_fragment) 11 else 9)); + } + + try self.jw.objectField("children"); + try self.jw.beginArray(); + + if (data.options != null) { + // Signal to not walk children, as we handled them natively + return false; + } + + return true; + } + + pub fn leave(self: *JsonVisitor) !void { + try self.jw.endArray(); + try self.jw.endObject(); + } +}; + +fn isStructuralRole(role: []const u8) bool { + const structural_roles = std.StaticStringMap(void).initComptime(.{ + .{ "none", {} }, + .{ "generic", {} }, + .{ "InlineTextBox", {} }, + .{ "banner", {} }, + .{ "navigation", {} }, + .{ "main", {} }, + .{ "list", {} }, + .{ "listitem", {} }, + .{ "table", {} }, + .{ "rowgroup", {} }, + .{ "row", {} }, + .{ "cell", {} }, + .{ "region", {} }, + }); + return structural_roles.has(role); +} + +const TextVisitor = struct { + writer: *std.Io.Writer, + tree: Self, + depth: usize, + + pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool { + // Format: " [12] link: Hacker News (value)" + for (0..(self.depth * 2)) |_| { + try self.writer.writeByte(' '); + } + try self.writer.print("[{d}] {s}: ", .{ data.id, data.role }); + + if (data.name) |n| { + if (n.len > 0) { + try self.writer.writeAll(n); + } + } else if (node.is(CData.Text)) |text_node| { + const trimmed = std.mem.trim(u8, text_node.getWholeText(), " \t\r\n"); + if
(trimmed.len > 0) { + try self.writer.writeAll(trimmed); + } + } + + if (data.value) |v| { + if (v.len > 0) { + try self.writer.print(" (value: {s})", .{v}); + } + } + + if (data.options) |options| { + try self.writer.writeAll(" options: ["); + for (options, 0..) |opt, i| { + if (i > 0) try self.writer.writeAll(", "); + try self.writer.print("'{s}'", .{opt.value}); + if (opt.selected) { + try self.writer.writeAll(" (selected)"); + } + } + try self.writer.writeAll("]\n"); + self.depth += 1; + return false; // Native handling complete, do not walk children + } + + try self.writer.writeByte('\n'); + self.depth += 1; + + // If this is a leaf-like semantic node and we already have a name, + // skip children to avoid redundant StaticText or noise. + const is_leaf_semantic = std.mem.eql(u8, data.role, "link") or + std.mem.eql(u8, data.role, "button") or + std.mem.eql(u8, data.role, "heading") or + std.mem.eql(u8, data.role, "code"); + if (is_leaf_semantic and data.name != null and data.name.?.len > 0) { + return false; + } + + return true; + } + + pub fn leave(self: *TextVisitor) !void { + if (self.depth > 0) { + self.depth -= 1; + } + } +}; diff --git a/src/browser/interactive.zig b/src/browser/interactive.zig index b0428a6c..2d03db51 100644 --- a/src/browser/interactive.zig +++ b/src/browser/interactive.zig @@ -157,7 +157,7 @@ pub fn collectInteractiveElements( .node = node, .tag_name = el.getTagNameLower(), .role = getRole(el), - .name = getAccessibleName(el), + .name = try getAccessibleName(el, arena), .interactivity_type = itype, .listener_types = listener_types, .disabled = isDisabled(el), @@ -178,12 +178,12 @@ pub fn collectInteractiveElements( return results.items; } -const ListenerTargetMap = std.AutoHashMapUnmanaged(usize, std.ArrayList([]const u8)); +pub const ListenerTargetMap = std.AutoHashMapUnmanaged(usize, std.ArrayList([]const u8)); /// Pre-build a map from event_target pointer → list of event type names. 
/// This lets both classifyInteractivity (O(1) "has any?") and /// getListenerTypes (O(1) "which ones?") avoid re-iterating per element. -fn buildListenerTargetMap(page: *Page, arena: Allocator) !ListenerTargetMap { +pub fn buildListenerTargetMap(page: *Page, arena: Allocator) !ListenerTargetMap { var map = ListenerTargetMap{}; // addEventListener registrations @@ -209,7 +209,7 @@ fn buildListenerTargetMap(page: *Page, arena: Allocator) !ListenerTargetMap { return map; } -fn classifyInteractivity( +pub fn classifyInteractivity( el: *Element, html_el: *Element.Html, listener_targets: ListenerTargetMap, @@ -296,7 +296,7 @@ fn getRole(el: *Element) ?[]const u8 { }; } -fn getAccessibleName(el: *Element) ?[]const u8 { +fn getAccessibleName(el: *Element, arena: Allocator) !?[]const u8 { // aria-label if (el.getAttributeSafe(comptime .wrap("aria-label"))) |v| { if (v.len > 0) return v; @@ -325,11 +325,15 @@ fn getAccessibleName(el: *Element) ?[]const u8 { } // Text content (first non-empty text node, trimmed) - return getTextContent(el.asNode()); + return try getTextContent(el.asNode(), arena); } -fn getTextContent(node: *Node) ?[]const u8 { - var tw = TreeWalker.FullExcludeSelf.init(node, .{}); +fn getTextContent(node: *Node, arena: Allocator) !?[]const u8 { + var tw: TreeWalker.FullExcludeSelf = .init(node, .{}); + + var arr: std.ArrayList(u8) = .empty; + var single_chunk: ?[]const u8 = null; + while (tw.next()) |child| { // Skip text inside script/style elements. 
if (child.is(Element)) |el| { @@ -344,13 +348,29 @@ fn getTextContent(node: *Node) ?[]const u8 { if (child.is(Node.CData)) |cdata| { if (cdata.is(Node.CData.Text)) |text| { const content = std.mem.trim(u8, text.getWholeText(), &std.ascii.whitespace); - if (content.len > 0) return content; + if (content.len > 0) { + if (single_chunk == null and arr.items.len == 0) { + single_chunk = content; + } else { + if (single_chunk) |sc| { + try arr.appendSlice(arena, sc); + try arr.append(arena, ' '); + single_chunk = null; + } + try arr.appendSlice(arena, content); + try arr.append(arena, ' '); + } + } } } } - return null; -} + if (single_chunk) |sc| return sc; + if (arr.items.len == 0) return null; + + // strip out trailing space + return arr.items[0 .. arr.items.len - 1]; +} fn isDisabled(el: *Element) bool { if (el.getAttributeSafe(comptime .wrap("disabled")) != null) return true; return isDisabledByFieldset(el); diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig index f0ccd56e..8a4984a4 100644 --- a/src/browser/markdown.zig +++ b/src/browser/markdown.zig @@ -24,6 +24,7 @@ const TreeWalker = @import("webapi/TreeWalker.zig"); const CData = @import("webapi/CData.zig"); const Element = @import("webapi/Element.zig"); const Node = @import("webapi/Node.zig"); +const isAllWhitespace = @import("../string.zig").isAllWhitespace; pub const Opts = struct { // Options for future customization (e.g., dialect) @@ -46,13 +47,6 @@ const State = struct { last_char_was_newline: bool = true, }; -fn isBlock(tag: Element.Tag) bool { - return switch (tag) { - .p, .div, .section, .article, .main, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr => true, - else => false, - }; -} - fn shouldAddSpacing(tag: Element.Tag) bool { return switch (tag) { .p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table => true, @@ -99,26 +93,18 @@ fn isSignificantText(node: *Node) bool { } fn isVisibleElement(el: *Element) bool { - return 
switch (el.getTag()) { - .script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => false, - else => true, - }; + const tag = el.getTag(); + return !tag.isMetadata() and tag != .svg; } fn getAnchorLabel(el: *Element) ?[]const u8 { return el.getAttributeSafe(comptime .wrap("aria-label")) orelse el.getAttributeSafe(comptime .wrap("title")); } -fn isAllWhitespace(text: []const u8) bool { - return for (text) |c| { - if (!std.ascii.isWhitespace(c)) break false; - } else true; -} - fn hasBlockDescendant(root: *Node) bool { var tw = TreeWalker.FullExcludeSelf.Elements.init(root, .{}); while (tw.next()) |el| { - if (isBlock(el.getTag())) return true; + if (el.getTag().isBlock()) return true; } return false; } @@ -192,7 +178,7 @@ fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Pag // --- Opening Tag Logic --- // Ensure block elements start on a new line (double newline for paragraphs etc) - if (isBlock(tag) and !state.in_table) { + if (tag.isBlock() and !state.in_table) { try ensureNewline(state, writer); if (shouldAddSpacing(tag)) { try writer.writeByte('\n'); @@ -431,7 +417,7 @@ fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Pag } // Post-block newlines - if (isBlock(tag) and !state.in_table) { + if (tag.isBlock() and !state.in_table) { try ensureNewline(state, writer); } } diff --git a/src/browser/webapi/Element.zig b/src/browser/webapi/Element.zig index 7a6598a3..2b5db1f7 100644 --- a/src/browser/webapi/Element.zig +++ b/src/browser/webapi/Element.zig @@ -1580,6 +1580,36 @@ pub const Tag = enum { else => tag, }; } + + pub fn isBlock(self: Tag) bool { + // zig fmt: off + return switch (self) { + // Semantic Layout + .article, .aside, .footer, .header, .main, .nav, .section, + // Grouping / Containers + .address, .div, .fieldset, .figure, .p, + // Headings + .h1, .h2, .h3, .h4, .h5, .h6, + // Lists + .dl, .ol, .ul, + // Preformatted / Quotes + .blockquote, .pre, + // Tables + .table, + // Other + 
.hr, + => true, + else => false, + }; + // zig fmt: on + } + + pub fn isMetadata(self: Tag) bool { + return switch (self) { + .base, .head, .link, .meta, .noscript, .script, .style, .template, .title => true, + else => false, + }; + } }; pub const JsApi = struct { diff --git a/src/cdp/AXNode.zig b/src/cdp/AXNode.zig index 487e79ad..718d0bab 100644 --- a/src/cdp/AXNode.zig +++ b/src/cdp/AXNode.zig @@ -557,13 +557,13 @@ pub const Writer = struct { pub const AXRole = enum(u8) { // zig fmt: off - none, article, banner, blockquote, button, caption, cell, checkbox, code, - columnheader, combobox, complementary, contentinfo, definition, deletion, - dialog, document, emphasis, figure, form, group, heading, image, insertion, - link, list, listbox, listitem, main, marquee, meter, navigation, option, + none, article, banner, blockquote, button, caption, cell, checkbox, code, color, + columnheader, combobox, complementary, contentinfo, date, definition, deletion, + dialog, document, emphasis, figure, file, form, group, heading, image, insertion, + link, list, listbox, listitem, main, marquee, menuitem, meter, month, navigation, option, paragraph, presentation, progressbar, radio, region, row, rowgroup, rowheader, searchbox, separator, slider, spinbutton, status, strong, - subscript, superscript, table, term, textbox, time, RootWebArea, LineBreak, + subscript, superscript, @"switch", table, term, textbox, time, RootWebArea, LineBreak, StaticText, // zig fmt: on @@ -620,9 +620,13 @@ pub const AXRole = enum(u8) { .number => .spinbutton, .search => .searchbox, .checkbox => .checkbox, + .color => .color, + .date => .date, + .file => .file, + .month => .month, + .@"datetime-local", .week, .time => .combobox, // zig fmt: off - .password, .@"datetime-local", .hidden, .month, .color, - .week, .time, .file, .date => .none, + .password, .hidden => .none, // zig fmt: on }; }, @@ -738,6 +742,44 @@ const AXSource = enum(u8) { value, // input value }; +pub fn getName(self: AXNode, page: 
*Page, allocator: std.mem.Allocator) !?[]const u8 { + var aw: std.Io.Writer.Allocating = .init(allocator); + defer aw.deinit(); + + // writeName expects a std.json.Stringify instance. + const TextCaptureWriter = struct { + aw: *std.Io.Writer.Allocating, + writer: *std.Io.Writer, + + pub fn write(w: @This(), val: anytype) !void { + const T = @TypeOf(val); + if (T == []const u8 or T == [:0]const u8 or T == *const [val.len]u8) { + try w.aw.writer.writeAll(val); + } else if (comptime std.meta.hasMethod(T, "format")) { + try std.fmt.format(w.aw.writer, "{s}", .{val}); + } else { + // Ignore unexpected types (e.g. booleans) to avoid garbage output + } + } + + // Mock JSON Stringifier lifecycle methods + pub fn beginWriteRaw(_: @This()) !void {} + pub fn endWriteRaw(_: @This()) void {} + }; + + const w: TextCaptureWriter = .{ .aw = &aw, .writer = &aw.writer }; + + const source = try self.writeName(w, page); + if (source != null) { + // Remove literal quotes inserted by writeString. + var raw_text = std.mem.trim(u8, aw.written(), "\""); + raw_text = std.mem.trim(u8, raw_text, &std.ascii.whitespace); + return try allocator.dupe(u8, raw_text); + } + + return null; +} + fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource { const node = axnode.dom; @@ -823,15 +865,17 @@ fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource { .object, .progress, .meter, .main, .nav, .aside, .header, .footer, .form, .section, .article, .ul, .ol, .dl, .menu, .thead, .tbody, .tfoot, .tr, .td, .div, .span, .p, .details, .li, - .style, .script, + .style, .script, .html, .body, // zig fmt: on => {}, else => { // write text content if exists. 
- var buf = std.Io.Writer.Allocating.init(page.call_arena); - try el.getInnerText(&buf.writer); - try writeString(buf.written(), w); - return .contents; + var buf: std.Io.Writer.Allocating = .init(page.call_arena); + try writeAccessibleNameFallback(node, &buf.writer, page); + if (buf.written().len > 0) { + try writeString(buf.written(), w); + return .contents; + } }, } @@ -855,6 +899,48 @@ fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource { }; } +fn writeAccessibleNameFallback(node: *DOMNode, writer: *std.Io.Writer, page: *Page) !void { + var it = node.childrenIterator(); + while (it.next()) |child| { + switch (child._type) { + .cdata => |cd| switch (cd._type) { + .text => |*text| { + const content = std.mem.trim(u8, text.getWholeText(), &std.ascii.whitespace); + if (content.len > 0) { + try writer.writeAll(content); + try writer.writeByte(' '); + } + }, + else => {}, + }, + .element => |el| { + if (el.getTag() == .img) { + if (el.getAttributeSafe(.wrap("alt"))) |alt| { + try writer.writeAll(alt); + try writer.writeByte(' '); + } + } else if (el.getTag() == .svg) { + // Try to find a <title> inside SVG + var sit = child.childrenIterator(); + while (sit.next()) |s_child| { + if (s_child.is(DOMNode.Element)) |s_el| { + if (std.mem.eql(u8, s_el.getTagNameLower(), "title")) { + try writeAccessibleNameFallback(s_child, writer, page); + try writer.writeByte(' '); + } + } + } + } else { + if (!el.getTag().isMetadata()) { + try writeAccessibleNameFallback(child, writer, page); + } + } + }, + else => {}, + } + } +} + fn isHidden(elt: *DOMNode.Element) bool { if (elt.getAttributeSafe(comptime .wrap("aria-hidden"))) |value| { if (std.mem.eql(u8, value, "true")) { @@ -987,7 +1073,7 @@ fn isIgnore(self: AXNode, page: *Page) bool { return false; } -fn getRole(self: AXNode) ![]const u8 { +pub fn getRole(self: AXNode) ![]const u8 { if (self.role_attr) |role_value| { // TODO the role can have multiple comma separated values.
return role_value; diff --git a/src/cdp/domains/lp.zig b/src/cdp/domains/lp.zig index 1e8e1168..2026b17d 100644 --- a/src/cdp/domains/lp.zig +++ b/src/cdp/domains/lp.zig @@ -18,25 +18,67 @@ const std = @import("std"); const lp = @import("lightpanda"); +const log = @import("../../log.zig"); const markdown = lp.markdown; +const SemanticTree = lp.SemanticTree; const interactive = lp.interactive; const structured_data = lp.structured_data; const Node = @import("../Node.zig"); +const DOMNode = @import("../../browser/webapi/Node.zig"); pub fn processMessage(cmd: anytype) !void { const action = std.meta.stringToEnum(enum { getMarkdown, + getSemanticTree, getInteractiveElements, getStructuredData, }, cmd.input.action) orelse return error.UnknownMethod; switch (action) { .getMarkdown => return getMarkdown(cmd), + .getSemanticTree => return getSemanticTree(cmd), .getInteractiveElements => return getInteractiveElements(cmd), .getStructuredData => return getStructuredData(cmd), } } +fn getSemanticTree(cmd: anytype) !void { + const Params = struct { + format: ?enum { text } = null, + prune: ?bool = null, + }; + const params = (try cmd.params(Params)) orelse Params{}; + + const bc = cmd.browser_context orelse return error.NoBrowserContext; + const page = bc.session.currentPage() orelse return error.PageNotLoaded; + const dom_node = page.document.asNode(); + + var st = SemanticTree{ + .dom_node = dom_node, + .registry = &bc.node_registry, + .page = page, + .arena = cmd.arena, + .prune = params.prune orelse false, + }; + + if (params.format) |format| { + if (format == .text) { + st.prune = params.prune orelse true; + var aw: std.Io.Writer.Allocating = .init(cmd.arena); + defer aw.deinit(); + try st.textStringify(&aw.writer); + + return cmd.sendResult(.{ + .semanticTree = aw.written(), + }, .{}); + } + } + + return cmd.sendResult(.{ + .semanticTree = st, + }, .{}); +} + fn getMarkdown(cmd: anytype) !void { const Params = struct { nodeId: ?Node.Id = null, @@ -51,7 +93,7 @@ fn 
getMarkdown(cmd: anytype) !void { else page.document.asNode(); - var aw = std.Io.Writer.Allocating.init(cmd.arena); + var aw: std.Io.Writer.Allocating = .init(cmd.arena); defer aw.deinit(); try markdown.dump(dom_node, .{}, &aw.writer, page); diff --git a/src/lightpanda.zig b/src/lightpanda.zig index 0b72e5ed..4fac3921 100644 --- a/src/lightpanda.zig +++ b/src/lightpanda.zig @@ -22,6 +22,7 @@ pub const Network = @import("network/Runtime.zig"); pub const Server = @import("Server.zig"); pub const Config = @import("Config.zig"); pub const URL = @import("browser/URL.zig"); +pub const String = @import("string.zig").String; pub const Page = @import("browser/Page.zig"); pub const Browser = @import("browser/Browser.zig"); pub const Session = @import("browser/Session.zig"); @@ -31,6 +32,8 @@ pub const log = @import("log.zig"); pub const js = @import("browser/js/js.zig"); pub const dump = @import("browser/dump.zig"); pub const markdown = @import("browser/markdown.zig"); +pub const SemanticTree = @import("SemanticTree.zig"); +pub const CDPNode = @import("cdp/Node.zig"); pub const interactive = @import("browser/interactive.zig"); pub const structured_data = @import("browser/structured_data.zig"); pub const mcp = @import("mcp.zig"); @@ -110,6 +113,24 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void { switch (mode) { .html => try dump.root(page.window._document, opts.dump, writer, page), .markdown => try markdown.dump(page.window._document.asNode(), .{}, writer, page), + .semantic_tree, .semantic_tree_text => { + var registry = CDPNode.Registry.init(app.allocator); + defer registry.deinit(); + + const st: SemanticTree = .{ + .dom_node = page.window._document.asNode(), + .registry = &registry, + .page = page, + .arena = page.call_arena, + .prune = (mode == .semantic_tree_text), + }; + + if (mode == .semantic_tree) { + try std.json.Stringify.value(st, .{}, writer); + } else { + try st.textStringify(writer); + } + }, .wpt => try dumpWPT(page, writer), } } diff --git
a/src/mcp/Server.zig b/src/mcp/Server.zig index caed9eef..6f8b1f21 100644 --- a/src/mcp/Server.zig +++ b/src/mcp/Server.zig @@ -7,6 +7,7 @@ const HttpClient = @import("../browser/HttpClient.zig"); const testing = @import("../testing.zig"); const protocol = @import("protocol.zig"); const router = @import("router.zig"); +const CDPNode = @import("../cdp/Node.zig"); const Self = @This(); @@ -17,6 +18,7 @@ http_client: *HttpClient, notification: *lp.Notification, browser: lp.Browser, session: *lp.Session, +node_registry: CDPNode.Registry, writer: *std.io.Writer, mutex: std.Thread.Mutex = .{}, @@ -44,12 +46,15 @@ pub fn init(allocator: std.mem.Allocator, app: *App, writer: *std.io.Writer) !*S .http_client = http_client, .notification = notification, .session = undefined, + .node_registry = CDPNode.Registry.init(allocator), }; + self.session = try self.browser.newSession(self.notification); return self; } pub fn deinit(self: *Self) void { + self.node_registry.deinit(); self.aw.deinit(); self.browser.deinit(); self.notification.deinit(); @@ -82,7 +87,7 @@ pub fn sendResult(self: *Self, id: std.json.Value, result: anytype) !void { } pub fn sendError(self: *Self, id: std.json.Value, code: protocol.ErrorCode, message: []const u8) !void { - try self.sendResponse(protocol.Response{ + try self.sendResponse(.{ .id = id, .@"error" = protocol.Error{ .code = @intFromEnum(code), diff --git a/src/mcp/protocol.zig b/src/mcp/protocol.zig index 97035c0f..1c488535 100644 --- a/src/mcp/protocol.zig +++ b/src/mcp/protocol.zig @@ -114,6 +114,7 @@ pub const Tool = struct { }; pub fn minify(comptime json: []const u8) []const u8 { + @setEvalBranchQuota(100000); return comptime blk: { var res: []const u8 = ""; var in_string = false; diff --git a/src/mcp/tools.zig b/src/mcp/tools.zig index e7ad5668..f5126be0 100644 --- a/src/mcp/tools.zig +++ b/src/mcp/tools.zig @@ -8,6 +8,7 @@ const Element = @import("../browser/webapi/Element.zig"); const Selector = 
@import("../browser/webapi/selector/Selector.zig");
 const protocol = @import("protocol.zig");
 const Server = @import("Server.zig");
+const CDPNode = @import("../cdp/Node.zig"); // provides the Registry type referenced by ToolStreamingText
 
 pub const tool_list = [_]protocol.Tool{
     .{
@@ -61,6 +62,18 @@ pub const tool_list = [_]protocol.Tool{
             \\}
         ),
     },
+    .{ // tool entry advertised to MCP clients under the name "semantic_tree"
+        .name = "semantic_tree",
+        .description = "Get the page content as a simplified semantic DOM tree for AI reasoning. If a url is provided, it navigates to that url first.",
+        .inputSchema = protocol.minify(
+            \\{
+            \\ "type": "object",
+            \\ "properties": {
+            \\ "url": { "type": "string", "description": "Optional URL to navigate to before fetching the semantic tree." }
+            \\ }
+            \\}
+        ),
+    },
     .{
         .name = "interactiveElements",
         .description = "Extract interactive elements from the opened page. If a url is provided, it navigates to that url first.",
@@ -103,13 +116,16 @@ const EvaluateParams = struct {
 
 const ToolStreamingText = struct {
     page: *lp.Page,
-    action: enum { markdown, links },
+    action: enum { markdown, links, semantic_tree }, // selects which dump jsonStringify streams
+    registry: ?*CDPNode.Registry = null, // force-unwrapped by the semantic_tree action
+    arena: ?std.mem.Allocator = null, // force-unwrapped by the semantic_tree action
 
     pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) !void {
         try jw.beginWriteRaw();
         try jw.writer.writeByte('"');
-        var escaped = protocol.JsonEscapingWriter.init(jw.writer);
+        var escaped: protocol.JsonEscapingWriter = .init(jw.writer); // same init call, type stated on the declaration
         const w = &escaped.writer;
+
         switch (self.action) {
             .markdown => lp.markdown.dump(self.page.document.asNode(), .{}, w, self.page) catch |err| {
                 log.err(.mcp, "markdown dump failed", .{ .err = err });
@@ -137,7 +153,21 @@ const ToolStreamingText = struct {
                     log.err(.mcp, "query links failed", .{ .err = err });
                 }
             },
+            .semantic_tree => { // text form of the semantic tree, written through the escaping writer
+                const st = lp.SemanticTree{
+                    .dom_node = self.page.document.asNode(), // the walk starts at the document node
+                    .registry = self.registry.?, // NOTE(review): panics in safe builds if unset; handleSemanticTree sets it
+                    .page = self.page,
+                    .arena = self.arena.?, // NOTE(review): same force-unwrap caveat as registry
+                    .prune = true,
+                };
+
+                st.textStringify(w) catch |err| { // failure is only logged; the closing quote below is still written
+                    log.err(.mcp, "semantic tree dump failed", .{ .err = err });
+                };
+            },
         }
+
         try jw.writer.writeByte('"');
         jw.endWriteRaw();
     }
@@ -151,6 +181,7 @@ const ToolAction = enum {
     interactiveElements,
     structuredData,
     evaluate,
+    semantic_tree, // dispatched to handleSemanticTree by handleCall
 };
 
 const tool_map = std.StaticStringMap(ToolAction).initComptime(.{
@@ -161,6 +192,7 @@ const tool_map = std.StaticStringMap(ToolAction).initComptime(.{
     .{ "interactiveElements", .interactiveElements },
     .{ "structuredData", .structuredData },
     .{ "evaluate", .evaluate },
+    .{ "semantic_tree", .semantic_tree }, // key matches the .name in tool_list
 });
 
 pub fn handleCall(server: *Server, arena: std.mem.Allocator, req: protocol.Request) !void {
@@ -188,6 +220,7 @@ pub fn handleCall(server: *Server, arena: std.mem.Allocator, req: protocol.Reque
         .interactiveElements => try handleInteractiveElements(server, arena, req.id.?, call_params.arguments),
         .structuredData => try handleStructuredData(server, arena, req.id.?, call_params.arguments),
         .evaluate => try handleEvaluate(server, arena, req.id.?, call_params.arguments),
+        .semantic_tree => try handleSemanticTree(server, arena, req.id.?, call_params.arguments),
     }
 }
 
@@ -241,6 +274,27 @@ fn handleLinks(server: *Server, arena: std.mem.Allocator, id: std.json.Value, ar
     try server.sendResult(id, protocol.CallToolResult(ToolStreamingText){ .content = &content });
 }
 
+fn handleSemanticTree(server: *Server, arena: std.mem.Allocator, id: std.json.Value, arguments: ?std.json.Value) !void { // MCP handler: optional goto, then send the semantic tree as text content
+    const TreeParams = struct {
+        url: ?[:0]const u8 = null, // optional; when present performGoto runs before the dump
+    };
+    if (arguments) |args_raw| {
+        if (std.json.parseFromValueLeaky(TreeParams, arena, args_raw, .{ .ignore_unknown_fields = true })) |args| {
+            if (args.url) |u| {
+                try performGoto(server, u, id);
+            }
+        } else |_| {} // argument parse errors are ignored; the current page is used as-is
+    }
+    const page = server.session.currentPage() orelse {
+        return server.sendError(id, .PageNotLoaded, "Page not loaded"); // no current page: report the error to the client
+    };
+
+    const content = [_]protocol.TextContent(ToolStreamingText){.{
+        .text = .{ .page = page, .action = .semantic_tree, .registry = &server.node_registry, .arena = arena }, // registry is the server's; arena is the arena parameter
+    }};
+    try server.sendResult(id, protocol.CallToolResult(ToolStreamingText){ .content = &content });
+}
+
 fn handleInteractiveElements(server: *Server, arena: std.mem.Allocator, id: std.json.Value, arguments: ?std.json.Value) !void {
     const Params = struct {
         url: ?[:0]const u8 = null,
diff --git a/src/string.zig b/src/string.zig
index 8cb15c8f..d00ec33b 100644
--- a/src/string.zig
+++ b/src/string.zig
@@ -305,6 +305,12 @@ pub const String = packed struct {
     }
 };
 
+pub fn isAllWhitespace(text: []const u8) bool { // true when every byte is ASCII whitespace; also true for empty input
+    return for (text) |c| {
+        if (!std.ascii.isWhitespace(c)) break false; // the first non-whitespace byte ends the scan
+    } else true; // the loop completed without a break
+}
+
 // Discriminatory type that signals the bridge to use arena instead of call_arena
 // Use this for strings that need to persist beyond the current call
 // The caller can unwrap and store just the underlying .str field