Merge pull request #1776 from lightpanda-io/semantic-tree
Some checks failed
e2e-test / zig build release (push) Has been cancelled
e2e-test / demo-scripts (push) Has been cancelled
e2e-test / cdp-and-hyperfine-bench (push) Has been cancelled
e2e-test / perf-fmt (push) Has been cancelled
e2e-test / browser fetch (push) Has been cancelled
zig-test / zig test using v8 in debug mode (push) Has been cancelled
zig-test / zig test (push) Has been cancelled
zig-test / perf-fmt (push) Has been cancelled

Add native Semantic Tree extraction engine for AI agents
This commit is contained in:
Adrià Arrufat
2026-03-11 21:01:04 +09:00
committed by GitHub
12 changed files with 752 additions and 49 deletions

View File

@@ -200,6 +200,8 @@ pub const DumpFormat = enum {
html, html,
markdown, markdown,
wpt, wpt,
semantic_tree,
semantic_tree_text,
}; };
pub const Fetch = struct { pub const Fetch = struct {
@@ -346,7 +348,7 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
\\ \\
\\Options: \\Options:
\\--dump Dumps document to stdout. \\--dump Dumps document to stdout.
\\ Argument must be 'html' or 'markdown'. \\ Argument must be 'html', 'markdown', 'semantic_tree', or 'semantic_tree_text'.
\\ Defaults to no dump. \\ Defaults to no dump.
\\ \\
\\--strip_mode Comma separated list of tag groups to remove from dump \\--strip_mode Comma separated list of tag groups to remove from dump

450
src/SemanticTree.zig Normal file
View File

@@ -0,0 +1,450 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. See <https://www.gnu.org/licenses/>.
const std = @import("std");
const lp = @import("lightpanda");
const log = @import("log.zig");
const isAllWhitespace = @import("string.zig").isAllWhitespace;
const Page = lp.Page;
const interactive = @import("browser/interactive.zig");
const CData = @import("browser/webapi/CData.zig");
const Element = @import("browser/webapi/Element.zig");
const Node = @import("browser/webapi/Node.zig");
const AXNode = @import("cdp/AXNode.zig");
const CDPNode = @import("cdp/Node.zig");
const Self = @This();
/// Root of the subtree to serialize; `jsonStringify`/`textStringify` start the walk here.
dom_node: *Node,
/// Registry every visited node is registered with; the returned id is emitted for the node.
registry: *CDPNode.Registry,
/// Page the nodes belong to; used for visibility checks, accessible names and option values.
page: *Page,
/// Allocator for all scratch data built during a dump (xpath buffer, names, option lists).
/// Nothing allocated here is freed individually in this file.
arena: std.mem.Allocator,
/// When true, unlabeled non-interactive structural nodes and StaticText duplicated by
/// the parent's name are not emitted (their children are still walked).
prune: bool = false,
/// Writes the semantic tree rooted at `dom_node` to `jw` as nested JSON objects.
///
/// The signature matches the `jsonStringify` hook looked up by `std.json.Stringify`,
/// so a `SemanticTree` value can be handed directly to a JSON writer.
///
/// Any failure (building the listener map or walking the DOM) is logged and
/// reported as `error.WriteFailed`, the only error this hook may return.
pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void {
    var visitor = JsonVisitor{ .jw = jw, .tree = self };

    // Shared scratch buffer holding the xpath of the node currently being walked.
    // `.empty` is the canonical initializer and matches `textStringify`.
    var xpath_buffer: std.ArrayList(u8) = .empty;

    const listener_targets = interactive.buildListenerTargetMap(self.page, self.arena) catch |err| {
        log.err(.app, "listener map failed", .{ .err = err });
        return error.WriteFailed;
    };

    // The root has no parent name and is the first (only) sibling at its level.
    self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1, listener_targets) catch |err| {
        log.err(.app, "semantic tree json dump failed", .{ .err = err });
        return error.WriteFailed;
    };
}
/// Writes the semantic tree rooted at `dom_node` to `writer` as indented text,
/// one line per emitted node.
///
/// Any failure while building the listener map or walking the DOM is logged
/// and collapsed into `error.WriteFailed`.
pub fn textStringify(self: @This(), writer: *std.Io.Writer) error{WriteFailed}!void {
    const listener_targets = interactive.buildListenerTargetMap(self.page, self.arena) catch |err| {
        log.err(.app, "listener map failed", .{ .err = err });
        return error.WriteFailed;
    };

    // Scratch buffer for the xpath of the node currently being walked.
    var xpath_buffer: std.ArrayList(u8) = .empty;
    var visitor: TextVisitor = .{ .writer = writer, .tree = self, .depth = 0 };

    self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1, listener_targets) catch |err| {
        log.err(.app, "semantic tree text dump failed", .{ .err = err });
        return error.WriteFailed;
    };
}
/// One `<option>` collected from a `<select>` or a referenced `<datalist>`.
const OptionData = struct {
/// Result of the option's `getValue(page)`.
value: []const u8,
/// Result of the option's `getText()`.
text: []const u8,
/// Result of the option's `getSelected()`.
selected: bool,
};
/// Per-node facts computed by `walk` and handed to a visitor's `visit`.
const NodeData = struct {
/// Id of the node as returned by `registry.register`.
id: u32,
/// Accessibility wrapper built with `AXNode.fromNode`.
axn: AXNode,
/// Role string returned by `axn.getRole()`.
role: []const u8,
/// Accessible name; `walk` clears it for structural roles that have no
/// `aria-label`/`title` attribute.
name: ?[]const u8,
/// Current value for input, textarea and select elements; null otherwise.
value: ?[]const u8,
/// Options of a select, or of the datalist an input's `list` attribute refers to.
options: ?[]OptionData = null,
/// View into the shared xpath buffer, which `walk` truncates when it leaves
/// the node; copy it if it must be kept beyond `visit`.
xpath: []const u8,
/// True when `interactive.classifyInteractivity` returned a non-null result.
is_interactive: bool,
/// Lower-case tag name for elements, "root" for documents/fragments, "text" otherwise.
node_name: []const u8,
};
/// Depth-first traversal that reports every relevant node to `visitor`.
///
/// Parameters:
/// - `node`: node to process.
/// - `xpath_buffer`: shared buffer holding the xpath of the current ancestor
///   chain; this call appends its own segment and truncates it again on exit.
/// - `parent_name`: accessible name retained for the parent, used to prune
///   StaticText that merely repeats it.
/// - `visitor`: duck-typed; must provide `visit(node, *NodeData) !bool`
///   (the result says whether children should be walked) and `leave() !void`.
/// - `index`: 1-based position of `node` among siblings with the same xpath
///   node test.
/// - `listener_targets`: precomputed event-listener map used for the
///   interactivity classification.
///
/// Skipped entirely (node and subtree): metadata tags, svg, datalist/option/
/// optgroup, elements failing `checkVisibility` or marked hidden,
/// whitespace-only text, and any node that is neither element, text,
/// document nor document fragment.
///
/// With `prune` enabled a node may be left out of the output while its
/// children are still walked.
fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_name: ?[]const u8, visitor: anytype, index: usize, listener_targets: interactive.ListenerTargetMap) !void {
    // 1. Skip non-content nodes
    if (node.is(Element)) |el| {
        const tag = el.getTag();
        if (tag.isMetadata() or tag == .svg) return;

        // We handle options/optgroups natively inside their parents, skip them in the general walk
        if (tag == .datalist or tag == .option or tag == .optgroup) return;

        // Check visibility using the engine's checkVisibility which handles CSS display: none
        if (!el.checkVisibility(self.page)) {
            return;
        }

        if (el.is(Element.Html)) |html_el| {
            if (html_el.getHidden()) return;
        }
    } else if (node.is(CData.Text)) |text_node| {
        const text = text_node.getWholeText();
        if (isAllWhitespace(text)) {
            return;
        }
    } else if (node._type != .document and node._type != .document_fragment) {
        return;
    }

    // 2. Gather the facts the visitors need.
    const cdp_node = try self.registry.register(node);
    const axn = AXNode.fromNode(node);
    const role = try axn.getRole();

    var is_interactive = false;
    var value: ?[]const u8 = null;
    var options: ?[]OptionData = null;
    var node_name: []const u8 = "text";

    if (node.is(Element)) |el| {
        node_name = el.getTagNameLower();

        if (el.is(Element.Html.Input)) |input| {
            value = input.getValue();
            if (el.getAttributeSafe(comptime lp.String.wrap("list"))) |list_id| {
                options = try extractDataListOptions(list_id, self.page, self.arena);
            }
        } else if (el.is(Element.Html.TextArea)) |textarea| {
            value = textarea.getValue();
        } else if (el.is(Element.Html.Select)) |select| {
            value = select.getValue(self.page);
            options = try extractSelectOptions(el.asNode(), self.page, self.arena);
        }

        if (el.is(Element.Html)) |html_el| {
            if (interactive.classifyInteractivity(el, html_el, listener_targets) != null) {
                is_interactive = true;
            }
        }
    } else if (node._type == .document or node._type == .document_fragment) {
        node_name = "root";
    }

    // 3. Extend the shared xpath with this node's segment; undone at the end.
    const initial_xpath_len = xpath_buffer.items.len;
    try appendXPathSegment(node, xpath_buffer.writer(self.arena), index);
    const xpath = xpath_buffer.items;

    var name = try axn.getName(self.page, self.arena);

    const has_explicit_label = if (node.is(Element)) |el|
        el.getAttributeSafe(.wrap("aria-label")) != null or el.getAttributeSafe(.wrap("title")) != null
    else
        false;

    const structural = isStructuralRole(role);

    // Filter out computed concatenated names for generic containers without explicit labels.
    // This prevents token bloat and ensures their StaticText children aren't incorrectly pruned.
    // We ignore interactivity because a generic wrapper with an event listener still shouldn't hoist all text.
    if (name != null and structural and !has_explicit_label) {
        name = null;
    }

    var data = NodeData{
        .id = cdp_node.id,
        .axn = axn,
        .role = role,
        .name = name,
        .value = value,
        .options = options,
        .xpath = xpath,
        .is_interactive = is_interactive,
        .node_name = node_name,
    };

    // 4. Decide whether this node itself is emitted.
    var should_visit = true;
    if (self.prune) {
        if (structural and !is_interactive and !has_explicit_label) {
            should_visit = false;
        }

        if (std.mem.eql(u8, role, "StaticText") and node._parent != null) {
            if (parent_name != null and name != null and std.mem.indexOf(u8, parent_name.?, name.?) != null) {
                should_visit = false;
            }
        }
    }

    // A skipped node is "unrolled": it produces no output of its own but its
    // children are still walked, and the visitor must not be asked to close it.
    const should_walk_children = if (should_visit) try visitor.visit(node, &data) else true;

    if (should_walk_children) {
        var it = node.childrenIterator();
        // Per-parent counters giving each child its 1-based xpath position.
        var tag_counts = std.StringArrayHashMap(usize).init(self.arena);
        while (it.next()) |child| {
            const key: []const u8 = if (child.is(Element)) |el|
                el.getTagNameLower()
            else if (child.is(CData.Text) != null)
                "text()"
            else {
                // Comments and other non-element, non-text children are not
                // matched by `text()`, so they must not advance its counter.
                // They produce no xpath segment, hence the index is irrelevant.
                try self.walk(child, xpath_buffer, name, visitor, 1, listener_targets);
                continue;
            };

            const gop = try tag_counts.getOrPut(key);
            if (!gop.found_existing) {
                gop.value_ptr.* = 0;
            }
            gop.value_ptr.* += 1;

            try self.walk(child, xpath_buffer, name, visitor, gop.value_ptr.*, listener_targets);
        }
    }

    if (should_visit) {
        try visitor.leave();
    }

    xpath_buffer.shrinkRetainingCapacity(initial_xpath_len);
}
/// Collects the `<option>` elements found directly under `node` and directly
/// under any of its `<optgroup>` children, in document order.
///
/// The returned slice is allocated from `arena`.
fn extractSelectOptions(node: *Node, page: *Page, arena: std.mem.Allocator) ![]OptionData {
    var collected: std.ArrayListUnmanaged(OptionData) = .{};

    var children = node.childrenIterator();
    while (children.next()) |child| {
        const el = child.is(Element) orelse continue;

        switch (el.getTag()) {
            .option => {
                const opt = el.is(Element.Html.Option) orelse continue;
                const text = opt.getText();
                const value = opt.getValue(page);
                const selected = opt.getSelected();
                try collected.append(arena, .{ .text = text, .value = value, .selected = selected });
            },
            .optgroup => {
                // Only one level of grouping is inspected.
                var members = child.childrenIterator();
                while (members.next()) |member| {
                    const opt = member.is(Element.Html.Option) orelse continue;
                    const text = opt.getText();
                    const value = opt.getValue(page);
                    const selected = opt.getSelected();
                    try collected.append(arena, .{ .text = text, .value = value, .selected = selected });
                }
            },
            else => {},
        }
    }

    return collected.toOwnedSlice(arena);
}
/// Resolves `list_id` to an element of the page's document and, when that
/// element is a `<datalist>`, returns its options.
///
/// Returns null when no element has that id or the element is not a datalist.
fn extractDataListOptions(list_id: []const u8, page: *Page, arena: std.mem.Allocator) !?[]OptionData {
    const target = page.document.getElementById(list_id, page) orelse return null;
    if (target.getTag() != .datalist) return null;
    return try extractSelectOptions(target.asNode(), page, arena);
}
/// Appends the xpath step for `node` to `writer`: `/<tag>[index]` for an
/// element, `/text()[index]` for a text node. Any other node writes nothing.
fn appendXPathSegment(node: *Node, writer: anytype, index: usize) !void {
    if (node.is(Element)) |el| {
        return std.fmt.format(writer, "/{s}[{d}]", .{ el.getTagNameLower(), index });
    }
    if (node.is(CData.Text) != null) {
        return std.fmt.format(writer, "/text()[{d}]", .{index});
    }
}
/// Visitor that renders each walked node as a JSON object whose `children`
/// array holds the objects of the nodes walked beneath it.
const JsonVisitor = struct {
    jw: *std.json.Stringify,
    tree: Self,

    /// Opens the JSON object for `node`, writes its fields and opens the
    /// `children` array (closed later by `leave`).
    ///
    /// Returns false when the node carries an option list, because those
    /// children were already serialized under `options`.
    pub fn visit(self: *JsonVisitor, node: *Node, data: *NodeData) !bool {
        try self.jw.beginObject();

        // `nodeId` is emitted as a decimal string, `backendDOMNodeId` as a number.
        try self.jw.objectField("nodeId");
        try self.jw.write(try std.fmt.allocPrint(self.tree.arena, "{d}", .{data.id}));

        try self.jw.objectField("backendDOMNodeId");
        try self.jw.write(data.id);

        try self.jw.objectField("nodeName");
        try self.jw.write(data.node_name);

        try self.jw.objectField("xpath");
        try self.jw.write(data.xpath);

        if (node.is(Element)) |el| {
            try self.jw.objectField("nodeType");
            try self.jw.write(1);

            try self.jw.objectField("isInteractive");
            try self.jw.write(data.is_interactive);

            try self.jw.objectField("role");
            try self.jw.write(data.role);

            // Empty names are omitted rather than written as "".
            if (data.name) |name| {
                if (name.len > 0) {
                    try self.jw.objectField("name");
                    try self.jw.write(name);
                }
            }

            if (data.value) |value| {
                try self.jw.objectField("value");
                try self.jw.write(value);
            }

            if (el._attributes) |attrs| {
                try self.jw.objectField("attributes");
                try self.jw.beginObject();
                var iter = attrs.iterator();
                while (iter.next()) |attr| {
                    try self.jw.objectField(attr._name.str());
                    try self.jw.write(attr._value.str());
                }
                try self.jw.endObject();
            }

            if (data.options) |options| {
                try self.jw.objectField("options");
                try self.jw.beginArray();
                for (options) |opt| {
                    try self.jw.beginObject();
                    try self.jw.objectField("value");
                    try self.jw.write(opt.value);
                    try self.jw.objectField("text");
                    try self.jw.write(opt.text);
                    try self.jw.objectField("selected");
                    try self.jw.write(opt.selected);
                    try self.jw.endObject();
                }
                try self.jw.endArray();
            }
        } else if (node.is(CData.Text)) |text_node| {
            try self.jw.objectField("nodeType");
            try self.jw.write(3);
            try self.jw.objectField("nodeValue");
            try self.jw.write(text_node.getWholeText());
        } else {
            // Only documents and document fragments reach this branch.
            // DOM node type constants: DOCUMENT_NODE = 9, DOCUMENT_FRAGMENT_NODE = 11.
            const node_type: u8 = if (node._type == .document_fragment) 11 else 9;
            try self.jw.objectField("nodeType");
            try self.jw.write(node_type);
        }

        try self.jw.objectField("children");
        try self.jw.beginArray();

        if (data.options != null) {
            // Signal to not walk children, as we handled them natively
            return false;
        }

        return true;
    }

    /// Closes the `children` array and the object opened by the matching `visit`.
    pub fn leave(self: *JsonVisitor) !void {
        try self.jw.endArray();
        try self.jw.endObject();
    }
};
/// Reports whether `role` names a purely structural/container role.
/// The comparison is exact and case-sensitive.
fn isStructuralRole(role: []const u8) bool {
    const structural_roles = [_][]const u8{
        "none",
        "generic",
        "InlineTextBox",
        "banner",
        "navigation",
        "main",
        "list",
        "listitem",
        "table",
        "rowgroup",
        "row",
        "cell",
        "region",
    };

    for (structural_roles) |candidate| {
        if (std.mem.eql(u8, candidate, role)) return true;
    }
    return false;
}
/// Visitor that renders each walked node as one indented text line.
const TextVisitor = struct {
    writer: *std.Io.Writer,
    tree: Self,
    /// Current nesting level; every successful `visit` adds one, `leave` removes one.
    depth: usize,

    /// Writes the line for `node`, e.g. `  [12] link: Hacker News (value: ...)`.
    ///
    /// Returns false (children are not walked) when the node carries an
    /// option list, or when it is a link/button/heading/code with a
    /// non-empty name.
    pub fn visit(self: *TextVisitor, node: *Node, data: *NodeData) !bool {
        const out = self.writer;

        // Two spaces of indentation per nesting level.
        try out.splatByteAll(' ', self.depth * 2);
        try out.print("[{d}] {s}: ", .{ data.id, data.role });

        // The name wins; raw text is only used when there is no name at all.
        if (data.name) |label| {
            if (label.len > 0) try out.writeAll(label);
        } else if (node.is(CData.Text)) |text_node| {
            const trimmed = std.mem.trim(u8, text_node.getWholeText(), " \t\r\n");
            if (trimmed.len > 0) try out.writeAll(trimmed);
        }

        if (data.value) |v| {
            if (v.len > 0) try out.print(" (value: {s})", .{v});
        }

        if (data.options) |options| {
            try out.writeAll(" options: [");
            for (options, 0..) |opt, i| {
                if (i != 0) try out.writeAll(", ");
                try out.print("'{s}'", .{opt.value});
                if (opt.selected) try out.writeAll(" (selected)");
            }
            try out.writeAll("]\n");
        } else {
            try out.writeByte('\n');
        }

        self.depth += 1;

        // Options were rendered inline, so the children carry nothing new.
        if (data.options != null) return false;

        // Leaf-like semantic nodes that already have a name would only repeat
        // it through their StaticText children.
        const has_name = if (data.name) |label| label.len > 0 else false;
        if (has_name) {
            const leaf_roles = [_][]const u8{ "link", "button", "heading", "code" };
            for (leaf_roles) |leaf| {
                if (std.mem.eql(u8, data.role, leaf)) return false;
            }
        }

        return true;
    }

    /// Restores the indentation level of the enclosing node.
    pub fn leave(self: *TextVisitor) !void {
        if (self.depth == 0) return;
        self.depth -= 1;
    }
};

View File

@@ -157,7 +157,7 @@ pub fn collectInteractiveElements(
.node = node, .node = node,
.tag_name = el.getTagNameLower(), .tag_name = el.getTagNameLower(),
.role = getRole(el), .role = getRole(el),
.name = getAccessibleName(el), .name = try getAccessibleName(el, arena),
.interactivity_type = itype, .interactivity_type = itype,
.listener_types = listener_types, .listener_types = listener_types,
.disabled = isDisabled(el), .disabled = isDisabled(el),
@@ -178,12 +178,12 @@ pub fn collectInteractiveElements(
return results.items; return results.items;
} }
const ListenerTargetMap = std.AutoHashMapUnmanaged(usize, std.ArrayList([]const u8)); pub const ListenerTargetMap = std.AutoHashMapUnmanaged(usize, std.ArrayList([]const u8));
/// Pre-build a map from event_target pointer → list of event type names. /// Pre-build a map from event_target pointer → list of event type names.
/// This lets both classifyInteractivity (O(1) "has any?") and /// This lets both classifyInteractivity (O(1) "has any?") and
/// getListenerTypes (O(1) "which ones?") avoid re-iterating per element. /// getListenerTypes (O(1) "which ones?") avoid re-iterating per element.
fn buildListenerTargetMap(page: *Page, arena: Allocator) !ListenerTargetMap { pub fn buildListenerTargetMap(page: *Page, arena: Allocator) !ListenerTargetMap {
var map = ListenerTargetMap{}; var map = ListenerTargetMap{};
// addEventListener registrations // addEventListener registrations
@@ -209,7 +209,7 @@ fn buildListenerTargetMap(page: *Page, arena: Allocator) !ListenerTargetMap {
return map; return map;
} }
fn classifyInteractivity( pub fn classifyInteractivity(
el: *Element, el: *Element,
html_el: *Element.Html, html_el: *Element.Html,
listener_targets: ListenerTargetMap, listener_targets: ListenerTargetMap,
@@ -296,7 +296,7 @@ fn getRole(el: *Element) ?[]const u8 {
}; };
} }
fn getAccessibleName(el: *Element) ?[]const u8 { fn getAccessibleName(el: *Element, arena: Allocator) !?[]const u8 {
// aria-label // aria-label
if (el.getAttributeSafe(comptime .wrap("aria-label"))) |v| { if (el.getAttributeSafe(comptime .wrap("aria-label"))) |v| {
if (v.len > 0) return v; if (v.len > 0) return v;
@@ -325,11 +325,15 @@ fn getAccessibleName(el: *Element) ?[]const u8 {
} }
// Text content (first non-empty text node, trimmed) // Text content (first non-empty text node, trimmed)
return getTextContent(el.asNode()); return try getTextContent(el.asNode(), arena);
} }
fn getTextContent(node: *Node) ?[]const u8 { fn getTextContent(node: *Node, arena: Allocator) !?[]const u8 {
var tw = TreeWalker.FullExcludeSelf.init(node, .{}); var tw: TreeWalker.FullExcludeSelf = .init(node, .{});
var arr: std.ArrayList(u8) = .empty;
var single_chunk: ?[]const u8 = null;
while (tw.next()) |child| { while (tw.next()) |child| {
// Skip text inside script/style elements. // Skip text inside script/style elements.
if (child.is(Element)) |el| { if (child.is(Element)) |el| {
@@ -344,13 +348,29 @@ fn getTextContent(node: *Node) ?[]const u8 {
if (child.is(Node.CData)) |cdata| { if (child.is(Node.CData)) |cdata| {
if (cdata.is(Node.CData.Text)) |text| { if (cdata.is(Node.CData.Text)) |text| {
const content = std.mem.trim(u8, text.getWholeText(), &std.ascii.whitespace); const content = std.mem.trim(u8, text.getWholeText(), &std.ascii.whitespace);
if (content.len > 0) return content; if (content.len > 0) {
if (single_chunk == null and arr.items.len == 0) {
single_chunk = content;
} else {
if (single_chunk) |sc| {
try arr.appendSlice(arena, sc);
try arr.append(arena, ' ');
single_chunk = null;
}
try arr.appendSlice(arena, content);
try arr.append(arena, ' ');
}
} }
} }
} }
return null;
} }
if (single_chunk) |sc| return sc;
if (arr.items.len == 0) return null;
// strip out trailing space
return arr.items[0 .. arr.items.len - 1];
}
fn isDisabled(el: *Element) bool { fn isDisabled(el: *Element) bool {
if (el.getAttributeSafe(comptime .wrap("disabled")) != null) return true; if (el.getAttributeSafe(comptime .wrap("disabled")) != null) return true;
return isDisabledByFieldset(el); return isDisabledByFieldset(el);

View File

@@ -24,6 +24,7 @@ const TreeWalker = @import("webapi/TreeWalker.zig");
const CData = @import("webapi/CData.zig"); const CData = @import("webapi/CData.zig");
const Element = @import("webapi/Element.zig"); const Element = @import("webapi/Element.zig");
const Node = @import("webapi/Node.zig"); const Node = @import("webapi/Node.zig");
const isAllWhitespace = @import("../string.zig").isAllWhitespace;
pub const Opts = struct { pub const Opts = struct {
// Options for future customization (e.g., dialect) // Options for future customization (e.g., dialect)
@@ -46,13 +47,6 @@ const State = struct {
last_char_was_newline: bool = true, last_char_was_newline: bool = true,
}; };
fn isBlock(tag: Element.Tag) bool {
return switch (tag) {
.p, .div, .section, .article, .main, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr => true,
else => false,
};
}
fn shouldAddSpacing(tag: Element.Tag) bool { fn shouldAddSpacing(tag: Element.Tag) bool {
return switch (tag) { return switch (tag) {
.p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table => true, .p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table => true,
@@ -99,26 +93,18 @@ fn isSignificantText(node: *Node) bool {
} }
fn isVisibleElement(el: *Element) bool { fn isVisibleElement(el: *Element) bool {
return switch (el.getTag()) { const tag = el.getTag();
.script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => false, return !tag.isMetadata() and tag != .svg;
else => true,
};
} }
fn getAnchorLabel(el: *Element) ?[]const u8 { fn getAnchorLabel(el: *Element) ?[]const u8 {
return el.getAttributeSafe(comptime .wrap("aria-label")) orelse el.getAttributeSafe(comptime .wrap("title")); return el.getAttributeSafe(comptime .wrap("aria-label")) orelse el.getAttributeSafe(comptime .wrap("title"));
} }
fn isAllWhitespace(text: []const u8) bool {
return for (text) |c| {
if (!std.ascii.isWhitespace(c)) break false;
} else true;
}
fn hasBlockDescendant(root: *Node) bool { fn hasBlockDescendant(root: *Node) bool {
var tw = TreeWalker.FullExcludeSelf.Elements.init(root, .{}); var tw = TreeWalker.FullExcludeSelf.Elements.init(root, .{});
while (tw.next()) |el| { while (tw.next()) |el| {
if (isBlock(el.getTag())) return true; if (el.getTag().isBlock()) return true;
} }
return false; return false;
} }
@@ -192,7 +178,7 @@ fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Pag
// --- Opening Tag Logic --- // --- Opening Tag Logic ---
// Ensure block elements start on a new line (double newline for paragraphs etc) // Ensure block elements start on a new line (double newline for paragraphs etc)
if (isBlock(tag) and !state.in_table) { if (tag.isBlock() and !state.in_table) {
try ensureNewline(state, writer); try ensureNewline(state, writer);
if (shouldAddSpacing(tag)) { if (shouldAddSpacing(tag)) {
try writer.writeByte('\n'); try writer.writeByte('\n');
@@ -431,7 +417,7 @@ fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Pag
} }
// Post-block newlines // Post-block newlines
if (isBlock(tag) and !state.in_table) { if (tag.isBlock() and !state.in_table) {
try ensureNewline(state, writer); try ensureNewline(state, writer);
} }
} }

View File

@@ -1580,6 +1580,36 @@ pub const Tag = enum {
else => tag, else => tag,
}; };
} }
/// Reports whether the tag is one of the elements this project treats as
/// block-level (layout sections, containers, headings, lists, quotes,
/// preformatted text, tables and horizontal rules).
pub fn isBlock(self: Tag) bool {
    switch (self) {
        // Semantic layout
        .article,
        .aside,
        .footer,
        .header,
        .main,
        .nav,
        .section,
        // Grouping / containers
        .address,
        .div,
        .fieldset,
        .figure,
        .p,
        // Headings
        .h1,
        .h2,
        .h3,
        .h4,
        .h5,
        .h6,
        // Lists
        .dl,
        .ol,
        .ul,
        // Preformatted / quotes
        .blockquote,
        .pre,
        // Tables
        .table,
        // Other
        .hr,
        => return true,
        else => return false,
    }
}
/// Reports whether the tag is one of the non-content tags listed below
/// (document head, scripting, styling and template machinery).
pub fn isMetadata(self: Tag) bool {
    switch (self) {
        .head,
        .title,
        .base,
        .link,
        .meta,
        .style,
        .script,
        .noscript,
        .template,
        => return true,
        else => return false,
    }
}
}; };
pub const JsApi = struct { pub const JsApi = struct {

View File

@@ -557,13 +557,13 @@ pub const Writer = struct {
pub const AXRole = enum(u8) { pub const AXRole = enum(u8) {
// zig fmt: off // zig fmt: off
none, article, banner, blockquote, button, caption, cell, checkbox, code, none, article, banner, blockquote, button, caption, cell, checkbox, code, color,
columnheader, combobox, complementary, contentinfo, definition, deletion, columnheader, combobox, complementary, contentinfo, date, definition, deletion,
dialog, document, emphasis, figure, form, group, heading, image, insertion, dialog, document, emphasis, figure, file, form, group, heading, image, insertion,
link, list, listbox, listitem, main, marquee, meter, navigation, option, link, list, listbox, listitem, main, marquee, menuitem, meter, month, navigation, option,
paragraph, presentation, progressbar, radio, region, row, rowgroup, paragraph, presentation, progressbar, radio, region, row, rowgroup,
rowheader, searchbox, separator, slider, spinbutton, status, strong, rowheader, searchbox, separator, slider, spinbutton, status, strong,
subscript, superscript, table, term, textbox, time, RootWebArea, LineBreak, subscript, superscript, @"switch", table, term, textbox, time, RootWebArea, LineBreak,
StaticText, StaticText,
// zig fmt: on // zig fmt: on
@@ -620,9 +620,13 @@ pub const AXRole = enum(u8) {
.number => .spinbutton, .number => .spinbutton,
.search => .searchbox, .search => .searchbox,
.checkbox => .checkbox, .checkbox => .checkbox,
.color => .color,
.date => .date,
.file => .file,
.month => .month,
.@"datetime-local", .week, .time => .combobox,
// zig fmt: off // zig fmt: off
.password, .@"datetime-local", .hidden, .month, .color, .password, .hidden => .none,
.week, .time, .file, .date => .none,
// zig fmt: on // zig fmt: on
}; };
}, },
@@ -738,6 +742,44 @@ const AXSource = enum(u8) {
value, // input value value, // input value
}; };
/// Computes the accessible name of this node as plain text.
///
/// Runs `writeName` against an in-memory writer and returns a copy of the
/// captured text, allocated with `allocator`, with surrounding double quotes
/// and ASCII whitespace removed. Returns null when `writeName` reports no
/// name source.
pub fn getName(self: AXNode, page: *Page, allocator: std.mem.Allocator) !?[]const u8 {
// Scratch buffer for the captured text; released when this function returns,
// which is why the result is duplicated below.
var aw: std.Io.Writer.Allocating = .init(allocator);
defer aw.deinit();
// writeName expects a std.json.Stringify instance.
// This adapter offers the methods declared below and stores string
// payloads verbatim instead of JSON-encoding them.
const TextCaptureWriter = struct {
aw: *std.Io.Writer.Allocating,
// NOTE(review): this field is initialized below but never read in this block.
writer: *std.Io.Writer,
pub fn write(w: @This(), val: anytype) !void {
const T = @TypeOf(val);
// String-like values are appended as-is.
if (T == []const u8 or T == [:0]const u8 or T == *const [val.len]u8) {
try w.aw.writer.writeAll(val);
} else if (comptime std.meta.hasMethod(T, "format")) {
// NOTE(review): `w.aw.writer` is passed by value here — confirm this
// branch is exercised and behaves as intended.
try std.fmt.format(w.aw.writer, "{s}", .{val});
} else {
// Ignore unexpected types (e.g. booleans) to avoid garbage output
}
}
// Mock JSON Stringifier lifecycle methods
pub fn beginWriteRaw(_: @This()) !void {}
pub fn endWriteRaw(_: @This()) void {}
};
const w: TextCaptureWriter = .{ .aw = &aw, .writer = &aw.writer };
const source = try self.writeName(w, page);
if (source != null) {
// Remove literal quotes inserted by writeString.
// NOTE(review): this trim also strips double quotes that are part of the
// name itself at either end — confirm that is acceptable.
var raw_text = std.mem.trim(u8, aw.written(), "\"");
raw_text = std.mem.trim(u8, raw_text, &std.ascii.whitespace);
return try allocator.dupe(u8, raw_text);
}
return null;
}
fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource { fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource {
const node = axnode.dom; const node = axnode.dom;
@@ -823,15 +865,17 @@ fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource {
.object, .progress, .meter, .main, .nav, .aside, .header, .object, .progress, .meter, .main, .nav, .aside, .header,
.footer, .form, .section, .article, .ul, .ol, .dl, .menu, .footer, .form, .section, .article, .ul, .ol, .dl, .menu,
.thead, .tbody, .tfoot, .tr, .td, .div, .span, .p, .details, .li, .thead, .tbody, .tfoot, .tr, .td, .div, .span, .p, .details, .li,
.style, .script, .style, .script, .html, .body,
// zig fmt: on // zig fmt: on
=> {}, => {},
else => { else => {
// write text content if exists. // write text content if exists.
var buf = std.Io.Writer.Allocating.init(page.call_arena); var buf: std.Io.Writer.Allocating = .init(page.call_arena);
try el.getInnerText(&buf.writer); try writeAccessibleNameFallback(node, &buf.writer, page);
if (buf.written().len > 0) {
try writeString(buf.written(), w); try writeString(buf.written(), w);
return .contents; return .contents;
}
}, },
} }
@@ -855,6 +899,48 @@ fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource {
}; };
} }
/// Writes a text approximation of `node`'s content to `writer`, built from
/// its descendants in child order.
///
/// Each contribution is followed by a single space, so non-empty output ends
/// with a trailing space. `page` is only forwarded to the recursive calls.
fn writeAccessibleNameFallback(node: *DOMNode, writer: *std.Io.Writer, page: *Page) !void {
var it = node.childrenIterator();
while (it.next()) |child| {
switch (child._type) {
.cdata => |cd| switch (cd._type) {
.text => |*text| {
// Text nodes contribute their trimmed content; whitespace-only text is dropped.
const content = std.mem.trim(u8, text.getWholeText(), &std.ascii.whitespace);
if (content.len > 0) {
try writer.writeAll(content);
try writer.writeByte(' ');
}
},
// Other character-data children contribute nothing.
else => {},
},
.element => |el| {
if (el.getTag() == .img) {
// Images contribute their `alt` attribute when present (even if empty).
if (el.getAttributeSafe(.wrap("alt"))) |alt| {
try writer.writeAll(alt);
try writer.writeByte(' ');
}
} else if (el.getTag() == .svg) {
// Try to find a <title> inside SVG
// Only direct children of the svg element are inspected.
var sit = child.childrenIterator();
while (sit.next()) |s_child| {
if (s_child.is(DOMNode.Element)) |s_el| {
if (std.mem.eql(u8, s_el.getTagNameLower(), "title")) {
try writeAccessibleNameFallback(s_child, writer, page);
try writer.writeByte(' ');
}
}
}
} else {
// Any other element contributes its own descendants, unless its tag is metadata.
if (!el.getTag().isMetadata()) {
try writeAccessibleNameFallback(child, writer, page);
}
}
},
else => {},
}
}
}
fn isHidden(elt: *DOMNode.Element) bool { fn isHidden(elt: *DOMNode.Element) bool {
if (elt.getAttributeSafe(comptime .wrap("aria-hidden"))) |value| { if (elt.getAttributeSafe(comptime .wrap("aria-hidden"))) |value| {
if (std.mem.eql(u8, value, "true")) { if (std.mem.eql(u8, value, "true")) {
@@ -987,7 +1073,7 @@ fn isIgnore(self: AXNode, page: *Page) bool {
return false; return false;
} }
fn getRole(self: AXNode) ![]const u8 { pub fn getRole(self: AXNode) ![]const u8 {
if (self.role_attr) |role_value| { if (self.role_attr) |role_value| {
// TODO the role can have multiple comma separated values. // TODO the role can have multiple comma separated values.
return role_value; return role_value;

View File

@@ -18,25 +18,67 @@
const std = @import("std"); const std = @import("std");
const lp = @import("lightpanda"); const lp = @import("lightpanda");
const log = @import("../../log.zig");
const markdown = lp.markdown; const markdown = lp.markdown;
const SemanticTree = lp.SemanticTree;
const interactive = lp.interactive; const interactive = lp.interactive;
const structured_data = lp.structured_data; const structured_data = lp.structured_data;
const Node = @import("../Node.zig"); const Node = @import("../Node.zig");
const DOMNode = @import("../../browser/webapi/Node.zig");
pub fn processMessage(cmd: anytype) !void { pub fn processMessage(cmd: anytype) !void {
const action = std.meta.stringToEnum(enum { const action = std.meta.stringToEnum(enum {
getMarkdown, getMarkdown,
getSemanticTree,
getInteractiveElements, getInteractiveElements,
getStructuredData, getStructuredData,
}, cmd.input.action) orelse return error.UnknownMethod; }, cmd.input.action) orelse return error.UnknownMethod;
switch (action) { switch (action) {
.getMarkdown => return getMarkdown(cmd), .getMarkdown => return getMarkdown(cmd),
.getSemanticTree => return getSemanticTree(cmd),
.getInteractiveElements => return getInteractiveElements(cmd), .getInteractiveElements => return getInteractiveElements(cmd),
.getStructuredData => return getStructuredData(cmd), .getStructuredData => return getStructuredData(cmd),
} }
} }
/// Handles the `getSemanticTree` action: replies with the semantic tree of
/// the current page's document under the `semanticTree` result key.
///
/// Params:
/// - `format`: when `text`, the tree is rendered to a string; otherwise the
///   `SemanticTree` value itself is handed to `sendResult`.
/// - `prune`: explicit pruning switch; when absent it defaults to true for
///   the text format and false otherwise.
///
/// Fails with `error.NoBrowserContext` or `error.PageNotLoaded` when there is
/// no browser context or no current page.
fn getSemanticTree(cmd: anytype) !void {
    const Params = struct {
        format: ?enum { text } = null,
        prune: ?bool = null,
    };
    const params = (try cmd.params(Params)) orelse Params{};

    const bc = cmd.browser_context orelse return error.NoBrowserContext;
    const page = bc.session.currentPage() orelse return error.PageNotLoaded;

    const as_text = if (params.format) |format| format == .text else false;

    const st: SemanticTree = .{
        .dom_node = page.document.asNode(),
        .registry = &bc.node_registry,
        .page = page,
        .arena = cmd.arena,
        .prune = params.prune orelse as_text,
    };

    if (!as_text) {
        return cmd.sendResult(.{
            .semanticTree = st,
        }, .{});
    }

    var aw: std.Io.Writer.Allocating = .init(cmd.arena);
    defer aw.deinit();
    try st.textStringify(&aw.writer);

    return cmd.sendResult(.{
        .semanticTree = aw.written(),
    }, .{});
}
fn getMarkdown(cmd: anytype) !void { fn getMarkdown(cmd: anytype) !void {
const Params = struct { const Params = struct {
nodeId: ?Node.Id = null, nodeId: ?Node.Id = null,
@@ -51,7 +93,7 @@ fn getMarkdown(cmd: anytype) !void {
else else
page.document.asNode(); page.document.asNode();
var aw = std.Io.Writer.Allocating.init(cmd.arena); var aw: std.Io.Writer.Allocating = .init(cmd.arena);
defer aw.deinit(); defer aw.deinit();
try markdown.dump(dom_node, .{}, &aw.writer, page); try markdown.dump(dom_node, .{}, &aw.writer, page);

View File

@@ -22,6 +22,7 @@ pub const Network = @import("network/Runtime.zig");
pub const Server = @import("Server.zig"); pub const Server = @import("Server.zig");
pub const Config = @import("Config.zig"); pub const Config = @import("Config.zig");
pub const URL = @import("browser/URL.zig"); pub const URL = @import("browser/URL.zig");
pub const String = @import("string.zig").String;
pub const Page = @import("browser/Page.zig"); pub const Page = @import("browser/Page.zig");
pub const Browser = @import("browser/Browser.zig"); pub const Browser = @import("browser/Browser.zig");
pub const Session = @import("browser/Session.zig"); pub const Session = @import("browser/Session.zig");
@@ -31,6 +32,8 @@ pub const log = @import("log.zig");
pub const js = @import("browser/js/js.zig"); pub const js = @import("browser/js/js.zig");
pub const dump = @import("browser/dump.zig"); pub const dump = @import("browser/dump.zig");
pub const markdown = @import("browser/markdown.zig"); pub const markdown = @import("browser/markdown.zig");
pub const SemanticTree = @import("SemanticTree.zig");
pub const CDPNode = @import("cdp/Node.zig");
pub const interactive = @import("browser/interactive.zig"); pub const interactive = @import("browser/interactive.zig");
pub const structured_data = @import("browser/structured_data.zig"); pub const structured_data = @import("browser/structured_data.zig");
pub const mcp = @import("mcp.zig"); pub const mcp = @import("mcp.zig");
@@ -110,6 +113,24 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
switch (mode) { switch (mode) {
.html => try dump.root(page.window._document, opts.dump, writer, page), .html => try dump.root(page.window._document, opts.dump, writer, page),
.markdown => try markdown.dump(page.window._document.asNode(), .{}, writer, page), .markdown => try markdown.dump(page.window._document.asNode(), .{}, writer, page),
.semantic_tree, .semantic_tree_text => {
var registry = CDPNode.Registry.init(app.allocator);
defer registry.deinit();
const st: SemanticTree = .{
.dom_node = page.window._document.asNode(),
.registry = &registry,
.page = page,
.arena = page.call_arena,
.prune = (mode == .semantic_tree_text),
};
if (mode == .semantic_tree) {
try std.json.Stringify.value(st, .{}, writer);
} else {
try st.textStringify(writer);
}
},
.wpt => try dumpWPT(page, writer), .wpt => try dumpWPT(page, writer),
} }
} }

View File

@@ -7,6 +7,7 @@ const HttpClient = @import("../browser/HttpClient.zig");
const testing = @import("../testing.zig"); const testing = @import("../testing.zig");
const protocol = @import("protocol.zig"); const protocol = @import("protocol.zig");
const router = @import("router.zig"); const router = @import("router.zig");
const CDPNode = @import("../cdp/Node.zig");
const Self = @This(); const Self = @This();
@@ -17,6 +18,7 @@ http_client: *HttpClient,
notification: *lp.Notification, notification: *lp.Notification,
browser: lp.Browser, browser: lp.Browser,
session: *lp.Session, session: *lp.Session,
node_registry: CDPNode.Registry,
writer: *std.io.Writer, writer: *std.io.Writer,
mutex: std.Thread.Mutex = .{}, mutex: std.Thread.Mutex = .{},
@@ -44,12 +46,15 @@ pub fn init(allocator: std.mem.Allocator, app: *App, writer: *std.io.Writer) !*S
.http_client = http_client, .http_client = http_client,
.notification = notification, .notification = notification,
.session = undefined, .session = undefined,
.node_registry = CDPNode.Registry.init(allocator),
}; };
self.session = try self.browser.newSession(self.notification); self.session = try self.browser.newSession(self.notification);
return self; return self;
} }
pub fn deinit(self: *Self) void { pub fn deinit(self: *Self) void {
self.node_registry.deinit();
self.aw.deinit(); self.aw.deinit();
self.browser.deinit(); self.browser.deinit();
self.notification.deinit(); self.notification.deinit();
@@ -82,7 +87,7 @@ pub fn sendResult(self: *Self, id: std.json.Value, result: anytype) !void {
} }
pub fn sendError(self: *Self, id: std.json.Value, code: protocol.ErrorCode, message: []const u8) !void { pub fn sendError(self: *Self, id: std.json.Value, code: protocol.ErrorCode, message: []const u8) !void {
try self.sendResponse(protocol.Response{ try self.sendResponse(.{
.id = id, .id = id,
.@"error" = protocol.Error{ .@"error" = protocol.Error{
.code = @intFromEnum(code), .code = @intFromEnum(code),

View File

@@ -114,6 +114,7 @@ pub const Tool = struct {
}; };
pub fn minify(comptime json: []const u8) []const u8 { pub fn minify(comptime json: []const u8) []const u8 {
@setEvalBranchQuota(100000);
return comptime blk: { return comptime blk: {
var res: []const u8 = ""; var res: []const u8 = "";
var in_string = false; var in_string = false;

View File

@@ -8,6 +8,7 @@ const Element = @import("../browser/webapi/Element.zig");
const Selector = @import("../browser/webapi/selector/Selector.zig"); const Selector = @import("../browser/webapi/selector/Selector.zig");
const protocol = @import("protocol.zig"); const protocol = @import("protocol.zig");
const Server = @import("Server.zig"); const Server = @import("Server.zig");
const CDPNode = @import("../cdp/Node.zig");
pub const tool_list = [_]protocol.Tool{ pub const tool_list = [_]protocol.Tool{
.{ .{
@@ -61,6 +62,18 @@ pub const tool_list = [_]protocol.Tool{
\\} \\}
), ),
}, },
.{
.name = "semantic_tree",
.description = "Get the page content as a simplified semantic DOM tree for AI reasoning. If a url is provided, it navigates to that url first.",
.inputSchema = protocol.minify(
\\{
\\ "type": "object",
\\ "properties": {
\\ "url": { "type": "string", "description": "Optional URL to navigate to before fetching the semantic tree." }
\\ }
\\}
),
},
.{ .{
.name = "interactiveElements", .name = "interactiveElements",
.description = "Extract interactive elements from the opened page. If a url is provided, it navigates to that url first.", .description = "Extract interactive elements from the opened page. If a url is provided, it navigates to that url first.",
@@ -103,13 +116,16 @@ const EvaluateParams = struct {
const ToolStreamingText = struct { const ToolStreamingText = struct {
page: *lp.Page, page: *lp.Page,
action: enum { markdown, links }, action: enum { markdown, links, semantic_tree },
registry: ?*CDPNode.Registry = null,
arena: ?std.mem.Allocator = null,
pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) !void { pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) !void {
try jw.beginWriteRaw(); try jw.beginWriteRaw();
try jw.writer.writeByte('"'); try jw.writer.writeByte('"');
var escaped = protocol.JsonEscapingWriter.init(jw.writer); var escaped: protocol.JsonEscapingWriter = .init(jw.writer);
const w = &escaped.writer; const w = &escaped.writer;
switch (self.action) { switch (self.action) {
.markdown => lp.markdown.dump(self.page.document.asNode(), .{}, w, self.page) catch |err| { .markdown => lp.markdown.dump(self.page.document.asNode(), .{}, w, self.page) catch |err| {
log.err(.mcp, "markdown dump failed", .{ .err = err }); log.err(.mcp, "markdown dump failed", .{ .err = err });
@@ -137,7 +153,21 @@ const ToolStreamingText = struct {
log.err(.mcp, "query links failed", .{ .err = err }); log.err(.mcp, "query links failed", .{ .err = err });
} }
}, },
.semantic_tree => {
const st = lp.SemanticTree{
.dom_node = self.page.document.asNode(),
.registry = self.registry.?,
.page = self.page,
.arena = self.arena.?,
.prune = true,
};
st.textStringify(w) catch |err| {
log.err(.mcp, "semantic tree dump failed", .{ .err = err });
};
},
} }
try jw.writer.writeByte('"'); try jw.writer.writeByte('"');
jw.endWriteRaw(); jw.endWriteRaw();
} }
@@ -151,6 +181,7 @@ const ToolAction = enum {
interactiveElements, interactiveElements,
structuredData, structuredData,
evaluate, evaluate,
semantic_tree,
}; };
const tool_map = std.StaticStringMap(ToolAction).initComptime(.{ const tool_map = std.StaticStringMap(ToolAction).initComptime(.{
@@ -161,6 +192,7 @@ const tool_map = std.StaticStringMap(ToolAction).initComptime(.{
.{ "interactiveElements", .interactiveElements }, .{ "interactiveElements", .interactiveElements },
.{ "structuredData", .structuredData }, .{ "structuredData", .structuredData },
.{ "evaluate", .evaluate }, .{ "evaluate", .evaluate },
.{ "semantic_tree", .semantic_tree },
}); });
pub fn handleCall(server: *Server, arena: std.mem.Allocator, req: protocol.Request) !void { pub fn handleCall(server: *Server, arena: std.mem.Allocator, req: protocol.Request) !void {
@@ -188,6 +220,7 @@ pub fn handleCall(server: *Server, arena: std.mem.Allocator, req: protocol.Reque
.interactiveElements => try handleInteractiveElements(server, arena, req.id.?, call_params.arguments), .interactiveElements => try handleInteractiveElements(server, arena, req.id.?, call_params.arguments),
.structuredData => try handleStructuredData(server, arena, req.id.?, call_params.arguments), .structuredData => try handleStructuredData(server, arena, req.id.?, call_params.arguments),
.evaluate => try handleEvaluate(server, arena, req.id.?, call_params.arguments), .evaluate => try handleEvaluate(server, arena, req.id.?, call_params.arguments),
.semantic_tree => try handleSemanticTree(server, arena, req.id.?, call_params.arguments),
} }
} }
@@ -241,6 +274,27 @@ fn handleLinks(server: *Server, arena: std.mem.Allocator, id: std.json.Value, ar
try server.sendResult(id, protocol.CallToolResult(ToolStreamingText){ .content = &content }); try server.sendResult(id, protocol.CallToolResult(ToolStreamingText){ .content = &content });
} }
/// Handles a `semantic_tree` tool call.
///
/// When `arguments` carries a string `url`, navigates to it first through
/// `performGoto` (its errors propagate). Then replies with a text result whose
/// content is a `ToolStreamingText` configured with the `.semantic_tree`
/// action, the server's `node_registry` and `arena`.
///
/// Replies with a `PageNotLoaded` error when the session has no current page.
/// Arguments that fail to parse are logged and otherwise ignored, so the call
/// falls back to the page that is already loaded.
fn handleSemanticTree(server: *Server, arena: std.mem.Allocator, id: std.json.Value, arguments: ?std.json.Value) !void {
    const TreeParams = struct {
        url: ?[:0]const u8 = null,
    };
    if (arguments) |args_raw| {
        if (std.json.parseFromValueLeaky(TreeParams, arena, args_raw, .{ .ignore_unknown_fields = true })) |args| {
            if (args.url) |u| {
                try performGoto(server, u, id);
            }
        } else |err| {
            // Stay best-effort (no navigation, use the current page), but do
            // not drop the failure silently: a malformed `url` would otherwise
            // return the tree of an unrelated page with no trace of why.
            log.err(.mcp, "semantic tree invalid arguments", .{ .err = err });
        }
    }
    const page = server.session.currentPage() orelse {
        return server.sendError(id, .PageNotLoaded, "Page not loaded");
    };
    // `registry` and `arena` must be set here: the `.semantic_tree` branch of
    // `ToolStreamingText.jsonStringify` unwraps both with `.?`.
    const content = [_]protocol.TextContent(ToolStreamingText){.{
        .text = .{ .page = page, .action = .semantic_tree, .registry = &server.node_registry, .arena = arena },
    }};
    try server.sendResult(id, protocol.CallToolResult(ToolStreamingText){ .content = &content });
}
fn handleInteractiveElements(server: *Server, arena: std.mem.Allocator, id: std.json.Value, arguments: ?std.json.Value) !void { fn handleInteractiveElements(server: *Server, arena: std.mem.Allocator, id: std.json.Value, arguments: ?std.json.Value) !void {
const Params = struct { const Params = struct {
url: ?[:0]const u8 = null, url: ?[:0]const u8 = null,

View File

@@ -305,6 +305,12 @@ pub const String = packed struct {
} }
}; };
/// Reports whether every byte of `text` is ASCII whitespace according to
/// `std.ascii.isWhitespace`. An empty slice yields `true`.
pub fn isAllWhitespace(text: []const u8) bool {
    for (text) |byte| {
        // The first non-whitespace byte settles the answer.
        if (!std.ascii.isWhitespace(byte)) return false;
    }
    return true;
}
// Discriminatory type that signals the bridge to use arena instead of call_arena // Discriminatory type that signals the bridge to use arena instead of call_arena
// Use this for strings that need to persist beyond the current call // Use this for strings that need to persist beyond the current call
// The caller can unwrap and store just the underlying .str field // The caller can unwrap and store just the underlying .str field