/// Reports whether `lookup` holds an entry for the pair (`target`, `typ`).
///
/// The lookup key is built from the address of `target` and `typ` wrapped as
/// a `String`. Nothing is allocated and nothing is mutated.
/// NOTE(review): presumably an entry exists only while at least one listener
/// is registered for that pair — confirm against `register` / removal code.
pub fn hasListener(self: *EventManager, target: *EventTarget, typ: []const u8) bool {
    return self.lookup.contains(.{
        .event_target = @intFromPtr(target),
        .type_string = String.wrap(typ),
    });
}
const std = @import("std");

const lp = @import("lightpanda");
const log = @import("../../log.zig");
const markdown = lp.markdown;
const Node = @import("../Node.zig");
const DOMNode = @import("../../browser/webapi/Node.zig");
const SemanticTree = @import("../semantic_tree.zig");

/// Dispatches a command of this domain according to `cmd.input.action`.
///
/// Returns `error.UnknownMethod` when the action name is not one of the
/// actions listed below; otherwise forwards to the matching handler and
/// propagates whatever that handler returns.
pub fn processMessage(cmd: anytype) !void {
    const Action = enum {
        getMarkdown,
        getSemanticTree,
    };
    const action = std.meta.stringToEnum(Action, cmd.input.action) orelse return error.UnknownMethod;

    switch (action) {
        .getMarkdown => return getMarkdown(cmd),
        .getSemanticTree => return getSemanticTree(cmd),
    }
}

/// Lazily-serialized semantic tree.
///
/// The tree is not materialized up front: it is written straight into the
/// JSON stream when the response containing this value is stringified.
const SemanticTreeResult = struct {
    /// Root node the dump starts from.
    dom_node: *DOMNode,
    /// Registry passed through to `SemanticTree.dump`.
    registry: *Node.Registry,
    page: *lp.Page,
    /// Allocator passed through to `SemanticTree.dump`.
    arena: std.mem.Allocator,

    /// Custom `std.json` hook. Any failure inside the dump is logged and
    /// reported as `error.WriteFailed`, the only error this hook may return.
    pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void {
        SemanticTree.dump(self.dom_node, self.registry, jw, self.page, self.arena) catch |err| {
            log.err(.cdp, "semantic tree dump failed", .{ .err = err });
            return error.WriteFailed;
        };
    }
};

/// Handles the `getSemanticTree` action: replies with the semantic tree of
/// the current page's document under the `semanticTree` key.
///
/// Errors: `error.NoBrowserContext` when the command has no browser context,
/// `error.PageNotLoaded` when the session has no current page.
fn getSemanticTree(cmd: anytype) !void {
    const bc = cmd.browser_context orelse return error.NoBrowserContext;
    const page = bc.session.currentPage() orelse return error.PageNotLoaded;

    // The value must be explicitly typed. A nested anonymous literal inside
    // the anonymous result struct has no result type, so it would stay a
    // plain anonymous struct and `SemanticTreeResult.jsonStringify` would
    // never be invoked.
    const result: SemanticTreeResult = .{
        .dom_node = page.document.asNode(),
        .registry = &bc.node_registry,
        .page = page,
        .arena = cmd.arena,
    };

    return cmd.sendResult(.{ .semanticTree = result }, .{});
}
// Copyright (C) 2023-2026  Lightpanda (Selecy SAS)
//
// Francis Bouvier
// Pierre Tachoire
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

const std = @import("std");

const lp = @import("lightpanda");
const Page = lp.Page;

const CData = @import("../browser/webapi/CData.zig");
const Element = @import("../browser/webapi/Element.zig");
const Node = @import("../browser/webapi/Node.zig");
const AXNode = @import("AXNode.zig");
const CDPNode = @import("Node.zig");

/// Lower-case tag names whose element (and whole subtree) is left out of the
/// semantic tree.
const skipped_tags = [_][]const u8{
    "script",
    "style",
    "meta",
    "link",
    "noscript",
    "svg",
    "head",
    "title",
};

/// Roles that mark an element as interactive on their own.
const interactive_roles = [_][]const u8{
    "button",
    "link",
    "checkbox",
    "radio",
    "textbox",
    "combobox",
    "searchbox",
    "slider",
    "spinbutton",
    "switch",
    "menuitem",
};

/// Event types whose registered listener marks an element as interactive.
const interaction_events = [_][]const u8{
    "click",
    "mousedown",
    "mouseup",
    "keydown",
    "change",
    "input",
};

/// Writes the semantic tree rooted at `root` into `jw` as nested JSON objects.
///
/// Every emitted node is registered in `registry`. `arena` is used for the
/// XPath strings built along the way; nothing is freed here.
pub fn dump(root: *Node, registry: *CDPNode.Registry, jw: *std.json.Stringify, page: *Page, arena: std.mem.Allocator) !void {
    try dumpNode(root, registry, jw, page, "", arena);
}

/// Returns true when `text` is empty or made only of ASCII whitespace.
fn isAllWhitespace(text: []const u8) bool {
    for (text) |c| {
        if (!std.ascii.isWhitespace(c)) return false;
    }
    return true;
}

/// Returns true when `needle` equals one of the strings in `haystack`.
fn containsString(haystack: []const []const u8, needle: []const u8) bool {
    for (haystack) |candidate| {
        if (std.mem.eql(u8, candidate, needle)) return true;
    }
    return false;
}

/// Builds the XPath step that selects `node` among its parent's children.
///
/// Elements yield `/tag[n]`, where `n` is the 1-based position among the
/// preceding siblings with the same lower-case tag name. Text nodes yield
/// `/text()[n]`, counted among preceding text siblings. Any other node
/// yields an empty, non-allocated string.
fn getXPathSegment(node: *Node, arena: std.mem.Allocator) ![]const u8 {
    if (node.is(Element)) |el| {
        const tag = el.getTagNameLower();
        var index: usize = 1;

        if (node._parent) |parent| {
            var it = parent.childrenIterator();
            while (it.next()) |sibling| {
                if (sibling == node) break;
                const sibling_el = sibling.is(Element) orelse continue;
                if (std.mem.eql(u8, sibling_el.getTagNameLower(), tag)) {
                    index += 1;
                }
            }
        }
        return std.fmt.allocPrint(arena, "/{s}[{d}]", .{ tag, index });
    }

    if (node.is(CData.Text) != null) {
        var index: usize = 1;

        if (node._parent) |parent| {
            var it = parent.childrenIterator();
            while (it.next()) |sibling| {
                if (sibling == node) break;
                if (sibling.is(CData.Text) != null) {
                    index += 1;
                }
            }
        }
        return std.fmt.allocPrint(arena, "/text()[{d}]", .{index});
    }

    return "";
}

/// Decides whether the element behind `node` should be flagged interactive.
///
/// An element is interactive when its role is listed in `interactive_roles`,
/// when a listener for one of `interaction_events` is registered on it, or
/// when it is an HTML element with one of the matching `on*` attribute
/// functions.
fn isInteractive(node: *Node, role: []const u8, page: *Page) bool {
    if (containsString(&interactive_roles, role)) return true;

    const event_target = node.asEventTarget();
    for (interaction_events) |event_name| {
        if (page._event_manager.hasListener(event_target, event_name)) return true;
    }

    const el = node.is(Element) orelse return false;
    const html_el = el.is(Element.Html) orelse return false;
    return html_el.hasAttributeFunction(.onclick, page) or
        html_el.hasAttributeFunction(.onmousedown, page) or
        html_el.hasAttributeFunction(.onmouseup, page) or
        html_el.hasAttributeFunction(.onkeydown, page) or
        html_el.hasAttributeFunction(.onchange, page) or
        html_el.hasAttributeFunction(.oninput, page);
}

/// Emits `node` and, recursively, its children as one JSON object.
///
/// Skipped entirely (nothing is written, nothing is registered):
///   - elements whose tag is in `skipped_tags`;
///   - elements whose inline `style` contains `display: none` / `display:none`;
///   - HTML elements whose `getHidden()` is true;
///   - text nodes that are whitespace-only;
///   - any node that is neither an element, a text node, a document nor a
///     document fragment.
///
/// `parent_xpath` is the XPath of the parent; this node's own segment is
/// appended to it and the result is passed down to the children.
fn dumpNode(node: *Node, registry: *CDPNode.Registry, jw: *std.json.Stringify, page: *Page, parent_xpath: []const u8, arena: std.mem.Allocator) !void {
    const is_container = node._type == .document or node._type == .document_fragment;

    // 1. Skip non-content nodes
    if (node.is(Element)) |el| {
        if (containsString(&skipped_tags, el.getTagNameLower())) return;

        // CSS display: none visibility check (inline style only for now)
        if (el.getAttributeSafe(comptime lp.String.wrap("style"))) |style| {
            if (std.mem.indexOf(u8, style, "display: none") != null or
                std.mem.indexOf(u8, style, "display:none") != null)
            {
                return;
            }
        }

        if (el.is(Element.Html)) |html_el| {
            if (html_el.getHidden()) return;
        }
    } else if (node.is(CData.Text)) |text_node| {
        if (isAllWhitespace(text_node.getWholeText())) return;
    } else if (!is_container) {
        return;
    }

    // 2. Register the node and gather what is needed before writing.
    const cdp_node = try registry.register(node);
    const axn = AXNode.fromNode(node);
    const role = try axn.getRole();

    const node_name: []const u8 = if (node.is(Element)) |el|
        el.getTagNameLower()
    else if (is_container)
        "root"
    else
        "text";

    const segment = try getXPathSegment(node, arena);
    const xpath = try std.mem.concat(arena, u8, &.{ parent_xpath, segment });

    // 3. Emit the node.
    try jw.beginObject();

    try jw.objectField("nodeId");
    try jw.write(cdp_node.id);

    try jw.objectField("backendNodeId");
    try jw.write(cdp_node.id);

    try jw.objectField("nodeName");
    try jw.write(node_name);

    try jw.objectField("xpath");
    try jw.write(xpath);

    if (node.is(Element)) |el| {
        try jw.objectField("nodeType");
        try jw.write(1);

        try jw.objectField("isInteractive");
        try jw.write(isInteractive(node, role, page));

        try jw.objectField("role");
        try jw.write(role);

        if (el._attributes) |attrs| {
            try jw.objectField("attributes");
            try jw.beginObject();
            var iter = attrs.iterator();
            while (iter.next()) |attr| {
                try jw.objectField(attr._name.str());
                try jw.write(attr._value.str());
            }
            try jw.endObject();
        }
    } else if (node.is(CData.Text)) |text_node| {
        try jw.objectField("nodeType");
        try jw.write(3);
        try jw.objectField("nodeValue");
        try jw.write(text_node.getWholeText());
    } else {
        // Only documents and document fragments reach this branch. The DOM
        // assigns them distinct node types: 9 (DOCUMENT_NODE) and
        // 11 (DOCUMENT_FRAGMENT_NODE).
        const node_type: u8 = if (node._type == .document_fragment) 11 else 9;
        try jw.objectField("nodeType");
        try jw.write(node_type);
    }

    // 4. Recurse into the children; skipped children write nothing.
    try jw.objectField("children");
    try jw.beginArray();
    var it = node.childrenIterator();
    while (it.next()) |child| {
        try dumpNode(child, registry, jw, page, xpath, arena);
    }
    try jw.endArray();

    try jw.endObject();
}