SemanticTree: improve visibility, AX roles and XPath generation

- Use `checkVisibility` for more accurate element visibility detection.
- Add support for color, date, file, and month AX roles.
- Optimize XPath generation by tracking sibling indices during the walk.
- Refine interactivity detection for form elements.
This commit is contained in:
Adrià Arrufat
2026-03-10 09:23:06 +09:00
parent 83ba974f94
commit a318c6263d
2 changed files with 59 additions and 73 deletions

View File

@@ -40,7 +40,7 @@ prune: bool = false,
pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void { pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!void {
var visitor = JsonVisitor{ .jw = jw, .tree = self }; var visitor = JsonVisitor{ .jw = jw, .tree = self };
var xpath_buffer: std.ArrayList(u8) = .{}; var xpath_buffer: std.ArrayList(u8) = .{};
self.walk(self.dom_node, &xpath_buffer, null, &visitor) catch |err| { self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1) catch |err| {
log.err(.app, "semantic tree json dump failed", .{ .err = err }); log.err(.app, "semantic tree json dump failed", .{ .err = err });
return error.WriteFailed; return error.WriteFailed;
}; };
@@ -49,7 +49,7 @@ pub fn jsonStringify(self: @This(), jw: *std.json.Stringify) error{WriteFailed}!
pub fn textStringify(self: @This(), writer: *std.Io.Writer) error{WriteFailed}!void { pub fn textStringify(self: @This(), writer: *std.Io.Writer) error{WriteFailed}!void {
var visitor = TextVisitor{ .writer = writer, .tree = self, .depth = 0 }; var visitor = TextVisitor{ .writer = writer, .tree = self, .depth = 0 };
var xpath_buffer: std.ArrayList(u8) = .empty; var xpath_buffer: std.ArrayList(u8) = .empty;
self.walk(self.dom_node, &xpath_buffer, null, &visitor) catch |err| { self.walk(self.dom_node, &xpath_buffer, null, &visitor, 1) catch |err| {
log.err(.app, "semantic tree text dump failed", .{ .err = err }); log.err(.app, "semantic tree text dump failed", .{ .err = err });
return error.WriteFailed; return error.WriteFailed;
}; };
@@ -73,26 +73,7 @@ const NodeData = struct {
node_name: []const u8, node_name: []const u8,
}; };
fn isDisplayNone(style: []const u8) bool { fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_name: ?[]const u8, visitor: anytype, index: usize) !void {
var it = std.mem.splitScalar(u8, style, ';');
while (it.next()) |decl| {
var decl_it = std.mem.splitScalar(u8, decl, ':');
const prop = decl_it.next() orelse continue;
const value = decl_it.next() orelse continue;
const prop_trimmed = std.mem.trim(u8, prop, &std.ascii.whitespace);
const value_trimmed = std.mem.trim(u8, value, &std.ascii.whitespace);
if (std.ascii.eqlIgnoreCase(prop_trimmed, "display") and
std.ascii.eqlIgnoreCase(value_trimmed, "none"))
{
return true;
}
}
return false;
}
fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_name: ?[]const u8, visitor: anytype) !void {
// 1. Skip non-content nodes // 1. Skip non-content nodes
if (node.is(Element)) |el| { if (node.is(Element)) |el| {
const tag = el.getTag(); const tag = el.getTag();
@@ -101,11 +82,9 @@ fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_nam
// We handle options/optgroups natively inside their parents, skip them in the general walk // We handle options/optgroups natively inside their parents, skip them in the general walk
if (tag == .datalist or tag == .option or tag == .optgroup) return; if (tag == .datalist or tag == .option or tag == .optgroup) return;
// CSS display: none visibility check (inline style only for now) // Check visibility using the engine's checkVisibility which handles CSS display: none
if (el.getAttributeSafe(comptime lp.String.wrap("style"))) |style| { if (!el.checkVisibility(self.page)) {
if (isDisplayNone(style)) { return;
return;
}
} }
if (el.is(Element.Html)) |html_el| { if (el.is(Element.Html)) |html_el| {
@@ -136,6 +115,26 @@ fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_nam
const ax_role = std.meta.stringToEnum(AXNode.AXRole, role) orelse .none; const ax_role = std.meta.stringToEnum(AXNode.AXRole, role) orelse .none;
is_interactive = ax_role.isInteractive(); is_interactive = ax_role.isInteractive();
if (el.is(Element.Html.Input)) |input| {
// Force all non-hidden inputs to be interactive
if (input._input_type != .hidden) {
is_interactive = true;
}
value = input.getValue();
if (el.getAttributeSafe(comptime lp.String.wrap("list"))) |list_id| {
options = try extractDataListOptions(list_id, self.page, self.arena);
}
} else if (el.is(Element.Html.TextArea)) |textarea| {
is_interactive = true;
value = textarea.getValue();
} else if (el.is(Element.Html.Select)) |select| {
is_interactive = true;
value = select.getValue(self.page);
options = try extractSelectOptions(el.asNode(), self.page, self.arena);
} else if (el.getTag() == .button) {
is_interactive = true;
}
const event_target = node.asEventTarget(); const event_target = node.asEventTarget();
if (self.page._event_manager.hasListener(event_target, "click") or if (self.page._event_manager.hasListener(event_target, "click") or
self.page._event_manager.hasListener(event_target, "mousedown") or self.page._event_manager.hasListener(event_target, "mousedown") or
@@ -158,24 +157,12 @@ fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_nam
is_interactive = true; is_interactive = true;
} }
} }
if (el.is(Element.Html.Input)) |input| {
value = input.getValue();
if (el.getAttributeSafe(comptime lp.String.wrap("list"))) |list_id| {
options = try extractDataListOptions(list_id, self.page, self.arena);
}
} else if (el.is(Element.Html.TextArea)) |textarea| {
value = textarea.getValue();
} else if (el.is(Element.Html.Select)) |select| {
value = select.getValue(self.page);
options = try extractSelectOptions(el.asNode(), self.page, self.arena);
}
} else if (node._type == .document or node._type == .document_fragment) { } else if (node._type == .document or node._type == .document_fragment) {
node_name = "root"; node_name = "root";
} }
const initial_xpath_len = xpath_buffer.items.len; const initial_xpath_len = xpath_buffer.items.len;
try appendXPathSegment(node, xpath_buffer.writer(self.arena)); try appendXPathSegment(node, xpath_buffer.writer(self.arena), index);
const xpath = xpath_buffer.items; const xpath = xpath_buffer.items;
const name = try axn.getName(self.page, self.arena); const name = try axn.getName(self.page, self.arena);
@@ -225,8 +212,20 @@ fn walk(self: @This(), node: *Node, xpath_buffer: *std.ArrayList(u8), parent_nam
// If we are printing this node normally OR skipping it and unrolling its children, // If we are printing this node normally OR skipping it and unrolling its children,
// we walk the children iterator. // we walk the children iterator.
var it = node.childrenIterator(); var it = node.childrenIterator();
var tag_counts = std.StringArrayHashMap(usize).init(self.arena);
while (it.next()) |child| { while (it.next()) |child| {
try self.walk(child, xpath_buffer, name, visitor); var tag: []const u8 = "text()";
if (child.is(Element)) |el| {
tag = el.getTagNameLower();
}
const gop = try tag_counts.getOrPut(tag);
if (!gop.found_existing) {
gop.value_ptr.* = 0;
}
gop.value_ptr.* += 1;
try self.walk(child, xpath_buffer, name, visitor, gop.value_ptr.*);
} }
} }
@@ -274,34 +273,11 @@ fn extractDataListOptions(list_id: []const u8, page: *Page, arena: std.mem.Alloc
return null; return null;
} }
fn appendXPathSegment(node: *Node, writer: anytype) !void { fn appendXPathSegment(node: *Node, writer: anytype, index: usize) !void {
if (node.is(Element)) |el| { if (node.is(Element)) |el| {
const tag = el.getTagNameLower(); const tag = el.getTagNameLower();
var index: usize = 1;
if (node._parent) |parent| {
var it = parent.childrenIterator();
while (it.next()) |sibling| {
if (sibling == node) break;
if (sibling.is(Element)) |s_el| {
if (std.mem.eql(u8, s_el.getTagNameLower(), tag)) {
index += 1;
}
}
}
}
try std.fmt.format(writer, "/{s}[{d}]", .{ tag, index }); try std.fmt.format(writer, "/{s}[{d}]", .{ tag, index });
} else if (node.is(CData.Text) != null) { } else if (node.is(CData.Text) != null) {
var index: usize = 1;
if (node._parent) |parent| {
var it = parent.childrenIterator();
while (it.next()) |sibling| {
if (sibling == node) break;
if (sibling.is(CData.Text) != null) {
index += 1;
}
}
}
try std.fmt.format(writer, "/text()[{d}]", .{index}); try std.fmt.format(writer, "/text()[{d}]", .{index});
} }
} }

View File

@@ -557,10 +557,10 @@ pub const Writer = struct {
pub const AXRole = enum(u8) { pub const AXRole = enum(u8) {
// zig fmt: off // zig fmt: off
none, article, banner, blockquote, button, caption, cell, checkbox, code, none, article, banner, blockquote, button, caption, cell, checkbox, code, color,
columnheader, combobox, complementary, contentinfo, definition, deletion, columnheader, combobox, complementary, contentinfo, date, definition, deletion,
dialog, document, emphasis, figure, form, group, heading, image, insertion, dialog, document, emphasis, figure, file, form, group, heading, image, insertion,
link, list, listbox, listitem, main, marquee, menuitem, meter, navigation, option, link, list, listbox, listitem, main, marquee, menuitem, meter, month, navigation, option,
paragraph, presentation, progressbar, radio, region, row, rowgroup, paragraph, presentation, progressbar, radio, region, row, rowgroup,
rowheader, searchbox, separator, slider, spinbutton, status, strong, rowheader, searchbox, separator, slider, spinbutton, status, strong,
subscript, superscript, @"switch", table, term, textbox, time, RootWebArea, LineBreak, subscript, superscript, @"switch", table, term, textbox, time, RootWebArea, LineBreak,
@@ -580,6 +580,10 @@ pub const AXRole = enum(u8) {
.spinbutton, .spinbutton,
.@"switch", .@"switch",
.menuitem, .menuitem,
.color,
.date,
.file,
.month,
=> true, => true,
else => false, else => false,
}; };
@@ -638,9 +642,13 @@ pub const AXRole = enum(u8) {
.number => .spinbutton, .number => .spinbutton,
.search => .searchbox, .search => .searchbox,
.checkbox => .checkbox, .checkbox => .checkbox,
.color => .color,
.date => .date,
.file => .file,
.month => .month,
.@"datetime-local", .week, .time => .combobox,
// zig fmt: off // zig fmt: off
.password, .@"datetime-local", .hidden, .month, .color, .password, .hidden => .none,
.week, .time, .file, .date => .none,
// zig fmt: on // zig fmt: on
}; };
}, },
@@ -883,7 +891,7 @@ fn writeName(axnode: AXNode, w: anytype, page: *Page) !?AXSource {
.object, .progress, .meter, .main, .nav, .aside, .header, .object, .progress, .meter, .main, .nav, .aside, .header,
.footer, .form, .section, .article, .ul, .ol, .dl, .menu, .footer, .form, .section, .article, .ul, .ol, .dl, .menu,
.thead, .tbody, .tfoot, .tr, .td, .div, .span, .p, .details, .li, .thead, .tbody, .tfoot, .tr, .td, .div, .span, .p, .details, .li,
.style, .script, .style, .script, .html, .body,
// zig fmt: on // zig fmt: on
=> {}, => {},
else => { else => {
@@ -943,7 +951,9 @@ fn writeAccessibleNameFallback(node: *DOMNode, writer: *std.Io.Writer, page: *Pa
} }
} }
} else { } else {
try writeAccessibleNameFallback(child, writer, page); if (!el.getTag().isMetadata()) {
try writeAccessibleNameFallback(child, writer, page);
}
} }
}, },
else => {}, else => {},