From 22d31b1527a42a2ccc8ed2baf9cde857d4b60597 Mon Sep 17 00:00:00 2001 From: egrs Date: Tue, 10 Mar 2026 09:19:51 +0100 Subject: [PATCH 1/2] add LP.getStructuredData CDP command --- src/browser/structured_data.zig | 435 ++++++++++++++++++++++++++++++++ src/cdp/domains/lp.zig | 34 +++ src/lightpanda.zig | 1 + 3 files changed, 470 insertions(+) create mode 100644 src/browser/structured_data.zig diff --git a/src/browser/structured_data.zig b/src/browser/structured_data.zig new file mode 100644 index 00000000..4335fb88 --- /dev/null +++ b/src/browser/structured_data.zig @@ -0,0 +1,435 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +const std = @import("std"); + +const Page = @import("Page.zig"); +const URL = @import("URL.zig"); +const TreeWalker = @import("webapi/TreeWalker.zig"); +const Element = @import("webapi/Element.zig"); +const Node = @import("webapi/Node.zig"); + +const Allocator = std.mem.Allocator; + +/// Key-value pair for structured data properties. +pub const Property = struct { + key: []const u8, + value: []const u8, +}; + +pub const AlternateLink = struct { + href: []const u8, + hreflang: ?[]const u8, + type: ?[]const u8, + title: ?[]const u8, +}; + +pub const StructuredData = struct { + json_ld: []const []const u8, + open_graph: []const Property, + twitter_card: []const Property, + meta: []const Property, + links: []const Property, + alternate: []const AlternateLink, + + pub fn jsonStringify(self: *const StructuredData, jw: anytype) !void { + try jw.beginObject(); + + try jw.objectField("jsonLd"); + try jw.write(self.json_ld); + + try jw.objectField("openGraph"); + try writeProperties(jw, self.open_graph); + + try jw.objectField("twitterCard"); + try writeProperties(jw, self.twitter_card); + + try jw.objectField("meta"); + try writeProperties(jw, self.meta); + + try jw.objectField("links"); + try writeProperties(jw, self.links); + + if (self.alternate.len > 0) { + try jw.objectField("alternate"); + try jw.beginArray(); + for (self.alternate) |alt| { + try jw.beginObject(); + try jw.objectField("href"); + try jw.write(alt.href); + if (alt.hreflang) |v| { + try jw.objectField("hreflang"); + try jw.write(v); + } + if (alt.type) |v| { + try jw.objectField("type"); + try jw.write(v); + } + if (alt.title) |v| { + try jw.objectField("title"); + try jw.write(v); + } + try jw.endObject(); + } + try jw.endArray(); + } + + try jw.endObject(); + } +}; + +fn writeProperties(jw: anytype, properties: []const Property) !void { + try jw.beginObject(); + for (properties) |prop| { + try jw.objectField(prop.key); + try jw.write(prop.value); + } + try jw.endObject(); +} + +/// Extract all structured data from the page. +pub fn collectStructuredData( + root: *Node, + arena: Allocator, + page: *Page, +) !StructuredData { + var json_ld: std.ArrayList([]const u8) = .empty; + var open_graph: std.ArrayList(Property) = .empty; + var twitter_card: std.ArrayList(Property) = .empty; + var meta: std.ArrayList(Property) = .empty; + var links: std.ArrayList(Property) = .empty; + var alternate: std.ArrayList(AlternateLink) = .empty; + + // Extract language from the root element. + if (root.is(Element)) |root_el| { + if (root_el.getAttributeSafe(comptime .wrap("lang"))) |lang| { + try meta.append(arena, .{ .key = "language", .value = lang }); + } + } else { + // Root is document — check documentElement. + var children = root.childrenIterator(); + while (children.next()) |child| { + const el = child.is(Element) orelse continue; + if (el.getTag() == .html) { + if (el.getAttributeSafe(comptime .wrap("lang"))) |lang| { + try meta.append(arena, .{ .key = "language", .value = lang }); + } + break; + } + } + } + + var tw = TreeWalker.Full.init(root, .{}); + while (tw.next()) |node| { + const el = node.is(Element) orelse continue; + + switch (el.getTag()) { + .script => { + try collectJsonLd(el, arena, &json_ld); + tw.skipChildren(); + }, + .meta => collectMeta(el, &open_graph, &twitter_card, &meta, arena) catch {}, + .title => try collectTitle(node, arena, &meta), + .link => try collectLink(el, arena, page, &links, &alternate), + // Skip body subtree for non-JSON-LD — all other metadata is in . + // JSON-LD can appear in so we don't skip the whole body. + else => {}, + } + } + + return .{ + .json_ld = json_ld.items, + .open_graph = open_graph.items, + .twitter_card = twitter_card.items, + .meta = meta.items, + .links = links.items, + .alternate = alternate.items, + }; +} + +fn collectJsonLd( + el: *Element, + arena: Allocator, + json_ld: *std.ArrayList([]const u8), +) !void { + const type_attr = el.getAttributeSafe(comptime .wrap("type")) orelse return; + if (!std.ascii.eqlIgnoreCase(type_attr, "application/ld+json")) return; + + var buf: std.Io.Writer.Allocating = .init(arena); + try el.asNode().getTextContent(&buf.writer); + const text = buf.written(); + if (text.len > 0) { + try json_ld.append(arena, std.mem.trim(u8, text, &std.ascii.whitespace)); + } +} + +fn collectMeta( + el: *Element, + open_graph: *std.ArrayList(Property), + twitter_card: *std.ArrayList(Property), + meta: *std.ArrayList(Property), + arena: Allocator, +) !void { + // charset: (no content attribute needed). + if (el.getAttributeSafe(comptime .wrap("charset"))) |charset| { + try meta.append(arena, .{ .key = "charset", .value = charset }); + } + + const content = el.getAttributeSafe(comptime .wrap("content")) orelse return; + + // Open Graph: + if (el.getAttributeSafe(comptime .wrap("property"))) |property| { + if (startsWith(property, "og:")) { + try open_graph.append(arena, .{ .key = property[3..], .value = content }); + return; + } + // Article, profile, etc. are OG sub-namespaces. + if (startsWith(property, "article:") or + startsWith(property, "profile:") or + startsWith(property, "book:") or + startsWith(property, "music:") or + startsWith(property, "video:")) + { + try open_graph.append(arena, .{ .key = property, .value = content }); + return; + } + } + + // Twitter Cards: + if (el.getAttributeSafe(comptime .wrap("name"))) |name| { + if (startsWith(name, "twitter:")) { + try twitter_card.append(arena, .{ .key = name[8..], .value = content }); + return; + } + + // Standard meta tags by name. + const known_names = [_][]const u8{ + "description", "author", "keywords", "robots", + "viewport", "generator", "theme-color", + }; + for (known_names) |known| { + if (std.ascii.eqlIgnoreCase(name, known)) { + try meta.append(arena, .{ .key = known, .value = content }); + return; + } + } + } + + // http-equiv (e.g. Content-Type, refresh) + if (el.getAttributeSafe(comptime .wrap("http-equiv"))) |http_equiv| { + try meta.append(arena, .{ .key = http_equiv, .value = content }); + } +} + +fn collectTitle( + node: *Node, + arena: Allocator, + meta: *std.ArrayList(Property), +) !void { + var buf: std.Io.Writer.Allocating = .init(arena); + try node.getTextContent(&buf.writer); + const text = std.mem.trim(u8, buf.written(), &std.ascii.whitespace); + if (text.len > 0) { + try meta.append(arena, .{ .key = "title", .value = text }); + } +} + +fn collectLink( + el: *Element, + arena: Allocator, + page: *Page, + links: *std.ArrayList(Property), + alternate: *std.ArrayList(AlternateLink), +) !void { + const rel = el.getAttributeSafe(comptime .wrap("rel")) orelse return; + const raw_href = el.getAttributeSafe(comptime .wrap("href")) orelse return; + const href = URL.resolve(arena, page.base(), raw_href, .{ .encode = true }) catch raw_href; + + if (std.ascii.eqlIgnoreCase(rel, "alternate")) { + try alternate.append(arena, .{ + .href = href, + .hreflang = el.getAttributeSafe(comptime .wrap("hreflang")), + .type = el.getAttributeSafe(comptime .wrap("type")), + .title = el.getAttributeSafe(comptime .wrap("title")), + }); + return; + } + + const relevant_rels = [_][]const u8{ + "canonical", "icon", "manifest", "shortcut icon", + "apple-touch-icon", "search", "author", "license", + "dns-prefetch", "preconnect", + }; + for (relevant_rels) |known| { + if (std.ascii.eqlIgnoreCase(rel, known)) { + try links.append(arena, .{ .key = known, .value = href }); + return; + } + } +} + +fn startsWith(haystack: []const u8, prefix: []const u8) bool { + if (haystack.len < prefix.len) return false; + return std.mem.eql(u8, haystack[0..prefix.len], prefix); +} + +// --- Tests --- + +const testing = @import("../testing.zig"); + +fn testStructuredData(html: []const u8) !StructuredData { + const page = try testing.test_session.createPage(); + defer testing.test_session.removePage(); + + const doc = page.window._document; + const div = try doc.createElement("div", null, page); + try page.parseHtmlAsChildren(div.asNode(), html); + + return collectStructuredData(div.asNode(), page.call_arena, page); +} + +fn findProperty(props: []const Property, key: []const u8) ?[]const u8 { + for (props) |p| { + if (std.mem.eql(u8, p.key, key)) return p.value; + } + return null; +} + +test "structured_data: json-ld" { + const data = try testStructuredData( + \\ + ); + try testing.expectEqual(1, data.json_ld.len); + try testing.expect(std.mem.indexOf(u8, data.json_ld[0], "Article") != null); +} + +test "structured_data: multiple json-ld" { + const data = try testStructuredData( + \\ + \\ + \\ + ); + try testing.expectEqual(2, data.json_ld.len); +} + +test "structured_data: open graph" { + const data = try testStructuredData( + \\ + \\ + \\ + \\ + \\ + \\ + ); + try testing.expectEqual(6, data.open_graph.len); + try testing.expectEqual("My Page", findProperty(data.open_graph, "title").?); + try testing.expectEqual("article", findProperty(data.open_graph, "type").?); + try testing.expectEqual("2026-03-10", findProperty(data.open_graph, "article:published_time").?); +} + +test "structured_data: twitter card" { + const data = try testStructuredData( + \\ + \\ + \\ + ); + try testing.expectEqual(3, data.twitter_card.len); + try testing.expectEqual("summary_large_image", findProperty(data.twitter_card, "card").?); + try testing.expectEqual("@example", findProperty(data.twitter_card, "site").?); +} + +test "structured_data: meta tags" { + const data = try testStructuredData( + \\Page Title + \\ + \\ + \\ + \\ + ); + try testing.expectEqual("Page Title", findProperty(data.meta, "title").?); + try testing.expectEqual("A test page", findProperty(data.meta, "description").?); + try testing.expectEqual("Test Author", findProperty(data.meta, "author").?); + try testing.expectEqual("test, example", findProperty(data.meta, "keywords").?); + try testing.expectEqual("index, follow", findProperty(data.meta, "robots").?); +} + +test "structured_data: link elements" { + const data = try testStructuredData( + \\ + \\ + \\ + \\ + ); + try testing.expectEqual(3, data.links.len); + try testing.expectEqual("https://example.com/page", findProperty(data.links, "canonical").?); + // stylesheet should be filtered out + try testing.expectEqual(null, findProperty(data.links, "stylesheet")); +} + +test "structured_data: alternate links" { + const data = try testStructuredData( + \\ + \\ + ); + try testing.expectEqual(2, data.alternate.len); + try testing.expectEqual("fr", data.alternate[0].hreflang.?); + try testing.expectEqual("French", data.alternate[0].title.?); + try testing.expectEqual("de", data.alternate[1].hreflang.?); + try testing.expectEqual(null, data.alternate[1].title); +} + +test "structured_data: non-metadata elements ignored" { + const data = try testStructuredData( + \\
Just text
+ \\

More text

+ \\Link + ); + try testing.expectEqual(0, data.json_ld.len); + try testing.expectEqual(0, data.open_graph.len); + try testing.expectEqual(0, data.twitter_card.len); + try testing.expectEqual(0, data.meta.len); + try testing.expectEqual(0, data.links.len); +} + +test "structured_data: charset and http-equiv" { + const data = try testStructuredData( + \\ + \\ + ); + try testing.expectEqual("utf-8", findProperty(data.meta, "charset").?); + try testing.expectEqual("text/html; charset=utf-8", findProperty(data.meta, "Content-Type").?); +} + +test "structured_data: mixed content" { + const data = try testStructuredData( + \\My Site + \\ + \\ + \\ + \\ + \\ + ); + try testing.expectEqual(1, data.json_ld.len); + try testing.expectEqual(1, data.open_graph.len); + try testing.expectEqual(1, data.twitter_card.len); + try testing.expectEqual("My Site", findProperty(data.meta, "title").?); + try testing.expectEqual("A page", findProperty(data.meta, "description").?); + try testing.expectEqual(1, data.links.len); +} diff --git a/src/cdp/domains/lp.zig b/src/cdp/domains/lp.zig index 5503c356..84ae8417 100644 --- a/src/cdp/domains/lp.zig +++ b/src/cdp/domains/lp.zig @@ -19,15 +19,18 @@ const std = @import("std"); const lp = @import("lightpanda"); const markdown = lp.markdown; +const structured_data = lp.structured_data; const Node = @import("../Node.zig"); pub fn processMessage(cmd: anytype) !void { const action = std.meta.stringToEnum(enum { getMarkdown, + getStructuredData, }, cmd.input.action) orelse return error.UnknownMethod; switch (action) { .getMarkdown => return getMarkdown(cmd), + .getStructuredData => return getStructuredData(cmd), } } @@ -54,6 +57,21 @@ fn getMarkdown(cmd: anytype) !void { }, .{}); } +fn getStructuredData(cmd: anytype) !void { + const bc = cmd.browser_context orelse return error.NoBrowserContext; + const page = bc.session.currentPage() orelse return error.PageNotLoaded; + + const data = try structured_data.collectStructuredData( + page.document.asNode(), + cmd.arena, + page, + ); + + return cmd.sendResult(.{ + .structuredData = data, + }, .{}); +} + const testing = @import("../testing.zig"); test "cdp.lp: getMarkdown" { var ctx = testing.context(); @@ -70,3 +88,19 @@ test "cdp.lp: getMarkdown" { const result = ctx.client.?.sent.items[0].object.get("result").?.object; try testing.expect(result.get("markdown") != null); } + +test "cdp.lp: getStructuredData" { + var ctx = testing.context(); + defer ctx.deinit(); + + const bc = try ctx.loadBrowserContext(.{}); + _ = try bc.session.createPage(); + + try ctx.processMessage(.{ + .id = 1, + .method = "LP.getStructuredData", + }); + + const result = ctx.client.?.sent.items[0].object.get("result").?.object; + try testing.expect(result.get("structuredData") != null); +} diff --git a/src/lightpanda.zig b/src/lightpanda.zig index 26bc23f0..33dad427 100644 --- a/src/lightpanda.zig +++ b/src/lightpanda.zig @@ -30,6 +30,7 @@ pub const log = @import("log.zig"); pub const js = @import("browser/js/js.zig"); pub const dump = @import("browser/dump.zig"); pub const markdown = @import("browser/markdown.zig"); +pub const structured_data = @import("browser/structured_data.zig"); pub const mcp = @import("mcp.zig"); pub const build_config = @import("build_config"); pub const crash_handler = @import("crash_handler.zig"); From f4ca5313e6bdbf5d9518241642c1058ce3ecf0fb Mon Sep 17 00:00:00 2001 From: egrs Date: Tue, 10 Mar 2026 13:18:25 +0100 Subject: [PATCH 2/2] use std.mem.startsWith, group duplicate property keys into arrays Address review feedback: - replace custom startsWith helper with std.mem.startsWith - writeProperties now groups repeated keys (e.g. multiple og:image) into JSON arrays; single-occurrence keys remain strings - add test for duplicate key serialization --- src/browser/structured_data.zig | 82 +++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/src/browser/structured_data.zig b/src/browser/structured_data.zig index 4335fb88..9b6e7fbe 100644 --- a/src/browser/structured_data.zig +++ b/src/browser/structured_data.zig @@ -93,11 +93,41 @@ pub const StructuredData = struct { } }; +/// Serializes properties as a JSON object. When a key appears multiple times +/// (e.g. multiple og:image tags), values are grouped into an array. +/// Alternatives considered: always-array values (verbose), or an array of +/// {key, value} pairs (preserves order but less ergonomic for consumers). fn writeProperties(jw: anytype, properties: []const Property) !void { try jw.beginObject(); - for (properties) |prop| { + for (properties, 0..) |prop, i| { + // Skip keys already written by an earlier occurrence. + var already_written = false; + for (properties[0..i]) |prev| { + if (std.mem.eql(u8, prev.key, prop.key)) { + already_written = true; + break; + } + } + if (already_written) continue; + + // Count total occurrences to decide string vs array. + var count: usize = 0; + for (properties) |p| { + if (std.mem.eql(u8, p.key, prop.key)) count += 1; + } + try jw.objectField(prop.key); - try jw.write(prop.value); + if (count == 1) { + try jw.write(prop.value); + } else { + try jw.beginArray(); + for (properties) |p| { + if (std.mem.eql(u8, p.key, prop.key)) { + try jw.write(p.value); + } + } + try jw.endArray(); + } } try jw.endObject(); } @@ -194,16 +224,16 @@ fn collectMeta( // Open Graph: if (el.getAttributeSafe(comptime .wrap("property"))) |property| { - if (startsWith(property, "og:")) { + if (std.mem.startsWith(u8, property, "og:")) { try open_graph.append(arena, .{ .key = property[3..], .value = content }); return; } // Article, profile, etc. are OG sub-namespaces. - if (startsWith(property, "article:") or - startsWith(property, "profile:") or - startsWith(property, "book:") or - startsWith(property, "music:") or - startsWith(property, "video:")) + if (std.mem.startsWith(u8, property, "article:") or + std.mem.startsWith(u8, property, "profile:") or + std.mem.startsWith(u8, property, "book:") or + std.mem.startsWith(u8, property, "music:") or + std.mem.startsWith(u8, property, "video:")) { try open_graph.append(arena, .{ .key = property, .value = content }); return; @@ -212,7 +242,7 @@ fn collectMeta( // Twitter Cards: if (el.getAttributeSafe(comptime .wrap("name"))) |name| { - if (startsWith(name, "twitter:")) { + if (std.mem.startsWith(u8, name, "twitter:")) { try twitter_card.append(arena, .{ .key = name[8..], .value = content }); return; } @@ -283,11 +313,6 @@ fn collectLink( } } -fn startsWith(haystack: []const u8, prefix: []const u8) bool { - if (haystack.len < prefix.len) return false; - return std.mem.eql(u8, haystack[0..prefix.len], prefix); -} - // --- Tests --- const testing = @import("../testing.zig"); @@ -344,6 +369,35 @@ test "structured_data: open graph" { try testing.expectEqual("2026-03-10", findProperty(data.open_graph, "article:published_time").?); } +test "structured_data: open graph duplicate keys" { + const data = try testStructuredData( + \\ + \\ + \\ + \\ + ); + // Duplicate keys are preserved as separate Property entries. + try testing.expectEqual(4, data.open_graph.len); + + // Verify serialization groups duplicates into arrays. + const json = try std.json.Stringify.valueAlloc(testing.allocator, data, .{}); + defer testing.allocator.free(json); + + const parsed = try std.json.parseFromSlice(std.json.Value, testing.allocator, json, .{}); + defer parsed.deinit(); + const og = parsed.value.object.get("openGraph").?.object; + // "title" appears once → string. + switch (og.get("title").?) { + .string => {}, + else => return error.TestUnexpectedResult, + } + // "image" appears 3 times → array. + switch (og.get("image").?) { + .array => |arr| try testing.expectEqual(3, arr.items.len), + else => return error.TestUnexpectedResult, + } +} + test "structured_data: twitter card" { const data = try testStructuredData( \\