From 1b5efea6ebc8d5ade3aafe330de1c8f34ef98be8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A0=20Arrufat?=
Date: Sun, 15 Feb 2026 23:15:56 +0900
Subject: [PATCH] Add --dump-markdown flag

Add a new module to handle HTML-to-Markdown conversion and integrate it
into the fetch command via a new CLI flag.
---
 src/Config.zig           |  15 +++
 src/browser/markdown.zig | 288 +++++++++++++++++++++++++++++++++++++++
 src/lightpanda.zig       |   8 +-
 src/main.zig             |   3 +-
 4 files changed, 312 insertions(+), 2 deletions(-)
 create mode 100644 src/browser/markdown.zig

diff --git a/src/Config.zig b/src/Config.zig
index c9725168..8b29ee24 100644
--- a/src/Config.zig
+++ b/src/Config.zig
@@ -166,6 +166,7 @@ pub const Serve = struct {
 pub const Fetch = struct {
     url: [:0]const u8,
     dump: bool = false,
+    dump_markdown: bool = false,
     common: Common = .{},
     withbase: bool = false,
     strip: dump.Opts.Strip = .{},
@@ -308,6 +309,9 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
         \\--dump          Dumps document to stdout.
         \\                Defaults to false.
         \\
+        \\--dump-markdown Dumps document to stdout as Markdown.
+        \\                Defaults to false.
+        \\
         \\--strip_mode    Comma separated list of tag groups to remove from dump
         \\                the dump. e.g. --strip_mode js,css
         \\                  - "js" script and link[as=script, rel=preload]
@@ -410,6 +414,10 @@ fn inferMode(opt: []const u8) ?RunMode {
         return .fetch;
     }
 
+    if (std.mem.eql(u8, opt, "--dump-markdown")) {
+        return .fetch;
+    }
+
     if (std.mem.eql(u8, opt, "--noscript")) {
         return .fetch;
     }
@@ -547,6 +555,7 @@ fn parseFetchArgs(
     args: *std.process.ArgIterator,
 ) !Fetch {
     var fetch_dump: bool = false;
+    var fetch_dump_markdown: bool = false;
     var withbase: bool = false;
     var url: ?[:0]const u8 = null;
     var common: Common = .{};
@@ -558,6 +567,11 @@ fn parseFetchArgs(
             continue;
         }
 
+        if (std.mem.eql(u8, "--dump-markdown", opt)) {
+            fetch_dump_markdown = true;
+            continue;
+        }
+
         if (std.mem.eql(u8, "--noscript", opt)) {
             log.warn(.app, "deprecation warning", .{
                 .feature = "--noscript argument",
@@ -622,6 +636,7 @@ fn parseFetchArgs(
     return .{
         .url = url.?,
         .dump = fetch_dump,
+        .dump_markdown = fetch_dump_markdown,
         .strip = strip,
         .common = common,
         .withbase = withbase,
diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig
new file mode 100644
index 00000000..4738218f
--- /dev/null
+++ b/src/browser/markdown.zig
@@ -0,0 +1,288 @@
+// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
+//
+// Francis Bouvier
+// Pierre Tachoire
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+const std = @import("std");
+const Page = @import("Page.zig");
+const Node = @import("webapi/Node.zig");
+const Element = @import("webapi/Element.zig");
+const Slot = @import("webapi/element/html/Slot.zig");
+
+/// Options accepted by `dump`. Currently empty and ignored.
+pub const Opts = struct {
+    // Options for future customization (e.g., dialect)
+};
+
+/// Class of the last byte written, as far as spacing decisions are concerned.
+const LastChar = enum {
+    /// At the start of a line. Also the initial state.
+    newline,
+    /// After a space written by whitespace collapsing or by a list/quote marker.
+    space,
+    /// After any other byte.
+    other,
+};
+
+/// Rendering state threaded through the recursive walk.
+const State = struct {
+    /// Number of `<ul>`/`<ol>` elements currently open.
+    list_depth: usize = 0,
+    /// Set while inside `<pre>`; text is then written verbatim.
+    in_pre: bool = false,
+    /// Drives newline insertion before blocks and whitespace collapsing in text.
+    last_char: LastChar = .newline,
+};
+
+/// Writes `node` and its descendants to `writer` as Markdown.
+/// A final newline is appended unless the output already ends a line.
+pub fn dump(node: *Node, opts: Opts, writer: *std.Io.Writer, page: *Page) !void {
+    _ = opts;
+    var state = State{};
+    try render(node, &state, writer, page);
+    if (state.last_char != .newline) {
+        try writer.writeByte('\n');
+    }
+}
+
+/// Dispatches on the node type. Only documents, fragments, elements and
+/// text nodes produce output.
+fn render(node: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
+    switch (node._type) {
+        .document, .document_fragment => {
+            try renderChildren(node, state, writer, page);
+        },
+        .element => |el| {
+            try renderElement(el, state, writer, page);
+        },
+        .cdata => |cd| {
+            if (node.is(Node.CData.Text)) |_| {
+                try renderText(cd.getData(), state, writer);
+            }
+        },
+        else => {}, // Ignore other node types
+    }
+}
+
+/// Renders every child of `parent` in iteration order.
+fn renderChildren(parent: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
+    var it = parent.childrenIterator();
+    while (it.next()) |child| {
+        try render(child, state, writer, page);
+    }
+}
+
+/// Ends the current output line unless the output is already at a line start.
+fn ensureNewline(state: *State, writer: *std.Io.Writer) anyerror!void {
+    if (state.last_char != .newline) {
+        try writer.writeByte('\n');
+        state.last_char = .newline;
+    }
+}
+
+/// Renders one element: opening syntax, children, then closing syntax.
+/// Hidden and metadata elements are skipped together with their subtree.
+fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
+    const tag = el.getTag();
+
+    // Skip hidden/metadata elements
+    switch (tag) {
+        .script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => return,
+        else => {},
+    }
+
+    // --- Opening Tag Logic ---
+
+    // Block elements, list items and table rows start on their own line.
+    switch (tag) {
+        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr, .li, .tr => try ensureNewline(state, writer),
+        else => {},
+    }
+
+    // Paragraph-like blocks additionally get a blank line before them.
+    switch (tag) {
+        .p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table => try writer.writeByte('\n'),
+        else => {},
+    }
+
+    // Prefixes
+    switch (tag) {
+        // The state stays at `.newline` after a heading marker, so leading
+        // whitespace in the heading text is dropped.
+        .h1 => try writer.writeAll("# "),
+        .h2 => try writer.writeAll("## "),
+        .h3 => try writer.writeAll("### "),
+        .h4 => try writer.writeAll("#### "),
+        .h5 => try writer.writeAll("##### "),
+        .h6 => try writer.writeAll("###### "),
+        .ul, .ol => {
+            state.list_depth += 1;
+        },
+        .li => {
+            const indent = if (state.list_depth > 0) state.list_depth - 1 else 0;
+            try writeIndentation(indent, writer);
+            try writer.writeAll("- ");
+            state.last_char = .space;
+        },
+        .blockquote => {
+            try writer.writeAll("> ");
+            state.last_char = .space;
+        },
+        .pre => {
+            try writer.writeAll("```\n");
+            state.in_pre = true;
+            state.last_char = .newline;
+        },
+        .code => {
+            if (!state.in_pre) {
+                try writer.writeByte('`');
+                state.last_char = .other;
+            }
+        },
+        .b, .strong => {
+            try writer.writeAll("**");
+            state.last_char = .other;
+        },
+        .i, .em => {
+            try writer.writeAll("*");
+            state.last_char = .other;
+        },
+        .hr => {
+            try writer.writeAll("---\n");
+            state.last_char = .newline;
+            return; // Void element
+        },
+        .br => {
+            try writer.writeByte('\n');
+            state.last_char = .newline;
+            return; // Void element
+        },
+        .img => {
+            try writer.writeAll("![");
+            if (el.getAttributeSafe(comptime .wrap("alt"))) |alt| {
+                try writer.writeAll(alt);
+            }
+            try writer.writeAll("](");
+            if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
+                try writer.writeAll(src);
+            }
+            try writer.writeAll(")");
+            state.last_char = .other;
+            return; // Treat as void
+        },
+        .anchor => {
+            try writer.writeByte('[');
+            state.last_char = .other;
+        },
+        else => {},
+    }
+
+    // --- Render Children ---
+    try renderChildren(el.asNode(), state, writer, page);
+
+    // --- Closing Tag Logic ---
+
+    // Suffixes
+    switch (tag) {
+        .anchor => {
+            try writer.writeAll("](");
+            if (el.getAttributeSafe(comptime .wrap("href"))) |href| {
+                try writer.writeAll(href);
+            }
+            try writer.writeByte(')');
+            state.last_char = .other;
+        },
+        .pre => {
+            try ensureNewline(state, writer);
+            try writer.writeAll("```\n");
+            state.in_pre = false;
+            state.last_char = .newline;
+        },
+        .code => {
+            if (!state.in_pre) {
+                try writer.writeByte('`');
+                state.last_char = .other;
+            }
+        },
+        .b, .strong => {
+            try writer.writeAll("**");
+            state.last_char = .other;
+        },
+        .i, .em => {
+            try writer.writeAll("*");
+            state.last_char = .other;
+        },
+        .ul, .ol => {
+            if (state.list_depth > 0) state.list_depth -= 1;
+        },
+        else => {},
+    }
+
+    // Post-block newlines
+    switch (tag) {
+        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .table, .tr => try ensureNewline(state, writer),
+        else => {},
+    }
+}
+
+/// Writes the content of a text node.
+///
+/// Inside `<pre>` the text is copied verbatim. Elsewhere each run of
+/// whitespace collapses to a single space, and a space is only written when
+/// the previous byte was neither a newline nor a space. A text node holding
+/// only whitespace therefore still separates the inline content around it.
+fn renderText(text: []const u8, state: *State, writer: *std.Io.Writer) anyerror!void {
+    if (text.len == 0) return;
+
+    if (state.in_pre) {
+        try writer.writeAll(text);
+        state.last_char = if (text[text.len - 1] == '\n') .newline else .other;
+        return;
+    }
+
+    const starts_with_space = std.ascii.isWhitespace(text[0]);
+    const ends_with_space = std.ascii.isWhitespace(text[text.len - 1]);
+
+    // Collapse whitespace
+    var it = std.mem.tokenizeAny(u8, text, " \t\n\r");
+    var first = true;
+    while (it.next()) |word| {
+        if (!first) {
+            // Between words always add space
+            try writer.writeByte(' ');
+        } else if (starts_with_space and state.last_char == .other) {
+            try writer.writeByte(' ');
+        }
+
+        try writer.writeAll(word);
+        state.last_char = .other;
+        first = false;
+    }
+
+    // Trailing whitespace, or a whitespace-only node after inline content.
+    if (ends_with_space and state.last_char == .other) {
+        try writer.writeByte(' ');
+        state.last_char = .space;
+    }
+}
+
+/// Writes two spaces per nesting `level`.
+fn writeIndentation(level: usize, writer: *std.Io.Writer) anyerror!void {
+    var i: usize = 0;
+    while (i < level) : (i += 1) {
+        try writer.writeAll("  ");
+    }
+}
diff --git a/src/lightpanda.zig b/src/lightpanda.zig
index c40120cc..7a7f7341 100644
--- a/src/lightpanda.zig
+++ b/src/lightpanda.zig
@@ -28,6 +28,7 @@ pub const Notification = @import("Notification.zig");
 pub const log = @import("log.zig");
 pub const js = @import("browser/js/js.zig");
 pub const dump = @import("browser/dump.zig");
+pub const markdown = @import("browser/markdown.zig");
 pub const build_config = @import("build_config");
 pub const crash_handler = @import("crash_handler.zig");
 
@@ -36,6 +37,7 @@ const IS_DEBUG = @import("builtin").mode == .Debug;
 pub const FetchOpts = struct {
     wait_ms: u32 = 5000,
     dump: dump.RootOpts,
+    dump_markdown: bool = false,
     writer: ?*std.Io.Writer = null,
 };
 pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
@@ -94,7 +96,11 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
     _ = session.wait(opts.wait_ms);
 
     const writer = opts.writer orelse return;
-    try dump.root(page.window._document, opts.dump, writer, page);
+    if (opts.dump_markdown) {
+        try markdown.dump(page.window._document.asNode(), .{}, writer, page);
+    } else {
+        try dump.root(page.window._document, opts.dump, writer, page);
+    }
     try writer.flush();
 }
 
diff --git a/src/main.zig b/src/main.zig
index 3bbcb492..22f8dc39 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -111,6 +111,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
 
             var fetch_opts = lp.FetchOpts{
                 .wait_ms = 5000,
+                .dump_markdown = opts.dump_markdown,
                 .dump = .{
                     .strip = opts.strip,
                     .with_base = opts.withbase,
@@ -119,7 +120,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
 
             var stdout = std.fs.File.stdout();
             var writer = stdout.writer(&.{});
-            if (opts.dump) {
+            if (opts.dump or opts.dump_markdown) {
                 fetch_opts.writer = &writer.interface;
             }
 