From 1b5efea6ebc8d5ade3aafe330de1c8f34ef98be8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= <adria.arrufat@gmail.com>
Date: Sun, 15 Feb 2026 23:15:56 +0900
Subject: [PATCH] Add --dump-markdown flag

Add a new module to handle HTML-to-Markdown conversion and
integrate it into the fetch command via a new CLI flag.
---
 src/Config.zig           |  15 +++
 src/browser/markdown.zig | 282 +++++++++++++++++++++++++++++++++++++++
 src/lightpanda.zig       |   8 +-
 src/main.zig             |   3 +-
 4 files changed, 306 insertions(+), 2 deletions(-)
 create mode 100644 src/browser/markdown.zig

diff --git a/src/Config.zig b/src/Config.zig
index c9725168..8b29ee24 100644
--- a/src/Config.zig
+++ b/src/Config.zig
@@ -166,6 +166,7 @@ pub const Serve = struct {
 pub const Fetch = struct {
     url: [:0]const u8,
     dump: bool = false,
+    dump_markdown: bool = false,
     common: Common = .{},
     withbase: bool = false,
     strip: dump.Opts.Strip = .{},
@@ -308,6 +309,9 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
         \\--dump          Dumps document to stdout.
         \\                Defaults to false.
         \\
+        \\--dump-markdown Dumps document to stdout as Markdown.
+        \\                Defaults to false.
+        \\
         \\--strip_mode    Comma separated list of tag groups to remove from dump
         \\                the dump. e.g. --strip_mode js,css
         \\                  - "js" script and link[as=script, rel=preload]
@@ -410,6 +414,10 @@ fn inferMode(opt: []const u8) ?RunMode {
         return .fetch;
     }
 
+    if (std.mem.eql(u8, opt, "--dump-markdown")) {
+        return .fetch;
+    }
+
     if (std.mem.eql(u8, opt, "--noscript")) {
         return .fetch;
     }
@@ -547,6 +555,7 @@ fn parseFetchArgs(
     args: *std.process.ArgIterator,
 ) !Fetch {
     var fetch_dump: bool = false;
+    var fetch_dump_markdown: bool = false;
     var withbase: bool = false;
     var url: ?[:0]const u8 = null;
     var common: Common = .{};
@@ -558,6 +567,11 @@ fn parseFetchArgs(
             continue;
         }
 
+        if (std.mem.eql(u8, "--dump-markdown", opt)) {
+            fetch_dump_markdown = true;
+            continue;
+        }
+
         if (std.mem.eql(u8, "--noscript", opt)) {
             log.warn(.app, "deprecation warning", .{
                 .feature = "--noscript argument",
@@ -622,6 +636,7 @@ fn parseFetchArgs(
     return .{
         .url = url.?,
         .dump = fetch_dump,
+        .dump_markdown = fetch_dump_markdown,
         .strip = strip,
         .common = common,
         .withbase = withbase,
diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig
new file mode 100644
index 00000000..4738218f
--- /dev/null
+++ b/src/browser/markdown.zig
@@ -0,0 +1,282 @@
+// Copyright (C) 2023-2026  Lightpanda (Selecy SAS)
+//
+// Francis Bouvier <francis@lightpanda.io>
+// Pierre Tachoire <pierre@lightpanda.io>
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+const std = @import("std");
+const Page = @import("Page.zig");
+const Node = @import("webapi/Node.zig");
+const Element = @import("webapi/Element.zig");
+const Slot = @import("webapi/element/html/Slot.zig");
+
+pub const Opts = struct {
+    // No options yet; reserved for future customization (e.g., dialect).
+};
+
+const State = struct {
+    last_char_was_newline: bool = true, // starts true: nothing written yet counts as a line start
+    list_depth: usize = 0, // number of open <ul>/<ol>; determines <li> indentation
+    in_pre: bool = false, // inside <pre>: text is copied verbatim
+    in_code: bool = false, // set around inline <code>; not read anywhere in this file
+    in_blockquote: bool = false, // set around <blockquote>; not read anywhere in this file
+};
+
+/// Renders `node` as Markdown into `writer`, then ends the last line if the render state says it is open.
+pub fn dump(node: *Node, opts: Opts, writer: *std.Io.Writer, page: *Page) !void {
+    _ = opts; // `Opts` has no fields yet
+    var st: State = .{};
+    try render(node, &st, writer, page);
+    if (st.last_char_was_newline) return;
+    try writer.writeByte('\n');
+}
+
+/// Renders a single node according to its kind.
+/// Documents and document fragments recurse into their children, elements
+/// are handled by `renderElement`, and character data is passed to
+/// `renderText` only when the node is a `Node.CData.Text`.
+fn render(node: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
+    switch (node._type) {
+        .document, .document_fragment => try renderChildren(node, state, writer, page),
+        .element => |element| try renderElement(element, state, writer, page),
+        .cdata => |char_data| {
+            // Character data that is not `Node.CData.Text` produces no output.
+            if (node.is(Node.CData.Text)) |_| try renderText(char_data.getData(), state, writer);
+        },
+        // Every other node kind is ignored.
+        else => {},
+    }
+}
+
+/// Renders every node yielded by `parent.childrenIterator()`, in iteration
+/// order, by handing each one to `render`.
+fn renderChildren(parent: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
+    var children = parent.childrenIterator();
+    while (children.next()) |node| try render(node, state, writer, page);
+}
+
+fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
+    const tag = el.getTag();
+
+    // Renders one element as Markdown. Hidden/metadata elements are skipped, subtree included.
+    switch (tag) {
+        .script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => return,
+        else => {},
+    }
+
+    // --- Opening Tag Logic ---
+
+    // Block elements start on a fresh line; some of them also get a blank line before them.
+    switch (tag) {
+        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr => {
+            if (!state.last_char_was_newline) {
+                try writer.writeByte('\n');
+                state.last_char_was_newline = true;
+            }
+            if (tag == .p or tag == .h1 or tag == .h2 or tag == .h3 or tag == .h4 or tag == .h5 or tag == .h6 or tag == .blockquote or tag == .pre or tag == .table) {
+                // Extra newline = blank line between blocks; the flag is already true at this point.
+                try writer.writeByte('\n');
+            }
+        },
+        .li, .tr => {
+            if (!state.last_char_was_newline) { // items and rows only need a fresh line
+                try writer.writeByte('\n');
+                state.last_char_was_newline = true;
+            }
+        },
+        else => {},
+    }
+
+    // Prefixes: markup written before the children. `hr`, `br` and `img` finish here and return.
+    switch (tag) {
+        .h1 => try writer.writeAll("# "), // NOTE(review): h1-h6 leave last_char_was_newline true,
+        .h2 => try writer.writeAll("## "), // so a heading without text gets no newline from the
+        .h3 => try writer.writeAll("### "), // post-block step below.
+        .h4 => try writer.writeAll("#### "),
+        .h5 => try writer.writeAll("##### "),
+        .h6 => try writer.writeAll("###### "),
+        .ul, .ol => {
+            state.list_depth += 1; // undone in the suffix switch
+        },
+        .li => {
+            const indent = if (state.list_depth > 0) state.list_depth - 1 else 0; // 0 also for <li> outside a list
+            try writeIndentation(indent, writer);
+            try writer.writeAll("- "); // same marker for <ul> and <ol>
+            state.last_char_was_newline = false;
+        },
+        .blockquote => {
+            try writer.writeAll("> "); // written once, at the opening tag only
+            state.in_blockquote = true; // not read anywhere in this file
+            state.last_char_was_newline = false;
+        },
+        .pre => {
+            try writer.writeAll("```\n"); // opening fence; closed in the suffix switch
+            state.in_pre = true;
+            state.last_char_was_newline = true;
+        },
+        .code => {
+            if (!state.in_pre) { // inside <pre> the fence already marks code
+                try writer.writeByte('`');
+                state.in_code = true; // not read anywhere in this file
+                state.last_char_was_newline = false;
+            }
+        },
+        .b, .strong => {
+            try writer.writeAll("**");
+            state.last_char_was_newline = false;
+        },
+        .i, .em => {
+            try writer.writeAll("*");
+            state.last_char_was_newline = false;
+        },
+        .hr => {
+            try writer.writeAll("---\n");
+            state.last_char_was_newline = true;
+            return; // Void element: children and closing logic are skipped
+        },
+        .br => {
+            try writer.writeByte('\n');
+            state.last_char_was_newline = true;
+            return; // Void element: children and closing logic are skipped
+        },
+        .img => {
+            try writer.writeAll("![");
+            if (el.getAttributeSafe(comptime .wrap("alt"))) |alt| {
+                try writer.writeAll(alt); // written as-is, no Markdown escaping
+            }
+            try writer.writeAll("](");
+            if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
+                try writer.writeAll(src); // written as-is; a missing src leaves `()` empty
+            }
+            try writer.writeAll(")");
+            state.last_char_was_newline = false;
+            return; // Treat as void
+        },
+        .anchor => {
+            try writer.writeByte('['); // link text follows; `](href)` is written after the children
+            state.last_char_was_newline = false;
+        },
+        else => {},
+    }
+
+    // --- Render Children ---
+    try renderChildren(el.asNode(), state, writer, page);
+
+    // --- Closing Tag Logic ---
+
+    // Suffixes: markup written after the children, mirroring the prefix switch.
+    switch (tag) {
+        .anchor => {
+            try writer.writeAll("](");
+            if (el.getAttributeSafe(comptime .wrap("href"))) |href| {
+                try writer.writeAll(href); // written as-is; a missing href leaves `()` empty
+            }
+            try writer.writeByte(')');
+            state.last_char_was_newline = false;
+        },
+        .pre => {
+            if (!state.last_char_was_newline) { // closing fence must start on its own line
+                try writer.writeByte('\n');
+            }
+            try writer.writeAll("```\n");
+            state.in_pre = false;
+            state.last_char_was_newline = true;
+        },
+        .code => {
+            if (!state.in_pre) {
+                try writer.writeByte('`');
+                state.in_code = false;
+                state.last_char_was_newline = false;
+            }
+        },
+        .b, .strong => {
+            try writer.writeAll("**");
+            state.last_char_was_newline = false;
+        },
+        .i, .em => {
+            try writer.writeAll("*");
+            state.last_char_was_newline = false;
+        },
+        .blockquote => {
+            state.in_blockquote = false; // NOTE(review): cleared even if an outer <blockquote> is still open
+        },
+        .ul, .ol => {
+            if (state.list_depth > 0) state.list_depth -= 1; // guarded so the counter cannot underflow
+        },
+        else => {},
+    }
+
+    // Post-block newlines: end the current line. `li` and `pre` are not in this list.
+    switch (tag) {
+        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .table, .tr => {
+            if (!state.last_char_was_newline) {
+                try writer.writeByte('\n');
+                state.last_char_was_newline = true;
+            }
+        },
+        else => {},
+    }
+}
+
+/// Writes the contents of one text node to `writer`.
+///
+/// Inside `<pre>` (`state.in_pre`) the text is copied verbatim. Elsewhere runs
+/// of space, tab, CR and LF collapse to a single space. Leading whitespace is
+/// kept as one space unless the output is at the start of a line, trailing
+/// whitespace is kept as one space, and a whitespace-only node contributes one
+/// separating space (again, not at the start of a line).
+fn renderText(text: []const u8, state: *State, writer: *std.Io.Writer) anyerror!void {
+    if (text.len == 0) return;
+
+    if (state.in_pre) {
+        try writer.writeAll(text);
+        state.last_char_was_newline = text[text.len - 1] == '\n';
+        return;
+    }
+
+    var words = std.mem.tokenizeAny(u8, text, " \t\n\r");
+    const first_word = words.next() orelse {
+        // Whitespace-only node, e.g. the gap in `<b>a</b> <i>b</i>`. Keep one
+        // separating space so the neighbours do not fuse into `**a***b*`.
+        if (!state.last_char_was_newline) try writer.writeByte(' ');
+        return;
+    };
+
+    // Leading whitespace becomes a single space, but never at a line start.
+    if (!state.last_char_was_newline and std.ascii.isWhitespace(text[0])) {
+        try writer.writeByte(' ');
+    }
+    try writer.writeAll(first_word);
+    state.last_char_was_newline = false;
+
+    // Remaining words are separated by exactly one space.
+    while (words.next()) |word| {
+        try writer.writeByte(' ');
+        try writer.writeAll(word);
+    }
+
+    // Trailing whitespace becomes a single space so that a following inline
+    // sibling stays separated from the last word.
+    if (std.ascii.isWhitespace(text[text.len - 1])) {
+        try writer.writeByte(' ');
+    }
+}
+
+/// Writes two spaces to `writer` for each indentation `level`.
+fn writeIndentation(level: usize, writer: *std.Io.Writer) anyerror!void {
+    for (0..level) |_| {
+        try writer.writeAll("  ");
+    }
+}
diff --git a/src/lightpanda.zig b/src/lightpanda.zig
index c40120cc..7a7f7341 100644
--- a/src/lightpanda.zig
+++ b/src/lightpanda.zig
@@ -28,6 +28,7 @@ pub const Notification = @import("Notification.zig");
 pub const log = @import("log.zig");
 pub const js = @import("browser/js/js.zig");
 pub const dump = @import("browser/dump.zig");
+pub const markdown = @import("browser/markdown.zig");
 pub const build_config = @import("build_config");
 pub const crash_handler = @import("crash_handler.zig");
 
@@ -36,6 +37,7 @@ const IS_DEBUG = @import("builtin").mode == .Debug;
 pub const FetchOpts = struct {
     wait_ms: u32 = 5000,
     dump: dump.RootOpts,
+    dump_markdown: bool = false, // when true, fetch writes via markdown.dump instead of dump.root
     writer: ?*std.Io.Writer = null,
 };
 pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
@@ -94,7 +96,11 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
     _ = session.wait(opts.wait_ms);
 
     const writer = opts.writer orelse return;
-    try dump.root(page.window._document, opts.dump, writer, page);
+    if (opts.dump_markdown) {
+        try markdown.dump(page.window._document.asNode(), .{}, writer, page);
+    } else {
+        try dump.root(page.window._document, opts.dump, writer, page);
+    }
     try writer.flush();
 }
 
diff --git a/src/main.zig b/src/main.zig
index 3bbcb492..22f8dc39 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -111,6 +111,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
 
             var fetch_opts = lp.FetchOpts{
                 .wait_ms = 5000,
+                .dump_markdown = opts.dump_markdown,
                 .dump = .{
                     .strip = opts.strip,
                     .with_base = opts.withbase,
@@ -119,7 +120,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
 
             var stdout = std.fs.File.stdout();
             var writer = stdout.writer(&.{});
-            if (opts.dump) {
+            if (opts.dump or opts.dump_markdown) {
                 fetch_opts.writer = &writer.interface;
             }