Add --dump-markdown flag

Add a new module to handle HTML-to-Markdown conversion and
integrate it into the fetch command via a new CLI flag.
This commit is contained in:
Adrià Arrufat
2026-02-15 23:15:56 +09:00
parent c9433782d8
commit 1b5efea6eb
4 changed files with 306 additions and 2 deletions

View File

@@ -166,6 +166,7 @@ pub const Serve = struct {
pub const Fetch = struct { pub const Fetch = struct {
url: [:0]const u8, url: [:0]const u8,
dump: bool = false, dump: bool = false,
dump_markdown: bool = false,
common: Common = .{}, common: Common = .{},
withbase: bool = false, withbase: bool = false,
strip: dump.Opts.Strip = .{}, strip: dump.Opts.Strip = .{},
@@ -308,6 +309,9 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
\\--dump Dumps document to stdout. \\--dump Dumps document to stdout.
\\ Defaults to false. \\ Defaults to false.
\\ \\
\\--dump-markdown Dumps document to stdout as Markdown.
\\ Defaults to false.
\\
\\--strip_mode Comma separated list of tag groups to remove from dump \\--strip_mode Comma separated list of tag groups to remove from dump
\\ the dump. e.g. --strip_mode js,css \\ the dump. e.g. --strip_mode js,css
\\ - "js" script and link[as=script, rel=preload] \\ - "js" script and link[as=script, rel=preload]
@@ -410,6 +414,10 @@ fn inferMode(opt: []const u8) ?RunMode {
return .fetch; return .fetch;
} }
if (std.mem.eql(u8, opt, "--dump-markdown")) {
return .fetch;
}
if (std.mem.eql(u8, opt, "--noscript")) { if (std.mem.eql(u8, opt, "--noscript")) {
return .fetch; return .fetch;
} }
@@ -547,6 +555,7 @@ fn parseFetchArgs(
args: *std.process.ArgIterator, args: *std.process.ArgIterator,
) !Fetch { ) !Fetch {
var fetch_dump: bool = false; var fetch_dump: bool = false;
var fetch_dump_markdown: bool = false;
var withbase: bool = false; var withbase: bool = false;
var url: ?[:0]const u8 = null; var url: ?[:0]const u8 = null;
var common: Common = .{}; var common: Common = .{};
@@ -558,6 +567,11 @@ fn parseFetchArgs(
continue; continue;
} }
if (std.mem.eql(u8, "--dump-markdown", opt)) {
fetch_dump_markdown = true;
continue;
}
if (std.mem.eql(u8, "--noscript", opt)) { if (std.mem.eql(u8, "--noscript", opt)) {
log.warn(.app, "deprecation warning", .{ log.warn(.app, "deprecation warning", .{
.feature = "--noscript argument", .feature = "--noscript argument",
@@ -622,6 +636,7 @@ fn parseFetchArgs(
return .{ return .{
.url = url.?, .url = url.?,
.dump = fetch_dump, .dump = fetch_dump,
.dump_markdown = fetch_dump_markdown,
.strip = strip, .strip = strip,
.common = common, .common = common,
.withbase = withbase, .withbase = withbase,

282
src/browser/markdown.zig Normal file
View File

@@ -0,0 +1,282 @@
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
//
// Francis Bouvier <francis@lightpanda.io>
// Pierre Tachoire <pierre@lightpanda.io>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
const std = @import("std");
const Page = @import("Page.zig");
const Node = @import("webapi/Node.zig");
const Element = @import("webapi/Element.zig");
const Slot = @import("webapi/element/html/Slot.zig");
/// Options accepted by `dump`.
/// The struct currently declares no fields and `dump` discards the value it
/// receives; it exists so that settings can be added later without changing
/// the signature of `dump`.
pub const Opts = struct {
// Options for future customization (e.g., dialect)
};
/// Mutable rendering context shared by every `render*` call made during one
/// `dump` invocation.
const State = struct {
// Number of `ul`/`ol` elements currently open; incremented when a list is
// entered and decremented when it is left. List items are indented by
// `list_depth - 1` levels.
list_depth: usize = 0,
// True between the opening and closing code fence of a `pre` element.
// While set, text is written verbatim and `code` elements emit no backticks.
in_pre: bool = false,
// Set while inside a `code` element that is not within a `pre`.
// NOTE(review): written by renderElement but not read anywhere in this file.
in_code: bool = false,
// Set when a `blockquote` is opened and cleared when it is closed.
// NOTE(review): written by renderElement but not read anywhere in this file.
in_blockquote: bool = false,
// Whether the output is currently at the start of a line. Starts as true so
// that nothing is inserted before the very first block.
last_char_was_newline: bool = true,
};
/// Writes `node` and everything below it to `writer` as Markdown.
///
/// `opts` is accepted for forward compatibility and is currently ignored.
/// The output is always terminated by a newline: one is appended unless the
/// rendered content already ended with one. Any error reported by the
/// renderer or by `writer` is propagated to the caller.
pub fn dump(node: *Node, opts: Opts, writer: *std.Io.Writer, page: *Page) !void {
    _ = opts;

    var render_state: State = .{};
    try render(node, &render_state, writer, page);

    // Nothing left to do when the content already ends a line.
    if (render_state.last_char_was_newline) return;
    try writer.writeByte('\n');
}
/// Dispatches on the kind of `node`:
///   - elements are handed to `renderElement`,
///   - documents and document fragments only have their children rendered,
///   - character data is written through `renderText`, but only when the
///     node is a `Node.CData.Text`,
///   - every other node type produces no output.
fn render(node: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
    switch (node._type) {
        .element => |element| try renderElement(element, state, writer, page),
        .document, .document_fragment => try renderChildren(node, state, writer, page),
        .cdata => |char_data| {
            // Character data that is not a text node is skipped.
            if (node.is(Node.CData.Text) == null) return;
            try renderText(char_data.getData(), state, writer);
        },
        else => {},
    }
}
/// Renders each child of `parent`, in the order produced by
/// `parent.childrenIterator()`, stopping at the first error.
fn renderChildren(parent: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
    var children = parent.childrenIterator();
    while (true) {
        const child_node = children.next() orelse break;
        try render(child_node, state, writer, page);
    }
}
/// Renders a single element and its subtree as Markdown.
///
/// The work happens in five steps:
///   1. elements without readable content (scripts, styles, head metadata,
///      svg, ...) are dropped together with their subtree;
///   2. block-level elements are moved onto a fresh line, and the ones that
///      need visual separation get an additional blank line before them;
///   3. the Markdown prefix for the tag is written (`# `, `- `, `> `, `**`,
///      an opening code fence, ...). `hr`, `br` and `img` are complete at this
///      point and return without visiting children;
///   4. the children are rendered;
///   5. the matching suffix is written and block-level elements are
///      terminated with a newline.
///
/// `state` is updated throughout so that later calls know whether the output
/// is at the start of a line, how deeply lists are nested and whether
/// preformatted text is being emitted. Writer errors are propagated.
fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
    const tag = el.getTag();

    // Skip hidden/metadata elements.
    switch (tag) {
        .script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => return,
        else => {},
    }

    // --- Opening tag logic ---
    // Block elements always start on a new line.
    switch (tag) {
        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr => {
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
                state.last_char_was_newline = true;
            }
            switch (tag) {
                // These blocks are additionally separated from what precedes
                // them by a blank line. `.hr` has to be part of this group:
                // a `---` line placed directly under a line of text is parsed
                // as a setext heading underline, which would turn the previous
                // paragraph into a level-2 heading instead of drawing a rule.
                .p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table, .hr => {
                    try writer.writeByte('\n');
                },
                else => {},
            }
        },
        .li, .tr => {
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
                state.last_char_was_newline = true;
            }
        },
        else => {},
    }

    // Prefixes
    switch (tag) {
        .h1 => try writer.writeAll("# "),
        .h2 => try writer.writeAll("## "),
        .h3 => try writer.writeAll("### "),
        .h4 => try writer.writeAll("#### "),
        .h5 => try writer.writeAll("##### "),
        .h6 => try writer.writeAll("###### "),
        .ul, .ol => {
            state.list_depth += 1;
        },
        .li => {
            // Items of the outermost list are not indented; an item found
            // outside of any list is treated the same way.
            const indent = if (state.list_depth > 0) state.list_depth - 1 else 0;
            try writeIndentation(indent, writer);
            try writer.writeAll("- ");
            state.last_char_was_newline = false;
        },
        .blockquote => {
            try writer.writeAll("> ");
            state.in_blockquote = true;
            state.last_char_was_newline = false;
        },
        .pre => {
            try writer.writeAll("```\n");
            state.in_pre = true;
            state.last_char_was_newline = true;
        },
        .code => {
            // Inside a fenced block the fence already marks the text as code.
            if (!state.in_pre) {
                try writer.writeByte('`');
                state.in_code = true;
                state.last_char_was_newline = false;
            }
        },
        .b, .strong => {
            try writer.writeAll("**");
            state.last_char_was_newline = false;
        },
        .i, .em => {
            try writer.writeAll("*");
            state.last_char_was_newline = false;
        },
        .hr => {
            try writer.writeAll("---\n");
            state.last_char_was_newline = true;
            return; // Void element
        },
        .br => {
            try writer.writeByte('\n');
            state.last_char_was_newline = true;
            return; // Void element
        },
        .img => {
            // ![alt](src); a missing attribute leaves its slot empty.
            try writer.writeAll("![");
            if (el.getAttributeSafe(comptime .wrap("alt"))) |alt| {
                try writer.writeAll(alt);
            }
            try writer.writeAll("](");
            if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
                try writer.writeAll(src);
            }
            try writer.writeAll(")");
            state.last_char_was_newline = false;
            return; // Treat as void
        },
        .anchor => {
            try writer.writeByte('[');
            state.last_char_was_newline = false;
        },
        else => {},
    }

    // --- Render children ---
    try renderChildren(el.asNode(), state, writer, page);

    // --- Closing tag logic ---
    // Suffixes
    switch (tag) {
        .anchor => {
            // [text](href); a missing href leaves the target empty.
            try writer.writeAll("](");
            if (el.getAttributeSafe(comptime .wrap("href"))) |href| {
                try writer.writeAll(href);
            }
            try writer.writeByte(')');
            state.last_char_was_newline = false;
        },
        .pre => {
            // The closing fence must sit on a line of its own.
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
            }
            try writer.writeAll("```\n");
            state.in_pre = false;
            state.last_char_was_newline = true;
        },
        .code => {
            if (!state.in_pre) {
                try writer.writeByte('`');
                state.in_code = false;
                state.last_char_was_newline = false;
            }
        },
        .b, .strong => {
            try writer.writeAll("**");
            state.last_char_was_newline = false;
        },
        .i, .em => {
            try writer.writeAll("*");
            state.last_char_was_newline = false;
        },
        .blockquote => {
            state.in_blockquote = false;
        },
        .ul, .ol => {
            if (state.list_depth > 0) state.list_depth -= 1;
        },
        else => {},
    }

    // Post-block newlines
    switch (tag) {
        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .table, .tr => {
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
                state.last_char_was_newline = true;
            }
        },
        else => {},
    }
}
/// Writes the content of a text node.
///
/// While `state.in_pre` is set the text is copied verbatim. Otherwise runs of
/// spaces, tabs, carriage returns and newlines are collapsed: words are
/// joined by single spaces, and leading or trailing whitespace of the node is
/// kept as one space so that words in neighbouring nodes do not run together.
/// No leading space is written when the output is at the start of a line.
///
/// A node that consists only of whitespace also yields a single space (unless
/// the output is at the start of a line). Such nodes are what separates
/// adjacent inline elements, e.g. `<b>a</b> <i>b</i>`; dropping them would
/// fuse the two words in the output.
///
/// `state.last_char_was_newline` is updated to reflect what was written.
fn renderText(text: []const u8, state: *State, writer: *std.Io.Writer) anyerror!void {
    if (text.len == 0) return;

    if (state.in_pre) {
        try writer.writeAll(text);
        state.last_char_was_newline = text[text.len - 1] == '\n';
        return;
    }

    // Collapse whitespace
    var words = std.mem.tokenizeAny(u8, text, " \t\n\r");

    const first_word = words.next() orelse {
        // Whitespace-only node: keep it as a word separator, but never
        // indent the start of a line with it.
        if (!state.last_char_was_newline) {
            try writer.writeByte(' ');
        }
        return;
    };

    // Preserve leading whitespace of the node, except at the start of a line.
    if (!state.last_char_was_newline and std.ascii.isWhitespace(text[0])) {
        try writer.writeByte(' ');
    }

    try writer.writeAll(first_word);
    while (words.next()) |word| {
        // Between words always add a space.
        try writer.writeByte(' ');
        try writer.writeAll(word);
    }
    state.last_char_was_newline = false;

    // Preserve trailing whitespace of the node.
    if (std.ascii.isWhitespace(text[text.len - 1])) {
        try writer.writeByte(' ');
    }
}
/// Writes the indentation for a list item nested `level` lists deep.
///
/// Each level is two spaces wide, which is the width of the `- ` marker used
/// for list items. A nested item therefore lines up with the content of its
/// parent item, which Markdown requires for it to be treated as a sub-list
/// rather than as a sibling item. `level == 0` writes nothing.
fn writeIndentation(level: usize, writer: *std.Io.Writer) anyerror!void {
    for (0..level) |_| {
        try writer.writeAll("  ");
    }
}

View File

@@ -28,6 +28,7 @@ pub const Notification = @import("Notification.zig");
pub const log = @import("log.zig"); pub const log = @import("log.zig");
pub const js = @import("browser/js/js.zig"); pub const js = @import("browser/js/js.zig");
pub const dump = @import("browser/dump.zig"); pub const dump = @import("browser/dump.zig");
pub const markdown = @import("browser/markdown.zig");
pub const build_config = @import("build_config"); pub const build_config = @import("build_config");
pub const crash_handler = @import("crash_handler.zig"); pub const crash_handler = @import("crash_handler.zig");
@@ -36,6 +37,7 @@ const IS_DEBUG = @import("builtin").mode == .Debug;
pub const FetchOpts = struct { pub const FetchOpts = struct {
wait_ms: u32 = 5000, wait_ms: u32 = 5000,
dump: dump.RootOpts, dump: dump.RootOpts,
dump_markdown: bool = false,
writer: ?*std.Io.Writer = null, writer: ?*std.Io.Writer = null,
}; };
pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void { pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
@@ -94,7 +96,11 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
_ = session.wait(opts.wait_ms); _ = session.wait(opts.wait_ms);
const writer = opts.writer orelse return; const writer = opts.writer orelse return;
if (opts.dump_markdown) {
try markdown.dump(page.window._document.asNode(), .{}, writer, page);
} else {
try dump.root(page.window._document, opts.dump, writer, page); try dump.root(page.window._document, opts.dump, writer, page);
}
try writer.flush(); try writer.flush();
} }

View File

@@ -111,6 +111,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
var fetch_opts = lp.FetchOpts{ var fetch_opts = lp.FetchOpts{
.wait_ms = 5000, .wait_ms = 5000,
.dump_markdown = opts.dump_markdown,
.dump = .{ .dump = .{
.strip = opts.strip, .strip = opts.strip,
.with_base = opts.withbase, .with_base = opts.withbase,
@@ -119,7 +120,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
var stdout = std.fs.File.stdout(); var stdout = std.fs.File.stdout();
var writer = stdout.writer(&.{}); var writer = stdout.writer(&.{});
if (opts.dump) { if (opts.dump or opts.dump_markdown) {
fetch_opts.writer = &writer.interface; fetch_opts.writer = &writer.interface;
} }