// Review summary for the patch below (git diff over src/Config.zig, the new
// src/browser/markdown.zig, src/lightpanda.zig and src/main.zig). The patch
// text itself is unchanged.
//
// Purpose: replaces the boolean `Fetch.dump` flag with
// `dump_mode: ?DumpFormat` (`html` | `markdown`) and adds `markdown.dump`, a
// renderer that walks a DOM node tree and writes Markdown to a
// `*std.Io.Writer`. `lightpanda.fetch` dispatches on `opts.dump_mode`, and
// `main.zig` only installs the stdout writer when `dump_mode != null`.
//
// Config.parseFetchArgs: `--dump` peeks at the next argument through a copy
// of the ArgIterator (`args.*`). The argument is consumed only when
// `std.meta.stringToEnum(DumpFormat, ...)` accepts it; otherwise, or when
// nothing follows, the mode falls back to `.html`.
// NOTE(review): the usage text says the argument "must be 'html' or
// 'markdown'", but the parser treats it as optional -- confirm which one is
// intended and align the other.
//
// markdown.zig / renderElement:
//  - NOTE(review): `.ul`/`.ol` skip the push once `list_depth` reaches
//    `list_stack.len` (32), yet the closing branch still decrements whenever
//    `list_depth > 0`; nesting deeper than 32 looks like it unbalances the
//    stack -- confirm.
//  - NOTE(review): `State.in_code` is set and cleared but never read in the
//    code shown; text inside inline `code` still goes through
//    `escapeMarkdown` -- confirm whether escaping should be skipped there.
//  - NOTE(review): `.blockquote` writes "> " once at open and nothing at
//    close, so only the first output line of a quote carries the prefix --
//    confirm this is acceptable for multi-block quotes.
//  - `img` `src` and anchor `href` values are written with `writeAll`
//    (no escaping); `alt` text and regular text go through `escapeMarkdown`.
//
// lightpanda.fetch: when a writer is supplied but `dump_mode` is null, only
// `writer.flush()` runs; before this patch `dump.root` ran whenever a writer
// was supplied.
// NOTE(review): confirm no other caller of `fetch` relies on the old
// behaviour.
//
// NOTE(review): the HTML fixtures in the markdown tests appear to have been
// stripped from this copy (e.g. "markdown: task list" passes an empty
// multi-line string), so the tests cannot be checked as shown.
diff --git a/src/Config.zig b/src/Config.zig index 39422791..ddef91be 100644 --- a/src/Config.zig +++ b/src/Config.zig @@ -182,9 +182,14 @@ pub const Serve = struct { common: Common = .{}, }; +pub const DumpFormat = enum { + html, + markdown, +}; + pub const Fetch = struct { url: [:0]const u8, - dump: bool = false, + dump_mode: ?DumpFormat = null, common: Common = .{}, withbase: bool = false, strip: dump.Opts.Strip = .{}, @@ -321,11 +326,12 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void { \\ \\fetch command \\Fetches the specified URL - \\Example: {s} fetch --dump https://lightpanda.io/ + \\Example: {s} fetch --dump html https://lightpanda.io/ \\ \\Options: \\--dump Dumps document to stdout. - \\ Defaults to false. + \\ Argument must be 'html' or 'markdown'. + \\ Defaults to no dump. \\ \\--strip_mode Comma separated list of tag groups to remove from dump \\ the dump. e.g. --strip_mode js,css @@ -532,7 +538,7 @@ fn parseFetchArgs( allocator: Allocator, args: *std.process.ArgIterator, ) !Fetch { - var fetch_dump: bool = false; + var dump_mode: ?DumpFormat = null; var withbase: bool = false; var url: ?[:0]const u8 = null; var common: Common = .{}; @@ -540,7 +546,17 @@ fn parseFetchArgs( while (args.next()) |opt| { if (std.mem.eql(u8, "--dump", opt)) { - fetch_dump = true; + var peek_args = args.*; + if (peek_args.next()) |next_arg| { + if (std.meta.stringToEnum(DumpFormat, next_arg)) |mode| { + dump_mode = mode; + _ = args.next(); + } else { + dump_mode = .html; + } + } else { + dump_mode = .html; + } continue; } @@ -607,7 +623,7 @@ fn parseFetchArgs( return .{ .url = url.?, - .dump = fetch_dump, + .dump_mode = dump_mode, .strip = strip, .common = common, .withbase = withbase, diff --git a/src/browser/markdown.zig b/src/browser/markdown.zig new file mode 100644 index 00000000..fcbac478 --- /dev/null +++ b/src/browser/markdown.zig @@ -0,0 +1,506 @@ +// Copyright (C) 2023-2026 Lightpanda (Selecy SAS) +// +// Francis Bouvier +// Pierre Tachoire 
+// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +const std = @import("std"); + +const Page = @import("Page.zig"); +const Element = @import("webapi/Element.zig"); +const Node = @import("webapi/Node.zig"); + +pub const Opts = struct { + // Options for future customization (e.g., dialect) +}; + +const State = struct { + const ListType = enum { ordered, unordered }; + const ListState = struct { + type: ListType, + index: usize, + }; + + list_depth: usize = 0, + list_stack: [32]ListState = undefined, + in_pre: bool = false, + pre_node: ?*Node = null, + in_code: bool = false, + in_table: bool = false, + table_row_index: usize = 0, + table_col_count: usize = 0, + last_char_was_newline: bool = true, +}; + +fn isBlock(tag: Element.Tag) bool { + return switch (tag) { + .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr => true, + else => false, + }; +} + +fn shouldAddSpacing(tag: Element.Tag) bool { + return switch (tag) { + .p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table => true, + else => false, + }; +} + +fn ensureNewline(state: *State, writer: *std.Io.Writer) !void { + if (!state.last_char_was_newline) { + try writer.writeByte('\n'); + state.last_char_was_newline = true; + } +} + +pub fn dump(node: *Node, opts: Opts, writer: *std.Io.Writer, page: *Page) !void { + _ = opts; + var state = 
State{}; + try render(node, &state, writer, page); + if (!state.last_char_was_newline) { + try writer.writeByte('\n'); + } +} + +fn render(node: *Node, state: *State, writer: *std.Io.Writer, page: *Page) error{WriteFailed}!void { + switch (node._type) { + .document, .document_fragment => { + try renderChildren(node, state, writer, page); + }, + .element => |el| { + try renderElement(el, state, writer, page); + }, + .cdata => |cd| { + if (node.is(Node.CData.Text)) |_| { + var text = cd.getData(); + if (state.in_pre) { + if (state.pre_node) |pre| { + if (node.parentNode() == pre and node.nextSibling() == null) { + text = std.mem.trimRight(u8, text, " \t\r\n"); + } + } + } + try renderText(text, state, writer); + } + }, + else => {}, // Ignore other node types + } +} + +fn renderChildren(parent: *Node, state: *State, writer: *std.Io.Writer, page: *Page) !void { + var it = parent.childrenIterator(); + while (it.next()) |child| { + try render(child, state, writer, page); + } +} + +fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Page) !void { + const tag = el.getTag(); + + // Skip hidden/metadata elements + switch (tag) { + .script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => return, + else => {}, + } + + // --- Opening Tag Logic --- + + // Ensure block elements start on a new line (double newline for paragraphs etc) + if (isBlock(tag)) { + if (!state.in_table) { + try ensureNewline(state, writer); + if (shouldAddSpacing(tag)) { + // Add an extra newline for spacing between blocks + try writer.writeByte('\n'); + } + } + } else if (tag == .li or tag == .tr) { + try ensureNewline(state, writer); + } + + // Prefixes + switch (tag) { + .h1 => try writer.writeAll("# "), + .h2 => try writer.writeAll("## "), + .h3 => try writer.writeAll("### "), + .h4 => try writer.writeAll("#### "), + .h5 => try writer.writeAll("##### "), + .h6 => try writer.writeAll("###### "), + .ul => { + if (state.list_depth < state.list_stack.len) { + 
state.list_stack[state.list_depth] = .{ .type = .unordered, .index = 0 }; + state.list_depth += 1; + } + }, + .ol => { + if (state.list_depth < state.list_stack.len) { + state.list_stack[state.list_depth] = .{ .type = .ordered, .index = 1 }; + state.list_depth += 1; + } + }, + .li => { + const indent = if (state.list_depth > 0) state.list_depth - 1 else 0; + for (0..indent) |_| try writer.writeAll(" "); + + if (state.list_depth > 0) { + const current_list = &state.list_stack[state.list_depth - 1]; + if (current_list.type == .ordered) { + try writer.print("{d}. ", .{current_list.index}); + current_list.index += 1; + } else { + try writer.writeAll("- "); + } + } else { + try writer.writeAll("- "); + } + state.last_char_was_newline = false; + }, + .table => { + state.in_table = true; + state.table_row_index = 0; + state.table_col_count = 0; + }, + .tr => { + state.table_col_count = 0; + try writer.writeByte('|'); + }, + .td, .th => { + // Note: leading pipe handled by previous cell closing or tr opening + state.last_char_was_newline = false; + try writer.writeByte(' '); + }, + .blockquote => { + try writer.writeAll("> "); + state.last_char_was_newline = false; + }, + .pre => { + try writer.writeAll("```\n"); + state.in_pre = true; + state.pre_node = el.asNode(); + state.last_char_was_newline = true; + }, + .code => { + if (!state.in_pre) { + try writer.writeByte('`'); + state.in_code = true; + state.last_char_was_newline = false; + } + }, + .b, .strong => { + try writer.writeAll("**"); + state.last_char_was_newline = false; + }, + .i, .em => { + try writer.writeAll("*"); + state.last_char_was_newline = false; + }, + .s, .del => { + try writer.writeAll("~~"); + state.last_char_was_newline = false; + }, + .hr => { + try writer.writeAll("---\n"); + state.last_char_was_newline = true; + return; // Void element + }, + .br => { + if (state.in_table) { + try writer.writeByte(' '); + } else { + try writer.writeByte('\n'); + state.last_char_was_newline = true; + } + return; // 
Void element + }, + .img => { + try writer.writeAll("!["); + if (el.getAttributeSafe(comptime .wrap("alt"))) |alt| { + try escapeMarkdown(writer, alt); + } + try writer.writeAll("]("); + if (el.getAttributeSafe(comptime .wrap("src"))) |src| { + try writer.writeAll(src); + } + try writer.writeAll(")"); + state.last_char_was_newline = false; + return; // Treat as void + }, + .anchor => { + try writer.writeByte('['); + try renderChildren(el.asNode(), state, writer, page); + try writer.writeAll("]("); + if (el.getAttributeSafe(comptime .wrap("href"))) |href| { + try writer.writeAll(href); + } + try writer.writeByte(')'); + state.last_char_was_newline = false; + return; + }, + .input => { + if (el.getAttributeSafe(comptime .wrap("type"))) |type_attr| { + if (std.ascii.eqlIgnoreCase(type_attr, "checkbox")) { + if (el.getAttributeSafe(comptime .wrap("checked"))) |_| { + try writer.writeAll("[x] "); + } else { + try writer.writeAll("[ ] "); + } + state.last_char_was_newline = false; + } + } + return; + }, + else => {}, + } + + // --- Render Children --- + try renderChildren(el.asNode(), state, writer, page); + + // --- Closing Tag Logic --- + + // Suffixes + switch (tag) { + .pre => { + if (!state.last_char_was_newline) { + try writer.writeByte('\n'); + } + try writer.writeAll("```\n"); + state.in_pre = false; + state.pre_node = null; + state.last_char_was_newline = true; + }, + .code => { + if (!state.in_pre) { + try writer.writeByte('`'); + state.in_code = false; + state.last_char_was_newline = false; + } + }, + .b, .strong => { + try writer.writeAll("**"); + state.last_char_was_newline = false; + }, + .i, .em => { + try writer.writeAll("*"); + state.last_char_was_newline = false; + }, + .s, .del => { + try writer.writeAll("~~"); + state.last_char_was_newline = false; + }, + .blockquote => {}, + .ul, .ol => { + if (state.list_depth > 0) state.list_depth -= 1; + }, + .table => { + state.in_table = false; + }, + .tr => { + try writer.writeByte('\n'); + if 
(state.table_row_index == 0) { + try writer.writeByte('|'); + var i: usize = 0; + while (i < state.table_col_count) : (i += 1) { + try writer.writeAll("---|"); + } + try writer.writeByte('\n'); + } + state.table_row_index += 1; + state.last_char_was_newline = true; + }, + .td, .th => { + try writer.writeAll(" |"); + state.table_col_count += 1; + state.last_char_was_newline = false; + }, + else => {}, + } + + // Post-block newlines + if (isBlock(tag)) { + if (!state.in_table) { + try ensureNewline(state, writer); + } + } +} + +fn renderText(text: []const u8, state: *State, writer: *std.Io.Writer) !void { + if (text.len == 0) return; + + if (state.in_pre) { + try writer.writeAll(text); + if (text.len > 0 and text[text.len - 1] == '\n') { + state.last_char_was_newline = true; + } else { + state.last_char_was_newline = false; + } + return; + } + + // Check for pure whitespace + const is_all_whitespace = for (text) |c| { + if (!std.ascii.isWhitespace(c)) break false; + } else true; + + if (is_all_whitespace) { + if (!state.last_char_was_newline) { + try writer.writeByte(' '); + } + return; + } + + // Collapse whitespace + var it = std.mem.tokenizeAny(u8, text, " \t\n\r"); + var first = true; + while (it.next()) |word| { + if (first) { + if (!state.last_char_was_newline) { + if (text.len > 0 and std.ascii.isWhitespace(text[0])) { + try writer.writeByte(' '); + } + } + } else { + try writer.writeByte(' '); + } + + try escapeMarkdown(writer, word); + state.last_char_was_newline = false; + first = false; + } + + // Handle trailing whitespace from the original text + if (!first and !state.last_char_was_newline) { + if (text.len > 0 and std.ascii.isWhitespace(text[text.len - 1])) { + try writer.writeByte(' '); + } + } +} + +fn escapeMarkdown(writer: *std.Io.Writer, text: []const u8) !void { + for (text) |c| { + switch (c) { + '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '#', '+', '-', '!', '|' => { + try writer.writeByte('\\'); + try writer.writeByte(c); + }, + else => 
try writer.writeByte(c), + } + } +} + +fn testMarkdownHTML(html: []const u8, expected: []const u8) !void { + const testing = @import("../testing.zig"); + const page = try testing.test_session.createPage(); + defer testing.test_session.removePage(); + const doc = page.window._document; + + const div = try doc.createElement("div", null, page); + try page.parseHtmlAsChildren(div.asNode(), html); + + var aw: std.Io.Writer.Allocating = .init(testing.allocator); + defer aw.deinit(); + try dump(div.asNode(), .{}, &aw.writer, page); + + try testing.expectString(expected, aw.written()); +} + +test "markdown: basic" { + try testMarkdownHTML("Hello world", "Hello world\n"); +} + +test "markdown: whitespace" { + try testMarkdownHTML("A B", "A B\n"); +} + +test "markdown: escaping" { + try testMarkdownHTML("

# Not a header

", "\n\\# Not a header\n"); +} + +test "markdown: strikethrough" { + try testMarkdownHTML("deleted", "~~deleted~~\n"); +} + +test "markdown: task list" { + try testMarkdownHTML( + \\ + , "[x] [ ] \n"); +} + +test "markdown: ordered list" { + try testMarkdownHTML( + \\
  1. First
  2. Second
+ , "1. First\n2. Second\n"); +} + +test "markdown: table" { + try testMarkdownHTML( + \\ + \\
Head 1Head 2
Cell 1Cell 2
+ , + \\ + \\| Head 1 | Head 2 | + \\|---|---| + \\| Cell 1 | Cell 2 | + \\ + ); +} + +test "markdown: nested lists" { + try testMarkdownHTML( + \\
  • Parent
    • Child
+ , + \\- Parent + \\ - Child + \\ + ); +} + +test "markdown: blockquote" { + try testMarkdownHTML("
Hello world
", "\n> Hello world\n"); +} + +test "markdown: links" { + try testMarkdownHTML("Lightpanda", "[Lightpanda](https://lightpanda.io)\n"); +} + +test "markdown: images" { + try testMarkdownHTML("\"Logo\"", "![Logo](logo.png)\n"); +} + +test "markdown: headings" { + try testMarkdownHTML("

Title

Subtitle

", + \\ + \\# Title + \\ + \\## Subtitle + \\ + ); +} + +test "markdown: code" { + try testMarkdownHTML( + \\

Use git push

+ \\
line 1
+        \\line 2
+ , + \\ + \\Use git push + \\ + \\``` + \\line 1 + \\line 2 + \\``` + \\ + ); +} diff --git a/src/lightpanda.zig b/src/lightpanda.zig index d2736689..f97142d0 100644 --- a/src/lightpanda.zig +++ b/src/lightpanda.zig @@ -28,6 +28,7 @@ pub const Notification = @import("Notification.zig"); pub const log = @import("log.zig"); pub const js = @import("browser/js/js.zig"); pub const dump = @import("browser/dump.zig"); +pub const markdown = @import("browser/markdown.zig"); pub const build_config = @import("build_config"); pub const crash_handler = @import("crash_handler.zig"); @@ -36,6 +37,7 @@ const IS_DEBUG = @import("builtin").mode == .Debug; pub const FetchOpts = struct { wait_ms: u32 = 5000, dump: dump.RootOpts, + dump_mode: ?Config.DumpFormat = null, writer: ?*std.Io.Writer = null, }; pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void { @@ -97,7 +99,12 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void { _ = session.wait(opts.wait_ms); const writer = opts.writer orelse return; - try dump.root(page.window._document, opts.dump, writer, page); + if (opts.dump_mode) |mode| { + switch (mode) { + .html => try dump.root(page.window._document, opts.dump, writer, page), + .markdown => try markdown.dump(page.window._document.asNode(), .{}, writer, page), + } + } try writer.flush(); } diff --git a/src/main.zig b/src/main.zig index 33c7fff8..3b2ded5e 100644 --- a/src/main.zig +++ b/src/main.zig @@ -108,10 +108,11 @@ fn run(allocator: Allocator, main_arena: Allocator) !void { }, .fetch => |opts| { const url = opts.url; - log.debug(.app, "startup", .{ .mode = "fetch", .dump = opts.dump, .url = url, .snapshot = app.snapshot.fromEmbedded() }); + log.debug(.app, "startup", .{ .mode = "fetch", .dump_mode = opts.dump_mode, .url = url, .snapshot = app.snapshot.fromEmbedded() }); var fetch_opts = lp.FetchOpts{ .wait_ms = 5000, + .dump_mode = opts.dump_mode, .dump = .{ .strip = opts.strip, .with_base = opts.withbase, @@ -120,7 +121,7 @@ fn run(allocator: 
Allocator, main_arena: Allocator) !void { var stdout = std.fs.File.stdout(); var writer = stdout.writer(&.{}); - if (opts.dump) { + if (opts.dump_mode != null) { fetch_opts.writer = &writer.interface; }