mirror of
https://github.com/lightpanda-io/browser.git
synced 2026-03-22 04:34:44 +00:00
Add --dump-markdown flag
Add a new module to handle HTML-to-Markdown conversion and integrate it into the fetch command via a new CLI flag.
This commit is contained in:
@@ -166,6 +166,7 @@ pub const Serve = struct {
|
|||||||
pub const Fetch = struct {
|
pub const Fetch = struct {
|
||||||
url: [:0]const u8,
|
url: [:0]const u8,
|
||||||
dump: bool = false,
|
dump: bool = false,
|
||||||
|
dump_markdown: bool = false,
|
||||||
common: Common = .{},
|
common: Common = .{},
|
||||||
withbase: bool = false,
|
withbase: bool = false,
|
||||||
strip: dump.Opts.Strip = .{},
|
strip: dump.Opts.Strip = .{},
|
||||||
@@ -308,6 +309,9 @@ pub fn printUsageAndExit(self: *const Config, success: bool) void {
|
|||||||
\\--dump Dumps document to stdout.
|
\\--dump Dumps document to stdout.
|
||||||
\\ Defaults to false.
|
\\ Defaults to false.
|
||||||
\\
|
\\
|
||||||
|
\\--dump-markdown Dumps document to stdout as Markdown.
|
||||||
|
\\ Defaults to false.
|
||||||
|
\\
|
||||||
\\--strip_mode Comma separated list of tag groups to remove from dump
|
\\--strip_mode Comma separated list of tag groups to remove from dump
|
||||||
\\ the dump. e.g. --strip_mode js,css
|
\\ the dump. e.g. --strip_mode js,css
|
||||||
\\ - "js" script and link[as=script, rel=preload]
|
\\ - "js" script and link[as=script, rel=preload]
|
||||||
@@ -410,6 +414,10 @@ fn inferMode(opt: []const u8) ?RunMode {
|
|||||||
return .fetch;
|
return .fetch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (std.mem.eql(u8, opt, "--dump-markdown")) {
|
||||||
|
return .fetch;
|
||||||
|
}
|
||||||
|
|
||||||
if (std.mem.eql(u8, opt, "--noscript")) {
|
if (std.mem.eql(u8, opt, "--noscript")) {
|
||||||
return .fetch;
|
return .fetch;
|
||||||
}
|
}
|
||||||
@@ -547,6 +555,7 @@ fn parseFetchArgs(
|
|||||||
args: *std.process.ArgIterator,
|
args: *std.process.ArgIterator,
|
||||||
) !Fetch {
|
) !Fetch {
|
||||||
var fetch_dump: bool = false;
|
var fetch_dump: bool = false;
|
||||||
|
var fetch_dump_markdown: bool = false;
|
||||||
var withbase: bool = false;
|
var withbase: bool = false;
|
||||||
var url: ?[:0]const u8 = null;
|
var url: ?[:0]const u8 = null;
|
||||||
var common: Common = .{};
|
var common: Common = .{};
|
||||||
@@ -558,6 +567,11 @@ fn parseFetchArgs(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (std.mem.eql(u8, "--dump-markdown", opt)) {
|
||||||
|
fetch_dump_markdown = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (std.mem.eql(u8, "--noscript", opt)) {
|
if (std.mem.eql(u8, "--noscript", opt)) {
|
||||||
log.warn(.app, "deprecation warning", .{
|
log.warn(.app, "deprecation warning", .{
|
||||||
.feature = "--noscript argument",
|
.feature = "--noscript argument",
|
||||||
@@ -622,6 +636,7 @@ fn parseFetchArgs(
|
|||||||
return .{
|
return .{
|
||||||
.url = url.?,
|
.url = url.?,
|
||||||
.dump = fetch_dump,
|
.dump = fetch_dump,
|
||||||
|
.dump_markdown = fetch_dump_markdown,
|
||||||
.strip = strip,
|
.strip = strip,
|
||||||
.common = common,
|
.common = common,
|
||||||
.withbase = withbase,
|
.withbase = withbase,
|
||||||
|
|||||||
282
src/browser/markdown.zig
Normal file
282
src/browser/markdown.zig
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
// Copyright (C) 2023-2026 Lightpanda (Selecy SAS)
|
||||||
|
//
|
||||||
|
// Francis Bouvier <francis@lightpanda.io>
|
||||||
|
// Pierre Tachoire <pierre@lightpanda.io>
|
||||||
|
//
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Affero General Public License as
|
||||||
|
// published by the Free Software Foundation, either version 3 of the
|
||||||
|
// License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Affero General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Affero General Public License
|
||||||
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
const std = @import("std");
|
||||||
|
const Page = @import("Page.zig");
|
||||||
|
const Node = @import("webapi/Node.zig");
|
||||||
|
const Element = @import("webapi/Element.zig");
|
||||||
|
const Slot = @import("webapi/element/html/Slot.zig");
|
||||||
|
|
||||||
|
/// Settings accepted by `dump`.
///
/// The struct has no fields yet; it is a placeholder for future customization
/// (for example, selecting a Markdown dialect).
pub const Opts = struct {};
|
||||||
|
|
||||||
|
/// Mutable bookkeeping shared by the render functions while walking the tree.
const State = struct {
    /// Number of `<ul>`/`<ol>` elements currently open; used to indent list items.
    list_depth: usize = 0,
    /// Set while inside a `<pre>` element; text is then written verbatim.
    in_pre: bool = false,
    /// Set while inside an inline `<code>` element (one that is not within `<pre>`).
    in_code: bool = false,
    /// Set while inside a `<blockquote>` element.
    in_blockquote: bool = false,
    /// True when the output is positioned at the start of a line. It starts out
    /// true so that nothing is emitted before the first piece of content.
    last_char_was_newline: bool = true,
};
|
||||||
|
|
||||||
|
/// Writes `node` and everything below it to `writer` as Markdown.
///
/// `opts` is currently unused. The output is terminated with a newline unless
/// the renderer already ended on one. Errors from `render` and from the writer
/// are propagated to the caller.
pub fn dump(node: *Node, opts: Opts, writer: *std.Io.Writer, page: *Page) !void {
    _ = opts;

    var render_state: State = .{};
    try render(node, &render_state, writer, page);

    // Nothing more to do when the output already ends at a line start.
    if (render_state.last_char_was_newline) return;
    try writer.writeByte('\n');
}
|
||||||
|
|
||||||
|
/// Dispatches on the node type: documents and fragments render their children,
/// elements go through `renderElement`, and character data is written only
/// when it is a text node. Every other node type is ignored.
fn render(node: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
    switch (node._type) {
        .document, .document_fragment => try renderChildren(node, state, writer, page),
        .element => |element| try renderElement(element, state, writer, page),
        .cdata => |char_data| {
            // Character data that is not a text node produces no output.
            if (node.is(Node.CData.Text) == null) return;
            try renderText(char_data.getData(), state, writer);
        },
        else => {},
    }
}
|
||||||
|
|
||||||
|
/// Renders each child of `parent`, in the order yielded by its children iterator.
fn renderChildren(parent: *Node, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
    var children = parent.childrenIterator();
    while (children.next()) |child_node| try render(child_node, state, writer, page);
}
|
||||||
|
|
||||||
|
/// Renders `el` and its subtree as Markdown.
///
/// Steps, in order:
///   1. Metadata and non-visible tags are skipped together with their subtree.
///   2. Block-level tags are moved onto a fresh line (plus a blank line for
///      paragraphs, headings, blockquotes, code blocks and tables).
///   3. The tag's Markdown prefix is written. `<hr>`, `<br>` and `<img>` are
///      complete at this point and return without rendering children.
///   4. The children are rendered.
///   5. The matching suffix and the trailing newline of block tags are written.
///
/// An `<a>` without an `href` attribute is rendered as its plain content rather
/// than as a link with an empty destination. The `in_pre`, `in_code` and
/// `in_blockquote` flags are restored to the value they had when the element
/// was opened, so closing a nested element does not clear the outer one's flag.
fn renderElement(el: *Element, state: *State, writer: *std.Io.Writer, page: *Page) anyerror!void {
    const tag = el.getTag();

    // Skip hidden/metadata elements.
    switch (tag) {
        .script, .style, .noscript, .template, .head, .meta, .link, .title, .svg => return,
        else => {},
    }

    // Remember the flags of the enclosing context so they can be restored on close.
    const outer_in_blockquote = state.in_blockquote;
    const outer_in_pre = state.in_pre;
    const outer_in_code = state.in_code;

    // Only anchors that actually carry a destination are emitted as links.
    const href = if (tag == .anchor) el.getAttributeSafe(comptime .wrap("href")) else null;

    // --- Opening tag logic ---

    // Block elements start on a new line; some also get a blank line before them.
    switch (tag) {
        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .pre, .table, .hr => {
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
                state.last_char_was_newline = true;
            }
            switch (tag) {
                // Extra newline for spacing between blocks.
                .p, .h1, .h2, .h3, .h4, .h5, .h6, .blockquote, .pre, .table => try writer.writeByte('\n'),
                else => {},
            }
        },
        .li, .tr => {
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
                state.last_char_was_newline = true;
            }
        },
        else => {},
    }

    // Prefixes.
    switch (tag) {
        // Heading prefixes leave `last_char_was_newline` untouched, so renderText
        // does not emit leading whitespace of the heading text after the marker.
        .h1 => try writer.writeAll("# "),
        .h2 => try writer.writeAll("## "),
        .h3 => try writer.writeAll("### "),
        .h4 => try writer.writeAll("#### "),
        .h5 => try writer.writeAll("##### "),
        .h6 => try writer.writeAll("###### "),
        .ul, .ol => {
            state.list_depth += 1;
        },
        .li => {
            // Items of the outermost list are not indented; a stray <li> outside
            // any list is treated the same way.
            const indent = if (state.list_depth > 0) state.list_depth - 1 else 0;
            try writeIndentation(indent, writer);
            try writer.writeAll("- ");
            state.last_char_was_newline = false;
        },
        .blockquote => {
            try writer.writeAll("> ");
            state.in_blockquote = true;
            state.last_char_was_newline = false;
        },
        .pre => {
            try writer.writeAll("```\n");
            state.in_pre = true;
            state.last_char_was_newline = true;
        },
        .code => {
            // Inside <pre> the surrounding fence already marks the code.
            if (!state.in_pre) {
                try writer.writeByte('`');
                state.in_code = true;
                state.last_char_was_newline = false;
            }
        },
        .b, .strong => {
            try writer.writeAll("**");
            state.last_char_was_newline = false;
        },
        .i, .em => {
            try writer.writeAll("*");
            state.last_char_was_newline = false;
        },
        .hr => {
            try writer.writeAll("---\n");
            state.last_char_was_newline = true;
            return; // Void element
        },
        .br => {
            try writer.writeByte('\n');
            state.last_char_was_newline = true;
            return; // Void element
        },
        .img => {
            try writer.writeAll("![");
            if (el.getAttributeSafe(comptime .wrap("alt"))) |alt| {
                try writer.writeAll(alt);
            }
            try writer.writeAll("](");
            if (el.getAttributeSafe(comptime .wrap("src"))) |src| {
                try writer.writeAll(src);
            }
            try writer.writeAll(")");
            state.last_char_was_newline = false;
            return; // Treat as void
        },
        .anchor => {
            if (href != null) {
                try writer.writeByte('[');
                state.last_char_was_newline = false;
            }
        },
        else => {},
    }

    // --- Render children ---
    try renderChildren(el.asNode(), state, writer, page);

    // --- Closing tag logic ---

    // Suffixes.
    switch (tag) {
        .anchor => if (href) |target| {
            try writer.writeAll("](");
            try writer.writeAll(target);
            try writer.writeByte(')');
            state.last_char_was_newline = false;
        },
        .pre => {
            // The closing fence must sit on its own line.
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
            }
            try writer.writeAll("```\n");
            state.in_pre = outer_in_pre;
            state.last_char_was_newline = true;
        },
        .code => {
            if (!state.in_pre) {
                try writer.writeByte('`');
                state.in_code = outer_in_code;
                state.last_char_was_newline = false;
            }
        },
        .b, .strong => {
            try writer.writeAll("**");
            state.last_char_was_newline = false;
        },
        .i, .em => {
            try writer.writeAll("*");
            state.last_char_was_newline = false;
        },
        .blockquote => {
            state.in_blockquote = outer_in_blockquote;
        },
        .ul, .ol => {
            if (state.list_depth > 0) state.list_depth -= 1;
        },
        else => {},
    }

    // Post-block newlines.
    switch (tag) {
        .p, .div, .section, .article, .header, .footer, .nav, .aside, .h1, .h2, .h3, .h4, .h5, .h6, .ul, .ol, .blockquote, .table, .tr => {
            if (!state.last_char_was_newline) {
                try writer.writeByte('\n');
                state.last_char_was_newline = true;
            }
        },
        else => {},
    }
}
|
||||||
|
|
||||||
|
/// Writes the content of one text node.
///
/// Inside `<pre>` (`state.in_pre`) the text is copied verbatim and
/// `last_char_was_newline` reflects whether it ended in '\n'.
///
/// Elsewhere, runs of spaces, tabs and line breaks are collapsed:
///   - words are separated by exactly one space;
///   - leading whitespace becomes one space unless the output is at a line start;
///   - trailing whitespace becomes one space;
///   - a node holding only whitespace becomes one space unless the output is at
///     a line start, so the gap between adjacent inline elements
///     (`<b>a</b> <i>b</i>`) is kept instead of fusing them into `**a***b*`.
fn renderText(text: []const u8, state: *State, writer: *std.Io.Writer) anyerror!void {
    if (text.len == 0) return;

    if (state.in_pre) {
        try writer.writeAll(text);
        state.last_char_was_newline = text[text.len - 1] == '\n';
        return;
    }

    const at_line_start = state.last_char_was_newline;
    var words = std.mem.tokenizeAny(u8, text, " \t\n\r");

    const first_word = words.next() orelse {
        // Whitespace-only node: keep a single separating space mid-line and
        // emit nothing at the start of a line.
        if (!at_line_start) try writer.writeByte(' ');
        return;
    };

    // Preserve the boundary with whatever was written before this node.
    if (!at_line_start and std.ascii.isWhitespace(text[0])) {
        try writer.writeByte(' ');
    }

    try writer.writeAll(first_word);
    while (words.next()) |word| {
        try writer.writeByte(' ');
        try writer.writeAll(word);
    }
    state.last_char_was_newline = false;

    // Preserve the boundary with whatever follows this node.
    if (std.ascii.isWhitespace(text[text.len - 1])) {
        try writer.writeByte(' ');
    }
}
|
||||||
|
|
||||||
|
/// Writes the indentation for a list item nested `level` lists deep.
///
/// Each level is two spaces: the width of the "- " marker written for list
/// items. CommonMark only treats an item as nested when it is indented at least
/// as far as the parent item's content, so a narrower indent would flatten
/// nested lists into siblings. `level == 0` writes nothing.
fn writeIndentation(level: usize, writer: *std.Io.Writer) anyerror!void {
    for (0..level) |_| {
        try writer.writeAll("  ");
    }
}
|
||||||
@@ -28,6 +28,7 @@ pub const Notification = @import("Notification.zig");
|
|||||||
pub const log = @import("log.zig");
|
pub const log = @import("log.zig");
|
||||||
pub const js = @import("browser/js/js.zig");
|
pub const js = @import("browser/js/js.zig");
|
||||||
pub const dump = @import("browser/dump.zig");
|
pub const dump = @import("browser/dump.zig");
|
||||||
|
pub const markdown = @import("browser/markdown.zig");
|
||||||
pub const build_config = @import("build_config");
|
pub const build_config = @import("build_config");
|
||||||
pub const crash_handler = @import("crash_handler.zig");
|
pub const crash_handler = @import("crash_handler.zig");
|
||||||
|
|
||||||
@@ -36,6 +37,7 @@ const IS_DEBUG = @import("builtin").mode == .Debug;
|
|||||||
pub const FetchOpts = struct {
|
pub const FetchOpts = struct {
|
||||||
wait_ms: u32 = 5000,
|
wait_ms: u32 = 5000,
|
||||||
dump: dump.RootOpts,
|
dump: dump.RootOpts,
|
||||||
|
dump_markdown: bool = false,
|
||||||
writer: ?*std.Io.Writer = null,
|
writer: ?*std.Io.Writer = null,
|
||||||
};
|
};
|
||||||
pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
|
pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
|
||||||
@@ -94,7 +96,11 @@ pub fn fetch(app: *App, url: [:0]const u8, opts: FetchOpts) !void {
|
|||||||
_ = session.wait(opts.wait_ms);
|
_ = session.wait(opts.wait_ms);
|
||||||
|
|
||||||
const writer = opts.writer orelse return;
|
const writer = opts.writer orelse return;
|
||||||
try dump.root(page.window._document, opts.dump, writer, page);
|
if (opts.dump_markdown) {
|
||||||
|
try markdown.dump(page.window._document.asNode(), .{}, writer, page);
|
||||||
|
} else {
|
||||||
|
try dump.root(page.window._document, opts.dump, writer, page);
|
||||||
|
}
|
||||||
try writer.flush();
|
try writer.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
|
|||||||
|
|
||||||
var fetch_opts = lp.FetchOpts{
|
var fetch_opts = lp.FetchOpts{
|
||||||
.wait_ms = 5000,
|
.wait_ms = 5000,
|
||||||
|
.dump_markdown = opts.dump_markdown,
|
||||||
.dump = .{
|
.dump = .{
|
||||||
.strip = opts.strip,
|
.strip = opts.strip,
|
||||||
.with_base = opts.withbase,
|
.with_base = opts.withbase,
|
||||||
@@ -119,7 +120,7 @@ fn run(allocator: Allocator, main_arena: Allocator) !void {
|
|||||||
|
|
||||||
var stdout = std.fs.File.stdout();
|
var stdout = std.fs.File.stdout();
|
||||||
var writer = stdout.writer(&.{});
|
var writer = stdout.writer(&.{});
|
||||||
if (opts.dump) {
|
if (opts.dump or opts.dump_markdown) {
|
||||||
fetch_opts.writer = &writer.interface;
|
fetch_opts.writer = &writer.interface;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user