markdown: working duckduckgo

This commit is contained in:
Francis Bouvier
2025-06-19 11:22:05 -07:00
parent 326851ed6f
commit 3575f45ac0
2 changed files with 182 additions and 116 deletions

View File

@@ -21,195 +21,254 @@ const std = @import("std");
const parser = @import("netsurf.zig"); const parser = @import("netsurf.zig");
const Walker = @import("dom/walker.zig").WalkerChildren; const Walker = @import("dom/walker.zig").WalkerChildren;
const URL = @import("../url.zig").URL;
const NP = "\n\n"; const NP = "\n\n";
// One entry of the element context chain kept while walking the DOM.
// Entries live on the stack of the code handling the element and are linked
// through `parent`, so the previous context can be restored on exit.
const Elem = struct {
    // enclosing context, or null when this is the outermost one
    parent: ?*Elem = null,
    // true for elements rendered inline (no paragraph break is emitted
    // while such an element is the current context)
    inlin: bool = false,
    // null when the element is not a list; 0 for an unordered list;
    // otherwise the number of the next item of an ordered list
    list_order: ?u8 = null,
};
// Output state shared by every write helper during one Markdown dump.
const State = struct {
    // true when the output currently sits at the start of a block, ie.
    // nothing has been written since the last paragraph separator
    block: bool,
    // last byte sent to the writer by the state-aware helpers
    last_char: u8,
    // innermost element context, null outside of any tracked element
    elem: ?*Elem = null,

    // Reports whether the innermost tracked element is an inline one.
    fn is_inline(state: *State) bool {
        const current = state.elem orelse return false;
        return current.inlin;
    }

    // Reports whether the last written byte is a space or a line feed.
    fn last_char_space(state: *State) bool {
        return switch (state.last_char) {
            ' ', '\n' => true,
            else => false,
        };
    }
};
/// Dumps `doc` as Markdown into `writer`, followed by a final line feed.
/// `url` is the address of the page; it is handed down to the node writer,
/// which uses its scheme and host to complete targets starting with `/`.
/// writer must be a std.io.Writer
pub fn writeMarkdown(url: URL, doc: *parser.Document, writer: anytype) !void {
    // The dump starts at the beginning of a block, as if a line feed had
    // just been written.
    var state: State = .{ .block = true, .last_char = '\n' };
    const root = parser.documentToNode(doc);
    try writeChildren(url, root, &state, writer);
    try writer.writeAll("\n");
}
// Writes every node the children walker yields under `root`, in walker
// order, sharing `state` across all of them.
fn writeChildren(url: URL, root: *parser.Node, state: *State, writer: anytype) !void {
    const walker = Walker{};
    // The walker resumes from the node it returned last; null asks for the
    // first one.
    var cursor: ?*parser.Node = null;
    while (try walker.get_next(root, cursor)) |child| {
        cursor = child;
        try writeNode(url, child, state, writer);
    }
}
// Makes sure the output sits at the start of a block: emits a paragraph
// separator unless one is already pending. Does nothing while the current
// element is inline, so inline content is never split.
fn ensureBlock(state: *State, writer: anytype) !void {
    if (state.is_inline() or state.block) return;

    try writer.writeAll(NP);
    state.last_char = '\n';
    state.block = true;
}
// Writes `text` verbatim (no Markdown escaping) and records it in `state`:
// the last byte becomes `state.last_char` and the output is no longer at the
// start of a block.
// An empty `text` is a no-op: nothing is written and `state` is left
// untouched. Callers pass attribute values (alt, value, href, ...) that can
// legitimately be empty, and indexing the last byte of an empty slice would
// be an out-of-bounds access.
fn writeInline(state: *State, text: []const u8, writer: anytype) !void {
    if (text.len == 0) return;

    try writer.writeAll(text);
    state.last_char = text[text.len - 1];
    state.block = false;
}
// Decimal labels "1" to "50" as strings: the entry at index i is the label
// of the number i + 1. Numbers above 50 have no entry in this table.
const order = [_][]const u8{
"1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
"41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
};
// Same as writeInline, but skips empty text. Attribute values and URL parts
// may be empty and writeInline reads the last byte of what it is given.
fn writeInlineNonEmpty(state: *State, text: []const u8, writer: anytype) !void {
    if (text.len == 0) return;
    try writeInline(state, text, writer);
}

// Writes a link or image target, completed with parts of the page `url`:
// - "//host/path" (scheme-relative) is prefixed with the page scheme only
// - "/path" (root-relative) is prefixed with the page scheme and host
// - anything else, including an empty target, is written as is
fn writeTarget(url: URL, state: *State, target: []const u8, writer: anytype) !void {
    if (std.mem.startsWith(u8, target, "//")) {
        try writeInlineNonEmpty(state, url.scheme(), writer);
        try writeInline(state, ":", writer);
    } else if (std.mem.startsWith(u8, target, "/")) {
        try writeInlineNonEmpty(state, url.scheme(), writer);
        try writeInline(state, "://", writer);
        try writeInlineNonEmpty(state, url.host(), writer);
    }
    try writeInlineNonEmpty(state, target, writer);
}

// Writes one DOM node, and recursively its children, as Markdown.
// - url: page address, used to complete relative link and image targets
// - state: shared output state (block position, last byte, element context)
// - writer: must be a std.io.Writer
// Only element and text nodes produce output; every other node type is
// ignored. The error set is anyerror because the function is mutually
// recursive with writeChildren.
fn writeNode(url: URL, node: *parser.Node, state: *State, writer: anytype) anyerror!void {
    switch (try parser.nodeType(node)) {
        .element => {
            const html_element: *parser.ElementHTML = @ptrCast(node);
            const tag = try parser.elementHTMLGetTagType(html_element);

            switch (tag) {
                // skip element, go to children
                .html, .head, .meta, .link, .body, .span => {
                    try writeChildren(url, node, state, writer);
                },

                // skip element and children
                .title, .i, .script, .noscript, .undef, .style => {},

                // generic elements
                .h1, .h2, .h3, .h4, .h5, .h6 => {
                    try ensureBlock(state, writer);
                    // no heading marker in the middle of inline content
                    if (!state.is_inline()) {
                        const marker: []const u8 = switch (tag) {
                            .h1 => "# ",
                            .h2 => "## ",
                            .h3 => "### ",
                            .h4 => "#### ",
                            .h5 => "##### ",
                            .h6 => "###### ",
                            else => @panic("only headers tags are supported here"),
                        };
                        try writeInline(state, marker, writer);
                    }
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },

                // containers and dividers
                .header, .footer, .nav, .section, .div, .article, .p, .button, .form => {
                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },
                .br => {
                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                },
                .hr => {
                    try ensureBlock(state, writer);
                    try writeInline(state, "---", writer);
                    try ensureBlock(state, writer);
                },

                // styling
                .b => {
                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    try writeInline(state, "**", writer);
                    try writeChildren(url, node, state, writer);
                    try writeInline(state, "**", writer);
                },

                // specific elements
                .a => {
                    // keep the link apart from the text written before it
                    if (!state.last_char_space()) try writeInline(state, " ", writer);

                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    const element = parser.nodeToElement(node);
                    if (try getAttributeValue(element, "href")) |href| {
                        try writeInline(state, "[", writer);
                        try writeChildren(url, node, state, writer);
                        try writeInline(state, "](", writer);
                        try writeTarget(url, state, href, writer);
                        try writeInline(state, ")", writer);
                    } else {
                        try writeChildren(url, node, state, writer);
                    }
                },
                .img => {
                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    const element = parser.nodeToElement(node);
                    if (try getAttributeValue(element, "src")) |src| {
                        try writeInline(state, "![", writer);
                        // the label is the alt attribute when present (it
                        // may be empty), the source otherwise
                        if (try getAttributeValue(element, "alt")) |alt| {
                            try writeInlineNonEmpty(state, alt, writer);
                        } else {
                            try writeInlineNonEmpty(state, src, writer);
                        }
                        try writeInline(state, "](", writer);
                        try writeTarget(url, state, src, writer);
                        try writeInline(state, ")", writer);
                    }
                },
                .ul => {
                    // list_order 0 marks an unordered list
                    var elem = Elem{ .parent = state.elem, .list_order = 0 };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },
                .ol => {
                    // list_order holds the number of the next item
                    var elem = Elem{ .parent = state.elem, .list_order = 1 };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },
                .li => blk: {
                    // an item is only written when the current element
                    // context is a list
                    const parent = state.elem orelse break :blk;
                    const list_order = parent.list_order orelse break :blk;

                    if (!state.block) try writer.writeAll("\n");

                    if (list_order > 0) {
                        // ordered list: a u8 never needs more than 3 digits,
                        // so formatting into this buffer cannot fail
                        var digits: [3]u8 = undefined;
                        const label = std.fmt.bufPrint(&digits, "{d}", .{list_order}) catch unreachable;
                        try writeInline(state, label, writer);
                        try writeInline(state, ". ", writer);
                        // saturate instead of overflowing on very long lists
                        parent.list_order = list_order +| 1;
                    } else {
                        // unordered list
                        try writeInline(state, "- ", writer);
                    }
                    try writeChildren(url, node, state, writer);
                },
                .input => {
                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    const element = parser.nodeToElement(node);
                    if (try getAttributeValue(element, "value")) |value| {
                        // an empty value produces no output at all
                        if (value.len > 0) {
                            try writeInline(state, value, writer);
                            try writeInline(state, " ", writer);
                        }
                    }
                },
                else => {
                    try ensureBlock(state, writer);
                    // go through writeInline so the state knows something
                    // has been written, otherwise the closing ensureBlock
                    // would not emit the separator
                    try writeInline(state, @tagName(tag), writer);
                    try writeInline(state, " not supported", writer);
                    try ensureBlock(state, writer);
                },
            }
        },
        .text => {
            const v = try parser.nodeValue(node) orelse return;
            const printed = try writeText(state, v, writer);
            if (printed) state.block = false;
        },
        .cdata_section => {},
        .comment => {},
        // TODO handle processing instruction dump
        .processing_instruction => {},
        // document fragment is outside of the main document DOM, so we
        // don't output it.
        .document_fragment => {},
        // document will never be called, but required for completeness.
        .document => {},
        // done globally instead, but required for completeness. Only the outer DOCTYPE should be written
        .document_type => {},
        // deprecated
        .attribute, .entity_reference, .entity, .notation => {},
    }
}
// TODO: not sure about + - . ! as they are very common characters // TODO: not sure about + - . ! as they are very common characters
@@ -217,12 +276,12 @@ fn writeNode(node: *parser.Node, new_para: bool, writer: anytype) anyerror!bool
// TODO: | (pipe) // TODO: | (pipe)
const escape = [_]u8{ '\\', '`', '*', '_', '{', '}', '[', ']', '<', '>', '(', ')', '#' }; const escape = [_]u8{ '\\', '`', '*', '_', '{', '}', '[', ']', '<', '>', '(', ')', '#' };
fn writeText(value: []const u8, writer: anytype) !bool { fn writeText(state: *State, value: []const u8, writer: anytype) !bool {
if (value.len == 0) return false; if (value.len == 0) return false;
var last_char: u8 = ' '; var last_char: u8 = ' ';
var printed: u64 = 0; var printed: u64 = 0;
for (value) |v| { for (value, 0..) |v, i| {
// do not print: // do not print:
// - multiple spaces // - multiple spaces
// - return line // - return line
@@ -236,10 +295,17 @@ fn writeText(value: []const u8, writer: anytype) !bool {
if (v == esc) try writer.writeAll("\\"); if (v == esc) try writer.writeAll("\\");
} }
if (printed == 0 and !state.is_inline()) {
if (state.last_char != '\n' and state.last_char != ' ') {
try writer.writeAll(" ");
}
}
last_char = v; last_char = v;
printed += 1; printed += 1;
const x = [_]u8{v}; // TODO: do we have something better? const x = [_]u8{v}; // TODO: do we have something better?
try writer.writeAll(&x); try writer.writeAll(&x);
if (i == value.len - 1) state.last_char = v;
} }
if (printed > 0) return true; if (printed > 0) return true;
return false; return false;

View File

@@ -157,7 +157,7 @@ pub const Page = struct {
// if the page has a pointer to a document, converts the HTML in Markdown and dump it. // if the page has a pointer to a document, converts the HTML in Markdown and dump it.
const doc = parser.documentHTMLToDocument(self.window.document); const doc = parser.documentHTMLToDocument(self.window.document);
try Markdown.writeMarkdown(doc, out); try Markdown.writeMarkdown(self.url, doc, out);
} }
pub fn fetchModuleSource(ctx: *anyopaque, specifier: []const u8) !?[]const u8 { pub fn fetchModuleSource(ctx: *anyopaque, specifier: []const u8) !?[]const u8 {