mirror of
https://github.com/lightpanda-io/browser.git
synced 2025-10-28 14:43:28 +00:00
markdown: working duckduckgo
This commit is contained in:
@@ -21,195 +21,254 @@ const std = @import("std");
|
|||||||
const parser = @import("netsurf.zig");
|
const parser = @import("netsurf.zig");
|
||||||
const Walker = @import("dom/walker.zig").WalkerChildren;
|
const Walker = @import("dom/walker.zig").WalkerChildren;
|
||||||
|
|
||||||
|
const URL = @import("../url.zig").URL;
|
||||||
|
|
||||||
const NP = "\n\n";
|
const NP = "\n\n";
|
||||||
|
|
||||||
|
/// One entry of the stack of enclosing elements kept while walking the DOM.
/// Entries live on the call stack of `writeNode` and are linked through
/// `parent`.
const Elem = struct {
    /// Enclosing entry, or null for the outermost one.
    parent: ?*Elem = null,
    /// List bookkeeping: null when the element is not a list, 0 for an
    /// unordered list, otherwise the number of the next ordered item.
    list_order: ?u8 = null,
    /// True for elements rendered inline (bold, links, images, inputs); block
    /// separators are suppressed while such an element is the current one.
    inlin: bool = false,
};
|
||||||
|
|
||||||
|
/// Mutable rendering state shared by every writer function during one dump.
const State = struct {
    /// True while the output sits at the start of a block, i.e. nothing has
    /// been written since the last paragraph separator.
    block: bool,
    /// Last byte recorded as written; used to avoid doubled spacing.
    last_char: u8,
    /// Innermost element entry pushed by `writeNode`, if any.
    elem: ?*Elem = null,

    /// Reports whether the innermost tracked element is an inline one.
    fn is_inline(state: *State) bool {
        const current = state.elem orelse return false;
        return current.inlin;
    }

    /// Reports whether the last recorded byte is a space or a newline.
    fn last_char_space(state: *State) bool {
        return switch (state.last_char) {
            ' ', '\n' => true,
            else => false,
        };
    }
};
|
||||||
|
|
||||||
// writer must be a std.io.Writer
|
// writer must be a std.io.Writer
|
||||||
/// Dumps `doc` as Markdown into `writer`, followed by one trailing newline.
///
/// `url` is the address of the page being dumped; it is used to turn
/// host-relative link and image targets into absolute ones.
/// `writer` must be a std.io.Writer.
pub fn writeMarkdown(url: URL, doc: *parser.Document, writer: anytype) !void {
    // The dump starts at the beginning of a block, as if a newline had just
    // been written.
    var state: State = .{ .last_char = '\n', .block = true };
    const root = parser.documentToNode(doc);
    try writeChildren(url, root, &state, writer);
    try writer.writeAll("\n");
}
|
||||||
|
|
||||||
/// Renders every direct child of `root`, in the order given by the children
/// walker, through `writeNode`.
fn writeChildren(url: URL, root: *parser.Node, state: *State, writer: anytype) !void {
    const walker = Walker{};
    // The walker hands out the child following `cursor`; null asks for the
    // first one.
    var cursor: ?*parser.Node = null;
    while (try walker.get_next(root, cursor)) |child| {
        cursor = child;
        try writeNode(url, child, state, writer);
    }
}
|
||||||
|
|
||||||
/// Makes sure the output sits at the start of a block, emitting the paragraph
/// separator `NP` when something has been written since the last one.
///
/// Does nothing while the innermost element is inline, so separators never
/// land inside bold text, links, images or inputs.
fn ensureBlock(state: *State, writer: anytype) !void {
    // Either inside an inline element or already at a block start.
    if (state.is_inline() or state.block) return;

    try writer.writeAll(NP);
    state.block = true;
    state.last_char = '\n';
}
|
||||||
|
|
||||||
/// Writes `text` verbatim to `writer` and records the write in `state`.
///
/// After a non-empty write, `state.last_char` is the last byte of `text` and
/// `state.block` is cleared, because the output is no longer at the start of
/// a block.
///
/// An empty `text` is a no-op. Attribute values such as `value=""` or
/// `href=""` reach this function, and indexing `text[text.len - 1]` on an
/// empty slice would trap.
fn writeInline(state: *State, text: []const u8, writer: anytype) !void {
    if (text.len == 0) return;

    try writer.writeAll(text);
    state.last_char = text[text.len - 1];
    state.block = false;
}
|
||||||
fn writeNode(node: *parser.Node, new_para: bool, writer: anytype) anyerror!bool {
|
|
||||||
|
/// Decimal labels "1" through "50" for ordered list items; `order[n - 1]` is
/// the label of item `n`. Built at compile time.
const order = blk: {
    @setEvalBranchQuota(20_000);
    var labels: [50][]const u8 = undefined;
    for (&labels, 1..) |*label, number| {
        label.* = std.fmt.comptimePrint("{d}", .{number});
    }
    break :blk labels;
};
|
||||||
|
|
||||||
|
/// Calls `writeInline` unless `text` is empty, so callers never hand it an
/// empty slice.
fn writeInlineNonEmpty(state: *State, text: []const u8, writer: anytype) !void {
    if (text.len > 0) try writeInline(state, text, writer);
}

/// Writes a link or image target, completing relative forms with parts of the
/// page `url`:
/// - `//host/path` (network-path reference) only gets the page scheme,
/// - `/path` gets the page scheme and host,
/// - anything else is written unchanged.
/// An empty target writes nothing.
fn writeAbsoluteTarget(url: URL, state: *State, target: []const u8, writer: anytype) !void {
    // `href=""` / `src=""`: there is no first byte to inspect.
    if (target.len == 0) return;

    if (std.mem.startsWith(u8, target, "//")) {
        try writeInlineNonEmpty(state, url.scheme(), writer);
        try writeInline(state, ":", writer);
    } else if (target[0] == '/') {
        try writeInlineNonEmpty(state, url.scheme(), writer);
        try writeInline(state, "://", writer);
        try writeInlineNonEmpty(state, url.host(), writer);
    }
    try writeInline(state, target, writer);
}

/// Renders one DOM node, and recursively its children, as Markdown.
///
/// `url` is the page address used to resolve relative link and image targets.
/// `state` carries the block/inline bookkeeping and the stack of enclosing
/// elements. `writer` must be a std.io.Writer.
///
/// Everything is emitted through `writeInline` / `ensureBlock` so that
/// `state` always reflects what has actually been written.
///
/// The error set is spelled out as `anyerror` because this function and
/// `writeChildren` call each other.
fn writeNode(url: URL, node: *parser.Node, state: *State, writer: anytype) anyerror!void {
    switch (try parser.nodeType(node)) {
        .element => {
            const html_element: *parser.ElementHTML = @ptrCast(node);
            const tag = try parser.elementHTMLGetTagType(html_element);

            switch (tag) {
                // skip element, go to children
                .html, .head, .meta, .link, .body, .span => {
                    try writeChildren(url, node, state, writer);
                },

                // skip element and children
                .title, .i, .script, .noscript, .undef, .style => {},

                // generic elements
                .h1, .h2, .h3, .h4, .h5, .h6 => {
                    try ensureBlock(state, writer);
                    // No heading marker inside an inline element: it would
                    // not start a line there.
                    if (!state.is_inline()) {
                        switch (tag) {
                            .h1 => try writeInline(state, "# ", writer),
                            .h2 => try writeInline(state, "## ", writer),
                            .h3 => try writeInline(state, "### ", writer),
                            .h4 => try writeInline(state, "#### ", writer),
                            .h5 => try writeInline(state, "##### ", writer),
                            .h6 => try writeInline(state, "###### ", writer),
                            else => @panic("only headers tags are supported here"),
                        }
                    }
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },

                // containers and dividers
                .header, .footer, .nav, .section, .div, .article, .p, .button, .form => {
                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },
                .br => {
                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                },
                .hr => {
                    try ensureBlock(state, writer);
                    try writeInline(state, "---", writer);
                    try ensureBlock(state, writer);
                },

                // styling
                .b => {
                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    try writeInline(state, "**", writer);
                    try writeChildren(url, node, state, writer);
                    try writeInline(state, "**", writer);
                },

                // specific elements
                .a => {
                    // Keep the link apart from the text that precedes it.
                    if (!state.last_char_space()) try writeInline(state, " ", writer);

                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    const element = parser.nodeToElement(node);
                    if (try getAttributeValue(element, "href")) |href| {
                        try writeInline(state, "[", writer);
                        try writeChildren(url, node, state, writer);
                        try writeInline(state, "](", writer);
                        try writeAbsoluteTarget(url, state, href, writer);
                        try writeInline(state, ")", writer);
                    } else {
                        // An anchor without href is rendered as its content.
                        try writeChildren(url, node, state, writer);
                    }
                },
                .img => {
                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    const element = parser.nodeToElement(node);
                    if (try getAttributeValue(element, "src")) |src| {
                        try writeInline(state, "![", writer);
                        try writeInline(state, "](", writer);
                        try writeAbsoluteTarget(url, state, src, writer);
                        try writeInline(state, ")", writer);
                    }
                },
                .ul => {
                    // list_order 0 marks an unordered list for its items.
                    var elem = Elem{ .parent = state.elem, .list_order = 0 };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },
                .ol => {
                    // list_order holds the number of the next item.
                    var elem = Elem{ .parent = state.elem, .list_order = 1 };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    try ensureBlock(state, writer);
                    try writeChildren(url, node, state, writer);
                    try ensureBlock(state, writer);
                },
                .li => blk: {
                    // Items are only rendered directly under a list entry.
                    const parent = state.elem orelse break :blk;
                    const list_order = parent.list_order orelse break :blk;

                    if (!state.block) try writeInline(state, "\n", writer);

                    if (list_order > 0) {
                        // ordered list
                        // The table covers the common case; longer lists are
                        // formatted on the fly. A u8 never needs more than
                        // three digits, so the buffer cannot be too small.
                        var digits: [3]u8 = undefined;
                        const label: []const u8 = if (list_order <= order.len)
                            order[list_order - 1]
                        else
                            std.fmt.bufPrint(&digits, "{d}", .{list_order}) catch unreachable;

                        try writeInline(state, label, writer);
                        try writeInline(state, ". ", writer);
                        // Saturate instead of overflowing on lists longer
                        // than 255 items.
                        parent.list_order = list_order +| 1;
                    } else {
                        // unordered list
                        try writeInline(state, "- ", writer);
                    }
                    try writeChildren(url, node, state, writer);
                },
                .input => {
                    var elem = Elem{ .parent = state.elem, .inlin = true };
                    state.elem = &elem;
                    defer state.elem = elem.parent;

                    const element = parser.nodeToElement(node);
                    if (try getAttributeValue(element, "value")) |value| {
                        // `value=""` is common and renders as nothing.
                        if (value.len > 0) {
                            try writeInline(state, value, writer);
                            try writeInline(state, " ", writer);
                        }
                    }
                },

                else => {
                    try ensureBlock(state, writer);
                    // Routed through writeInline so the closing ensureBlock
                    // sees that something was written.
                    try writeInline(state, @tagName(tag), writer);
                    try writeInline(state, " not supported", writer);
                    try ensureBlock(state, writer);
                },
            }
        },
        .text => {
            const v = try parser.nodeValue(node) orelse return;
            const printed = try writeText(state, v, writer);
            if (printed) state.block = false;
        },
        .cdata_section => {},
        .comment => {},
        // TODO handle processing instruction dump
        .processing_instruction => {},
        // document fragment is outside of the main document DOM, so we
        // don't output it.
        .document_fragment => {},
        // document will never be called, but required for completeness.
        .document => {},
        // done globally instead, but required for completeness. Only the outer DOCTYPE should be written
        .document_type => {},
        // deprecated
        .attribute, .entity_reference, .entity, .notation => {},
    }
}
|
||||||
|
|
||||||
// TODO: not sure about + - . ! as they are very common characters
|
// TODO: not sure about + - . ! as they are very common characters
|
||||||
@@ -217,12 +276,12 @@ fn writeNode(node: *parser.Node, new_para: bool, writer: anytype) anyerror!bool
|
|||||||
// TODO: | (pipe)
|
// TODO: | (pipe)
|
||||||
const escape = [_]u8{ '\\', '`', '*', '_', '{', '}', '[', ']', '<', '>', '(', ')', '#' };
|
const escape = [_]u8{ '\\', '`', '*', '_', '{', '}', '[', ']', '<', '>', '(', ')', '#' };
|
||||||
|
|
||||||
fn writeText(value: []const u8, writer: anytype) !bool {
|
fn writeText(state: *State, value: []const u8, writer: anytype) !bool {
|
||||||
if (value.len == 0) return false;
|
if (value.len == 0) return false;
|
||||||
|
|
||||||
var last_char: u8 = ' ';
|
var last_char: u8 = ' ';
|
||||||
var printed: u64 = 0;
|
var printed: u64 = 0;
|
||||||
for (value) |v| {
|
for (value, 0..) |v, i| {
|
||||||
// do not print:
|
// do not print:
|
||||||
// - multiple spaces
|
// - multiple spaces
|
||||||
// - return line
|
// - return line
|
||||||
@@ -236,10 +295,17 @@ fn writeText(value: []const u8, writer: anytype) !bool {
|
|||||||
if (v == esc) try writer.writeAll("\\");
|
if (v == esc) try writer.writeAll("\\");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (printed == 0 and !state.is_inline()) {
|
||||||
|
if (state.last_char != '\n' and state.last_char != ' ') {
|
||||||
|
try writer.writeAll(" ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
last_char = v;
|
last_char = v;
|
||||||
printed += 1;
|
printed += 1;
|
||||||
const x = [_]u8{v}; // TODO: do we have something better?
|
const x = [_]u8{v}; // TODO: do we have something better?
|
||||||
try writer.writeAll(&x);
|
try writer.writeAll(&x);
|
||||||
|
if (i == value.len - 1) state.last_char = v;
|
||||||
}
|
}
|
||||||
if (printed > 0) return true;
|
if (printed > 0) return true;
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@@ -157,7 +157,7 @@ pub const Page = struct {
|
|||||||
|
|
||||||
// if the page has a pointer to a document, converts the HTML in Markdown and dump it.
|
// if the page has a pointer to a document, converts the HTML in Markdown and dump it.
|
||||||
const doc = parser.documentHTMLToDocument(self.window.document);
|
const doc = parser.documentHTMLToDocument(self.window.document);
|
||||||
try Markdown.writeMarkdown(doc, out);
|
try Markdown.writeMarkdown(self.url, doc, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn fetchModuleSource(ctx: *anyopaque, specifier: []const u8) !?[]const u8 {
|
pub fn fetchModuleSource(ctx: *anyopaque, specifier: []const u8) !?[]const u8 {
|
||||||
|
|||||||
Reference in New Issue
Block a user