Merge pull request #893 from lightpanda-io/dump_noscript

Add a --noscript option to "improve" --dump
This commit is contained in:
Karl Seguin
2025-07-15 21:22:33 +08:00
committed by GitHub
5 changed files with 36 additions and 15 deletions

View File

@@ -110,13 +110,13 @@ pub const Element = struct {
pub fn get_innerHTML(self: *parser.Element, page: *Page) ![]const u8 {
var buf = std.ArrayList(u8).init(page.arena);
try dump.writeChildren(parser.elementToNode(self), buf.writer());
try dump.writeChildren(parser.elementToNode(self), .{}, buf.writer());
return buf.items;
}
pub fn get_outerHTML(self: *parser.Element, page: *Page) ![]const u8 {
var buf = std.ArrayList(u8).init(page.arena);
try dump.writeNode(parser.elementToNode(self), buf.writer());
try dump.writeNode(parser.elementToNode(self), .{}, buf.writer());
return buf.items;
}

View File

@@ -21,10 +21,14 @@ const std = @import("std");
const parser = @import("netsurf.zig");
const Walker = @import("dom/walker.zig").WalkerChildren;
/// Options accepted by `writeHTML`, `writeNode` and `writeChildren`.
pub const Opts = struct {
/// When true, `writeNode` returns without writing anything for an element
/// whose tag type is `.script`. Defaults to false.
exclude_scripts: bool = false,
};
// writer must be a std.io.Writer
pub fn writeHTML(doc: *parser.Document, writer: anytype) !void {
pub fn writeHTML(doc: *parser.Document, opts: Opts, writer: anytype) !void {
try writer.writeAll("<!DOCTYPE html>\n");
try writeChildren(parser.documentToNode(doc), writer);
try writeChildren(parser.documentToNode(doc), opts, writer);
try writer.writeAll("\n");
}
@@ -54,10 +58,15 @@ pub fn writeDocType(doc_type: *parser.DocumentType, writer: anytype) !void {
try writer.writeAll(">");
}
pub fn writeNode(node: *parser.Node, writer: anytype) anyerror!void {
pub fn writeNode(node: *parser.Node, opts: Opts, writer: anytype) anyerror!void {
switch (try parser.nodeType(node)) {
.element => {
// open the tag
const tag_type = try parser.elementHTMLGetTagType(@ptrCast(node));
if (tag_type == .script and opts.exclude_scripts) {
return;
}
const tag = try parser.nodeLocalName(node);
try writer.writeAll("<");
try writer.writeAll(tag);
@@ -82,12 +91,12 @@ pub fn writeNode(node: *parser.Node, writer: anytype) anyerror!void {
// void elements can't have any content.
if (try isVoid(parser.nodeToElement(node))) return;
if (try parser.elementHTMLGetTagType(@ptrCast(node)) == .script) {
if (tag_type == .script) {
try writer.writeAll(try parser.nodeTextContent(node) orelse "");
} else {
// write the children
// TODO avoid recursion
try writeChildren(node, writer);
try writeChildren(node, opts, writer);
}
// close the tag
@@ -129,12 +138,12 @@ pub fn writeNode(node: *parser.Node, writer: anytype) anyerror!void {
}
// writer must be a std.io.Writer
pub fn writeChildren(root: *parser.Node, writer: anytype) !void {
pub fn writeChildren(root: *parser.Node, opts: Opts, writer: anytype) !void {
const walker = Walker{};
var next: ?*parser.Node = null;
while (true) {
next = try walker.get_next(root, next) orelse break;
try writeNode(next.?, writer);
try writeNode(next.?, opts, writer);
}
}
@@ -238,6 +247,6 @@ fn testWriteFullHTML(comptime expected: []const u8, src: []const u8) !void {
defer parser.documentHTMLClose(doc_html) catch {};
const doc = parser.documentHTMLToDocument(doc_html);
try writeHTML(doc, buf.writer(testing.allocator));
try writeHTML(doc, .{}, buf.writer(testing.allocator));
try testing.expectEqualStrings(expected, buf.items);
}

View File

@@ -142,7 +142,7 @@ pub const Page = struct {
}
// dump writes the page content into the given file.
pub fn dump(self: *const Page, out: std.fs.File) !void {
pub fn dump(self: *const Page, opts: Dump.Opts, out: std.fs.File) !void {
if (self.raw_data) |raw_data| {
// raw_data was set if the document was not HTML, dump the data content only.
return try out.writeAll(raw_data);
@@ -150,7 +150,7 @@ pub const Page = struct {
// if the page has a pointer to a document, dumps the HTML.
const doc = parser.documentHTMLToDocument(self.window.document);
try Dump.writeHTML(doc, out);
try Dump.writeHTML(doc, opts, out);
}
pub fn fetchModuleSource(ctx: *anyopaque, src: []const u8) !?[]const u8 {

View File

@@ -36,9 +36,9 @@ pub const XMLSerializer = struct {
pub fn _serializeToString(_: *const XMLSerializer, root: *parser.Node, page: *Page) ![]const u8 {
var buf = std.ArrayList(u8).init(page.arena);
switch (try parser.nodeType(root)) {
.document => try dump.writeHTML(@as(*parser.Document, @ptrCast(root)), buf.writer()),
.document => try dump.writeHTML(@as(*parser.Document, @ptrCast(root)), .{}, buf.writer()),
.document_type => try dump.writeDocType(@as(*parser.DocumentType, @ptrCast(root)), buf.writer()),
else => try dump.writeNode(root, buf.writer()),
else => try dump.writeNode(root, .{}, buf.writer()),
}
return buf.items;
}

View File

@@ -134,7 +134,7 @@ fn run(alloc: Allocator) !void {
// dump
if (opts.dump) {
try page.dump(std.io.getStdOut());
try page.dump(.{ .exclude_scripts = opts.noscript }, std.io.getStdOut());
}
},
else => unreachable,
@@ -212,6 +212,7 @@ const Command = struct {
url: []const u8,
dump: bool = false,
common: Common,
noscript: bool = false,
};
const Common = struct {
@@ -275,6 +276,7 @@ const Command = struct {
\\Options:
\\--dump Dumps document to stdout.
\\ Defaults to false.
\\--noscript Exclude <script> tags in dump. Defaults to false.
\\
++ common_options ++
\\
@@ -352,6 +354,9 @@ fn inferMode(opt: []const u8) ?App.RunMode {
if (std.mem.eql(u8, opt, "--dump")) {
return .fetch;
}
if (std.mem.eql(u8, opt, "--noscript")) {
return .fetch;
}
if (std.mem.startsWith(u8, opt, "--") == false) {
return .fetch;
}
@@ -437,6 +442,7 @@ fn parseFetchArgs(
args: *std.process.ArgIterator,
) !Command.Fetch {
var dump: bool = false;
// Must start out false: the flag is only ever set to true when `--noscript`
// is seen on the command line, and both the help text and the
// `Command.Fetch.noscript` field default say it defaults to false.
var noscript: bool = false;
var url: ?[]const u8 = null;
var common: Command.Common = .{};
@@ -446,6 +452,11 @@ fn parseFetchArgs(
continue;
}
if (std.mem.eql(u8, "--noscript", opt)) {
noscript = true;
continue;
}
if (try parseCommonArg(allocator, opt, args, &common)) {
continue;
}
@@ -471,6 +482,7 @@ fn parseFetchArgs(
.url = url.?,
.dump = dump,
.common = common,
.noscript = noscript,
};
}