1 Commits

Author SHA1 Message Date
Muki Kiboigo
4a849e5693 normalize html title whitespace 2025-05-16 07:21:56 -07:00
7 changed files with 38 additions and 14 deletions

View File

@@ -161,7 +161,7 @@ test "matchFirst" {
for (testcases) |tc| {
matcher.reset();
const doc = try parser.documentHTMLParseFromStr(tc.html);
const doc = try parser.documentHTMLParseFromStr(alloc, tc.html);
defer parser.documentHTMLClose(doc) catch {};
const s = css.parse(alloc, tc.q, .{}) catch |e| {

View File

@@ -196,7 +196,10 @@ fn testWriteFullHTML(comptime expected: []const u8, src: []const u8) !void {
var buf = std.ArrayListUnmanaged(u8){};
defer buf.deinit(testing.allocator);
const doc_html = try parser.documentHTMLParseFromStr(src);
var aa = std.heap.ArenaAllocator.init(testing.allocator);
defer aa.deinit();
const doc_html = try parser.documentHTMLParseFromStr(aa.allocator(), src);
defer parser.documentHTMLClose(doc_html) catch {};
const doc = parser.documentHTMLToDocument(doc_html);

View File

@@ -29,6 +29,19 @@ const collection = @import("../dom/html_collection.zig");
const Walker = @import("../dom/walker.zig").WalkerDepthFirst;
const Cookie = @import("../storage/cookie.zig").Cookie;
/// Strips leading and trailing whitespace from `title` and collapses every
/// inner run of whitespace into a single space (U+0020).
///
/// Only the five characters the HTML standard defines as "ASCII whitespace"
/// (TAB, LF, FF, CR, SPACE) act as separators. `std.ascii.whitespace` is
/// deliberately not used: it also contains vertical tab (0x0B), which HTML
/// does not treat as whitespace and which therefore must be preserved.
///
/// The returned slice is allocated from `arena` and is not freed here.
/// Returns `error.OutOfMemory` if the allocation fails.
pub fn normalizeWhitespace(arena: std.mem.Allocator, title: []const u8) ![]const u8 {
    // TAB, LF, FF, CR, SPACE.
    const html_ascii_whitespace = "\t\n\x0C\r ";

    // The normalized text can never be longer than the input (each run of
    // separators shrinks to at most one space), so reserving `title.len`
    // bytes up front makes the `AssumeCapacity` appends below safe.
    var normalized = try std.ArrayListUnmanaged(u8).initCapacity(arena, title.len);
    var tokens = std.mem.tokenizeAny(u8, title, html_ascii_whitespace);
    var prepend = false;
    while (tokens.next()) |token| {
        // Emit a single separating space before every token except the first.
        if (prepend) normalized.appendAssumeCapacity(' ') else prepend = true;
        normalized.appendSliceAssumeCapacity(token);
    }
    return normalized.items;
}
// WEB IDL https://html.spec.whatwg.org/#the-document-object
pub const HTMLDocument = struct {
pub const Self = parser.DocumentHTML;
@@ -94,9 +107,10 @@ pub const HTMLDocument = struct {
return try parser.documentHTMLGetTitle(self);
}
pub fn set_title(self: *parser.DocumentHTML, v: []const u8) ![]const u8 {
try parser.documentHTMLSetTitle(self, v);
return v;
/// Stores `v` as the document title after running it through
/// `normalizeWhitespace`, and returns the normalized text that was stored.
/// The normalized copy is allocated from `state.arena`. Errors from the
/// normalization or from `parser.documentHTMLSetTitle` are propagated.
pub fn set_title(self: *parser.DocumentHTML, v: []const u8, state: *SessionState) ![]const u8 {
    const collapsed_title = try normalizeWhitespace(state.arena, v);
    try parser.documentHTMLSetTitle(self, collapsed_title);
    return collapsed_title;
}
pub fn _getElementsByName(self: *parser.DocumentHTML, name: []const u8, state: *SessionState) !NodeList {

View File

@@ -29,6 +29,7 @@ const c = @cImport({
});
const mimalloc = @import("mimalloc.zig");
const normalizeWhitespace = @import("html/document.zig").normalizeWhitespace;
// init initializes netsurf lib.
// init starts a mimalloc heap arena for the netsurf session. The caller must
@@ -2152,12 +2153,12 @@ fn parserErr(err: HubbubErr) ParserError!void {
// documentHTMLParseFromStr parses the given HTML string.
// The caller is responsible for closing the document.
pub fn documentHTMLParseFromStr(str: []const u8) !*DocumentHTML {
pub fn documentHTMLParseFromStr(arena: std.mem.Allocator, str: []const u8) !*DocumentHTML {
var fbs = std.io.fixedBufferStream(str);
return try documentHTMLParse(fbs.reader(), "UTF-8");
return try documentHTMLParse(arena, fbs.reader(), "UTF-8");
}
pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
pub fn documentHTMLParse(arena: std.mem.Allocator, reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
var parser: ?*c.dom_hubbub_parser = undefined;
var doc: ?*c.dom_document = undefined;
var err: c.hubbub_error = undefined;
@@ -2169,7 +2170,11 @@ pub fn documentHTMLParse(reader: anytype, enc: ?[:0]const u8) !*DocumentHTML {
try parseData(parser.?, reader);
return @as(*DocumentHTML, @ptrCast(doc.?));
const html_doc: *DocumentHTML = @ptrCast(doc.?);
const old_title = try documentHTMLGetTitle(html_doc);
const normalized = try normalizeWhitespace(arena, old_title);
try documentHTMLSetTitle(html_doc, normalized);
return html_doc;
}
pub fn documentParseFragmentFromStr(self: *Document, str: []const u8) !*DocumentFragment {

View File

@@ -248,7 +248,7 @@ pub const Page = struct {
const ccharset = try arena.dupeZ(u8, charset);
const html_doc = try parser.documentHTMLParse(reader, ccharset);
const html_doc = try parser.documentHTMLParse(arena, reader, ccharset);
const doc = parser.documentHTMLToDocument(html_doc);
// save a document's pointer in the page.

View File

@@ -703,7 +703,7 @@ pub const XMLHttpRequest = struct {
}
var fbs = std.io.fixedBufferStream(self.response_bytes.items);
const doc = parser.documentHTMLParse(fbs.reader(), ccharset) catch {
const doc = parser.documentHTMLParse(self.arena, fbs.reader(), ccharset) catch {
self.response_obj = .{ .Failure = {} };
return;
};

View File

@@ -214,11 +214,13 @@ pub const Document = struct {
parser.deinit();
try parser.init();
var arena = std.heap.ArenaAllocator.init(allocator);
var fbs = std.io.fixedBufferStream(html);
const html_doc = try parser.documentHTMLParse(fbs.reader(), "utf-8");
const html_doc = try parser.documentHTMLParse(arena.allocator(), fbs.reader(), "utf-8");
return .{
.arena = std.heap.ArenaAllocator.init(allocator),
.arena = arena,
.doc = parser.documentHTMLToDocument(html_doc),
};
}
@@ -410,7 +412,7 @@ pub const JsRunner = struct {
errdefer self.loop.deinit();
var html = std.io.fixedBufferStream(opts.html);
const document = try parser.documentHTMLParse(html.reader(), "UTF-8");
const document = try parser.documentHTMLParse(arena, html.reader(), "UTF-8");
self.state = .{
.arena = arena,