browser: use the charset from the Content-Type header to parse the HTML document

This commit is contained in:
Pierre Tachoire
2024-01-16 11:46:54 +01:00
parent 01a894b7d0
commit 5362fcc7b4

View File

@@ -182,10 +182,10 @@ pub const Page = struct {
log.info("no content-type HTTP header", .{}); log.info("no content-type HTTP header", .{});
return; return;
}; };
log.debug("header content-type: {s}", .{ct});
const mime = try Mime.parse(ct); const mime = try Mime.parse(ct);
if (mime.eql(Mime.HTML)) { if (mime.eql(Mime.HTML)) {
// TODO check content-type try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8");
try self.loadHTMLDoc(req.reader());
} else { } else {
log.info("non-HTML document: {s}", .{ct}); log.info("non-HTML document: {s}", .{ct});
@@ -195,10 +195,13 @@ pub const Page = struct {
} }
// https://html.spec.whatwg.org/#read-html // https://html.spec.whatwg.org/#read-html
fn loadHTMLDoc(self: *Page, reader: anytype) !void { fn loadHTMLDoc(self: *Page, reader: anytype, charset: []const u8) !void {
log.debug("parse html", .{}); log.debug("parse html with charset {s}", .{charset});
// TODO pass an encoding detected from HTTP headers.
const html_doc = try parser.documentHTMLParse(reader, "UTF-8"); const ccharset = try self.alloc.dupeZ(u8, charset);
defer self.alloc.free(ccharset);
const html_doc = try parser.documentHTMLParse(reader, ccharset);
const doc = parser.documentHTMLToDocument(html_doc); const doc = parser.documentHTMLToDocument(html_doc);
// save a document's pointer in the page. // save a document's pointer in the page.