loader: parse html per chunk

This commit is contained in:
Pierre Tachoire
2024-01-10 12:38:56 +01:00
parent 3a3da494dc
commit 5d262fc026
2 changed files with 51 additions and 17 deletions

View File

@@ -152,11 +152,13 @@ pub const Page = struct {
// dump writes the page content into the given file. // dump writes the page content into the given file.
pub fn dump(self: *Page, out: std.fs.File) !void { pub fn dump(self: *Page, out: std.fs.File) !void {
// no data loaded, nothin to do.
if (self.raw_data == null) return;
// if no HTML document pointer available, dump the data content only. // if no HTML document pointer available, dump the data content only.
if (self.doc == null) return try out.writeAll(self.raw_data.?); if (self.doc == null) {
// no data loaded, nothing to do.
if (self.raw_data == null) return;
return try out.writeAll(self.raw_data.?);
}
// if the page has a pointer to a document, dumps the HTML. // if the page has a pointer to a document, dumps the HTML.
const root = try parser.documentGetDocumentElement(self.doc.?) orelse return; const root = try parser.documentGetDocumentElement(self.doc.?) orelse return;
@@ -175,22 +177,19 @@ pub const Page = struct {
// TODO handle fragment in url. // TODO handle fragment in url.
// load the data // load the data
var result = try self.loader.fetch(self.allocator, self.uri); var resp = try self.loader.get(self.allocator, self.uri);
defer result.deinit(); defer resp.deinit();
log.info("GET {any} {d}", .{ self.uri, result.status }); const req = resp.req;
log.info("GET {any} {d}", .{ self.uri, req.response.status });
// TODO handle redirection // TODO handle redirection
if (result.status != .ok) return error.BadStatusCode; if (req.response.status != .ok) return error.BadStatusCode;
if (result.body == null) return error.NoBody;
// save the body into the page.
self.raw_data = try self.allocator.dupe(u8, result.body.?);
// TODO handle charset // TODO handle charset
// https://html.spec.whatwg.org/#content-type // https://html.spec.whatwg.org/#content-type
const ct = result.headers.getFirstValue("Content-Type") orelse { const ct = req.response.headers.getFirstValue("Content-Type") orelse {
// no content type in HTTP headers. // no content type in HTTP headers.
// TODO try to sniff mime type from the body. // TODO try to sniff mime type from the body.
log.info("no content-type HTTP header", .{}); log.info("no content-type HTTP header", .{});
@@ -199,16 +198,19 @@ pub const Page = struct {
const mime = try Mime.parse(ct); const mime = try Mime.parse(ct);
if (mime.eql(Mime.HTML)) { if (mime.eql(Mime.HTML)) {
// TODO check content-type // TODO check content-type
try self.loadHTMLDoc(&result); try self.loadHTMLDoc(req.reader());
} else { } else {
log.info("non-HTML document: {s}", .{ct}); log.info("non-HTML document: {s}", .{ct});
// save the body into the page.
self.raw_data = try req.reader().readAllAlloc(self.allocator, 16 * 1024 * 1024);
} }
} }
// https://html.spec.whatwg.org/#read-html // https://html.spec.whatwg.org/#read-html
fn loadHTMLDoc(self: *Page, result: *FetchResult) !void { fn loadHTMLDoc(self: *Page, reader: anytype) !void {
log.debug("parse html", .{}); log.debug("parse html", .{});
const html_doc = try parser.documentHTMLParseFromStr(result.body.?); const html_doc = try parser.documentHTMLParse(reader);
const doc = parser.documentHTMLToDocument(html_doc); const doc = parser.documentHTMLToDocument(html_doc);
// save a document's pointer in the page. // save a document's pointer in the page.

View File

@@ -6,10 +6,12 @@ pub const Loader = struct {
client: std.http.Client, client: std.http.Client,
pub const Response = struct { pub const Response = struct {
req: std.http.Request, allocator: std.mem.Allocator,
req: *std.http.Client.Request,
pub fn deinit(self: *Response) void { pub fn deinit(self: *Response) void {
self.req.deinit(); self.req.deinit();
self.allocator.destroy(self.req);
} }
}; };
@@ -40,6 +42,36 @@ pub const Loader = struct {
.payload = .none, .payload = .none,
}); });
} }
// get performs an HTTP GET on `uri` and returns a `Response` holding a
// pointer to the underlying request.
// The request is allocated with `allocator`, opened on the loader's HTTP
// client, and `send()`, `finish()` and `wait()` are called on it before
// returning.
// see
// https://ziglang.org/documentation/master/std/#A;std:http.Client.fetch
// for reference.
// The caller is responsible for calling `deinit()` on the `Response`.
// If any step fails, the error is returned and the `errdefer`s below release
// what was acquired so far.
pub fn get(self: *Loader, allocator: std.mem.Allocator, uri: std.Uri) !Response {
// headers passed to the request; the list is released when this function
// returns.
// NOTE(review): the returned request outlives these headers -- confirm
// std.http.Client.Request does not reference them after `wait()`.
var headers = try std.http.Headers.initList(allocator, &[_]std.http.Field{
.{ .name = "User-Agent", .value = user_agent },
.{ .name = "Accept", .value = "*/*" },
.{ .name = "Accept-Language", .value = "en-US,en;q=0.5" },
});
defer headers.deinit();
// the request is allocated with `allocator` and the Response only stores a
// pointer to it (presumably so the request keeps a stable address once
// the Response is returned by value -- TODO confirm).
var resp = Response{
.allocator = allocator,
.req = try allocator.create(std.http.Client.Request),
};
// on any error below, give the request allocation back.
errdefer allocator.destroy(resp.req);
resp.req.* = try self.client.open(.GET, uri, headers, .{
.handle_redirects = true, // TODO handle redirects manually
});
// once opened, the request must also be deinit'd on error; errdefers run
// in reverse order, so this deinit happens before the destroy above.
errdefer resp.req.deinit();
// presumably: send the request, mark it as complete (no payload is written
// here), then wait for the response head -- verify against the
// std.http.Client.Request docs for the targeted Zig version.
// NOTE(review): the response body does not appear to be read here; the
// callers visible in this diff read it through `req.reader()`.
try resp.req.send(.{});
try resp.req.finish();
try resp.req.wait();
return resp;
}
}; };
test "basic url fetch" { test "basic url fetch" {