loader: parse html per chunk

This commit is contained in:
Pierre Tachoire
2024-01-10 12:38:56 +01:00
parent 3a3da494dc
commit 5d262fc026
2 changed files with 51 additions and 17 deletions

View File

@@ -152,11 +152,13 @@ pub const Page = struct {
// dump writes the page content into the given file.
pub fn dump(self: *Page, out: std.fs.File) !void {
// no data loaded, nothin to do.
if (self.raw_data == null) return;
// if no HTML document pointer available, dump the data content only.
if (self.doc == null) return try out.writeAll(self.raw_data.?);
if (self.doc == null) {
// no data loaded, nothing to do.
if (self.raw_data == null) return;
return try out.writeAll(self.raw_data.?);
}
// if the page has a pointer to a document, dumps the HTML.
const root = try parser.documentGetDocumentElement(self.doc.?) orelse return;
@@ -175,22 +177,19 @@ pub const Page = struct {
// TODO handle fragment in url.
// load the data
var result = try self.loader.fetch(self.allocator, self.uri);
defer result.deinit();
var resp = try self.loader.get(self.allocator, self.uri);
defer resp.deinit();
log.info("GET {any} {d}", .{ self.uri, result.status });
const req = resp.req;
log.info("GET {any} {d}", .{ self.uri, req.response.status });
// TODO handle redirection
if (result.status != .ok) return error.BadStatusCode;
if (result.body == null) return error.NoBody;
// save the body into the page.
self.raw_data = try self.allocator.dupe(u8, result.body.?);
if (req.response.status != .ok) return error.BadStatusCode;
// TODO handle charset
// https://html.spec.whatwg.org/#content-type
const ct = result.headers.getFirstValue("Content-Type") orelse {
const ct = req.response.headers.getFirstValue("Content-Type") orelse {
// no content type in HTTP headers.
// TODO try to sniff mime type from the body.
log.info("no content-type HTTP header", .{});
@@ -199,16 +198,19 @@ pub const Page = struct {
const mime = try Mime.parse(ct);
if (mime.eql(Mime.HTML)) {
// TODO check content-type
try self.loadHTMLDoc(&result);
try self.loadHTMLDoc(req.reader());
} else {
log.info("non-HTML document: {s}", .{ct});
// save the body into the page.
self.raw_data = try req.reader().readAllAlloc(self.allocator, 16 * 1024 * 1024);
}
}
// https://html.spec.whatwg.org/#read-html
fn loadHTMLDoc(self: *Page, result: *FetchResult) !void {
fn loadHTMLDoc(self: *Page, reader: anytype) !void {
log.debug("parse html", .{});
const html_doc = try parser.documentHTMLParseFromStr(result.body.?);
const html_doc = try parser.documentHTMLParse(reader);
const doc = parser.documentHTMLToDocument(html_doc);
// save a document's pointer in the page.

View File

@@ -6,10 +6,12 @@ pub const Loader = struct {
client: std.http.Client,
pub const Response = struct {
req: std.http.Request,
allocator: std.mem.Allocator,
req: *std.http.Client.Request,
pub fn deinit(self: *Response) void {
self.req.deinit();
self.allocator.destroy(self.req);
}
};
@@ -40,6 +42,36 @@ pub const Loader = struct {
.payload = .none,
});
}
// get opens a GET request on `uri`, sends it and waits on it before
// returning.
// see
// https://ziglang.org/documentation/master/std/#A;std:http.Client.fetch
// for reference.
// The request is allocated with `allocator`: the caller is responsible for
// calling `deinit()` on the returned `Response`.
pub fn get(self: *Loader, allocator: std.mem.Allocator, uri: std.Uri) !Response {
    var headers = try std.http.Headers.initList(allocator, &[_]std.http.Field{
        .{ .name = "User-Agent", .value = user_agent },
        .{ .name = "Accept", .value = "*/*" },
        .{ .name = "Accept-Language", .value = "en-US,en;q=0.5" },
    });
    defer headers.deinit();

    // The request is created behind a pointer; on any failure below the
    // errdefers run in reverse order (request deinit, then destroy).
    const request = try allocator.create(std.http.Client.Request);
    errdefer allocator.destroy(request);

    request.* = try self.client.open(.GET, uri, headers, .{
        .handle_redirects = true, // TODO handle redirects manually
    });
    errdefer request.deinit();

    try request.send(.{});
    try request.finish();
    try request.wait();

    return Response{
        .allocator = allocator,
        .req = request,
    };
}
};
test "basic url fetch" {